{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 112,
"global_step": 1344,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002232142857142857,
"grad_norm": 6.444167137145996,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.7893,
"step": 1
},
{
"epoch": 0.002232142857142857,
"eval_loss": 0.7994120121002197,
"eval_runtime": 31.574,
"eval_samples_per_second": 2.312,
"eval_steps_per_second": 0.317,
"step": 1
},
{
"epoch": 0.004464285714285714,
"grad_norm": 6.631099224090576,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.8404,
"step": 2
},
{
"epoch": 0.006696428571428571,
"grad_norm": 6.917625427246094,
"learning_rate": 6.000000000000001e-07,
"loss": 0.8045,
"step": 3
},
{
"epoch": 0.008928571428571428,
"grad_norm": 7.057511806488037,
"learning_rate": 8.000000000000001e-07,
"loss": 0.7771,
"step": 4
},
{
"epoch": 0.011160714285714286,
"grad_norm": 6.829500198364258,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.9127,
"step": 5
},
{
"epoch": 0.013392857142857142,
"grad_norm": 5.697404384613037,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.8142,
"step": 6
},
{
"epoch": 0.015625,
"grad_norm": 5.567355155944824,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.8286,
"step": 7
},
{
"epoch": 0.017857142857142856,
"grad_norm": 5.8241071701049805,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.7863,
"step": 8
},
{
"epoch": 0.020089285714285716,
"grad_norm": 5.002991676330566,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.7995,
"step": 9
},
{
"epoch": 0.022321428571428572,
"grad_norm": 4.243339538574219,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7777,
"step": 10
},
{
"epoch": 0.024553571428571428,
"grad_norm": 4.812699794769287,
"learning_rate": 2.2e-06,
"loss": 0.8632,
"step": 11
},
{
"epoch": 0.026785714285714284,
"grad_norm": 3.2879092693328857,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.7006,
"step": 12
},
{
"epoch": 0.029017857142857144,
"grad_norm": 3.256328821182251,
"learning_rate": 2.6e-06,
"loss": 0.8796,
"step": 13
},
{
"epoch": 0.03125,
"grad_norm": 2.9795191287994385,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.7379,
"step": 14
},
{
"epoch": 0.033482142857142856,
"grad_norm": 2.269883394241333,
"learning_rate": 3e-06,
"loss": 0.6732,
"step": 15
},
{
"epoch": 0.03571428571428571,
"grad_norm": 2.641052484512329,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.7464,
"step": 16
},
{
"epoch": 0.03794642857142857,
"grad_norm": 2.5419418811798096,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.7677,
"step": 17
},
{
"epoch": 0.04017857142857143,
"grad_norm": 1.9274882078170776,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.6629,
"step": 18
},
{
"epoch": 0.04241071428571429,
"grad_norm": 1.6530262231826782,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.6453,
"step": 19
},
{
"epoch": 0.044642857142857144,
"grad_norm": 1.4658329486846924,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6656,
"step": 20
},
{
"epoch": 0.046875,
"grad_norm": 1.5799874067306519,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.7575,
"step": 21
},
{
"epoch": 0.049107142857142856,
"grad_norm": 1.6194658279418945,
"learning_rate": 4.4e-06,
"loss": 0.7072,
"step": 22
},
{
"epoch": 0.05133928571428571,
"grad_norm": 1.4326906204223633,
"learning_rate": 4.600000000000001e-06,
"loss": 0.6723,
"step": 23
},
{
"epoch": 0.05357142857142857,
"grad_norm": 1.7949881553649902,
"learning_rate": 4.800000000000001e-06,
"loss": 0.6745,
"step": 24
},
{
"epoch": 0.05580357142857143,
"grad_norm": 1.594699501991272,
"learning_rate": 5e-06,
"loss": 0.6926,
"step": 25
},
{
"epoch": 0.05803571428571429,
"grad_norm": 1.2770566940307617,
"learning_rate": 5.2e-06,
"loss": 0.6907,
"step": 26
},
{
"epoch": 0.060267857142857144,
"grad_norm": 1.2886347770690918,
"learning_rate": 5.400000000000001e-06,
"loss": 0.6691,
"step": 27
},
{
"epoch": 0.0625,
"grad_norm": 1.4923341274261475,
"learning_rate": 5.600000000000001e-06,
"loss": 0.7632,
"step": 28
},
{
"epoch": 0.06473214285714286,
"grad_norm": 1.3870608806610107,
"learning_rate": 5.8e-06,
"loss": 0.6068,
"step": 29
},
{
"epoch": 0.06696428571428571,
"grad_norm": 1.2519007921218872,
"learning_rate": 6e-06,
"loss": 0.608,
"step": 30
},
{
"epoch": 0.06919642857142858,
"grad_norm": 1.2436811923980713,
"learning_rate": 6.200000000000001e-06,
"loss": 0.5889,
"step": 31
},
{
"epoch": 0.07142857142857142,
"grad_norm": 1.2719563245773315,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.6538,
"step": 32
},
{
"epoch": 0.07366071428571429,
"grad_norm": 1.388859510421753,
"learning_rate": 6.600000000000001e-06,
"loss": 0.6368,
"step": 33
},
{
"epoch": 0.07589285714285714,
"grad_norm": 1.2473232746124268,
"learning_rate": 6.800000000000001e-06,
"loss": 0.6367,
"step": 34
},
{
"epoch": 0.078125,
"grad_norm": 1.3017948865890503,
"learning_rate": 7e-06,
"loss": 0.6646,
"step": 35
},
{
"epoch": 0.08035714285714286,
"grad_norm": 1.1188671588897705,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.6325,
"step": 36
},
{
"epoch": 0.08258928571428571,
"grad_norm": 1.117879867553711,
"learning_rate": 7.4e-06,
"loss": 0.5879,
"step": 37
},
{
"epoch": 0.08482142857142858,
"grad_norm": 1.1564981937408447,
"learning_rate": 7.600000000000001e-06,
"loss": 0.6176,
"step": 38
},
{
"epoch": 0.08705357142857142,
"grad_norm": 1.613521933555603,
"learning_rate": 7.800000000000002e-06,
"loss": 0.7525,
"step": 39
},
{
"epoch": 0.08928571428571429,
"grad_norm": 1.1391220092773438,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5593,
"step": 40
},
{
"epoch": 0.09151785714285714,
"grad_norm": 1.1999366283416748,
"learning_rate": 8.2e-06,
"loss": 0.589,
"step": 41
},
{
"epoch": 0.09375,
"grad_norm": 1.2285315990447998,
"learning_rate": 8.400000000000001e-06,
"loss": 0.6326,
"step": 42
},
{
"epoch": 0.09598214285714286,
"grad_norm": 1.0952249765396118,
"learning_rate": 8.6e-06,
"loss": 0.6319,
"step": 43
},
{
"epoch": 0.09821428571428571,
"grad_norm": 1.3287895917892456,
"learning_rate": 8.8e-06,
"loss": 0.6771,
"step": 44
},
{
"epoch": 0.10044642857142858,
"grad_norm": 1.251396656036377,
"learning_rate": 9e-06,
"loss": 0.615,
"step": 45
},
{
"epoch": 0.10267857142857142,
"grad_norm": 1.173791766166687,
"learning_rate": 9.200000000000002e-06,
"loss": 0.6431,
"step": 46
},
{
"epoch": 0.10491071428571429,
"grad_norm": 1.2991195917129517,
"learning_rate": 9.4e-06,
"loss": 0.6643,
"step": 47
},
{
"epoch": 0.10714285714285714,
"grad_norm": 1.1051254272460938,
"learning_rate": 9.600000000000001e-06,
"loss": 0.6203,
"step": 48
},
{
"epoch": 0.109375,
"grad_norm": 1.310900330543518,
"learning_rate": 9.800000000000001e-06,
"loss": 0.643,
"step": 49
},
{
"epoch": 0.11160714285714286,
"grad_norm": 1.0683242082595825,
"learning_rate": 1e-05,
"loss": 0.5383,
"step": 50
},
{
"epoch": 0.11383928571428571,
"grad_norm": 1.139321208000183,
"learning_rate": 1.02e-05,
"loss": 0.6041,
"step": 51
},
{
"epoch": 0.11607142857142858,
"grad_norm": 1.095990777015686,
"learning_rate": 1.04e-05,
"loss": 0.5986,
"step": 52
},
{
"epoch": 0.11830357142857142,
"grad_norm": 1.0637718439102173,
"learning_rate": 1.0600000000000002e-05,
"loss": 0.5558,
"step": 53
},
{
"epoch": 0.12053571428571429,
"grad_norm": 1.1787432432174683,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.6232,
"step": 54
},
{
"epoch": 0.12276785714285714,
"grad_norm": 4.439203262329102,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.6276,
"step": 55
},
{
"epoch": 0.125,
"grad_norm": 1.1904429197311401,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.5637,
"step": 56
},
{
"epoch": 0.12723214285714285,
"grad_norm": 1.2871860265731812,
"learning_rate": 1.14e-05,
"loss": 0.6973,
"step": 57
},
{
"epoch": 0.12946428571428573,
"grad_norm": 1.317662000656128,
"learning_rate": 1.16e-05,
"loss": 0.6334,
"step": 58
},
{
"epoch": 0.13169642857142858,
"grad_norm": 1.267655611038208,
"learning_rate": 1.18e-05,
"loss": 0.637,
"step": 59
},
{
"epoch": 0.13392857142857142,
"grad_norm": 1.1938740015029907,
"learning_rate": 1.2e-05,
"loss": 0.6162,
"step": 60
},
{
"epoch": 0.13616071428571427,
"grad_norm": 1.1768426895141602,
"learning_rate": 1.22e-05,
"loss": 0.5861,
"step": 61
},
{
"epoch": 0.13839285714285715,
"grad_norm": 1.2728022336959839,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.5684,
"step": 62
},
{
"epoch": 0.140625,
"grad_norm": 1.2520177364349365,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.6199,
"step": 63
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.2249557971954346,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.5968,
"step": 64
},
{
"epoch": 0.14508928571428573,
"grad_norm": 1.094007134437561,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.5696,
"step": 65
},
{
"epoch": 0.14732142857142858,
"grad_norm": 1.1172953844070435,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.6564,
"step": 66
},
{
"epoch": 0.14955357142857142,
"grad_norm": 1.0176945924758911,
"learning_rate": 1.3400000000000002e-05,
"loss": 0.5915,
"step": 67
},
{
"epoch": 0.15178571428571427,
"grad_norm": 1.1159842014312744,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.587,
"step": 68
},
{
"epoch": 0.15401785714285715,
"grad_norm": 1.3211426734924316,
"learning_rate": 1.38e-05,
"loss": 0.6512,
"step": 69
},
{
"epoch": 0.15625,
"grad_norm": 1.2271831035614014,
"learning_rate": 1.4e-05,
"loss": 0.6185,
"step": 70
},
{
"epoch": 0.15848214285714285,
"grad_norm": 1.0738003253936768,
"learning_rate": 1.4200000000000001e-05,
"loss": 0.5992,
"step": 71
},
{
"epoch": 0.16071428571428573,
"grad_norm": 1.1245979070663452,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.5878,
"step": 72
},
{
"epoch": 0.16294642857142858,
"grad_norm": 1.3597056865692139,
"learning_rate": 1.46e-05,
"loss": 0.6344,
"step": 73
},
{
"epoch": 0.16517857142857142,
"grad_norm": 1.2197428941726685,
"learning_rate": 1.48e-05,
"loss": 0.5806,
"step": 74
},
{
"epoch": 0.16741071428571427,
"grad_norm": 1.1941276788711548,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.6668,
"step": 75
},
{
"epoch": 0.16964285714285715,
"grad_norm": 1.2710192203521729,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.6386,
"step": 76
},
{
"epoch": 0.171875,
"grad_norm": 1.282441258430481,
"learning_rate": 1.54e-05,
"loss": 0.6265,
"step": 77
},
{
"epoch": 0.17410714285714285,
"grad_norm": 1.465880274772644,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.6674,
"step": 78
},
{
"epoch": 0.17633928571428573,
"grad_norm": 1.1180906295776367,
"learning_rate": 1.58e-05,
"loss": 0.583,
"step": 79
},
{
"epoch": 0.17857142857142858,
"grad_norm": 1.1118671894073486,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.6287,
"step": 80
},
{
"epoch": 0.18080357142857142,
"grad_norm": 1.1944739818572998,
"learning_rate": 1.62e-05,
"loss": 0.6626,
"step": 81
},
{
"epoch": 0.18303571428571427,
"grad_norm": 1.2908122539520264,
"learning_rate": 1.64e-05,
"loss": 0.6691,
"step": 82
},
{
"epoch": 0.18526785714285715,
"grad_norm": 1.3516288995742798,
"learning_rate": 1.66e-05,
"loss": 0.6543,
"step": 83
},
{
"epoch": 0.1875,
"grad_norm": 1.1028647422790527,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.5868,
"step": 84
},
{
"epoch": 0.18973214285714285,
"grad_norm": 1.2997535467147827,
"learning_rate": 1.7e-05,
"loss": 0.6145,
"step": 85
},
{
"epoch": 0.19196428571428573,
"grad_norm": 1.0468411445617676,
"learning_rate": 1.72e-05,
"loss": 0.5493,
"step": 86
},
{
"epoch": 0.19419642857142858,
"grad_norm": 1.3448480367660522,
"learning_rate": 1.7400000000000003e-05,
"loss": 0.6751,
"step": 87
},
{
"epoch": 0.19642857142857142,
"grad_norm": 1.119872808456421,
"learning_rate": 1.76e-05,
"loss": 0.5593,
"step": 88
},
{
"epoch": 0.19866071428571427,
"grad_norm": 1.3660776615142822,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.6085,
"step": 89
},
{
"epoch": 0.20089285714285715,
"grad_norm": 1.189186930656433,
"learning_rate": 1.8e-05,
"loss": 0.6327,
"step": 90
},
{
"epoch": 0.203125,
"grad_norm": 1.5373879671096802,
"learning_rate": 1.8200000000000002e-05,
"loss": 0.6988,
"step": 91
},
{
"epoch": 0.20535714285714285,
"grad_norm": 1.3453340530395508,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.5824,
"step": 92
},
{
"epoch": 0.20758928571428573,
"grad_norm": 1.2179492712020874,
"learning_rate": 1.86e-05,
"loss": 0.6408,
"step": 93
},
{
"epoch": 0.20982142857142858,
"grad_norm": 1.1074484586715698,
"learning_rate": 1.88e-05,
"loss": 0.5831,
"step": 94
},
{
"epoch": 0.21205357142857142,
"grad_norm": 1.4239832162857056,
"learning_rate": 1.9e-05,
"loss": 0.6534,
"step": 95
},
{
"epoch": 0.21428571428571427,
"grad_norm": 1.1983468532562256,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.602,
"step": 96
},
{
"epoch": 0.21651785714285715,
"grad_norm": 1.116683006286621,
"learning_rate": 1.94e-05,
"loss": 0.5104,
"step": 97
},
{
"epoch": 0.21875,
"grad_norm": 1.200826644897461,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.5871,
"step": 98
},
{
"epoch": 0.22098214285714285,
"grad_norm": 1.1437289714813232,
"learning_rate": 1.98e-05,
"loss": 0.586,
"step": 99
},
{
"epoch": 0.22321428571428573,
"grad_norm": 1.2034040689468384,
"learning_rate": 2e-05,
"loss": 0.5807,
"step": 100
},
{
"epoch": 0.22544642857142858,
"grad_norm": 1.2772138118743896,
"learning_rate": 1.9999968111891562e-05,
"loss": 0.6783,
"step": 101
},
{
"epoch": 0.22767857142857142,
"grad_norm": 1.0830761194229126,
"learning_rate": 1.9999872447769624e-05,
"loss": 0.6097,
"step": 102
},
{
"epoch": 0.22991071428571427,
"grad_norm": 1.179870843887329,
"learning_rate": 1.9999713008244287e-05,
"loss": 0.6053,
"step": 103
},
{
"epoch": 0.23214285714285715,
"grad_norm": 1.3241559267044067,
"learning_rate": 1.9999489794332404e-05,
"loss": 0.7262,
"step": 104
},
{
"epoch": 0.234375,
"grad_norm": 1.2744848728179932,
"learning_rate": 1.9999202807457537e-05,
"loss": 0.6816,
"step": 105
},
{
"epoch": 0.23660714285714285,
"grad_norm": 1.124351978302002,
"learning_rate": 1.9998852049449998e-05,
"loss": 0.7003,
"step": 106
},
{
"epoch": 0.23883928571428573,
"grad_norm": 1.009700894355774,
"learning_rate": 1.999843752254677e-05,
"loss": 0.603,
"step": 107
},
{
"epoch": 0.24107142857142858,
"grad_norm": 1.1453275680541992,
"learning_rate": 1.9997959229391567e-05,
"loss": 0.5768,
"step": 108
},
{
"epoch": 0.24330357142857142,
"grad_norm": 1.0525116920471191,
"learning_rate": 1.9997417173034746e-05,
"loss": 0.5811,
"step": 109
},
{
"epoch": 0.24553571428571427,
"grad_norm": 1.1967023611068726,
"learning_rate": 1.9996811356933346e-05,
"loss": 0.5993,
"step": 110
},
{
"epoch": 0.24776785714285715,
"grad_norm": 1.190104365348816,
"learning_rate": 1.999614178495103e-05,
"loss": 0.6231,
"step": 111
},
{
"epoch": 0.25,
"grad_norm": 1.2647576332092285,
"learning_rate": 1.9995408461358074e-05,
"loss": 0.6269,
"step": 112
},
{
"epoch": 0.25,
"eval_loss": 0.5685862898826599,
"eval_runtime": 31.2257,
"eval_samples_per_second": 2.338,
"eval_steps_per_second": 0.32,
"step": 112
},
{
"epoch": 0.25223214285714285,
"grad_norm": 1.121337652206421,
"learning_rate": 1.9994611390831342e-05,
"loss": 0.5947,
"step": 113
},
{
"epoch": 0.2544642857142857,
"grad_norm": 1.3274226188659668,
"learning_rate": 1.9993750578454248e-05,
"loss": 0.7194,
"step": 114
},
{
"epoch": 0.25669642857142855,
"grad_norm": 1.2155532836914062,
"learning_rate": 1.9992826029716722e-05,
"loss": 0.6605,
"step": 115
},
{
"epoch": 0.25892857142857145,
"grad_norm": 1.4203304052352905,
"learning_rate": 1.999183775051519e-05,
"loss": 0.5979,
"step": 116
},
{
"epoch": 0.2611607142857143,
"grad_norm": 1.1040465831756592,
"learning_rate": 1.9990785747152527e-05,
"loss": 0.5968,
"step": 117
},
{
"epoch": 0.26339285714285715,
"grad_norm": 1.3391578197479248,
"learning_rate": 1.9989670026338002e-05,
"loss": 0.6921,
"step": 118
},
{
"epoch": 0.265625,
"grad_norm": 1.3182568550109863,
"learning_rate": 1.9988490595187273e-05,
"loss": 0.6563,
"step": 119
},
{
"epoch": 0.26785714285714285,
"grad_norm": 1.2267478704452515,
"learning_rate": 1.9987247461222297e-05,
"loss": 0.5942,
"step": 120
},
{
"epoch": 0.2700892857142857,
"grad_norm": 1.4536434412002563,
"learning_rate": 1.9985940632371316e-05,
"loss": 0.6894,
"step": 121
},
{
"epoch": 0.27232142857142855,
"grad_norm": 1.394129991531372,
"learning_rate": 1.9984570116968785e-05,
"loss": 0.7047,
"step": 122
},
{
"epoch": 0.27455357142857145,
"grad_norm": 1.4202784299850464,
"learning_rate": 1.9983135923755336e-05,
"loss": 0.7424,
"step": 123
},
{
"epoch": 0.2767857142857143,
"grad_norm": 1.3307374715805054,
"learning_rate": 1.9981638061877714e-05,
"loss": 0.6857,
"step": 124
},
{
"epoch": 0.27901785714285715,
"grad_norm": 1.0905797481536865,
"learning_rate": 1.998007654088871e-05,
"loss": 0.5316,
"step": 125
},
{
"epoch": 0.28125,
"grad_norm": 1.1642409563064575,
"learning_rate": 1.9978451370747122e-05,
"loss": 0.6388,
"step": 126
},
{
"epoch": 0.28348214285714285,
"grad_norm": 1.2434855699539185,
"learning_rate": 1.9976762561817656e-05,
"loss": 0.5913,
"step": 127
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.5033081769943237,
"learning_rate": 1.997501012487091e-05,
"loss": 0.6407,
"step": 128
},
{
"epoch": 0.28794642857142855,
"grad_norm": 1.1626683473587036,
"learning_rate": 1.997319407108326e-05,
"loss": 0.6786,
"step": 129
},
{
"epoch": 0.29017857142857145,
"grad_norm": 1.342909812927246,
"learning_rate": 1.9971314412036807e-05,
"loss": 0.624,
"step": 130
},
{
"epoch": 0.2924107142857143,
"grad_norm": 1.140647530555725,
"learning_rate": 1.9969371159719307e-05,
"loss": 0.6178,
"step": 131
},
{
"epoch": 0.29464285714285715,
"grad_norm": 1.40773606300354,
"learning_rate": 1.996736432652409e-05,
"loss": 0.5911,
"step": 132
},
{
"epoch": 0.296875,
"grad_norm": 1.2301044464111328,
"learning_rate": 1.9965293925249976e-05,
"loss": 0.5775,
"step": 133
},
{
"epoch": 0.29910714285714285,
"grad_norm": 1.3424404859542847,
"learning_rate": 1.9963159969101207e-05,
"loss": 0.6405,
"step": 134
},
{
"epoch": 0.3013392857142857,
"grad_norm": 1.0376389026641846,
"learning_rate": 1.996096247168734e-05,
"loss": 0.5248,
"step": 135
},
{
"epoch": 0.30357142857142855,
"grad_norm": 1.3082736730575562,
"learning_rate": 1.9958701447023188e-05,
"loss": 0.6588,
"step": 136
},
{
"epoch": 0.30580357142857145,
"grad_norm": 1.3725180625915527,
"learning_rate": 1.9956376909528704e-05,
"loss": 0.6416,
"step": 137
},
{
"epoch": 0.3080357142857143,
"grad_norm": 1.3519439697265625,
"learning_rate": 1.9953988874028917e-05,
"loss": 0.6421,
"step": 138
},
{
"epoch": 0.31026785714285715,
"grad_norm": 1.259487509727478,
"learning_rate": 1.995153735575381e-05,
"loss": 0.6523,
"step": 139
},
{
"epoch": 0.3125,
"grad_norm": 1.498024821281433,
"learning_rate": 1.994902237033824e-05,
"loss": 0.6319,
"step": 140
},
{
"epoch": 0.31473214285714285,
"grad_norm": 1.369606375694275,
"learning_rate": 1.994644393382183e-05,
"loss": 0.652,
"step": 141
},
{
"epoch": 0.3169642857142857,
"grad_norm": 1.268442153930664,
"learning_rate": 1.9943802062648877e-05,
"loss": 0.6082,
"step": 142
},
{
"epoch": 0.31919642857142855,
"grad_norm": 1.1077290773391724,
"learning_rate": 1.9941096773668232e-05,
"loss": 0.577,
"step": 143
},
{
"epoch": 0.32142857142857145,
"grad_norm": 1.3842663764953613,
"learning_rate": 1.9938328084133206e-05,
"loss": 0.6668,
"step": 144
},
{
"epoch": 0.3236607142857143,
"grad_norm": 1.1705602407455444,
"learning_rate": 1.9935496011701453e-05,
"loss": 0.5888,
"step": 145
},
{
"epoch": 0.32589285714285715,
"grad_norm": 1.029478907585144,
"learning_rate": 1.9932600574434864e-05,
"loss": 0.5286,
"step": 146
},
{
"epoch": 0.328125,
"grad_norm": 1.2074710130691528,
"learning_rate": 1.9929641790799438e-05,
"loss": 0.6419,
"step": 147
},
{
"epoch": 0.33035714285714285,
"grad_norm": 1.2905062437057495,
"learning_rate": 1.9926619679665175e-05,
"loss": 0.6704,
"step": 148
},
{
"epoch": 0.3325892857142857,
"grad_norm": 1.1862680912017822,
"learning_rate": 1.992353426030596e-05,
"loss": 0.591,
"step": 149
},
{
"epoch": 0.33482142857142855,
"grad_norm": 1.023520588874817,
"learning_rate": 1.9920385552399434e-05,
"loss": 0.551,
"step": 150
},
{
"epoch": 0.33705357142857145,
"grad_norm": 1.067068338394165,
"learning_rate": 1.991717357602686e-05,
"loss": 0.5453,
"step": 151
},
{
"epoch": 0.3392857142857143,
"grad_norm": 1.1788963079452515,
"learning_rate": 1.9913898351673006e-05,
"loss": 0.561,
"step": 152
},
{
"epoch": 0.34151785714285715,
"grad_norm": 1.081198811531067,
"learning_rate": 1.991055990022602e-05,
"loss": 0.6095,
"step": 153
},
{
"epoch": 0.34375,
"grad_norm": 1.234728455543518,
"learning_rate": 1.990715824297728e-05,
"loss": 0.5744,
"step": 154
},
{
"epoch": 0.34598214285714285,
"grad_norm": 1.2400490045547485,
"learning_rate": 1.990369340162127e-05,
"loss": 0.6289,
"step": 155
},
{
"epoch": 0.3482142857142857,
"grad_norm": 1.2360931634902954,
"learning_rate": 1.9900165398255434e-05,
"loss": 0.6163,
"step": 156
},
{
"epoch": 0.35044642857142855,
"grad_norm": 0.9136845469474792,
"learning_rate": 1.9896574255380045e-05,
"loss": 0.545,
"step": 157
},
{
"epoch": 0.35267857142857145,
"grad_norm": 1.1531745195388794,
"learning_rate": 1.9892919995898052e-05,
"loss": 0.5915,
"step": 158
},
{
"epoch": 0.3549107142857143,
"grad_norm": 1.074296474456787,
"learning_rate": 1.988920264311494e-05,
"loss": 0.5395,
"step": 159
},
{
"epoch": 0.35714285714285715,
"grad_norm": 1.1261378526687622,
"learning_rate": 1.9885422220738583e-05,
"loss": 0.548,
"step": 160
},
{
"epoch": 0.359375,
"grad_norm": 1.2132823467254639,
"learning_rate": 1.988157875287908e-05,
"loss": 0.6395,
"step": 161
},
{
"epoch": 0.36160714285714285,
"grad_norm": 1.0568815469741821,
"learning_rate": 1.9877672264048618e-05,
"loss": 0.5183,
"step": 162
},
{
"epoch": 0.3638392857142857,
"grad_norm": 1.2066727876663208,
"learning_rate": 1.98737027791613e-05,
"loss": 0.5492,
"step": 163
},
{
"epoch": 0.36607142857142855,
"grad_norm": 1.156691551208496,
"learning_rate": 1.9869670323533005e-05,
"loss": 0.6742,
"step": 164
},
{
"epoch": 0.36830357142857145,
"grad_norm": 1.2685508728027344,
"learning_rate": 1.9865574922881204e-05,
"loss": 0.5676,
"step": 165
},
{
"epoch": 0.3705357142857143,
"grad_norm": 1.1878615617752075,
"learning_rate": 1.986141660332482e-05,
"loss": 0.6537,
"step": 166
},
{
"epoch": 0.37276785714285715,
"grad_norm": 1.0642375946044922,
"learning_rate": 1.9857195391384038e-05,
"loss": 0.6212,
"step": 167
},
{
"epoch": 0.375,
"grad_norm": 1.1920270919799805,
"learning_rate": 1.9852911313980146e-05,
"loss": 0.5452,
"step": 168
},
{
"epoch": 0.37723214285714285,
"grad_norm": 0.920224130153656,
"learning_rate": 1.9848564398435374e-05,
"loss": 0.5534,
"step": 169
},
{
"epoch": 0.3794642857142857,
"grad_norm": 1.190719723701477,
"learning_rate": 1.9844154672472707e-05,
"loss": 0.5595,
"step": 170
},
{
"epoch": 0.38169642857142855,
"grad_norm": 1.0356184244155884,
"learning_rate": 1.9839682164215707e-05,
"loss": 0.6007,
"step": 171
},
{
"epoch": 0.38392857142857145,
"grad_norm": 1.1822032928466797,
"learning_rate": 1.9835146902188336e-05,
"loss": 0.6195,
"step": 172
},
{
"epoch": 0.3861607142857143,
"grad_norm": 1.0653460025787354,
"learning_rate": 1.983054891531478e-05,
"loss": 0.6015,
"step": 173
},
{
"epoch": 0.38839285714285715,
"grad_norm": 1.2356278896331787,
"learning_rate": 1.9825888232919268e-05,
"loss": 0.6477,
"step": 174
},
{
"epoch": 0.390625,
"grad_norm": 1.0866338014602661,
"learning_rate": 1.982116488472586e-05,
"loss": 0.5896,
"step": 175
},
{
"epoch": 0.39285714285714285,
"grad_norm": 1.1473841667175293,
"learning_rate": 1.9816378900858288e-05,
"loss": 0.5805,
"step": 176
},
{
"epoch": 0.3950892857142857,
"grad_norm": 1.1367318630218506,
"learning_rate": 1.9811530311839747e-05,
"loss": 0.6801,
"step": 177
},
{
"epoch": 0.39732142857142855,
"grad_norm": 1.2681716680526733,
"learning_rate": 1.98066191485927e-05,
"loss": 0.7205,
"step": 178
},
{
"epoch": 0.39955357142857145,
"grad_norm": 1.10531747341156,
"learning_rate": 1.980164544243869e-05,
"loss": 0.5849,
"step": 179
},
{
"epoch": 0.4017857142857143,
"grad_norm": 1.2467155456542969,
"learning_rate": 1.9796609225098136e-05,
"loss": 0.6424,
"step": 180
},
{
"epoch": 0.40401785714285715,
"grad_norm": 0.9677994847297668,
"learning_rate": 1.9791510528690125e-05,
"loss": 0.5607,
"step": 181
},
{
"epoch": 0.40625,
"grad_norm": 1.1776186227798462,
"learning_rate": 1.9786349385732212e-05,
"loss": 0.574,
"step": 182
},
{
"epoch": 0.40848214285714285,
"grad_norm": 1.1970598697662354,
"learning_rate": 1.9781125829140214e-05,
"loss": 0.5488,
"step": 183
},
{
"epoch": 0.4107142857142857,
"grad_norm": 1.0842020511627197,
"learning_rate": 1.9775839892228004e-05,
"loss": 0.5859,
"step": 184
},
{
"epoch": 0.41294642857142855,
"grad_norm": 1.1312363147735596,
"learning_rate": 1.977049160870728e-05,
"loss": 0.5971,
"step": 185
},
{
"epoch": 0.41517857142857145,
"grad_norm": 1.147937297821045,
"learning_rate": 1.976508101268738e-05,
"loss": 0.6647,
"step": 186
},
{
"epoch": 0.4174107142857143,
"grad_norm": 1.27888822555542,
"learning_rate": 1.975960813867503e-05,
"loss": 0.6283,
"step": 187
},
{
"epoch": 0.41964285714285715,
"grad_norm": 1.1987580060958862,
"learning_rate": 1.9754073021574153e-05,
"loss": 0.5747,
"step": 188
},
{
"epoch": 0.421875,
"grad_norm": 1.211571455001831,
"learning_rate": 1.9748475696685637e-05,
"loss": 0.6622,
"step": 189
},
{
"epoch": 0.42410714285714285,
"grad_norm": 1.1061530113220215,
"learning_rate": 1.9742816199707096e-05,
"loss": 0.5731,
"step": 190
},
{
"epoch": 0.4263392857142857,
"grad_norm": 1.3477433919906616,
"learning_rate": 1.9737094566732663e-05,
"loss": 0.669,
"step": 191
},
{
"epoch": 0.42857142857142855,
"grad_norm": 1.1551015377044678,
"learning_rate": 1.9731310834252747e-05,
"loss": 0.5935,
"step": 192
},
{
"epoch": 0.43080357142857145,
"grad_norm": 1.1217777729034424,
"learning_rate": 1.972546503915381e-05,
"loss": 0.5141,
"step": 193
},
{
"epoch": 0.4330357142857143,
"grad_norm": 1.1012550592422485,
"learning_rate": 1.9719557218718116e-05,
"loss": 0.4681,
"step": 194
},
{
"epoch": 0.43526785714285715,
"grad_norm": 1.241761565208435,
"learning_rate": 1.9713587410623516e-05,
"loss": 0.5332,
"step": 195
},
{
"epoch": 0.4375,
"grad_norm": 1.2476496696472168,
"learning_rate": 1.970755565294318e-05,
"loss": 0.6408,
"step": 196
},
{
"epoch": 0.43973214285714285,
"grad_norm": 1.0551302433013916,
"learning_rate": 1.970146198414538e-05,
"loss": 0.5983,
"step": 197
},
{
"epoch": 0.4419642857142857,
"grad_norm": 1.0154962539672852,
"learning_rate": 1.969530644309323e-05,
"loss": 0.5094,
"step": 198
},
{
"epoch": 0.44419642857142855,
"grad_norm": 1.04148530960083,
"learning_rate": 1.968908906904444e-05,
"loss": 0.5191,
"step": 199
},
{
"epoch": 0.44642857142857145,
"grad_norm": 1.1981557607650757,
"learning_rate": 1.9682809901651074e-05,
"loss": 0.6159,
"step": 200
},
{
"epoch": 0.4486607142857143,
"grad_norm": 1.05272376537323,
"learning_rate": 1.9676468980959284e-05,
"loss": 0.5368,
"step": 201
},
{
"epoch": 0.45089285714285715,
"grad_norm": 1.3401468992233276,
"learning_rate": 1.9670066347409063e-05,
"loss": 0.659,
"step": 202
},
{
"epoch": 0.453125,
"grad_norm": 1.1549017429351807,
"learning_rate": 1.9663602041833983e-05,
"loss": 0.6243,
"step": 203
},
{
"epoch": 0.45535714285714285,
"grad_norm": 1.3726199865341187,
"learning_rate": 1.9657076105460945e-05,
"loss": 0.6091,
"step": 204
},
{
"epoch": 0.4575892857142857,
"grad_norm": 1.1885762214660645,
"learning_rate": 1.9650488579909898e-05,
"loss": 0.6273,
"step": 205
},
{
"epoch": 0.45982142857142855,
"grad_norm": 1.225096344947815,
"learning_rate": 1.964383950719359e-05,
"loss": 0.6524,
"step": 206
},
{
"epoch": 0.46205357142857145,
"grad_norm": 0.9724453687667847,
"learning_rate": 1.9637128929717294e-05,
"loss": 0.5768,
"step": 207
},
{
"epoch": 0.4642857142857143,
"grad_norm": 1.1051849126815796,
"learning_rate": 1.9630356890278527e-05,
"loss": 0.571,
"step": 208
},
{
"epoch": 0.46651785714285715,
"grad_norm": 1.3608918190002441,
"learning_rate": 1.96235234320668e-05,
"loss": 0.5879,
"step": 209
},
{
"epoch": 0.46875,
"grad_norm": 1.2607719898223877,
"learning_rate": 1.9616628598663322e-05,
"loss": 0.6728,
"step": 210
},
{
"epoch": 0.47098214285714285,
"grad_norm": 1.1398773193359375,
"learning_rate": 1.9609672434040736e-05,
"loss": 0.5693,
"step": 211
},
{
"epoch": 0.4732142857142857,
"grad_norm": 1.0851867198944092,
"learning_rate": 1.9602654982562822e-05,
"loss": 0.6436,
"step": 212
},
{
"epoch": 0.47544642857142855,
"grad_norm": 1.1385678052902222,
"learning_rate": 1.9595576288984233e-05,
"loss": 0.5584,
"step": 213
},
{
"epoch": 0.47767857142857145,
"grad_norm": 1.1551555395126343,
"learning_rate": 1.9588436398450206e-05,
"loss": 0.6299,
"step": 214
},
{
"epoch": 0.4799107142857143,
"grad_norm": 0.976137101650238,
"learning_rate": 1.958123535649625e-05,
"loss": 0.5715,
"step": 215
},
{
"epoch": 0.48214285714285715,
"grad_norm": 1.0276240110397339,
"learning_rate": 1.9573973209047893e-05,
"loss": 0.5675,
"step": 216
},
{
"epoch": 0.484375,
"grad_norm": 1.194334626197815,
"learning_rate": 1.9566650002420363e-05,
"loss": 0.635,
"step": 217
},
{
"epoch": 0.48660714285714285,
"grad_norm": 1.171510100364685,
"learning_rate": 1.9559265783318304e-05,
"loss": 0.5989,
"step": 218
},
{
"epoch": 0.4888392857142857,
"grad_norm": 1.1479747295379639,
"learning_rate": 1.9551820598835464e-05,
"loss": 0.584,
"step": 219
},
{
"epoch": 0.49107142857142855,
"grad_norm": 1.0744856595993042,
"learning_rate": 1.9544314496454423e-05,
"loss": 0.6199,
"step": 220
},
{
"epoch": 0.49330357142857145,
"grad_norm": 1.1071079969406128,
"learning_rate": 1.9536747524046254e-05,
"loss": 0.6514,
"step": 221
},
{
"epoch": 0.4955357142857143,
"grad_norm": 1.1865029335021973,
"learning_rate": 1.9529119729870253e-05,
"loss": 0.5937,
"step": 222
},
{
"epoch": 0.49776785714285715,
"grad_norm": 1.1087124347686768,
"learning_rate": 1.9521431162573596e-05,
"loss": 0.6303,
"step": 223
},
{
"epoch": 0.5,
"grad_norm": 1.0572322607040405,
"learning_rate": 1.9513681871191063e-05,
"loss": 0.5568,
"step": 224
},
{
"epoch": 0.5,
"eval_loss": 0.5669442415237427,
"eval_runtime": 31.5503,
"eval_samples_per_second": 2.314,
"eval_steps_per_second": 0.317,
"step": 224
},
{
"epoch": 0.5022321428571429,
"grad_norm": 1.0085796117782593,
"learning_rate": 1.95058719051447e-05,
"loss": 0.5449,
"step": 225
},
{
"epoch": 0.5044642857142857,
"grad_norm": 1.0866364240646362,
"learning_rate": 1.949800131424352e-05,
"loss": 0.5364,
"step": 226
},
{
"epoch": 0.5066964285714286,
"grad_norm": 1.1213027238845825,
"learning_rate": 1.9490070148683166e-05,
"loss": 0.6107,
"step": 227
},
{
"epoch": 0.5089285714285714,
"grad_norm": 1.0242129564285278,
"learning_rate": 1.9482078459045617e-05,
"loss": 0.5556,
"step": 228
},
{
"epoch": 0.5111607142857143,
"grad_norm": 1.2038688659667969,
"learning_rate": 1.947402629629885e-05,
"loss": 0.5854,
"step": 229
},
{
"epoch": 0.5133928571428571,
"grad_norm": 1.258674144744873,
"learning_rate": 1.9465913711796502e-05,
"loss": 0.6186,
"step": 230
},
{
"epoch": 0.515625,
"grad_norm": 1.2189650535583496,
"learning_rate": 1.9457740757277577e-05,
"loss": 0.5852,
"step": 231
},
{
"epoch": 0.5178571428571429,
"grad_norm": 1.239396572113037,
"learning_rate": 1.9449507484866084e-05,
"loss": 0.6632,
"step": 232
},
{
"epoch": 0.5200892857142857,
"grad_norm": 1.089785099029541,
"learning_rate": 1.944121394707072e-05,
"loss": 0.6285,
"step": 233
},
{
"epoch": 0.5223214285714286,
"grad_norm": 1.3066377639770508,
"learning_rate": 1.9432860196784533e-05,
"loss": 0.7184,
"step": 234
},
{
"epoch": 0.5245535714285714,
"grad_norm": 1.1240565776824951,
"learning_rate": 1.9424446287284576e-05,
"loss": 0.5561,
"step": 235
},
{
"epoch": 0.5267857142857143,
"grad_norm": 1.0432591438293457,
"learning_rate": 1.941597227223159e-05,
"loss": 0.5715,
"step": 236
},
{
"epoch": 0.5290178571428571,
"grad_norm": 1.0457866191864014,
"learning_rate": 1.940743820566963e-05,
"loss": 0.5682,
"step": 237
},
{
"epoch": 0.53125,
"grad_norm": 1.072249174118042,
"learning_rate": 1.9398844142025746e-05,
"loss": 0.5581,
"step": 238
},
{
"epoch": 0.5334821428571429,
"grad_norm": 1.380035161972046,
"learning_rate": 1.9390190136109625e-05,
"loss": 0.6387,
"step": 239
},
{
"epoch": 0.5357142857142857,
"grad_norm": 1.1203244924545288,
"learning_rate": 1.9381476243113243e-05,
"loss": 0.6205,
"step": 240
},
{
"epoch": 0.5379464285714286,
"grad_norm": 1.0932904481887817,
"learning_rate": 1.9372702518610512e-05,
"loss": 0.6444,
"step": 241
},
{
"epoch": 0.5401785714285714,
"grad_norm": 1.3310596942901611,
"learning_rate": 1.9363869018556928e-05,
"loss": 0.6773,
"step": 242
},
{
"epoch": 0.5424107142857143,
"grad_norm": 1.2222163677215576,
"learning_rate": 1.9354975799289215e-05,
"loss": 0.6284,
"step": 243
},
{
"epoch": 0.5446428571428571,
"grad_norm": 1.2926287651062012,
"learning_rate": 1.9346022917524958e-05,
"loss": 0.6252,
"step": 244
},
{
"epoch": 0.546875,
"grad_norm": 1.3600492477416992,
"learning_rate": 1.933701043036225e-05,
"loss": 0.6198,
"step": 245
},
{
"epoch": 0.5491071428571429,
"grad_norm": 1.1185046434402466,
"learning_rate": 1.9327938395279325e-05,
"loss": 0.6239,
"step": 246
},
{
"epoch": 0.5513392857142857,
"grad_norm": 1.1856756210327148,
"learning_rate": 1.9318806870134194e-05,
"loss": 0.5969,
"step": 247
},
{
"epoch": 0.5535714285714286,
"grad_norm": 1.0996602773666382,
"learning_rate": 1.9309615913164262e-05,
"loss": 0.6103,
"step": 248
},
{
"epoch": 0.5558035714285714,
"grad_norm": 1.1660605669021606,
"learning_rate": 1.9300365582985984e-05,
"loss": 0.6003,
"step": 249
},
{
"epoch": 0.5580357142857143,
"grad_norm": 1.158035397529602,
"learning_rate": 1.9291055938594464e-05,
"loss": 0.5799,
"step": 250
},
{
"epoch": 0.5602678571428571,
"grad_norm": 1.1602364778518677,
"learning_rate": 1.9281687039363088e-05,
"loss": 0.6373,
"step": 251
},
{
"epoch": 0.5625,
"grad_norm": 1.181039571762085,
"learning_rate": 1.9272258945043154e-05,
"loss": 0.5917,
"step": 252
},
{
"epoch": 0.5647321428571429,
"grad_norm": 1.0945910215377808,
"learning_rate": 1.9262771715763483e-05,
"loss": 0.644,
"step": 253
},
{
"epoch": 0.5669642857142857,
"grad_norm": 1.1994935274124146,
"learning_rate": 1.9253225412030028e-05,
"loss": 0.6678,
"step": 254
},
{
"epoch": 0.5691964285714286,
"grad_norm": 1.2264413833618164,
"learning_rate": 1.924362009472551e-05,
"loss": 0.599,
"step": 255
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.1776793003082275,
"learning_rate": 1.9233955825109e-05,
"loss": 0.6346,
"step": 256
},
{
"epoch": 0.5736607142857143,
"grad_norm": 1.2656333446502686,
"learning_rate": 1.9224232664815563e-05,
"loss": 0.6538,
"step": 257
},
{
"epoch": 0.5758928571428571,
"grad_norm": 1.1230204105377197,
"learning_rate": 1.9214450675855832e-05,
"loss": 0.6518,
"step": 258
},
{
"epoch": 0.578125,
"grad_norm": 1.183430790901184,
"learning_rate": 1.9204609920615635e-05,
"loss": 0.6264,
"step": 259
},
{
"epoch": 0.5803571428571429,
"grad_norm": 1.3236072063446045,
"learning_rate": 1.919471046185558e-05,
"loss": 0.6611,
"step": 260
},
{
"epoch": 0.5825892857142857,
"grad_norm": 1.1483640670776367,
"learning_rate": 1.9184752362710674e-05,
"loss": 0.558,
"step": 261
},
{
"epoch": 0.5848214285714286,
"grad_norm": 1.333167314529419,
"learning_rate": 1.917473568668991e-05,
"loss": 0.6025,
"step": 262
},
{
"epoch": 0.5870535714285714,
"grad_norm": 1.0283328294754028,
"learning_rate": 1.9164660497675848e-05,
"loss": 0.5354,
"step": 263
},
{
"epoch": 0.5892857142857143,
"grad_norm": 1.1128134727478027,
"learning_rate": 1.9154526859924242e-05,
"loss": 0.6335,
"step": 264
},
{
"epoch": 0.5915178571428571,
"grad_norm": 1.057431697845459,
"learning_rate": 1.9144334838063595e-05,
"loss": 0.5429,
"step": 265
},
{
"epoch": 0.59375,
"grad_norm": 1.1948074102401733,
"learning_rate": 1.9134084497094766e-05,
"loss": 0.6311,
"step": 266
},
{
"epoch": 0.5959821428571429,
"grad_norm": 1.2003923654556274,
"learning_rate": 1.9123775902390555e-05,
"loss": 0.6843,
"step": 267
},
{
"epoch": 0.5982142857142857,
"grad_norm": 1.1661674976348877,
"learning_rate": 1.9113409119695276e-05,
"loss": 0.524,
"step": 268
},
{
"epoch": 0.6004464285714286,
"grad_norm": 1.081857681274414,
"learning_rate": 1.9102984215124352e-05,
"loss": 0.5441,
"step": 269
},
{
"epoch": 0.6026785714285714,
"grad_norm": 1.0927963256835938,
"learning_rate": 1.9092501255163874e-05,
"loss": 0.5054,
"step": 270
},
{
"epoch": 0.6049107142857143,
"grad_norm": 1.1680039167404175,
"learning_rate": 1.9081960306670198e-05,
"loss": 0.59,
"step": 271
},
{
"epoch": 0.6071428571428571,
"grad_norm": 1.2833560705184937,
"learning_rate": 1.907136143686951e-05,
"loss": 0.6486,
"step": 272
},
{
"epoch": 0.609375,
"grad_norm": 1.117241382598877,
"learning_rate": 1.9060704713357382e-05,
"loss": 0.5582,
"step": 273
},
{
"epoch": 0.6116071428571429,
"grad_norm": 0.9881597757339478,
"learning_rate": 1.904999020409837e-05,
"loss": 0.606,
"step": 274
},
{
"epoch": 0.6138392857142857,
"grad_norm": 1.057505488395691,
"learning_rate": 1.9039217977425567e-05,
"loss": 0.571,
"step": 275
},
{
"epoch": 0.6160714285714286,
"grad_norm": 0.9963536858558655,
"learning_rate": 1.902838810204015e-05,
"loss": 0.5612,
"step": 276
},
{
"epoch": 0.6183035714285714,
"grad_norm": 1.0682551860809326,
"learning_rate": 1.901750064701097e-05,
"loss": 0.5194,
"step": 277
},
{
"epoch": 0.6205357142857143,
"grad_norm": 1.136527419090271,
"learning_rate": 1.90065556817741e-05,
"loss": 0.5609,
"step": 278
},
{
"epoch": 0.6227678571428571,
"grad_norm": 0.9889798164367676,
"learning_rate": 1.8995553276132385e-05,
"loss": 0.5247,
"step": 279
},
{
"epoch": 0.625,
"grad_norm": 1.151685118675232,
"learning_rate": 1.8984493500255e-05,
"loss": 0.6895,
"step": 280
},
{
"epoch": 0.6272321428571429,
"grad_norm": 1.2816126346588135,
"learning_rate": 1.8973376424677022e-05,
"loss": 0.6387,
"step": 281
},
{
"epoch": 0.6294642857142857,
"grad_norm": 1.0590577125549316,
"learning_rate": 1.8962202120298948e-05,
"loss": 0.6099,
"step": 282
},
{
"epoch": 0.6316964285714286,
"grad_norm": 1.278604507446289,
"learning_rate": 1.8950970658386262e-05,
"loss": 0.5988,
"step": 283
},
{
"epoch": 0.6339285714285714,
"grad_norm": 1.2634141445159912,
"learning_rate": 1.8939682110568982e-05,
"loss": 0.5764,
"step": 284
},
{
"epoch": 0.6361607142857143,
"grad_norm": 1.1590498685836792,
"learning_rate": 1.8928336548841197e-05,
"loss": 0.544,
"step": 285
},
{
"epoch": 0.6383928571428571,
"grad_norm": 1.0856074094772339,
"learning_rate": 1.8916934045560603e-05,
"loss": 0.6177,
"step": 286
},
{
"epoch": 0.640625,
"grad_norm": 1.1679246425628662,
"learning_rate": 1.8905474673448055e-05,
"loss": 0.5372,
"step": 287
},
{
"epoch": 0.6428571428571429,
"grad_norm": 1.0703572034835815,
"learning_rate": 1.8893958505587093e-05,
"loss": 0.5629,
"step": 288
},
{
"epoch": 0.6450892857142857,
"grad_norm": 1.2214595079421997,
"learning_rate": 1.8882385615423477e-05,
"loss": 0.593,
"step": 289
},
{
"epoch": 0.6473214285714286,
"grad_norm": 1.0748178958892822,
"learning_rate": 1.8870756076764728e-05,
"loss": 0.5756,
"step": 290
},
{
"epoch": 0.6495535714285714,
"grad_norm": 1.0605632066726685,
"learning_rate": 1.8859069963779636e-05,
"loss": 0.5999,
"step": 291
},
{
"epoch": 0.6517857142857143,
"grad_norm": 1.0350596904754639,
"learning_rate": 1.8847327350997814e-05,
"loss": 0.5449,
"step": 292
},
{
"epoch": 0.6540178571428571,
"grad_norm": 0.9954982399940491,
"learning_rate": 1.88355283133092e-05,
"loss": 0.5592,
"step": 293
},
{
"epoch": 0.65625,
"grad_norm": 1.2038896083831787,
"learning_rate": 1.8823672925963598e-05,
"loss": 0.6072,
"step": 294
},
{
"epoch": 0.6584821428571429,
"grad_norm": 1.0482138395309448,
"learning_rate": 1.8811761264570177e-05,
"loss": 0.616,
"step": 295
},
{
"epoch": 0.6607142857142857,
"grad_norm": 1.1453136205673218,
"learning_rate": 1.879979340509701e-05,
"loss": 0.6414,
"step": 296
},
{
"epoch": 0.6629464285714286,
"grad_norm": 0.9775916934013367,
"learning_rate": 1.8787769423870583e-05,
"loss": 0.542,
"step": 297
},
{
"epoch": 0.6651785714285714,
"grad_norm": 1.2236546277999878,
"learning_rate": 1.877568939757529e-05,
"loss": 0.5851,
"step": 298
},
{
"epoch": 0.6674107142857143,
"grad_norm": 1.1174293756484985,
"learning_rate": 1.8763553403252975e-05,
"loss": 0.4804,
"step": 299
},
{
"epoch": 0.6696428571428571,
"grad_norm": 1.034529685974121,
"learning_rate": 1.8751361518302413e-05,
"loss": 0.5805,
"step": 300
},
{
"epoch": 0.671875,
"grad_norm": 1.0704272985458374,
"learning_rate": 1.873911382047884e-05,
"loss": 0.6559,
"step": 301
},
{
"epoch": 0.6741071428571429,
"grad_norm": 1.0234812498092651,
"learning_rate": 1.8726810387893438e-05,
"loss": 0.5194,
"step": 302
},
{
"epoch": 0.6763392857142857,
"grad_norm": 1.131823182106018,
"learning_rate": 1.871445129901284e-05,
"loss": 0.6617,
"step": 303
},
{
"epoch": 0.6785714285714286,
"grad_norm": 1.3236502408981323,
"learning_rate": 1.8702036632658646e-05,
"loss": 0.5964,
"step": 304
},
{
"epoch": 0.6808035714285714,
"grad_norm": 1.0573298931121826,
"learning_rate": 1.8689566468006898e-05,
"loss": 0.588,
"step": 305
},
{
"epoch": 0.6830357142857143,
"grad_norm": 0.9685704708099365,
"learning_rate": 1.867704088458759e-05,
"loss": 0.5622,
"step": 306
},
{
"epoch": 0.6852678571428571,
"grad_norm": 1.1851588487625122,
"learning_rate": 1.866445996228415e-05,
"loss": 0.5787,
"step": 307
},
{
"epoch": 0.6875,
"grad_norm": 1.1072279214859009,
"learning_rate": 1.8651823781332948e-05,
"loss": 0.6292,
"step": 308
},
{
"epoch": 0.6897321428571429,
"grad_norm": 1.1180968284606934,
"learning_rate": 1.863913242232276e-05,
"loss": 0.5659,
"step": 309
},
{
"epoch": 0.6919642857142857,
"grad_norm": 1.1263612508773804,
"learning_rate": 1.8626385966194275e-05,
"loss": 0.6296,
"step": 310
},
{
"epoch": 0.6941964285714286,
"grad_norm": 1.116678237915039,
"learning_rate": 1.8613584494239568e-05,
"loss": 0.6357,
"step": 311
},
{
"epoch": 0.6964285714285714,
"grad_norm": 1.0281323194503784,
"learning_rate": 1.8600728088101587e-05,
"loss": 0.5647,
"step": 312
},
{
"epoch": 0.6986607142857143,
"grad_norm": 1.3668344020843506,
"learning_rate": 1.858781682977362e-05,
"loss": 0.5325,
"step": 313
},
{
"epoch": 0.7008928571428571,
"grad_norm": 1.0618714094161987,
"learning_rate": 1.857485080159879e-05,
"loss": 0.661,
"step": 314
},
{
"epoch": 0.703125,
"grad_norm": 1.1447020769119263,
"learning_rate": 1.8561830086269524e-05,
"loss": 0.6475,
"step": 315
},
{
"epoch": 0.7053571428571429,
"grad_norm": 1.0199358463287354,
"learning_rate": 1.8548754766827016e-05,
"loss": 0.5274,
"step": 316
},
{
"epoch": 0.7075892857142857,
"grad_norm": 0.9829248785972595,
"learning_rate": 1.8535624926660707e-05,
"loss": 0.5969,
"step": 317
},
{
"epoch": 0.7098214285714286,
"grad_norm": 1.1307730674743652,
"learning_rate": 1.852244064950775e-05,
"loss": 0.5884,
"step": 318
},
{
"epoch": 0.7120535714285714,
"grad_norm": 1.0643337965011597,
"learning_rate": 1.8509202019452472e-05,
"loss": 0.5436,
"step": 319
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.2980728149414062,
"learning_rate": 1.8495909120925857e-05,
"loss": 0.6208,
"step": 320
},
{
"epoch": 0.7165178571428571,
"grad_norm": 1.0896912813186646,
"learning_rate": 1.8482562038704975e-05,
"loss": 0.5372,
"step": 321
},
{
"epoch": 0.71875,
"grad_norm": 1.2717396020889282,
"learning_rate": 1.846916085791247e-05,
"loss": 0.6847,
"step": 322
},
{
"epoch": 0.7209821428571429,
"grad_norm": 1.1865652799606323,
"learning_rate": 1.8455705664016003e-05,
"loss": 0.6028,
"step": 323
},
{
"epoch": 0.7232142857142857,
"grad_norm": 1.1411960124969482,
"learning_rate": 1.8442196542827712e-05,
"loss": 0.6161,
"step": 324
},
{
"epoch": 0.7254464285714286,
"grad_norm": 1.3044698238372803,
"learning_rate": 1.8428633580503658e-05,
"loss": 0.7426,
"step": 325
},
{
"epoch": 0.7276785714285714,
"grad_norm": 1.2270197868347168,
"learning_rate": 1.8415016863543286e-05,
"loss": 0.6773,
"step": 326
},
{
"epoch": 0.7299107142857143,
"grad_norm": 1.191628098487854,
"learning_rate": 1.8401346478788865e-05,
"loss": 0.632,
"step": 327
},
{
"epoch": 0.7321428571428571,
"grad_norm": 1.2179105281829834,
"learning_rate": 1.8387622513424942e-05,
"loss": 0.6345,
"step": 328
},
{
"epoch": 0.734375,
"grad_norm": 1.125915288925171,
"learning_rate": 1.8373845054977764e-05,
"loss": 0.5677,
"step": 329
},
{
"epoch": 0.7366071428571429,
"grad_norm": 1.111037015914917,
"learning_rate": 1.836001419131476e-05,
"loss": 0.5699,
"step": 330
},
{
"epoch": 0.7388392857142857,
"grad_norm": 1.1068542003631592,
"learning_rate": 1.834613001064394e-05,
"loss": 0.5378,
"step": 331
},
{
"epoch": 0.7410714285714286,
"grad_norm": 1.2940490245819092,
"learning_rate": 1.8332192601513358e-05,
"loss": 0.6397,
"step": 332
},
{
"epoch": 0.7433035714285714,
"grad_norm": 1.1395896673202515,
"learning_rate": 1.8318202052810538e-05,
"loss": 0.6114,
"step": 333
},
{
"epoch": 0.7455357142857143,
"grad_norm": 1.046424150466919,
"learning_rate": 1.8304158453761904e-05,
"loss": 0.5117,
"step": 334
},
{
"epoch": 0.7477678571428571,
"grad_norm": 1.1523703336715698,
"learning_rate": 1.829006189393222e-05,
"loss": 0.616,
"step": 335
},
{
"epoch": 0.75,
"grad_norm": 1.104680061340332,
"learning_rate": 1.827591246322401e-05,
"loss": 0.5834,
"step": 336
},
{
"epoch": 0.75,
"eval_loss": 0.5638446807861328,
"eval_runtime": 31.0443,
"eval_samples_per_second": 2.351,
"eval_steps_per_second": 0.322,
"step": 336
},
{
"epoch": 0.7522321428571429,
"grad_norm": 1.0383703708648682,
"learning_rate": 1.8261710251876993e-05,
"loss": 0.5373,
"step": 337
},
{
"epoch": 0.7544642857142857,
"grad_norm": 1.1541510820388794,
"learning_rate": 1.8247455350467496e-05,
"loss": 0.6174,
"step": 338
},
{
"epoch": 0.7566964285714286,
"grad_norm": 1.024482250213623,
"learning_rate": 1.8233147849907894e-05,
"loss": 0.5369,
"step": 339
},
{
"epoch": 0.7589285714285714,
"grad_norm": 1.229745864868164,
"learning_rate": 1.8218787841446003e-05,
"loss": 0.5801,
"step": 340
},
{
"epoch": 0.7611607142857143,
"grad_norm": 1.1336222887039185,
"learning_rate": 1.8204375416664536e-05,
"loss": 0.5951,
"step": 341
},
{
"epoch": 0.7633928571428571,
"grad_norm": 1.0349760055541992,
"learning_rate": 1.8189910667480476e-05,
"loss": 0.4982,
"step": 342
},
{
"epoch": 0.765625,
"grad_norm": 1.3031398057937622,
"learning_rate": 1.8175393686144524e-05,
"loss": 0.629,
"step": 343
},
{
"epoch": 0.7678571428571429,
"grad_norm": 1.2154903411865234,
"learning_rate": 1.8160824565240495e-05,
"loss": 0.6085,
"step": 344
},
{
"epoch": 0.7700892857142857,
"grad_norm": 0.9560657143592834,
"learning_rate": 1.8146203397684734e-05,
"loss": 0.5177,
"step": 345
},
{
"epoch": 0.7723214285714286,
"grad_norm": 1.1145116090774536,
"learning_rate": 1.8131530276725514e-05,
"loss": 0.6308,
"step": 346
},
{
"epoch": 0.7745535714285714,
"grad_norm": 1.1008496284484863,
"learning_rate": 1.811680529594245e-05,
"loss": 0.6026,
"step": 347
},
{
"epoch": 0.7767857142857143,
"grad_norm": 1.0683940649032593,
"learning_rate": 1.8102028549245894e-05,
"loss": 0.5556,
"step": 348
},
{
"epoch": 0.7790178571428571,
"grad_norm": 1.1773459911346436,
"learning_rate": 1.808720013087635e-05,
"loss": 0.5652,
"step": 349
},
{
"epoch": 0.78125,
"grad_norm": 1.0454031229019165,
"learning_rate": 1.8072320135403862e-05,
"loss": 0.5117,
"step": 350
},
{
"epoch": 0.7834821428571429,
"grad_norm": 1.130771279335022,
"learning_rate": 1.805738865772741e-05,
"loss": 0.6254,
"step": 351
},
{
"epoch": 0.7857142857142857,
"grad_norm": 1.0127828121185303,
"learning_rate": 1.804240579307431e-05,
"loss": 0.5923,
"step": 352
},
{
"epoch": 0.7879464285714286,
"grad_norm": 1.0340120792388916,
"learning_rate": 1.8027371636999605e-05,
"loss": 0.5331,
"step": 353
},
{
"epoch": 0.7901785714285714,
"grad_norm": 1.3208816051483154,
"learning_rate": 1.8012286285385456e-05,
"loss": 0.7229,
"step": 354
},
{
"epoch": 0.7924107142857143,
"grad_norm": 1.3123232126235962,
"learning_rate": 1.7997149834440527e-05,
"loss": 0.6147,
"step": 355
},
{
"epoch": 0.7946428571428571,
"grad_norm": 1.0704232454299927,
"learning_rate": 1.7981962380699376e-05,
"loss": 0.6055,
"step": 356
},
{
"epoch": 0.796875,
"grad_norm": 1.1561580896377563,
"learning_rate": 1.7966724021021837e-05,
"loss": 0.544,
"step": 357
},
{
"epoch": 0.7991071428571429,
"grad_norm": 1.1618038415908813,
"learning_rate": 1.7951434852592406e-05,
"loss": 0.5955,
"step": 358
},
{
"epoch": 0.8013392857142857,
"grad_norm": 1.0671852827072144,
"learning_rate": 1.793609497291961e-05,
"loss": 0.5581,
"step": 359
},
{
"epoch": 0.8035714285714286,
"grad_norm": 1.125833511352539,
"learning_rate": 1.79207044798354e-05,
"loss": 0.7114,
"step": 360
},
{
"epoch": 0.8058035714285714,
"grad_norm": 1.2748684883117676,
"learning_rate": 1.7905263471494522e-05,
"loss": 0.7434,
"step": 361
},
{
"epoch": 0.8080357142857143,
"grad_norm": 1.0450520515441895,
"learning_rate": 1.788977204637388e-05,
"loss": 0.5177,
"step": 362
},
{
"epoch": 0.8102678571428571,
"grad_norm": 1.2043256759643555,
"learning_rate": 1.7874230303271932e-05,
"loss": 0.7341,
"step": 363
},
{
"epoch": 0.8125,
"grad_norm": 1.1245315074920654,
"learning_rate": 1.7858638341308026e-05,
"loss": 0.6051,
"step": 364
},
{
"epoch": 0.8147321428571429,
"grad_norm": 1.2384027242660522,
"learning_rate": 1.78429962599218e-05,
"loss": 0.7031,
"step": 365
},
{
"epoch": 0.8169642857142857,
"grad_norm": 1.1532883644104004,
"learning_rate": 1.7827304158872538e-05,
"loss": 0.5226,
"step": 366
},
{
"epoch": 0.8191964285714286,
"grad_norm": 1.0486387014389038,
"learning_rate": 1.7811562138238508e-05,
"loss": 0.5454,
"step": 367
},
{
"epoch": 0.8214285714285714,
"grad_norm": 1.1059547662734985,
"learning_rate": 1.779577029841638e-05,
"loss": 0.6249,
"step": 368
},
{
"epoch": 0.8236607142857143,
"grad_norm": 1.2745469808578491,
"learning_rate": 1.7779928740120525e-05,
"loss": 0.6617,
"step": 369
},
{
"epoch": 0.8258928571428571,
"grad_norm": 1.0087100267410278,
"learning_rate": 1.776403756438241e-05,
"loss": 0.5446,
"step": 370
},
{
"epoch": 0.828125,
"grad_norm": 1.2666665315628052,
"learning_rate": 1.774809687254994e-05,
"loss": 0.6908,
"step": 371
},
{
"epoch": 0.8303571428571429,
"grad_norm": 1.4326931238174438,
"learning_rate": 1.773210676628682e-05,
"loss": 0.6975,
"step": 372
},
{
"epoch": 0.8325892857142857,
"grad_norm": 1.0072338581085205,
"learning_rate": 1.77160673475719e-05,
"loss": 0.5109,
"step": 373
},
{
"epoch": 0.8348214285714286,
"grad_norm": 0.9262451529502869,
"learning_rate": 1.769997871869852e-05,
"loss": 0.5139,
"step": 374
},
{
"epoch": 0.8370535714285714,
"grad_norm": 1.3120925426483154,
"learning_rate": 1.768384098227387e-05,
"loss": 0.6241,
"step": 375
},
{
"epoch": 0.8392857142857143,
"grad_norm": 1.2064335346221924,
"learning_rate": 1.7667654241218332e-05,
"loss": 0.6312,
"step": 376
},
{
"epoch": 0.8415178571428571,
"grad_norm": 1.1633553504943848,
"learning_rate": 1.765141859876481e-05,
"loss": 0.619,
"step": 377
},
{
"epoch": 0.84375,
"grad_norm": 1.361649990081787,
"learning_rate": 1.7635134158458095e-05,
"loss": 0.6553,
"step": 378
},
{
"epoch": 0.8459821428571429,
"grad_norm": 1.1922473907470703,
"learning_rate": 1.7618801024154186e-05,
"loss": 0.5775,
"step": 379
},
{
"epoch": 0.8482142857142857,
"grad_norm": 1.3128317594528198,
"learning_rate": 1.7602419300019627e-05,
"loss": 0.5734,
"step": 380
},
{
"epoch": 0.8504464285714286,
"grad_norm": 1.246530532836914,
"learning_rate": 1.758598909053087e-05,
"loss": 0.5827,
"step": 381
},
{
"epoch": 0.8526785714285714,
"grad_norm": 1.207715392112732,
"learning_rate": 1.7569510500473566e-05,
"loss": 0.5851,
"step": 382
},
{
"epoch": 0.8549107142857143,
"grad_norm": 0.8977461457252502,
"learning_rate": 1.7552983634941928e-05,
"loss": 0.4392,
"step": 383
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.298916220664978,
"learning_rate": 1.753640859933806e-05,
"loss": 0.5916,
"step": 384
},
{
"epoch": 0.859375,
"grad_norm": 1.3132508993148804,
"learning_rate": 1.751978549937126e-05,
"loss": 0.6496,
"step": 385
},
{
"epoch": 0.8616071428571429,
"grad_norm": 1.1107579469680786,
"learning_rate": 1.7503114441057374e-05,
"loss": 0.609,
"step": 386
},
{
"epoch": 0.8638392857142857,
"grad_norm": 1.1084481477737427,
"learning_rate": 1.7486395530718104e-05,
"loss": 0.6507,
"step": 387
},
{
"epoch": 0.8660714285714286,
"grad_norm": 1.031421184539795,
"learning_rate": 1.746962887498034e-05,
"loss": 0.5508,
"step": 388
},
{
"epoch": 0.8683035714285714,
"grad_norm": 1.0834870338439941,
"learning_rate": 1.7452814580775467e-05,
"loss": 0.5516,
"step": 389
},
{
"epoch": 0.8705357142857143,
"grad_norm": 0.9795181155204773,
"learning_rate": 1.743595275533869e-05,
"loss": 0.5161,
"step": 390
},
{
"epoch": 0.8727678571428571,
"grad_norm": 0.9782564043998718,
"learning_rate": 1.7419043506208348e-05,
"loss": 0.6326,
"step": 391
},
{
"epoch": 0.875,
"grad_norm": 1.0224997997283936,
"learning_rate": 1.7402086941225246e-05,
"loss": 0.5398,
"step": 392
},
{
"epoch": 0.8772321428571429,
"grad_norm": 1.1170098781585693,
"learning_rate": 1.7385083168531934e-05,
"loss": 0.5963,
"step": 393
},
{
"epoch": 0.8794642857142857,
"grad_norm": 1.1109061241149902,
"learning_rate": 1.736803229657204e-05,
"loss": 0.6092,
"step": 394
},
{
"epoch": 0.8816964285714286,
"grad_norm": 1.0568723678588867,
"learning_rate": 1.7350934434089583e-05,
"loss": 0.5028,
"step": 395
},
{
"epoch": 0.8839285714285714,
"grad_norm": 1.0677611827850342,
"learning_rate": 1.7333789690128252e-05,
"loss": 0.6654,
"step": 396
},
{
"epoch": 0.8861607142857143,
"grad_norm": 1.2150548696517944,
"learning_rate": 1.7316598174030746e-05,
"loss": 0.6201,
"step": 397
},
{
"epoch": 0.8883928571428571,
"grad_norm": 1.2608318328857422,
"learning_rate": 1.7299359995438046e-05,
"loss": 0.6584,
"step": 398
},
{
"epoch": 0.890625,
"grad_norm": 1.141231894493103,
"learning_rate": 1.728207526428873e-05,
"loss": 0.6207,
"step": 399
},
{
"epoch": 0.8928571428571429,
"grad_norm": 1.2013863325119019,
"learning_rate": 1.7264744090818284e-05,
"loss": 0.6354,
"step": 400
},
{
"epoch": 0.8950892857142857,
"grad_norm": 1.1436806917190552,
"learning_rate": 1.7247366585558366e-05,
"loss": 0.6111,
"step": 401
},
{
"epoch": 0.8973214285714286,
"grad_norm": 1.0974454879760742,
"learning_rate": 1.7229942859336142e-05,
"loss": 0.6545,
"step": 402
},
{
"epoch": 0.8995535714285714,
"grad_norm": 1.0279815196990967,
"learning_rate": 1.7212473023273532e-05,
"loss": 0.5486,
"step": 403
},
{
"epoch": 0.9017857142857143,
"grad_norm": 1.2103922367095947,
"learning_rate": 1.719495718878655e-05,
"loss": 0.6333,
"step": 404
},
{
"epoch": 0.9040178571428571,
"grad_norm": 1.1030718088150024,
"learning_rate": 1.7177395467584564e-05,
"loss": 0.581,
"step": 405
},
{
"epoch": 0.90625,
"grad_norm": 1.5910372734069824,
"learning_rate": 1.7159787971669586e-05,
"loss": 0.6665,
"step": 406
},
{
"epoch": 0.9084821428571429,
"grad_norm": 1.4932862520217896,
"learning_rate": 1.7142134813335557e-05,
"loss": 0.6512,
"step": 407
},
{
"epoch": 0.9107142857142857,
"grad_norm": 1.2478338479995728,
"learning_rate": 1.712443610516765e-05,
"loss": 0.6352,
"step": 408
},
{
"epoch": 0.9129464285714286,
"grad_norm": 1.2439334392547607,
"learning_rate": 1.7106691960041527e-05,
"loss": 0.6865,
"step": 409
},
{
"epoch": 0.9151785714285714,
"grad_norm": 1.0125123262405396,
"learning_rate": 1.7088902491122636e-05,
"loss": 0.6067,
"step": 410
},
{
"epoch": 0.9174107142857143,
"grad_norm": 1.1370826959609985,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.6601,
"step": 411
},
{
"epoch": 0.9196428571428571,
"grad_norm": 1.3260993957519531,
"learning_rate": 1.7053188036012885e-05,
"loss": 0.5318,
"step": 412
},
{
"epoch": 0.921875,
"grad_norm": 1.2528599500656128,
"learning_rate": 1.7035263277595314e-05,
"loss": 0.5438,
"step": 413
},
{
"epoch": 0.9241071428571429,
"grad_norm": 1.011576533317566,
"learning_rate": 1.7017293650930083e-05,
"loss": 0.6103,
"step": 414
},
{
"epoch": 0.9263392857142857,
"grad_norm": 1.0763773918151855,
"learning_rate": 1.6999279270620675e-05,
"loss": 0.6163,
"step": 415
},
{
"epoch": 0.9285714285714286,
"grad_norm": 1.0680584907531738,
"learning_rate": 1.6981220251555996e-05,
"loss": 0.5902,
"step": 416
},
{
"epoch": 0.9308035714285714,
"grad_norm": 1.2038758993148804,
"learning_rate": 1.6963116708909637e-05,
"loss": 0.629,
"step": 417
},
{
"epoch": 0.9330357142857143,
"grad_norm": 0.9378647804260254,
"learning_rate": 1.6944968758139144e-05,
"loss": 0.5668,
"step": 418
},
{
"epoch": 0.9352678571428571,
"grad_norm": 0.9915615916252136,
"learning_rate": 1.6926776514985278e-05,
"loss": 0.5527,
"step": 419
},
{
"epoch": 0.9375,
"grad_norm": 1.2160735130310059,
"learning_rate": 1.6908540095471288e-05,
"loss": 0.6082,
"step": 420
},
{
"epoch": 0.9397321428571429,
"grad_norm": 1.1135092973709106,
"learning_rate": 1.6890259615902153e-05,
"loss": 0.6318,
"step": 421
},
{
"epoch": 0.9419642857142857,
"grad_norm": 1.111850619316101,
"learning_rate": 1.6871935192863862e-05,
"loss": 0.558,
"step": 422
},
{
"epoch": 0.9441964285714286,
"grad_norm": 1.0438261032104492,
"learning_rate": 1.6853566943222647e-05,
"loss": 0.6356,
"step": 423
},
{
"epoch": 0.9464285714285714,
"grad_norm": 1.1192708015441895,
"learning_rate": 1.6835154984124266e-05,
"loss": 0.6006,
"step": 424
},
{
"epoch": 0.9486607142857143,
"grad_norm": 0.9830177426338196,
"learning_rate": 1.6816699432993212e-05,
"loss": 0.6372,
"step": 425
},
{
"epoch": 0.9508928571428571,
"grad_norm": 1.1955080032348633,
"learning_rate": 1.6798200407532025e-05,
"loss": 0.6932,
"step": 426
},
{
"epoch": 0.953125,
"grad_norm": 1.1463717222213745,
"learning_rate": 1.677965802572048e-05,
"loss": 0.6139,
"step": 427
},
{
"epoch": 0.9553571428571429,
"grad_norm": 1.1220048666000366,
"learning_rate": 1.676107240581488e-05,
"loss": 0.5997,
"step": 428
},
{
"epoch": 0.9575892857142857,
"grad_norm": 1.2134525775909424,
"learning_rate": 1.674244366634727e-05,
"loss": 0.6085,
"step": 429
},
{
"epoch": 0.9598214285714286,
"grad_norm": 1.4323443174362183,
"learning_rate": 1.6723771926124704e-05,
"loss": 0.7118,
"step": 430
},
{
"epoch": 0.9620535714285714,
"grad_norm": 1.0498019456863403,
"learning_rate": 1.6705057304228488e-05,
"loss": 0.5317,
"step": 431
},
{
"epoch": 0.9642857142857143,
"grad_norm": 1.1488829851150513,
"learning_rate": 1.6686299920013388e-05,
"loss": 0.5828,
"step": 432
},
{
"epoch": 0.9665178571428571,
"grad_norm": 0.9636843800544739,
"learning_rate": 1.666749989310691e-05,
"loss": 0.5752,
"step": 433
},
{
"epoch": 0.96875,
"grad_norm": 1.0637983083724976,
"learning_rate": 1.6648657343408517e-05,
"loss": 0.5987,
"step": 434
},
{
"epoch": 0.9709821428571429,
"grad_norm": 1.0497316122055054,
"learning_rate": 1.6629772391088855e-05,
"loss": 0.5571,
"step": 435
},
{
"epoch": 0.9732142857142857,
"grad_norm": 1.2553322315216064,
"learning_rate": 1.661084515658901e-05,
"loss": 0.6733,
"step": 436
},
{
"epoch": 0.9754464285714286,
"grad_norm": 0.9725781083106995,
"learning_rate": 1.6591875760619718e-05,
"loss": 0.4813,
"step": 437
},
{
"epoch": 0.9776785714285714,
"grad_norm": 1.3850640058517456,
"learning_rate": 1.6572864324160617e-05,
"loss": 0.6368,
"step": 438
},
{
"epoch": 0.9799107142857143,
"grad_norm": 1.094842791557312,
"learning_rate": 1.6553810968459455e-05,
"loss": 0.5475,
"step": 439
},
{
"epoch": 0.9821428571428571,
"grad_norm": 1.1491671800613403,
"learning_rate": 1.6534715815031325e-05,
"loss": 0.6112,
"step": 440
},
{
"epoch": 0.984375,
"grad_norm": 1.1847591400146484,
"learning_rate": 1.651557898565789e-05,
"loss": 0.6173,
"step": 441
},
{
"epoch": 0.9866071428571429,
"grad_norm": 1.0778274536132812,
"learning_rate": 1.649640060238661e-05,
"loss": 0.5835,
"step": 442
},
{
"epoch": 0.9888392857142857,
"grad_norm": 1.1943531036376953,
"learning_rate": 1.6477180787529957e-05,
"loss": 0.6343,
"step": 443
},
{
"epoch": 0.9910714285714286,
"grad_norm": 1.1000702381134033,
"learning_rate": 1.645791966366464e-05,
"loss": 0.5547,
"step": 444
},
{
"epoch": 0.9933035714285714,
"grad_norm": 1.0542763471603394,
"learning_rate": 1.6438617353630823e-05,
"loss": 0.5713,
"step": 445
},
{
"epoch": 0.9955357142857143,
"grad_norm": 1.0858733654022217,
"learning_rate": 1.6419273980531333e-05,
"loss": 0.6663,
"step": 446
},
{
"epoch": 0.9977678571428571,
"grad_norm": 1.0207633972167969,
"learning_rate": 1.6399889667730887e-05,
"loss": 0.5343,
"step": 447
},
{
"epoch": 1.0,
"grad_norm": 1.1045469045639038,
"learning_rate": 1.63804645388553e-05,
"loss": 0.544,
"step": 448
},
{
"epoch": 1.0,
"eval_loss": 0.5578042268753052,
"eval_runtime": 28.1156,
"eval_samples_per_second": 2.596,
"eval_steps_per_second": 0.356,
"step": 448
},
{
"epoch": 1.0022321428571428,
"grad_norm": 1.2090537548065186,
"learning_rate": 1.6360998717790694e-05,
"loss": 0.3826,
"step": 449
},
{
"epoch": 1.0044642857142858,
"grad_norm": 1.4487948417663574,
"learning_rate": 1.6341492328682703e-05,
"loss": 0.5219,
"step": 450
},
{
"epoch": 1.0066964285714286,
"grad_norm": 1.1036040782928467,
"learning_rate": 1.6321945495935717e-05,
"loss": 0.4923,
"step": 451
},
{
"epoch": 1.0089285714285714,
"grad_norm": 1.0219038724899292,
"learning_rate": 1.6302358344212025e-05,
"loss": 0.4067,
"step": 452
},
{
"epoch": 1.0111607142857142,
"grad_norm": 1.129643440246582,
"learning_rate": 1.6282730998431072e-05,
"loss": 0.4884,
"step": 453
},
{
"epoch": 1.0133928571428572,
"grad_norm": 1.1244534254074097,
"learning_rate": 1.6263063583768652e-05,
"loss": 0.3948,
"step": 454
},
{
"epoch": 1.015625,
"grad_norm": 1.1327791213989258,
"learning_rate": 1.624335622565609e-05,
"loss": 0.46,
"step": 455
},
{
"epoch": 1.0178571428571428,
"grad_norm": 1.2120473384857178,
"learning_rate": 1.622360904977946e-05,
"loss": 0.4226,
"step": 456
},
{
"epoch": 1.0200892857142858,
"grad_norm": 1.6121370792388916,
"learning_rate": 1.6203822182078777e-05,
"loss": 0.5237,
"step": 457
},
{
"epoch": 1.0223214285714286,
"grad_norm": 1.3280739784240723,
"learning_rate": 1.6183995748747204e-05,
"loss": 0.4842,
"step": 458
},
{
"epoch": 1.0245535714285714,
"grad_norm": 1.1011444330215454,
"learning_rate": 1.6164129876230226e-05,
"loss": 0.3867,
"step": 459
},
{
"epoch": 1.0267857142857142,
"grad_norm": 1.1854206323623657,
"learning_rate": 1.6144224691224868e-05,
"loss": 0.4239,
"step": 460
},
{
"epoch": 1.0290178571428572,
"grad_norm": 1.2774749994277954,
"learning_rate": 1.6124280320678864e-05,
"loss": 0.4665,
"step": 461
},
{
"epoch": 1.03125,
"grad_norm": 1.2907570600509644,
"learning_rate": 1.6104296891789867e-05,
"loss": 0.5446,
"step": 462
},
{
"epoch": 1.0334821428571428,
"grad_norm": 1.1413031816482544,
"learning_rate": 1.608427453200463e-05,
"loss": 0.4177,
"step": 463
},
{
"epoch": 1.0357142857142858,
"grad_norm": 1.1599458456039429,
"learning_rate": 1.606421336901818e-05,
"loss": 0.4736,
"step": 464
},
{
"epoch": 1.0379464285714286,
"grad_norm": 1.319823145866394,
"learning_rate": 1.6044113530773034e-05,
"loss": 0.5214,
"step": 465
},
{
"epoch": 1.0401785714285714,
"grad_norm": 1.1600507497787476,
"learning_rate": 1.6023975145458352e-05,
"loss": 0.4226,
"step": 466
},
{
"epoch": 1.0424107142857142,
"grad_norm": 0.9671021699905396,
"learning_rate": 1.600379834150914e-05,
"loss": 0.3623,
"step": 467
},
{
"epoch": 1.0446428571428572,
"grad_norm": 0.9996952414512634,
"learning_rate": 1.5983583247605414e-05,
"loss": 0.423,
"step": 468
},
{
"epoch": 1.046875,
"grad_norm": 1.362450122833252,
"learning_rate": 1.5963329992671402e-05,
"loss": 0.4763,
"step": 469
},
{
"epoch": 1.0491071428571428,
"grad_norm": 1.3633580207824707,
"learning_rate": 1.5943038705874697e-05,
"loss": 0.4702,
"step": 470
},
{
"epoch": 1.0513392857142858,
"grad_norm": 1.2059600353240967,
"learning_rate": 1.5922709516625453e-05,
"loss": 0.4417,
"step": 471
},
{
"epoch": 1.0535714285714286,
"grad_norm": 1.2279843091964722,
"learning_rate": 1.590234255457555e-05,
"loss": 0.4629,
"step": 472
},
{
"epoch": 1.0558035714285714,
"grad_norm": 1.2962024211883545,
"learning_rate": 1.588193794961776e-05,
"loss": 0.4493,
"step": 473
},
{
"epoch": 1.0580357142857142,
"grad_norm": 0.9999826550483704,
"learning_rate": 1.5861495831884942e-05,
"loss": 0.3847,
"step": 474
},
{
"epoch": 1.0602678571428572,
"grad_norm": 1.0514469146728516,
"learning_rate": 1.5841016331749185e-05,
"loss": 0.4358,
"step": 475
},
{
"epoch": 1.0625,
"grad_norm": 1.110394835472107,
"learning_rate": 1.582049957982099e-05,
"loss": 0.3376,
"step": 476
},
{
"epoch": 1.0647321428571428,
"grad_norm": 1.2494930028915405,
"learning_rate": 1.5799945706948447e-05,
"loss": 0.4125,
"step": 477
},
{
"epoch": 1.0669642857142858,
"grad_norm": 1.2270575761795044,
"learning_rate": 1.5779354844216377e-05,
"loss": 0.4493,
"step": 478
},
{
"epoch": 1.0691964285714286,
"grad_norm": 1.3279017210006714,
"learning_rate": 1.5758727122945514e-05,
"loss": 0.498,
"step": 479
},
{
"epoch": 1.0714285714285714,
"grad_norm": 1.0876953601837158,
"learning_rate": 1.5738062674691657e-05,
"loss": 0.475,
"step": 480
},
{
"epoch": 1.0736607142857142,
"grad_norm": 1.154995322227478,
"learning_rate": 1.5717361631244842e-05,
"loss": 0.4415,
"step": 481
},
{
"epoch": 1.0758928571428572,
"grad_norm": 1.1559693813323975,
"learning_rate": 1.5696624124628495e-05,
"loss": 0.4855,
"step": 482
},
{
"epoch": 1.078125,
"grad_norm": 1.4143540859222412,
"learning_rate": 1.5675850287098585e-05,
"loss": 0.5263,
"step": 483
},
{
"epoch": 1.0803571428571428,
"grad_norm": 1.0588111877441406,
"learning_rate": 1.5655040251142787e-05,
"loss": 0.444,
"step": 484
},
{
"epoch": 1.0825892857142858,
"grad_norm": 1.0768473148345947,
"learning_rate": 1.5634194149479642e-05,
"loss": 0.4086,
"step": 485
},
{
"epoch": 1.0848214285714286,
"grad_norm": 1.2432984113693237,
"learning_rate": 1.5613312115057697e-05,
"loss": 0.5302,
"step": 486
},
{
"epoch": 1.0870535714285714,
"grad_norm": 1.0816019773483276,
"learning_rate": 1.559239428105467e-05,
"loss": 0.4353,
"step": 487
},
{
"epoch": 1.0892857142857142,
"grad_norm": 1.3731716871261597,
"learning_rate": 1.5571440780876588e-05,
"loss": 0.4319,
"step": 488
},
{
"epoch": 1.0915178571428572,
"grad_norm": 1.2017320394515991,
"learning_rate": 1.5550451748156957e-05,
"loss": 0.4251,
"step": 489
},
{
"epoch": 1.09375,
"grad_norm": 1.1153476238250732,
"learning_rate": 1.5529427316755876e-05,
"loss": 0.4685,
"step": 490
},
{
"epoch": 1.0959821428571428,
"grad_norm": 1.0116766691207886,
"learning_rate": 1.5508367620759224e-05,
"loss": 0.3844,
"step": 491
},
{
"epoch": 1.0982142857142858,
"grad_norm": 1.1723082065582275,
"learning_rate": 1.548727279447777e-05,
"loss": 0.4365,
"step": 492
},
{
"epoch": 1.1004464285714286,
"grad_norm": 1.2044928073883057,
"learning_rate": 1.546614297244634e-05,
"loss": 0.4114,
"step": 493
},
{
"epoch": 1.1026785714285714,
"grad_norm": 1.3545663356781006,
"learning_rate": 1.5444978289422937e-05,
"loss": 0.4485,
"step": 494
},
{
"epoch": 1.1049107142857142,
"grad_norm": 1.1752678155899048,
"learning_rate": 1.542377888038791e-05,
"loss": 0.4622,
"step": 495
},
{
"epoch": 1.1071428571428572,
"grad_norm": 1.1464520692825317,
"learning_rate": 1.540254488054307e-05,
"loss": 0.4101,
"step": 496
},
{
"epoch": 1.109375,
"grad_norm": 1.1562168598175049,
"learning_rate": 1.538127642531083e-05,
"loss": 0.4391,
"step": 497
},
{
"epoch": 1.1116071428571428,
"grad_norm": 1.3013789653778076,
"learning_rate": 1.5359973650333352e-05,
"loss": 0.5043,
"step": 498
},
{
"epoch": 1.1138392857142858,
"grad_norm": 1.0924135446548462,
"learning_rate": 1.533863669147168e-05,
"loss": 0.4345,
"step": 499
},
{
"epoch": 1.1160714285714286,
"grad_norm": 1.099307894706726,
"learning_rate": 1.5317265684804865e-05,
"loss": 0.423,
"step": 500
},
{
"epoch": 1.1183035714285714,
"grad_norm": 1.159507155418396,
"learning_rate": 1.5295860766629098e-05,
"loss": 0.4412,
"step": 501
},
{
"epoch": 1.1205357142857142,
"grad_norm": 1.2187080383300781,
"learning_rate": 1.5274422073456853e-05,
"loss": 0.4726,
"step": 502
},
{
"epoch": 1.1227678571428572,
"grad_norm": 1.1509562730789185,
"learning_rate": 1.5252949742016005e-05,
"loss": 0.3894,
"step": 503
},
{
"epoch": 1.125,
"grad_norm": 1.2710036039352417,
"learning_rate": 1.5231443909248956e-05,
"loss": 0.5062,
"step": 504
},
{
"epoch": 1.1272321428571428,
"grad_norm": 1.172450304031372,
"learning_rate": 1.5209904712311777e-05,
"loss": 0.409,
"step": 505
},
{
"epoch": 1.1294642857142858,
"grad_norm": 1.2753404378890991,
"learning_rate": 1.5188332288573313e-05,
"loss": 0.44,
"step": 506
},
{
"epoch": 1.1316964285714286,
"grad_norm": 1.1560388803482056,
"learning_rate": 1.5166726775614327e-05,
"loss": 0.4809,
"step": 507
},
{
"epoch": 1.1339285714285714,
"grad_norm": 1.3690383434295654,
"learning_rate": 1.5145088311226599e-05,
"loss": 0.4882,
"step": 508
},
{
"epoch": 1.1361607142857142,
"grad_norm": 1.2509236335754395,
"learning_rate": 1.5123417033412078e-05,
"loss": 0.3845,
"step": 509
},
{
"epoch": 1.1383928571428572,
"grad_norm": 1.1513557434082031,
"learning_rate": 1.510171308038197e-05,
"loss": 0.4387,
"step": 510
},
{
"epoch": 1.140625,
"grad_norm": 1.3206255435943604,
"learning_rate": 1.5079976590555876e-05,
"loss": 0.4861,
"step": 511
},
{
"epoch": 1.1428571428571428,
"grad_norm": 1.1096549034118652,
"learning_rate": 1.5058207702560907e-05,
"loss": 0.4612,
"step": 512
},
{
"epoch": 1.1450892857142858,
"grad_norm": 1.1876708269119263,
"learning_rate": 1.5036406555230794e-05,
"loss": 0.4633,
"step": 513
},
{
"epoch": 1.1473214285714286,
"grad_norm": 1.07047438621521,
"learning_rate": 1.501457328760501e-05,
"loss": 0.3813,
"step": 514
},
{
"epoch": 1.1495535714285714,
"grad_norm": 1.0505889654159546,
"learning_rate": 1.499270803892787e-05,
"loss": 0.3661,
"step": 515
},
{
"epoch": 1.1517857142857142,
"grad_norm": 1.243187665939331,
"learning_rate": 1.4970810948647664e-05,
"loss": 0.4753,
"step": 516
},
{
"epoch": 1.1540178571428572,
"grad_norm": 1.1276707649230957,
"learning_rate": 1.4948882156415748e-05,
"loss": 0.4216,
"step": 517
},
{
"epoch": 1.15625,
"grad_norm": 1.1489506959915161,
"learning_rate": 1.4926921802085662e-05,
"loss": 0.5241,
"step": 518
},
{
"epoch": 1.1584821428571428,
"grad_norm": 1.2082480192184448,
"learning_rate": 1.4904930025712236e-05,
"loss": 0.4244,
"step": 519
},
{
"epoch": 1.1607142857142858,
"grad_norm": 1.0853203535079956,
"learning_rate": 1.4882906967550708e-05,
"loss": 0.4449,
"step": 520
},
{
"epoch": 1.1629464285714286,
"grad_norm": 1.0796903371810913,
"learning_rate": 1.4860852768055804e-05,
"loss": 0.4915,
"step": 521
},
{
"epoch": 1.1651785714285714,
"grad_norm": 1.31143057346344,
"learning_rate": 1.4838767567880865e-05,
"loss": 0.4262,
"step": 522
},
{
"epoch": 1.1674107142857142,
"grad_norm": 1.1698493957519531,
"learning_rate": 1.4816651507876946e-05,
"loss": 0.4845,
"step": 523
},
{
"epoch": 1.1696428571428572,
"grad_norm": 1.1286602020263672,
"learning_rate": 1.479450472909191e-05,
"loss": 0.3967,
"step": 524
},
{
"epoch": 1.171875,
"grad_norm": 1.054138422012329,
"learning_rate": 1.4772327372769533e-05,
"loss": 0.4502,
"step": 525
},
{
"epoch": 1.1741071428571428,
"grad_norm": 1.3807618618011475,
"learning_rate": 1.4750119580348601e-05,
"loss": 0.5044,
"step": 526
},
{
"epoch": 1.1763392857142858,
"grad_norm": 1.2077445983886719,
"learning_rate": 1.4727881493462018e-05,
"loss": 0.3643,
"step": 527
},
{
"epoch": 1.1785714285714286,
"grad_norm": 1.1224011182785034,
"learning_rate": 1.4705613253935886e-05,
"loss": 0.4503,
"step": 528
},
{
"epoch": 1.1808035714285714,
"grad_norm": 1.0725282430648804,
"learning_rate": 1.4683315003788614e-05,
"loss": 0.4861,
"step": 529
},
{
"epoch": 1.1830357142857142,
"grad_norm": 1.2990797758102417,
"learning_rate": 1.4660986885230002e-05,
"loss": 0.4194,
"step": 530
},
{
"epoch": 1.1852678571428572,
"grad_norm": 1.075061321258545,
"learning_rate": 1.463862904066035e-05,
"loss": 0.4469,
"step": 531
},
{
"epoch": 1.1875,
"grad_norm": 1.1602604389190674,
"learning_rate": 1.4616241612669523e-05,
"loss": 0.433,
"step": 532
},
{
"epoch": 1.1897321428571428,
"grad_norm": 1.1690677404403687,
"learning_rate": 1.4593824744036078e-05,
"loss": 0.4625,
"step": 533
},
{
"epoch": 1.1919642857142858,
"grad_norm": 1.1553549766540527,
"learning_rate": 1.4571378577726317e-05,
"loss": 0.4143,
"step": 534
},
{
"epoch": 1.1941964285714286,
"grad_norm": 1.2570900917053223,
"learning_rate": 1.4548903256893392e-05,
"loss": 0.4434,
"step": 535
},
{
"epoch": 1.1964285714285714,
"grad_norm": 1.2540876865386963,
"learning_rate": 1.4526398924876407e-05,
"loss": 0.461,
"step": 536
},
{
"epoch": 1.1986607142857142,
"grad_norm": 1.1208131313323975,
"learning_rate": 1.4503865725199468e-05,
"loss": 0.4251,
"step": 537
},
{
"epoch": 1.2008928571428572,
"grad_norm": 1.0891457796096802,
"learning_rate": 1.4481303801570805e-05,
"loss": 0.4534,
"step": 538
},
{
"epoch": 1.203125,
"grad_norm": 0.965118944644928,
"learning_rate": 1.4458713297881828e-05,
"loss": 0.4057,
"step": 539
},
{
"epoch": 1.2053571428571428,
"grad_norm": 1.086959958076477,
"learning_rate": 1.4436094358206224e-05,
"loss": 0.4249,
"step": 540
},
{
"epoch": 1.2075892857142858,
"grad_norm": 1.0582690238952637,
"learning_rate": 1.4413447126799038e-05,
"loss": 0.3932,
"step": 541
},
{
"epoch": 1.2098214285714286,
"grad_norm": 0.9543986916542053,
"learning_rate": 1.4390771748095735e-05,
"loss": 0.3234,
"step": 542
},
{
"epoch": 1.2120535714285714,
"grad_norm": 1.2138005495071411,
"learning_rate": 1.436806836671131e-05,
"loss": 0.4154,
"step": 543
},
{
"epoch": 1.2142857142857142,
"grad_norm": 1.23991060256958,
"learning_rate": 1.4345337127439333e-05,
"loss": 0.4757,
"step": 544
},
{
"epoch": 1.2165178571428572,
"grad_norm": 1.1649141311645508,
"learning_rate": 1.4322578175251058e-05,
"loss": 0.4841,
"step": 545
},
{
"epoch": 1.21875,
"grad_norm": 1.238889217376709,
"learning_rate": 1.4299791655294461e-05,
"loss": 0.4381,
"step": 546
},
{
"epoch": 1.2209821428571428,
"grad_norm": 1.1498550176620483,
"learning_rate": 1.4276977712893357e-05,
"loss": 0.4608,
"step": 547
},
{
"epoch": 1.2232142857142858,
"grad_norm": 1.2343175411224365,
"learning_rate": 1.4254136493546432e-05,
"loss": 0.4884,
"step": 548
},
{
"epoch": 1.2254464285714286,
"grad_norm": 1.2278847694396973,
"learning_rate": 1.4231268142926345e-05,
"loss": 0.489,
"step": 549
},
{
"epoch": 1.2276785714285714,
"grad_norm": 1.3790256977081299,
"learning_rate": 1.4208372806878782e-05,
"loss": 0.4945,
"step": 550
},
{
"epoch": 1.2299107142857142,
"grad_norm": 1.2269234657287598,
"learning_rate": 1.4185450631421542e-05,
"loss": 0.5471,
"step": 551
},
{
"epoch": 1.2321428571428572,
"grad_norm": 1.1349695920944214,
"learning_rate": 1.4162501762743579e-05,
"loss": 0.4547,
"step": 552
},
{
"epoch": 1.234375,
"grad_norm": 1.1662615537643433,
"learning_rate": 1.41395263472041e-05,
"loss": 0.478,
"step": 553
},
{
"epoch": 1.2366071428571428,
"grad_norm": 1.1003633737564087,
"learning_rate": 1.4116524531331616e-05,
"loss": 0.4237,
"step": 554
},
{
"epoch": 1.2388392857142858,
"grad_norm": 1.012122631072998,
"learning_rate": 1.4093496461823002e-05,
"loss": 0.4799,
"step": 555
},
{
"epoch": 1.2410714285714286,
"grad_norm": 1.0672236680984497,
"learning_rate": 1.4070442285542579e-05,
"loss": 0.4342,
"step": 556
},
{
"epoch": 1.2433035714285714,
"grad_norm": 1.1434969902038574,
"learning_rate": 1.4047362149521152e-05,
"loss": 0.4758,
"step": 557
},
{
"epoch": 1.2455357142857142,
"grad_norm": 1.1280899047851562,
"learning_rate": 1.402425620095511e-05,
"loss": 0.4325,
"step": 558
},
{
"epoch": 1.2477678571428572,
"grad_norm": 1.074324369430542,
"learning_rate": 1.400112458720544e-05,
"loss": 0.4504,
"step": 559
},
{
"epoch": 1.25,
"grad_norm": 1.0355358123779297,
"learning_rate": 1.3977967455796828e-05,
"loss": 0.464,
"step": 560
},
{
"epoch": 1.25,
"eval_loss": 0.5843456983566284,
"eval_runtime": 27.5507,
"eval_samples_per_second": 2.65,
"eval_steps_per_second": 0.363,
"step": 560
},
{
"epoch": 1.2522321428571428,
"grad_norm": 1.1094245910644531,
"learning_rate": 1.3954784954416703e-05,
"loss": 0.458,
"step": 561
},
{
"epoch": 1.2544642857142856,
"grad_norm": 1.1554150581359863,
"learning_rate": 1.393157723091428e-05,
"loss": 0.4661,
"step": 562
},
{
"epoch": 1.2566964285714286,
"grad_norm": 1.195674180984497,
"learning_rate": 1.3908344433299644e-05,
"loss": 0.5074,
"step": 563
},
{
"epoch": 1.2589285714285714,
"grad_norm": 1.067400336265564,
"learning_rate": 1.3885086709742788e-05,
"loss": 0.3862,
"step": 564
},
{
"epoch": 1.2611607142857144,
"grad_norm": 1.1013455390930176,
"learning_rate": 1.3861804208572674e-05,
"loss": 0.4355,
"step": 565
},
{
"epoch": 1.2633928571428572,
"grad_norm": 1.1833617687225342,
"learning_rate": 1.3838497078276288e-05,
"loss": 0.4716,
"step": 566
},
{
"epoch": 1.265625,
"grad_norm": 1.09175705909729,
"learning_rate": 1.3815165467497686e-05,
"loss": 0.4745,
"step": 567
},
{
"epoch": 1.2678571428571428,
"grad_norm": 1.2390869855880737,
"learning_rate": 1.3791809525037057e-05,
"loss": 0.428,
"step": 568
},
{
"epoch": 1.2700892857142856,
"grad_norm": 1.2348787784576416,
"learning_rate": 1.376842939984977e-05,
"loss": 0.3749,
"step": 569
},
{
"epoch": 1.2723214285714286,
"grad_norm": 0.9252075552940369,
"learning_rate": 1.3745025241045414e-05,
"loss": 0.4135,
"step": 570
},
{
"epoch": 1.2745535714285714,
"grad_norm": 1.3814020156860352,
"learning_rate": 1.372159719788686e-05,
"loss": 0.4562,
"step": 571
},
{
"epoch": 1.2767857142857144,
"grad_norm": 1.3983325958251953,
"learning_rate": 1.3698145419789302e-05,
"loss": 0.4768,
"step": 572
},
{
"epoch": 1.2790178571428572,
"grad_norm": 1.184475064277649,
"learning_rate": 1.3674670056319315e-05,
"loss": 0.4687,
"step": 573
},
{
"epoch": 1.28125,
"grad_norm": 1.053714632987976,
"learning_rate": 1.3651171257193883e-05,
"loss": 0.4564,
"step": 574
},
{
"epoch": 1.2834821428571428,
"grad_norm": 1.1729894876480103,
"learning_rate": 1.3627649172279453e-05,
"loss": 0.4586,
"step": 575
},
{
"epoch": 1.2857142857142856,
"grad_norm": 1.1714091300964355,
"learning_rate": 1.3604103951590993e-05,
"loss": 0.4365,
"step": 576
},
{
"epoch": 1.2879464285714286,
"grad_norm": 1.2077913284301758,
"learning_rate": 1.3580535745291001e-05,
"loss": 0.4415,
"step": 577
},
{
"epoch": 1.2901785714285714,
"grad_norm": 1.2015424966812134,
"learning_rate": 1.3556944703688592e-05,
"loss": 0.4644,
"step": 578
},
{
"epoch": 1.2924107142857144,
"grad_norm": 1.0738661289215088,
"learning_rate": 1.3533330977238496e-05,
"loss": 0.4131,
"step": 579
},
{
"epoch": 1.2946428571428572,
"grad_norm": 1.1716837882995605,
"learning_rate": 1.3509694716540135e-05,
"loss": 0.4083,
"step": 580
},
{
"epoch": 1.296875,
"grad_norm": 0.9854421019554138,
"learning_rate": 1.348603607233663e-05,
"loss": 0.3796,
"step": 581
},
{
"epoch": 1.2991071428571428,
"grad_norm": 1.105299949645996,
"learning_rate": 1.3462355195513868e-05,
"loss": 0.4918,
"step": 582
},
{
"epoch": 1.3013392857142856,
"grad_norm": 1.2206473350524902,
"learning_rate": 1.343865223709952e-05,
"loss": 0.4594,
"step": 583
},
{
"epoch": 1.3035714285714286,
"grad_norm": 1.1916580200195312,
"learning_rate": 1.341492734826209e-05,
"loss": 0.482,
"step": 584
},
{
"epoch": 1.3058035714285714,
"grad_norm": 1.0674858093261719,
"learning_rate": 1.3391180680309945e-05,
"loss": 0.4192,
"step": 585
},
{
"epoch": 1.3080357142857144,
"grad_norm": 1.403185486793518,
"learning_rate": 1.3367412384690346e-05,
"loss": 0.5409,
"step": 586
},
{
"epoch": 1.3102678571428572,
"grad_norm": 1.0415091514587402,
"learning_rate": 1.3343622612988492e-05,
"loss": 0.4695,
"step": 587
},
{
"epoch": 1.3125,
"grad_norm": 1.290241003036499,
"learning_rate": 1.3319811516926541e-05,
"loss": 0.4639,
"step": 588
},
{
"epoch": 1.3147321428571428,
"grad_norm": 0.9970055222511292,
"learning_rate": 1.329597924836267e-05,
"loss": 0.428,
"step": 589
},
{
"epoch": 1.3169642857142856,
"grad_norm": 1.2482655048370361,
"learning_rate": 1.3272125959290059e-05,
"loss": 0.4967,
"step": 590
},
{
"epoch": 1.3191964285714286,
"grad_norm": 1.0975176095962524,
"learning_rate": 1.3248251801835968e-05,
"loss": 0.4343,
"step": 591
},
{
"epoch": 1.3214285714285714,
"grad_norm": 1.1875313520431519,
"learning_rate": 1.3224356928260735e-05,
"loss": 0.387,
"step": 592
},
{
"epoch": 1.3236607142857144,
"grad_norm": 1.1995247602462769,
"learning_rate": 1.3200441490956832e-05,
"loss": 0.4853,
"step": 593
},
{
"epoch": 1.3258928571428572,
"grad_norm": 1.2359397411346436,
"learning_rate": 1.317650564244787e-05,
"loss": 0.4778,
"step": 594
},
{
"epoch": 1.328125,
"grad_norm": 0.9895375967025757,
"learning_rate": 1.3152549535387624e-05,
"loss": 0.4386,
"step": 595
},
{
"epoch": 1.3303571428571428,
"grad_norm": 1.1084294319152832,
"learning_rate": 1.3128573322559097e-05,
"loss": 0.4152,
"step": 596
},
{
"epoch": 1.3325892857142856,
"grad_norm": 1.230063557624817,
"learning_rate": 1.3104577156873496e-05,
"loss": 0.413,
"step": 597
},
{
"epoch": 1.3348214285714286,
"grad_norm": 1.1854337453842163,
"learning_rate": 1.3080561191369286e-05,
"loss": 0.471,
"step": 598
},
{
"epoch": 1.3370535714285714,
"grad_norm": 0.9070828557014465,
"learning_rate": 1.3056525579211215e-05,
"loss": 0.3926,
"step": 599
},
{
"epoch": 1.3392857142857144,
"grad_norm": 1.3953931331634521,
"learning_rate": 1.3032470473689322e-05,
"loss": 0.4771,
"step": 600
},
{
"epoch": 1.3415178571428572,
"grad_norm": 1.1881200075149536,
"learning_rate": 1.3008396028217969e-05,
"loss": 0.4817,
"step": 601
},
{
"epoch": 1.34375,
"grad_norm": 1.27316153049469,
"learning_rate": 1.298430239633486e-05,
"loss": 0.4898,
"step": 602
},
{
"epoch": 1.3459821428571428,
"grad_norm": 1.0014166831970215,
"learning_rate": 1.296018973170007e-05,
"loss": 0.4285,
"step": 603
},
{
"epoch": 1.3482142857142856,
"grad_norm": 1.0519405603408813,
"learning_rate": 1.2936058188095045e-05,
"loss": 0.4123,
"step": 604
},
{
"epoch": 1.3504464285714286,
"grad_norm": 1.192347526550293,
"learning_rate": 1.2911907919421647e-05,
"loss": 0.5152,
"step": 605
},
{
"epoch": 1.3526785714285714,
"grad_norm": 1.1087520122528076,
"learning_rate": 1.2887739079701147e-05,
"loss": 0.4679,
"step": 606
},
{
"epoch": 1.3549107142857144,
"grad_norm": 1.2100986242294312,
"learning_rate": 1.2863551823073266e-05,
"loss": 0.4792,
"step": 607
},
{
"epoch": 1.3571428571428572,
"grad_norm": 1.1435983180999756,
"learning_rate": 1.2839346303795173e-05,
"loss": 0.4087,
"step": 608
},
{
"epoch": 1.359375,
"grad_norm": 1.0820369720458984,
"learning_rate": 1.2815122676240518e-05,
"loss": 0.3981,
"step": 609
},
{
"epoch": 1.3616071428571428,
"grad_norm": 1.0322455167770386,
"learning_rate": 1.2790881094898428e-05,
"loss": 0.489,
"step": 610
},
{
"epoch": 1.3638392857142856,
"grad_norm": 1.12478768825531,
"learning_rate": 1.2766621714372543e-05,
"loss": 0.3883,
"step": 611
},
{
"epoch": 1.3660714285714286,
"grad_norm": 1.2271108627319336,
"learning_rate": 1.274234468938001e-05,
"loss": 0.4615,
"step": 612
},
{
"epoch": 1.3683035714285714,
"grad_norm": 1.1735365390777588,
"learning_rate": 1.271805017475051e-05,
"loss": 0.421,
"step": 613
},
{
"epoch": 1.3705357142857144,
"grad_norm": 1.11384916305542,
"learning_rate": 1.2693738325425272e-05,
"loss": 0.4568,
"step": 614
},
{
"epoch": 1.3727678571428572,
"grad_norm": 1.5108226537704468,
"learning_rate": 1.266940929645606e-05,
"loss": 0.5374,
"step": 615
},
{
"epoch": 1.375,
"grad_norm": 1.1557199954986572,
"learning_rate": 1.2645063243004236e-05,
"loss": 0.3919,
"step": 616
},
{
"epoch": 1.3772321428571428,
"grad_norm": 1.1951533555984497,
"learning_rate": 1.2620700320339705e-05,
"loss": 0.4521,
"step": 617
},
{
"epoch": 1.3794642857142856,
"grad_norm": 1.1673365831375122,
"learning_rate": 1.2596320683839976e-05,
"loss": 0.4613,
"step": 618
},
{
"epoch": 1.3816964285714286,
"grad_norm": 1.2299069166183472,
"learning_rate": 1.2571924488989145e-05,
"loss": 0.436,
"step": 619
},
{
"epoch": 1.3839285714285714,
"grad_norm": 1.0827404260635376,
"learning_rate": 1.2547511891376916e-05,
"loss": 0.3655,
"step": 620
},
{
"epoch": 1.3861607142857144,
"grad_norm": 1.1951014995574951,
"learning_rate": 1.2523083046697598e-05,
"loss": 0.4677,
"step": 621
},
{
"epoch": 1.3883928571428572,
"grad_norm": 1.2478485107421875,
"learning_rate": 1.2498638110749122e-05,
"loss": 0.4752,
"step": 622
},
{
"epoch": 1.390625,
"grad_norm": 1.0962241888046265,
"learning_rate": 1.2474177239432042e-05,
"loss": 0.4408,
"step": 623
},
{
"epoch": 1.3928571428571428,
"grad_norm": 1.0092295408248901,
"learning_rate": 1.2449700588748541e-05,
"loss": 0.4805,
"step": 624
},
{
"epoch": 1.3950892857142856,
"grad_norm": 1.1849055290222168,
"learning_rate": 1.2425208314801441e-05,
"loss": 0.403,
"step": 625
},
{
"epoch": 1.3973214285714286,
"grad_norm": 1.1802829504013062,
"learning_rate": 1.2400700573793191e-05,
"loss": 0.4818,
"step": 626
},
{
"epoch": 1.3995535714285714,
"grad_norm": 1.2204647064208984,
"learning_rate": 1.23761775220249e-05,
"loss": 0.4584,
"step": 627
},
{
"epoch": 1.4017857142857144,
"grad_norm": 1.3409292697906494,
"learning_rate": 1.2351639315895309e-05,
"loss": 0.5377,
"step": 628
},
{
"epoch": 1.4040178571428572,
"grad_norm": 1.1798982620239258,
"learning_rate": 1.2327086111899816e-05,
"loss": 0.5223,
"step": 629
},
{
"epoch": 1.40625,
"grad_norm": 1.2914079427719116,
"learning_rate": 1.2302518066629467e-05,
"loss": 0.4595,
"step": 630
},
{
"epoch": 1.4084821428571428,
"grad_norm": 1.084916114807129,
"learning_rate": 1.2277935336769961e-05,
"loss": 0.4484,
"step": 631
},
{
"epoch": 1.4107142857142856,
"grad_norm": 1.1137053966522217,
"learning_rate": 1.2253338079100652e-05,
"loss": 0.4465,
"step": 632
},
{
"epoch": 1.4129464285714286,
"grad_norm": 1.1979303359985352,
"learning_rate": 1.2228726450493538e-05,
"loss": 0.4932,
"step": 633
},
{
"epoch": 1.4151785714285714,
"grad_norm": 1.1469295024871826,
"learning_rate": 1.2204100607912277e-05,
"loss": 0.4786,
"step": 634
},
{
"epoch": 1.4174107142857144,
"grad_norm": 1.144337773323059,
"learning_rate": 1.2179460708411177e-05,
"loss": 0.42,
"step": 635
},
{
"epoch": 1.4196428571428572,
"grad_norm": 1.2468199729919434,
"learning_rate": 1.2154806909134198e-05,
"loss": 0.4205,
"step": 636
},
{
"epoch": 1.421875,
"grad_norm": 1.237923264503479,
"learning_rate": 1.213013936731394e-05,
"loss": 0.4724,
"step": 637
},
{
"epoch": 1.4241071428571428,
"grad_norm": 1.166019082069397,
"learning_rate": 1.210545824027066e-05,
"loss": 0.475,
"step": 638
},
{
"epoch": 1.4263392857142856,
"grad_norm": 1.1303397417068481,
"learning_rate": 1.2080763685411243e-05,
"loss": 0.4825,
"step": 639
},
{
"epoch": 1.4285714285714286,
"grad_norm": 1.0004240274429321,
"learning_rate": 1.205605586022822e-05,
"loss": 0.4399,
"step": 640
},
{
"epoch": 1.4308035714285714,
"grad_norm": 1.0718317031860352,
"learning_rate": 1.2031334922298749e-05,
"loss": 0.4585,
"step": 641
},
{
"epoch": 1.4330357142857144,
"grad_norm": 1.0824089050292969,
"learning_rate": 1.2006601029283629e-05,
"loss": 0.4365,
"step": 642
},
{
"epoch": 1.4352678571428572,
"grad_norm": 1.1855394840240479,
"learning_rate": 1.1981854338926262e-05,
"loss": 0.4693,
"step": 643
},
{
"epoch": 1.4375,
"grad_norm": 1.056105375289917,
"learning_rate": 1.1957095009051683e-05,
"loss": 0.4306,
"step": 644
},
{
"epoch": 1.4397321428571428,
"grad_norm": 1.109755516052246,
"learning_rate": 1.193232319756553e-05,
"loss": 0.4184,
"step": 645
},
{
"epoch": 1.4419642857142856,
"grad_norm": 1.1697285175323486,
"learning_rate": 1.1907539062453044e-05,
"loss": 0.4157,
"step": 646
},
{
"epoch": 1.4441964285714286,
"grad_norm": 1.3334587812423706,
"learning_rate": 1.1882742761778069e-05,
"loss": 0.4534,
"step": 647
},
{
"epoch": 1.4464285714285714,
"grad_norm": 1.3505901098251343,
"learning_rate": 1.1857934453682016e-05,
"loss": 0.4529,
"step": 648
},
{
"epoch": 1.4486607142857144,
"grad_norm": 1.154270887374878,
"learning_rate": 1.1833114296382903e-05,
"loss": 0.4426,
"step": 649
},
{
"epoch": 1.4508928571428572,
"grad_norm": 1.0679898262023926,
"learning_rate": 1.1808282448174295e-05,
"loss": 0.3802,
"step": 650
},
{
"epoch": 1.453125,
"grad_norm": 1.2348114252090454,
"learning_rate": 1.1783439067424329e-05,
"loss": 0.467,
"step": 651
},
{
"epoch": 1.4553571428571428,
"grad_norm": 1.1057263612747192,
"learning_rate": 1.1758584312574693e-05,
"loss": 0.4077,
"step": 652
},
{
"epoch": 1.4575892857142856,
"grad_norm": 1.0530842542648315,
"learning_rate": 1.17337183421396e-05,
"loss": 0.4221,
"step": 653
},
{
"epoch": 1.4598214285714286,
"grad_norm": 1.0335931777954102,
"learning_rate": 1.1708841314704811e-05,
"loss": 0.4917,
"step": 654
},
{
"epoch": 1.4620535714285714,
"grad_norm": 0.9055652618408203,
"learning_rate": 1.1683953388926592e-05,
"loss": 0.3894,
"step": 655
},
{
"epoch": 1.4642857142857144,
"grad_norm": 1.1665129661560059,
"learning_rate": 1.1659054723530721e-05,
"loss": 0.4008,
"step": 656
},
{
"epoch": 1.4665178571428572,
"grad_norm": 1.3352913856506348,
"learning_rate": 1.163414547731146e-05,
"loss": 0.4859,
"step": 657
},
{
"epoch": 1.46875,
"grad_norm": 1.474576473236084,
"learning_rate": 1.1609225809130566e-05,
"loss": 0.4766,
"step": 658
},
{
"epoch": 1.4709821428571428,
"grad_norm": 0.9900811314582825,
"learning_rate": 1.1584295877916251e-05,
"loss": 0.3852,
"step": 659
},
{
"epoch": 1.4732142857142856,
"grad_norm": 1.0743541717529297,
"learning_rate": 1.1559355842662188e-05,
"loss": 0.4747,
"step": 660
},
{
"epoch": 1.4754464285714286,
"grad_norm": 1.0290722846984863,
"learning_rate": 1.1534405862426481e-05,
"loss": 0.4397,
"step": 661
},
{
"epoch": 1.4776785714285714,
"grad_norm": 1.0267919301986694,
"learning_rate": 1.150944609633067e-05,
"loss": 0.4738,
"step": 662
},
{
"epoch": 1.4799107142857144,
"grad_norm": 1.073096752166748,
"learning_rate": 1.1484476703558698e-05,
"loss": 0.4751,
"step": 663
},
{
"epoch": 1.4821428571428572,
"grad_norm": 1.1763546466827393,
"learning_rate": 1.1459497843355907e-05,
"loss": 0.4471,
"step": 664
},
{
"epoch": 1.484375,
"grad_norm": 1.1504164934158325,
"learning_rate": 1.1434509675028018e-05,
"loss": 0.4272,
"step": 665
},
{
"epoch": 1.4866071428571428,
"grad_norm": 1.0036594867706299,
"learning_rate": 1.1409512357940114e-05,
"loss": 0.4174,
"step": 666
},
{
"epoch": 1.4888392857142856,
"grad_norm": 1.3433315753936768,
"learning_rate": 1.138450605151563e-05,
"loss": 0.4634,
"step": 667
},
{
"epoch": 1.4910714285714286,
"grad_norm": 1.1281894445419312,
"learning_rate": 1.1359490915235323e-05,
"loss": 0.5026,
"step": 668
},
{
"epoch": 1.4933035714285714,
"grad_norm": 1.0535964965820312,
"learning_rate": 1.1334467108636273e-05,
"loss": 0.4849,
"step": 669
},
{
"epoch": 1.4955357142857144,
"grad_norm": 1.028182864189148,
"learning_rate": 1.1309434791310848e-05,
"loss": 0.5133,
"step": 670
},
{
"epoch": 1.4977678571428572,
"grad_norm": 1.0213048458099365,
"learning_rate": 1.1284394122905697e-05,
"loss": 0.4587,
"step": 671
},
{
"epoch": 1.5,
"grad_norm": 1.157265067100525,
"learning_rate": 1.1259345263120738e-05,
"loss": 0.4129,
"step": 672
},
{
"epoch": 1.5,
"eval_loss": 0.5762594938278198,
"eval_runtime": 28.7549,
"eval_samples_per_second": 2.539,
"eval_steps_per_second": 0.348,
"step": 672
},
{
"epoch": 1.5022321428571428,
"grad_norm": 1.186301589012146,
"learning_rate": 1.1234288371708112e-05,
"loss": 0.4361,
"step": 673
},
{
"epoch": 1.5044642857142856,
"grad_norm": 1.1052271127700806,
"learning_rate": 1.1209223608471202e-05,
"loss": 0.415,
"step": 674
},
{
"epoch": 1.5066964285714286,
"grad_norm": 1.0246312618255615,
"learning_rate": 1.1184151133263578e-05,
"loss": 0.4258,
"step": 675
},
{
"epoch": 1.5089285714285714,
"grad_norm": 1.1626089811325073,
"learning_rate": 1.1159071105988012e-05,
"loss": 0.4135,
"step": 676
},
{
"epoch": 1.5111607142857144,
"grad_norm": 1.3956563472747803,
"learning_rate": 1.1133983686595416e-05,
"loss": 0.4887,
"step": 677
},
{
"epoch": 1.5133928571428572,
"grad_norm": 1.3288111686706543,
"learning_rate": 1.110888903508387e-05,
"loss": 0.5532,
"step": 678
},
{
"epoch": 1.515625,
"grad_norm": 1.1640434265136719,
"learning_rate": 1.1083787311497562e-05,
"loss": 0.459,
"step": 679
},
{
"epoch": 1.5178571428571428,
"grad_norm": 1.3315949440002441,
"learning_rate": 1.1058678675925796e-05,
"loss": 0.4436,
"step": 680
},
{
"epoch": 1.5200892857142856,
"grad_norm": 1.4845269918441772,
"learning_rate": 1.1033563288501944e-05,
"loss": 0.4378,
"step": 681
},
{
"epoch": 1.5223214285714286,
"grad_norm": 1.1825847625732422,
"learning_rate": 1.1008441309402448e-05,
"loss": 0.4766,
"step": 682
},
{
"epoch": 1.5245535714285714,
"grad_norm": 1.282860279083252,
"learning_rate": 1.0983312898845788e-05,
"loss": 0.4995,
"step": 683
},
{
"epoch": 1.5267857142857144,
"grad_norm": 1.080328345298767,
"learning_rate": 1.0958178217091455e-05,
"loss": 0.3866,
"step": 684
},
{
"epoch": 1.5290178571428572,
"grad_norm": 1.218947172164917,
"learning_rate": 1.093303742443895e-05,
"loss": 0.528,
"step": 685
},
{
"epoch": 1.53125,
"grad_norm": 1.027030348777771,
"learning_rate": 1.0907890681226728e-05,
"loss": 0.4396,
"step": 686
},
{
"epoch": 1.5334821428571428,
"grad_norm": 1.1486948728561401,
"learning_rate": 1.0882738147831209e-05,
"loss": 0.4119,
"step": 687
},
{
"epoch": 1.5357142857142856,
"grad_norm": 1.2360179424285889,
"learning_rate": 1.0857579984665733e-05,
"loss": 0.4318,
"step": 688
},
{
"epoch": 1.5379464285714286,
"grad_norm": 1.1595594882965088,
"learning_rate": 1.0832416352179549e-05,
"loss": 0.4664,
"step": 689
},
{
"epoch": 1.5401785714285714,
"grad_norm": 1.145702838897705,
"learning_rate": 1.0807247410856783e-05,
"loss": 0.4545,
"step": 690
},
{
"epoch": 1.5424107142857144,
"grad_norm": 1.0401489734649658,
"learning_rate": 1.0782073321215423e-05,
"loss": 0.4562,
"step": 691
},
{
"epoch": 1.5446428571428572,
"grad_norm": 0.9377378821372986,
"learning_rate": 1.0756894243806291e-05,
"loss": 0.4018,
"step": 692
},
{
"epoch": 1.546875,
"grad_norm": 1.0041829347610474,
"learning_rate": 1.073171033921201e-05,
"loss": 0.4464,
"step": 693
},
{
"epoch": 1.5491071428571428,
"grad_norm": 1.0813108682632446,
"learning_rate": 1.0706521768046006e-05,
"loss": 0.419,
"step": 694
},
{
"epoch": 1.5513392857142856,
"grad_norm": 1.024339199066162,
"learning_rate": 1.0681328690951447e-05,
"loss": 0.4,
"step": 695
},
{
"epoch": 1.5535714285714286,
"grad_norm": 1.2120790481567383,
"learning_rate": 1.0656131268600254e-05,
"loss": 0.4255,
"step": 696
},
{
"epoch": 1.5558035714285714,
"grad_norm": 1.0487234592437744,
"learning_rate": 1.0630929661692051e-05,
"loss": 0.4281,
"step": 697
},
{
"epoch": 1.5580357142857144,
"grad_norm": 1.1584324836730957,
"learning_rate": 1.0605724030953155e-05,
"loss": 0.4147,
"step": 698
},
{
"epoch": 1.5602678571428572,
"grad_norm": 1.2971632480621338,
"learning_rate": 1.0580514537135542e-05,
"loss": 0.4992,
"step": 699
},
{
"epoch": 1.5625,
"grad_norm": 1.186423420906067,
"learning_rate": 1.0555301341015832e-05,
"loss": 0.509,
"step": 700
},
{
"epoch": 1.5647321428571428,
"grad_norm": 1.0909963846206665,
"learning_rate": 1.0530084603394239e-05,
"loss": 0.4044,
"step": 701
},
{
"epoch": 1.5669642857142856,
"grad_norm": 1.1188690662384033,
"learning_rate": 1.0504864485093588e-05,
"loss": 0.4433,
"step": 702
},
{
"epoch": 1.5691964285714286,
"grad_norm": 1.1996616125106812,
"learning_rate": 1.0479641146958249e-05,
"loss": 0.4001,
"step": 703
},
{
"epoch": 1.5714285714285714,
"grad_norm": 1.3726085424423218,
"learning_rate": 1.0454414749853126e-05,
"loss": 0.4005,
"step": 704
},
{
"epoch": 1.5736607142857144,
"grad_norm": 1.3191404342651367,
"learning_rate": 1.0429185454662638e-05,
"loss": 0.51,
"step": 705
},
{
"epoch": 1.5758928571428572,
"grad_norm": 1.1625711917877197,
"learning_rate": 1.0403953422289687e-05,
"loss": 0.4751,
"step": 706
},
{
"epoch": 1.578125,
"grad_norm": 0.9676429629325867,
"learning_rate": 1.0378718813654633e-05,
"loss": 0.4208,
"step": 707
},
{
"epoch": 1.5803571428571428,
"grad_norm": 1.3455173969268799,
"learning_rate": 1.0353481789694258e-05,
"loss": 0.5174,
"step": 708
},
{
"epoch": 1.5825892857142856,
"grad_norm": 1.349685549736023,
"learning_rate": 1.0328242511360753e-05,
"loss": 0.4612,
"step": 709
},
{
"epoch": 1.5848214285714286,
"grad_norm": 1.0394678115844727,
"learning_rate": 1.030300113962069e-05,
"loss": 0.396,
"step": 710
},
{
"epoch": 1.5870535714285714,
"grad_norm": 0.9809379577636719,
"learning_rate": 1.0277757835453989e-05,
"loss": 0.4269,
"step": 711
},
{
"epoch": 1.5892857142857144,
"grad_norm": 1.2518723011016846,
"learning_rate": 1.0252512759852891e-05,
"loss": 0.4136,
"step": 712
},
{
"epoch": 1.5915178571428572,
"grad_norm": 1.1232184171676636,
"learning_rate": 1.0227266073820939e-05,
"loss": 0.491,
"step": 713
},
{
"epoch": 1.59375,
"grad_norm": 1.1965726613998413,
"learning_rate": 1.0202017938371947e-05,
"loss": 0.5157,
"step": 714
},
{
"epoch": 1.5959821428571428,
"grad_norm": 1.116578459739685,
"learning_rate": 1.0176768514528967e-05,
"loss": 0.4045,
"step": 715
},
{
"epoch": 1.5982142857142856,
"grad_norm": 1.1951912641525269,
"learning_rate": 1.015151796332328e-05,
"loss": 0.4598,
"step": 716
},
{
"epoch": 1.6004464285714286,
"grad_norm": 1.137501835823059,
"learning_rate": 1.012626644579334e-05,
"loss": 0.521,
"step": 717
},
{
"epoch": 1.6026785714285714,
"grad_norm": 1.0315260887145996,
"learning_rate": 1.010101412298378e-05,
"loss": 0.4153,
"step": 718
},
{
"epoch": 1.6049107142857144,
"grad_norm": 1.1565879583358765,
"learning_rate": 1.0075761155944355e-05,
"loss": 0.4429,
"step": 719
},
{
"epoch": 1.6071428571428572,
"grad_norm": 1.1776032447814941,
"learning_rate": 1.0050507705728943e-05,
"loss": 0.3924,
"step": 720
},
{
"epoch": 1.609375,
"grad_norm": 1.1653850078582764,
"learning_rate": 1.0025253933394487e-05,
"loss": 0.4368,
"step": 721
},
{
"epoch": 1.6116071428571428,
"grad_norm": 1.1864845752716064,
"learning_rate": 1e-05,
"loss": 0.4214,
"step": 722
},
{
"epoch": 1.6138392857142856,
"grad_norm": 1.2345666885375977,
"learning_rate": 9.974746066605515e-06,
"loss": 0.4087,
"step": 723
},
{
"epoch": 1.6160714285714286,
"grad_norm": 1.0691934823989868,
"learning_rate": 9.949492294271062e-06,
"loss": 0.4253,
"step": 724
},
{
"epoch": 1.6183035714285714,
"grad_norm": 1.1589877605438232,
"learning_rate": 9.924238844055646e-06,
"loss": 0.4587,
"step": 725
},
{
"epoch": 1.6205357142857144,
"grad_norm": 1.1777771711349487,
"learning_rate": 9.898985877016225e-06,
"loss": 0.4513,
"step": 726
},
{
"epoch": 1.6227678571428572,
"grad_norm": 1.0277619361877441,
"learning_rate": 9.873733554206663e-06,
"loss": 0.4343,
"step": 727
},
{
"epoch": 1.625,
"grad_norm": 1.0607410669326782,
"learning_rate": 9.848482036676725e-06,
"loss": 0.454,
"step": 728
},
{
"epoch": 1.6272321428571428,
"grad_norm": 1.1368486881256104,
"learning_rate": 9.823231485471034e-06,
"loss": 0.4728,
"step": 729
},
{
"epoch": 1.6294642857142856,
"grad_norm": 0.9980977773666382,
"learning_rate": 9.797982061628056e-06,
"loss": 0.4544,
"step": 730
},
{
"epoch": 1.6316964285714286,
"grad_norm": 1.1738172769546509,
"learning_rate": 9.772733926179066e-06,
"loss": 0.4884,
"step": 731
},
{
"epoch": 1.6339285714285714,
"grad_norm": 1.079927682876587,
"learning_rate": 9.747487240147112e-06,
"loss": 0.4424,
"step": 732
},
{
"epoch": 1.6361607142857144,
"grad_norm": 1.255071997642517,
"learning_rate": 9.722242164546016e-06,
"loss": 0.4366,
"step": 733
},
{
"epoch": 1.6383928571428572,
"grad_norm": 1.1740080118179321,
"learning_rate": 9.696998860379313e-06,
"loss": 0.4567,
"step": 734
},
{
"epoch": 1.640625,
"grad_norm": 1.2524385452270508,
"learning_rate": 9.67175748863925e-06,
"loss": 0.4696,
"step": 735
},
{
"epoch": 1.6428571428571428,
"grad_norm": 1.1558537483215332,
"learning_rate": 9.646518210305747e-06,
"loss": 0.4027,
"step": 736
},
{
"epoch": 1.6450892857142856,
"grad_norm": 1.218324899673462,
"learning_rate": 9.621281186345367e-06,
"loss": 0.4871,
"step": 737
},
{
"epoch": 1.6473214285714286,
"grad_norm": 1.1278901100158691,
"learning_rate": 9.596046577710314e-06,
"loss": 0.3962,
"step": 738
},
{
"epoch": 1.6495535714285714,
"grad_norm": 1.1411744356155396,
"learning_rate": 9.570814545337362e-06,
"loss": 0.5,
"step": 739
},
{
"epoch": 1.6517857142857144,
"grad_norm": 1.4232391119003296,
"learning_rate": 9.545585250146879e-06,
"loss": 0.5546,
"step": 740
},
{
"epoch": 1.6540178571428572,
"grad_norm": 1.096972107887268,
"learning_rate": 9.520358853041756e-06,
"loss": 0.3857,
"step": 741
},
{
"epoch": 1.65625,
"grad_norm": 1.0176469087600708,
"learning_rate": 9.495135514906415e-06,
"loss": 0.4268,
"step": 742
},
{
"epoch": 1.6584821428571428,
"grad_norm": 1.092172384262085,
"learning_rate": 9.469915396605763e-06,
"loss": 0.4572,
"step": 743
},
{
"epoch": 1.6607142857142856,
"grad_norm": 1.2569825649261475,
"learning_rate": 9.44469865898417e-06,
"loss": 0.4815,
"step": 744
},
{
"epoch": 1.6629464285714286,
"grad_norm": 1.1099218130111694,
"learning_rate": 9.41948546286446e-06,
"loss": 0.4179,
"step": 745
},
{
"epoch": 1.6651785714285714,
"grad_norm": 1.0693453550338745,
"learning_rate": 9.394275969046845e-06,
"loss": 0.3651,
"step": 746
},
{
"epoch": 1.6674107142857144,
"grad_norm": 1.193220853805542,
"learning_rate": 9.369070338307954e-06,
"loss": 0.4632,
"step": 747
},
{
"epoch": 1.6696428571428572,
"grad_norm": 1.2311550378799438,
"learning_rate": 9.34386873139975e-06,
"loss": 0.4297,
"step": 748
},
{
"epoch": 1.671875,
"grad_norm": 1.1190276145935059,
"learning_rate": 9.31867130904856e-06,
"loss": 0.39,
"step": 749
},
{
"epoch": 1.6741071428571428,
"grad_norm": 1.3360817432403564,
"learning_rate": 9.293478231954e-06,
"loss": 0.5313,
"step": 750
},
{
"epoch": 1.6763392857142856,
"grad_norm": 1.0268186330795288,
"learning_rate": 9.26828966078799e-06,
"loss": 0.347,
"step": 751
},
{
"epoch": 1.6785714285714286,
"grad_norm": 1.1641294956207275,
"learning_rate": 9.243105756193714e-06,
"loss": 0.453,
"step": 752
},
{
"epoch": 1.6808035714285714,
"grad_norm": 0.9681382775306702,
"learning_rate": 9.217926678784579e-06,
"loss": 0.4076,
"step": 753
},
{
"epoch": 1.6830357142857144,
"grad_norm": 1.077756404876709,
"learning_rate": 9.192752589143219e-06,
"loss": 0.4225,
"step": 754
},
{
"epoch": 1.6852678571428572,
"grad_norm": 1.216125249862671,
"learning_rate": 9.167583647820453e-06,
"loss": 0.5314,
"step": 755
},
{
"epoch": 1.6875,
"grad_norm": 1.0570131540298462,
"learning_rate": 9.14242001533427e-06,
"loss": 0.4223,
"step": 756
},
{
"epoch": 1.6897321428571428,
"grad_norm": 1.0465316772460938,
"learning_rate": 9.117261852168794e-06,
"loss": 0.4597,
"step": 757
},
{
"epoch": 1.6919642857142856,
"grad_norm": 1.07583487033844,
"learning_rate": 9.092109318773274e-06,
"loss": 0.4947,
"step": 758
},
{
"epoch": 1.6941964285714286,
"grad_norm": 1.1008681058883667,
"learning_rate": 9.066962575561054e-06,
"loss": 0.4785,
"step": 759
},
{
"epoch": 1.6964285714285714,
"grad_norm": 1.061246633529663,
"learning_rate": 9.041821782908544e-06,
"loss": 0.4698,
"step": 760
},
{
"epoch": 1.6986607142857144,
"grad_norm": 0.8538286089897156,
"learning_rate": 9.016687101154215e-06,
"loss": 0.3926,
"step": 761
},
{
"epoch": 1.7008928571428572,
"grad_norm": 1.1151841878890991,
"learning_rate": 8.991558690597553e-06,
"loss": 0.4459,
"step": 762
},
{
"epoch": 1.703125,
"grad_norm": 1.27910578250885,
"learning_rate": 8.966436711498058e-06,
"loss": 0.4883,
"step": 763
},
{
"epoch": 1.7053571428571428,
"grad_norm": 1.1799464225769043,
"learning_rate": 8.941321324074207e-06,
"loss": 0.4439,
"step": 764
},
{
"epoch": 1.7075892857142856,
"grad_norm": 1.2295399904251099,
"learning_rate": 8.916212688502438e-06,
"loss": 0.4074,
"step": 765
},
{
"epoch": 1.7098214285714286,
"grad_norm": 1.0072729587554932,
"learning_rate": 8.891110964916135e-06,
"loss": 0.3901,
"step": 766
},
{
"epoch": 1.7120535714285714,
"grad_norm": 1.0866972208023071,
"learning_rate": 8.866016313404586e-06,
"loss": 0.4063,
"step": 767
},
{
"epoch": 1.7142857142857144,
"grad_norm": 1.1431010961532593,
"learning_rate": 8.840928894011995e-06,
"loss": 0.4814,
"step": 768
},
{
"epoch": 1.7165178571428572,
"grad_norm": 0.9729580879211426,
"learning_rate": 8.815848866736424e-06,
"loss": 0.366,
"step": 769
},
{
"epoch": 1.71875,
"grad_norm": 1.3122913837432861,
"learning_rate": 8.790776391528803e-06,
"loss": 0.4625,
"step": 770
},
{
"epoch": 1.7209821428571428,
"grad_norm": 1.2349814176559448,
"learning_rate": 8.76571162829189e-06,
"loss": 0.4846,
"step": 771
},
{
"epoch": 1.7232142857142856,
"grad_norm": 1.240909218788147,
"learning_rate": 8.740654736879265e-06,
"loss": 0.5493,
"step": 772
},
{
"epoch": 1.7254464285714286,
"grad_norm": 1.0323981046676636,
"learning_rate": 8.715605877094304e-06,
"loss": 0.3947,
"step": 773
},
{
"epoch": 1.7276785714285714,
"grad_norm": 1.106673002243042,
"learning_rate": 8.690565208689157e-06,
"loss": 0.434,
"step": 774
},
{
"epoch": 1.7299107142857144,
"grad_norm": 1.1972298622131348,
"learning_rate": 8.665532891363732e-06,
"loss": 0.4705,
"step": 775
},
{
"epoch": 1.7321428571428572,
"grad_norm": 1.1289480924606323,
"learning_rate": 8.640509084764682e-06,
"loss": 0.4872,
"step": 776
},
{
"epoch": 1.734375,
"grad_norm": 1.2260942459106445,
"learning_rate": 8.615493948484375e-06,
"loss": 0.5072,
"step": 777
},
{
"epoch": 1.7366071428571428,
"grad_norm": 0.9576632976531982,
"learning_rate": 8.590487642059888e-06,
"loss": 0.392,
"step": 778
},
{
"epoch": 1.7388392857142856,
"grad_norm": 1.2125643491744995,
"learning_rate": 8.565490324971983e-06,
"loss": 0.4466,
"step": 779
},
{
"epoch": 1.7410714285714286,
"grad_norm": 1.294597864151001,
"learning_rate": 8.540502156644096e-06,
"loss": 0.4632,
"step": 780
},
{
"epoch": 1.7433035714285714,
"grad_norm": 1.1891837120056152,
"learning_rate": 8.515523296441304e-06,
"loss": 0.446,
"step": 781
},
{
"epoch": 1.7455357142857144,
"grad_norm": 0.9572664499282837,
"learning_rate": 8.490553903669335e-06,
"loss": 0.3964,
"step": 782
},
{
"epoch": 1.7477678571428572,
"grad_norm": 1.1124510765075684,
"learning_rate": 8.465594137573524e-06,
"loss": 0.4481,
"step": 783
},
{
"epoch": 1.75,
"grad_norm": 1.0124688148498535,
"learning_rate": 8.440644157337819e-06,
"loss": 0.4132,
"step": 784
},
{
"epoch": 1.75,
"eval_loss": 0.5769185423851013,
"eval_runtime": 27.5149,
"eval_samples_per_second": 2.653,
"eval_steps_per_second": 0.363,
"step": 784
},
{
"epoch": 1.7522321428571428,
"grad_norm": 1.0379537343978882,
"learning_rate": 8.415704122083752e-06,
"loss": 0.3967,
"step": 785
},
{
"epoch": 1.7544642857142856,
"grad_norm": 1.0697929859161377,
"learning_rate": 8.390774190869434e-06,
"loss": 0.4963,
"step": 786
},
{
"epoch": 1.7566964285714286,
"grad_norm": 1.1796789169311523,
"learning_rate": 8.365854522688543e-06,
"loss": 0.5222,
"step": 787
},
{
"epoch": 1.7589285714285714,
"grad_norm": 1.0521793365478516,
"learning_rate": 8.340945276469282e-06,
"loss": 0.4473,
"step": 788
},
{
"epoch": 1.7611607142857144,
"grad_norm": 0.9840192794799805,
"learning_rate": 8.316046611073413e-06,
"loss": 0.4225,
"step": 789
},
{
"epoch": 1.7633928571428572,
"grad_norm": 1.3078526258468628,
"learning_rate": 8.29115868529519e-06,
"loss": 0.4677,
"step": 790
},
{
"epoch": 1.765625,
"grad_norm": 1.1875336170196533,
"learning_rate": 8.266281657860406e-06,
"loss": 0.4099,
"step": 791
},
{
"epoch": 1.7678571428571428,
"grad_norm": 1.1061992645263672,
"learning_rate": 8.24141568742531e-06,
"loss": 0.4772,
"step": 792
},
{
"epoch": 1.7700892857142856,
"grad_norm": 1.0990246534347534,
"learning_rate": 8.21656093257567e-06,
"loss": 0.415,
"step": 793
},
{
"epoch": 1.7723214285714286,
"grad_norm": 1.235956072807312,
"learning_rate": 8.191717551825707e-06,
"loss": 0.4911,
"step": 794
},
{
"epoch": 1.7745535714285714,
"grad_norm": 1.0186740159988403,
"learning_rate": 8.166885703617098e-06,
"loss": 0.4068,
"step": 795
},
{
"epoch": 1.7767857142857144,
"grad_norm": 1.046399474143982,
"learning_rate": 8.142065546317988e-06,
"loss": 0.4641,
"step": 796
},
{
"epoch": 1.7790178571428572,
"grad_norm": 1.2857236862182617,
"learning_rate": 8.117257238221936e-06,
"loss": 0.4947,
"step": 797
},
{
"epoch": 1.78125,
"grad_norm": 1.0499598979949951,
"learning_rate": 8.09246093754696e-06,
"loss": 0.4596,
"step": 798
},
{
"epoch": 1.7834821428571428,
"grad_norm": 1.1524500846862793,
"learning_rate": 8.067676802434472e-06,
"loss": 0.4414,
"step": 799
},
{
"epoch": 1.7857142857142856,
"grad_norm": 1.1189254522323608,
"learning_rate": 8.042904990948319e-06,
"loss": 0.4364,
"step": 800
},
{
"epoch": 1.7879464285714286,
"grad_norm": 1.1881475448608398,
"learning_rate": 8.01814566107374e-06,
"loss": 0.4466,
"step": 801
},
{
"epoch": 1.7901785714285714,
"grad_norm": 1.0106115341186523,
"learning_rate": 7.993398970716375e-06,
"loss": 0.3923,
"step": 802
},
{
"epoch": 1.7924107142857144,
"grad_norm": 1.1234852075576782,
"learning_rate": 7.968665077701253e-06,
"loss": 0.4678,
"step": 803
},
{
"epoch": 1.7946428571428572,
"grad_norm": 1.1420475244522095,
"learning_rate": 7.943944139771784e-06,
"loss": 0.4642,
"step": 804
},
{
"epoch": 1.796875,
"grad_norm": 1.1545650959014893,
"learning_rate": 7.919236314588759e-06,
"loss": 0.4387,
"step": 805
},
{
"epoch": 1.7991071428571428,
"grad_norm": 1.2602638006210327,
"learning_rate": 7.894541759729344e-06,
"loss": 0.4275,
"step": 806
},
{
"epoch": 1.8013392857142856,
"grad_norm": 1.1043881177902222,
"learning_rate": 7.869860632686059e-06,
"loss": 0.4424,
"step": 807
},
{
"epoch": 1.8035714285714286,
"grad_norm": 1.2350860834121704,
"learning_rate": 7.845193090865807e-06,
"loss": 0.4516,
"step": 808
},
{
"epoch": 1.8058035714285714,
"grad_norm": 1.1565461158752441,
"learning_rate": 7.820539291588825e-06,
"loss": 0.4463,
"step": 809
},
{
"epoch": 1.8080357142857144,
"grad_norm": 1.1159425973892212,
"learning_rate": 7.795899392087728e-06,
"loss": 0.4668,
"step": 810
},
{
"epoch": 1.8102678571428572,
"grad_norm": 1.323809027671814,
"learning_rate": 7.771273549506466e-06,
"loss": 0.475,
"step": 811
},
{
"epoch": 1.8125,
"grad_norm": 1.0323381423950195,
"learning_rate": 7.746661920899351e-06,
"loss": 0.4279,
"step": 812
},
{
"epoch": 1.8147321428571428,
"grad_norm": 1.3589287996292114,
"learning_rate": 7.72206466323004e-06,
"loss": 0.496,
"step": 813
},
{
"epoch": 1.8169642857142856,
"grad_norm": 1.098157286643982,
"learning_rate": 7.697481933370535e-06,
"loss": 0.5262,
"step": 814
},
{
"epoch": 1.8191964285714286,
"grad_norm": 1.0474650859832764,
"learning_rate": 7.672913888100187e-06,
"loss": 0.4107,
"step": 815
},
{
"epoch": 1.8214285714285714,
"grad_norm": 1.034593939781189,
"learning_rate": 7.648360684104695e-06,
"loss": 0.464,
"step": 816
},
{
"epoch": 1.8236607142857144,
"grad_norm": 1.3255833387374878,
"learning_rate": 7.623822477975105e-06,
"loss": 0.4558,
"step": 817
},
{
"epoch": 1.8258928571428572,
"grad_norm": 0.9907006025314331,
"learning_rate": 7.599299426206812e-06,
"loss": 0.3483,
"step": 818
},
{
"epoch": 1.828125,
"grad_norm": 1.1865644454956055,
"learning_rate": 7.574791685198563e-06,
"loss": 0.4634,
"step": 819
},
{
"epoch": 1.8303571428571428,
"grad_norm": 1.2235808372497559,
"learning_rate": 7.550299411251461e-06,
"loss": 0.4313,
"step": 820
},
{
"epoch": 1.8325892857142856,
"grad_norm": 1.1662776470184326,
"learning_rate": 7.52582276056796e-06,
"loss": 0.4313,
"step": 821
},
{
"epoch": 1.8348214285714286,
"grad_norm": 1.1542125940322876,
"learning_rate": 7.501361889250882e-06,
"loss": 0.4432,
"step": 822
},
{
"epoch": 1.8370535714285714,
"grad_norm": 1.1531603336334229,
"learning_rate": 7.4769169533024055e-06,
"loss": 0.4444,
"step": 823
},
{
"epoch": 1.8392857142857144,
"grad_norm": 1.059193730354309,
"learning_rate": 7.452488108623089e-06,
"loss": 0.4356,
"step": 824
},
{
"epoch": 1.8415178571428572,
"grad_norm": 0.994286835193634,
"learning_rate": 7.428075511010858e-06,
"loss": 0.4322,
"step": 825
},
{
"epoch": 1.84375,
"grad_norm": 1.0639030933380127,
"learning_rate": 7.403679316160024e-06,
"loss": 0.5315,
"step": 826
},
{
"epoch": 1.8459821428571428,
"grad_norm": 0.840054452419281,
"learning_rate": 7.379299679660299e-06,
"loss": 0.3606,
"step": 827
},
{
"epoch": 1.8482142857142856,
"grad_norm": 1.074033498764038,
"learning_rate": 7.354936756995766e-06,
"loss": 0.4659,
"step": 828
},
{
"epoch": 1.8504464285714286,
"grad_norm": 1.2196365594863892,
"learning_rate": 7.3305907035439404e-06,
"loss": 0.4919,
"step": 829
},
{
"epoch": 1.8526785714285714,
"grad_norm": 1.1700623035430908,
"learning_rate": 7.3062616745747325e-06,
"loss": 0.445,
"step": 830
},
{
"epoch": 1.8549107142857144,
"grad_norm": 1.149138331413269,
"learning_rate": 7.281949825249495e-06,
"loss": 0.4704,
"step": 831
},
{
"epoch": 1.8571428571428572,
"grad_norm": 1.0726150274276733,
"learning_rate": 7.257655310619996e-06,
"loss": 0.4275,
"step": 832
},
{
"epoch": 1.859375,
"grad_norm": 1.2384564876556396,
"learning_rate": 7.233378285627459e-06,
"loss": 0.4262,
"step": 833
},
{
"epoch": 1.8616071428571428,
"grad_norm": 1.1791836023330688,
"learning_rate": 7.209118905101575e-06,
"loss": 0.5294,
"step": 834
},
{
"epoch": 1.8638392857142856,
"grad_norm": 1.1020606756210327,
"learning_rate": 7.184877323759482e-06,
"loss": 0.4688,
"step": 835
},
{
"epoch": 1.8660714285714286,
"grad_norm": 1.0698236227035522,
"learning_rate": 7.16065369620483e-06,
"loss": 0.4932,
"step": 836
},
{
"epoch": 1.8683035714285714,
"grad_norm": 1.3022637367248535,
"learning_rate": 7.136448176926736e-06,
"loss": 0.4702,
"step": 837
},
{
"epoch": 1.8705357142857144,
"grad_norm": 1.197245478630066,
"learning_rate": 7.112260920298859e-06,
"loss": 0.5103,
"step": 838
},
{
"epoch": 1.8727678571428572,
"grad_norm": 1.157585859298706,
"learning_rate": 7.088092080578357e-06,
"loss": 0.5016,
"step": 839
},
{
"epoch": 1.875,
"grad_norm": 1.1110048294067383,
"learning_rate": 7.063941811904956e-06,
"loss": 0.4405,
"step": 840
},
{
"epoch": 1.8772321428571428,
"grad_norm": 1.014962077140808,
"learning_rate": 7.039810268299934e-06,
"loss": 0.3925,
"step": 841
},
{
"epoch": 1.8794642857142856,
"grad_norm": 1.1455286741256714,
"learning_rate": 7.015697603665141e-06,
"loss": 0.4581,
"step": 842
},
{
"epoch": 1.8816964285714286,
"grad_norm": 1.2424312829971313,
"learning_rate": 6.991603971782035e-06,
"loss": 0.5135,
"step": 843
},
{
"epoch": 1.8839285714285714,
"grad_norm": 1.0072928667068481,
"learning_rate": 6.967529526310681e-06,
"loss": 0.3755,
"step": 844
},
{
"epoch": 1.8861607142857144,
"grad_norm": 1.1363320350646973,
"learning_rate": 6.943474420788788e-06,
"loss": 0.4542,
"step": 845
},
{
"epoch": 1.8883928571428572,
"grad_norm": 1.0867626667022705,
"learning_rate": 6.919438808630716e-06,
"loss": 0.4454,
"step": 846
},
{
"epoch": 1.890625,
"grad_norm": 1.0299354791641235,
"learning_rate": 6.895422843126507e-06,
"loss": 0.4268,
"step": 847
},
{
"epoch": 1.8928571428571428,
"grad_norm": 1.2059564590454102,
"learning_rate": 6.871426677440907e-06,
"loss": 0.511,
"step": 848
},
{
"epoch": 1.8950892857142856,
"grad_norm": 1.1440831422805786,
"learning_rate": 6.847450464612378e-06,
"loss": 0.4773,
"step": 849
},
{
"epoch": 1.8973214285714286,
"grad_norm": 1.1981797218322754,
"learning_rate": 6.8234943575521365e-06,
"loss": 0.4979,
"step": 850
},
{
"epoch": 1.8995535714285714,
"grad_norm": 1.140823483467102,
"learning_rate": 6.799558509043169e-06,
"loss": 0.4491,
"step": 851
},
{
"epoch": 1.9017857142857144,
"grad_norm": 1.1695003509521484,
"learning_rate": 6.775643071739267e-06,
"loss": 0.4302,
"step": 852
},
{
"epoch": 1.9040178571428572,
"grad_norm": 1.292051911354065,
"learning_rate": 6.751748198164036e-06,
"loss": 0.5915,
"step": 853
},
{
"epoch": 1.90625,
"grad_norm": 1.060410499572754,
"learning_rate": 6.727874040709943e-06,
"loss": 0.4208,
"step": 854
},
{
"epoch": 1.9084821428571428,
"grad_norm": 1.1094176769256592,
"learning_rate": 6.704020751637333e-06,
"loss": 0.4261,
"step": 855
},
{
"epoch": 1.9107142857142856,
"grad_norm": 1.1401662826538086,
"learning_rate": 6.680188483073458e-06,
"loss": 0.3836,
"step": 856
},
{
"epoch": 1.9129464285714286,
"grad_norm": 1.1735782623291016,
"learning_rate": 6.6563773870115135e-06,
"loss": 0.4362,
"step": 857
},
{
"epoch": 1.9151785714285714,
"grad_norm": 0.9720476269721985,
"learning_rate": 6.632587615309658e-06,
"loss": 0.4288,
"step": 858
},
{
"epoch": 1.9174107142857144,
"grad_norm": 1.134281039237976,
"learning_rate": 6.608819319690059e-06,
"loss": 0.434,
"step": 859
},
{
"epoch": 1.9196428571428572,
"grad_norm": 0.9402589201927185,
"learning_rate": 6.585072651737911e-06,
"loss": 0.4452,
"step": 860
},
{
"epoch": 1.921875,
"grad_norm": 1.0265930891036987,
"learning_rate": 6.56134776290048e-06,
"loss": 0.4111,
"step": 861
},
{
"epoch": 1.9241071428571428,
"grad_norm": 1.001212477684021,
"learning_rate": 6.537644804486136e-06,
"loss": 0.4677,
"step": 862
},
{
"epoch": 1.9263392857142856,
"grad_norm": 1.193760871887207,
"learning_rate": 6.513963927663372e-06,
"loss": 0.4496,
"step": 863
},
{
"epoch": 1.9285714285714286,
"grad_norm": 1.094336986541748,
"learning_rate": 6.49030528345987e-06,
"loss": 0.403,
"step": 864
},
{
"epoch": 1.9308035714285714,
"grad_norm": 1.051220417022705,
"learning_rate": 6.466669022761506e-06,
"loss": 0.4085,
"step": 865
},
{
"epoch": 1.9330357142857144,
"grad_norm": 1.0897574424743652,
"learning_rate": 6.443055296311413e-06,
"loss": 0.4779,
"step": 866
},
{
"epoch": 1.9352678571428572,
"grad_norm": 1.124507188796997,
"learning_rate": 6.4194642547090016e-06,
"loss": 0.474,
"step": 867
},
{
"epoch": 1.9375,
"grad_norm": 1.1396455764770508,
"learning_rate": 6.3958960484090094e-06,
"loss": 0.4122,
"step": 868
},
{
"epoch": 1.9397321428571428,
"grad_norm": 0.9783452153205872,
"learning_rate": 6.37235082772055e-06,
"loss": 0.4359,
"step": 869
},
{
"epoch": 1.9419642857142856,
"grad_norm": 1.2163567543029785,
"learning_rate": 6.348828742806122e-06,
"loss": 0.4256,
"step": 870
},
{
"epoch": 1.9441964285714286,
"grad_norm": 1.1555063724517822,
"learning_rate": 6.325329943680689e-06,
"loss": 0.4604,
"step": 871
},
{
"epoch": 1.9464285714285714,
"grad_norm": 1.1841658353805542,
"learning_rate": 6.3018545802107e-06,
"loss": 0.4478,
"step": 872
},
{
"epoch": 1.9486607142857144,
"grad_norm": 1.2992147207260132,
"learning_rate": 6.278402802113146e-06,
"loss": 0.4252,
"step": 873
},
{
"epoch": 1.9508928571428572,
"grad_norm": 1.136289358139038,
"learning_rate": 6.25497475895459e-06,
"loss": 0.4876,
"step": 874
},
{
"epoch": 1.953125,
"grad_norm": 1.138102412223816,
"learning_rate": 6.2315706001502305e-06,
"loss": 0.446,
"step": 875
},
{
"epoch": 1.9553571428571428,
"grad_norm": 1.0559848546981812,
"learning_rate": 6.208190474962945e-06,
"loss": 0.4242,
"step": 876
},
{
"epoch": 1.9575892857142856,
"grad_norm": 1.1156829595565796,
"learning_rate": 6.184834532502315e-06,
"loss": 0.4874,
"step": 877
},
{
"epoch": 1.9598214285714286,
"grad_norm": 1.0015919208526611,
"learning_rate": 6.161502921723719e-06,
"loss": 0.4157,
"step": 878
},
{
"epoch": 1.9620535714285714,
"grad_norm": 1.1571860313415527,
"learning_rate": 6.138195791427329e-06,
"loss": 0.4177,
"step": 879
},
{
"epoch": 1.9642857142857144,
"grad_norm": 1.3629335165023804,
"learning_rate": 6.114913290257219e-06,
"loss": 0.4605,
"step": 880
},
{
"epoch": 1.9665178571428572,
"grad_norm": 1.03495454788208,
"learning_rate": 6.091655566700359e-06,
"loss": 0.443,
"step": 881
},
{
"epoch": 1.96875,
"grad_norm": 1.1198673248291016,
"learning_rate": 6.068422769085722e-06,
"loss": 0.3935,
"step": 882
},
{
"epoch": 1.9709821428571428,
"grad_norm": 1.0860130786895752,
"learning_rate": 6.045215045583301e-06,
"loss": 0.4203,
"step": 883
},
{
"epoch": 1.9732142857142856,
"grad_norm": 1.050460696220398,
"learning_rate": 6.0220325442031714e-06,
"loss": 0.4028,
"step": 884
},
{
"epoch": 1.9754464285714286,
"grad_norm": 1.0525908470153809,
"learning_rate": 5.998875412794562e-06,
"loss": 0.3915,
"step": 885
},
{
"epoch": 1.9776785714285714,
"grad_norm": 1.0617356300354004,
"learning_rate": 5.975743799044894e-06,
"loss": 0.4651,
"step": 886
},
{
"epoch": 1.9799107142857144,
"grad_norm": 0.9719341993331909,
"learning_rate": 5.952637850478852e-06,
"loss": 0.4276,
"step": 887
},
{
"epoch": 1.9821428571428572,
"grad_norm": 1.2568265199661255,
"learning_rate": 5.929557714457425e-06,
"loss": 0.5201,
"step": 888
},
{
"epoch": 1.984375,
"grad_norm": 1.1193770170211792,
"learning_rate": 5.906503538176999e-06,
"loss": 0.4661,
"step": 889
},
{
"epoch": 1.9866071428571428,
"grad_norm": 1.1484289169311523,
"learning_rate": 5.883475468668387e-06,
"loss": 0.4808,
"step": 890
},
{
"epoch": 1.9888392857142856,
"grad_norm": 0.956321120262146,
"learning_rate": 5.860473652795901e-06,
"loss": 0.4322,
"step": 891
},
{
"epoch": 1.9910714285714286,
"grad_norm": 1.0034775733947754,
"learning_rate": 5.8374982372564255e-06,
"loss": 0.3559,
"step": 892
},
{
"epoch": 1.9933035714285714,
"grad_norm": 1.0558040142059326,
"learning_rate": 5.814549368578464e-06,
"loss": 0.4988,
"step": 893
},
{
"epoch": 1.9955357142857144,
"grad_norm": 1.1722464561462402,
"learning_rate": 5.7916271931212185e-06,
"loss": 0.4951,
"step": 894
},
{
"epoch": 1.9977678571428572,
"grad_norm": 1.1077440977096558,
"learning_rate": 5.768731857073657e-06,
"loss": 0.449,
"step": 895
},
{
"epoch": 2.0,
"grad_norm": 1.0378532409667969,
"learning_rate": 5.745863506453569e-06,
"loss": 0.3514,
"step": 896
},
{
"epoch": 2.0,
"eval_loss": 0.5705999732017517,
"eval_runtime": 29.1399,
"eval_samples_per_second": 2.505,
"eval_steps_per_second": 0.343,
"step": 896
},
{
"epoch": 2.002232142857143,
"grad_norm": 1.1312880516052246,
"learning_rate": 5.7230222871066475e-06,
"loss": 0.3434,
"step": 897
},
{
"epoch": 2.0044642857142856,
"grad_norm": 1.2804909944534302,
"learning_rate": 5.700208344705537e-06,
"loss": 0.3325,
"step": 898
},
{
"epoch": 2.0066964285714284,
"grad_norm": 1.2890160083770752,
"learning_rate": 5.677421824748946e-06,
"loss": 0.3234,
"step": 899
},
{
"epoch": 2.0089285714285716,
"grad_norm": 1.3255733251571655,
"learning_rate": 5.6546628725606675e-06,
"loss": 0.3237,
"step": 900
},
{
"epoch": 2.0111607142857144,
"grad_norm": 1.2110278606414795,
"learning_rate": 5.631931633288696e-06,
"loss": 0.303,
"step": 901
},
{
"epoch": 2.013392857142857,
"grad_norm": 0.9276548624038696,
"learning_rate": 5.609228251904265e-06,
"loss": 0.3032,
"step": 902
},
{
"epoch": 2.015625,
"grad_norm": 1.1542302370071411,
"learning_rate": 5.586552873200963e-06,
"loss": 0.3109,
"step": 903
},
{
"epoch": 2.017857142857143,
"grad_norm": 1.1671963930130005,
"learning_rate": 5.563905641793776e-06,
"loss": 0.3398,
"step": 904
},
{
"epoch": 2.0200892857142856,
"grad_norm": 1.0253148078918457,
"learning_rate": 5.541286702118174e-06,
"loss": 0.3085,
"step": 905
},
{
"epoch": 2.0223214285714284,
"grad_norm": 1.112776517868042,
"learning_rate": 5.518696198429201e-06,
"loss": 0.3137,
"step": 906
},
{
"epoch": 2.0245535714285716,
"grad_norm": 1.0918265581130981,
"learning_rate": 5.496134274800533e-06,
"loss": 0.3044,
"step": 907
},
{
"epoch": 2.0267857142857144,
"grad_norm": 1.1833134889602661,
"learning_rate": 5.473601075123599e-06,
"loss": 0.3135,
"step": 908
},
{
"epoch": 2.029017857142857,
"grad_norm": 1.2949035167694092,
"learning_rate": 5.451096743106611e-06,
"loss": 0.3631,
"step": 909
},
{
"epoch": 2.03125,
"grad_norm": 1.2424516677856445,
"learning_rate": 5.428621422273687e-06,
"loss": 0.327,
"step": 910
},
{
"epoch": 2.033482142857143,
"grad_norm": 1.102992057800293,
"learning_rate": 5.406175255963923e-06,
"loss": 0.2871,
"step": 911
},
{
"epoch": 2.0357142857142856,
"grad_norm": 1.140796184539795,
"learning_rate": 5.383758387330476e-06,
"loss": 0.3021,
"step": 912
},
{
"epoch": 2.0379464285714284,
"grad_norm": 1.1115362644195557,
"learning_rate": 5.3613709593396545e-06,
"loss": 0.3066,
"step": 913
},
{
"epoch": 2.0401785714285716,
"grad_norm": 1.0036022663116455,
"learning_rate": 5.3390131147699995e-06,
"loss": 0.2789,
"step": 914
},
{
"epoch": 2.0424107142857144,
"grad_norm": 1.0242892503738403,
"learning_rate": 5.3166849962113886e-06,
"loss": 0.2943,
"step": 915
},
{
"epoch": 2.044642857142857,
"grad_norm": 1.0714552402496338,
"learning_rate": 5.294386746064115e-06,
"loss": 0.265,
"step": 916
},
{
"epoch": 2.046875,
"grad_norm": 1.2583504915237427,
"learning_rate": 5.272118506537982e-06,
"loss": 0.3399,
"step": 917
},
{
"epoch": 2.049107142857143,
"grad_norm": 1.0679799318313599,
"learning_rate": 5.249880419651403e-06,
"loss": 0.3327,
"step": 918
},
{
"epoch": 2.0513392857142856,
"grad_norm": 1.1574690341949463,
"learning_rate": 5.2276726272304724e-06,
"loss": 0.3007,
"step": 919
},
{
"epoch": 2.0535714285714284,
"grad_norm": 0.8751293420791626,
"learning_rate": 5.205495270908094e-06,
"loss": 0.2863,
"step": 920
},
{
"epoch": 2.0558035714285716,
"grad_norm": 1.0779941082000732,
"learning_rate": 5.183348492123056e-06,
"loss": 0.3201,
"step": 921
},
{
"epoch": 2.0580357142857144,
"grad_norm": 1.1002516746520996,
"learning_rate": 5.16123243211914e-06,
"loss": 0.3093,
"step": 922
},
{
"epoch": 2.060267857142857,
"grad_norm": 1.1461528539657593,
"learning_rate": 5.1391472319442016e-06,
"loss": 0.3172,
"step": 923
},
{
"epoch": 2.0625,
"grad_norm": 1.1231317520141602,
"learning_rate": 5.117093032449297e-06,
"loss": 0.3279,
"step": 924
},
{
"epoch": 2.064732142857143,
"grad_norm": 1.1722527742385864,
"learning_rate": 5.0950699742877645e-06,
"loss": 0.3203,
"step": 925
},
{
"epoch": 2.0669642857142856,
"grad_norm": 1.0735809803009033,
"learning_rate": 5.073078197914341e-06,
"loss": 0.3336,
"step": 926
},
{
"epoch": 2.0691964285714284,
"grad_norm": 1.074890375137329,
"learning_rate": 5.0511178435842565e-06,
"loss": 0.3264,
"step": 927
},
{
"epoch": 2.0714285714285716,
"grad_norm": 0.9348042011260986,
"learning_rate": 5.029189051352339e-06,
"loss": 0.2924,
"step": 928
},
{
"epoch": 2.0736607142857144,
"grad_norm": 1.1553387641906738,
"learning_rate": 5.007291961072133e-06,
"loss": 0.2878,
"step": 929
},
{
"epoch": 2.075892857142857,
"grad_norm": 1.0851117372512817,
"learning_rate": 4.985426712394994e-06,
"loss": 0.3376,
"step": 930
},
{
"epoch": 2.078125,
"grad_norm": 0.9342989921569824,
"learning_rate": 4.963593444769207e-06,
"loss": 0.2701,
"step": 931
},
{
"epoch": 2.080357142857143,
"grad_norm": 1.141180157661438,
"learning_rate": 4.941792297439098e-06,
"loss": 0.2681,
"step": 932
},
{
"epoch": 2.0825892857142856,
"grad_norm": 1.2393943071365356,
"learning_rate": 4.920023409444128e-06,
"loss": 0.3901,
"step": 933
},
{
"epoch": 2.0848214285714284,
"grad_norm": 1.389803409576416,
"learning_rate": 4.898286919618034e-06,
"loss": 0.3377,
"step": 934
},
{
"epoch": 2.0870535714285716,
"grad_norm": 1.1894795894622803,
"learning_rate": 4.876582966587924e-06,
"loss": 0.3204,
"step": 935
},
{
"epoch": 2.0892857142857144,
"grad_norm": 1.2430485486984253,
"learning_rate": 4.8549116887734045e-06,
"loss": 0.3155,
"step": 936
},
{
"epoch": 2.091517857142857,
"grad_norm": 1.2486804723739624,
"learning_rate": 4.833273224385678e-06,
"loss": 0.3485,
"step": 937
},
{
"epoch": 2.09375,
"grad_norm": 0.9740699529647827,
"learning_rate": 4.811667711426686e-06,
"loss": 0.2882,
"step": 938
},
{
"epoch": 2.095982142857143,
"grad_norm": 1.1677557229995728,
"learning_rate": 4.790095287688227e-06,
"loss": 0.3942,
"step": 939
},
{
"epoch": 2.0982142857142856,
"grad_norm": 1.2111822366714478,
"learning_rate": 4.7685560907510465e-06,
"loss": 0.2916,
"step": 940
},
{
"epoch": 2.1004464285714284,
"grad_norm": 1.1940852403640747,
"learning_rate": 4.747050257984002e-06,
"loss": 0.2918,
"step": 941
},
{
"epoch": 2.1026785714285716,
"grad_norm": 0.9879006743431091,
"learning_rate": 4.725577926543151e-06,
"loss": 0.2781,
"step": 942
},
{
"epoch": 2.1049107142857144,
"grad_norm": 1.0214678049087524,
"learning_rate": 4.704139233370905e-06,
"loss": 0.3141,
"step": 943
},
{
"epoch": 2.107142857142857,
"grad_norm": 0.9733462333679199,
"learning_rate": 4.682734315195138e-06,
"loss": 0.3298,
"step": 944
},
{
"epoch": 2.109375,
"grad_norm": 1.053040623664856,
"learning_rate": 4.661363308528319e-06,
"loss": 0.2853,
"step": 945
},
{
"epoch": 2.111607142857143,
"grad_norm": 1.1272430419921875,
"learning_rate": 4.640026349666651e-06,
"loss": 0.3328,
"step": 946
},
{
"epoch": 2.1138392857142856,
"grad_norm": 1.0852495431900024,
"learning_rate": 4.61872357468917e-06,
"loss": 0.2992,
"step": 947
},
{
"epoch": 2.1160714285714284,
"grad_norm": 1.0410633087158203,
"learning_rate": 4.5974551194569336e-06,
"loss": 0.3085,
"step": 948
},
{
"epoch": 2.1183035714285716,
"grad_norm": 1.071105718612671,
"learning_rate": 4.576221119612091e-06,
"loss": 0.338,
"step": 949
},
{
"epoch": 2.1205357142857144,
"grad_norm": 0.9693806767463684,
"learning_rate": 4.555021710577068e-06,
"loss": 0.3483,
"step": 950
},
{
"epoch": 2.122767857142857,
"grad_norm": 1.1795443296432495,
"learning_rate": 4.533857027553663e-06,
"loss": 0.3496,
"step": 951
},
{
"epoch": 2.125,
"grad_norm": 1.0642222166061401,
"learning_rate": 4.51272720552223e-06,
"loss": 0.2831,
"step": 952
},
{
"epoch": 2.127232142857143,
"grad_norm": 1.088563084602356,
"learning_rate": 4.49163237924078e-06,
"loss": 0.3134,
"step": 953
},
{
"epoch": 2.1294642857142856,
"grad_norm": 1.0107040405273438,
"learning_rate": 4.470572683244127e-06,
"loss": 0.2798,
"step": 954
},
{
"epoch": 2.1316964285714284,
"grad_norm": 1.1377826929092407,
"learning_rate": 4.449548251843048e-06,
"loss": 0.2932,
"step": 955
},
{
"epoch": 2.1339285714285716,
"grad_norm": 1.1572898626327515,
"learning_rate": 4.4285592191234125e-06,
"loss": 0.31,
"step": 956
},
{
"epoch": 2.1361607142857144,
"grad_norm": 1.0224123001098633,
"learning_rate": 4.4076057189453325e-06,
"loss": 0.2888,
"step": 957
},
{
"epoch": 2.138392857142857,
"grad_norm": 1.0553392171859741,
"learning_rate": 4.386687884942307e-06,
"loss": 0.2955,
"step": 958
},
{
"epoch": 2.140625,
"grad_norm": 1.226377248764038,
"learning_rate": 4.365805850520362e-06,
"loss": 0.3023,
"step": 959
},
{
"epoch": 2.142857142857143,
"grad_norm": 1.1049394607543945,
"learning_rate": 4.344959748857215e-06,
"loss": 0.2934,
"step": 960
},
{
"epoch": 2.1450892857142856,
"grad_norm": 1.1788314580917358,
"learning_rate": 4.324149712901417e-06,
"loss": 0.3623,
"step": 961
},
{
"epoch": 2.1473214285714284,
"grad_norm": 0.9973257184028625,
"learning_rate": 4.3033758753715095e-06,
"loss": 0.2857,
"step": 962
},
{
"epoch": 2.1495535714285716,
"grad_norm": 1.0208550691604614,
"learning_rate": 4.282638368755161e-06,
"loss": 0.2855,
"step": 963
},
{
"epoch": 2.1517857142857144,
"grad_norm": 1.0481334924697876,
"learning_rate": 4.261937325308347e-06,
"loss": 0.351,
"step": 964
},
{
"epoch": 2.154017857142857,
"grad_norm": 1.0952095985412598,
"learning_rate": 4.241272877054489e-06,
"loss": 0.3035,
"step": 965
},
{
"epoch": 2.15625,
"grad_norm": 1.1130040884017944,
"learning_rate": 4.2206451557836235e-06,
"loss": 0.298,
"step": 966
},
{
"epoch": 2.158482142857143,
"grad_norm": 1.0529165267944336,
"learning_rate": 4.200054293051556e-06,
"loss": 0.339,
"step": 967
},
{
"epoch": 2.1607142857142856,
"grad_norm": 0.9979656934738159,
"learning_rate": 4.179500420179011e-06,
"loss": 0.354,
"step": 968
},
{
"epoch": 2.1629464285714284,
"grad_norm": 1.0694448947906494,
"learning_rate": 4.158983668250819e-06,
"loss": 0.3054,
"step": 969
},
{
"epoch": 2.1651785714285716,
"grad_norm": 0.9863750338554382,
"learning_rate": 4.138504168115059e-06,
"loss": 0.2886,
"step": 970
},
{
"epoch": 2.1674107142857144,
"grad_norm": 1.0251253843307495,
"learning_rate": 4.11806205038224e-06,
"loss": 0.2786,
"step": 971
},
{
"epoch": 2.169642857142857,
"grad_norm": 1.2727429866790771,
"learning_rate": 4.097657445424454e-06,
"loss": 0.3197,
"step": 972
},
{
"epoch": 2.171875,
"grad_norm": 1.0682674646377563,
"learning_rate": 4.077290483374549e-06,
"loss": 0.2598,
"step": 973
},
{
"epoch": 2.174107142857143,
"grad_norm": 1.122412085533142,
"learning_rate": 4.056961294125305e-06,
"loss": 0.2843,
"step": 974
},
{
"epoch": 2.1763392857142856,
"grad_norm": 1.1398166418075562,
"learning_rate": 4.0366700073286005e-06,
"loss": 0.321,
"step": 975
},
{
"epoch": 2.1785714285714284,
"grad_norm": 1.2503103017807007,
"learning_rate": 4.016416752394591e-06,
"loss": 0.3992,
"step": 976
},
{
"epoch": 2.1808035714285716,
"grad_norm": 1.1967464685440063,
"learning_rate": 3.996201658490866e-06,
"loss": 0.3086,
"step": 977
},
{
"epoch": 2.1830357142857144,
"grad_norm": 1.0485783815383911,
"learning_rate": 3.9760248545416465e-06,
"loss": 0.2849,
"step": 978
},
{
"epoch": 2.185267857142857,
"grad_norm": 1.1187446117401123,
"learning_rate": 3.955886469226967e-06,
"loss": 0.2947,
"step": 979
},
{
"epoch": 2.1875,
"grad_norm": 1.0842468738555908,
"learning_rate": 3.935786630981819e-06,
"loss": 0.2789,
"step": 980
},
{
"epoch": 2.189732142857143,
"grad_norm": 0.9847047924995422,
"learning_rate": 3.915725467995375e-06,
"loss": 0.2868,
"step": 981
},
{
"epoch": 2.1919642857142856,
"grad_norm": 1.0602375268936157,
"learning_rate": 3.895703108210135e-06,
"loss": 0.3282,
"step": 982
},
{
"epoch": 2.1941964285714284,
"grad_norm": 0.9967721700668335,
"learning_rate": 3.875719679321138e-06,
"loss": 0.3096,
"step": 983
},
{
"epoch": 2.1964285714285716,
"grad_norm": 1.177323818206787,
"learning_rate": 3.8557753087751345e-06,
"loss": 0.3515,
"step": 984
},
{
"epoch": 2.1986607142857144,
"grad_norm": 0.9941040277481079,
"learning_rate": 3.835870123769775e-06,
"loss": 0.3254,
"step": 985
},
{
"epoch": 2.200892857142857,
"grad_norm": 1.2178453207015991,
"learning_rate": 3.8160042512528e-06,
"loss": 0.3142,
"step": 986
},
{
"epoch": 2.203125,
"grad_norm": 1.2556642293930054,
"learning_rate": 3.796177817921223e-06,
"loss": 0.3206,
"step": 987
},
{
"epoch": 2.205357142857143,
"grad_norm": 1.0012449026107788,
"learning_rate": 3.776390950220544e-06,
"loss": 0.3497,
"step": 988
},
{
"epoch": 2.2075892857142856,
"grad_norm": 1.051226258277893,
"learning_rate": 3.756643774343913e-06,
"loss": 0.381,
"step": 989
},
{
"epoch": 2.2098214285714284,
"grad_norm": 0.9949240684509277,
"learning_rate": 3.7369364162313528e-06,
"loss": 0.3661,
"step": 990
},
{
"epoch": 2.2120535714285716,
"grad_norm": 1.0959502458572388,
"learning_rate": 3.7172690015689263e-06,
"loss": 0.3618,
"step": 991
},
{
"epoch": 2.2142857142857144,
"grad_norm": 1.028232455253601,
"learning_rate": 3.6976416557879757e-06,
"loss": 0.2777,
"step": 992
},
{
"epoch": 2.216517857142857,
"grad_norm": 1.055541753768921,
"learning_rate": 3.678054504064287e-06,
"loss": 0.3042,
"step": 993
},
{
"epoch": 2.21875,
"grad_norm": 1.1653730869293213,
"learning_rate": 3.658507671317296e-06,
"loss": 0.3297,
"step": 994
},
{
"epoch": 2.220982142857143,
"grad_norm": 1.1325184106826782,
"learning_rate": 3.639001282209311e-06,
"loss": 0.3129,
"step": 995
},
{
"epoch": 2.2232142857142856,
"grad_norm": 1.1521151065826416,
"learning_rate": 3.6195354611447033e-06,
"loss": 0.314,
"step": 996
},
{
"epoch": 2.2254464285714284,
"grad_norm": 1.1670697927474976,
"learning_rate": 3.600110332269118e-06,
"loss": 0.2789,
"step": 997
},
{
"epoch": 2.2276785714285716,
"grad_norm": 1.0550611019134521,
"learning_rate": 3.580726019468671e-06,
"loss": 0.3384,
"step": 998
},
{
"epoch": 2.2299107142857144,
"grad_norm": 1.1587961912155151,
"learning_rate": 3.561382646369179e-06,
"loss": 0.344,
"step": 999
},
{
"epoch": 2.232142857142857,
"grad_norm": 1.1734824180603027,
"learning_rate": 3.5420803363353604e-06,
"loss": 0.3755,
"step": 1000
},
{
"epoch": 2.234375,
"grad_norm": 1.101392149925232,
"learning_rate": 3.5228192124700433e-06,
"loss": 0.3274,
"step": 1001
},
{
"epoch": 2.236607142857143,
"grad_norm": 1.3699195384979248,
"learning_rate": 3.503599397613394e-06,
"loss": 0.2845,
"step": 1002
},
{
"epoch": 2.2388392857142856,
"grad_norm": 1.0058083534240723,
"learning_rate": 3.4844210143421143e-06,
"loss": 0.3133,
"step": 1003
},
{
"epoch": 2.2410714285714284,
"grad_norm": 1.107692837715149,
"learning_rate": 3.465284184968679e-06,
"loss": 0.3459,
"step": 1004
},
{
"epoch": 2.2433035714285716,
"grad_norm": 1.0924514532089233,
"learning_rate": 3.4461890315405466e-06,
"loss": 0.3195,
"step": 1005
},
{
"epoch": 2.2455357142857144,
"grad_norm": 1.1005483865737915,
"learning_rate": 3.4271356758393827e-06,
"loss": 0.288,
"step": 1006
},
{
"epoch": 2.247767857142857,
"grad_norm": 1.0790880918502808,
"learning_rate": 3.4081242393802847e-06,
"loss": 0.3078,
"step": 1007
},
{
"epoch": 2.25,
"grad_norm": 1.0338419675827026,
"learning_rate": 3.3891548434109942e-06,
"loss": 0.3592,
"step": 1008
},
{
"epoch": 2.25,
"eval_loss": 0.6482473611831665,
"eval_runtime": 32.6288,
"eval_samples_per_second": 2.237,
"eval_steps_per_second": 0.306,
"step": 1008
},
{
"epoch": 2.252232142857143,
"grad_norm": 1.1008427143096924,
"learning_rate": 3.3702276089111484e-06,
"loss": 0.3108,
"step": 1009
},
{
"epoch": 2.2544642857142856,
"grad_norm": 1.160321831703186,
"learning_rate": 3.3513426565914854e-06,
"loss": 0.3412,
"step": 1010
},
{
"epoch": 2.2566964285714284,
"grad_norm": 1.04502534866333,
"learning_rate": 3.3325001068930917e-06,
"loss": 0.306,
"step": 1011
},
{
"epoch": 2.2589285714285716,
"grad_norm": 1.108601689338684,
"learning_rate": 3.3137000799866148e-06,
"loss": 0.2485,
"step": 1012
},
{
"epoch": 2.2611607142857144,
"grad_norm": 1.0433441400527954,
"learning_rate": 3.2949426957715157e-06,
"loss": 0.3673,
"step": 1013
},
{
"epoch": 2.263392857142857,
"grad_norm": 1.0202276706695557,
"learning_rate": 3.276228073875296e-06,
"loss": 0.3336,
"step": 1014
},
{
"epoch": 2.265625,
"grad_norm": 1.3703482151031494,
"learning_rate": 3.257556333652734e-06,
"loss": 0.287,
"step": 1015
},
{
"epoch": 2.267857142857143,
"grad_norm": 1.1855953931808472,
"learning_rate": 3.238927594185127e-06,
"loss": 0.334,
"step": 1016
},
{
"epoch": 2.2700892857142856,
"grad_norm": 1.0647932291030884,
"learning_rate": 3.2203419742795237e-06,
"loss": 0.3158,
"step": 1017
},
{
"epoch": 2.2723214285714284,
"grad_norm": 1.2551442384719849,
"learning_rate": 3.201799592467978e-06,
"loss": 0.34,
"step": 1018
},
{
"epoch": 2.2745535714285716,
"grad_norm": 1.2371532917022705,
"learning_rate": 3.1833005670067874e-06,
"loss": 0.3305,
"step": 1019
},
{
"epoch": 2.2767857142857144,
"grad_norm": 1.0982019901275635,
"learning_rate": 3.1648450158757373e-06,
"loss": 0.3204,
"step": 1020
},
{
"epoch": 2.279017857142857,
"grad_norm": 1.1620010137557983,
"learning_rate": 3.146433056777355e-06,
"loss": 0.3213,
"step": 1021
},
{
"epoch": 2.28125,
"grad_norm": 1.0344820022583008,
"learning_rate": 3.128064807136142e-06,
"loss": 0.2553,
"step": 1022
},
{
"epoch": 2.283482142857143,
"grad_norm": 0.9919111132621765,
"learning_rate": 3.10974038409785e-06,
"loss": 0.2734,
"step": 1023
},
{
"epoch": 2.2857142857142856,
"grad_norm": 1.1597861051559448,
"learning_rate": 3.0914599045287165e-06,
"loss": 0.2914,
"step": 1024
},
{
"epoch": 2.2879464285714284,
"grad_norm": 1.2358967065811157,
"learning_rate": 3.073223485014727e-06,
"loss": 0.3175,
"step": 1025
},
{
"epoch": 2.2901785714285716,
"grad_norm": 1.172170877456665,
"learning_rate": 3.0550312418608617e-06,
"loss": 0.3634,
"step": 1026
},
{
"epoch": 2.2924107142857144,
"grad_norm": 1.1412909030914307,
"learning_rate": 3.0368832910903625e-06,
"loss": 0.3065,
"step": 1027
},
{
"epoch": 2.294642857142857,
"grad_norm": 1.2911455631256104,
"learning_rate": 3.018779748444005e-06,
"loss": 0.3507,
"step": 1028
},
{
"epoch": 2.296875,
"grad_norm": 1.1728956699371338,
"learning_rate": 3.000720729379326e-06,
"loss": 0.2747,
"step": 1029
},
{
"epoch": 2.299107142857143,
"grad_norm": 1.1581259965896606,
"learning_rate": 2.9827063490699225e-06,
"loss": 0.2607,
"step": 1030
},
{
"epoch": 2.3013392857142856,
"grad_norm": 1.1151872873306274,
"learning_rate": 2.9647367224046884e-06,
"loss": 0.3373,
"step": 1031
},
{
"epoch": 2.3035714285714284,
"grad_norm": 1.1207554340362549,
"learning_rate": 2.9468119639871163e-06,
"loss": 0.2752,
"step": 1032
},
{
"epoch": 2.3058035714285716,
"grad_norm": 1.0638618469238281,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.2839,
"step": 1033
},
{
"epoch": 2.3080357142857144,
"grad_norm": 1.1098335981369019,
"learning_rate": 2.911097508877365e-06,
"loss": 0.3076,
"step": 1034
},
{
"epoch": 2.310267857142857,
"grad_norm": 1.0225090980529785,
"learning_rate": 2.8933080399584757e-06,
"loss": 0.3733,
"step": 1035
},
{
"epoch": 2.3125,
"grad_norm": 1.366018533706665,
"learning_rate": 2.8755638948323494e-06,
"loss": 0.3335,
"step": 1036
},
{
"epoch": 2.314732142857143,
"grad_norm": 1.2792249917984009,
"learning_rate": 2.8578651866644447e-06,
"loss": 0.3129,
"step": 1037
},
{
"epoch": 2.3169642857142856,
"grad_norm": 1.0269707441329956,
"learning_rate": 2.840212028330418e-06,
"loss": 0.2463,
"step": 1038
},
{
"epoch": 2.3191964285714284,
"grad_norm": 1.0969288349151611,
"learning_rate": 2.8226045324154394e-06,
"loss": 0.2923,
"step": 1039
},
{
"epoch": 2.3214285714285716,
"grad_norm": 1.0409966707229614,
"learning_rate": 2.8050428112134474e-06,
"loss": 0.3553,
"step": 1040
},
{
"epoch": 2.3236607142857144,
"grad_norm": 1.2081456184387207,
"learning_rate": 2.7875269767264667e-06,
"loss": 0.3003,
"step": 1041
},
{
"epoch": 2.325892857142857,
"grad_norm": 1.045913815498352,
"learning_rate": 2.7700571406638633e-06,
"loss": 0.2766,
"step": 1042
},
{
"epoch": 2.328125,
"grad_norm": 1.0660289525985718,
"learning_rate": 2.7526334144416345e-06,
"loss": 0.2689,
"step": 1043
},
{
"epoch": 2.330357142857143,
"grad_norm": 1.1659197807312012,
"learning_rate": 2.735255909181719e-06,
"loss": 0.2751,
"step": 1044
},
{
"epoch": 2.3325892857142856,
"grad_norm": 0.9567285776138306,
"learning_rate": 2.7179247357112704e-06,
"loss": 0.3007,
"step": 1045
},
{
"epoch": 2.3348214285714284,
"grad_norm": 1.0861024856567383,
"learning_rate": 2.7006400045619597e-06,
"loss": 0.2645,
"step": 1046
},
{
"epoch": 2.3370535714285716,
"grad_norm": 1.0163832902908325,
"learning_rate": 2.6834018259692574e-06,
"loss": 0.3232,
"step": 1047
},
{
"epoch": 2.3392857142857144,
"grad_norm": 1.087554931640625,
"learning_rate": 2.6662103098717485e-06,
"loss": 0.3177,
"step": 1048
},
{
"epoch": 2.341517857142857,
"grad_norm": 1.1598862409591675,
"learning_rate": 2.649065565910419e-06,
"loss": 0.3277,
"step": 1049
},
{
"epoch": 2.34375,
"grad_norm": 1.0833197832107544,
"learning_rate": 2.631967703427959e-06,
"loss": 0.3163,
"step": 1050
},
{
"epoch": 2.345982142857143,
"grad_norm": 1.1643481254577637,
"learning_rate": 2.6149168314680707e-06,
"loss": 0.2935,
"step": 1051
},
{
"epoch": 2.3482142857142856,
"grad_norm": 1.165391445159912,
"learning_rate": 2.597913058774758e-06,
"loss": 0.3332,
"step": 1052
},
{
"epoch": 2.3504464285714284,
"grad_norm": 1.015273928642273,
"learning_rate": 2.5809564937916543e-06,
"loss": 0.287,
"step": 1053
},
{
"epoch": 2.3526785714285716,
"grad_norm": 1.1589689254760742,
"learning_rate": 2.564047244661316e-06,
"loss": 0.3049,
"step": 1054
},
{
"epoch": 2.3549107142857144,
"grad_norm": 1.1673892736434937,
"learning_rate": 2.547185419224537e-06,
"loss": 0.2775,
"step": 1055
},
{
"epoch": 2.357142857142857,
"grad_norm": 1.0458134412765503,
"learning_rate": 2.530371125019664e-06,
"loss": 0.2951,
"step": 1056
},
{
"epoch": 2.359375,
"grad_norm": 1.2780659198760986,
"learning_rate": 2.513604469281897e-06,
"loss": 0.2946,
"step": 1057
},
{
"epoch": 2.361607142857143,
"grad_norm": 1.2554868459701538,
"learning_rate": 2.4968855589426288e-06,
"loss": 0.3346,
"step": 1058
},
{
"epoch": 2.3638392857142856,
"grad_norm": 1.0079058408737183,
"learning_rate": 2.4802145006287425e-06,
"loss": 0.2499,
"step": 1059
},
{
"epoch": 2.3660714285714284,
"grad_norm": 1.0529001951217651,
"learning_rate": 2.4635914006619454e-06,
"loss": 0.3005,
"step": 1060
},
{
"epoch": 2.3683035714285716,
"grad_norm": 1.2676379680633545,
"learning_rate": 2.4470163650580747e-06,
"loss": 0.3839,
"step": 1061
},
{
"epoch": 2.3705357142857144,
"grad_norm": 1.1082603931427002,
"learning_rate": 2.430489499526438e-06,
"loss": 0.3176,
"step": 1062
},
{
"epoch": 2.372767857142857,
"grad_norm": 1.0914406776428223,
"learning_rate": 2.414010909469133e-06,
"loss": 0.2938,
"step": 1063
},
{
"epoch": 2.375,
"grad_norm": 1.001238465309143,
"learning_rate": 2.3975806999803717e-06,
"loss": 0.3355,
"step": 1064
},
{
"epoch": 2.377232142857143,
"grad_norm": 1.0119569301605225,
"learning_rate": 2.38119897584582e-06,
"loss": 0.2893,
"step": 1065
},
{
"epoch": 2.3794642857142856,
"grad_norm": 1.0528173446655273,
"learning_rate": 2.364865841541908e-06,
"loss": 0.3065,
"step": 1066
},
{
"epoch": 2.3816964285714284,
"grad_norm": 1.0538735389709473,
"learning_rate": 2.3485814012351914e-06,
"loss": 0.3086,
"step": 1067
},
{
"epoch": 2.3839285714285716,
"grad_norm": 1.143399715423584,
"learning_rate": 2.33234575878167e-06,
"loss": 0.3536,
"step": 1068
},
{
"epoch": 2.3861607142857144,
"grad_norm": 1.137209177017212,
"learning_rate": 2.3161590177261294e-06,
"loss": 0.2712,
"step": 1069
},
{
"epoch": 2.388392857142857,
"grad_norm": 1.1341702938079834,
"learning_rate": 2.300021281301483e-06,
"loss": 0.305,
"step": 1070
},
{
"epoch": 2.390625,
"grad_norm": 1.000322937965393,
"learning_rate": 2.2839326524281037e-06,
"loss": 0.3311,
"step": 1071
},
{
"epoch": 2.392857142857143,
"grad_norm": 0.9652037620544434,
"learning_rate": 2.267893233713182e-06,
"loss": 0.33,
"step": 1072
},
{
"epoch": 2.3950892857142856,
"grad_norm": 0.9035744667053223,
"learning_rate": 2.2519031274500625e-06,
"loss": 0.2822,
"step": 1073
},
{
"epoch": 2.3973214285714284,
"grad_norm": 1.0602307319641113,
"learning_rate": 2.235962435617596e-06,
"loss": 0.2652,
"step": 1074
},
{
"epoch": 2.3995535714285716,
"grad_norm": 0.950262188911438,
"learning_rate": 2.2200712598794804e-06,
"loss": 0.2667,
"step": 1075
},
{
"epoch": 2.4017857142857144,
"grad_norm": 1.068719744682312,
"learning_rate": 2.204229701583621e-06,
"loss": 0.307,
"step": 1076
},
{
"epoch": 2.404017857142857,
"grad_norm": 1.082491397857666,
"learning_rate": 2.1884378617614933e-06,
"loss": 0.2429,
"step": 1077
},
{
"epoch": 2.40625,
"grad_norm": 1.1966094970703125,
"learning_rate": 2.172695841127468e-06,
"loss": 0.3158,
"step": 1078
},
{
"epoch": 2.408482142857143,
"grad_norm": 1.2111045122146606,
"learning_rate": 2.157003740078203e-06,
"loss": 0.279,
"step": 1079
},
{
"epoch": 2.4107142857142856,
"grad_norm": 1.0568370819091797,
"learning_rate": 2.141361658691975e-06,
"loss": 0.2984,
"step": 1080
},
{
"epoch": 2.4129464285714284,
"grad_norm": 1.1779588460922241,
"learning_rate": 2.1257696967280716e-06,
"loss": 0.2841,
"step": 1081
},
{
"epoch": 2.4151785714285716,
"grad_norm": 1.0319031476974487,
"learning_rate": 2.1102279536261193e-06,
"loss": 0.2793,
"step": 1082
},
{
"epoch": 2.4174107142857144,
"grad_norm": 0.9412409663200378,
"learning_rate": 2.09473652850548e-06,
"loss": 0.2851,
"step": 1083
},
{
"epoch": 2.419642857142857,
"grad_norm": 1.105271339416504,
"learning_rate": 2.0792955201646005e-06,
"loss": 0.2802,
"step": 1084
},
{
"epoch": 2.421875,
"grad_norm": 1.1076663732528687,
"learning_rate": 2.063905027080392e-06,
"loss": 0.3152,
"step": 1085
},
{
"epoch": 2.424107142857143,
"grad_norm": 0.9948890805244446,
"learning_rate": 2.0485651474075987e-06,
"loss": 0.3178,
"step": 1086
},
{
"epoch": 2.4263392857142856,
"grad_norm": 1.04751718044281,
"learning_rate": 2.033275978978164e-06,
"loss": 0.3219,
"step": 1087
},
{
"epoch": 2.4285714285714284,
"grad_norm": 1.0496200323104858,
"learning_rate": 2.018037619300628e-06,
"loss": 0.2937,
"step": 1088
},
{
"epoch": 2.4308035714285716,
"grad_norm": 0.9747620820999146,
"learning_rate": 2.0028501655594736e-06,
"loss": 0.3119,
"step": 1089
},
{
"epoch": 2.4330357142857144,
"grad_norm": 1.0099077224731445,
"learning_rate": 1.987713714614543e-06,
"loss": 0.2794,
"step": 1090
},
{
"epoch": 2.435267857142857,
"grad_norm": 0.9872100353240967,
"learning_rate": 1.972628363000396e-06,
"loss": 0.2875,
"step": 1091
},
{
"epoch": 2.4375,
"grad_norm": 0.9732192158699036,
"learning_rate": 1.9575942069256914e-06,
"loss": 0.2856,
"step": 1092
},
{
"epoch": 2.439732142857143,
"grad_norm": 1.5758808851242065,
"learning_rate": 1.942611342272591e-06,
"loss": 0.326,
"step": 1093
},
{
"epoch": 2.4419642857142856,
"grad_norm": 1.0248242616653442,
"learning_rate": 1.9276798645961392e-06,
"loss": 0.3052,
"step": 1094
},
{
"epoch": 2.4441964285714284,
"grad_norm": 1.0433028936386108,
"learning_rate": 1.9127998691236537e-06,
"loss": 0.2528,
"step": 1095
},
{
"epoch": 2.4464285714285716,
"grad_norm": 1.2717303037643433,
"learning_rate": 1.8979714507541103e-06,
"loss": 0.3272,
"step": 1096
},
{
"epoch": 2.4486607142857144,
"grad_norm": 1.105989694595337,
"learning_rate": 1.883194704057556e-06,
"loss": 0.3166,
"step": 1097
},
{
"epoch": 2.450892857142857,
"grad_norm": 0.9582863450050354,
"learning_rate": 1.8684697232744886e-06,
"loss": 0.3559,
"step": 1098
},
{
"epoch": 2.453125,
"grad_norm": 1.3236531019210815,
"learning_rate": 1.8537966023152664e-06,
"loss": 0.2735,
"step": 1099
},
{
"epoch": 2.455357142857143,
"grad_norm": 1.1173170804977417,
"learning_rate": 1.839175434759507e-06,
"loss": 0.2657,
"step": 1100
},
{
"epoch": 2.4575892857142856,
"grad_norm": 1.127776026725769,
"learning_rate": 1.8246063138554793e-06,
"loss": 0.3515,
"step": 1101
},
{
"epoch": 2.4598214285714284,
"grad_norm": 1.1345889568328857,
"learning_rate": 1.810089332519528e-06,
"loss": 0.3252,
"step": 1102
},
{
"epoch": 2.4620535714285716,
"grad_norm": 1.0312929153442383,
"learning_rate": 1.795624583335467e-06,
"loss": 0.2631,
"step": 1103
},
{
"epoch": 2.4642857142857144,
"grad_norm": 1.1354609727859497,
"learning_rate": 1.7812121585539964e-06,
"loss": 0.3394,
"step": 1104
},
{
"epoch": 2.466517857142857,
"grad_norm": 0.9982307553291321,
"learning_rate": 1.7668521500921098e-06,
"loss": 0.3204,
"step": 1105
},
{
"epoch": 2.46875,
"grad_norm": 1.3083230257034302,
"learning_rate": 1.7525446495325038e-06,
"loss": 0.3237,
"step": 1106
},
{
"epoch": 2.470982142857143,
"grad_norm": 1.1162062883377075,
"learning_rate": 1.7382897481230076e-06,
"loss": 0.2478,
"step": 1107
},
{
"epoch": 2.4732142857142856,
"grad_norm": 1.1414133310317993,
"learning_rate": 1.7240875367759902e-06,
"loss": 0.3077,
"step": 1108
},
{
"epoch": 2.4754464285714284,
"grad_norm": 1.2450937032699585,
"learning_rate": 1.7099381060677833e-06,
"loss": 0.3084,
"step": 1109
},
{
"epoch": 2.4776785714285716,
"grad_norm": 0.9135916233062744,
"learning_rate": 1.6958415462380983e-06,
"loss": 0.2893,
"step": 1110
},
{
"epoch": 2.4799107142857144,
"grad_norm": 1.0570931434631348,
"learning_rate": 1.6817979471894641e-06,
"loss": 0.2603,
"step": 1111
},
{
"epoch": 2.482142857142857,
"grad_norm": 1.3998193740844727,
"learning_rate": 1.6678073984866438e-06,
"loss": 0.3685,
"step": 1112
},
{
"epoch": 2.484375,
"grad_norm": 1.2341136932373047,
"learning_rate": 1.6538699893560618e-06,
"loss": 0.3341,
"step": 1113
},
{
"epoch": 2.486607142857143,
"grad_norm": 1.1154083013534546,
"learning_rate": 1.639985808685245e-06,
"loss": 0.3208,
"step": 1114
},
{
"epoch": 2.4888392857142856,
"grad_norm": 1.3637317419052124,
"learning_rate": 1.6261549450222392e-06,
"loss": 0.3564,
"step": 1115
},
{
"epoch": 2.4910714285714284,
"grad_norm": 0.9546897411346436,
"learning_rate": 1.6123774865750607e-06,
"loss": 0.2526,
"step": 1116
},
{
"epoch": 2.4933035714285716,
"grad_norm": 1.1718556880950928,
"learning_rate": 1.5986535212111353e-06,
"loss": 0.3031,
"step": 1117
},
{
"epoch": 2.4955357142857144,
"grad_norm": 1.1763498783111572,
"learning_rate": 1.5849831364567137e-06,
"loss": 0.2933,
"step": 1118
},
{
"epoch": 2.497767857142857,
"grad_norm": 1.3048385381698608,
"learning_rate": 1.571366419496344e-06,
"loss": 0.3144,
"step": 1119
},
{
"epoch": 2.5,
"grad_norm": 1.1232630014419556,
"learning_rate": 1.5578034571722879e-06,
"loss": 0.3085,
"step": 1120
},
{
"epoch": 2.5,
"eval_loss": 0.6504756212234497,
"eval_runtime": 34.8268,
"eval_samples_per_second": 2.096,
"eval_steps_per_second": 0.287,
"step": 1120
},
{
"epoch": 2.502232142857143,
"grad_norm": 1.248914122581482,
"learning_rate": 1.5442943359839978e-06,
"loss": 0.3111,
"step": 1121
},
{
"epoch": 2.5044642857142856,
"grad_norm": 1.2444794178009033,
"learning_rate": 1.5308391420875312e-06,
"loss": 0.3108,
"step": 1122
},
{
"epoch": 2.506696428571429,
"grad_norm": 1.0684040784835815,
"learning_rate": 1.5174379612950273e-06,
"loss": 0.2805,
"step": 1123
},
{
"epoch": 2.508928571428571,
"grad_norm": 1.049082636833191,
"learning_rate": 1.5040908790741448e-06,
"loss": 0.3263,
"step": 1124
},
{
"epoch": 2.5111607142857144,
"grad_norm": 0.9625985026359558,
"learning_rate": 1.490797980547528e-06,
"loss": 0.2914,
"step": 1125
},
{
"epoch": 2.513392857142857,
"grad_norm": 1.173850178718567,
"learning_rate": 1.4775593504922547e-06,
"loss": 0.3015,
"step": 1126
},
{
"epoch": 2.515625,
"grad_norm": 1.032834529876709,
"learning_rate": 1.4643750733392958e-06,
"loss": 0.3199,
"step": 1127
},
{
"epoch": 2.517857142857143,
"grad_norm": 1.14642333984375,
"learning_rate": 1.4512452331729864e-06,
"loss": 0.3114,
"step": 1128
},
{
"epoch": 2.5200892857142856,
"grad_norm": 1.0797921419143677,
"learning_rate": 1.438169913730475e-06,
"loss": 0.3425,
"step": 1129
},
{
"epoch": 2.522321428571429,
"grad_norm": 0.9763374328613281,
"learning_rate": 1.4251491984012089e-06,
"loss": 0.3186,
"step": 1130
},
{
"epoch": 2.524553571428571,
"grad_norm": 1.1403617858886719,
"learning_rate": 1.4121831702263833e-06,
"loss": 0.3343,
"step": 1131
},
{
"epoch": 2.5267857142857144,
"grad_norm": 1.1691458225250244,
"learning_rate": 1.3992719118984167e-06,
"loss": 0.308,
"step": 1132
},
{
"epoch": 2.529017857142857,
"grad_norm": 1.1367650032043457,
"learning_rate": 1.3864155057604323e-06,
"loss": 0.3668,
"step": 1133
},
{
"epoch": 2.53125,
"grad_norm": 1.1581871509552002,
"learning_rate": 1.3736140338057247e-06,
"loss": 0.3477,
"step": 1134
},
{
"epoch": 2.533482142857143,
"grad_norm": 1.1503748893737793,
"learning_rate": 1.3608675776772428e-06,
"loss": 0.3597,
"step": 1135
},
{
"epoch": 2.5357142857142856,
"grad_norm": 1.056081771850586,
"learning_rate": 1.3481762186670556e-06,
"loss": 0.341,
"step": 1136
},
{
"epoch": 2.537946428571429,
"grad_norm": 1.1978353261947632,
"learning_rate": 1.335540037715851e-06,
"loss": 0.3677,
"step": 1137
},
{
"epoch": 2.540178571428571,
"grad_norm": 1.1892170906066895,
"learning_rate": 1.3229591154124132e-06,
"loss": 0.2895,
"step": 1138
},
{
"epoch": 2.5424107142857144,
"grad_norm": 0.9958797693252563,
"learning_rate": 1.310433531993104e-06,
"loss": 0.3434,
"step": 1139
},
{
"epoch": 2.544642857142857,
"grad_norm": 1.1225999593734741,
"learning_rate": 1.2979633673413571e-06,
"loss": 0.2626,
"step": 1140
},
{
"epoch": 2.546875,
"grad_norm": 1.121170997619629,
"learning_rate": 1.2855487009871615e-06,
"loss": 0.2736,
"step": 1141
},
{
"epoch": 2.549107142857143,
"grad_norm": 1.124768853187561,
"learning_rate": 1.2731896121065645e-06,
"loss": 0.3502,
"step": 1142
},
{
"epoch": 2.5513392857142856,
"grad_norm": 1.1074433326721191,
"learning_rate": 1.2608861795211601e-06,
"loss": 0.3724,
"step": 1143
},
{
"epoch": 2.553571428571429,
"grad_norm": 1.0048129558563232,
"learning_rate": 1.248638481697586e-06,
"loss": 0.3128,
"step": 1144
},
{
"epoch": 2.555803571428571,
"grad_norm": 1.2129403352737427,
"learning_rate": 1.2364465967470284e-06,
"loss": 0.3048,
"step": 1145
},
{
"epoch": 2.5580357142857144,
"grad_norm": 1.4070841073989868,
"learning_rate": 1.224310602424712e-06,
"loss": 0.3082,
"step": 1146
},
{
"epoch": 2.560267857142857,
"grad_norm": 1.1774101257324219,
"learning_rate": 1.2122305761294196e-06,
"loss": 0.3184,
"step": 1147
},
{
"epoch": 2.5625,
"grad_norm": 1.1185553073883057,
"learning_rate": 1.2002065949029896e-06,
"loss": 0.259,
"step": 1148
},
{
"epoch": 2.564732142857143,
"grad_norm": 1.0791926383972168,
"learning_rate": 1.1882387354298264e-06,
"loss": 0.2782,
"step": 1149
},
{
"epoch": 2.5669642857142856,
"grad_norm": 1.1294567584991455,
"learning_rate": 1.1763270740364074e-06,
"loss": 0.2585,
"step": 1150
},
{
"epoch": 2.569196428571429,
"grad_norm": 1.1617059707641602,
"learning_rate": 1.1644716866908035e-06,
"loss": 0.397,
"step": 1151
},
{
"epoch": 2.571428571428571,
"grad_norm": 1.1602836847305298,
"learning_rate": 1.15267264900219e-06,
"loss": 0.2723,
"step": 1152
},
{
"epoch": 2.5736607142857144,
"grad_norm": 1.0874189138412476,
"learning_rate": 1.1409300362203667e-06,
"loss": 0.3081,
"step": 1153
},
{
"epoch": 2.575892857142857,
"grad_norm": 1.186552882194519,
"learning_rate": 1.1292439232352781e-06,
"loss": 0.3096,
"step": 1154
},
{
"epoch": 2.578125,
"grad_norm": 1.004425287246704,
"learning_rate": 1.1176143845765253e-06,
"loss": 0.2703,
"step": 1155
},
{
"epoch": 2.580357142857143,
"grad_norm": 1.0555604696273804,
"learning_rate": 1.1060414944129106e-06,
"loss": 0.3055,
"step": 1156
},
{
"epoch": 2.5825892857142856,
"grad_norm": 1.0088086128234863,
"learning_rate": 1.0945253265519472e-06,
"loss": 0.3204,
"step": 1157
},
{
"epoch": 2.584821428571429,
"grad_norm": 1.1574110984802246,
"learning_rate": 1.0830659544393996e-06,
"loss": 0.3007,
"step": 1158
},
{
"epoch": 2.587053571428571,
"grad_norm": 1.011350154876709,
"learning_rate": 1.0716634511588076e-06,
"loss": 0.2853,
"step": 1159
},
{
"epoch": 2.5892857142857144,
"grad_norm": 1.0175875425338745,
"learning_rate": 1.0603178894310185e-06,
"loss": 0.326,
"step": 1160
},
{
"epoch": 2.591517857142857,
"grad_norm": 1.1453709602355957,
"learning_rate": 1.0490293416137409e-06,
"loss": 0.2979,
"step": 1161
},
{
"epoch": 2.59375,
"grad_norm": 0.9553859233856201,
"learning_rate": 1.0377978797010558e-06,
"loss": 0.2825,
"step": 1162
},
{
"epoch": 2.595982142857143,
"grad_norm": 1.2206124067306519,
"learning_rate": 1.0266235753229825e-06,
"loss": 0.3796,
"step": 1163
},
{
"epoch": 2.5982142857142856,
"grad_norm": 1.2103142738342285,
"learning_rate": 1.0155064997450026e-06,
"loss": 0.2994,
"step": 1164
},
{
"epoch": 2.600446428571429,
"grad_norm": 0.9963259696960449,
"learning_rate": 1.004446723867618e-06,
"loss": 0.3351,
"step": 1165
},
{
"epoch": 2.602678571428571,
"grad_norm": 1.100026249885559,
"learning_rate": 9.934443182259023e-07,
"loss": 0.3307,
"step": 1166
},
{
"epoch": 2.6049107142857144,
"grad_norm": 0.9989385008811951,
"learning_rate": 9.824993529890303e-07,
"loss": 0.3261,
"step": 1167
},
{
"epoch": 2.607142857142857,
"grad_norm": 0.9991337656974792,
"learning_rate": 9.716118979598533e-07,
"loss": 0.3303,
"step": 1168
},
{
"epoch": 2.609375,
"grad_norm": 1.0319366455078125,
"learning_rate": 9.607820225744346e-07,
"loss": 0.3108,
"step": 1169
},
{
"epoch": 2.611607142857143,
"grad_norm": 1.1170539855957031,
"learning_rate": 9.500097959016297e-07,
"loss": 0.3182,
"step": 1170
},
{
"epoch": 2.6138392857142856,
"grad_norm": 1.0289571285247803,
"learning_rate": 9.392952866426198e-07,
"loss": 0.348,
"step": 1171
},
{
"epoch": 2.616071428571429,
"grad_norm": 1.1711770296096802,
"learning_rate": 9.286385631304939e-07,
"loss": 0.325,
"step": 1172
},
{
"epoch": 2.618303571428571,
"grad_norm": 1.0985348224639893,
"learning_rate": 9.180396933298019e-07,
"loss": 0.3667,
"step": 1173
},
{
"epoch": 2.6205357142857144,
"grad_norm": 1.128706455230713,
"learning_rate": 9.074987448361261e-07,
"loss": 0.3939,
"step": 1174
},
{
"epoch": 2.622767857142857,
"grad_norm": 1.119606852531433,
"learning_rate": 8.970157848756511e-07,
"loss": 0.2845,
"step": 1175
},
{
"epoch": 2.625,
"grad_norm": 1.1094435453414917,
"learning_rate": 8.865908803047241e-07,
"loss": 0.3649,
"step": 1176
},
{
"epoch": 2.627232142857143,
"grad_norm": 1.1740772724151611,
"learning_rate": 8.762240976094461e-07,
"loss": 0.3225,
"step": 1177
},
{
"epoch": 2.6294642857142856,
"grad_norm": 1.083766222000122,
"learning_rate": 8.659155029052346e-07,
"loss": 0.2813,
"step": 1178
},
{
"epoch": 2.631696428571429,
"grad_norm": 1.0689512491226196,
"learning_rate": 8.556651619364065e-07,
"loss": 0.2851,
"step": 1179
},
{
"epoch": 2.633928571428571,
"grad_norm": 1.105716586112976,
"learning_rate": 8.454731400757599e-07,
"loss": 0.2946,
"step": 1180
},
{
"epoch": 2.6361607142857144,
"grad_norm": 1.143965244293213,
"learning_rate": 8.353395023241528e-07,
"loss": 0.2949,
"step": 1181
},
{
"epoch": 2.638392857142857,
"grad_norm": 0.8723753690719604,
"learning_rate": 8.252643133100935e-07,
"loss": 0.28,
"step": 1182
},
{
"epoch": 2.640625,
"grad_norm": 1.0619475841522217,
"learning_rate": 8.152476372893259e-07,
"loss": 0.3412,
"step": 1183
},
{
"epoch": 2.642857142857143,
"grad_norm": 1.1905899047851562,
"learning_rate": 8.052895381444226e-07,
"loss": 0.2769,
"step": 1184
},
{
"epoch": 2.6450892857142856,
"grad_norm": 1.1052501201629639,
"learning_rate": 7.953900793843694e-07,
"loss": 0.2905,
"step": 1185
},
{
"epoch": 2.647321428571429,
"grad_norm": 1.066614031791687,
"learning_rate": 7.855493241441692e-07,
"loss": 0.2377,
"step": 1186
},
{
"epoch": 2.649553571428571,
"grad_norm": 1.2181105613708496,
"learning_rate": 7.757673351844386e-07,
"loss": 0.3052,
"step": 1187
},
{
"epoch": 2.6517857142857144,
"grad_norm": 1.0465307235717773,
"learning_rate": 7.660441748909997e-07,
"loss": 0.3296,
"step": 1188
},
{
"epoch": 2.654017857142857,
"grad_norm": 0.9089909195899963,
"learning_rate": 7.563799052744947e-07,
"loss": 0.266,
"step": 1189
},
{
"epoch": 2.65625,
"grad_norm": 0.9568466544151306,
"learning_rate": 7.46774587969975e-07,
"loss": 0.3503,
"step": 1190
},
{
"epoch": 2.658482142857143,
"grad_norm": 1.032265067100525,
"learning_rate": 7.372282842365208e-07,
"loss": 0.2716,
"step": 1191
},
{
"epoch": 2.6607142857142856,
"grad_norm": 1.2572873830795288,
"learning_rate": 7.277410549568476e-07,
"loss": 0.3476,
"step": 1192
},
{
"epoch": 2.662946428571429,
"grad_norm": 1.076196312904358,
"learning_rate": 7.183129606369133e-07,
"loss": 0.3222,
"step": 1193
},
{
"epoch": 2.665178571428571,
"grad_norm": 1.0198694467544556,
"learning_rate": 7.089440614055398e-07,
"loss": 0.257,
"step": 1194
},
{
"epoch": 2.6674107142857144,
"grad_norm": 1.1174793243408203,
"learning_rate": 6.996344170140168e-07,
"loss": 0.3542,
"step": 1195
},
{
"epoch": 2.669642857142857,
"grad_norm": 1.0214918851852417,
"learning_rate": 6.903840868357382e-07,
"loss": 0.3026,
"step": 1196
},
{
"epoch": 2.671875,
"grad_norm": 1.2186428308486938,
"learning_rate": 6.811931298658092e-07,
"loss": 0.281,
"step": 1197
},
{
"epoch": 2.674107142857143,
"grad_norm": 1.0608640909194946,
"learning_rate": 6.720616047206774e-07,
"loss": 0.3752,
"step": 1198
},
{
"epoch": 2.6763392857142856,
"grad_norm": 1.1944961547851562,
"learning_rate": 6.62989569637752e-07,
"loss": 0.2898,
"step": 1199
},
{
"epoch": 2.678571428571429,
"grad_norm": 0.9049682021141052,
"learning_rate": 6.539770824750447e-07,
"loss": 0.2757,
"step": 1200
},
{
"epoch": 2.680803571428571,
"grad_norm": 0.9544935822486877,
"learning_rate": 6.450242007107865e-07,
"loss": 0.2786,
"step": 1201
},
{
"epoch": 2.6830357142857144,
"grad_norm": 1.1401265859603882,
"learning_rate": 6.361309814430727e-07,
"loss": 0.2984,
"step": 1202
},
{
"epoch": 2.685267857142857,
"grad_norm": 0.852730929851532,
"learning_rate": 6.272974813894905e-07,
"loss": 0.2265,
"step": 1203
},
{
"epoch": 2.6875,
"grad_norm": 0.9403284788131714,
"learning_rate": 6.185237568867597e-07,
"loss": 0.2965,
"step": 1204
},
{
"epoch": 2.689732142857143,
"grad_norm": 0.9182292222976685,
"learning_rate": 6.098098638903771e-07,
"loss": 0.2878,
"step": 1205
},
{
"epoch": 2.6919642857142856,
"grad_norm": 0.9199351072311401,
"learning_rate": 6.01155857974256e-07,
"loss": 0.2658,
"step": 1206
},
{
"epoch": 2.694196428571429,
"grad_norm": 1.2709399461746216,
"learning_rate": 5.925617943303719e-07,
"loss": 0.3033,
"step": 1207
},
{
"epoch": 2.696428571428571,
"grad_norm": 1.1471501588821411,
"learning_rate": 5.840277277684136e-07,
"loss": 0.2973,
"step": 1208
},
{
"epoch": 2.6986607142857144,
"grad_norm": 1.1023441553115845,
"learning_rate": 5.755537127154231e-07,
"loss": 0.3461,
"step": 1209
},
{
"epoch": 2.700892857142857,
"grad_norm": 1.036736249923706,
"learning_rate": 5.671398032154707e-07,
"loss": 0.3071,
"step": 1210
},
{
"epoch": 2.703125,
"grad_norm": 1.0169968605041504,
"learning_rate": 5.58786052929281e-07,
"loss": 0.3106,
"step": 1211
},
{
"epoch": 2.705357142857143,
"grad_norm": 0.9377234578132629,
"learning_rate": 5.504925151339191e-07,
"loss": 0.31,
"step": 1212
},
{
"epoch": 2.7075892857142856,
"grad_norm": 1.1917829513549805,
"learning_rate": 5.422592427224239e-07,
"loss": 0.3508,
"step": 1213
},
{
"epoch": 2.709821428571429,
"grad_norm": 1.1036509275436401,
"learning_rate": 5.340862882034992e-07,
"loss": 0.2706,
"step": 1214
},
{
"epoch": 2.712053571428571,
"grad_norm": 1.0939146280288696,
"learning_rate": 5.259737037011547e-07,
"loss": 0.3306,
"step": 1215
},
{
"epoch": 2.7142857142857144,
"grad_norm": 1.6331071853637695,
"learning_rate": 5.179215409543848e-07,
"loss": 0.3035,
"step": 1216
},
{
"epoch": 2.716517857142857,
"grad_norm": 1.2998929023742676,
"learning_rate": 5.099298513168382e-07,
"loss": 0.3532,
"step": 1217
},
{
"epoch": 2.71875,
"grad_norm": 1.013765573501587,
"learning_rate": 5.01998685756484e-07,
"loss": 0.3092,
"step": 1218
},
{
"epoch": 2.720982142857143,
"grad_norm": 1.1098829507827759,
"learning_rate": 4.941280948553018e-07,
"loss": 0.3248,
"step": 1219
},
{
"epoch": 2.7232142857142856,
"grad_norm": 1.028980016708374,
"learning_rate": 4.863181288089391e-07,
"loss": 0.3397,
"step": 1220
},
{
"epoch": 2.725446428571429,
"grad_norm": 1.1095798015594482,
"learning_rate": 4.785688374264053e-07,
"loss": 0.3002,
"step": 1221
},
{
"epoch": 2.727678571428571,
"grad_norm": 1.1329293251037598,
"learning_rate": 4.708802701297499e-07,
"loss": 0.3509,
"step": 1222
},
{
"epoch": 2.7299107142857144,
"grad_norm": 1.1559255123138428,
"learning_rate": 4.632524759537449e-07,
"loss": 0.3122,
"step": 1223
},
{
"epoch": 2.732142857142857,
"grad_norm": 1.0540506839752197,
"learning_rate": 4.556855035455787e-07,
"loss": 0.3124,
"step": 1224
},
{
"epoch": 2.734375,
"grad_norm": 1.0515660047531128,
"learning_rate": 4.481794011645368e-07,
"loss": 0.3451,
"step": 1225
},
{
"epoch": 2.736607142857143,
"grad_norm": 1.035614013671875,
"learning_rate": 4.407342166816997e-07,
"loss": 0.277,
"step": 1226
},
{
"epoch": 2.7388392857142856,
"grad_norm": 1.2367432117462158,
"learning_rate": 4.3334999757963734e-07,
"loss": 0.3876,
"step": 1227
},
{
"epoch": 2.741071428571429,
"grad_norm": 1.1165615320205688,
"learning_rate": 4.2602679095210766e-07,
"loss": 0.3484,
"step": 1228
},
{
"epoch": 2.743303571428571,
"grad_norm": 0.900805652141571,
"learning_rate": 4.187646435037529e-07,
"loss": 0.2895,
"step": 1229
},
{
"epoch": 2.7455357142857144,
"grad_norm": 1.1723212003707886,
"learning_rate": 4.1156360154979813e-07,
"loss": 0.3315,
"step": 1230
},
{
"epoch": 2.747767857142857,
"grad_norm": 0.971288800239563,
"learning_rate": 4.044237110157667e-07,
"loss": 0.3146,
"step": 1231
},
{
"epoch": 2.75,
"grad_norm": 1.0520827770233154,
"learning_rate": 3.9734501743717956e-07,
"loss": 0.3277,
"step": 1232
},
{
"epoch": 2.75,
"eval_loss": 0.6504682898521423,
"eval_runtime": 25.5149,
"eval_samples_per_second": 2.861,
"eval_steps_per_second": 0.392,
"step": 1232
},
{
"epoch": 2.752232142857143,
"grad_norm": 1.031693458557129,
"learning_rate": 3.9032756595926755e-07,
"loss": 0.3002,
"step": 1233
},
{
"epoch": 2.7544642857142856,
"grad_norm": 1.1060569286346436,
"learning_rate": 3.833714013366796e-07,
"loss": 0.2792,
"step": 1234
},
{
"epoch": 2.756696428571429,
"grad_norm": 1.0176774263381958,
"learning_rate": 3.7647656793320164e-07,
"loss": 0.2915,
"step": 1235
},
{
"epoch": 2.758928571428571,
"grad_norm": 1.0510061979293823,
"learning_rate": 3.696431097214748e-07,
"loss": 0.3348,
"step": 1236
},
{
"epoch": 2.7611607142857144,
"grad_norm": 1.1525824069976807,
"learning_rate": 3.628710702827076e-07,
"loss": 0.3144,
"step": 1237
},
{
"epoch": 2.763392857142857,
"grad_norm": 1.0995824337005615,
"learning_rate": 3.5616049280640995e-07,
"loss": 0.2969,
"step": 1238
},
{
"epoch": 2.765625,
"grad_norm": 1.255507469177246,
"learning_rate": 3.4951142009010173e-07,
"loss": 0.3467,
"step": 1239
},
{
"epoch": 2.767857142857143,
"grad_norm": 1.0282824039459229,
"learning_rate": 3.429238945390556e-07,
"loss": 0.3324,
"step": 1240
},
{
"epoch": 2.7700892857142856,
"grad_norm": 0.9991137385368347,
"learning_rate": 3.3639795816601705e-07,
"loss": 0.3323,
"step": 1241
},
{
"epoch": 2.772321428571429,
"grad_norm": 0.9604209065437317,
"learning_rate": 3.299336525909391e-07,
"loss": 0.2618,
"step": 1242
},
{
"epoch": 2.774553571428571,
"grad_norm": 1.1453895568847656,
"learning_rate": 3.235310190407182e-07,
"loss": 0.2599,
"step": 1243
},
{
"epoch": 2.7767857142857144,
"grad_norm": 0.890120804309845,
"learning_rate": 3.171900983489273e-07,
"loss": 0.2831,
"step": 1244
},
{
"epoch": 2.779017857142857,
"grad_norm": 1.039947509765625,
"learning_rate": 3.109109309555602e-07,
"loss": 0.3081,
"step": 1245
},
{
"epoch": 2.78125,
"grad_norm": 1.0824922323226929,
"learning_rate": 3.0469355690677216e-07,
"loss": 0.3286,
"step": 1246
},
{
"epoch": 2.783482142857143,
"grad_norm": 1.1116937398910522,
"learning_rate": 2.985380158546236e-07,
"loss": 0.2822,
"step": 1247
},
{
"epoch": 2.7857142857142856,
"grad_norm": 1.1681832075119019,
"learning_rate": 2.9244434705682276e-07,
"loss": 0.2968,
"step": 1248
},
{
"epoch": 2.787946428571429,
"grad_norm": 1.229740858078003,
"learning_rate": 2.8641258937648577e-07,
"loss": 0.2954,
"step": 1249
},
{
"epoch": 2.790178571428571,
"grad_norm": 1.1749082803726196,
"learning_rate": 2.8044278128188327e-07,
"loss": 0.3335,
"step": 1250
},
{
"epoch": 2.7924107142857144,
"grad_norm": 1.116082787513733,
"learning_rate": 2.7453496084619116e-07,
"loss": 0.3388,
"step": 1251
},
{
"epoch": 2.794642857142857,
"grad_norm": 1.1051106452941895,
"learning_rate": 2.6868916574725347e-07,
"loss": 0.3184,
"step": 1252
},
{
"epoch": 2.796875,
"grad_norm": 0.9878678321838379,
"learning_rate": 2.6290543326733865e-07,
"loss": 0.306,
"step": 1253
},
{
"epoch": 2.799107142857143,
"grad_norm": 1.0825244188308716,
"learning_rate": 2.571838002929061e-07,
"loss": 0.3561,
"step": 1254
},
{
"epoch": 2.8013392857142856,
"grad_norm": 1.0878150463104248,
"learning_rate": 2.515243033143644e-07,
"loss": 0.2942,
"step": 1255
},
{
"epoch": 2.803571428571429,
"grad_norm": 1.051950454711914,
"learning_rate": 2.459269784258467e-07,
"loss": 0.321,
"step": 1256
},
{
"epoch": 2.805803571428571,
"grad_norm": 1.1179208755493164,
"learning_rate": 2.4039186132497226e-07,
"loss": 0.3436,
"step": 1257
},
{
"epoch": 2.8080357142857144,
"grad_norm": 1.0265306234359741,
"learning_rate": 2.349189873126223e-07,
"loss": 0.3321,
"step": 1258
},
{
"epoch": 2.810267857142857,
"grad_norm": 1.211876630783081,
"learning_rate": 2.2950839129272096e-07,
"loss": 0.3464,
"step": 1259
},
{
"epoch": 2.8125,
"grad_norm": 1.05633544921875,
"learning_rate": 2.2416010777199904e-07,
"loss": 0.3155,
"step": 1260
},
{
"epoch": 2.814732142857143,
"grad_norm": 1.1148322820663452,
"learning_rate": 2.1887417085978745e-07,
"loss": 0.2922,
"step": 1261
},
{
"epoch": 2.8169642857142856,
"grad_norm": 1.1178148984909058,
"learning_rate": 2.1365061426778967e-07,
"loss": 0.3572,
"step": 1262
},
{
"epoch": 2.819196428571429,
"grad_norm": 1.076788306236267,
"learning_rate": 2.0848947130987617e-07,
"loss": 0.3669,
"step": 1263
},
{
"epoch": 2.821428571428571,
"grad_norm": 1.0222539901733398,
"learning_rate": 2.0339077490186488e-07,
"loss": 0.2447,
"step": 1264
},
{
"epoch": 2.8236607142857144,
"grad_norm": 1.1097285747528076,
"learning_rate": 1.9835455756130995e-07,
"loss": 0.3211,
"step": 1265
},
{
"epoch": 2.825892857142857,
"grad_norm": 1.0979808568954468,
"learning_rate": 1.93380851407301e-07,
"loss": 0.3193,
"step": 1266
},
{
"epoch": 2.828125,
"grad_norm": 1.0203653573989868,
"learning_rate": 1.8846968816025434e-07,
"loss": 0.2787,
"step": 1267
},
{
"epoch": 2.830357142857143,
"grad_norm": 1.2996318340301514,
"learning_rate": 1.83621099141712e-07,
"loss": 0.3012,
"step": 1268
},
{
"epoch": 2.8325892857142856,
"grad_norm": 1.303546667098999,
"learning_rate": 1.7883511527414078e-07,
"loss": 0.3176,
"step": 1269
},
{
"epoch": 2.834821428571429,
"grad_norm": 1.072313666343689,
"learning_rate": 1.741117670807335e-07,
"loss": 0.3456,
"step": 1270
},
{
"epoch": 2.837053571428571,
"grad_norm": 1.0661609172821045,
"learning_rate": 1.694510846852193e-07,
"loss": 0.3156,
"step": 1271
},
{
"epoch": 2.8392857142857144,
"grad_norm": 1.1145730018615723,
"learning_rate": 1.648530978116658e-07,
"loss": 0.3167,
"step": 1272
},
{
"epoch": 2.841517857142857,
"grad_norm": 1.2349388599395752,
"learning_rate": 1.6031783578429605e-07,
"loss": 0.3167,
"step": 1273
},
{
"epoch": 2.84375,
"grad_norm": 1.0584666728973389,
"learning_rate": 1.558453275272942e-07,
"loss": 0.3261,
"step": 1274
},
{
"epoch": 2.845982142857143,
"grad_norm": 1.3490861654281616,
"learning_rate": 1.5143560156462567e-07,
"loss": 0.307,
"step": 1275
},
{
"epoch": 2.8482142857142856,
"grad_norm": 1.16843581199646,
"learning_rate": 1.4708868601985503e-07,
"loss": 0.3004,
"step": 1276
},
{
"epoch": 2.850446428571429,
"grad_norm": 1.0229218006134033,
"learning_rate": 1.4280460861596513e-07,
"loss": 0.2641,
"step": 1277
},
{
"epoch": 2.852678571428571,
"grad_norm": 1.0380784273147583,
"learning_rate": 1.385833966751815e-07,
"loss": 0.2836,
"step": 1278
},
{
"epoch": 2.8549107142857144,
"grad_norm": 1.2049779891967773,
"learning_rate": 1.3442507711879494e-07,
"loss": 0.3272,
"step": 1279
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.1403696537017822,
"learning_rate": 1.303296764669959e-07,
"loss": 0.4074,
"step": 1280
},
{
"epoch": 2.859375,
"grad_norm": 1.1046922206878662,
"learning_rate": 1.2629722083870033e-07,
"loss": 0.3132,
"step": 1281
},
{
"epoch": 2.861607142857143,
"grad_norm": 1.0592225790023804,
"learning_rate": 1.2232773595138415e-07,
"loss": 0.2396,
"step": 1282
},
{
"epoch": 2.8638392857142856,
"grad_norm": 1.0567072629928589,
"learning_rate": 1.1842124712092117e-07,
"loss": 0.2843,
"step": 1283
},
{
"epoch": 2.866071428571429,
"grad_norm": 0.9778628349304199,
"learning_rate": 1.1457777926141889e-07,
"loss": 0.2609,
"step": 1284
},
{
"epoch": 2.868303571428571,
"grad_norm": 1.0432276725769043,
"learning_rate": 1.1079735688506065e-07,
"loss": 0.3249,
"step": 1285
},
{
"epoch": 2.8705357142857144,
"grad_norm": 1.0415635108947754,
"learning_rate": 1.0708000410195041e-07,
"loss": 0.2822,
"step": 1286
},
{
"epoch": 2.872767857142857,
"grad_norm": 0.9630635976791382,
"learning_rate": 1.0342574461995936e-07,
"loss": 0.2646,
"step": 1287
},
{
"epoch": 2.875,
"grad_norm": 1.06930410861969,
"learning_rate": 9.98346017445706e-08,
"loss": 0.2927,
"step": 1288
},
{
"epoch": 2.877232142857143,
"grad_norm": 0.973823606967926,
"learning_rate": 9.630659837873368e-08,
"loss": 0.3139,
"step": 1289
},
{
"epoch": 2.8794642857142856,
"grad_norm": 1.0439759492874146,
"learning_rate": 9.284175702272246e-08,
"loss": 0.2869,
"step": 1290
},
{
"epoch": 2.881696428571429,
"grad_norm": 1.300384283065796,
"learning_rate": 8.944009977398083e-08,
"loss": 0.323,
"step": 1291
},
{
"epoch": 2.883928571428571,
"grad_norm": 1.096799612045288,
"learning_rate": 8.610164832699608e-08,
"loss": 0.309,
"step": 1292
},
{
"epoch": 2.8861607142857144,
"grad_norm": 1.0528693199157715,
"learning_rate": 8.282642397314356e-08,
"loss": 0.3453,
"step": 1293
},
{
"epoch": 2.888392857142857,
"grad_norm": 1.0400652885437012,
"learning_rate": 7.96144476005689e-08,
"loss": 0.2666,
"step": 1294
},
{
"epoch": 2.890625,
"grad_norm": 0.9061072468757629,
"learning_rate": 7.646573969404159e-08,
"loss": 0.2714,
"step": 1295
},
{
"epoch": 2.892857142857143,
"grad_norm": 1.0487802028656006,
"learning_rate": 7.338032033482712e-08,
"loss": 0.2844,
"step": 1296
},
{
"epoch": 2.8950892857142856,
"grad_norm": 0.9911705255508423,
"learning_rate": 7.035820920056724e-08,
"loss": 0.2749,
"step": 1297
},
{
"epoch": 2.897321428571429,
"grad_norm": 1.3166087865829468,
"learning_rate": 6.73994255651389e-08,
"loss": 0.3114,
"step": 1298
},
{
"epoch": 2.899553571428571,
"grad_norm": 1.0869697332382202,
"learning_rate": 6.450398829854764e-08,
"loss": 0.2953,
"step": 1299
},
{
"epoch": 2.9017857142857144,
"grad_norm": 0.9904835224151611,
"learning_rate": 6.167191586679556e-08,
"loss": 0.2908,
"step": 1300
},
{
"epoch": 2.904017857142857,
"grad_norm": 1.242396354675293,
"learning_rate": 5.890322633177126e-08,
"loss": 0.312,
"step": 1301
},
{
"epoch": 2.90625,
"grad_norm": 1.274328589439392,
"learning_rate": 5.6197937351125664e-08,
"loss": 0.2899,
"step": 1302
},
{
"epoch": 2.908482142857143,
"grad_norm": 0.8983531594276428,
"learning_rate": 5.355606617817089e-08,
"loss": 0.2409,
"step": 1303
},
{
"epoch": 2.9107142857142856,
"grad_norm": 1.077206015586853,
"learning_rate": 5.097762966176256e-08,
"loss": 0.2714,
"step": 1304
},
{
"epoch": 2.912946428571429,
"grad_norm": 0.9723058342933655,
"learning_rate": 4.846264424619218e-08,
"loss": 0.2529,
"step": 1305
},
{
"epoch": 2.915178571428571,
"grad_norm": 0.922687828540802,
"learning_rate": 4.6011125971084924e-08,
"loss": 0.2839,
"step": 1306
},
{
"epoch": 2.9174107142857144,
"grad_norm": 1.1098482608795166,
"learning_rate": 4.3623090471296426e-08,
"loss": 0.2432,
"step": 1307
},
{
"epoch": 2.919642857142857,
"grad_norm": 1.146208643913269,
"learning_rate": 4.129855297681618e-08,
"loss": 0.289,
"step": 1308
},
{
"epoch": 2.921875,
"grad_norm": 1.244287371635437,
"learning_rate": 3.903752831266205e-08,
"loss": 0.2719,
"step": 1309
},
{
"epoch": 2.924107142857143,
"grad_norm": 1.0499545335769653,
"learning_rate": 3.684003089879484e-08,
"loss": 0.3146,
"step": 1310
},
{
"epoch": 2.9263392857142856,
"grad_norm": 1.0027427673339844,
"learning_rate": 3.4706074750022744e-08,
"loss": 0.3214,
"step": 1311
},
{
"epoch": 2.928571428571429,
"grad_norm": 1.0911877155303955,
"learning_rate": 3.2635673475910345e-08,
"loss": 0.2908,
"step": 1312
},
{
"epoch": 2.930803571428571,
"grad_norm": 1.0470216274261475,
"learning_rate": 3.062884028069313e-08,
"loss": 0.3333,
"step": 1313
},
{
"epoch": 2.9330357142857144,
"grad_norm": 1.1500890254974365,
"learning_rate": 2.8685587963194206e-08,
"loss": 0.3176,
"step": 1314
},
{
"epoch": 2.935267857142857,
"grad_norm": 1.0063806772232056,
"learning_rate": 2.6805928916742163e-08,
"loss": 0.2551,
"step": 1315
},
{
"epoch": 2.9375,
"grad_norm": 1.0910873413085938,
"learning_rate": 2.4989875129091124e-08,
"loss": 0.2847,
"step": 1316
},
{
"epoch": 2.939732142857143,
"grad_norm": 0.9625343084335327,
"learning_rate": 2.323743818234414e-08,
"loss": 0.2416,
"step": 1317
},
{
"epoch": 2.9419642857142856,
"grad_norm": 0.9664291739463806,
"learning_rate": 2.154862925288326e-08,
"loss": 0.3257,
"step": 1318
},
{
"epoch": 2.944196428571429,
"grad_norm": 1.1130931377410889,
"learning_rate": 1.9923459111290676e-08,
"loss": 0.3248,
"step": 1319
},
{
"epoch": 2.946428571428571,
"grad_norm": 1.1273753643035889,
"learning_rate": 1.8361938122287704e-08,
"loss": 0.3255,
"step": 1320
},
{
"epoch": 2.9486607142857144,
"grad_norm": 1.1616261005401611,
"learning_rate": 1.6864076244663686e-08,
"loss": 0.3834,
"step": 1321
},
{
"epoch": 2.950892857142857,
"grad_norm": 0.9705925583839417,
"learning_rate": 1.5429883031217173e-08,
"loss": 0.3499,
"step": 1322
},
{
"epoch": 2.953125,
"grad_norm": 1.1763215065002441,
"learning_rate": 1.4059367628687094e-08,
"loss": 0.3263,
"step": 1323
},
{
"epoch": 2.955357142857143,
"grad_norm": 1.0161902904510498,
"learning_rate": 1.2752538777704993e-08,
"loss": 0.2873,
"step": 1324
},
{
"epoch": 2.9575892857142856,
"grad_norm": 1.0196352005004883,
"learning_rate": 1.1509404812728443e-08,
"loss": 0.31,
"step": 1325
},
{
"epoch": 2.959821428571429,
"grad_norm": 1.0153658390045166,
"learning_rate": 1.0329973661996617e-08,
"loss": 0.3055,
"step": 1326
},
{
"epoch": 2.962053571428571,
"grad_norm": 1.0681740045547485,
"learning_rate": 9.214252847475902e-09,
"loss": 0.2633,
"step": 1327
},
{
"epoch": 2.9642857142857144,
"grad_norm": 1.227658987045288,
"learning_rate": 8.162249484809926e-09,
"loss": 0.3204,
"step": 1328
},
{
"epoch": 2.966517857142857,
"grad_norm": 1.0228925943374634,
"learning_rate": 7.173970283279597e-09,
"loss": 0.2675,
"step": 1329
},
{
"epoch": 2.96875,
"grad_norm": 1.1837745904922485,
"learning_rate": 6.249421545755363e-09,
"loss": 0.3337,
"step": 1330
},
{
"epoch": 2.970982142857143,
"grad_norm": 1.1520233154296875,
"learning_rate": 5.388609168659465e-09,
"loss": 0.3157,
"step": 1331
},
{
"epoch": 2.9732142857142856,
"grad_norm": 1.3604669570922852,
"learning_rate": 4.591538641927074e-09,
"loss": 0.3322,
"step": 1332
},
{
"epoch": 2.975446428571429,
"grad_norm": 1.0328572988510132,
"learning_rate": 3.858215048972991e-09,
"loss": 0.3262,
"step": 1333
},
{
"epoch": 2.977678571428571,
"grad_norm": 1.1347808837890625,
"learning_rate": 3.1886430666561163e-09,
"loss": 0.329,
"step": 1334
},
{
"epoch": 2.9799107142857144,
"grad_norm": 1.0527995824813843,
"learning_rate": 2.5828269652561355e-09,
"loss": 0.2758,
"step": 1335
},
{
"epoch": 2.982142857142857,
"grad_norm": 1.057120680809021,
"learning_rate": 2.0407706084368816e-09,
"loss": 0.2689,
"step": 1336
},
{
"epoch": 2.984375,
"grad_norm": 0.9978762269020081,
"learning_rate": 1.5624774532285726e-09,
"loss": 0.2841,
"step": 1337
},
{
"epoch": 2.986607142857143,
"grad_norm": 1.1930631399154663,
"learning_rate": 1.1479505500044952e-09,
"loss": 0.2659,
"step": 1338
},
{
"epoch": 2.9888392857142856,
"grad_norm": 0.9426537752151489,
"learning_rate": 7.971925424621329e-10,
"loss": 0.2822,
"step": 1339
},
{
"epoch": 2.991071428571429,
"grad_norm": 1.2255107164382935,
"learning_rate": 5.102056675998501e-10,
"loss": 0.3154,
"step": 1340
},
{
"epoch": 2.993303571428571,
"grad_norm": 1.0295275449752808,
"learning_rate": 2.8699175571467177e-10,
"loss": 0.2938,
"step": 1341
},
{
"epoch": 2.9955357142857144,
"grad_norm": 1.188139796257019,
"learning_rate": 1.2755223037896892e-10,
"loss": 0.3389,
"step": 1342
},
{
"epoch": 2.997767857142857,
"grad_norm": 1.121012568473816,
"learning_rate": 3.1888108437128085e-11,
"loss": 0.3364,
"step": 1343
},
{
"epoch": 3.0,
"grad_norm": 0.9530598521232605,
"learning_rate": 0.0,
"loss": 0.3106,
"step": 1344
},
{
"epoch": 3.0,
"eval_loss": 0.6524144411087036,
"eval_runtime": 25.7885,
"eval_samples_per_second": 2.831,
"eval_steps_per_second": 0.388,
"step": 1344
}
],
"logging_steps": 1,
"max_steps": 1344,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 224,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.076426689825997e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}