{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 4.403661251068115, "learning_rate": 5e-06, "loss": 3.5443, "step": 1 }, { "epoch": 0.008, "grad_norm": 3.535775661468506, "learning_rate": 1e-05, "loss": 1.8447, "step": 2 }, { "epoch": 0.012, "grad_norm": 2.976018190383911, "learning_rate": 1.5e-05, "loss": 1.7894, "step": 3 }, { "epoch": 0.016, "grad_norm": 3.0810327529907227, "learning_rate": 2e-05, "loss": 2.9688, "step": 4 }, { "epoch": 0.02, "grad_norm": 2.9574649333953857, "learning_rate": 2.5e-05, "loss": 1.6972, "step": 5 }, { "epoch": 0.024, "grad_norm": 2.9108431339263916, "learning_rate": 3e-05, "loss": 1.6871, "step": 6 }, { "epoch": 0.028, "grad_norm": 2.5679922103881836, "learning_rate": 3.5e-05, "loss": 1.6313, "step": 7 }, { "epoch": 0.032, "grad_norm": 2.8970062732696533, "learning_rate": 4e-05, "loss": 1.5489, "step": 8 }, { "epoch": 0.036, "grad_norm": 2.5108256340026855, "learning_rate": 4.5e-05, "loss": 1.3924, "step": 9 }, { "epoch": 0.04, "grad_norm": 1.4062409400939941, "learning_rate": 5e-05, "loss": 1.4039, "step": 10 }, { "epoch": 0.044, "grad_norm": 1.4537838697433472, "learning_rate": 4.9989979959919844e-05, "loss": 1.3405, "step": 11 }, { "epoch": 0.048, "grad_norm": 1.1395865678787231, "learning_rate": 4.997995991983968e-05, "loss": 1.2754, "step": 12 }, { "epoch": 0.052, "grad_norm": 1.3618974685668945, "learning_rate": 4.996993987975952e-05, "loss": 1.2058, "step": 13 }, { "epoch": 0.056, "grad_norm": 1.0007574558258057, "learning_rate": 4.995991983967936e-05, "loss": 1.1149, "step": 14 }, { "epoch": 0.06, "grad_norm": 1.0973924398422241, "learning_rate": 4.99498997995992e-05, "loss": 1.0393, "step": 15 }, { "epoch": 0.064, "grad_norm": 0.9514708518981934, "learning_rate": 4.993987975951904e-05, "loss": 1.1058, "step": 16 }, { "epoch": 
0.068, "grad_norm": 0.8538547158241272, "learning_rate": 4.9929859719438885e-05, "loss": 1.0185, "step": 17 }, { "epoch": 0.072, "grad_norm": 0.7519181966781616, "learning_rate": 4.991983967935872e-05, "loss": 0.9962, "step": 18 }, { "epoch": 0.076, "grad_norm": 0.9052465558052063, "learning_rate": 4.990981963927856e-05, "loss": 1.034, "step": 19 }, { "epoch": 0.08, "grad_norm": 0.9109005928039551, "learning_rate": 4.98997995991984e-05, "loss": 1.0563, "step": 20 }, { "epoch": 0.084, "grad_norm": 1.0023589134216309, "learning_rate": 4.9889779559118236e-05, "loss": 0.8603, "step": 21 }, { "epoch": 0.088, "grad_norm": 0.8098154664039612, "learning_rate": 4.987975951903808e-05, "loss": 0.9769, "step": 22 }, { "epoch": 0.092, "grad_norm": 0.7375625967979431, "learning_rate": 4.986973947895792e-05, "loss": 0.8612, "step": 23 }, { "epoch": 0.096, "grad_norm": 0.6169605851173401, "learning_rate": 4.985971943887775e-05, "loss": 0.8061, "step": 24 }, { "epoch": 0.1, "grad_norm": 0.5663151144981384, "learning_rate": 4.98496993987976e-05, "loss": 0.7382, "step": 25 }, { "epoch": 0.104, "grad_norm": 0.5792577266693115, "learning_rate": 4.983967935871744e-05, "loss": 0.8212, "step": 26 }, { "epoch": 0.108, "grad_norm": 0.5520902276039124, "learning_rate": 4.982965931863728e-05, "loss": 0.7566, "step": 27 }, { "epoch": 0.112, "grad_norm": 0.6001718044281006, "learning_rate": 4.981963927855712e-05, "loss": 0.7855, "step": 28 }, { "epoch": 0.116, "grad_norm": 0.9115943312644958, "learning_rate": 4.980961923847696e-05, "loss": 0.7621, "step": 29 }, { "epoch": 0.12, "grad_norm": 0.47501370310783386, "learning_rate": 4.9799599198396794e-05, "loss": 0.6687, "step": 30 }, { "epoch": 0.124, "grad_norm": 0.48663514852523804, "learning_rate": 4.9789579158316635e-05, "loss": 0.7291, "step": 31 }, { "epoch": 0.128, "grad_norm": 0.49639806151390076, "learning_rate": 4.977955911823648e-05, "loss": 0.7049, "step": 32 }, { "epoch": 0.132, "grad_norm": 0.49986907839775085, "learning_rate": 
4.976953907815631e-05, "loss": 0.7347, "step": 33 }, { "epoch": 0.136, "grad_norm": 0.5302708745002747, "learning_rate": 4.975951903807615e-05, "loss": 0.771, "step": 34 }, { "epoch": 0.14, "grad_norm": 0.5710265040397644, "learning_rate": 4.9749498997995994e-05, "loss": 0.6717, "step": 35 }, { "epoch": 0.144, "grad_norm": 0.4576522707939148, "learning_rate": 4.9739478957915835e-05, "loss": 0.6038, "step": 36 }, { "epoch": 0.148, "grad_norm": 0.509702205657959, "learning_rate": 4.9729458917835676e-05, "loss": 0.681, "step": 37 }, { "epoch": 0.152, "grad_norm": 0.5716866254806519, "learning_rate": 4.971943887775551e-05, "loss": 0.7039, "step": 38 }, { "epoch": 0.156, "grad_norm": 0.4261268079280853, "learning_rate": 4.970941883767535e-05, "loss": 0.6841, "step": 39 }, { "epoch": 0.16, "grad_norm": 0.5015498399734497, "learning_rate": 4.9699398797595193e-05, "loss": 0.6062, "step": 40 }, { "epoch": 0.164, "grad_norm": 0.4798198640346527, "learning_rate": 4.9689378757515035e-05, "loss": 0.6174, "step": 41 }, { "epoch": 0.168, "grad_norm": 0.47841379046440125, "learning_rate": 4.967935871743487e-05, "loss": 0.71, "step": 42 }, { "epoch": 0.172, "grad_norm": 0.38819581270217896, "learning_rate": 4.966933867735471e-05, "loss": 0.6236, "step": 43 }, { "epoch": 0.176, "grad_norm": 0.3802018165588379, "learning_rate": 4.965931863727455e-05, "loss": 0.5662, "step": 44 }, { "epoch": 0.18, "grad_norm": 0.4659491777420044, "learning_rate": 4.964929859719439e-05, "loss": 0.6553, "step": 45 }, { "epoch": 0.184, "grad_norm": 0.4309292733669281, "learning_rate": 4.9639278557114234e-05, "loss": 0.6317, "step": 46 }, { "epoch": 0.188, "grad_norm": 0.36458510160446167, "learning_rate": 4.962925851703407e-05, "loss": 0.5853, "step": 47 }, { "epoch": 0.192, "grad_norm": 0.3789045214653015, "learning_rate": 4.961923847695391e-05, "loss": 0.6565, "step": 48 }, { "epoch": 0.196, "grad_norm": 0.43124228715896606, "learning_rate": 4.960921843687375e-05, "loss": 0.6484, "step": 49 }, { 
"epoch": 0.2, "grad_norm": null, "learning_rate": 4.960921843687375e-05, "loss": 3.1127, "step": 50 }, { "epoch": 0.204, "grad_norm": 0.40404313802719116, "learning_rate": 4.9599198396793586e-05, "loss": 0.6541, "step": 51 }, { "epoch": 0.208, "grad_norm": 0.329449862241745, "learning_rate": 4.958917835671343e-05, "loss": 0.5814, "step": 52 }, { "epoch": 0.212, "grad_norm": 0.3724288046360016, "learning_rate": 4.957915831663327e-05, "loss": 0.6095, "step": 53 }, { "epoch": 0.216, "grad_norm": 0.3768303096294403, "learning_rate": 4.956913827655311e-05, "loss": 0.6467, "step": 54 }, { "epoch": 0.22, "grad_norm": 0.3847925066947937, "learning_rate": 4.9559118236472944e-05, "loss": 0.6584, "step": 55 }, { "epoch": 0.224, "grad_norm": 0.38740023970603943, "learning_rate": 4.954909819639279e-05, "loss": 0.6108, "step": 56 }, { "epoch": 0.228, "grad_norm": 0.4133378863334656, "learning_rate": 4.953907815631263e-05, "loss": 0.569, "step": 57 }, { "epoch": 0.232, "grad_norm": 0.3934808373451233, "learning_rate": 4.952905811623247e-05, "loss": 0.5979, "step": 58 }, { "epoch": 0.236, "grad_norm": 0.39415234327316284, "learning_rate": 4.951903807615231e-05, "loss": 0.5732, "step": 59 }, { "epoch": 0.24, "grad_norm": 0.41425755620002747, "learning_rate": 4.9509018036072144e-05, "loss": 0.6131, "step": 60 }, { "epoch": 0.244, "grad_norm": 0.3662201166152954, "learning_rate": 4.9498997995991985e-05, "loss": 0.582, "step": 61 }, { "epoch": 0.248, "grad_norm": 0.6343560218811035, "learning_rate": 4.9488977955911826e-05, "loss": 0.7075, "step": 62 }, { "epoch": 0.252, "grad_norm": 0.362642765045166, "learning_rate": 4.947895791583166e-05, "loss": 0.5534, "step": 63 }, { "epoch": 0.256, "grad_norm": 0.3604782819747925, "learning_rate": 4.94689378757515e-05, "loss": 0.5873, "step": 64 }, { "epoch": 0.26, "grad_norm": 0.3327144384384155, "learning_rate": 4.9458917835671344e-05, "loss": 0.5309, "step": 65 }, { "epoch": 0.264, "grad_norm": 0.35557445883750916, "learning_rate": 
4.9448897795591185e-05, "loss": 0.5307, "step": 66 }, { "epoch": 0.268, "grad_norm": 0.5837879180908203, "learning_rate": 4.9438877755511026e-05, "loss": 0.6416, "step": 67 }, { "epoch": 0.272, "grad_norm": 0.7987128496170044, "learning_rate": 4.942885771543087e-05, "loss": 0.5083, "step": 68 }, { "epoch": 0.276, "grad_norm": 0.3976365923881531, "learning_rate": 4.94188376753507e-05, "loss": 0.6014, "step": 69 }, { "epoch": 0.28, "grad_norm": 0.3430960774421692, "learning_rate": 4.940881763527054e-05, "loss": 0.5841, "step": 70 }, { "epoch": 0.284, "grad_norm": 0.3691798746585846, "learning_rate": 4.9398797595190384e-05, "loss": 0.5346, "step": 71 }, { "epoch": 0.288, "grad_norm": 0.3781915307044983, "learning_rate": 4.938877755511022e-05, "loss": 0.5438, "step": 72 }, { "epoch": 0.292, "grad_norm": 0.43587324023246765, "learning_rate": 4.937875751503006e-05, "loss": 0.5575, "step": 73 }, { "epoch": 0.296, "grad_norm": 0.5425245761871338, "learning_rate": 4.93687374749499e-05, "loss": 0.5755, "step": 74 }, { "epoch": 0.3, "grad_norm": 0.3869353234767914, "learning_rate": 4.935871743486974e-05, "loss": 0.5267, "step": 75 }, { "epoch": 0.304, "grad_norm": 1.0698070526123047, "learning_rate": 4.9348697394789584e-05, "loss": 0.5967, "step": 76 }, { "epoch": 0.308, "grad_norm": 0.3409326672554016, "learning_rate": 4.9338677354709425e-05, "loss": 0.5764, "step": 77 }, { "epoch": 0.312, "grad_norm": 0.42487403750419617, "learning_rate": 4.932865731462926e-05, "loss": 0.5788, "step": 78 }, { "epoch": 0.316, "grad_norm": 0.355347216129303, "learning_rate": 4.93186372745491e-05, "loss": 0.5232, "step": 79 }, { "epoch": 0.32, "grad_norm": 0.3655643165111542, "learning_rate": 4.930861723446894e-05, "loss": 0.5693, "step": 80 }, { "epoch": 0.324, "grad_norm": 0.40408679842948914, "learning_rate": 4.929859719438878e-05, "loss": 0.5206, "step": 81 }, { "epoch": 0.328, "grad_norm": 0.358632355928421, "learning_rate": 4.928857715430862e-05, "loss": 0.5344, "step": 82 }, { "epoch": 
0.332, "grad_norm": 0.3510683476924896, "learning_rate": 4.927855711422846e-05, "loss": 0.5673, "step": 83 }, { "epoch": 0.336, "grad_norm": 0.3240058422088623, "learning_rate": 4.9268537074148294e-05, "loss": 0.5064, "step": 84 }, { "epoch": 0.34, "grad_norm": 0.39716836810112, "learning_rate": 4.925851703406814e-05, "loss": 0.4974, "step": 85 }, { "epoch": 0.344, "grad_norm": 0.3803369998931885, "learning_rate": 4.9248496993987983e-05, "loss": 0.5433, "step": 86 }, { "epoch": 0.348, "grad_norm": 0.345559298992157, "learning_rate": 4.923847695390782e-05, "loss": 0.5194, "step": 87 }, { "epoch": 0.352, "grad_norm": 0.3797976076602936, "learning_rate": 4.922845691382766e-05, "loss": 0.5668, "step": 88 }, { "epoch": 0.356, "grad_norm": 0.4122374653816223, "learning_rate": 4.92184368737475e-05, "loss": 0.5709, "step": 89 }, { "epoch": 0.36, "grad_norm": 0.3925560414791107, "learning_rate": 4.9208416833667335e-05, "loss": 0.5528, "step": 90 }, { "epoch": 0.364, "grad_norm": 0.35586780309677124, "learning_rate": 4.9198396793587176e-05, "loss": 0.5123, "step": 91 }, { "epoch": 0.368, "grad_norm": 0.5167198181152344, "learning_rate": 4.918837675350702e-05, "loss": 0.5667, "step": 92 }, { "epoch": 0.372, "grad_norm": 0.6427658200263977, "learning_rate": 4.917835671342685e-05, "loss": 0.5046, "step": 93 }, { "epoch": 0.376, "grad_norm": 0.3925032615661621, "learning_rate": 4.916833667334669e-05, "loss": 0.5729, "step": 94 }, { "epoch": 0.38, "grad_norm": 0.4006875157356262, "learning_rate": 4.9158316633266535e-05, "loss": 0.5932, "step": 95 }, { "epoch": 0.384, "grad_norm": 0.3980340361595154, "learning_rate": 4.9148296593186376e-05, "loss": 0.5277, "step": 96 }, { "epoch": 0.388, "grad_norm": 0.35809895396232605, "learning_rate": 4.913827655310622e-05, "loss": 0.5806, "step": 97 }, { "epoch": 0.392, "grad_norm": 0.3831680417060852, "learning_rate": 4.912825651302606e-05, "loss": 0.5285, "step": 98 }, { "epoch": 0.396, "grad_norm": 8.12513256072998, "learning_rate": 
4.911823647294589e-05, "loss": 1.9624, "step": 99 }, { "epoch": 0.4, "grad_norm": 0.38112860918045044, "learning_rate": 4.9108216432865734e-05, "loss": 0.5226, "step": 100 }, { "epoch": 0.404, "grad_norm": 0.37594351172447205, "learning_rate": 4.9098196392785576e-05, "loss": 0.521, "step": 101 }, { "epoch": 0.408, "grad_norm": 0.35807371139526367, "learning_rate": 4.908817635270541e-05, "loss": 0.574, "step": 102 }, { "epoch": 0.412, "grad_norm": 0.42019104957580566, "learning_rate": 4.907815631262525e-05, "loss": 0.5854, "step": 103 }, { "epoch": 0.416, "grad_norm": 4.040385723114014, "learning_rate": 4.906813627254509e-05, "loss": 2.0486, "step": 104 }, { "epoch": 0.42, "grad_norm": 0.40117359161376953, "learning_rate": 4.9058116232464934e-05, "loss": 0.4933, "step": 105 }, { "epoch": 0.424, "grad_norm": 0.3785370886325836, "learning_rate": 4.9048096192384775e-05, "loss": 0.5409, "step": 106 }, { "epoch": 0.428, "grad_norm": 0.5202479958534241, "learning_rate": 4.903807615230461e-05, "loss": 0.6319, "step": 107 }, { "epoch": 0.432, "grad_norm": 0.3808040916919708, "learning_rate": 4.902805611222445e-05, "loss": 0.5222, "step": 108 }, { "epoch": 0.436, "grad_norm": 0.33765068650245667, "learning_rate": 4.901803607214429e-05, "loss": 0.5191, "step": 109 }, { "epoch": 0.44, "grad_norm": 0.37931010127067566, "learning_rate": 4.9008016032064134e-05, "loss": 0.509, "step": 110 }, { "epoch": 0.444, "grad_norm": 0.46715351939201355, "learning_rate": 4.899799599198397e-05, "loss": 0.5343, "step": 111 }, { "epoch": 0.448, "grad_norm": 0.372823029756546, "learning_rate": 4.898797595190381e-05, "loss": 0.542, "step": 112 }, { "epoch": 0.452, "grad_norm": 0.3744858205318451, "learning_rate": 4.897795591182365e-05, "loss": 0.4599, "step": 113 }, { "epoch": 0.456, "grad_norm": 0.4025891423225403, "learning_rate": 4.8967935871743485e-05, "loss": 0.5455, "step": 114 }, { "epoch": 0.46, "grad_norm": 0.33246585726737976, "learning_rate": 4.895791583166333e-05, "loss": 0.5026, 
"step": 115 }, { "epoch": 0.464, "grad_norm": 0.4333687126636505, "learning_rate": 4.894789579158317e-05, "loss": 0.5372, "step": 116 }, { "epoch": 0.468, "grad_norm": 0.36540961265563965, "learning_rate": 4.893787575150301e-05, "loss": 0.4877, "step": 117 }, { "epoch": 0.472, "grad_norm": 0.39382699131965637, "learning_rate": 4.892785571142285e-05, "loss": 0.4705, "step": 118 }, { "epoch": 0.476, "grad_norm": 0.37407657504081726, "learning_rate": 4.8917835671342685e-05, "loss": 0.5003, "step": 119 }, { "epoch": 0.48, "grad_norm": 0.36774665117263794, "learning_rate": 4.8907815631262526e-05, "loss": 0.5304, "step": 120 }, { "epoch": 0.484, "grad_norm": 0.37522393465042114, "learning_rate": 4.889779559118237e-05, "loss": 0.5437, "step": 121 }, { "epoch": 0.488, "grad_norm": 0.5063856244087219, "learning_rate": 4.88877755511022e-05, "loss": 0.558, "step": 122 }, { "epoch": 0.492, "grad_norm": 5.0222487449646, "learning_rate": 4.887775551102204e-05, "loss": 1.7772, "step": 123 }, { "epoch": 0.496, "grad_norm": 0.4300785958766937, "learning_rate": 4.886773547094189e-05, "loss": 0.5218, "step": 124 }, { "epoch": 0.5, "grad_norm": 0.5471500754356384, "learning_rate": 4.8857715430861726e-05, "loss": 0.5194, "step": 125 }, { "epoch": 0.504, "grad_norm": 0.40047940611839294, "learning_rate": 4.884769539078157e-05, "loss": 0.47, "step": 126 }, { "epoch": 0.508, "grad_norm": 0.39165449142456055, "learning_rate": 4.883767535070141e-05, "loss": 0.5365, "step": 127 }, { "epoch": 0.512, "grad_norm": 0.4630558490753174, "learning_rate": 4.882765531062124e-05, "loss": 0.5676, "step": 128 }, { "epoch": 0.516, "grad_norm": 0.3909466862678528, "learning_rate": 4.8817635270541084e-05, "loss": 0.565, "step": 129 }, { "epoch": 0.52, "grad_norm": 0.49433305859565735, "learning_rate": 4.8807615230460925e-05, "loss": 0.4734, "step": 130 }, { "epoch": 0.524, "grad_norm": 0.40937337279319763, "learning_rate": 4.879759519038076e-05, "loss": 0.4959, "step": 131 }, { "epoch": 0.528, "grad_norm": 
0.4436751902103424, "learning_rate": 4.87875751503006e-05, "loss": 0.5009, "step": 132 }, { "epoch": 0.532, "grad_norm": 0.3952745199203491, "learning_rate": 4.877755511022044e-05, "loss": 0.5088, "step": 133 }, { "epoch": 0.536, "grad_norm": 0.4065265357494354, "learning_rate": 4.8767535070140284e-05, "loss": 0.5368, "step": 134 }, { "epoch": 0.54, "grad_norm": 0.351837694644928, "learning_rate": 4.8757515030060125e-05, "loss": 0.4719, "step": 135 }, { "epoch": 0.544, "grad_norm": 0.38240760564804077, "learning_rate": 4.8747494989979966e-05, "loss": 0.5045, "step": 136 }, { "epoch": 0.548, "grad_norm": 1.0215145349502563, "learning_rate": 4.87374749498998e-05, "loss": 0.4706, "step": 137 }, { "epoch": 0.552, "grad_norm": 0.44947350025177, "learning_rate": 4.872745490981964e-05, "loss": 0.4955, "step": 138 }, { "epoch": 0.556, "grad_norm": 0.4560631811618805, "learning_rate": 4.871743486973948e-05, "loss": 0.4635, "step": 139 }, { "epoch": 0.56, "grad_norm": 0.44674062728881836, "learning_rate": 4.870741482965932e-05, "loss": 0.5357, "step": 140 }, { "epoch": 0.564, "grad_norm": 0.3947732150554657, "learning_rate": 4.869739478957916e-05, "loss": 0.5075, "step": 141 }, { "epoch": 0.568, "grad_norm": 0.42195793986320496, "learning_rate": 4.8687374749499e-05, "loss": 0.5268, "step": 142 }, { "epoch": 0.572, "grad_norm": 0.350239634513855, "learning_rate": 4.8677354709418835e-05, "loss": 0.4759, "step": 143 }, { "epoch": 0.576, "grad_norm": 0.4829655885696411, "learning_rate": 4.866733466933868e-05, "loss": 0.4905, "step": 144 }, { "epoch": 0.58, "grad_norm": 0.37556272745132446, "learning_rate": 4.8657314629258524e-05, "loss": 0.4722, "step": 145 }, { "epoch": 0.584, "grad_norm": 0.3916873335838318, "learning_rate": 4.864729458917836e-05, "loss": 0.4706, "step": 146 }, { "epoch": 0.588, "grad_norm": 0.4552304148674011, "learning_rate": 4.86372745490982e-05, "loss": 0.5148, "step": 147 }, { "epoch": 0.592, "grad_norm": 0.43419304490089417, "learning_rate": 
4.862725450901804e-05, "loss": 0.5318, "step": 148 }, { "epoch": 0.596, "grad_norm": 0.4283067584037781, "learning_rate": 4.8617234468937876e-05, "loss": 0.5103, "step": 149 }, { "epoch": 0.6, "grad_norm": 0.39657995104789734, "learning_rate": 4.860721442885772e-05, "loss": 0.4643, "step": 150 }, { "epoch": 0.604, "grad_norm": 0.43963828682899475, "learning_rate": 4.859719438877756e-05, "loss": 0.5201, "step": 151 }, { "epoch": 0.608, "grad_norm": 0.4126660227775574, "learning_rate": 4.858717434869739e-05, "loss": 0.5069, "step": 152 }, { "epoch": 0.612, "grad_norm": 0.42871612310409546, "learning_rate": 4.8577154308617234e-05, "loss": 0.527, "step": 153 }, { "epoch": 0.616, "grad_norm": 0.44571855664253235, "learning_rate": 4.856713426853708e-05, "loss": 0.5771, "step": 154 }, { "epoch": 0.62, "grad_norm": 0.3940712809562683, "learning_rate": 4.855711422845692e-05, "loss": 0.5059, "step": 155 }, { "epoch": 0.624, "grad_norm": 0.39730727672576904, "learning_rate": 4.854709418837676e-05, "loss": 0.4702, "step": 156 }, { "epoch": 0.628, "grad_norm": 0.4266124367713928, "learning_rate": 4.85370741482966e-05, "loss": 0.4936, "step": 157 }, { "epoch": 0.632, "grad_norm": 0.49682965874671936, "learning_rate": 4.8527054108216434e-05, "loss": 0.5417, "step": 158 }, { "epoch": 0.636, "grad_norm": 0.43528032302856445, "learning_rate": 4.8517034068136275e-05, "loss": 0.5538, "step": 159 }, { "epoch": 0.64, "grad_norm": 0.4789420962333679, "learning_rate": 4.8507014028056116e-05, "loss": 0.4866, "step": 160 }, { "epoch": 0.644, "grad_norm": 0.4257548153400421, "learning_rate": 4.849699398797595e-05, "loss": 0.4858, "step": 161 }, { "epoch": 0.648, "grad_norm": 0.38716921210289, "learning_rate": 4.848697394789579e-05, "loss": 0.4976, "step": 162 }, { "epoch": 0.652, "grad_norm": 10.990571975708008, "learning_rate": 4.8476953907815633e-05, "loss": 2.1143, "step": 163 }, { "epoch": 0.656, "grad_norm": 0.42327937483787537, "learning_rate": 4.8466933867735475e-05, "loss": 0.5064, 
"step": 164 }, { "epoch": 0.66, "grad_norm": 0.35529690980911255, "learning_rate": 4.8456913827655316e-05, "loss": 0.4584, "step": 165 }, { "epoch": 0.664, "grad_norm": 0.39093905687332153, "learning_rate": 4.844689378757515e-05, "loss": 0.4638, "step": 166 }, { "epoch": 0.668, "grad_norm": 0.4272507429122925, "learning_rate": 4.843687374749499e-05, "loss": 0.5097, "step": 167 }, { "epoch": 0.672, "grad_norm": 0.4245583117008209, "learning_rate": 4.842685370741483e-05, "loss": 0.5568, "step": 168 }, { "epoch": 0.676, "grad_norm": 3.0908520221710205, "learning_rate": 4.8416833667334674e-05, "loss": 1.8422, "step": 169 }, { "epoch": 0.68, "grad_norm": 0.4836314916610718, "learning_rate": 4.840681362725451e-05, "loss": 0.5187, "step": 170 }, { "epoch": 0.684, "grad_norm": 0.4002329409122467, "learning_rate": 4.839679358717435e-05, "loss": 0.4804, "step": 171 }, { "epoch": 0.688, "grad_norm": 2.849741220474243, "learning_rate": 4.838677354709419e-05, "loss": 1.5929, "step": 172 }, { "epoch": 0.692, "grad_norm": 0.3997988700866699, "learning_rate": 4.8376753507014026e-05, "loss": 0.4551, "step": 173 }, { "epoch": 0.696, "grad_norm": 0.4090086817741394, "learning_rate": 4.8366733466933874e-05, "loss": 0.5154, "step": 174 }, { "epoch": 0.7, "grad_norm": 0.4361508786678314, "learning_rate": 4.835671342685371e-05, "loss": 0.4584, "step": 175 }, { "epoch": 0.704, "grad_norm": 0.41879937052726746, "learning_rate": 4.834669338677355e-05, "loss": 0.5122, "step": 176 }, { "epoch": 0.708, "grad_norm": 0.43722084164619446, "learning_rate": 4.833667334669339e-05, "loss": 0.503, "step": 177 }, { "epoch": 0.712, "grad_norm": 0.3997744917869568, "learning_rate": 4.8326653306613226e-05, "loss": 0.5121, "step": 178 }, { "epoch": 0.716, "grad_norm": 0.4667453169822693, "learning_rate": 4.831663326653307e-05, "loss": 0.4888, "step": 179 }, { "epoch": 0.72, "grad_norm": 0.47008612751960754, "learning_rate": 4.830661322645291e-05, "loss": 0.5683, "step": 180 }, { "epoch": 0.724, 
"grad_norm": 0.4221080541610718, "learning_rate": 4.829659318637275e-05, "loss": 0.4872, "step": 181 }, { "epoch": 0.728, "grad_norm": 0.4071381688117981, "learning_rate": 4.8286573146292584e-05, "loss": 0.4833, "step": 182 }, { "epoch": 0.732, "grad_norm": 0.427738219499588, "learning_rate": 4.827655310621243e-05, "loss": 0.4781, "step": 183 }, { "epoch": 0.736, "grad_norm": 0.3943522274494171, "learning_rate": 4.8266533066132266e-05, "loss": 0.5011, "step": 184 }, { "epoch": 0.74, "grad_norm": 0.4425494968891144, "learning_rate": 4.825651302605211e-05, "loss": 0.461, "step": 185 }, { "epoch": 0.744, "grad_norm": 0.4339994192123413, "learning_rate": 4.824649298597195e-05, "loss": 0.4756, "step": 186 }, { "epoch": 0.748, "grad_norm": 0.38859328627586365, "learning_rate": 4.8236472945891784e-05, "loss": 0.451, "step": 187 }, { "epoch": 0.752, "grad_norm": 0.4013572931289673, "learning_rate": 4.8226452905811625e-05, "loss": 0.5298, "step": 188 }, { "epoch": 0.756, "grad_norm": 9.899824142456055, "learning_rate": 4.8216432865731466e-05, "loss": 1.542, "step": 189 }, { "epoch": 0.76, "grad_norm": 0.41241151094436646, "learning_rate": 4.82064128256513e-05, "loss": 0.5358, "step": 190 }, { "epoch": 0.764, "grad_norm": 0.408150315284729, "learning_rate": 4.819639278557114e-05, "loss": 0.4647, "step": 191 }, { "epoch": 0.768, "grad_norm": 0.40550801157951355, "learning_rate": 4.818637274549098e-05, "loss": 0.4881, "step": 192 }, { "epoch": 0.772, "grad_norm": 0.35784292221069336, "learning_rate": 4.8176352705410824e-05, "loss": 0.4493, "step": 193 }, { "epoch": 0.776, "grad_norm": 0.3900321424007416, "learning_rate": 4.8166332665330666e-05, "loss": 0.4015, "step": 194 }, { "epoch": 0.78, "grad_norm": 0.38830649852752686, "learning_rate": 4.815631262525051e-05, "loss": 0.4702, "step": 195 }, { "epoch": 0.784, "grad_norm": 0.4276293218135834, "learning_rate": 4.814629258517034e-05, "loss": 0.4875, "step": 196 }, { "epoch": 0.788, "grad_norm": 0.3857548236846924, 
"learning_rate": 4.813627254509018e-05, "loss": 0.4491, "step": 197 }, { "epoch": 0.792, "grad_norm": 0.4131658375263214, "learning_rate": 4.8126252505010024e-05, "loss": 0.495, "step": 198 }, { "epoch": 0.796, "grad_norm": 0.41396304965019226, "learning_rate": 4.811623246492986e-05, "loss": 0.5112, "step": 199 }, { "epoch": 0.8, "grad_norm": 0.4702424108982086, "learning_rate": 4.81062124248497e-05, "loss": 0.548, "step": 200 }, { "epoch": 0.804, "grad_norm": 0.41151168942451477, "learning_rate": 4.809619238476954e-05, "loss": 0.4484, "step": 201 }, { "epoch": 0.808, "grad_norm": 0.4377236068248749, "learning_rate": 4.8086172344689376e-05, "loss": 0.4626, "step": 202 }, { "epoch": 0.812, "grad_norm": 0.47114676237106323, "learning_rate": 4.8076152304609224e-05, "loss": 0.505, "step": 203 }, { "epoch": 0.816, "grad_norm": 0.44927552342414856, "learning_rate": 4.8066132264529065e-05, "loss": 0.4697, "step": 204 }, { "epoch": 0.82, "grad_norm": 0.4451066255569458, "learning_rate": 4.80561122244489e-05, "loss": 0.5453, "step": 205 }, { "epoch": 0.824, "grad_norm": 0.7564288973808289, "learning_rate": 4.804609218436874e-05, "loss": 0.521, "step": 206 }, { "epoch": 0.828, "grad_norm": 0.4182622730731964, "learning_rate": 4.803607214428858e-05, "loss": 0.4806, "step": 207 }, { "epoch": 0.832, "grad_norm": 0.3740067780017853, "learning_rate": 4.8026052104208417e-05, "loss": 0.4551, "step": 208 }, { "epoch": 0.836, "grad_norm": 0.4166138768196106, "learning_rate": 4.801603206412826e-05, "loss": 0.5156, "step": 209 }, { "epoch": 0.84, "grad_norm": 0.4035174548625946, "learning_rate": 4.80060120240481e-05, "loss": 0.4798, "step": 210 }, { "epoch": 0.844, "grad_norm": 0.46257126331329346, "learning_rate": 4.7995991983967934e-05, "loss": 0.527, "step": 211 }, { "epoch": 0.848, "grad_norm": 0.4034459590911865, "learning_rate": 4.7985971943887775e-05, "loss": 0.4595, "step": 212 }, { "epoch": 0.852, "grad_norm": 0.37898552417755127, "learning_rate": 4.797595190380762e-05, 
"loss": 0.4495, "step": 213 }, { "epoch": 0.856, "grad_norm": 0.4459609389305115, "learning_rate": 4.796593186372746e-05, "loss": 0.5279, "step": 214 }, { "epoch": 0.86, "grad_norm": 0.436012864112854, "learning_rate": 4.79559118236473e-05, "loss": 0.5159, "step": 215 }, { "epoch": 0.864, "grad_norm": 0.42498722672462463, "learning_rate": 4.794589178356714e-05, "loss": 0.4828, "step": 216 }, { "epoch": 0.868, "grad_norm": 0.4183887541294098, "learning_rate": 4.7935871743486975e-05, "loss": 0.4886, "step": 217 }, { "epoch": 0.872, "grad_norm": 0.4127841889858246, "learning_rate": 4.7925851703406816e-05, "loss": 0.4601, "step": 218 }, { "epoch": 0.876, "grad_norm": 0.42419275641441345, "learning_rate": 4.791583166332666e-05, "loss": 0.5311, "step": 219 }, { "epoch": 0.88, "grad_norm": 3.3534486293792725, "learning_rate": 4.790581162324649e-05, "loss": 1.7676, "step": 220 }, { "epoch": 0.884, "grad_norm": 0.44655194878578186, "learning_rate": 4.789579158316633e-05, "loss": 0.4378, "step": 221 }, { "epoch": 0.888, "grad_norm": 0.3902299702167511, "learning_rate": 4.7885771543086174e-05, "loss": 0.4638, "step": 222 }, { "epoch": 0.892, "grad_norm": 0.700932502746582, "learning_rate": 4.7875751503006016e-05, "loss": 0.5176, "step": 223 }, { "epoch": 0.896, "grad_norm": 0.4358803927898407, "learning_rate": 4.786573146292586e-05, "loss": 0.4604, "step": 224 }, { "epoch": 0.9, "grad_norm": 0.4845912754535675, "learning_rate": 4.78557114228457e-05, "loss": 0.5406, "step": 225 }, { "epoch": 0.904, "grad_norm": 0.38343381881713867, "learning_rate": 4.784569138276553e-05, "loss": 0.4693, "step": 226 }, { "epoch": 0.908, "grad_norm": 0.48064181208610535, "learning_rate": 4.7835671342685374e-05, "loss": 0.4616, "step": 227 }, { "epoch": 0.912, "grad_norm": 0.5941019654273987, "learning_rate": 4.7825651302605215e-05, "loss": 0.6027, "step": 228 }, { "epoch": 0.916, "grad_norm": 0.4344301223754883, "learning_rate": 4.781563126252505e-05, "loss": 0.4404, "step": 229 }, { "epoch": 
0.92, "grad_norm": 0.4292423129081726, "learning_rate": 4.780561122244489e-05, "loss": 0.5116, "step": 230 }, { "epoch": 0.924, "grad_norm": 6.355248928070068, "learning_rate": 4.779559118236473e-05, "loss": 1.3833, "step": 231 }, { "epoch": 0.928, "grad_norm": 6.541921138763428, "learning_rate": 4.778557114228457e-05, "loss": 1.459, "step": 232 }, { "epoch": 0.932, "grad_norm": 0.42501968145370483, "learning_rate": 4.7775551102204415e-05, "loss": 0.5074, "step": 233 }, { "epoch": 0.936, "grad_norm": 0.4112103581428528, "learning_rate": 4.776553106212425e-05, "loss": 0.4512, "step": 234 }, { "epoch": 0.94, "grad_norm": 0.40065306425094604, "learning_rate": 4.775551102204409e-05, "loss": 0.4646, "step": 235 }, { "epoch": 0.944, "grad_norm": 0.43869850039482117, "learning_rate": 4.774549098196393e-05, "loss": 0.5059, "step": 236 }, { "epoch": 0.948, "grad_norm": 0.4382988214492798, "learning_rate": 4.773547094188377e-05, "loss": 0.4654, "step": 237 }, { "epoch": 0.952, "grad_norm": 0.437419056892395, "learning_rate": 4.772545090180361e-05, "loss": 0.4978, "step": 238 }, { "epoch": 0.956, "grad_norm": 0.3500848412513733, "learning_rate": 4.771543086172345e-05, "loss": 0.4537, "step": 239 }, { "epoch": 0.96, "grad_norm": 0.39310508966445923, "learning_rate": 4.770541082164329e-05, "loss": 0.4443, "step": 240 }, { "epoch": 0.964, "grad_norm": 0.4204409420490265, "learning_rate": 4.7695390781563125e-05, "loss": 0.4745, "step": 241 }, { "epoch": 0.968, "grad_norm": 0.42494168877601624, "learning_rate": 4.768537074148297e-05, "loss": 0.4443, "step": 242 }, { "epoch": 0.972, "grad_norm": 0.406387060880661, "learning_rate": 4.767535070140281e-05, "loss": 0.491, "step": 243 }, { "epoch": 0.976, "grad_norm": 0.4731022119522095, "learning_rate": 4.766533066132265e-05, "loss": 0.5007, "step": 244 }, { "epoch": 0.98, "grad_norm": 0.4447345733642578, "learning_rate": 4.765531062124249e-05, "loss": 0.5206, "step": 245 }, { "epoch": 0.984, "grad_norm": 0.47526809573173523, 
"learning_rate": 4.7645290581162324e-05, "loss": 0.4576, "step": 246 }, { "epoch": 0.988, "grad_norm": 0.43649783730506897, "learning_rate": 4.7635270541082166e-05, "loss": 0.5029, "step": 247 }, { "epoch": 0.992, "grad_norm": 0.4308774173259735, "learning_rate": 4.762525050100201e-05, "loss": 0.4237, "step": 248 }, { "epoch": 0.996, "grad_norm": 0.3801085948944092, "learning_rate": 4.761523046092184e-05, "loss": 0.4387, "step": 249 }, { "epoch": 1.0, "grad_norm": 0.4443725645542145, "learning_rate": 4.760521042084168e-05, "loss": 0.4304, "step": 250 }, { "epoch": 1.004, "grad_norm": 0.3666495680809021, "learning_rate": 4.7595190380761524e-05, "loss": 0.4053, "step": 251 }, { "epoch": 1.008, "grad_norm": 0.4456152617931366, "learning_rate": 4.7585170340681365e-05, "loss": 0.4301, "step": 252 }, { "epoch": 1.012, "grad_norm": 0.4105600118637085, "learning_rate": 4.7575150300601207e-05, "loss": 0.4437, "step": 253 }, { "epoch": 1.016, "grad_norm": 0.42938780784606934, "learning_rate": 4.756513026052105e-05, "loss": 0.4685, "step": 254 }, { "epoch": 1.02, "grad_norm": 0.4617982506752014, "learning_rate": 4.755511022044088e-05, "loss": 0.5046, "step": 255 }, { "epoch": 1.024, "grad_norm": 0.3736300468444824, "learning_rate": 4.7545090180360724e-05, "loss": 0.4222, "step": 256 }, { "epoch": 1.028, "grad_norm": 0.40071138739585876, "learning_rate": 4.7535070140280565e-05, "loss": 0.4113, "step": 257 }, { "epoch": 1.032, "grad_norm": 0.43367400765419006, "learning_rate": 4.75250501002004e-05, "loss": 0.4743, "step": 258 }, { "epoch": 1.036, "grad_norm": 0.5123701691627502, "learning_rate": 4.751503006012024e-05, "loss": 0.4289, "step": 259 }, { "epoch": 1.04, "grad_norm": 0.4922557771205902, "learning_rate": 4.750501002004008e-05, "loss": 0.4976, "step": 260 }, { "epoch": 1.044, "grad_norm": 0.4395969808101654, "learning_rate": 4.7494989979959916e-05, "loss": 0.4344, "step": 261 }, { "epoch": 1.048, "grad_norm": 0.4711666405200958, "learning_rate": 4.7484969939879765e-05, 
"loss": 0.4616, "step": 262 }, { "epoch": 1.052, "grad_norm": 0.4415505528450012, "learning_rate": 4.7474949899799606e-05, "loss": 0.3927, "step": 263 }, { "epoch": 1.056, "grad_norm": 0.47160524129867554, "learning_rate": 4.746492985971944e-05, "loss": 0.4112, "step": 264 }, { "epoch": 1.06, "grad_norm": 0.4496021568775177, "learning_rate": 4.745490981963928e-05, "loss": 0.4275, "step": 265 }, { "epoch": 1.064, "grad_norm": 0.42134368419647217, "learning_rate": 4.744488977955912e-05, "loss": 0.4191, "step": 266 }, { "epoch": 1.068, "grad_norm": 0.4793007969856262, "learning_rate": 4.743486973947896e-05, "loss": 0.4858, "step": 267 }, { "epoch": 1.072, "grad_norm": 0.4894791841506958, "learning_rate": 4.74248496993988e-05, "loss": 0.4462, "step": 268 }, { "epoch": 1.076, "grad_norm": 0.45502549409866333, "learning_rate": 4.741482965931864e-05, "loss": 0.4883, "step": 269 }, { "epoch": 1.08, "grad_norm": 0.4357253313064575, "learning_rate": 4.7404809619238474e-05, "loss": 0.4329, "step": 270 }, { "epoch": 1.084, "grad_norm": 0.4542175233364105, "learning_rate": 4.7394789579158316e-05, "loss": 0.4501, "step": 271 }, { "epoch": 1.088, "grad_norm": 0.4260950982570648, "learning_rate": 4.7384769539078164e-05, "loss": 0.4186, "step": 272 }, { "epoch": 1.092, "grad_norm": 0.5126277804374695, "learning_rate": 4.7374749498998e-05, "loss": 0.4815, "step": 273 }, { "epoch": 1.096, "grad_norm": 0.4585254490375519, "learning_rate": 4.736472945891784e-05, "loss": 0.4461, "step": 274 }, { "epoch": 1.1, "grad_norm": 0.4200245440006256, "learning_rate": 4.735470941883768e-05, "loss": 0.4348, "step": 275 }, { "epoch": 1.104, "grad_norm": 0.4564264118671417, "learning_rate": 4.7344689378757515e-05, "loss": 0.4483, "step": 276 }, { "epoch": 1.108, "grad_norm": 0.38692179322242737, "learning_rate": 4.733466933867736e-05, "loss": 0.4139, "step": 277 }, { "epoch": 1.112, "grad_norm": 0.5220334529876709, "learning_rate": 4.73246492985972e-05, "loss": 0.4688, "step": 278 }, { "epoch": 
1.116, "grad_norm": 0.47125041484832764, "learning_rate": 4.731462925851703e-05, "loss": 0.4965, "step": 279 }, { "epoch": 1.12, "grad_norm": 0.41665714979171753, "learning_rate": 4.7304609218436874e-05, "loss": 0.4569, "step": 280 }, { "epoch": 1.124, "grad_norm": 0.45704329013824463, "learning_rate": 4.7294589178356715e-05, "loss": 0.4441, "step": 281 }, { "epoch": 1.1280000000000001, "grad_norm": 0.5076858401298523, "learning_rate": 4.7284569138276556e-05, "loss": 0.4441, "step": 282 }, { "epoch": 1.1320000000000001, "grad_norm": 0.4565269649028778, "learning_rate": 4.72745490981964e-05, "loss": 0.4001, "step": 283 }, { "epoch": 1.1360000000000001, "grad_norm": 0.4239833652973175, "learning_rate": 4.726452905811624e-05, "loss": 0.3982, "step": 284 }, { "epoch": 1.1400000000000001, "grad_norm": 0.4458796977996826, "learning_rate": 4.725450901803607e-05, "loss": 0.4151, "step": 285 }, { "epoch": 1.144, "grad_norm": 0.49094000458717346, "learning_rate": 4.7244488977955915e-05, "loss": 0.4491, "step": 286 }, { "epoch": 1.148, "grad_norm": 11.27839183807373, "learning_rate": 4.7234468937875756e-05, "loss": 0.934, "step": 287 }, { "epoch": 1.152, "grad_norm": 0.5622511506080627, "learning_rate": 4.722444889779559e-05, "loss": 0.4862, "step": 288 }, { "epoch": 1.156, "grad_norm": 0.5482255220413208, "learning_rate": 4.721442885771543e-05, "loss": 0.4668, "step": 289 }, { "epoch": 1.16, "grad_norm": 7.071150779724121, "learning_rate": 4.720440881763527e-05, "loss": 0.8774, "step": 290 }, { "epoch": 1.164, "grad_norm": 1.3377376794815063, "learning_rate": 4.7194388777555114e-05, "loss": 0.4714, "step": 291 }, { "epoch": 1.168, "grad_norm": 0.5460695028305054, "learning_rate": 4.7184368737474956e-05, "loss": 0.4769, "step": 292 }, { "epoch": 1.172, "grad_norm": 0.4908643364906311, "learning_rate": 4.717434869739479e-05, "loss": 0.4904, "step": 293 }, { "epoch": 1.176, "grad_norm": 0.42432764172554016, "learning_rate": 4.716432865731463e-05, "loss": 0.4213, "step": 294 }, 
{ "epoch": 1.18, "grad_norm": 0.5326458811759949, "learning_rate": 4.715430861723447e-05, "loss": 0.4568, "step": 295 }, { "epoch": 1.184, "grad_norm": 0.4412928521633148, "learning_rate": 4.7144288577154314e-05, "loss": 0.4776, "step": 296 }, { "epoch": 1.188, "grad_norm": 0.4530280828475952, "learning_rate": 4.713426853707415e-05, "loss": 0.4139, "step": 297 }, { "epoch": 1.192, "grad_norm": 0.4728831946849823, "learning_rate": 4.712424849699399e-05, "loss": 0.4586, "step": 298 }, { "epoch": 1.196, "grad_norm": 0.4735589325428009, "learning_rate": 4.711422845691383e-05, "loss": 0.4707, "step": 299 }, { "epoch": 1.2, "grad_norm": 0.5155274271965027, "learning_rate": 4.7104208416833666e-05, "loss": 0.425, "step": 300 }, { "epoch": 1.204, "grad_norm": 0.5296701788902283, "learning_rate": 4.7094188376753514e-05, "loss": 0.4858, "step": 301 }, { "epoch": 1.208, "grad_norm": 0.478127121925354, "learning_rate": 4.708416833667335e-05, "loss": 0.4522, "step": 302 }, { "epoch": 1.212, "grad_norm": 0.4879436194896698, "learning_rate": 4.707414829659319e-05, "loss": 0.45, "step": 303 }, { "epoch": 1.216, "grad_norm": 0.4468042850494385, "learning_rate": 4.706412825651303e-05, "loss": 0.4119, "step": 304 }, { "epoch": 1.22, "grad_norm": 0.47193828225135803, "learning_rate": 4.7054108216432865e-05, "loss": 0.4911, "step": 305 }, { "epoch": 1.224, "grad_norm": 0.46041783690452576, "learning_rate": 4.7044088176352706e-05, "loss": 0.4087, "step": 306 }, { "epoch": 1.228, "grad_norm": 0.5108659267425537, "learning_rate": 4.703406813627255e-05, "loss": 0.4601, "step": 307 }, { "epoch": 1.232, "grad_norm": 0.4466314911842346, "learning_rate": 4.702404809619239e-05, "loss": 0.4084, "step": 308 }, { "epoch": 1.236, "grad_norm": 0.5967437624931335, "learning_rate": 4.7014028056112224e-05, "loss": 0.4609, "step": 309 }, { "epoch": 1.24, "grad_norm": 0.4608320891857147, "learning_rate": 4.7004008016032065e-05, "loss": 0.3945, "step": 310 }, { "epoch": 1.244, "grad_norm": 
0.47840774059295654, "learning_rate": 4.6993987975951906e-05, "loss": 0.4506, "step": 311 }, { "epoch": 1.248, "grad_norm": 0.49409082531929016, "learning_rate": 4.698396793587175e-05, "loss": 0.4387, "step": 312 }, { "epoch": 1.252, "grad_norm": 0.46387237310409546, "learning_rate": 4.697394789579159e-05, "loss": 0.436, "step": 313 }, { "epoch": 1.256, "grad_norm": 1.0678813457489014, "learning_rate": 4.696392785571142e-05, "loss": 0.4139, "step": 314 }, { "epoch": 1.26, "grad_norm": 0.5538118481636047, "learning_rate": 4.6953907815631264e-05, "loss": 0.4666, "step": 315 }, { "epoch": 1.264, "grad_norm": 7.208223819732666, "learning_rate": 4.6943887775551106e-05, "loss": 0.6192, "step": 316 }, { "epoch": 1.268, "grad_norm": 0.483923077583313, "learning_rate": 4.693386773547094e-05, "loss": 0.4103, "step": 317 }, { "epoch": 1.272, "grad_norm": 0.4743936061859131, "learning_rate": 4.692384769539078e-05, "loss": 0.4514, "step": 318 }, { "epoch": 1.276, "grad_norm": 0.5241031646728516, "learning_rate": 4.691382765531062e-05, "loss": 0.4661, "step": 319 }, { "epoch": 1.28, "grad_norm": 0.44694361090660095, "learning_rate": 4.690380761523046e-05, "loss": 0.4173, "step": 320 }, { "epoch": 1.284, "grad_norm": 0.4751509726047516, "learning_rate": 4.6893787575150305e-05, "loss": 0.3872, "step": 321 }, { "epoch": 1.288, "grad_norm": 0.548673152923584, "learning_rate": 4.688376753507015e-05, "loss": 0.474, "step": 322 }, { "epoch": 1.292, "grad_norm": 0.46685653924942017, "learning_rate": 4.687374749498998e-05, "loss": 0.4433, "step": 323 }, { "epoch": 1.296, "grad_norm": 3.28513240814209, "learning_rate": 4.686372745490982e-05, "loss": 0.4851, "step": 324 }, { "epoch": 1.3, "grad_norm": 0.5524126887321472, "learning_rate": 4.6853707414829664e-05, "loss": 0.4318, "step": 325 }, { "epoch": 1.304, "grad_norm": 0.48285192251205444, "learning_rate": 4.68436873747495e-05, "loss": 0.4203, "step": 326 }, { "epoch": 1.308, "grad_norm": 2.8871777057647705, "learning_rate": 
4.683366733466934e-05, "loss": 0.4773, "step": 327 }, { "epoch": 1.312, "grad_norm": 0.5716524720191956, "learning_rate": 4.682364729458918e-05, "loss": 0.4876, "step": 328 }, { "epoch": 1.316, "grad_norm": 0.5107681155204773, "learning_rate": 4.6813627254509015e-05, "loss": 0.4194, "step": 329 }, { "epoch": 1.32, "grad_norm": 0.5048828125, "learning_rate": 4.6803607214428857e-05, "loss": 0.4635, "step": 330 }, { "epoch": 1.324, "grad_norm": 0.4749213755130768, "learning_rate": 4.6793587174348705e-05, "loss": 0.4625, "step": 331 }, { "epoch": 1.328, "grad_norm": 0.5069287419319153, "learning_rate": 4.678356713426854e-05, "loss": 0.3894, "step": 332 }, { "epoch": 1.332, "grad_norm": 0.5243629217147827, "learning_rate": 4.677354709418838e-05, "loss": 0.4179, "step": 333 }, { "epoch": 1.336, "grad_norm": 0.4602661728858948, "learning_rate": 4.676352705410822e-05, "loss": 0.4378, "step": 334 }, { "epoch": 1.34, "grad_norm": 0.591300904750824, "learning_rate": 4.6753507014028056e-05, "loss": 0.4615, "step": 335 }, { "epoch": 1.3439999999999999, "grad_norm": 0.5004926919937134, "learning_rate": 4.67434869739479e-05, "loss": 0.4199, "step": 336 }, { "epoch": 1.3479999999999999, "grad_norm": 0.5834985971450806, "learning_rate": 4.673346693386774e-05, "loss": 0.499, "step": 337 }, { "epoch": 1.3519999999999999, "grad_norm": 0.4432782530784607, "learning_rate": 4.672344689378757e-05, "loss": 0.3882, "step": 338 }, { "epoch": 1.3559999999999999, "grad_norm": 0.5421860814094543, "learning_rate": 4.6713426853707415e-05, "loss": 0.479, "step": 339 }, { "epoch": 1.3599999999999999, "grad_norm": 0.5701932907104492, "learning_rate": 4.6703406813627256e-05, "loss": 0.4337, "step": 340 }, { "epoch": 1.3639999999999999, "grad_norm": 0.5272790193557739, "learning_rate": 4.66933867735471e-05, "loss": 0.4419, "step": 341 }, { "epoch": 1.3679999999999999, "grad_norm": 0.5008136034011841, "learning_rate": 4.668336673346694e-05, "loss": 0.4162, "step": 342 }, { "epoch": 1.3719999999999999, 
"grad_norm": 0.47261759638786316, "learning_rate": 4.667334669338678e-05, "loss": 0.4037, "step": 343 }, { "epoch": 1.376, "grad_norm": 0.49282076954841614, "learning_rate": 4.6663326653306614e-05, "loss": 0.4203, "step": 344 }, { "epoch": 1.38, "grad_norm": 0.5760653614997864, "learning_rate": 4.6653306613226455e-05, "loss": 0.4935, "step": 345 }, { "epoch": 1.384, "grad_norm": 0.4914257824420929, "learning_rate": 4.66432865731463e-05, "loss": 0.3675, "step": 346 }, { "epoch": 1.388, "grad_norm": 0.6651628017425537, "learning_rate": 4.663326653306613e-05, "loss": 0.4484, "step": 347 }, { "epoch": 1.392, "grad_norm": 1.1650779247283936, "learning_rate": 4.662324649298597e-05, "loss": 0.496, "step": 348 }, { "epoch": 1.396, "grad_norm": 0.4959772825241089, "learning_rate": 4.6613226452905814e-05, "loss": 0.3795, "step": 349 }, { "epoch": 1.4, "grad_norm": 11.361075401306152, "learning_rate": 4.6603206412825655e-05, "loss": 0.4368, "step": 350 }, { "epoch": 1.404, "grad_norm": 0.5089369416236877, "learning_rate": 4.6593186372745496e-05, "loss": 0.3721, "step": 351 }, { "epoch": 1.408, "grad_norm": 0.4523348808288574, "learning_rate": 4.658316633266534e-05, "loss": 0.4223, "step": 352 }, { "epoch": 1.412, "grad_norm": 0.5115591287612915, "learning_rate": 4.657314629258517e-05, "loss": 0.4383, "step": 353 }, { "epoch": 1.416, "grad_norm": 0.4922471344470978, "learning_rate": 4.6563126252505013e-05, "loss": 0.4566, "step": 354 }, { "epoch": 1.42, "grad_norm": 0.49494296312332153, "learning_rate": 4.6553106212424855e-05, "loss": 0.4328, "step": 355 }, { "epoch": 1.424, "grad_norm": 0.6130332946777344, "learning_rate": 4.654308617234469e-05, "loss": 0.4885, "step": 356 }, { "epoch": 1.428, "grad_norm": 0.5084357857704163, "learning_rate": 4.653306613226453e-05, "loss": 0.4918, "step": 357 }, { "epoch": 1.432, "grad_norm": 0.5193467140197754, "learning_rate": 4.652304609218437e-05, "loss": 0.4379, "step": 358 }, { "epoch": 1.436, "grad_norm": 0.5419803261756897, 
"learning_rate": 4.6513026052104206e-05, "loss": 0.444, "step": 359 }, { "epoch": 1.44, "grad_norm": 0.5299074649810791, "learning_rate": 4.6503006012024054e-05, "loss": 0.4716, "step": 360 }, { "epoch": 1.444, "grad_norm": 1.1689682006835938, "learning_rate": 4.649298597194389e-05, "loss": 0.2592, "step": 361 }, { "epoch": 1.448, "grad_norm": 0.532464861869812, "learning_rate": 4.648296593186373e-05, "loss": 0.4572, "step": 362 }, { "epoch": 1.452, "grad_norm": 0.4852958023548126, "learning_rate": 4.647294589178357e-05, "loss": 0.4132, "step": 363 }, { "epoch": 1.456, "grad_norm": 0.4912414848804474, "learning_rate": 4.646292585170341e-05, "loss": 0.5046, "step": 364 }, { "epoch": 1.46, "grad_norm": 0.5567641854286194, "learning_rate": 4.645290581162325e-05, "loss": 0.4648, "step": 365 }, { "epoch": 1.464, "grad_norm": 0.5151598453521729, "learning_rate": 4.644288577154309e-05, "loss": 0.4677, "step": 366 }, { "epoch": 1.468, "grad_norm": 0.5214430093765259, "learning_rate": 4.643286573146293e-05, "loss": 0.4396, "step": 367 }, { "epoch": 1.472, "grad_norm": 0.5087822079658508, "learning_rate": 4.6422845691382764e-05, "loss": 0.4185, "step": 368 }, { "epoch": 1.476, "grad_norm": 0.49884939193725586, "learning_rate": 4.6412825651302606e-05, "loss": 0.427, "step": 369 }, { "epoch": 1.48, "grad_norm": 0.5047423243522644, "learning_rate": 4.640280561122245e-05, "loss": 0.4682, "step": 370 }, { "epoch": 1.484, "grad_norm": 0.5239180326461792, "learning_rate": 4.639278557114229e-05, "loss": 0.4701, "step": 371 }, { "epoch": 1.488, "grad_norm": 0.48768556118011475, "learning_rate": 4.638276553106213e-05, "loss": 0.4158, "step": 372 }, { "epoch": 1.492, "grad_norm": 0.5886668562889099, "learning_rate": 4.6372745490981964e-05, "loss": 0.4679, "step": 373 }, { "epoch": 1.496, "grad_norm": 0.5226956009864807, "learning_rate": 4.6362725450901805e-05, "loss": 0.4523, "step": 374 }, { "epoch": 1.5, "grad_norm": 0.5257872939109802, "learning_rate": 4.6352705410821647e-05, 
"loss": 0.4399, "step": 375 }, { "epoch": 1.504, "grad_norm": 0.6213669180870056, "learning_rate": 4.634268537074148e-05, "loss": 0.4535, "step": 376 }, { "epoch": 1.508, "grad_norm": 0.49080657958984375, "learning_rate": 4.633266533066132e-05, "loss": 0.4379, "step": 377 }, { "epoch": 1.512, "grad_norm": 0.6423617005348206, "learning_rate": 4.6322645290581164e-05, "loss": 0.5559, "step": 378 }, { "epoch": 1.516, "grad_norm": 0.6420733332633972, "learning_rate": 4.6312625250501005e-05, "loss": 0.4543, "step": 379 }, { "epoch": 1.52, "grad_norm": 0.5223842859268188, "learning_rate": 4.6302605210420846e-05, "loss": 0.466, "step": 380 }, { "epoch": 1.524, "grad_norm": 0.4663429856300354, "learning_rate": 4.629258517034069e-05, "loss": 0.4153, "step": 381 }, { "epoch": 1.528, "grad_norm": 0.5308319926261902, "learning_rate": 4.628256513026052e-05, "loss": 0.4136, "step": 382 }, { "epoch": 1.532, "grad_norm": 0.48709505796432495, "learning_rate": 4.627254509018036e-05, "loss": 0.4478, "step": 383 }, { "epoch": 1.536, "grad_norm": 0.4852888584136963, "learning_rate": 4.6262525050100205e-05, "loss": 0.3736, "step": 384 }, { "epoch": 1.54, "grad_norm": 0.510391116142273, "learning_rate": 4.625250501002004e-05, "loss": 0.4505, "step": 385 }, { "epoch": 1.544, "grad_norm": 0.48822277784347534, "learning_rate": 4.624248496993988e-05, "loss": 0.384, "step": 386 }, { "epoch": 1.548, "grad_norm": 0.5103605389595032, "learning_rate": 4.623246492985972e-05, "loss": 0.3747, "step": 387 }, { "epoch": 1.552, "grad_norm": 0.5321424603462219, "learning_rate": 4.6222444889779556e-05, "loss": 0.4732, "step": 388 }, { "epoch": 1.556, "grad_norm": 0.5756722092628479, "learning_rate": 4.62124248496994e-05, "loss": 0.4735, "step": 389 }, { "epoch": 1.56, "grad_norm": 0.5002274513244629, "learning_rate": 4.6202404809619245e-05, "loss": 0.4149, "step": 390 }, { "epoch": 1.564, "grad_norm": 0.5540143847465515, "learning_rate": 4.619238476953908e-05, "loss": 0.4388, "step": 391 }, { "epoch": 
1.568, "grad_norm": 0.5765349864959717, "learning_rate": 4.618236472945892e-05, "loss": 0.5177, "step": 392 }, { "epoch": 1.572, "grad_norm": 0.5519753694534302, "learning_rate": 4.617234468937876e-05, "loss": 0.4497, "step": 393 }, { "epoch": 1.576, "grad_norm": 0.5800144672393799, "learning_rate": 4.61623246492986e-05, "loss": 0.448, "step": 394 }, { "epoch": 1.58, "grad_norm": 0.5450941920280457, "learning_rate": 4.615230460921844e-05, "loss": 0.4368, "step": 395 }, { "epoch": 1.584, "grad_norm": 0.50678551197052, "learning_rate": 4.614228456913828e-05, "loss": 0.4496, "step": 396 }, { "epoch": 1.588, "grad_norm": 0.48308175802230835, "learning_rate": 4.6132264529058114e-05, "loss": 0.3646, "step": 397 }, { "epoch": 1.592, "grad_norm": 0.5025294423103333, "learning_rate": 4.6122244488977955e-05, "loss": 0.4331, "step": 398 }, { "epoch": 1.596, "grad_norm": 0.4999145567417145, "learning_rate": 4.61122244488978e-05, "loss": 0.3968, "step": 399 }, { "epoch": 1.6, "grad_norm": 0.5278040170669556, "learning_rate": 4.610220440881764e-05, "loss": 0.3938, "step": 400 }, { "epoch": 1.604, "grad_norm": 0.5218545198440552, "learning_rate": 4.609218436873748e-05, "loss": 0.4465, "step": 401 }, { "epoch": 1.608, "grad_norm": 0.5310875177383423, "learning_rate": 4.608216432865732e-05, "loss": 0.4094, "step": 402 }, { "epoch": 1.612, "grad_norm": 0.47267839312553406, "learning_rate": 4.6072144288577155e-05, "loss": 0.4233, "step": 403 }, { "epoch": 1.616, "grad_norm": 0.9498124122619629, "learning_rate": 4.6062124248496996e-05, "loss": 0.4357, "step": 404 }, { "epoch": 1.62, "grad_norm": 0.46646052598953247, "learning_rate": 4.605210420841684e-05, "loss": 0.4146, "step": 405 }, { "epoch": 1.624, "grad_norm": 1.2247719764709473, "learning_rate": 4.604208416833667e-05, "loss": 0.3475, "step": 406 }, { "epoch": 1.6280000000000001, "grad_norm": 0.5586639642715454, "learning_rate": 4.603206412825651e-05, "loss": 0.444, "step": 407 }, { "epoch": 1.6320000000000001, "grad_norm": 
0.552677571773529, "learning_rate": 4.6022044088176355e-05, "loss": 0.4234, "step": 408 }, { "epoch": 1.6360000000000001, "grad_norm": 0.5379041433334351, "learning_rate": 4.6012024048096196e-05, "loss": 0.4755, "step": 409 }, { "epoch": 1.6400000000000001, "grad_norm": 0.5361511707305908, "learning_rate": 4.600200400801604e-05, "loss": 0.3844, "step": 410 }, { "epoch": 1.6440000000000001, "grad_norm": 0.5484291315078735, "learning_rate": 4.599198396793588e-05, "loss": 0.4237, "step": 411 }, { "epoch": 1.6480000000000001, "grad_norm": 0.47265616059303284, "learning_rate": 4.598196392785571e-05, "loss": 0.4055, "step": 412 }, { "epoch": 1.6520000000000001, "grad_norm": 0.586786150932312, "learning_rate": 4.5971943887775554e-05, "loss": 0.5109, "step": 413 }, { "epoch": 1.6560000000000001, "grad_norm": 0.4930340051651001, "learning_rate": 4.5961923847695396e-05, "loss": 0.3967, "step": 414 }, { "epoch": 1.6600000000000001, "grad_norm": 0.5312355160713196, "learning_rate": 4.595190380761523e-05, "loss": 0.4881, "step": 415 }, { "epoch": 1.6640000000000001, "grad_norm": 0.5358534455299377, "learning_rate": 4.594188376753507e-05, "loss": 0.4785, "step": 416 }, { "epoch": 1.6680000000000001, "grad_norm": 0.5466772317886353, "learning_rate": 4.593186372745491e-05, "loss": 0.4695, "step": 417 }, { "epoch": 1.6720000000000002, "grad_norm": 0.5252442359924316, "learning_rate": 4.592184368737475e-05, "loss": 0.4647, "step": 418 }, { "epoch": 1.6760000000000002, "grad_norm": 0.6336548924446106, "learning_rate": 4.5911823647294595e-05, "loss": 0.4535, "step": 419 }, { "epoch": 1.6800000000000002, "grad_norm": 0.5073897838592529, "learning_rate": 4.590180360721443e-05, "loss": 0.4622, "step": 420 }, { "epoch": 1.6840000000000002, "grad_norm": 0.5029900074005127, "learning_rate": 4.589178356713427e-05, "loss": 0.4221, "step": 421 }, { "epoch": 1.688, "grad_norm": 0.590262234210968, "learning_rate": 4.588176352705411e-05, "loss": 0.4097, "step": 422 }, { "epoch": 1.692, 
"grad_norm": 0.5236296653747559, "learning_rate": 4.5871743486973954e-05, "loss": 0.4445, "step": 423 }, { "epoch": 1.696, "grad_norm": 0.5591893792152405, "learning_rate": 4.586172344689379e-05, "loss": 0.4016, "step": 424 }, { "epoch": 1.7, "grad_norm": 0.48660945892333984, "learning_rate": 4.585170340681363e-05, "loss": 0.3968, "step": 425 }, { "epoch": 1.704, "grad_norm": 1.2732222080230713, "learning_rate": 4.584168336673347e-05, "loss": 0.2928, "step": 426 }, { "epoch": 1.708, "grad_norm": 0.5137063264846802, "learning_rate": 4.5831663326653305e-05, "loss": 0.4274, "step": 427 }, { "epoch": 1.712, "grad_norm": 0.532532811164856, "learning_rate": 4.5821643286573146e-05, "loss": 0.443, "step": 428 }, { "epoch": 1.716, "grad_norm": 0.7271252870559692, "learning_rate": 4.581162324649299e-05, "loss": 0.5061, "step": 429 }, { "epoch": 1.72, "grad_norm": 0.49782320857048035, "learning_rate": 4.580160320641283e-05, "loss": 0.4603, "step": 430 }, { "epoch": 1.724, "grad_norm": 0.49443551898002625, "learning_rate": 4.579158316633267e-05, "loss": 0.3841, "step": 431 }, { "epoch": 1.728, "grad_norm": 0.5045324563980103, "learning_rate": 4.5781563126252505e-05, "loss": 0.3932, "step": 432 }, { "epoch": 1.732, "grad_norm": 0.5934986472129822, "learning_rate": 4.5771543086172346e-05, "loss": 0.5067, "step": 433 }, { "epoch": 1.736, "grad_norm": 0.5429759621620178, "learning_rate": 4.576152304609219e-05, "loss": 0.3985, "step": 434 }, { "epoch": 1.74, "grad_norm": 0.6646963357925415, "learning_rate": 4.575150300601203e-05, "loss": 0.4736, "step": 435 }, { "epoch": 1.744, "grad_norm": 0.5723690986633301, "learning_rate": 4.574148296593186e-05, "loss": 0.4137, "step": 436 }, { "epoch": 1.748, "grad_norm": 0.5194367170333862, "learning_rate": 4.5731462925851704e-05, "loss": 0.3921, "step": 437 }, { "epoch": 1.752, "grad_norm": 0.510465145111084, "learning_rate": 4.5721442885771546e-05, "loss": 0.4055, "step": 438 }, { "epoch": 1.756, "grad_norm": 0.6061939597129822, 
"learning_rate": 4.571142284569139e-05, "loss": 0.4404, "step": 439 }, { "epoch": 1.76, "grad_norm": 0.5550720691680908, "learning_rate": 4.570140280561123e-05, "loss": 0.4366, "step": 440 }, { "epoch": 1.764, "grad_norm": 0.5832468867301941, "learning_rate": 4.569138276553106e-05, "loss": 0.4176, "step": 441 }, { "epoch": 1.768, "grad_norm": 0.5731306672096252, "learning_rate": 4.5681362725450904e-05, "loss": 0.3685, "step": 442 }, { "epoch": 1.772, "grad_norm": 0.6895073056221008, "learning_rate": 4.5671342685370745e-05, "loss": 0.4906, "step": 443 }, { "epoch": 1.776, "grad_norm": 0.6335456967353821, "learning_rate": 4.566132264529058e-05, "loss": 0.4004, "step": 444 }, { "epoch": 1.78, "grad_norm": 0.5286179780960083, "learning_rate": 4.565130260521042e-05, "loss": 0.409, "step": 445 }, { "epoch": 1.784, "grad_norm": 0.626899003982544, "learning_rate": 4.564128256513026e-05, "loss": 0.4522, "step": 446 }, { "epoch": 1.788, "grad_norm": 0.5576806664466858, "learning_rate": 4.56312625250501e-05, "loss": 0.4494, "step": 447 }, { "epoch": 1.792, "grad_norm": 0.5164839625358582, "learning_rate": 4.562124248496994e-05, "loss": 0.4463, "step": 448 }, { "epoch": 1.796, "grad_norm": 0.5332900881767273, "learning_rate": 4.5611222444889786e-05, "loss": 0.4454, "step": 449 }, { "epoch": 1.8, "grad_norm": 0.6575366258621216, "learning_rate": 4.560120240480962e-05, "loss": 0.4384, "step": 450 }, { "epoch": 1.804, "grad_norm": 0.571654200553894, "learning_rate": 4.559118236472946e-05, "loss": 0.4229, "step": 451 }, { "epoch": 1.808, "grad_norm": 0.5586163997650146, "learning_rate": 4.55811623246493e-05, "loss": 0.4484, "step": 452 }, { "epoch": 1.812, "grad_norm": 0.5708151459693909, "learning_rate": 4.557114228456914e-05, "loss": 0.4247, "step": 453 }, { "epoch": 1.8159999999999998, "grad_norm": 0.5310369729995728, "learning_rate": 4.556112224448898e-05, "loss": 0.4246, "step": 454 }, { "epoch": 1.8199999999999998, "grad_norm": 0.5038295984268188, "learning_rate": 
4.555110220440882e-05, "loss": 0.3868, "step": 455 }, { "epoch": 1.8239999999999998, "grad_norm": 0.6038540005683899, "learning_rate": 4.5541082164328655e-05, "loss": 0.5129, "step": 456 }, { "epoch": 1.8279999999999998, "grad_norm": 0.6133531928062439, "learning_rate": 4.5531062124248496e-05, "loss": 0.4609, "step": 457 }, { "epoch": 1.8319999999999999, "grad_norm": 0.6467635035514832, "learning_rate": 4.5521042084168344e-05, "loss": 0.4598, "step": 458 }, { "epoch": 1.8359999999999999, "grad_norm": 0.5676400661468506, "learning_rate": 4.551102204408818e-05, "loss": 0.3724, "step": 459 }, { "epoch": 1.8399999999999999, "grad_norm": 3.835160732269287, "learning_rate": 4.550100200400802e-05, "loss": 0.3293, "step": 460 }, { "epoch": 1.8439999999999999, "grad_norm": 0.5771289467811584, "learning_rate": 4.549098196392786e-05, "loss": 0.4299, "step": 461 }, { "epoch": 1.8479999999999999, "grad_norm": 0.5256178379058838, "learning_rate": 4.5480961923847696e-05, "loss": 0.3774, "step": 462 }, { "epoch": 1.8519999999999999, "grad_norm": 0.5819401741027832, "learning_rate": 4.547094188376754e-05, "loss": 0.4542, "step": 463 }, { "epoch": 1.8559999999999999, "grad_norm": 0.5631145238876343, "learning_rate": 4.546092184368738e-05, "loss": 0.4326, "step": 464 }, { "epoch": 1.8599999999999999, "grad_norm": 0.5169239044189453, "learning_rate": 4.545090180360721e-05, "loss": 0.3941, "step": 465 }, { "epoch": 1.8639999999999999, "grad_norm": 0.5691187381744385, "learning_rate": 4.5440881763527054e-05, "loss": 0.483, "step": 466 }, { "epoch": 1.8679999999999999, "grad_norm": 0.5725812911987305, "learning_rate": 4.5430861723446895e-05, "loss": 0.448, "step": 467 }, { "epoch": 1.8719999999999999, "grad_norm": 0.5266627669334412, "learning_rate": 4.542084168336674e-05, "loss": 0.4355, "step": 468 }, { "epoch": 1.876, "grad_norm": 0.4844062924385071, "learning_rate": 4.541082164328658e-05, "loss": 0.4057, "step": 469 }, { "epoch": 1.88, "grad_norm": 0.5010110139846802, 
"learning_rate": 4.540080160320642e-05, "loss": 0.4314, "step": 470 }, { "epoch": 1.884, "grad_norm": 0.5683146715164185, "learning_rate": 4.5390781563126254e-05, "loss": 0.4376, "step": 471 }, { "epoch": 1.888, "grad_norm": 0.535389244556427, "learning_rate": 4.5380761523046095e-05, "loss": 0.4047, "step": 472 }, { "epoch": 1.892, "grad_norm": 0.49679070711135864, "learning_rate": 4.5370741482965936e-05, "loss": 0.4082, "step": 473 }, { "epoch": 1.896, "grad_norm": 0.527479887008667, "learning_rate": 4.536072144288577e-05, "loss": 0.4829, "step": 474 }, { "epoch": 1.9, "grad_norm": 0.5545944571495056, "learning_rate": 4.535070140280561e-05, "loss": 0.4503, "step": 475 }, { "epoch": 1.904, "grad_norm": 0.5213431119918823, "learning_rate": 4.5340681362725453e-05, "loss": 0.3898, "step": 476 }, { "epoch": 1.908, "grad_norm": 0.5331659317016602, "learning_rate": 4.533066132264529e-05, "loss": 0.4274, "step": 477 }, { "epoch": 1.912, "grad_norm": 0.4619419574737549, "learning_rate": 4.5320641282565136e-05, "loss": 0.3848, "step": 478 }, { "epoch": 1.916, "grad_norm": 0.5063828825950623, "learning_rate": 4.531062124248498e-05, "loss": 0.4625, "step": 479 }, { "epoch": 1.92, "grad_norm": 0.4973587095737457, "learning_rate": 4.530060120240481e-05, "loss": 0.4274, "step": 480 }, { "epoch": 1.924, "grad_norm": 0.49490657448768616, "learning_rate": 4.529058116232465e-05, "loss": 0.4322, "step": 481 }, { "epoch": 1.928, "grad_norm": 4.3649067878723145, "learning_rate": 4.5280561122244494e-05, "loss": 0.4085, "step": 482 }, { "epoch": 1.932, "grad_norm": 0.5393351316452026, "learning_rate": 4.527054108216433e-05, "loss": 0.4309, "step": 483 }, { "epoch": 1.936, "grad_norm": 0.6917522549629211, "learning_rate": 4.526052104208417e-05, "loss": 0.268, "step": 484 }, { "epoch": 1.94, "grad_norm": 0.5249350666999817, "learning_rate": 4.525050100200401e-05, "loss": 0.4548, "step": 485 }, { "epoch": 1.944, "grad_norm": 0.5986626148223877, "learning_rate": 4.5240480961923846e-05, 
"loss": 0.4347, "step": 486 }, { "epoch": 1.948, "grad_norm": 0.6276787519454956, "learning_rate": 4.523046092184369e-05, "loss": 0.5127, "step": 487 }, { "epoch": 1.952, "grad_norm": 0.5993108749389648, "learning_rate": 4.522044088176353e-05, "loss": 0.4559, "step": 488 }, { "epoch": 1.956, "grad_norm": 0.4931926131248474, "learning_rate": 4.521042084168337e-05, "loss": 0.4144, "step": 489 }, { "epoch": 1.96, "grad_norm": 0.482890784740448, "learning_rate": 4.520040080160321e-05, "loss": 0.2603, "step": 490 }, { "epoch": 1.964, "grad_norm": 0.6805306077003479, "learning_rate": 4.519038076152305e-05, "loss": 0.5272, "step": 491 }, { "epoch": 1.968, "grad_norm": 0.6288460493087769, "learning_rate": 4.518036072144289e-05, "loss": 0.4455, "step": 492 }, { "epoch": 1.972, "grad_norm": 0.5547501444816589, "learning_rate": 4.517034068136273e-05, "loss": 0.4432, "step": 493 }, { "epoch": 1.976, "grad_norm": 0.545608639717102, "learning_rate": 4.516032064128257e-05, "loss": 0.4219, "step": 494 }, { "epoch": 1.98, "grad_norm": 0.5442771911621094, "learning_rate": 4.5150300601202404e-05, "loss": 0.4113, "step": 495 }, { "epoch": 1.984, "grad_norm": 0.5810039043426514, "learning_rate": 4.5140280561122245e-05, "loss": 0.4304, "step": 496 }, { "epoch": 1.988, "grad_norm": 0.5679133534431458, "learning_rate": 4.5130260521042086e-05, "loss": 0.3819, "step": 497 }, { "epoch": 1.992, "grad_norm": 0.5476593375205994, "learning_rate": 4.512024048096193e-05, "loss": 0.4566, "step": 498 }, { "epoch": 1.996, "grad_norm": 0.516356348991394, "learning_rate": 4.511022044088177e-05, "loss": 0.427, "step": 499 }, { "epoch": 2.0, "grad_norm": 0.5612138509750366, "learning_rate": 4.5100200400801604e-05, "loss": 0.4089, "step": 500 }, { "epoch": 2.004, "grad_norm": 0.5504969358444214, "learning_rate": 4.5090180360721445e-05, "loss": 0.3998, "step": 501 }, { "epoch": 2.008, "grad_norm": 0.5878968238830566, "learning_rate": 4.5080160320641286e-05, "loss": 0.3539, "step": 502 }, { "epoch": 2.012, 
"grad_norm": 0.5411523580551147, "learning_rate": 4.507014028056112e-05, "loss": 0.3457, "step": 503 }, { "epoch": 2.016, "grad_norm": 0.4899858236312866, "learning_rate": 4.506012024048096e-05, "loss": 0.3602, "step": 504 }, { "epoch": 2.02, "grad_norm": 0.5289257764816284, "learning_rate": 4.50501002004008e-05, "loss": 0.4093, "step": 505 }, { "epoch": 2.024, "grad_norm": 0.5221056938171387, "learning_rate": 4.5040080160320644e-05, "loss": 0.3818, "step": 506 }, { "epoch": 2.028, "grad_norm": 0.5114055871963501, "learning_rate": 4.503006012024048e-05, "loss": 0.3801, "step": 507 }, { "epoch": 2.032, "grad_norm": 0.5133730173110962, "learning_rate": 4.502004008016033e-05, "loss": 0.3916, "step": 508 }, { "epoch": 2.036, "grad_norm": 0.6223206520080566, "learning_rate": 4.501002004008016e-05, "loss": 0.3904, "step": 509 }, { "epoch": 2.04, "grad_norm": 0.597932755947113, "learning_rate": 4.5e-05, "loss": 0.3352, "step": 510 }, { "epoch": 2.044, "grad_norm": 0.6192795038223267, "learning_rate": 4.4989979959919844e-05, "loss": 0.3788, "step": 511 }, { "epoch": 2.048, "grad_norm": 0.5938431024551392, "learning_rate": 4.497995991983968e-05, "loss": 0.3946, "step": 512 }, { "epoch": 2.052, "grad_norm": 0.5885916352272034, "learning_rate": 4.496993987975952e-05, "loss": 0.3706, "step": 513 }, { "epoch": 2.056, "grad_norm": 0.6078701615333557, "learning_rate": 4.495991983967936e-05, "loss": 0.3561, "step": 514 }, { "epoch": 2.06, "grad_norm": 0.6931913495063782, "learning_rate": 4.4949899799599196e-05, "loss": 0.3697, "step": 515 }, { "epoch": 2.064, "grad_norm": 0.6855788230895996, "learning_rate": 4.493987975951904e-05, "loss": 0.4235, "step": 516 }, { "epoch": 2.068, "grad_norm": 0.5608630180358887, "learning_rate": 4.4929859719438885e-05, "loss": 0.3561, "step": 517 }, { "epoch": 2.072, "grad_norm": 0.6403824687004089, "learning_rate": 4.491983967935872e-05, "loss": 0.3701, "step": 518 }, { "epoch": 2.076, "grad_norm": 0.5921077728271484, "learning_rate": 
4.490981963927856e-05, "loss": 0.36, "step": 519 }, { "epoch": 2.08, "grad_norm": 0.6234927773475647, "learning_rate": 4.48997995991984e-05, "loss": 0.4041, "step": 520 }, { "epoch": 2.084, "grad_norm": 0.6670336723327637, "learning_rate": 4.488977955911824e-05, "loss": 0.3733, "step": 521 }, { "epoch": 2.088, "grad_norm": 0.6524705290794373, "learning_rate": 4.487975951903808e-05, "loss": 0.344, "step": 522 }, { "epoch": 2.092, "grad_norm": 0.6538939476013184, "learning_rate": 4.486973947895792e-05, "loss": 0.4209, "step": 523 }, { "epoch": 2.096, "grad_norm": 0.6096150875091553, "learning_rate": 4.4859719438877754e-05, "loss": 0.3739, "step": 524 }, { "epoch": 2.1, "grad_norm": 0.5730370879173279, "learning_rate": 4.4849699398797595e-05, "loss": 0.3912, "step": 525 }, { "epoch": 2.104, "grad_norm": 0.6088821887969971, "learning_rate": 4.4839679358717436e-05, "loss": 0.3633, "step": 526 }, { "epoch": 2.108, "grad_norm": 0.9573196768760681, "learning_rate": 4.482965931863728e-05, "loss": 0.2372, "step": 527 }, { "epoch": 2.112, "grad_norm": 0.6470226049423218, "learning_rate": 4.481963927855712e-05, "loss": 0.369, "step": 528 }, { "epoch": 2.116, "grad_norm": 0.6785104274749756, "learning_rate": 4.480961923847696e-05, "loss": 0.4002, "step": 529 }, { "epoch": 2.12, "grad_norm": 0.46951213479042053, "learning_rate": 4.4799599198396795e-05, "loss": 0.2299, "step": 530 }, { "epoch": 2.124, "grad_norm": 0.6221892237663269, "learning_rate": 4.4789579158316636e-05, "loss": 0.3324, "step": 531 }, { "epoch": 2.128, "grad_norm": 0.6120625138282776, "learning_rate": 4.477955911823648e-05, "loss": 0.3473, "step": 532 }, { "epoch": 2.132, "grad_norm": 0.6631773710250854, "learning_rate": 4.476953907815631e-05, "loss": 0.3919, "step": 533 }, { "epoch": 2.136, "grad_norm": 0.6568106412887573, "learning_rate": 4.475951903807615e-05, "loss": 0.223, "step": 534 }, { "epoch": 2.14, "grad_norm": 0.6405401825904846, "learning_rate": 4.4749498997995994e-05, "loss": 0.3438, "step": 535 
}, { "epoch": 2.144, "grad_norm": 0.7879334092140198, "learning_rate": 4.473947895791583e-05, "loss": 0.3801, "step": 536 }, { "epoch": 2.148, "grad_norm": 0.6276340484619141, "learning_rate": 4.472945891783568e-05, "loss": 0.365, "step": 537 }, { "epoch": 2.152, "grad_norm": 0.7515552043914795, "learning_rate": 4.471943887775552e-05, "loss": 0.3993, "step": 538 }, { "epoch": 2.156, "grad_norm": 0.6806215643882751, "learning_rate": 4.470941883767535e-05, "loss": 0.3654, "step": 539 }, { "epoch": 2.16, "grad_norm": 0.6749470829963684, "learning_rate": 4.4699398797595194e-05, "loss": 0.3565, "step": 540 }, { "epoch": 2.164, "grad_norm": 0.8084760308265686, "learning_rate": 4.4689378757515035e-05, "loss": 0.4149, "step": 541 }, { "epoch": 2.168, "grad_norm": 0.6896063089370728, "learning_rate": 4.467935871743487e-05, "loss": 0.3676, "step": 542 }, { "epoch": 2.172, "grad_norm": 0.665913999080658, "learning_rate": 4.466933867735471e-05, "loss": 0.3514, "step": 543 }, { "epoch": 2.176, "grad_norm": 0.6971928477287292, "learning_rate": 4.465931863727455e-05, "loss": 0.4026, "step": 544 }, { "epoch": 2.18, "grad_norm": 0.752924919128418, "learning_rate": 4.464929859719439e-05, "loss": 0.3677, "step": 545 }, { "epoch": 2.184, "grad_norm": 0.6077944040298462, "learning_rate": 4.463927855711423e-05, "loss": 0.3712, "step": 546 }, { "epoch": 2.188, "grad_norm": 0.6650569438934326, "learning_rate": 4.462925851703407e-05, "loss": 0.3999, "step": 547 }, { "epoch": 2.192, "grad_norm": 0.6830900311470032, "learning_rate": 4.461923847695391e-05, "loss": 0.3617, "step": 548 }, { "epoch": 2.196, "grad_norm": 0.6619176268577576, "learning_rate": 4.460921843687375e-05, "loss": 0.3707, "step": 549 }, { "epoch": 2.2, "grad_norm": 0.6596035361289978, "learning_rate": 4.459919839679359e-05, "loss": 0.4169, "step": 550 }, { "epoch": 2.204, "grad_norm": 0.636611819267273, "learning_rate": 4.458917835671343e-05, "loss": 0.3806, "step": 551 }, { "epoch": 2.208, "grad_norm": 0.6402906179428101, 
"learning_rate": 4.457915831663327e-05, "loss": 0.3406, "step": 552 }, { "epoch": 2.212, "grad_norm": 0.6241163015365601, "learning_rate": 4.456913827655311e-05, "loss": 0.3814, "step": 553 }, { "epoch": 2.216, "grad_norm": 0.73136305809021, "learning_rate": 4.4559118236472945e-05, "loss": 0.3599, "step": 554 }, { "epoch": 2.22, "grad_norm": 0.9286080002784729, "learning_rate": 4.4549098196392786e-05, "loss": 0.3629, "step": 555 }, { "epoch": 2.224, "grad_norm": 0.6749750375747681, "learning_rate": 4.453907815631263e-05, "loss": 0.3829, "step": 556 }, { "epoch": 2.228, "grad_norm": 0.6232475638389587, "learning_rate": 4.452905811623247e-05, "loss": 0.3816, "step": 557 }, { "epoch": 2.232, "grad_norm": 0.6914759874343872, "learning_rate": 4.451903807615231e-05, "loss": 0.383, "step": 558 }, { "epoch": 2.2359999999999998, "grad_norm": 0.786461591720581, "learning_rate": 4.4509018036072144e-05, "loss": 0.3644, "step": 559 }, { "epoch": 2.24, "grad_norm": 0.7495032548904419, "learning_rate": 4.4498997995991986e-05, "loss": 0.3818, "step": 560 }, { "epoch": 2.2439999999999998, "grad_norm": 0.7083197236061096, "learning_rate": 4.448897795591183e-05, "loss": 0.3574, "step": 561 }, { "epoch": 2.248, "grad_norm": 0.6997913718223572, "learning_rate": 4.447895791583167e-05, "loss": 0.3473, "step": 562 }, { "epoch": 2.252, "grad_norm": 0.7174849510192871, "learning_rate": 4.44689378757515e-05, "loss": 0.331, "step": 563 }, { "epoch": 2.2560000000000002, "grad_norm": 0.6463573575019836, "learning_rate": 4.4458917835671344e-05, "loss": 0.3907, "step": 564 }, { "epoch": 2.26, "grad_norm": 0.6492186784744263, "learning_rate": 4.4448897795591185e-05, "loss": 0.373, "step": 565 }, { "epoch": 2.2640000000000002, "grad_norm": 0.7330689430236816, "learning_rate": 4.4438877755511027e-05, "loss": 0.3704, "step": 566 }, { "epoch": 2.268, "grad_norm": 0.7165907621383667, "learning_rate": 4.442885771543087e-05, "loss": 0.4114, "step": 567 }, { "epoch": 2.2720000000000002, "grad_norm": 
0.7889212369918823, "learning_rate": 4.44188376753507e-05, "loss": 0.3837, "step": 568 }, { "epoch": 2.276, "grad_norm": 0.7672508358955383, "learning_rate": 4.4408817635270544e-05, "loss": 0.3647, "step": 569 }, { "epoch": 2.2800000000000002, "grad_norm": 0.7716801762580872, "learning_rate": 4.4398797595190385e-05, "loss": 0.4272, "step": 570 }, { "epoch": 2.284, "grad_norm": 0.6814711689949036, "learning_rate": 4.438877755511022e-05, "loss": 0.4071, "step": 571 }, { "epoch": 2.288, "grad_norm": 0.7873872518539429, "learning_rate": 4.437875751503006e-05, "loss": 0.3599, "step": 572 }, { "epoch": 2.292, "grad_norm": 0.6841912865638733, "learning_rate": 4.43687374749499e-05, "loss": 0.3494, "step": 573 }, { "epoch": 2.296, "grad_norm": 1.1426913738250732, "learning_rate": 4.4358717434869737e-05, "loss": 0.2038, "step": 574 }, { "epoch": 2.3, "grad_norm": 0.7427853345870972, "learning_rate": 4.434869739478958e-05, "loss": 0.4356, "step": 575 }, { "epoch": 2.304, "grad_norm": 0.6541914939880371, "learning_rate": 4.4338677354709426e-05, "loss": 0.3992, "step": 576 }, { "epoch": 2.308, "grad_norm": 0.6165962219238281, "learning_rate": 4.432865731462926e-05, "loss": 0.3442, "step": 577 }, { "epoch": 2.312, "grad_norm": 0.6729843616485596, "learning_rate": 4.43186372745491e-05, "loss": 0.3985, "step": 578 }, { "epoch": 2.316, "grad_norm": 0.735693097114563, "learning_rate": 4.430861723446894e-05, "loss": 0.3458, "step": 579 }, { "epoch": 2.32, "grad_norm": 0.6572169065475464, "learning_rate": 4.429859719438878e-05, "loss": 0.3533, "step": 580 }, { "epoch": 2.324, "grad_norm": 0.7412426471710205, "learning_rate": 4.428857715430862e-05, "loss": 0.3494, "step": 581 }, { "epoch": 2.328, "grad_norm": 0.7071847319602966, "learning_rate": 4.427855711422846e-05, "loss": 0.3647, "step": 582 }, { "epoch": 2.332, "grad_norm": 0.8219819068908691, "learning_rate": 4.4268537074148294e-05, "loss": 0.4073, "step": 583 }, { "epoch": 2.336, "grad_norm": 0.8309367895126343, "learning_rate": 
4.4258517034068136e-05, "loss": 0.3826, "step": 584 }, { "epoch": 2.34, "grad_norm": 0.7580353617668152, "learning_rate": 4.424849699398798e-05, "loss": 0.3815, "step": 585 }, { "epoch": 2.344, "grad_norm": 0.6513029336929321, "learning_rate": 4.423847695390782e-05, "loss": 0.3712, "step": 586 }, { "epoch": 2.348, "grad_norm": 0.7023140788078308, "learning_rate": 4.422845691382766e-05, "loss": 0.3733, "step": 587 }, { "epoch": 2.352, "grad_norm": 0.7600807547569275, "learning_rate": 4.42184368737475e-05, "loss": 0.3862, "step": 588 }, { "epoch": 2.356, "grad_norm": 0.7764167189598083, "learning_rate": 4.4208416833667335e-05, "loss": 0.415, "step": 589 }, { "epoch": 2.36, "grad_norm": 0.8243778347969055, "learning_rate": 4.419839679358718e-05, "loss": 0.3776, "step": 590 }, { "epoch": 2.364, "grad_norm": 0.7651914954185486, "learning_rate": 4.418837675350702e-05, "loss": 0.3871, "step": 591 }, { "epoch": 2.368, "grad_norm": 0.7026492357254028, "learning_rate": 4.417835671342685e-05, "loss": 0.3352, "step": 592 }, { "epoch": 2.372, "grad_norm": 0.6480612754821777, "learning_rate": 4.4168336673346694e-05, "loss": 0.3574, "step": 593 }, { "epoch": 2.376, "grad_norm": 0.6376757621765137, "learning_rate": 4.4158316633266535e-05, "loss": 0.3865, "step": 594 }, { "epoch": 2.38, "grad_norm": 0.7384195327758789, "learning_rate": 4.414829659318637e-05, "loss": 0.346, "step": 595 }, { "epoch": 2.384, "grad_norm": 0.7906630635261536, "learning_rate": 4.413827655310622e-05, "loss": 0.409, "step": 596 }, { "epoch": 2.388, "grad_norm": 0.7145755290985107, "learning_rate": 4.412825651302606e-05, "loss": 0.3461, "step": 597 }, { "epoch": 2.392, "grad_norm": 0.8087967038154602, "learning_rate": 4.4118236472945893e-05, "loss": 0.215, "step": 598 }, { "epoch": 2.396, "grad_norm": 0.7960618138313293, "learning_rate": 4.4108216432865735e-05, "loss": 0.4122, "step": 599 }, { "epoch": 2.4, "grad_norm": 0.6827014684677124, "learning_rate": 4.4098196392785576e-05, "loss": 0.3964, "step": 600 
}, { "epoch": 2.404, "grad_norm": 0.7268480062484741, "learning_rate": 4.408817635270541e-05, "loss": 0.3557, "step": 601 }, { "epoch": 2.408, "grad_norm": 0.7457350492477417, "learning_rate": 4.407815631262525e-05, "loss": 0.4288, "step": 602 }, { "epoch": 2.412, "grad_norm": 0.7236207127571106, "learning_rate": 4.406813627254509e-05, "loss": 0.4079, "step": 603 }, { "epoch": 2.416, "grad_norm": 0.6497883200645447, "learning_rate": 4.405811623246493e-05, "loss": 0.4348, "step": 604 }, { "epoch": 2.42, "grad_norm": 0.7279455065727234, "learning_rate": 4.404809619238477e-05, "loss": 0.3534, "step": 605 }, { "epoch": 2.424, "grad_norm": 0.7461408972740173, "learning_rate": 4.403807615230462e-05, "loss": 0.3619, "step": 606 }, { "epoch": 2.428, "grad_norm": 0.7696019411087036, "learning_rate": 4.402805611222445e-05, "loss": 0.3546, "step": 607 }, { "epoch": 2.432, "grad_norm": 0.7570071816444397, "learning_rate": 4.401803607214429e-05, "loss": 0.3496, "step": 608 }, { "epoch": 2.436, "grad_norm": 0.6898276805877686, "learning_rate": 4.4008016032064134e-05, "loss": 0.3789, "step": 609 }, { "epoch": 2.44, "grad_norm": 0.7434217929840088, "learning_rate": 4.399799599198397e-05, "loss": 0.3736, "step": 610 }, { "epoch": 2.444, "grad_norm": 0.8145261406898499, "learning_rate": 4.398797595190381e-05, "loss": 0.3516, "step": 611 }, { "epoch": 2.448, "grad_norm": 0.7646612524986267, "learning_rate": 4.397795591182365e-05, "loss": 0.3867, "step": 612 }, { "epoch": 2.452, "grad_norm": 0.9545761942863464, "learning_rate": 4.3967935871743486e-05, "loss": 0.3563, "step": 613 }, { "epoch": 2.456, "grad_norm": 0.8131189346313477, "learning_rate": 4.395791583166333e-05, "loss": 0.389, "step": 614 }, { "epoch": 2.46, "grad_norm": 0.7146084308624268, "learning_rate": 4.394789579158317e-05, "loss": 0.3727, "step": 615 }, { "epoch": 2.464, "grad_norm": 0.7887970209121704, "learning_rate": 4.393787575150301e-05, "loss": 0.3596, "step": 616 }, { "epoch": 2.468, "grad_norm": 
0.7648036479949951, "learning_rate": 4.392785571142285e-05, "loss": 0.3388, "step": 617 }, { "epoch": 2.472, "grad_norm": 0.80208420753479, "learning_rate": 4.391783567134269e-05, "loss": 0.3498, "step": 618 }, { "epoch": 2.476, "grad_norm": 0.7335792183876038, "learning_rate": 4.3907815631262526e-05, "loss": 0.3678, "step": 619 }, { "epoch": 2.48, "grad_norm": 0.7437960505485535, "learning_rate": 4.389779559118237e-05, "loss": 0.3697, "step": 620 }, { "epoch": 2.484, "grad_norm": 0.687762975692749, "learning_rate": 4.388777555110221e-05, "loss": 0.3292, "step": 621 }, { "epoch": 2.488, "grad_norm": 0.7212290167808533, "learning_rate": 4.3877755511022044e-05, "loss": 0.3707, "step": 622 }, { "epoch": 2.492, "grad_norm": 0.6740947365760803, "learning_rate": 4.3867735470941885e-05, "loss": 0.33, "step": 623 }, { "epoch": 2.496, "grad_norm": 0.7523700594902039, "learning_rate": 4.3857715430861726e-05, "loss": 0.348, "step": 624 }, { "epoch": 2.5, "grad_norm": 0.8317469954490662, "learning_rate": 4.384769539078157e-05, "loss": 0.4042, "step": 625 }, { "epoch": 2.504, "grad_norm": 0.8676090836524963, "learning_rate": 4.383767535070141e-05, "loss": 0.3682, "step": 626 }, { "epoch": 2.508, "grad_norm": 0.7623687386512756, "learning_rate": 4.382765531062124e-05, "loss": 0.4103, "step": 627 }, { "epoch": 2.512, "grad_norm": 0.756071925163269, "learning_rate": 4.3817635270541084e-05, "loss": 0.3673, "step": 628 }, { "epoch": 2.516, "grad_norm": 0.7560201287269592, "learning_rate": 4.3807615230460926e-05, "loss": 0.3961, "step": 629 }, { "epoch": 2.52, "grad_norm": 0.8405265212059021, "learning_rate": 4.379759519038076e-05, "loss": 0.3891, "step": 630 }, { "epoch": 2.524, "grad_norm": 0.7683942317962646, "learning_rate": 4.37875751503006e-05, "loss": 0.4056, "step": 631 }, { "epoch": 2.528, "grad_norm": 0.7256227135658264, "learning_rate": 4.377755511022044e-05, "loss": 0.3555, "step": 632 }, { "epoch": 2.532, "grad_norm": 0.8724226355552673, "learning_rate": 
4.3767535070140284e-05, "loss": 0.4297, "step": 633 }, { "epoch": 2.536, "grad_norm": 0.8041679263114929, "learning_rate": 4.375751503006012e-05, "loss": 0.3846, "step": 634 }, { "epoch": 2.54, "grad_norm": 0.7637845873832703, "learning_rate": 4.374749498997997e-05, "loss": 0.3578, "step": 635 }, { "epoch": 2.544, "grad_norm": 0.7199541330337524, "learning_rate": 4.37374749498998e-05, "loss": 0.3484, "step": 636 }, { "epoch": 2.548, "grad_norm": 0.7177748680114746, "learning_rate": 4.372745490981964e-05, "loss": 0.341, "step": 637 }, { "epoch": 2.552, "grad_norm": 0.683948814868927, "learning_rate": 4.3717434869739484e-05, "loss": 0.3593, "step": 638 }, { "epoch": 2.556, "grad_norm": 0.7331079840660095, "learning_rate": 4.370741482965932e-05, "loss": 0.3521, "step": 639 }, { "epoch": 2.56, "grad_norm": 0.7923868298530579, "learning_rate": 4.369739478957916e-05, "loss": 0.3775, "step": 640 }, { "epoch": 2.564, "grad_norm": 0.8066992163658142, "learning_rate": 4.3687374749499e-05, "loss": 0.362, "step": 641 }, { "epoch": 2.568, "grad_norm": 0.8344401121139526, "learning_rate": 4.3677354709418835e-05, "loss": 0.3877, "step": 642 }, { "epoch": 2.572, "grad_norm": 0.7773652076721191, "learning_rate": 4.3667334669338677e-05, "loss": 0.3878, "step": 643 }, { "epoch": 2.576, "grad_norm": 0.9228508472442627, "learning_rate": 4.365731462925852e-05, "loss": 0.3737, "step": 644 }, { "epoch": 2.58, "grad_norm": 0.7461867928504944, "learning_rate": 4.364729458917836e-05, "loss": 0.3422, "step": 645 }, { "epoch": 2.584, "grad_norm": 0.7550526261329651, "learning_rate": 4.36372745490982e-05, "loss": 0.3748, "step": 646 }, { "epoch": 2.588, "grad_norm": 0.7776896953582764, "learning_rate": 4.362725450901804e-05, "loss": 0.3629, "step": 647 }, { "epoch": 2.592, "grad_norm": 0.8079218864440918, "learning_rate": 4.3617234468937876e-05, "loss": 0.352, "step": 648 }, { "epoch": 2.596, "grad_norm": 0.7805841565132141, "learning_rate": 4.360721442885772e-05, "loss": 0.4112, "step": 649 }, 
{ "epoch": 2.6, "grad_norm": 0.6444045305252075, "learning_rate": 4.359719438877756e-05, "loss": 0.2286, "step": 650 }, { "epoch": 2.604, "grad_norm": 0.7929733395576477, "learning_rate": 4.358717434869739e-05, "loss": 0.368, "step": 651 }, { "epoch": 2.608, "grad_norm": 0.7788984775543213, "learning_rate": 4.3577154308617235e-05, "loss": 0.3658, "step": 652 }, { "epoch": 2.612, "grad_norm": 0.7166882753372192, "learning_rate": 4.3567134268537076e-05, "loss": 0.3472, "step": 653 }, { "epoch": 2.616, "grad_norm": 0.7755756974220276, "learning_rate": 4.355711422845691e-05, "loss": 0.3748, "step": 654 }, { "epoch": 2.62, "grad_norm": 0.8172996640205383, "learning_rate": 4.354709418837676e-05, "loss": 0.3924, "step": 655 }, { "epoch": 2.624, "grad_norm": 0.739886462688446, "learning_rate": 4.35370741482966e-05, "loss": 0.4263, "step": 656 }, { "epoch": 2.628, "grad_norm": 0.8171461224555969, "learning_rate": 4.3527054108216434e-05, "loss": 0.3717, "step": 657 }, { "epoch": 2.632, "grad_norm": 0.7296169400215149, "learning_rate": 4.3517034068136275e-05, "loss": 0.3753, "step": 658 }, { "epoch": 2.636, "grad_norm": 0.7279816269874573, "learning_rate": 4.350701402805612e-05, "loss": 0.4018, "step": 659 }, { "epoch": 2.64, "grad_norm": 0.6948480606079102, "learning_rate": 4.349699398797595e-05, "loss": 0.3866, "step": 660 }, { "epoch": 2.644, "grad_norm": 0.7570900321006775, "learning_rate": 4.348697394789579e-05, "loss": 0.3787, "step": 661 }, { "epoch": 2.648, "grad_norm": 0.7140201330184937, "learning_rate": 4.3476953907815634e-05, "loss": 0.3161, "step": 662 }, { "epoch": 2.652, "grad_norm": 0.7074480056762695, "learning_rate": 4.346693386773547e-05, "loss": 0.4135, "step": 663 }, { "epoch": 2.656, "grad_norm": 0.7118886113166809, "learning_rate": 4.345691382765531e-05, "loss": 0.3456, "step": 664 }, { "epoch": 2.66, "grad_norm": 0.7889545559883118, "learning_rate": 4.344689378757516e-05, "loss": 0.3727, "step": 665 }, { "epoch": 2.664, "grad_norm": 0.7217925786972046, 
"learning_rate": 4.343687374749499e-05, "loss": 0.3396, "step": 666 }, { "epoch": 2.668, "grad_norm": 0.5060538053512573, "learning_rate": 4.3426853707414833e-05, "loss": 0.2113, "step": 667 }, { "epoch": 2.672, "grad_norm": 0.8558411002159119, "learning_rate": 4.3416833667334675e-05, "loss": 0.4217, "step": 668 }, { "epoch": 2.676, "grad_norm": 0.7091007828712463, "learning_rate": 4.340681362725451e-05, "loss": 0.3889, "step": 669 }, { "epoch": 2.68, "grad_norm": 0.8151986598968506, "learning_rate": 4.339679358717435e-05, "loss": 0.3808, "step": 670 }, { "epoch": 2.684, "grad_norm": 0.8028424978256226, "learning_rate": 4.338677354709419e-05, "loss": 0.376, "step": 671 }, { "epoch": 2.6879999999999997, "grad_norm": 0.7256678938865662, "learning_rate": 4.3376753507014026e-05, "loss": 0.3924, "step": 672 }, { "epoch": 2.692, "grad_norm": 0.7175251245498657, "learning_rate": 4.336673346693387e-05, "loss": 0.3795, "step": 673 }, { "epoch": 2.6959999999999997, "grad_norm": 0.8073795437812805, "learning_rate": 4.335671342685371e-05, "loss": 0.3812, "step": 674 }, { "epoch": 2.7, "grad_norm": 0.7120643258094788, "learning_rate": 4.334669338677355e-05, "loss": 0.3856, "step": 675 }, { "epoch": 2.7039999999999997, "grad_norm": 0.6971157193183899, "learning_rate": 4.333667334669339e-05, "loss": 0.3223, "step": 676 }, { "epoch": 2.708, "grad_norm": 0.7540895938873291, "learning_rate": 4.332665330661323e-05, "loss": 0.3582, "step": 677 }, { "epoch": 2.7119999999999997, "grad_norm": 0.7137064337730408, "learning_rate": 4.331663326653307e-05, "loss": 0.3178, "step": 678 }, { "epoch": 2.716, "grad_norm": 0.7786818742752075, "learning_rate": 4.330661322645291e-05, "loss": 0.38, "step": 679 }, { "epoch": 2.7199999999999998, "grad_norm": 0.7963460087776184, "learning_rate": 4.329659318637275e-05, "loss": 0.3811, "step": 680 }, { "epoch": 2.724, "grad_norm": 0.9932142496109009, "learning_rate": 4.3286573146292584e-05, "loss": 0.4158, "step": 681 }, { "epoch": 2.7279999999999998, 
"grad_norm": 0.7535082101821899, "learning_rate": 4.3276553106212426e-05, "loss": 0.4385, "step": 682 }, { "epoch": 2.732, "grad_norm": 0.6805514097213745, "learning_rate": 4.326653306613227e-05, "loss": 0.3881, "step": 683 }, { "epoch": 2.7359999999999998, "grad_norm": 0.569618284702301, "learning_rate": 4.325651302605211e-05, "loss": 0.2244, "step": 684 }, { "epoch": 2.74, "grad_norm": 0.6559450030326843, "learning_rate": 4.324649298597195e-05, "loss": 0.3176, "step": 685 }, { "epoch": 2.7439999999999998, "grad_norm": 0.7690601944923401, "learning_rate": 4.3236472945891784e-05, "loss": 0.3672, "step": 686 }, { "epoch": 2.748, "grad_norm": 0.662957489490509, "learning_rate": 4.3226452905811625e-05, "loss": 0.3441, "step": 687 }, { "epoch": 2.752, "grad_norm": 0.5383203029632568, "learning_rate": 4.3216432865731467e-05, "loss": 0.2406, "step": 688 }, { "epoch": 2.7560000000000002, "grad_norm": 0.5156013369560242, "learning_rate": 4.320641282565131e-05, "loss": 0.1923, "step": 689 }, { "epoch": 2.76, "grad_norm": 0.798517644405365, "learning_rate": 4.319639278557114e-05, "loss": 0.3142, "step": 690 }, { "epoch": 2.7640000000000002, "grad_norm": 0.7717877626419067, "learning_rate": 4.3186372745490984e-05, "loss": 0.3537, "step": 691 }, { "epoch": 2.768, "grad_norm": 0.8068984150886536, "learning_rate": 4.3176352705410825e-05, "loss": 0.3579, "step": 692 }, { "epoch": 2.7720000000000002, "grad_norm": 0.7683663368225098, "learning_rate": 4.316633266533066e-05, "loss": 0.3463, "step": 693 }, { "epoch": 2.776, "grad_norm": 0.7648158669471741, "learning_rate": 4.315631262525051e-05, "loss": 0.3613, "step": 694 }, { "epoch": 2.7800000000000002, "grad_norm": 0.6192529201507568, "learning_rate": 4.314629258517034e-05, "loss": 0.2317, "step": 695 }, { "epoch": 2.784, "grad_norm": 0.7966995239257812, "learning_rate": 4.313627254509018e-05, "loss": 0.3571, "step": 696 }, { "epoch": 2.7880000000000003, "grad_norm": 0.7972453236579895, "learning_rate": 4.3126252505010025e-05, 
"loss": 0.3334, "step": 697 }, { "epoch": 2.792, "grad_norm": 0.7164406180381775, "learning_rate": 4.311623246492986e-05, "loss": 0.3394, "step": 698 }, { "epoch": 2.7960000000000003, "grad_norm": 0.8824707865715027, "learning_rate": 4.31062124248497e-05, "loss": 0.4308, "step": 699 }, { "epoch": 2.8, "grad_norm": 0.7947747111320496, "learning_rate": 4.309619238476954e-05, "loss": 0.3586, "step": 700 }, { "epoch": 2.8040000000000003, "grad_norm": 0.8112667202949524, "learning_rate": 4.3086172344689376e-05, "loss": 0.424, "step": 701 }, { "epoch": 2.808, "grad_norm": 0.7361273765563965, "learning_rate": 4.307615230460922e-05, "loss": 0.357, "step": 702 }, { "epoch": 2.8120000000000003, "grad_norm": 0.7715202569961548, "learning_rate": 4.306613226452906e-05, "loss": 0.4038, "step": 703 }, { "epoch": 2.816, "grad_norm": 0.7197621464729309, "learning_rate": 4.30561122244489e-05, "loss": 0.3378, "step": 704 }, { "epoch": 2.82, "grad_norm": 0.7480655312538147, "learning_rate": 4.304609218436874e-05, "loss": 0.4022, "step": 705 }, { "epoch": 2.824, "grad_norm": 0.6959013938903809, "learning_rate": 4.303607214428858e-05, "loss": 0.3704, "step": 706 }, { "epoch": 2.828, "grad_norm": 0.7385444641113281, "learning_rate": 4.302605210420842e-05, "loss": 0.3868, "step": 707 }, { "epoch": 2.832, "grad_norm": 0.7400097846984863, "learning_rate": 4.301603206412826e-05, "loss": 0.3851, "step": 708 }, { "epoch": 2.836, "grad_norm": 0.7167990207672119, "learning_rate": 4.30060120240481e-05, "loss": 0.3516, "step": 709 }, { "epoch": 2.84, "grad_norm": 0.7471911907196045, "learning_rate": 4.2995991983967934e-05, "loss": 0.3891, "step": 710 }, { "epoch": 2.844, "grad_norm": 0.670911431312561, "learning_rate": 4.2985971943887775e-05, "loss": 0.1749, "step": 711 }, { "epoch": 2.848, "grad_norm": 0.7623646259307861, "learning_rate": 4.297595190380762e-05, "loss": 0.3528, "step": 712 }, { "epoch": 2.852, "grad_norm": 0.7303867936134338, "learning_rate": 4.296593186372745e-05, "loss": 0.3443, 
"step": 713 }, { "epoch": 2.856, "grad_norm": 0.8334202766418457, "learning_rate": 4.29559118236473e-05, "loss": 0.3797, "step": 714 }, { "epoch": 2.86, "grad_norm": 0.8006396889686584, "learning_rate": 4.294589178356714e-05, "loss": 0.3702, "step": 715 }, { "epoch": 2.864, "grad_norm": 5.877392768859863, "learning_rate": 4.2935871743486975e-05, "loss": 0.3651, "step": 716 }, { "epoch": 2.868, "grad_norm": 0.7039988040924072, "learning_rate": 4.2925851703406816e-05, "loss": 0.3728, "step": 717 }, { "epoch": 2.872, "grad_norm": 0.777847409248352, "learning_rate": 4.291583166332666e-05, "loss": 0.3986, "step": 718 }, { "epoch": 2.876, "grad_norm": 0.7246628999710083, "learning_rate": 4.290581162324649e-05, "loss": 0.3821, "step": 719 }, { "epoch": 2.88, "grad_norm": 0.773358941078186, "learning_rate": 4.289579158316633e-05, "loss": 0.3758, "step": 720 }, { "epoch": 2.884, "grad_norm": 0.7800974249839783, "learning_rate": 4.2885771543086175e-05, "loss": 0.364, "step": 721 }, { "epoch": 2.888, "grad_norm": 0.8116373419761658, "learning_rate": 4.287575150300601e-05, "loss": 0.457, "step": 722 }, { "epoch": 2.892, "grad_norm": 0.6917723417282104, "learning_rate": 4.286573146292585e-05, "loss": 0.3549, "step": 723 }, { "epoch": 2.896, "grad_norm": 0.8000931739807129, "learning_rate": 4.28557114228457e-05, "loss": 0.3616, "step": 724 }, { "epoch": 2.9, "grad_norm": 0.7841587066650391, "learning_rate": 4.284569138276553e-05, "loss": 0.4008, "step": 725 }, { "epoch": 2.904, "grad_norm": 0.723857581615448, "learning_rate": 4.2835671342685374e-05, "loss": 0.3367, "step": 726 }, { "epoch": 2.908, "grad_norm": 0.7573955059051514, "learning_rate": 4.2825651302605216e-05, "loss": 0.3473, "step": 727 }, { "epoch": 2.912, "grad_norm": 0.6830270290374756, "learning_rate": 4.281563126252505e-05, "loss": 0.4192, "step": 728 }, { "epoch": 2.916, "grad_norm": 0.7317824363708496, "learning_rate": 4.280561122244489e-05, "loss": 0.3519, "step": 729 }, { "epoch": 2.92, "grad_norm": 
0.8307793736457825, "learning_rate": 4.279559118236473e-05, "loss": 0.385, "step": 730 }, { "epoch": 2.924, "grad_norm": 0.5436768531799316, "learning_rate": 4.278557114228457e-05, "loss": 0.225, "step": 731 }, { "epoch": 2.928, "grad_norm": 0.798338770866394, "learning_rate": 4.277555110220441e-05, "loss": 0.4136, "step": 732 }, { "epoch": 2.932, "grad_norm": 0.7588498592376709, "learning_rate": 4.2765531062124256e-05, "loss": 0.3705, "step": 733 }, { "epoch": 2.936, "grad_norm": 0.8589876890182495, "learning_rate": 4.275551102204409e-05, "loss": 0.3989, "step": 734 }, { "epoch": 2.94, "grad_norm": 0.6937852501869202, "learning_rate": 4.274549098196393e-05, "loss": 0.3804, "step": 735 }, { "epoch": 2.944, "grad_norm": 0.8076114058494568, "learning_rate": 4.2735470941883774e-05, "loss": 0.3755, "step": 736 }, { "epoch": 2.948, "grad_norm": 0.7671177983283997, "learning_rate": 4.272545090180361e-05, "loss": 0.3414, "step": 737 }, { "epoch": 2.952, "grad_norm": 0.7929058074951172, "learning_rate": 4.271543086172345e-05, "loss": 0.4099, "step": 738 }, { "epoch": 2.956, "grad_norm": 0.8027397394180298, "learning_rate": 4.270541082164329e-05, "loss": 0.3635, "step": 739 }, { "epoch": 2.96, "grad_norm": 0.7325429916381836, "learning_rate": 4.2695390781563125e-05, "loss": 0.3457, "step": 740 }, { "epoch": 2.964, "grad_norm": 0.7555444240570068, "learning_rate": 4.2685370741482966e-05, "loss": 0.4271, "step": 741 }, { "epoch": 2.968, "grad_norm": 0.6851345300674438, "learning_rate": 4.267535070140281e-05, "loss": 0.374, "step": 742 }, { "epoch": 2.972, "grad_norm": 0.7643485069274902, "learning_rate": 4.266533066132265e-05, "loss": 0.3338, "step": 743 }, { "epoch": 2.976, "grad_norm": 0.8755035996437073, "learning_rate": 4.265531062124249e-05, "loss": 0.3444, "step": 744 }, { "epoch": 2.98, "grad_norm": 0.6367295980453491, "learning_rate": 4.264529058116233e-05, "loss": 0.3667, "step": 745 }, { "epoch": 2.984, "grad_norm": 0.8797886967658997, "learning_rate": 
4.2635270541082166e-05, "loss": 0.4266, "step": 746 }, { "epoch": 2.988, "grad_norm": 0.6971140503883362, "learning_rate": 4.262525050100201e-05, "loss": 0.337, "step": 747 }, { "epoch": 2.992, "grad_norm": 0.751207709312439, "learning_rate": 4.261523046092185e-05, "loss": 0.3907, "step": 748 }, { "epoch": 2.996, "grad_norm": 0.7211257219314575, "learning_rate": 4.260521042084168e-05, "loss": 0.396, "step": 749 }, { "epoch": 3.0, "grad_norm": 0.9239338636398315, "learning_rate": 4.2595190380761524e-05, "loss": 0.4502, "step": 750 }, { "epoch": 3.004, "grad_norm": 0.779204249382019, "learning_rate": 4.2585170340681366e-05, "loss": 0.2879, "step": 751 }, { "epoch": 3.008, "grad_norm": 0.6356750726699829, "learning_rate": 4.25751503006012e-05, "loss": 0.2986, "step": 752 }, { "epoch": 3.012, "grad_norm": 0.8113148212432861, "learning_rate": 4.256513026052105e-05, "loss": 0.3308, "step": 753 }, { "epoch": 3.016, "grad_norm": 0.8562130331993103, "learning_rate": 4.255511022044088e-05, "loss": 0.3309, "step": 754 }, { "epoch": 3.02, "grad_norm": 0.7852639555931091, "learning_rate": 4.2545090180360724e-05, "loss": 0.253, "step": 755 }, { "epoch": 3.024, "grad_norm": 0.83427894115448, "learning_rate": 4.2535070140280565e-05, "loss": 0.2877, "step": 756 }, { "epoch": 3.028, "grad_norm": 0.8282837867736816, "learning_rate": 4.25250501002004e-05, "loss": 0.304, "step": 757 }, { "epoch": 3.032, "grad_norm": 0.895320475101471, "learning_rate": 4.251503006012024e-05, "loss": 0.3018, "step": 758 }, { "epoch": 3.036, "grad_norm": 0.9096167683601379, "learning_rate": 4.250501002004008e-05, "loss": 0.2782, "step": 759 }, { "epoch": 3.04, "grad_norm": 0.9886460304260254, "learning_rate": 4.2494989979959924e-05, "loss": 0.2964, "step": 760 }, { "epoch": 3.044, "grad_norm": 1.022255778312683, "learning_rate": 4.248496993987976e-05, "loss": 0.3164, "step": 761 }, { "epoch": 3.048, "grad_norm": 1.0864485502243042, "learning_rate": 4.24749498997996e-05, "loss": 0.2808, "step": 762 }, { 
"epoch": 3.052, "grad_norm": 1.1385424137115479, "learning_rate": 4.246492985971944e-05, "loss": 0.3147, "step": 763 }, { "epoch": 3.056, "grad_norm": 1.4085267782211304, "learning_rate": 4.245490981963928e-05, "loss": 0.3169, "step": 764 }, { "epoch": 3.06, "grad_norm": 0.9816059470176697, "learning_rate": 4.244488977955912e-05, "loss": 0.2724, "step": 765 }, { "epoch": 3.064, "grad_norm": 0.9588344693183899, "learning_rate": 4.243486973947896e-05, "loss": 0.2695, "step": 766 }, { "epoch": 3.068, "grad_norm": 0.9727541208267212, "learning_rate": 4.24248496993988e-05, "loss": 0.307, "step": 767 }, { "epoch": 3.072, "grad_norm": 0.9045794010162354, "learning_rate": 4.241482965931864e-05, "loss": 0.281, "step": 768 }, { "epoch": 3.076, "grad_norm": 0.8429794907569885, "learning_rate": 4.2404809619238475e-05, "loss": 0.2693, "step": 769 }, { "epoch": 3.08, "grad_norm": 0.9454277157783508, "learning_rate": 4.2394789579158316e-05, "loss": 0.2927, "step": 770 }, { "epoch": 3.084, "grad_norm": 0.8746525645256042, "learning_rate": 4.238476953907816e-05, "loss": 0.3275, "step": 771 }, { "epoch": 3.088, "grad_norm": 1.0423359870910645, "learning_rate": 4.2374749498998e-05, "loss": 0.2617, "step": 772 }, { "epoch": 3.092, "grad_norm": 0.9890905022621155, "learning_rate": 4.236472945891784e-05, "loss": 0.268, "step": 773 }, { "epoch": 3.096, "grad_norm": 0.9057809114456177, "learning_rate": 4.235470941883768e-05, "loss": 0.2951, "step": 774 }, { "epoch": 3.1, "grad_norm": 0.841378927230835, "learning_rate": 4.2344689378757516e-05, "loss": 0.251, "step": 775 }, { "epoch": 3.104, "grad_norm": 0.926152229309082, "learning_rate": 4.233466933867736e-05, "loss": 0.2939, "step": 776 }, { "epoch": 3.108, "grad_norm": 1.010184407234192, "learning_rate": 4.23246492985972e-05, "loss": 0.2795, "step": 777 }, { "epoch": 3.112, "grad_norm": 1.0386322736740112, "learning_rate": 4.231462925851703e-05, "loss": 0.3257, "step": 778 }, { "epoch": 3.116, "grad_norm": 0.9416765570640564, 
"learning_rate": 4.2304609218436874e-05, "loss": 0.2857, "step": 779 }, { "epoch": 3.12, "grad_norm": 0.6299401521682739, "learning_rate": 4.2294589178356715e-05, "loss": 0.1485, "step": 780 }, { "epoch": 3.124, "grad_norm": 0.8813037872314453, "learning_rate": 4.228456913827655e-05, "loss": 0.2762, "step": 781 }, { "epoch": 3.128, "grad_norm": 0.9580820798873901, "learning_rate": 4.227454909819639e-05, "loss": 0.3148, "step": 782 }, { "epoch": 3.132, "grad_norm": 0.9623346924781799, "learning_rate": 4.226452905811624e-05, "loss": 0.3172, "step": 783 }, { "epoch": 3.136, "grad_norm": 0.9738514423370361, "learning_rate": 4.2254509018036074e-05, "loss": 0.2936, "step": 784 }, { "epoch": 3.14, "grad_norm": 0.9638810753822327, "learning_rate": 4.2244488977955915e-05, "loss": 0.2686, "step": 785 }, { "epoch": 3.144, "grad_norm": 1.004857063293457, "learning_rate": 4.2234468937875756e-05, "loss": 0.3125, "step": 786 }, { "epoch": 3.148, "grad_norm": 0.9582427740097046, "learning_rate": 4.222444889779559e-05, "loss": 0.2873, "step": 787 }, { "epoch": 3.152, "grad_norm": 0.9714578986167908, "learning_rate": 4.221442885771543e-05, "loss": 0.3185, "step": 788 }, { "epoch": 3.156, "grad_norm": 0.9590511322021484, "learning_rate": 4.2204408817635273e-05, "loss": 0.2962, "step": 789 }, { "epoch": 3.16, "grad_norm": 0.8241173624992371, "learning_rate": 4.219438877755511e-05, "loss": 0.273, "step": 790 }, { "epoch": 3.164, "grad_norm": 0.9807001352310181, "learning_rate": 4.218436873747495e-05, "loss": 0.2778, "step": 791 }, { "epoch": 3.168, "grad_norm": 1.012277364730835, "learning_rate": 4.21743486973948e-05, "loss": 0.2907, "step": 792 }, { "epoch": 3.172, "grad_norm": 0.7485466003417969, "learning_rate": 4.216432865731463e-05, "loss": 0.1743, "step": 793 }, { "epoch": 3.176, "grad_norm": 1.0279030799865723, "learning_rate": 4.215430861723447e-05, "loss": 0.2856, "step": 794 }, { "epoch": 3.18, "grad_norm": 1.0178649425506592, "learning_rate": 4.2144288577154314e-05, "loss": 
0.2993, "step": 795 }, { "epoch": 3.184, "grad_norm": 0.8605381846427917, "learning_rate": 4.213426853707415e-05, "loss": 0.2982, "step": 796 }, { "epoch": 3.188, "grad_norm": 1.0019651651382446, "learning_rate": 4.212424849699399e-05, "loss": 0.2708, "step": 797 }, { "epoch": 3.192, "grad_norm": 0.8550509810447693, "learning_rate": 4.211422845691383e-05, "loss": 0.2795, "step": 798 }, { "epoch": 3.196, "grad_norm": 0.9066852927207947, "learning_rate": 4.2104208416833666e-05, "loss": 0.294, "step": 799 }, { "epoch": 3.2, "grad_norm": 1.0385713577270508, "learning_rate": 4.209418837675351e-05, "loss": 0.3046, "step": 800 }, { "epoch": 3.204, "grad_norm": 0.958522379398346, "learning_rate": 4.208416833667335e-05, "loss": 0.3016, "step": 801 }, { "epoch": 3.208, "grad_norm": 1.0640708208084106, "learning_rate": 4.207414829659319e-05, "loss": 0.2636, "step": 802 }, { "epoch": 3.212, "grad_norm": 0.9335046410560608, "learning_rate": 4.206412825651303e-05, "loss": 0.2668, "step": 803 }, { "epoch": 3.216, "grad_norm": 1.0041309595108032, "learning_rate": 4.205410821643287e-05, "loss": 0.2774, "step": 804 }, { "epoch": 3.22, "grad_norm": 0.8523897528648376, "learning_rate": 4.204408817635271e-05, "loss": 0.2638, "step": 805 }, { "epoch": 3.224, "grad_norm": 0.9941080212593079, "learning_rate": 4.203406813627255e-05, "loss": 0.2988, "step": 806 }, { "epoch": 3.228, "grad_norm": 1.0812398195266724, "learning_rate": 4.202404809619239e-05, "loss": 0.277, "step": 807 }, { "epoch": 3.232, "grad_norm": 0.9684290289878845, "learning_rate": 4.2014028056112224e-05, "loss": 0.2944, "step": 808 }, { "epoch": 3.2359999999999998, "grad_norm": 1.021715521812439, "learning_rate": 4.2004008016032065e-05, "loss": 0.2548, "step": 809 }, { "epoch": 3.24, "grad_norm": 1.052801489830017, "learning_rate": 4.1993987975951907e-05, "loss": 0.2885, "step": 810 }, { "epoch": 3.2439999999999998, "grad_norm": 1.0573465824127197, "learning_rate": 4.198396793587174e-05, "loss": 0.3444, "step": 811 }, { 
"epoch": 3.248, "grad_norm": 0.7046388387680054, "learning_rate": 4.197394789579159e-05, "loss": 0.134, "step": 812 }, { "epoch": 3.252, "grad_norm": 1.030924677848816, "learning_rate": 4.1963927855711424e-05, "loss": 0.3304, "step": 813 }, { "epoch": 3.2560000000000002, "grad_norm": 0.9746088981628418, "learning_rate": 4.1953907815631265e-05, "loss": 0.286, "step": 814 }, { "epoch": 3.26, "grad_norm": 0.9044510722160339, "learning_rate": 4.1943887775551106e-05, "loss": 0.2653, "step": 815 }, { "epoch": 3.2640000000000002, "grad_norm": 1.0177682638168335, "learning_rate": 4.193386773547095e-05, "loss": 0.3103, "step": 816 }, { "epoch": 3.268, "grad_norm": 1.0040210485458374, "learning_rate": 4.192384769539078e-05, "loss": 0.2804, "step": 817 }, { "epoch": 3.2720000000000002, "grad_norm": 0.9195308685302734, "learning_rate": 4.191382765531062e-05, "loss": 0.3041, "step": 818 }, { "epoch": 3.276, "grad_norm": 1.0614639520645142, "learning_rate": 4.1903807615230465e-05, "loss": 0.2694, "step": 819 }, { "epoch": 3.2800000000000002, "grad_norm": 0.9703024625778198, "learning_rate": 4.18937875751503e-05, "loss": 0.3042, "step": 820 }, { "epoch": 3.284, "grad_norm": 0.7372400760650635, "learning_rate": 4.188376753507014e-05, "loss": 0.1776, "step": 821 }, { "epoch": 3.288, "grad_norm": 0.9801708459854126, "learning_rate": 4.187374749498998e-05, "loss": 0.3445, "step": 822 }, { "epoch": 3.292, "grad_norm": 1.0228462219238281, "learning_rate": 4.186372745490982e-05, "loss": 0.2965, "step": 823 }, { "epoch": 3.296, "grad_norm": 0.9982706904411316, "learning_rate": 4.1853707414829664e-05, "loss": 0.2613, "step": 824 }, { "epoch": 3.3, "grad_norm": 1.1089848279953003, "learning_rate": 4.18436873747495e-05, "loss": 0.3156, "step": 825 }, { "epoch": 3.304, "grad_norm": 0.9240087866783142, "learning_rate": 4.183366733466934e-05, "loss": 0.2621, "step": 826 }, { "epoch": 3.308, "grad_norm": 0.9882985353469849, "learning_rate": 4.182364729458918e-05, "loss": 0.2766, "step": 827 }, 
{ "epoch": 3.312, "grad_norm": 1.092639446258545, "learning_rate": 4.1813627254509016e-05, "loss": 0.3167, "step": 828 }, { "epoch": 3.316, "grad_norm": 1.0920966863632202, "learning_rate": 4.180360721442886e-05, "loss": 0.3217, "step": 829 }, { "epoch": 3.32, "grad_norm": 1.0740662813186646, "learning_rate": 4.17935871743487e-05, "loss": 0.2861, "step": 830 }, { "epoch": 3.324, "grad_norm": 0.969103991985321, "learning_rate": 4.178356713426854e-05, "loss": 0.3083, "step": 831 }, { "epoch": 3.328, "grad_norm": 1.1013503074645996, "learning_rate": 4.177354709418838e-05, "loss": 0.3094, "step": 832 }, { "epoch": 3.332, "grad_norm": 0.9775644540786743, "learning_rate": 4.176352705410822e-05, "loss": 0.2919, "step": 833 }, { "epoch": 3.336, "grad_norm": 0.9920254349708557, "learning_rate": 4.175350701402806e-05, "loss": 0.2904, "step": 834 }, { "epoch": 3.34, "grad_norm": 1.0171326398849487, "learning_rate": 4.17434869739479e-05, "loss": 0.2888, "step": 835 }, { "epoch": 3.344, "grad_norm": 0.8502336144447327, "learning_rate": 4.173346693386774e-05, "loss": 0.3193, "step": 836 }, { "epoch": 3.348, "grad_norm": 0.9221425652503967, "learning_rate": 4.1723446893787574e-05, "loss": 0.3163, "step": 837 }, { "epoch": 3.352, "grad_norm": 0.7386908531188965, "learning_rate": 4.1713426853707415e-05, "loss": 0.1987, "step": 838 }, { "epoch": 3.356, "grad_norm": 0.9897122383117676, "learning_rate": 4.1703406813627256e-05, "loss": 0.3081, "step": 839 }, { "epoch": 3.36, "grad_norm": 1.0372047424316406, "learning_rate": 4.169338677354709e-05, "loss": 0.3217, "step": 840 }, { "epoch": 3.364, "grad_norm": 1.108982801437378, "learning_rate": 4.168336673346693e-05, "loss": 0.3282, "step": 841 }, { "epoch": 3.368, "grad_norm": 0.9839752316474915, "learning_rate": 4.167334669338678e-05, "loss": 0.2829, "step": 842 }, { "epoch": 3.372, "grad_norm": 0.9582000970840454, "learning_rate": 4.1663326653306615e-05, "loss": 0.3538, "step": 843 }, { "epoch": 3.376, "grad_norm": 1.0338131189346313, 
"learning_rate": 4.1653306613226456e-05, "loss": 0.3183, "step": 844 }, { "epoch": 3.38, "grad_norm": 0.969628632068634, "learning_rate": 4.16432865731463e-05, "loss": 0.2753, "step": 845 }, { "epoch": 3.384, "grad_norm": 0.9594615697860718, "learning_rate": 4.163326653306613e-05, "loss": 0.3348, "step": 846 }, { "epoch": 3.388, "grad_norm": 1.0317904949188232, "learning_rate": 4.162324649298597e-05, "loss": 0.3075, "step": 847 }, { "epoch": 3.392, "grad_norm": 1.00275719165802, "learning_rate": 4.1613226452905814e-05, "loss": 0.2883, "step": 848 }, { "epoch": 3.396, "grad_norm": 0.902736246585846, "learning_rate": 4.160320641282565e-05, "loss": 0.293, "step": 849 }, { "epoch": 3.4, "grad_norm": 0.9426193833351135, "learning_rate": 4.159318637274549e-05, "loss": 0.3869, "step": 850 }, { "epoch": 3.404, "grad_norm": 0.8548994660377502, "learning_rate": 4.158316633266534e-05, "loss": 0.2635, "step": 851 }, { "epoch": 3.408, "grad_norm": 0.9464280605316162, "learning_rate": 4.157314629258517e-05, "loss": 0.3005, "step": 852 }, { "epoch": 3.412, "grad_norm": 0.5138210654258728, "learning_rate": 4.1563126252505014e-05, "loss": 0.101, "step": 853 }, { "epoch": 3.416, "grad_norm": 0.8847005367279053, "learning_rate": 4.1553106212424855e-05, "loss": 0.2996, "step": 854 }, { "epoch": 3.42, "grad_norm": 1.042490005493164, "learning_rate": 4.154308617234469e-05, "loss": 0.2794, "step": 855 }, { "epoch": 3.424, "grad_norm": 1.0797277688980103, "learning_rate": 4.153306613226453e-05, "loss": 0.2884, "step": 856 }, { "epoch": 3.428, "grad_norm": 1.097922921180725, "learning_rate": 4.152304609218437e-05, "loss": 0.2846, "step": 857 }, { "epoch": 3.432, "grad_norm": 1.0516830682754517, "learning_rate": 4.151302605210421e-05, "loss": 0.3018, "step": 858 }, { "epoch": 3.436, "grad_norm": 1.0133203268051147, "learning_rate": 4.150300601202405e-05, "loss": 0.2871, "step": 859 }, { "epoch": 3.44, "grad_norm": 1.0574767589569092, "learning_rate": 4.149298597194389e-05, "loss": 0.3106, 
"step": 860 }, { "epoch": 3.444, "grad_norm": 0.9849914908409119, "learning_rate": 4.148296593186373e-05, "loss": 0.3729, "step": 861 }, { "epoch": 3.448, "grad_norm": 0.8353570103645325, "learning_rate": 4.147294589178357e-05, "loss": 0.2873, "step": 862 }, { "epoch": 3.452, "grad_norm": 1.0005288124084473, "learning_rate": 4.146292585170341e-05, "loss": 0.309, "step": 863 }, { "epoch": 3.456, "grad_norm": 1.139251708984375, "learning_rate": 4.145290581162325e-05, "loss": 0.3101, "step": 864 }, { "epoch": 3.46, "grad_norm": 0.9400432109832764, "learning_rate": 4.144288577154309e-05, "loss": 0.3201, "step": 865 }, { "epoch": 3.464, "grad_norm": 1.0056957006454468, "learning_rate": 4.143286573146293e-05, "loss": 0.2908, "step": 866 }, { "epoch": 3.468, "grad_norm": 1.0107009410858154, "learning_rate": 4.1422845691382765e-05, "loss": 0.2781, "step": 867 }, { "epoch": 3.472, "grad_norm": 0.8538976311683655, "learning_rate": 4.1412825651302606e-05, "loss": 0.271, "step": 868 }, { "epoch": 3.476, "grad_norm": 0.8999854922294617, "learning_rate": 4.140280561122245e-05, "loss": 0.3038, "step": 869 }, { "epoch": 3.48, "grad_norm": 0.8919850587844849, "learning_rate": 4.139278557114228e-05, "loss": 0.2953, "step": 870 }, { "epoch": 3.484, "grad_norm": 0.9376264810562134, "learning_rate": 4.138276553106213e-05, "loss": 0.309, "step": 871 }, { "epoch": 3.488, "grad_norm": 1.0939548015594482, "learning_rate": 4.137274549098197e-05, "loss": 0.3156, "step": 872 }, { "epoch": 3.492, "grad_norm": 0.9430792927742004, "learning_rate": 4.1362725450901806e-05, "loss": 0.3003, "step": 873 }, { "epoch": 3.496, "grad_norm": 0.9949555397033691, "learning_rate": 4.135270541082165e-05, "loss": 0.2998, "step": 874 }, { "epoch": 3.5, "grad_norm": 0.9819671511650085, "learning_rate": 4.134268537074149e-05, "loss": 0.318, "step": 875 }, { "epoch": 3.504, "grad_norm": 0.9571810960769653, "learning_rate": 4.133266533066132e-05, "loss": 0.3308, "step": 876 }, { "epoch": 3.508, "grad_norm": 
0.9577171206474304, "learning_rate": 4.1322645290581164e-05, "loss": 0.3261, "step": 877 }, { "epoch": 3.512, "grad_norm": 0.9510439038276672, "learning_rate": 4.1312625250501005e-05, "loss": 0.2901, "step": 878 }, { "epoch": 3.516, "grad_norm": 0.9860422015190125, "learning_rate": 4.130260521042084e-05, "loss": 0.2799, "step": 879 }, { "epoch": 3.52, "grad_norm": 1.0223884582519531, "learning_rate": 4.129258517034068e-05, "loss": 0.3264, "step": 880 }, { "epoch": 3.524, "grad_norm": 1.0917876958847046, "learning_rate": 4.128256513026052e-05, "loss": 0.2869, "step": 881 }, { "epoch": 3.528, "grad_norm": 1.0941041707992554, "learning_rate": 4.1272545090180364e-05, "loss": 0.3296, "step": 882 }, { "epoch": 3.532, "grad_norm": 0.9982560276985168, "learning_rate": 4.1262525050100205e-05, "loss": 0.3123, "step": 883 }, { "epoch": 3.536, "grad_norm": 1.1089354753494263, "learning_rate": 4.125250501002004e-05, "loss": 0.3245, "step": 884 }, { "epoch": 3.54, "grad_norm": 0.9130812883377075, "learning_rate": 4.124248496993988e-05, "loss": 0.265, "step": 885 }, { "epoch": 3.544, "grad_norm": 0.9193147420883179, "learning_rate": 4.123246492985972e-05, "loss": 0.2947, "step": 886 }, { "epoch": 3.548, "grad_norm": 0.908898651599884, "learning_rate": 4.122244488977956e-05, "loss": 0.3173, "step": 887 }, { "epoch": 3.552, "grad_norm": 1.038451910018921, "learning_rate": 4.12124248496994e-05, "loss": 0.3317, "step": 888 }, { "epoch": 3.556, "grad_norm": 1.0614579916000366, "learning_rate": 4.120240480961924e-05, "loss": 0.3206, "step": 889 }, { "epoch": 3.56, "grad_norm": 1.0630053281784058, "learning_rate": 4.119238476953908e-05, "loss": 0.3096, "step": 890 }, { "epoch": 3.564, "grad_norm": 0.9061043858528137, "learning_rate": 4.118236472945892e-05, "loss": 0.3105, "step": 891 }, { "epoch": 3.568, "grad_norm": 0.9253777265548706, "learning_rate": 4.117234468937876e-05, "loss": 0.2853, "step": 892 }, { "epoch": 3.572, "grad_norm": 1.0427800416946411, "learning_rate": 
4.11623246492986e-05, "loss": 0.3032, "step": 893 }, { "epoch": 3.576, "grad_norm": 0.9730731844902039, "learning_rate": 4.115230460921844e-05, "loss": 0.315, "step": 894 }, { "epoch": 3.58, "grad_norm": 0.9691907167434692, "learning_rate": 4.114228456913828e-05, "loss": 0.3382, "step": 895 }, { "epoch": 3.584, "grad_norm": 0.905629575252533, "learning_rate": 4.1132264529058115e-05, "loss": 0.2965, "step": 896 }, { "epoch": 3.588, "grad_norm": 1.1402881145477295, "learning_rate": 4.1122244488977956e-05, "loss": 0.3044, "step": 897 }, { "epoch": 3.592, "grad_norm": 1.0247617959976196, "learning_rate": 4.11122244488978e-05, "loss": 0.3063, "step": 898 }, { "epoch": 3.596, "grad_norm": 1.0826162099838257, "learning_rate": 4.110220440881764e-05, "loss": 0.2926, "step": 899 }, { "epoch": 3.6, "grad_norm": 1.0896751880645752, "learning_rate": 4.109218436873748e-05, "loss": 0.3232, "step": 900 }, { "epoch": 3.604, "grad_norm": 1.0806505680084229, "learning_rate": 4.108216432865732e-05, "loss": 0.2913, "step": 901 }, { "epoch": 3.608, "grad_norm": 1.0015305280685425, "learning_rate": 4.1072144288577155e-05, "loss": 0.2998, "step": 902 }, { "epoch": 3.612, "grad_norm": 1.0075455904006958, "learning_rate": 4.1062124248497e-05, "loss": 0.2716, "step": 903 }, { "epoch": 3.616, "grad_norm": 1.06728196144104, "learning_rate": 4.105210420841684e-05, "loss": 0.3269, "step": 904 }, { "epoch": 3.62, "grad_norm": 0.8644483089447021, "learning_rate": 4.104208416833667e-05, "loss": 0.2804, "step": 905 }, { "epoch": 3.624, "grad_norm": 0.9951996803283691, "learning_rate": 4.1032064128256514e-05, "loss": 0.3223, "step": 906 }, { "epoch": 3.628, "grad_norm": 0.9931237697601318, "learning_rate": 4.1022044088176355e-05, "loss": 0.2715, "step": 907 }, { "epoch": 3.632, "grad_norm": 1.0275764465332031, "learning_rate": 4.101202404809619e-05, "loss": 0.3444, "step": 908 }, { "epoch": 3.636, "grad_norm": 0.9816417098045349, "learning_rate": 4.100200400801603e-05, "loss": 0.2724, "step": 909 }, 
{ "epoch": 3.64, "grad_norm": 1.1031150817871094, "learning_rate": 4.099198396793588e-05, "loss": 0.3406, "step": 910 }, { "epoch": 3.644, "grad_norm": 0.7077162265777588, "learning_rate": 4.0981963927855713e-05, "loss": 0.1978, "step": 911 }, { "epoch": 3.648, "grad_norm": 1.0858787298202515, "learning_rate": 4.0971943887775555e-05, "loss": 0.2975, "step": 912 }, { "epoch": 3.652, "grad_norm": 0.9886104464530945, "learning_rate": 4.0961923847695396e-05, "loss": 0.3237, "step": 913 }, { "epoch": 3.656, "grad_norm": 1.1059352159500122, "learning_rate": 4.095190380761523e-05, "loss": 0.3181, "step": 914 }, { "epoch": 3.66, "grad_norm": 0.9329758882522583, "learning_rate": 4.094188376753507e-05, "loss": 0.2751, "step": 915 }, { "epoch": 3.664, "grad_norm": 1.0664880275726318, "learning_rate": 4.093186372745491e-05, "loss": 0.3348, "step": 916 }, { "epoch": 3.668, "grad_norm": 0.9057939052581787, "learning_rate": 4.092184368737475e-05, "loss": 0.3088, "step": 917 }, { "epoch": 3.672, "grad_norm": 0.9240928888320923, "learning_rate": 4.091182364729459e-05, "loss": 0.1825, "step": 918 }, { "epoch": 3.676, "grad_norm": 1.0612246990203857, "learning_rate": 4.090180360721443e-05, "loss": 0.2613, "step": 919 }, { "epoch": 3.68, "grad_norm": 0.8932524919509888, "learning_rate": 4.089178356713427e-05, "loss": 0.3157, "step": 920 }, { "epoch": 3.684, "grad_norm": 0.9782359600067139, "learning_rate": 4.088176352705411e-05, "loss": 0.2984, "step": 921 }, { "epoch": 3.6879999999999997, "grad_norm": 1.0117908716201782, "learning_rate": 4.0871743486973954e-05, "loss": 0.3072, "step": 922 }, { "epoch": 3.692, "grad_norm": 1.1856647729873657, "learning_rate": 4.086172344689379e-05, "loss": 0.3281, "step": 923 }, { "epoch": 3.6959999999999997, "grad_norm": 0.767086386680603, "learning_rate": 4.085170340681363e-05, "loss": 0.1969, "step": 924 }, { "epoch": 3.7, "grad_norm": 1.2814222574234009, "learning_rate": 4.084168336673347e-05, "loss": 0.3409, "step": 925 }, { "epoch": 
3.7039999999999997, "grad_norm": 1.0232386589050293, "learning_rate": 4.0831663326653306e-05, "loss": 0.2866, "step": 926 }, { "epoch": 3.708, "grad_norm": 1.0636742115020752, "learning_rate": 4.082164328657315e-05, "loss": 0.3536, "step": 927 }, { "epoch": 3.7119999999999997, "grad_norm": 0.9983400106430054, "learning_rate": 4.081162324649299e-05, "loss": 0.3254, "step": 928 }, { "epoch": 3.716, "grad_norm": 0.9435644745826721, "learning_rate": 4.080160320641282e-05, "loss": 0.3213, "step": 929 }, { "epoch": 3.7199999999999998, "grad_norm": 0.9191159009933472, "learning_rate": 4.079158316633267e-05, "loss": 0.2703, "step": 930 }, { "epoch": 3.724, "grad_norm": 0.6620362401008606, "learning_rate": 4.078156312625251e-05, "loss": 0.1913, "step": 931 }, { "epoch": 3.7279999999999998, "grad_norm": 0.7755175828933716, "learning_rate": 4.0771543086172346e-05, "loss": 0.2269, "step": 932 }, { "epoch": 3.732, "grad_norm": 1.0141972303390503, "learning_rate": 4.076152304609219e-05, "loss": 0.3042, "step": 933 }, { "epoch": 3.7359999999999998, "grad_norm": 1.0013058185577393, "learning_rate": 4.075150300601203e-05, "loss": 0.3252, "step": 934 }, { "epoch": 3.74, "grad_norm": 1.1181739568710327, "learning_rate": 4.0741482965931864e-05, "loss": 0.3186, "step": 935 }, { "epoch": 3.7439999999999998, "grad_norm": 1.0314396619796753, "learning_rate": 4.0731462925851705e-05, "loss": 0.2691, "step": 936 }, { "epoch": 3.748, "grad_norm": 0.9695926308631897, "learning_rate": 4.0721442885771546e-05, "loss": 0.3565, "step": 937 }, { "epoch": 3.752, "grad_norm": 0.9737102389335632, "learning_rate": 4.071142284569138e-05, "loss": 0.2772, "step": 938 }, { "epoch": 3.7560000000000002, "grad_norm": 1.114227056503296, "learning_rate": 4.070140280561122e-05, "loss": 0.316, "step": 939 }, { "epoch": 3.76, "grad_norm": 1.2355971336364746, "learning_rate": 4.069138276553106e-05, "loss": 0.331, "step": 940 }, { "epoch": 3.7640000000000002, "grad_norm": 0.8689810037612915, "learning_rate": 
4.0681362725450904e-05, "loss": 0.2914, "step": 941 }, { "epoch": 3.768, "grad_norm": 1.04837167263031, "learning_rate": 4.0671342685370746e-05, "loss": 0.3081, "step": 942 }, { "epoch": 3.7720000000000002, "grad_norm": 1.047351598739624, "learning_rate": 4.066132264529059e-05, "loss": 0.3239, "step": 943 }, { "epoch": 3.776, "grad_norm": 0.9564910531044006, "learning_rate": 4.065130260521042e-05, "loss": 0.3193, "step": 944 }, { "epoch": 3.7800000000000002, "grad_norm": 1.0410816669464111, "learning_rate": 4.064128256513026e-05, "loss": 0.3311, "step": 945 }, { "epoch": 3.784, "grad_norm": 0.9697766304016113, "learning_rate": 4.0631262525050104e-05, "loss": 0.3043, "step": 946 }, { "epoch": 3.7880000000000003, "grad_norm": 1.0359306335449219, "learning_rate": 4.062124248496994e-05, "loss": 0.3646, "step": 947 }, { "epoch": 3.792, "grad_norm": 0.8712097406387329, "learning_rate": 4.061122244488978e-05, "loss": 0.2917, "step": 948 }, { "epoch": 3.7960000000000003, "grad_norm": 0.9690885543823242, "learning_rate": 4.060120240480962e-05, "loss": 0.3265, "step": 949 }, { "epoch": 3.8, "grad_norm": 0.9399465322494507, "learning_rate": 4.059118236472946e-05, "loss": 0.3144, "step": 950 }, { "epoch": 3.8040000000000003, "grad_norm": 1.0360187292099, "learning_rate": 4.0581162324649304e-05, "loss": 0.3777, "step": 951 }, { "epoch": 3.808, "grad_norm": 1.233900785446167, "learning_rate": 4.057114228456914e-05, "loss": 0.3032, "step": 952 }, { "epoch": 3.8120000000000003, "grad_norm": 1.0116194486618042, "learning_rate": 4.056112224448898e-05, "loss": 0.2787, "step": 953 }, { "epoch": 3.816, "grad_norm": 0.9884048700332642, "learning_rate": 4.055110220440882e-05, "loss": 0.3147, "step": 954 }, { "epoch": 3.82, "grad_norm": 0.9382745623588562, "learning_rate": 4.0541082164328655e-05, "loss": 0.3232, "step": 955 }, { "epoch": 3.824, "grad_norm": 0.9225025177001953, "learning_rate": 4.0531062124248497e-05, "loss": 0.3334, "step": 956 }, { "epoch": 3.828, "grad_norm": 
0.8968762159347534, "learning_rate": 4.052104208416834e-05, "loss": 0.3146, "step": 957 }, { "epoch": 3.832, "grad_norm": 0.9080137610435486, "learning_rate": 4.051102204408818e-05, "loss": 0.2877, "step": 958 }, { "epoch": 3.836, "grad_norm": 0.9056142568588257, "learning_rate": 4.050100200400802e-05, "loss": 0.2728, "step": 959 }, { "epoch": 3.84, "grad_norm": 0.976733922958374, "learning_rate": 4.049098196392786e-05, "loss": 0.3077, "step": 960 }, { "epoch": 3.844, "grad_norm": 1.0319982767105103, "learning_rate": 4.0480961923847696e-05, "loss": 0.3124, "step": 961 }, { "epoch": 3.848, "grad_norm": 0.8904579281806946, "learning_rate": 4.047094188376754e-05, "loss": 0.3197, "step": 962 }, { "epoch": 3.852, "grad_norm": 0.9294513463973999, "learning_rate": 4.046092184368738e-05, "loss": 0.318, "step": 963 }, { "epoch": 3.856, "grad_norm": 0.9072182178497314, "learning_rate": 4.045090180360721e-05, "loss": 0.2971, "step": 964 }, { "epoch": 3.86, "grad_norm": 1.0785537958145142, "learning_rate": 4.0440881763527055e-05, "loss": 0.3362, "step": 965 }, { "epoch": 3.864, "grad_norm": 1.1097102165222168, "learning_rate": 4.0430861723446896e-05, "loss": 0.3528, "step": 966 }, { "epoch": 3.868, "grad_norm": 1.0673046112060547, "learning_rate": 4.042084168336673e-05, "loss": 0.3085, "step": 967 }, { "epoch": 3.872, "grad_norm": 0.8683971762657166, "learning_rate": 4.041082164328657e-05, "loss": 0.2852, "step": 968 }, { "epoch": 3.876, "grad_norm": 0.9497571587562561, "learning_rate": 4.040080160320642e-05, "loss": 0.2755, "step": 969 }, { "epoch": 3.88, "grad_norm": 1.0285412073135376, "learning_rate": 4.0390781563126254e-05, "loss": 0.3037, "step": 970 }, { "epoch": 3.884, "grad_norm": 1.1578757762908936, "learning_rate": 4.0380761523046096e-05, "loss": 0.3113, "step": 971 }, { "epoch": 3.888, "grad_norm": 1.017016887664795, "learning_rate": 4.037074148296594e-05, "loss": 0.3655, "step": 972 }, { "epoch": 3.892, "grad_norm": 1.044767141342163, "learning_rate": 
4.036072144288577e-05, "loss": 0.3088, "step": 973 }, { "epoch": 3.896, "grad_norm": 1.098922848701477, "learning_rate": 4.035070140280561e-05, "loss": 0.3294, "step": 974 }, { "epoch": 3.9, "grad_norm": 0.9794385433197021, "learning_rate": 4.0340681362725454e-05, "loss": 0.302, "step": 975 }, { "epoch": 3.904, "grad_norm": 0.9461687207221985, "learning_rate": 4.033066132264529e-05, "loss": 0.3199, "step": 976 }, { "epoch": 3.908, "grad_norm": 0.971648633480072, "learning_rate": 4.032064128256513e-05, "loss": 0.3065, "step": 977 }, { "epoch": 3.912, "grad_norm": 0.8333442807197571, "learning_rate": 4.031062124248497e-05, "loss": 0.2888, "step": 978 }, { "epoch": 3.916, "grad_norm": 0.8855047225952148, "learning_rate": 4.030060120240481e-05, "loss": 0.3273, "step": 979 }, { "epoch": 3.92, "grad_norm": 0.7457961440086365, "learning_rate": 4.0290581162324654e-05, "loss": 0.2061, "step": 980 }, { "epoch": 3.924, "grad_norm": 0.9684552550315857, "learning_rate": 4.0280561122244495e-05, "loss": 0.3369, "step": 981 }, { "epoch": 3.928, "grad_norm": 1.0121392011642456, "learning_rate": 4.027054108216433e-05, "loss": 0.3324, "step": 982 }, { "epoch": 3.932, "grad_norm": 0.9265434145927429, "learning_rate": 4.026052104208417e-05, "loss": 0.3017, "step": 983 }, { "epoch": 3.936, "grad_norm": 1.0135233402252197, "learning_rate": 4.025050100200401e-05, "loss": 0.2969, "step": 984 }, { "epoch": 3.94, "grad_norm": 1.0143775939941406, "learning_rate": 4.0240480961923846e-05, "loss": 0.2992, "step": 985 }, { "epoch": 3.944, "grad_norm": 0.9987703561782837, "learning_rate": 4.023046092184369e-05, "loss": 0.316, "step": 986 }, { "epoch": 3.948, "grad_norm": 0.9264063239097595, "learning_rate": 4.022044088176353e-05, "loss": 0.3123, "step": 987 }, { "epoch": 3.952, "grad_norm": 1.1002973318099976, "learning_rate": 4.0210420841683363e-05, "loss": 0.3324, "step": 988 }, { "epoch": 3.956, "grad_norm": 1.0875300168991089, "learning_rate": 4.020040080160321e-05, "loss": 0.3193, "step": 989 
}, { "epoch": 3.96, "grad_norm": 1.137242078781128, "learning_rate": 4.019038076152305e-05, "loss": 0.2993, "step": 990 }, { "epoch": 3.964, "grad_norm": 1.010221242904663, "learning_rate": 4.018036072144289e-05, "loss": 0.2902, "step": 991 }, { "epoch": 3.968, "grad_norm": 1.0185948610305786, "learning_rate": 4.017034068136273e-05, "loss": 0.2858, "step": 992 }, { "epoch": 3.972, "grad_norm": 1.1539437770843506, "learning_rate": 4.016032064128257e-05, "loss": 0.3179, "step": 993 }, { "epoch": 3.976, "grad_norm": 1.0386922359466553, "learning_rate": 4.0150300601202404e-05, "loss": 0.3271, "step": 994 }, { "epoch": 3.98, "grad_norm": 1.041878581047058, "learning_rate": 4.0140280561122246e-05, "loss": 0.3355, "step": 995 }, { "epoch": 3.984, "grad_norm": 1.1333723068237305, "learning_rate": 4.013026052104209e-05, "loss": 0.3385, "step": 996 }, { "epoch": 3.988, "grad_norm": 1.0429155826568604, "learning_rate": 4.012024048096192e-05, "loss": 0.2962, "step": 997 }, { "epoch": 3.992, "grad_norm": 1.0766241550445557, "learning_rate": 4.011022044088176e-05, "loss": 0.3384, "step": 998 }, { "epoch": 3.996, "grad_norm": 0.979697048664093, "learning_rate": 4.010020040080161e-05, "loss": 0.2805, "step": 999 }, { "epoch": 4.0, "grad_norm": 1.0180047750473022, "learning_rate": 4.0090180360721445e-05, "loss": 0.3099, "step": 1000 }, { "epoch": 4.004, "grad_norm": 1.014128565788269, "learning_rate": 4.0080160320641287e-05, "loss": 0.2158, "step": 1001 }, { "epoch": 4.008, "grad_norm": 0.9923162460327148, "learning_rate": 4.007014028056113e-05, "loss": 0.2208, "step": 1002 }, { "epoch": 4.012, "grad_norm": 1.0028576850891113, "learning_rate": 4.006012024048096e-05, "loss": 0.2285, "step": 1003 }, { "epoch": 4.016, "grad_norm": 0.962719738483429, "learning_rate": 4.0050100200400804e-05, "loss": 0.2034, "step": 1004 }, { "epoch": 4.02, "grad_norm": 1.013881802558899, "learning_rate": 4.0040080160320645e-05, "loss": 0.1846, "step": 1005 }, { "epoch": 4.024, "grad_norm": 
0.8782246708869934, "learning_rate": 4.003006012024048e-05, "loss": 0.2139, "step": 1006 }, { "epoch": 4.028, "grad_norm": 1.26191246509552, "learning_rate": 4.002004008016032e-05, "loss": 0.2172, "step": 1007 }, { "epoch": 4.032, "grad_norm": 1.2841931581497192, "learning_rate": 4.001002004008016e-05, "loss": 0.2612, "step": 1008 }, { "epoch": 4.036, "grad_norm": 1.2775436639785767, "learning_rate": 4e-05, "loss": 0.1986, "step": 1009 }, { "epoch": 4.04, "grad_norm": 1.3614190816879272, "learning_rate": 3.9989979959919845e-05, "loss": 0.2709, "step": 1010 }, { "epoch": 4.044, "grad_norm": 1.4308106899261475, "learning_rate": 3.997995991983968e-05, "loss": 0.2196, "step": 1011 }, { "epoch": 4.048, "grad_norm": 1.4295969009399414, "learning_rate": 3.996993987975952e-05, "loss": 0.2131, "step": 1012 }, { "epoch": 4.052, "grad_norm": 1.4375779628753662, "learning_rate": 3.995991983967936e-05, "loss": 0.203, "step": 1013 }, { "epoch": 4.056, "grad_norm": 1.3883522748947144, "learning_rate": 3.99498997995992e-05, "loss": 0.2139, "step": 1014 }, { "epoch": 4.06, "grad_norm": 1.211121678352356, "learning_rate": 3.993987975951904e-05, "loss": 0.2068, "step": 1015 }, { "epoch": 4.064, "grad_norm": 1.3009765148162842, "learning_rate": 3.992985971943888e-05, "loss": 0.2104, "step": 1016 }, { "epoch": 4.068, "grad_norm": 1.137965202331543, "learning_rate": 3.991983967935872e-05, "loss": 0.1963, "step": 1017 }, { "epoch": 4.072, "grad_norm": 1.2894974946975708, "learning_rate": 3.990981963927856e-05, "loss": 0.2235, "step": 1018 }, { "epoch": 4.076, "grad_norm": 0.706529974937439, "learning_rate": 3.98997995991984e-05, "loss": 0.098, "step": 1019 }, { "epoch": 4.08, "grad_norm": 1.130771279335022, "learning_rate": 3.988977955911824e-05, "loss": 0.2235, "step": 1020 }, { "epoch": 4.084, "grad_norm": 1.171426773071289, "learning_rate": 3.987975951903808e-05, "loss": 0.1852, "step": 1021 }, { "epoch": 4.088, "grad_norm": 1.0851662158966064, "learning_rate": 3.986973947895792e-05, 
"loss": 0.1879, "step": 1022 }, { "epoch": 4.092, "grad_norm": 1.3458068370819092, "learning_rate": 3.9859719438877754e-05, "loss": 0.2321, "step": 1023 }, { "epoch": 4.096, "grad_norm": 1.1501377820968628, "learning_rate": 3.9849699398797595e-05, "loss": 0.2157, "step": 1024 }, { "epoch": 4.1, "grad_norm": 1.2266032695770264, "learning_rate": 3.983967935871744e-05, "loss": 0.22, "step": 1025 }, { "epoch": 4.104, "grad_norm": 1.1825183629989624, "learning_rate": 3.982965931863728e-05, "loss": 0.1887, "step": 1026 }, { "epoch": 4.108, "grad_norm": 1.1460607051849365, "learning_rate": 3.981963927855711e-05, "loss": 0.1834, "step": 1027 }, { "epoch": 4.112, "grad_norm": 1.2845118045806885, "learning_rate": 3.980961923847696e-05, "loss": 0.223, "step": 1028 }, { "epoch": 4.116, "grad_norm": 1.3526803255081177, "learning_rate": 3.9799599198396795e-05, "loss": 0.1907, "step": 1029 }, { "epoch": 4.12, "grad_norm": 1.2375893592834473, "learning_rate": 3.9789579158316636e-05, "loss": 0.188, "step": 1030 }, { "epoch": 4.124, "grad_norm": 1.2433942556381226, "learning_rate": 3.977955911823648e-05, "loss": 0.2173, "step": 1031 }, { "epoch": 4.128, "grad_norm": 1.2910653352737427, "learning_rate": 3.976953907815631e-05, "loss": 0.192, "step": 1032 }, { "epoch": 4.132, "grad_norm": 1.0230827331542969, "learning_rate": 3.9759519038076153e-05, "loss": 0.1544, "step": 1033 }, { "epoch": 4.136, "grad_norm": 0.9551684856414795, "learning_rate": 3.9749498997995995e-05, "loss": 0.1518, "step": 1034 }, { "epoch": 4.14, "grad_norm": 1.2070391178131104, "learning_rate": 3.973947895791583e-05, "loss": 0.1783, "step": 1035 }, { "epoch": 4.144, "grad_norm": 1.4213783740997314, "learning_rate": 3.972945891783567e-05, "loss": 0.2237, "step": 1036 }, { "epoch": 4.148, "grad_norm": 1.2974165678024292, "learning_rate": 3.971943887775551e-05, "loss": 0.1768, "step": 1037 }, { "epoch": 4.152, "grad_norm": 0.8103154301643372, "learning_rate": 3.970941883767535e-05, "loss": 0.0919, "step": 1038 }, { 
"epoch": 4.156, "grad_norm": 1.279154896736145, "learning_rate": 3.9699398797595194e-05, "loss": 0.2302, "step": 1039 }, { "epoch": 4.16, "grad_norm": 1.3680810928344727, "learning_rate": 3.9689378757515036e-05, "loss": 0.2242, "step": 1040 }, { "epoch": 4.164, "grad_norm": 1.184191107749939, "learning_rate": 3.967935871743487e-05, "loss": 0.2504, "step": 1041 }, { "epoch": 4.168, "grad_norm": 1.2547346353530884, "learning_rate": 3.966933867735471e-05, "loss": 0.2051, "step": 1042 }, { "epoch": 4.172, "grad_norm": 1.1803265810012817, "learning_rate": 3.965931863727455e-05, "loss": 0.2276, "step": 1043 }, { "epoch": 4.176, "grad_norm": 1.4127157926559448, "learning_rate": 3.964929859719439e-05, "loss": 0.2378, "step": 1044 }, { "epoch": 4.18, "grad_norm": 1.2966582775115967, "learning_rate": 3.963927855711423e-05, "loss": 0.1672, "step": 1045 }, { "epoch": 4.184, "grad_norm": 1.1143783330917358, "learning_rate": 3.962925851703407e-05, "loss": 0.2011, "step": 1046 }, { "epoch": 4.188, "grad_norm": 1.2296042442321777, "learning_rate": 3.9619238476953904e-05, "loss": 0.1928, "step": 1047 }, { "epoch": 4.192, "grad_norm": 1.2343521118164062, "learning_rate": 3.960921843687375e-05, "loss": 0.2026, "step": 1048 }, { "epoch": 4.196, "grad_norm": 1.2687957286834717, "learning_rate": 3.9599198396793594e-05, "loss": 0.2408, "step": 1049 }, { "epoch": 4.2, "grad_norm": 1.172644019126892, "learning_rate": 3.958917835671343e-05, "loss": 0.2206, "step": 1050 }, { "epoch": 4.204, "grad_norm": 1.220900297164917, "learning_rate": 3.957915831663327e-05, "loss": 0.2039, "step": 1051 }, { "epoch": 4.208, "grad_norm": 1.3167065382003784, "learning_rate": 3.956913827655311e-05, "loss": 0.1992, "step": 1052 }, { "epoch": 4.212, "grad_norm": 1.2286665439605713, "learning_rate": 3.9559118236472945e-05, "loss": 0.1897, "step": 1053 }, { "epoch": 4.216, "grad_norm": 1.2382471561431885, "learning_rate": 3.9549098196392786e-05, "loss": 0.2299, "step": 1054 }, { "epoch": 4.22, "grad_norm": 
1.2562413215637207, "learning_rate": 3.953907815631263e-05, "loss": 0.252, "step": 1055 }, { "epoch": 4.224, "grad_norm": 1.2567752599716187, "learning_rate": 3.952905811623246e-05, "loss": 0.2026, "step": 1056 }, { "epoch": 4.228, "grad_norm": 1.1952612400054932, "learning_rate": 3.9519038076152304e-05, "loss": 0.2166, "step": 1057 }, { "epoch": 4.232, "grad_norm": 1.23219633102417, "learning_rate": 3.950901803607215e-05, "loss": 0.2183, "step": 1058 }, { "epoch": 4.236, "grad_norm": 1.3385809659957886, "learning_rate": 3.9498997995991986e-05, "loss": 0.2031, "step": 1059 }, { "epoch": 4.24, "grad_norm": 1.0677396059036255, "learning_rate": 3.948897795591183e-05, "loss": 0.2011, "step": 1060 }, { "epoch": 4.244, "grad_norm": 1.4090005159378052, "learning_rate": 3.947895791583167e-05, "loss": 0.2227, "step": 1061 }, { "epoch": 4.248, "grad_norm": 1.1532093286514282, "learning_rate": 3.94689378757515e-05, "loss": 0.1974, "step": 1062 }, { "epoch": 4.252, "grad_norm": 1.1573532819747925, "learning_rate": 3.9458917835671344e-05, "loss": 0.2248, "step": 1063 }, { "epoch": 4.256, "grad_norm": 1.2687122821807861, "learning_rate": 3.9448897795591186e-05, "loss": 0.2144, "step": 1064 }, { "epoch": 4.26, "grad_norm": 1.2371104955673218, "learning_rate": 3.943887775551102e-05, "loss": 0.1946, "step": 1065 }, { "epoch": 4.264, "grad_norm": 0.7894278764724731, "learning_rate": 3.942885771543086e-05, "loss": 0.1066, "step": 1066 }, { "epoch": 4.268, "grad_norm": 1.357580304145813, "learning_rate": 3.94188376753507e-05, "loss": 0.2478, "step": 1067 }, { "epoch": 4.272, "grad_norm": 1.174869418144226, "learning_rate": 3.9408817635270544e-05, "loss": 0.1946, "step": 1068 }, { "epoch": 4.276, "grad_norm": 1.242233157157898, "learning_rate": 3.9398797595190385e-05, "loss": 0.2277, "step": 1069 }, { "epoch": 4.28, "grad_norm": 1.31795334815979, "learning_rate": 3.938877755511023e-05, "loss": 0.2155, "step": 1070 }, { "epoch": 4.284, "grad_norm": 0.8449123501777649, "learning_rate": 
3.937875751503006e-05, "loss": 0.1393, "step": 1071 }, { "epoch": 4.288, "grad_norm": 1.4304672479629517, "learning_rate": 3.93687374749499e-05, "loss": 0.2227, "step": 1072 }, { "epoch": 4.292, "grad_norm": 1.1797378063201904, "learning_rate": 3.9358717434869744e-05, "loss": 0.2414, "step": 1073 }, { "epoch": 4.296, "grad_norm": 1.1290992498397827, "learning_rate": 3.934869739478958e-05, "loss": 0.2528, "step": 1074 }, { "epoch": 4.3, "grad_norm": 1.2640211582183838, "learning_rate": 3.933867735470942e-05, "loss": 0.2229, "step": 1075 }, { "epoch": 4.304, "grad_norm": 1.117639422416687, "learning_rate": 3.932865731462926e-05, "loss": 0.2176, "step": 1076 }, { "epoch": 4.308, "grad_norm": 1.35614013671875, "learning_rate": 3.93186372745491e-05, "loss": 0.1896, "step": 1077 }, { "epoch": 4.312, "grad_norm": 1.2274762392044067, "learning_rate": 3.930861723446894e-05, "loss": 0.2213, "step": 1078 }, { "epoch": 4.316, "grad_norm": 1.1732356548309326, "learning_rate": 3.929859719438878e-05, "loss": 0.2233, "step": 1079 }, { "epoch": 4.32, "grad_norm": 1.1939340829849243, "learning_rate": 3.928857715430862e-05, "loss": 0.1964, "step": 1080 }, { "epoch": 4.324, "grad_norm": 1.3930495977401733, "learning_rate": 3.927855711422846e-05, "loss": 0.1965, "step": 1081 }, { "epoch": 4.328, "grad_norm": 1.371573567390442, "learning_rate": 3.9268537074148295e-05, "loss": 0.2191, "step": 1082 }, { "epoch": 4.332, "grad_norm": 1.4507895708084106, "learning_rate": 3.9258517034068136e-05, "loss": 0.2927, "step": 1083 }, { "epoch": 4.336, "grad_norm": 1.3624083995819092, "learning_rate": 3.924849699398798e-05, "loss": 0.2211, "step": 1084 }, { "epoch": 4.34, "grad_norm": 1.1890075206756592, "learning_rate": 3.923847695390782e-05, "loss": 0.1766, "step": 1085 }, { "epoch": 4.344, "grad_norm": 1.4425567388534546, "learning_rate": 3.922845691382765e-05, "loss": 0.205, "step": 1086 }, { "epoch": 4.348, "grad_norm": 1.2859835624694824, "learning_rate": 3.92184368737475e-05, "loss": 0.2194, 
"step": 1087 }, { "epoch": 4.352, "grad_norm": 1.2856872081756592, "learning_rate": 3.9208416833667336e-05, "loss": 0.197, "step": 1088 }, { "epoch": 4.356, "grad_norm": 1.2462953329086304, "learning_rate": 3.919839679358718e-05, "loss": 0.2121, "step": 1089 }, { "epoch": 4.36, "grad_norm": 1.364990472793579, "learning_rate": 3.918837675350702e-05, "loss": 0.2367, "step": 1090 }, { "epoch": 4.364, "grad_norm": 1.3470253944396973, "learning_rate": 3.917835671342685e-05, "loss": 0.2263, "step": 1091 }, { "epoch": 4.368, "grad_norm": 1.2777583599090576, "learning_rate": 3.9168336673346694e-05, "loss": 0.2483, "step": 1092 }, { "epoch": 4.372, "grad_norm": 1.1814221143722534, "learning_rate": 3.9158316633266535e-05, "loss": 0.2304, "step": 1093 }, { "epoch": 4.376, "grad_norm": 1.329244613647461, "learning_rate": 3.914829659318637e-05, "loss": 0.258, "step": 1094 }, { "epoch": 4.38, "grad_norm": 1.2682360410690308, "learning_rate": 3.913827655310621e-05, "loss": 0.2278, "step": 1095 }, { "epoch": 4.384, "grad_norm": 1.2741080522537231, "learning_rate": 3.912825651302605e-05, "loss": 0.2437, "step": 1096 }, { "epoch": 4.388, "grad_norm": 1.1235476732254028, "learning_rate": 3.9118236472945894e-05, "loss": 0.2209, "step": 1097 }, { "epoch": 4.392, "grad_norm": 1.1556679010391235, "learning_rate": 3.9108216432865735e-05, "loss": 0.2668, "step": 1098 }, { "epoch": 4.396, "grad_norm": 1.2366710901260376, "learning_rate": 3.9098196392785576e-05, "loss": 0.2822, "step": 1099 }, { "epoch": 4.4, "grad_norm": 1.325208067893982, "learning_rate": 3.908817635270541e-05, "loss": 0.2203, "step": 1100 }, { "epoch": 4.404, "grad_norm": 1.166352391242981, "learning_rate": 3.907815631262525e-05, "loss": 0.2441, "step": 1101 }, { "epoch": 4.408, "grad_norm": 1.2230168581008911, "learning_rate": 3.9068136272545093e-05, "loss": 0.2217, "step": 1102 }, { "epoch": 4.412, "grad_norm": 1.2550815343856812, "learning_rate": 3.905811623246493e-05, "loss": 0.2299, "step": 1103 }, { "epoch": 4.416, 
"grad_norm": 1.3714336156845093, "learning_rate": 3.904809619238477e-05, "loss": 0.2247, "step": 1104 }, { "epoch": 4.42, "grad_norm": 1.2269859313964844, "learning_rate": 3.903807615230461e-05, "loss": 0.21, "step": 1105 }, { "epoch": 4.424, "grad_norm": 1.420852780342102, "learning_rate": 3.9028056112224445e-05, "loss": 0.2248, "step": 1106 }, { "epoch": 4.428, "grad_norm": 1.2329579591751099, "learning_rate": 3.901803607214429e-05, "loss": 0.2506, "step": 1107 }, { "epoch": 4.432, "grad_norm": 1.2145705223083496, "learning_rate": 3.9008016032064134e-05, "loss": 0.2244, "step": 1108 }, { "epoch": 4.436, "grad_norm": 1.3034549951553345, "learning_rate": 3.899799599198397e-05, "loss": 0.1866, "step": 1109 }, { "epoch": 4.44, "grad_norm": 1.1892132759094238, "learning_rate": 3.898797595190381e-05, "loss": 0.2283, "step": 1110 }, { "epoch": 4.444, "grad_norm": 1.279307246208191, "learning_rate": 3.897795591182365e-05, "loss": 0.2039, "step": 1111 }, { "epoch": 4.448, "grad_norm": 0.917590320110321, "learning_rate": 3.8967935871743486e-05, "loss": 0.1489, "step": 1112 }, { "epoch": 4.452, "grad_norm": 1.2451698780059814, "learning_rate": 3.895791583166333e-05, "loss": 0.1824, "step": 1113 }, { "epoch": 4.456, "grad_norm": 1.3629359006881714, "learning_rate": 3.894789579158317e-05, "loss": 0.2174, "step": 1114 }, { "epoch": 4.46, "grad_norm": 1.4630622863769531, "learning_rate": 3.8937875751503e-05, "loss": 0.2627, "step": 1115 }, { "epoch": 4.464, "grad_norm": 1.2522321939468384, "learning_rate": 3.8927855711422844e-05, "loss": 0.2285, "step": 1116 }, { "epoch": 4.468, "grad_norm": 1.2080490589141846, "learning_rate": 3.891783567134269e-05, "loss": 0.2458, "step": 1117 }, { "epoch": 4.4719999999999995, "grad_norm": 1.2194185256958008, "learning_rate": 3.890781563126253e-05, "loss": 0.2394, "step": 1118 }, { "epoch": 4.476, "grad_norm": 1.1908085346221924, "learning_rate": 3.889779559118237e-05, "loss": 0.2071, "step": 1119 }, { "epoch": 4.48, "grad_norm": 
1.3185300827026367, "learning_rate": 3.888777555110221e-05, "loss": 0.1841, "step": 1120 }, { "epoch": 4.484, "grad_norm": 1.2330673933029175, "learning_rate": 3.8877755511022044e-05, "loss": 0.2216, "step": 1121 }, { "epoch": 4.4879999999999995, "grad_norm": 1.1211673021316528, "learning_rate": 3.8867735470941885e-05, "loss": 0.2207, "step": 1122 }, { "epoch": 4.492, "grad_norm": 1.3571367263793945, "learning_rate": 3.8857715430861727e-05, "loss": 0.3118, "step": 1123 }, { "epoch": 4.496, "grad_norm": 1.3315199613571167, "learning_rate": 3.884769539078156e-05, "loss": 0.2559, "step": 1124 }, { "epoch": 4.5, "grad_norm": 1.2943772077560425, "learning_rate": 3.88376753507014e-05, "loss": 0.2258, "step": 1125 }, { "epoch": 4.504, "grad_norm": 1.3622092008590698, "learning_rate": 3.882765531062125e-05, "loss": 0.2172, "step": 1126 }, { "epoch": 4.508, "grad_norm": 1.2951812744140625, "learning_rate": 3.8817635270541085e-05, "loss": 0.2419, "step": 1127 }, { "epoch": 4.5120000000000005, "grad_norm": 1.2247873544692993, "learning_rate": 3.8807615230460926e-05, "loss": 0.1889, "step": 1128 }, { "epoch": 4.516, "grad_norm": 1.29974365234375, "learning_rate": 3.879759519038077e-05, "loss": 0.2432, "step": 1129 }, { "epoch": 4.52, "grad_norm": 1.2597442865371704, "learning_rate": 3.87875751503006e-05, "loss": 0.25, "step": 1130 }, { "epoch": 4.524, "grad_norm": 1.388176679611206, "learning_rate": 3.877755511022044e-05, "loss": 0.2447, "step": 1131 }, { "epoch": 4.5280000000000005, "grad_norm": 1.1391148567199707, "learning_rate": 3.8767535070140285e-05, "loss": 0.273, "step": 1132 }, { "epoch": 4.532, "grad_norm": 1.2334431409835815, "learning_rate": 3.875751503006012e-05, "loss": 0.2268, "step": 1133 }, { "epoch": 4.536, "grad_norm": 1.2976422309875488, "learning_rate": 3.874749498997996e-05, "loss": 0.2243, "step": 1134 }, { "epoch": 4.54, "grad_norm": 1.2898563146591187, "learning_rate": 3.87374749498998e-05, "loss": 0.226, "step": 1135 }, { "epoch": 4.5440000000000005, 
"grad_norm": 1.309234380722046, "learning_rate": 3.872745490981964e-05, "loss": 0.193, "step": 1136 }, { "epoch": 4.548, "grad_norm": 0.9056435227394104, "learning_rate": 3.8717434869739484e-05, "loss": 0.1361, "step": 1137 }, { "epoch": 4.552, "grad_norm": 1.2060729265213013, "learning_rate": 3.870741482965932e-05, "loss": 0.2172, "step": 1138 }, { "epoch": 4.556, "grad_norm": 1.299154281616211, "learning_rate": 3.869739478957916e-05, "loss": 0.2291, "step": 1139 }, { "epoch": 4.5600000000000005, "grad_norm": 1.1153815984725952, "learning_rate": 3.8687374749499e-05, "loss": 0.1993, "step": 1140 }, { "epoch": 4.564, "grad_norm": 1.402708649635315, "learning_rate": 3.867735470941884e-05, "loss": 0.2377, "step": 1141 }, { "epoch": 4.568, "grad_norm": 1.3014065027236938, "learning_rate": 3.866733466933868e-05, "loss": 0.2178, "step": 1142 }, { "epoch": 4.572, "grad_norm": 1.4406603574752808, "learning_rate": 3.865731462925852e-05, "loss": 0.2547, "step": 1143 }, { "epoch": 4.576, "grad_norm": 1.3197580575942993, "learning_rate": 3.864729458917836e-05, "loss": 0.234, "step": 1144 }, { "epoch": 4.58, "grad_norm": 1.2924617528915405, "learning_rate": 3.8637274549098194e-05, "loss": 0.2327, "step": 1145 }, { "epoch": 4.584, "grad_norm": 1.121708631515503, "learning_rate": 3.862725450901804e-05, "loss": 0.2183, "step": 1146 }, { "epoch": 4.588, "grad_norm": 1.1940151453018188, "learning_rate": 3.861723446893788e-05, "loss": 0.2617, "step": 1147 }, { "epoch": 4.592, "grad_norm": 1.2300734519958496, "learning_rate": 3.860721442885772e-05, "loss": 0.1954, "step": 1148 }, { "epoch": 4.596, "grad_norm": 1.2809008359909058, "learning_rate": 3.859719438877756e-05, "loss": 0.2435, "step": 1149 }, { "epoch": 4.6, "grad_norm": 1.3110355138778687, "learning_rate": 3.8587174348697394e-05, "loss": 0.2235, "step": 1150 }, { "epoch": 4.604, "grad_norm": 1.3215099573135376, "learning_rate": 3.8577154308617235e-05, "loss": 0.2544, "step": 1151 }, { "epoch": 4.608, "grad_norm": 
1.1234500408172607, "learning_rate": 3.8567134268537076e-05, "loss": 0.275, "step": 1152 }, { "epoch": 4.612, "grad_norm": 1.1621915102005005, "learning_rate": 3.855711422845692e-05, "loss": 0.2289, "step": 1153 }, { "epoch": 4.616, "grad_norm": 1.163620114326477, "learning_rate": 3.854709418837675e-05, "loss": 0.25, "step": 1154 }, { "epoch": 4.62, "grad_norm": 1.219595193862915, "learning_rate": 3.853707414829659e-05, "loss": 0.2388, "step": 1155 }, { "epoch": 4.624, "grad_norm": 1.2607448101043701, "learning_rate": 3.8527054108216435e-05, "loss": 0.1885, "step": 1156 }, { "epoch": 4.628, "grad_norm": 0.5879749655723572, "learning_rate": 3.8517034068136276e-05, "loss": 0.068, "step": 1157 }, { "epoch": 4.632, "grad_norm": 1.3314592838287354, "learning_rate": 3.850701402805612e-05, "loss": 0.2302, "step": 1158 }, { "epoch": 4.636, "grad_norm": 1.4725823402404785, "learning_rate": 3.849699398797595e-05, "loss": 0.2359, "step": 1159 }, { "epoch": 4.64, "grad_norm": 1.294355034828186, "learning_rate": 3.848697394789579e-05, "loss": 0.2359, "step": 1160 }, { "epoch": 4.644, "grad_norm": 1.3472228050231934, "learning_rate": 3.8476953907815634e-05, "loss": 0.2347, "step": 1161 }, { "epoch": 4.648, "grad_norm": 1.2284510135650635, "learning_rate": 3.846693386773547e-05, "loss": 0.2115, "step": 1162 }, { "epoch": 4.652, "grad_norm": 1.3090240955352783, "learning_rate": 3.845691382765531e-05, "loss": 0.2571, "step": 1163 }, { "epoch": 4.656, "grad_norm": 1.103739857673645, "learning_rate": 3.844689378757515e-05, "loss": 0.2399, "step": 1164 }, { "epoch": 4.66, "grad_norm": 1.189477801322937, "learning_rate": 3.8436873747494986e-05, "loss": 0.2247, "step": 1165 }, { "epoch": 4.664, "grad_norm": 1.3449199199676514, "learning_rate": 3.8426853707414834e-05, "loss": 0.2112, "step": 1166 }, { "epoch": 4.668, "grad_norm": 1.3756353855133057, "learning_rate": 3.8416833667334675e-05, "loss": 0.2268, "step": 1167 }, { "epoch": 4.672, "grad_norm": 1.3317773342132568, "learning_rate": 
3.840681362725451e-05, "loss": 0.2692, "step": 1168 }, { "epoch": 4.676, "grad_norm": 1.2678979635238647, "learning_rate": 3.839679358717435e-05, "loss": 0.2411, "step": 1169 }, { "epoch": 4.68, "grad_norm": 0.8288043141365051, "learning_rate": 3.838677354709419e-05, "loss": 0.1286, "step": 1170 }, { "epoch": 4.684, "grad_norm": 1.362588882446289, "learning_rate": 3.837675350701403e-05, "loss": 0.2238, "step": 1171 }, { "epoch": 4.688, "grad_norm": 1.2444343566894531, "learning_rate": 3.836673346693387e-05, "loss": 0.2223, "step": 1172 }, { "epoch": 4.692, "grad_norm": 1.3691511154174805, "learning_rate": 3.835671342685371e-05, "loss": 0.2527, "step": 1173 }, { "epoch": 4.696, "grad_norm": 1.233352780342102, "learning_rate": 3.8346693386773544e-05, "loss": 0.2361, "step": 1174 }, { "epoch": 4.7, "grad_norm": 1.1726874113082886, "learning_rate": 3.8336673346693385e-05, "loss": 0.2026, "step": 1175 }, { "epoch": 4.704, "grad_norm": 1.3368510007858276, "learning_rate": 3.832665330661323e-05, "loss": 0.2683, "step": 1176 }, { "epoch": 4.708, "grad_norm": 1.355893611907959, "learning_rate": 3.831663326653307e-05, "loss": 0.2503, "step": 1177 }, { "epoch": 4.712, "grad_norm": 1.2774181365966797, "learning_rate": 3.830661322645291e-05, "loss": 0.2655, "step": 1178 }, { "epoch": 4.716, "grad_norm": 1.2439703941345215, "learning_rate": 3.829659318637275e-05, "loss": 0.2311, "step": 1179 }, { "epoch": 4.72, "grad_norm": 1.2443056106567383, "learning_rate": 3.8286573146292585e-05, "loss": 0.2011, "step": 1180 }, { "epoch": 4.724, "grad_norm": 1.3085763454437256, "learning_rate": 3.8276553106212426e-05, "loss": 0.2363, "step": 1181 }, { "epoch": 4.728, "grad_norm": 1.3546162843704224, "learning_rate": 3.826653306613227e-05, "loss": 0.2611, "step": 1182 }, { "epoch": 4.732, "grad_norm": 1.2781968116760254, "learning_rate": 3.82565130260521e-05, "loss": 0.207, "step": 1183 }, { "epoch": 4.736, "grad_norm": 3.8161380290985107, "learning_rate": 3.824649298597194e-05, "loss": 
0.2138, "step": 1184 }, { "epoch": 4.74, "grad_norm": 1.2593066692352295, "learning_rate": 3.823647294589179e-05, "loss": 0.2364, "step": 1185 }, { "epoch": 4.744, "grad_norm": 1.3970239162445068, "learning_rate": 3.8226452905811626e-05, "loss": 0.209, "step": 1186 }, { "epoch": 4.748, "grad_norm": 1.306069016456604, "learning_rate": 3.821643286573147e-05, "loss": 0.2443, "step": 1187 }, { "epoch": 4.752, "grad_norm": 1.1042245626449585, "learning_rate": 3.820641282565131e-05, "loss": 0.2157, "step": 1188 }, { "epoch": 4.756, "grad_norm": 1.157153606414795, "learning_rate": 3.819639278557114e-05, "loss": 0.2211, "step": 1189 }, { "epoch": 4.76, "grad_norm": 1.1273726224899292, "learning_rate": 3.8186372745490984e-05, "loss": 0.2221, "step": 1190 }, { "epoch": 4.764, "grad_norm": 1.3015018701553345, "learning_rate": 3.8176352705410825e-05, "loss": 0.2381, "step": 1191 }, { "epoch": 4.768, "grad_norm": 1.152994990348816, "learning_rate": 3.816633266533066e-05, "loss": 0.2523, "step": 1192 }, { "epoch": 4.772, "grad_norm": 1.5288457870483398, "learning_rate": 3.81563126252505e-05, "loss": 0.2474, "step": 1193 }, { "epoch": 4.776, "grad_norm": 1.488034963607788, "learning_rate": 3.814629258517034e-05, "loss": 0.2602, "step": 1194 }, { "epoch": 4.78, "grad_norm": 1.257806420326233, "learning_rate": 3.8136272545090184e-05, "loss": 0.2165, "step": 1195 }, { "epoch": 4.784, "grad_norm": 1.1476024389266968, "learning_rate": 3.8126252505010025e-05, "loss": 0.2374, "step": 1196 }, { "epoch": 4.788, "grad_norm": 1.3641656637191772, "learning_rate": 3.8116232464929866e-05, "loss": 0.2371, "step": 1197 }, { "epoch": 4.792, "grad_norm": 1.5159633159637451, "learning_rate": 3.81062124248497e-05, "loss": 0.2511, "step": 1198 }, { "epoch": 4.796, "grad_norm": 1.186079740524292, "learning_rate": 3.809619238476954e-05, "loss": 0.2566, "step": 1199 }, { "epoch": 4.8, "grad_norm": 1.5716686248779297, "learning_rate": 3.808617234468938e-05, "loss": 0.2318, "step": 1200 }, { "epoch": 
4.804, "grad_norm": 1.3118908405303955, "learning_rate": 3.807615230460922e-05, "loss": 0.2452, "step": 1201 }, { "epoch": 4.808, "grad_norm": 1.3244889974594116, "learning_rate": 3.806613226452906e-05, "loss": 0.2362, "step": 1202 }, { "epoch": 4.812, "grad_norm": 1.1287165880203247, "learning_rate": 3.80561122244489e-05, "loss": 0.1915, "step": 1203 }, { "epoch": 4.816, "grad_norm": 1.2781370878219604, "learning_rate": 3.8046092184368735e-05, "loss": 0.2024, "step": 1204 }, { "epoch": 4.82, "grad_norm": 1.2718026638031006, "learning_rate": 3.803607214428858e-05, "loss": 0.2099, "step": 1205 }, { "epoch": 4.824, "grad_norm": 1.2175809144973755, "learning_rate": 3.802605210420842e-05, "loss": 0.2387, "step": 1206 }, { "epoch": 4.828, "grad_norm": 1.1810656785964966, "learning_rate": 3.801603206412826e-05, "loss": 0.2773, "step": 1207 }, { "epoch": 4.832, "grad_norm": 1.0714348554611206, "learning_rate": 3.80060120240481e-05, "loss": 0.1103, "step": 1208 }, { "epoch": 4.836, "grad_norm": 1.2614023685455322, "learning_rate": 3.7995991983967935e-05, "loss": 0.2011, "step": 1209 }, { "epoch": 4.84, "grad_norm": 1.2104132175445557, "learning_rate": 3.7985971943887776e-05, "loss": 0.2036, "step": 1210 }, { "epoch": 4.844, "grad_norm": 1.322792887687683, "learning_rate": 3.797595190380762e-05, "loss": 0.246, "step": 1211 }, { "epoch": 4.848, "grad_norm": 1.1660313606262207, "learning_rate": 3.796593186372746e-05, "loss": 0.2658, "step": 1212 }, { "epoch": 4.852, "grad_norm": 1.3373457193374634, "learning_rate": 3.795591182364729e-05, "loss": 0.2282, "step": 1213 }, { "epoch": 4.856, "grad_norm": 1.284891963005066, "learning_rate": 3.7945891783567134e-05, "loss": 0.2356, "step": 1214 }, { "epoch": 4.86, "grad_norm": 1.2078474760055542, "learning_rate": 3.7935871743486975e-05, "loss": 0.2295, "step": 1215 }, { "epoch": 4.864, "grad_norm": 1.158579707145691, "learning_rate": 3.792585170340682e-05, "loss": 0.2112, "step": 1216 }, { "epoch": 4.868, "grad_norm": 
1.3074941635131836, "learning_rate": 3.791583166332666e-05, "loss": 0.2361, "step": 1217 }, { "epoch": 4.872, "grad_norm": 0.9688441157341003, "learning_rate": 3.790581162324649e-05, "loss": 0.1319, "step": 1218 }, { "epoch": 4.876, "grad_norm": 1.5012363195419312, "learning_rate": 3.7895791583166334e-05, "loss": 0.2426, "step": 1219 }, { "epoch": 4.88, "grad_norm": 1.236053228378296, "learning_rate": 3.7885771543086175e-05, "loss": 0.2136, "step": 1220 }, { "epoch": 4.884, "grad_norm": 1.2395013570785522, "learning_rate": 3.787575150300601e-05, "loss": 0.2054, "step": 1221 }, { "epoch": 4.888, "grad_norm": 1.3079355955123901, "learning_rate": 3.786573146292585e-05, "loss": 0.2243, "step": 1222 }, { "epoch": 4.892, "grad_norm": 1.2592490911483765, "learning_rate": 3.785571142284569e-05, "loss": 0.2462, "step": 1223 }, { "epoch": 4.896, "grad_norm": 1.615578055381775, "learning_rate": 3.7845691382765533e-05, "loss": 0.3059, "step": 1224 }, { "epoch": 4.9, "grad_norm": 1.2457841634750366, "learning_rate": 3.7835671342685375e-05, "loss": 0.1957, "step": 1225 }, { "epoch": 4.904, "grad_norm": 1.286840558052063, "learning_rate": 3.7825651302605216e-05, "loss": 0.2372, "step": 1226 }, { "epoch": 4.908, "grad_norm": 1.2358272075653076, "learning_rate": 3.781563126252505e-05, "loss": 0.2342, "step": 1227 }, { "epoch": 4.912, "grad_norm": 1.3874037265777588, "learning_rate": 3.780561122244489e-05, "loss": 0.2249, "step": 1228 }, { "epoch": 4.916, "grad_norm": 1.2254422903060913, "learning_rate": 3.779559118236473e-05, "loss": 0.2138, "step": 1229 }, { "epoch": 4.92, "grad_norm": 1.2836542129516602, "learning_rate": 3.778557114228457e-05, "loss": 0.2212, "step": 1230 }, { "epoch": 4.924, "grad_norm": 1.3436830043792725, "learning_rate": 3.777555110220441e-05, "loss": 0.1943, "step": 1231 }, { "epoch": 4.928, "grad_norm": 1.2510602474212646, "learning_rate": 3.776553106212425e-05, "loss": 0.2079, "step": 1232 }, { "epoch": 4.932, "grad_norm": 1.2235398292541504, 
"learning_rate": 3.7755511022044085e-05, "loss": 0.245, "step": 1233 }, { "epoch": 4.936, "grad_norm": 1.2405413389205933, "learning_rate": 3.774549098196393e-05, "loss": 0.2368, "step": 1234 }, { "epoch": 4.9399999999999995, "grad_norm": 1.2697187662124634, "learning_rate": 3.7735470941883774e-05, "loss": 0.2472, "step": 1235 }, { "epoch": 4.944, "grad_norm": 1.250165343284607, "learning_rate": 3.772545090180361e-05, "loss": 0.245, "step": 1236 }, { "epoch": 4.948, "grad_norm": 1.3872276544570923, "learning_rate": 3.771543086172345e-05, "loss": 0.2394, "step": 1237 }, { "epoch": 4.952, "grad_norm": 1.4038046598434448, "learning_rate": 3.770541082164329e-05, "loss": 0.2684, "step": 1238 }, { "epoch": 4.9559999999999995, "grad_norm": 1.1734498739242554, "learning_rate": 3.7695390781563126e-05, "loss": 0.2401, "step": 1239 }, { "epoch": 4.96, "grad_norm": 1.181815266609192, "learning_rate": 3.768537074148297e-05, "loss": 0.2575, "step": 1240 }, { "epoch": 4.964, "grad_norm": 1.4323796033859253, "learning_rate": 3.767535070140281e-05, "loss": 0.2368, "step": 1241 }, { "epoch": 4.968, "grad_norm": 1.2690467834472656, "learning_rate": 3.766533066132264e-05, "loss": 0.2726, "step": 1242 }, { "epoch": 4.9719999999999995, "grad_norm": 1.060066819190979, "learning_rate": 3.7655310621242484e-05, "loss": 0.2176, "step": 1243 }, { "epoch": 4.976, "grad_norm": 1.293988823890686, "learning_rate": 3.764529058116233e-05, "loss": 0.2382, "step": 1244 }, { "epoch": 4.98, "grad_norm": 1.1977914571762085, "learning_rate": 3.7635270541082167e-05, "loss": 0.2414, "step": 1245 }, { "epoch": 4.984, "grad_norm": 1.373465895652771, "learning_rate": 3.762525050100201e-05, "loss": 0.2524, "step": 1246 }, { "epoch": 4.9879999999999995, "grad_norm": 1.2298009395599365, "learning_rate": 3.761523046092185e-05, "loss": 0.2156, "step": 1247 }, { "epoch": 4.992, "grad_norm": 1.460239052772522, "learning_rate": 3.7605210420841684e-05, "loss": 0.2943, "step": 1248 }, { "epoch": 4.996, "grad_norm": 
1.2077012062072754, "learning_rate": 3.7595190380761525e-05, "loss": 0.2488, "step": 1249 }, { "epoch": 5.0, "grad_norm": 1.2159690856933594, "learning_rate": 3.7585170340681366e-05, "loss": 0.237, "step": 1250 }, { "epoch": 5.004, "grad_norm": 1.1079336404800415, "learning_rate": 3.75751503006012e-05, "loss": 0.1713, "step": 1251 }, { "epoch": 5.008, "grad_norm": 1.1447618007659912, "learning_rate": 3.756513026052104e-05, "loss": 0.1614, "step": 1252 }, { "epoch": 5.012, "grad_norm": 1.0622740983963013, "learning_rate": 3.755511022044088e-05, "loss": 0.1568, "step": 1253 }, { "epoch": 5.016, "grad_norm": 1.1202747821807861, "learning_rate": 3.7545090180360724e-05, "loss": 0.1313, "step": 1254 }, { "epoch": 5.02, "grad_norm": 1.1181975603103638, "learning_rate": 3.7535070140280566e-05, "loss": 0.166, "step": 1255 }, { "epoch": 5.024, "grad_norm": 10.162293434143066, "learning_rate": 3.752505010020041e-05, "loss": 0.0938, "step": 1256 }, { "epoch": 5.028, "grad_norm": 1.2769560813903809, "learning_rate": 3.751503006012024e-05, "loss": 0.1378, "step": 1257 }, { "epoch": 5.032, "grad_norm": 1.2073560953140259, "learning_rate": 3.750501002004008e-05, "loss": 0.1715, "step": 1258 }, { "epoch": 5.036, "grad_norm": 1.229649543762207, "learning_rate": 3.7494989979959924e-05, "loss": 0.14, "step": 1259 }, { "epoch": 5.04, "grad_norm": 1.3796989917755127, "learning_rate": 3.748496993987976e-05, "loss": 0.189, "step": 1260 }, { "epoch": 5.044, "grad_norm": 1.5101726055145264, "learning_rate": 3.74749498997996e-05, "loss": 0.1487, "step": 1261 }, { "epoch": 5.048, "grad_norm": 1.451194167137146, "learning_rate": 3.746492985971944e-05, "loss": 0.1871, "step": 1262 }, { "epoch": 5.052, "grad_norm": 1.608769416809082, "learning_rate": 3.7454909819639276e-05, "loss": 0.1676, "step": 1263 }, { "epoch": 5.056, "grad_norm": 1.3911021947860718, "learning_rate": 3.7444889779559124e-05, "loss": 0.1217, "step": 1264 }, { "epoch": 5.06, "grad_norm": 1.4321279525756836, "learning_rate": 
3.743486973947896e-05, "loss": 0.1725, "step": 1265 }, { "epoch": 5.064, "grad_norm": 1.49961519241333, "learning_rate": 3.74248496993988e-05, "loss": 0.1564, "step": 1266 }, { "epoch": 5.068, "grad_norm": 1.374585509300232, "learning_rate": 3.741482965931864e-05, "loss": 0.1183, "step": 1267 }, { "epoch": 5.072, "grad_norm": 1.5012727975845337, "learning_rate": 3.740480961923848e-05, "loss": 0.1466, "step": 1268 }, { "epoch": 5.076, "grad_norm": null, "learning_rate": 3.740480961923848e-05, "loss": 0.3217, "step": 1269 }, { "epoch": 5.08, "grad_norm": 1.511431336402893, "learning_rate": 3.739478957915832e-05, "loss": 0.1452, "step": 1270 }, { "epoch": 5.084, "grad_norm": 1.2745331525802612, "learning_rate": 3.738476953907816e-05, "loss": 0.1128, "step": 1271 }, { "epoch": 5.088, "grad_norm": 1.2193913459777832, "learning_rate": 3.7374749498998e-05, "loss": 0.1502, "step": 1272 }, { "epoch": 5.092, "grad_norm": 1.190537929534912, "learning_rate": 3.7364729458917834e-05, "loss": 0.117, "step": 1273 }, { "epoch": 5.096, "grad_norm": 1.1268037557601929, "learning_rate": 3.7354709418837675e-05, "loss": 0.1432, "step": 1274 }, { "epoch": 5.1, "grad_norm": 1.1972516775131226, "learning_rate": 3.7344689378757516e-05, "loss": 0.1263, "step": 1275 }, { "epoch": 5.104, "grad_norm": 1.2640272378921509, "learning_rate": 3.733466933867736e-05, "loss": 0.1393, "step": 1276 }, { "epoch": 5.108, "grad_norm": 1.2229938507080078, "learning_rate": 3.73246492985972e-05, "loss": 0.1254, "step": 1277 }, { "epoch": 5.112, "grad_norm": 1.4548741579055786, "learning_rate": 3.731462925851703e-05, "loss": 0.129, "step": 1278 }, { "epoch": 5.116, "grad_norm": 1.5529509782791138, "learning_rate": 3.7304609218436875e-05, "loss": 0.1918, "step": 1279 }, { "epoch": 5.12, "grad_norm": 1.473236322402954, "learning_rate": 3.7294589178356716e-05, "loss": 0.129, "step": 1280 }, { "epoch": 5.124, "grad_norm": 1.4448833465576172, "learning_rate": 3.728456913827656e-05, "loss": 0.1673, "step": 1281 
}, { "epoch": 5.128, "grad_norm": 1.455904245376587, "learning_rate": 3.727454909819639e-05, "loss": 0.1614, "step": 1282 }, { "epoch": 5.132, "grad_norm": 47.209869384765625, "learning_rate": 3.726452905811623e-05, "loss": 0.6446, "step": 1283 }, { "epoch": 5.136, "grad_norm": 1.2592720985412598, "learning_rate": 3.7254509018036074e-05, "loss": 0.1518, "step": 1284 }, { "epoch": 5.14, "grad_norm": 1.3965915441513062, "learning_rate": 3.7244488977955916e-05, "loss": 0.1286, "step": 1285 }, { "epoch": 5.144, "grad_norm": 1.0066782236099243, "learning_rate": 3.723446893787576e-05, "loss": 0.0742, "step": 1286 }, { "epoch": 5.148, "grad_norm": 1.2181978225708008, "learning_rate": 3.722444889779559e-05, "loss": 0.1358, "step": 1287 }, { "epoch": 5.152, "grad_norm": 1.3564952611923218, "learning_rate": 3.721442885771543e-05, "loss": 0.1212, "step": 1288 }, { "epoch": 5.156, "grad_norm": 1.4054635763168335, "learning_rate": 3.7204408817635274e-05, "loss": 0.1534, "step": 1289 }, { "epoch": 5.16, "grad_norm": 1.1692203283309937, "learning_rate": 3.719438877755511e-05, "loss": 0.1135, "step": 1290 }, { "epoch": 5.164, "grad_norm": 1.528304934501648, "learning_rate": 3.718436873747495e-05, "loss": 0.1787, "step": 1291 }, { "epoch": 5.168, "grad_norm": 1.4810398817062378, "learning_rate": 3.717434869739479e-05, "loss": 0.1711, "step": 1292 }, { "epoch": 5.172, "grad_norm": 1.4008511304855347, "learning_rate": 3.7164328657314625e-05, "loss": 0.107, "step": 1293 }, { "epoch": 5.176, "grad_norm": 1.5215219259262085, "learning_rate": 3.7154308617234474e-05, "loss": 0.1714, "step": 1294 }, { "epoch": 5.18, "grad_norm": 1.3255380392074585, "learning_rate": 3.7144288577154315e-05, "loss": 0.1437, "step": 1295 }, { "epoch": 5.184, "grad_norm": 1.540545105934143, "learning_rate": 3.713426853707415e-05, "loss": 0.1374, "step": 1296 }, { "epoch": 5.188, "grad_norm": 1.2621541023254395, "learning_rate": 3.712424849699399e-05, "loss": 0.1114, "step": 1297 }, { "epoch": 5.192, 
"grad_norm": 1.355939507484436, "learning_rate": 3.711422845691383e-05, "loss": 0.135, "step": 1298 }, { "epoch": 5.196, "grad_norm": 1.5436731576919556, "learning_rate": 3.7104208416833666e-05, "loss": 0.1471, "step": 1299 }, { "epoch": 5.2, "grad_norm": 1.4287269115447998, "learning_rate": 3.709418837675351e-05, "loss": 0.147, "step": 1300 }, { "epoch": 5.204, "grad_norm": 1.37587308883667, "learning_rate": 3.708416833667335e-05, "loss": 0.1434, "step": 1301 }, { "epoch": 5.208, "grad_norm": 1.3519721031188965, "learning_rate": 3.7074148296593183e-05, "loss": 0.1418, "step": 1302 }, { "epoch": 5.212, "grad_norm": 1.3758410215377808, "learning_rate": 3.7064128256513025e-05, "loss": 0.1535, "step": 1303 }, { "epoch": 5.216, "grad_norm": 1.3432583808898926, "learning_rate": 3.705410821643287e-05, "loss": 0.1477, "step": 1304 }, { "epoch": 5.22, "grad_norm": 1.2263681888580322, "learning_rate": 3.704408817635271e-05, "loss": 0.1374, "step": 1305 }, { "epoch": 5.224, "grad_norm": 1.2747381925582886, "learning_rate": 3.703406813627255e-05, "loss": 0.1498, "step": 1306 }, { "epoch": 5.228, "grad_norm": 1.4037673473358154, "learning_rate": 3.702404809619239e-05, "loss": 0.1539, "step": 1307 }, { "epoch": 5.232, "grad_norm": 1.4913922548294067, "learning_rate": 3.7014028056112224e-05, "loss": 0.1381, "step": 1308 }, { "epoch": 5.236, "grad_norm": 1.3626859188079834, "learning_rate": 3.7004008016032066e-05, "loss": 0.1257, "step": 1309 }, { "epoch": 5.24, "grad_norm": 1.508866548538208, "learning_rate": 3.699398797595191e-05, "loss": 0.1854, "step": 1310 }, { "epoch": 5.244, "grad_norm": 1.3557127714157104, "learning_rate": 3.698396793587174e-05, "loss": 0.1417, "step": 1311 }, { "epoch": 5.248, "grad_norm": 1.4805433750152588, "learning_rate": 3.697394789579158e-05, "loss": 0.1596, "step": 1312 }, { "epoch": 5.252, "grad_norm": 1.4268604516983032, "learning_rate": 3.6963927855711424e-05, "loss": 0.1612, "step": 1313 }, { "epoch": 5.256, "grad_norm": 1.4338222742080688, 
"learning_rate": 3.6953907815631265e-05, "loss": 0.1668, "step": 1314 }, { "epoch": 5.26, "grad_norm": 1.3227003812789917, "learning_rate": 3.6943887775551107e-05, "loss": 0.1342, "step": 1315 }, { "epoch": 5.264, "grad_norm": 1.3900845050811768, "learning_rate": 3.693386773547095e-05, "loss": 0.1342, "step": 1316 }, { "epoch": 5.268, "grad_norm": 1.3115166425704956, "learning_rate": 3.692384769539078e-05, "loss": 0.1544, "step": 1317 }, { "epoch": 5.272, "grad_norm": 1.6373382806777954, "learning_rate": 3.6913827655310624e-05, "loss": 0.1649, "step": 1318 }, { "epoch": 5.276, "grad_norm": 1.2519011497497559, "learning_rate": 3.6903807615230465e-05, "loss": 0.1448, "step": 1319 }, { "epoch": 5.28, "grad_norm": 1.4340311288833618, "learning_rate": 3.68937875751503e-05, "loss": 0.1744, "step": 1320 }, { "epoch": 5.284, "grad_norm": 1.7507641315460205, "learning_rate": 3.688376753507014e-05, "loss": 0.1714, "step": 1321 }, { "epoch": 5.288, "grad_norm": 1.4718471765518188, "learning_rate": 3.687374749498998e-05, "loss": 0.1769, "step": 1322 }, { "epoch": 5.292, "grad_norm": 1.3668906688690186, "learning_rate": 3.6863727454909817e-05, "loss": 0.1593, "step": 1323 }, { "epoch": 5.296, "grad_norm": 1.5843260288238525, "learning_rate": 3.6853707414829665e-05, "loss": 0.1379, "step": 1324 }, { "epoch": 5.3, "grad_norm": 1.3435697555541992, "learning_rate": 3.6843687374749506e-05, "loss": 0.1084, "step": 1325 }, { "epoch": 5.304, "grad_norm": 1.4321434497833252, "learning_rate": 3.683366733466934e-05, "loss": 0.1472, "step": 1326 }, { "epoch": 5.308, "grad_norm": 1.3599714040756226, "learning_rate": 3.682364729458918e-05, "loss": 0.1392, "step": 1327 }, { "epoch": 5.312, "grad_norm": 1.3491992950439453, "learning_rate": 3.681362725450902e-05, "loss": 0.1574, "step": 1328 }, { "epoch": 5.316, "grad_norm": 1.4255551099777222, "learning_rate": 3.680360721442886e-05, "loss": 0.1349, "step": 1329 }, { "epoch": 5.32, "grad_norm": 1.34247624874115, "learning_rate": 
3.67935871743487e-05, "loss": 0.1745, "step": 1330 }, { "epoch": 5.324, "grad_norm": 1.4770880937576294, "learning_rate": 3.678356713426854e-05, "loss": 0.1558, "step": 1331 }, { "epoch": 5.328, "grad_norm": 1.468400001525879, "learning_rate": 3.6773547094188375e-05, "loss": 0.1388, "step": 1332 }, { "epoch": 5.332, "grad_norm": 1.588028907775879, "learning_rate": 3.6763527054108216e-05, "loss": 0.1722, "step": 1333 }, { "epoch": 5.336, "grad_norm": 1.3578060865402222, "learning_rate": 3.675350701402806e-05, "loss": 0.1314, "step": 1334 }, { "epoch": 5.34, "grad_norm": 1.7125457525253296, "learning_rate": 3.67434869739479e-05, "loss": 0.1409, "step": 1335 }, { "epoch": 5.344, "grad_norm": 1.482620358467102, "learning_rate": 3.673346693386774e-05, "loss": 0.1242, "step": 1336 }, { "epoch": 5.348, "grad_norm": 1.468563437461853, "learning_rate": 3.6723446893787574e-05, "loss": 0.1509, "step": 1337 }, { "epoch": 5.352, "grad_norm": 1.4916741847991943, "learning_rate": 3.6713426853707415e-05, "loss": 0.1477, "step": 1338 }, { "epoch": 5.356, "grad_norm": 1.3862581253051758, "learning_rate": 3.670340681362726e-05, "loss": 0.1625, "step": 1339 }, { "epoch": 5.36, "grad_norm": 1.3407968282699585, "learning_rate": 3.66933867735471e-05, "loss": 0.1499, "step": 1340 }, { "epoch": 5.364, "grad_norm": 1.010241985321045, "learning_rate": 3.668336673346693e-05, "loss": 0.0938, "step": 1341 }, { "epoch": 5.368, "grad_norm": 1.3833948373794556, "learning_rate": 3.6673346693386774e-05, "loss": 0.1439, "step": 1342 }, { "epoch": 5.372, "grad_norm": 1.3611929416656494, "learning_rate": 3.6663326653306615e-05, "loss": 0.1722, "step": 1343 }, { "epoch": 5.376, "grad_norm": 1.1714794635772705, "learning_rate": 3.6653306613226456e-05, "loss": 0.093, "step": 1344 }, { "epoch": 5.38, "grad_norm": 1.0513854026794434, "learning_rate": 3.66432865731463e-05, "loss": 0.1028, "step": 1345 }, { "epoch": 5.384, "grad_norm": 1.432370662689209, "learning_rate": 3.663326653306613e-05, "loss": 0.1621, 
"step": 1346 }, { "epoch": 5.388, "grad_norm": 1.3324103355407715, "learning_rate": 3.6623246492985973e-05, "loss": 0.1261, "step": 1347 }, { "epoch": 5.392, "grad_norm": 1.4653232097625732, "learning_rate": 3.6613226452905815e-05, "loss": 0.1342, "step": 1348 }, { "epoch": 5.396, "grad_norm": 1.4007515907287598, "learning_rate": 3.660320641282565e-05, "loss": 0.1296, "step": 1349 }, { "epoch": 5.4, "grad_norm": 1.5396267175674438, "learning_rate": 3.659318637274549e-05, "loss": 0.1519, "step": 1350 }, { "epoch": 5.404, "grad_norm": 1.4199961423873901, "learning_rate": 3.658316633266533e-05, "loss": 0.1516, "step": 1351 }, { "epoch": 5.408, "grad_norm": 1.455402135848999, "learning_rate": 3.657314629258517e-05, "loss": 0.19, "step": 1352 }, { "epoch": 5.412, "grad_norm": 1.4118759632110596, "learning_rate": 3.6563126252505014e-05, "loss": 0.1687, "step": 1353 }, { "epoch": 5.416, "grad_norm": 1.4946292638778687, "learning_rate": 3.6553106212424856e-05, "loss": 0.1587, "step": 1354 }, { "epoch": 5.42, "grad_norm": 1.4371452331542969, "learning_rate": 3.654308617234469e-05, "loss": 0.1722, "step": 1355 }, { "epoch": 5.424, "grad_norm": 1.4597325325012207, "learning_rate": 3.653306613226453e-05, "loss": 0.1954, "step": 1356 }, { "epoch": 5.428, "grad_norm": 1.463151216506958, "learning_rate": 3.652304609218437e-05, "loss": 0.2056, "step": 1357 }, { "epoch": 5.432, "grad_norm": 1.568338394165039, "learning_rate": 3.651302605210421e-05, "loss": 0.1665, "step": 1358 }, { "epoch": 5.436, "grad_norm": 1.4030911922454834, "learning_rate": 3.650300601202405e-05, "loss": 0.1463, "step": 1359 }, { "epoch": 5.44, "grad_norm": 1.3122279644012451, "learning_rate": 3.649298597194389e-05, "loss": 0.1324, "step": 1360 }, { "epoch": 5.444, "grad_norm": 1.398829698562622, "learning_rate": 3.6482965931863724e-05, "loss": 0.182, "step": 1361 }, { "epoch": 5.448, "grad_norm": 1.5118191242218018, "learning_rate": 3.6472945891783566e-05, "loss": 0.1485, "step": 1362 }, { "epoch": 5.452, 
"grad_norm": 1.4859402179718018, "learning_rate": 3.6462925851703414e-05, "loss": 0.1429, "step": 1363 }, { "epoch": 5.456, "grad_norm": 1.5563009977340698, "learning_rate": 3.645290581162325e-05, "loss": 0.1834, "step": 1364 }, { "epoch": 5.46, "grad_norm": 1.403515338897705, "learning_rate": 3.644288577154309e-05, "loss": 0.1533, "step": 1365 }, { "epoch": 5.464, "grad_norm": 1.3895883560180664, "learning_rate": 3.643286573146293e-05, "loss": 0.1668, "step": 1366 }, { "epoch": 5.468, "grad_norm": 1.454662561416626, "learning_rate": 3.6422845691382765e-05, "loss": 0.1376, "step": 1367 }, { "epoch": 5.4719999999999995, "grad_norm": 1.4135189056396484, "learning_rate": 3.6412825651302606e-05, "loss": 0.132, "step": 1368 }, { "epoch": 5.476, "grad_norm": 1.3551510572433472, "learning_rate": 3.640280561122245e-05, "loss": 0.2015, "step": 1369 }, { "epoch": 5.48, "grad_norm": 1.4458680152893066, "learning_rate": 3.639278557114228e-05, "loss": 0.1309, "step": 1370 }, { "epoch": 5.484, "grad_norm": 1.508864402770996, "learning_rate": 3.6382765531062124e-05, "loss": 0.1945, "step": 1371 }, { "epoch": 5.4879999999999995, "grad_norm": 1.439699411392212, "learning_rate": 3.6372745490981965e-05, "loss": 0.1537, "step": 1372 }, { "epoch": 5.492, "grad_norm": 1.516624093055725, "learning_rate": 3.6362725450901806e-05, "loss": 0.1503, "step": 1373 }, { "epoch": 5.496, "grad_norm": 1.4190101623535156, "learning_rate": 3.635270541082165e-05, "loss": 0.1483, "step": 1374 }, { "epoch": 5.5, "grad_norm": 1.4877848625183105, "learning_rate": 3.634268537074149e-05, "loss": 0.1409, "step": 1375 }, { "epoch": 5.504, "grad_norm": 1.448904037475586, "learning_rate": 3.633266533066132e-05, "loss": 0.1681, "step": 1376 }, { "epoch": 5.508, "grad_norm": 1.4470781087875366, "learning_rate": 3.6322645290581164e-05, "loss": 0.1599, "step": 1377 }, { "epoch": 5.5120000000000005, "grad_norm": 1.418031930923462, "learning_rate": 3.6312625250501006e-05, "loss": 0.1441, "step": 1378 }, { "epoch": 
5.516, "grad_norm": 1.4554909467697144, "learning_rate": 3.630260521042084e-05, "loss": 0.1463, "step": 1379 }, { "epoch": 5.52, "grad_norm": 1.379330039024353, "learning_rate": 3.629258517034068e-05, "loss": 0.1285, "step": 1380 }, { "epoch": 5.524, "grad_norm": 1.4990249872207642, "learning_rate": 3.628256513026052e-05, "loss": 0.2017, "step": 1381 }, { "epoch": 5.5280000000000005, "grad_norm": 1.2808997631072998, "learning_rate": 3.627254509018036e-05, "loss": 0.1664, "step": 1382 }, { "epoch": 5.532, "grad_norm": 1.6173251867294312, "learning_rate": 3.6262525050100205e-05, "loss": 0.174, "step": 1383 }, { "epoch": 5.536, "grad_norm": 1.396598219871521, "learning_rate": 3.625250501002005e-05, "loss": 0.1454, "step": 1384 }, { "epoch": 5.54, "grad_norm": 1.0782043933868408, "learning_rate": 3.624248496993988e-05, "loss": 0.119, "step": 1385 }, { "epoch": 5.5440000000000005, "grad_norm": 1.3274977207183838, "learning_rate": 3.623246492985972e-05, "loss": 0.1369, "step": 1386 }, { "epoch": 5.548, "grad_norm": 1.3669708967208862, "learning_rate": 3.6222444889779564e-05, "loss": 0.1481, "step": 1387 }, { "epoch": 5.552, "grad_norm": 1.3589403629302979, "learning_rate": 3.62124248496994e-05, "loss": 0.2093, "step": 1388 }, { "epoch": 5.556, "grad_norm": 1.4127261638641357, "learning_rate": 3.620240480961924e-05, "loss": 0.1664, "step": 1389 }, { "epoch": 5.5600000000000005, "grad_norm": 1.4620351791381836, "learning_rate": 3.619238476953908e-05, "loss": 0.1434, "step": 1390 }, { "epoch": 5.564, "grad_norm": 1.4212642908096313, "learning_rate": 3.6182364729458915e-05, "loss": 0.161, "step": 1391 }, { "epoch": 5.568, "grad_norm": 1.4831411838531494, "learning_rate": 3.6172344689378757e-05, "loss": 0.17, "step": 1392 }, { "epoch": 5.572, "grad_norm": 1.560911774635315, "learning_rate": 3.61623246492986e-05, "loss": 0.1737, "step": 1393 }, { "epoch": 5.576, "grad_norm": 1.5256128311157227, "learning_rate": 3.615230460921844e-05, "loss": 0.1609, "step": 1394 }, { "epoch": 
5.58, "grad_norm": 1.407828688621521, "learning_rate": 3.614228456913828e-05, "loss": 0.1925, "step": 1395 }, { "epoch": 5.584, "grad_norm": 1.3980634212493896, "learning_rate": 3.613226452905812e-05, "loss": 0.1535, "step": 1396 }, { "epoch": 5.588, "grad_norm": 1.4387320280075073, "learning_rate": 3.6122244488977956e-05, "loss": 0.1489, "step": 1397 }, { "epoch": 5.592, "grad_norm": 1.5303181409835815, "learning_rate": 3.61122244488978e-05, "loss": 0.1925, "step": 1398 }, { "epoch": 5.596, "grad_norm": 1.5387216806411743, "learning_rate": 3.610220440881764e-05, "loss": 0.1709, "step": 1399 }, { "epoch": 5.6, "grad_norm": 0.9596248865127563, "learning_rate": 3.609218436873747e-05, "loss": 0.0974, "step": 1400 }, { "epoch": 5.604, "grad_norm": 1.4933253526687622, "learning_rate": 3.6082164328657315e-05, "loss": 0.2206, "step": 1401 }, { "epoch": 5.608, "grad_norm": 1.3961461782455444, "learning_rate": 3.6072144288577156e-05, "loss": 0.1859, "step": 1402 }, { "epoch": 5.612, "grad_norm": 1.2029403448104858, "learning_rate": 3.6062124248497e-05, "loss": 0.1289, "step": 1403 }, { "epoch": 5.616, "grad_norm": 1.345209002494812, "learning_rate": 3.605210420841684e-05, "loss": 0.1448, "step": 1404 }, { "epoch": 5.62, "grad_norm": 1.1782201528549194, "learning_rate": 3.604208416833667e-05, "loss": 0.1294, "step": 1405 }, { "epoch": 5.624, "grad_norm": 1.461083173751831, "learning_rate": 3.6032064128256514e-05, "loss": 0.1534, "step": 1406 }, { "epoch": 5.628, "grad_norm": 1.2814429998397827, "learning_rate": 3.6022044088176356e-05, "loss": 0.1259, "step": 1407 }, { "epoch": 5.632, "grad_norm": 1.565420150756836, "learning_rate": 3.60120240480962e-05, "loss": 0.1861, "step": 1408 }, { "epoch": 5.636, "grad_norm": 1.0360498428344727, "learning_rate": 3.600200400801603e-05, "loss": 0.0836, "step": 1409 }, { "epoch": 5.64, "grad_norm": 1.4248356819152832, "learning_rate": 3.599198396793587e-05, "loss": 0.1988, "step": 1410 }, { "epoch": 5.644, "grad_norm": 1.5098896026611328, 
"learning_rate": 3.5981963927855714e-05, "loss": 0.1708, "step": 1411 }, { "epoch": 5.648, "grad_norm": 1.4749072790145874, "learning_rate": 3.5971943887775555e-05, "loss": 0.1791, "step": 1412 }, { "epoch": 5.652, "grad_norm": 1.316433072090149, "learning_rate": 3.5961923847695396e-05, "loss": 0.1425, "step": 1413 }, { "epoch": 5.656, "grad_norm": 1.5112534761428833, "learning_rate": 3.595190380761523e-05, "loss": 0.1417, "step": 1414 }, { "epoch": 5.66, "grad_norm": 1.4549270868301392, "learning_rate": 3.594188376753507e-05, "loss": 0.1756, "step": 1415 }, { "epoch": 5.664, "grad_norm": 1.5447784662246704, "learning_rate": 3.5931863727454914e-05, "loss": 0.1416, "step": 1416 }, { "epoch": 5.668, "grad_norm": 1.4175710678100586, "learning_rate": 3.592184368737475e-05, "loss": 0.1437, "step": 1417 }, { "epoch": 5.672, "grad_norm": 1.4404096603393555, "learning_rate": 3.591182364729459e-05, "loss": 0.1456, "step": 1418 }, { "epoch": 5.676, "grad_norm": 1.4998631477355957, "learning_rate": 3.590180360721443e-05, "loss": 0.1666, "step": 1419 }, { "epoch": 5.68, "grad_norm": 1.4109253883361816, "learning_rate": 3.5891783567134265e-05, "loss": 0.1846, "step": 1420 }, { "epoch": 5.684, "grad_norm": 1.2808420658111572, "learning_rate": 3.5881763527054106e-05, "loss": 0.1424, "step": 1421 }, { "epoch": 5.688, "grad_norm": 1.439148187637329, "learning_rate": 3.5871743486973954e-05, "loss": 0.153, "step": 1422 }, { "epoch": 5.692, "grad_norm": 1.5616912841796875, "learning_rate": 3.586172344689379e-05, "loss": 0.1863, "step": 1423 }, { "epoch": 5.696, "grad_norm": 1.353434681892395, "learning_rate": 3.585170340681363e-05, "loss": 0.158, "step": 1424 }, { "epoch": 5.7, "grad_norm": 1.3511037826538086, "learning_rate": 3.584168336673347e-05, "loss": 0.162, "step": 1425 }, { "epoch": 5.704, "grad_norm": 1.4532557725906372, "learning_rate": 3.5831663326653306e-05, "loss": 0.1671, "step": 1426 }, { "epoch": 5.708, "grad_norm": 1.4470570087432861, "learning_rate": 
3.582164328657315e-05, "loss": 0.1564, "step": 1427 }, { "epoch": 5.712, "grad_norm": 1.4751670360565186, "learning_rate": 3.581162324649299e-05, "loss": 0.1445, "step": 1428 }, { "epoch": 5.716, "grad_norm": 1.3708182573318481, "learning_rate": 3.580160320641282e-05, "loss": 0.1708, "step": 1429 }, { "epoch": 5.72, "grad_norm": 1.4434928894042969, "learning_rate": 3.5791583166332664e-05, "loss": 0.1691, "step": 1430 }, { "epoch": 5.724, "grad_norm": 1.0687934160232544, "learning_rate": 3.5781563126252506e-05, "loss": 0.1061, "step": 1431 }, { "epoch": 5.728, "grad_norm": 1.4179203510284424, "learning_rate": 3.577154308617235e-05, "loss": 0.174, "step": 1432 }, { "epoch": 5.732, "grad_norm": 1.318701148033142, "learning_rate": 3.576152304609219e-05, "loss": 0.1496, "step": 1433 }, { "epoch": 5.736, "grad_norm": 1.4118852615356445, "learning_rate": 3.575150300601203e-05, "loss": 0.1978, "step": 1434 }, { "epoch": 5.74, "grad_norm": 1.2677303552627563, "learning_rate": 3.5741482965931864e-05, "loss": 0.1672, "step": 1435 }, { "epoch": 5.744, "grad_norm": 1.8141050338745117, "learning_rate": 3.5731462925851705e-05, "loss": 0.1801, "step": 1436 }, { "epoch": 5.748, "grad_norm": 1.5993258953094482, "learning_rate": 3.5721442885771547e-05, "loss": 0.1583, "step": 1437 }, { "epoch": 5.752, "grad_norm": 1.4515212774276733, "learning_rate": 3.571142284569138e-05, "loss": 0.2014, "step": 1438 }, { "epoch": 5.756, "grad_norm": 1.3378204107284546, "learning_rate": 3.570140280561122e-05, "loss": 0.1366, "step": 1439 }, { "epoch": 5.76, "grad_norm": 1.3408321142196655, "learning_rate": 3.5691382765531064e-05, "loss": 0.1301, "step": 1440 }, { "epoch": 5.764, "grad_norm": 1.4962408542633057, "learning_rate": 3.56813627254509e-05, "loss": 0.1385, "step": 1441 }, { "epoch": 5.768, "grad_norm": 1.589575171470642, "learning_rate": 3.5671342685370746e-05, "loss": 0.1777, "step": 1442 }, { "epoch": 5.772, "grad_norm": 1.5674686431884766, "learning_rate": 3.566132264529059e-05, "loss": 
0.166, "step": 1443 }, { "epoch": 5.776, "grad_norm": 1.5545917749404907, "learning_rate": 3.565130260521042e-05, "loss": 0.1775, "step": 1444 }, { "epoch": 5.78, "grad_norm": 1.4211684465408325, "learning_rate": 3.564128256513026e-05, "loss": 0.149, "step": 1445 }, { "epoch": 5.784, "grad_norm": 1.6268999576568604, "learning_rate": 3.5631262525050105e-05, "loss": 0.1587, "step": 1446 }, { "epoch": 5.788, "grad_norm": 1.6301075220108032, "learning_rate": 3.562124248496994e-05, "loss": 0.1824, "step": 1447 }, { "epoch": 5.792, "grad_norm": 1.3886116743087769, "learning_rate": 3.561122244488978e-05, "loss": 0.1717, "step": 1448 }, { "epoch": 5.796, "grad_norm": 1.5212382078170776, "learning_rate": 3.560120240480962e-05, "loss": 0.1772, "step": 1449 }, { "epoch": 5.8, "grad_norm": 1.4963605403900146, "learning_rate": 3.5591182364729456e-05, "loss": 0.1917, "step": 1450 }, { "epoch": 5.804, "grad_norm": 1.4401335716247559, "learning_rate": 3.55811623246493e-05, "loss": 0.1619, "step": 1451 }, { "epoch": 5.808, "grad_norm": 1.2470486164093018, "learning_rate": 3.5571142284569145e-05, "loss": 0.1606, "step": 1452 }, { "epoch": 5.812, "grad_norm": 1.4720518589019775, "learning_rate": 3.556112224448898e-05, "loss": 0.1735, "step": 1453 }, { "epoch": 5.816, "grad_norm": 1.3243999481201172, "learning_rate": 3.555110220440882e-05, "loss": 0.172, "step": 1454 }, { "epoch": 5.82, "grad_norm": 1.3949337005615234, "learning_rate": 3.554108216432866e-05, "loss": 0.1698, "step": 1455 }, { "epoch": 5.824, "grad_norm": 1.554047703742981, "learning_rate": 3.55310621242485e-05, "loss": 0.1883, "step": 1456 }, { "epoch": 5.828, "grad_norm": 1.4399428367614746, "learning_rate": 3.552104208416834e-05, "loss": 0.1341, "step": 1457 }, { "epoch": 5.832, "grad_norm": 1.5571656227111816, "learning_rate": 3.551102204408818e-05, "loss": 0.1765, "step": 1458 }, { "epoch": 5.836, "grad_norm": 1.5305334329605103, "learning_rate": 3.5501002004008014e-05, "loss": 0.1873, "step": 1459 }, { "epoch": 
5.84, "grad_norm": 1.5004547834396362, "learning_rate": 3.5490981963927855e-05, "loss": 0.1567, "step": 1460 }, { "epoch": 5.844, "grad_norm": 1.336523413658142, "learning_rate": 3.54809619238477e-05, "loss": 0.1511, "step": 1461 }, { "epoch": 5.848, "grad_norm": 1.7299772500991821, "learning_rate": 3.547094188376754e-05, "loss": 0.229, "step": 1462 }, { "epoch": 5.852, "grad_norm": 1.3854844570159912, "learning_rate": 3.546092184368738e-05, "loss": 0.1588, "step": 1463 }, { "epoch": 5.856, "grad_norm": 1.587310552597046, "learning_rate": 3.5450901803607214e-05, "loss": 0.1463, "step": 1464 }, { "epoch": 5.86, "grad_norm": 1.3696925640106201, "learning_rate": 3.5440881763527055e-05, "loss": 0.1679, "step": 1465 }, { "epoch": 5.864, "grad_norm": 0.932451605796814, "learning_rate": 3.5430861723446896e-05, "loss": 0.1001, "step": 1466 }, { "epoch": 5.868, "grad_norm": 1.4562475681304932, "learning_rate": 3.542084168336674e-05, "loss": 0.1691, "step": 1467 }, { "epoch": 5.872, "grad_norm": 1.3682665824890137, "learning_rate": 3.541082164328657e-05, "loss": 0.1666, "step": 1468 }, { "epoch": 5.876, "grad_norm": 1.3185116052627563, "learning_rate": 3.5400801603206413e-05, "loss": 0.1698, "step": 1469 }, { "epoch": 5.88, "grad_norm": 1.5660794973373413, "learning_rate": 3.5390781563126255e-05, "loss": 0.1482, "step": 1470 }, { "epoch": 5.884, "grad_norm": 1.449788212776184, "learning_rate": 3.5380761523046096e-05, "loss": 0.1522, "step": 1471 }, { "epoch": 5.888, "grad_norm": 1.434978723526001, "learning_rate": 3.537074148296594e-05, "loss": 0.1738, "step": 1472 }, { "epoch": 5.892, "grad_norm": 1.4906777143478394, "learning_rate": 3.536072144288577e-05, "loss": 0.1801, "step": 1473 }, { "epoch": 5.896, "grad_norm": 1.4387478828430176, "learning_rate": 3.535070140280561e-05, "loss": 0.1582, "step": 1474 }, { "epoch": 5.9, "grad_norm": 1.5130301713943481, "learning_rate": 3.5340681362725454e-05, "loss": 0.1876, "step": 1475 }, { "epoch": 5.904, "grad_norm": 
1.44967782497406, "learning_rate": 3.533066132264529e-05, "loss": 0.1782, "step": 1476 }, { "epoch": 5.908, "grad_norm": 1.5330886840820312, "learning_rate": 3.532064128256513e-05, "loss": 0.1584, "step": 1477 }, { "epoch": 5.912, "grad_norm": 1.5080928802490234, "learning_rate": 3.531062124248497e-05, "loss": 0.1575, "step": 1478 }, { "epoch": 5.916, "grad_norm": 1.455888271331787, "learning_rate": 3.530060120240481e-05, "loss": 0.1837, "step": 1479 }, { "epoch": 5.92, "grad_norm": 1.5638760328292847, "learning_rate": 3.529058116232465e-05, "loss": 0.1553, "step": 1480 }, { "epoch": 5.924, "grad_norm": 1.487268328666687, "learning_rate": 3.5280561122244495e-05, "loss": 0.1817, "step": 1481 }, { "epoch": 5.928, "grad_norm": 1.4699187278747559, "learning_rate": 3.527054108216433e-05, "loss": 0.1571, "step": 1482 }, { "epoch": 5.932, "grad_norm": 1.4532089233398438, "learning_rate": 3.526052104208417e-05, "loss": 0.1434, "step": 1483 }, { "epoch": 5.936, "grad_norm": 1.591044306755066, "learning_rate": 3.525050100200401e-05, "loss": 0.1538, "step": 1484 }, { "epoch": 5.9399999999999995, "grad_norm": 1.4258755445480347, "learning_rate": 3.524048096192385e-05, "loss": 0.1606, "step": 1485 }, { "epoch": 5.944, "grad_norm": 1.4429696798324585, "learning_rate": 3.523046092184369e-05, "loss": 0.2038, "step": 1486 }, { "epoch": 5.948, "grad_norm": 1.4724392890930176, "learning_rate": 3.522044088176353e-05, "loss": 0.1644, "step": 1487 }, { "epoch": 5.952, "grad_norm": 1.488022804260254, "learning_rate": 3.5210420841683364e-05, "loss": 0.1528, "step": 1488 }, { "epoch": 5.9559999999999995, "grad_norm": 1.5905804634094238, "learning_rate": 3.5200400801603205e-05, "loss": 0.2294, "step": 1489 }, { "epoch": 5.96, "grad_norm": 1.4152902364730835, "learning_rate": 3.5190380761523046e-05, "loss": 0.2096, "step": 1490 }, { "epoch": 5.964, "grad_norm": 1.3682752847671509, "learning_rate": 3.518036072144289e-05, "loss": 0.1684, "step": 1491 }, { "epoch": 5.968, "grad_norm": 
1.4440537691116333, "learning_rate": 3.517034068136273e-05, "loss": 0.1714, "step": 1492 }, { "epoch": 5.9719999999999995, "grad_norm": 1.4367694854736328, "learning_rate": 3.516032064128257e-05, "loss": 0.1558, "step": 1493 }, { "epoch": 5.976, "grad_norm": 1.351462483406067, "learning_rate": 3.5150300601202405e-05, "loss": 0.167, "step": 1494 }, { "epoch": 5.98, "grad_norm": 1.5862767696380615, "learning_rate": 3.5140280561122246e-05, "loss": 0.2089, "step": 1495 }, { "epoch": 5.984, "grad_norm": 1.0578093528747559, "learning_rate": 3.513026052104209e-05, "loss": 0.1061, "step": 1496 }, { "epoch": 5.9879999999999995, "grad_norm": 1.7430568933486938, "learning_rate": 3.512024048096192e-05, "loss": 0.1775, "step": 1497 }, { "epoch": 5.992, "grad_norm": 1.3868354558944702, "learning_rate": 3.511022044088176e-05, "loss": 0.1627, "step": 1498 }, { "epoch": 5.996, "grad_norm": 1.4972561597824097, "learning_rate": 3.5100200400801604e-05, "loss": 0.1617, "step": 1499 }, { "epoch": 6.0, "grad_norm": 1.5004100799560547, "learning_rate": 3.509018036072144e-05, "loss": 0.1657, "step": 1500 }, { "epoch": 6.004, "grad_norm": 1.2470020055770874, "learning_rate": 3.508016032064129e-05, "loss": 0.1026, "step": 1501 }, { "epoch": 6.008, "grad_norm": 1.0558257102966309, "learning_rate": 3.507014028056113e-05, "loss": 0.0933, "step": 1502 }, { "epoch": 6.012, "grad_norm": 1.027756690979004, "learning_rate": 3.506012024048096e-05, "loss": 0.0819, "step": 1503 }, { "epoch": 6.016, "grad_norm": 1.1106997728347778, "learning_rate": 3.5050100200400804e-05, "loss": 0.0955, "step": 1504 }, { "epoch": 6.02, "grad_norm": 1.2579761743545532, "learning_rate": 3.5040080160320645e-05, "loss": 0.1063, "step": 1505 }, { "epoch": 6.024, "grad_norm": 1.3087098598480225, "learning_rate": 3.503006012024048e-05, "loss": 0.0956, "step": 1506 }, { "epoch": 6.028, "grad_norm": 1.2753757238388062, "learning_rate": 3.502004008016032e-05, "loss": 0.0847, "step": 1507 }, { "epoch": 6.032, "grad_norm": 
0.8829823732376099, "learning_rate": 3.501002004008016e-05, "loss": 0.0567, "step": 1508 }, { "epoch": 6.036, "grad_norm": 1.3138492107391357, "learning_rate": 3.5e-05, "loss": 0.1009, "step": 1509 }, { "epoch": 6.04, "grad_norm": 1.252717137336731, "learning_rate": 3.4989979959919845e-05, "loss": 0.0815, "step": 1510 }, { "epoch": 6.044, "grad_norm": 1.5361855030059814, "learning_rate": 3.4979959919839686e-05, "loss": 0.0982, "step": 1511 }, { "epoch": 6.048, "grad_norm": 1.8114335536956787, "learning_rate": 3.496993987975952e-05, "loss": 0.107, "step": 1512 }, { "epoch": 6.052, "grad_norm": 1.430020809173584, "learning_rate": 3.495991983967936e-05, "loss": 0.0926, "step": 1513 }, { "epoch": 6.056, "grad_norm": 1.3625584840774536, "learning_rate": 3.49498997995992e-05, "loss": 0.0849, "step": 1514 }, { "epoch": 6.06, "grad_norm": 1.577380895614624, "learning_rate": 3.493987975951904e-05, "loss": 0.1411, "step": 1515 }, { "epoch": 6.064, "grad_norm": 1.2233928442001343, "learning_rate": 3.492985971943888e-05, "loss": 0.0837, "step": 1516 }, { "epoch": 6.068, "grad_norm": 1.4746198654174805, "learning_rate": 3.491983967935872e-05, "loss": 0.091, "step": 1517 }, { "epoch": 6.072, "grad_norm": 1.5226060152053833, "learning_rate": 3.4909819639278555e-05, "loss": 0.1285, "step": 1518 }, { "epoch": 6.076, "grad_norm": 1.3142298460006714, "learning_rate": 3.4899799599198396e-05, "loss": 0.105, "step": 1519 }, { "epoch": 6.08, "grad_norm": 1.3603146076202393, "learning_rate": 3.488977955911824e-05, "loss": 0.0849, "step": 1520 }, { "epoch": 6.084, "grad_norm": 1.2942161560058594, "learning_rate": 3.487975951903808e-05, "loss": 0.0896, "step": 1521 }, { "epoch": 6.088, "grad_norm": 1.6908429861068726, "learning_rate": 3.486973947895792e-05, "loss": 0.1167, "step": 1522 }, { "epoch": 6.092, "grad_norm": 1.1075533628463745, "learning_rate": 3.485971943887776e-05, "loss": 0.0823, "step": 1523 }, { "epoch": 6.096, "grad_norm": 1.4377602338790894, "learning_rate": 
3.4849699398797596e-05, "loss": 0.0825, "step": 1524 }, { "epoch": 6.1, "grad_norm": 1.474347472190857, "learning_rate": 3.483967935871744e-05, "loss": 0.1007, "step": 1525 }, { "epoch": 6.104, "grad_norm": 1.4804532527923584, "learning_rate": 3.482965931863728e-05, "loss": 0.0888, "step": 1526 }, { "epoch": 6.108, "grad_norm": 1.413860559463501, "learning_rate": 3.481963927855711e-05, "loss": 0.0858, "step": 1527 }, { "epoch": 6.112, "grad_norm": 1.3580888509750366, "learning_rate": 3.4809619238476954e-05, "loss": 0.0751, "step": 1528 }, { "epoch": 6.116, "grad_norm": 1.015170693397522, "learning_rate": 3.4799599198396795e-05, "loss": 0.0622, "step": 1529 }, { "epoch": 6.12, "grad_norm": 1.4240541458129883, "learning_rate": 3.478957915831664e-05, "loss": 0.0968, "step": 1530 }, { "epoch": 6.124, "grad_norm": 1.4235905408859253, "learning_rate": 3.477955911823648e-05, "loss": 0.0997, "step": 1531 }, { "epoch": 6.128, "grad_norm": 1.6935840845108032, "learning_rate": 3.476953907815631e-05, "loss": 0.1425, "step": 1532 }, { "epoch": 6.132, "grad_norm": 1.4013699293136597, "learning_rate": 3.4759519038076154e-05, "loss": 0.0898, "step": 1533 }, { "epoch": 6.136, "grad_norm": 1.4652191400527954, "learning_rate": 3.4749498997995995e-05, "loss": 0.0968, "step": 1534 }, { "epoch": 6.14, "grad_norm": 1.4197686910629272, "learning_rate": 3.4739478957915836e-05, "loss": 0.1328, "step": 1535 }, { "epoch": 6.144, "grad_norm": 1.2468719482421875, "learning_rate": 3.472945891783567e-05, "loss": 0.0874, "step": 1536 }, { "epoch": 6.148, "grad_norm": 1.5248289108276367, "learning_rate": 3.471943887775551e-05, "loss": 0.0975, "step": 1537 }, { "epoch": 6.152, "grad_norm": 1.3224518299102783, "learning_rate": 3.4709418837675353e-05, "loss": 0.0761, "step": 1538 }, { "epoch": 6.156, "grad_norm": 1.4312621355056763, "learning_rate": 3.469939879759519e-05, "loss": 0.0833, "step": 1539 }, { "epoch": 6.16, "grad_norm": 1.3523186445236206, "learning_rate": 3.4689378757515036e-05, "loss": 
0.0912, "step": 1540 }, { "epoch": 6.164, "grad_norm": 1.4507619142532349, "learning_rate": 3.467935871743487e-05, "loss": 0.0977, "step": 1541 }, { "epoch": 6.168, "grad_norm": 1.0727993249893188, "learning_rate": 3.466933867735471e-05, "loss": 0.0608, "step": 1542 }, { "epoch": 6.172, "grad_norm": 1.7440407276153564, "learning_rate": 3.465931863727455e-05, "loss": 0.1061, "step": 1543 }, { "epoch": 6.176, "grad_norm": 1.238020420074463, "learning_rate": 3.464929859719439e-05, "loss": 0.0703, "step": 1544 }, { "epoch": 6.18, "grad_norm": 1.3099385499954224, "learning_rate": 3.463927855711423e-05, "loss": 0.1048, "step": 1545 }, { "epoch": 6.184, "grad_norm": 1.4899845123291016, "learning_rate": 3.462925851703407e-05, "loss": 0.1037, "step": 1546 }, { "epoch": 6.188, "grad_norm": 1.4137481451034546, "learning_rate": 3.4619238476953905e-05, "loss": 0.1138, "step": 1547 }, { "epoch": 6.192, "grad_norm": 1.402271032333374, "learning_rate": 3.4609218436873746e-05, "loss": 0.1069, "step": 1548 }, { "epoch": 6.196, "grad_norm": 1.4417635202407837, "learning_rate": 3.459919839679359e-05, "loss": 0.1024, "step": 1549 }, { "epoch": 6.2, "grad_norm": 1.2592614889144897, "learning_rate": 3.458917835671343e-05, "loss": 0.1002, "step": 1550 }, { "epoch": 6.204, "grad_norm": 1.6195659637451172, "learning_rate": 3.457915831663327e-05, "loss": 0.1135, "step": 1551 }, { "epoch": 6.208, "grad_norm": 1.3179293870925903, "learning_rate": 3.456913827655311e-05, "loss": 0.1026, "step": 1552 }, { "epoch": 6.212, "grad_norm": 1.3877010345458984, "learning_rate": 3.4559118236472946e-05, "loss": 0.0814, "step": 1553 }, { "epoch": 6.216, "grad_norm": 1.2459914684295654, "learning_rate": 3.454909819639279e-05, "loss": 0.0978, "step": 1554 }, { "epoch": 6.22, "grad_norm": 1.368811011314392, "learning_rate": 3.453907815631263e-05, "loss": 0.0927, "step": 1555 }, { "epoch": 6.224, "grad_norm": 1.629564642906189, "learning_rate": 3.452905811623246e-05, "loss": 0.1302, "step": 1556 }, { "epoch": 
6.228, "grad_norm": 1.299174189567566, "learning_rate": 3.4519038076152304e-05, "loss": 0.0974, "step": 1557 }, { "epoch": 6.232, "grad_norm": 1.39913809299469, "learning_rate": 3.4509018036072145e-05, "loss": 0.1, "step": 1558 }, { "epoch": 6.236, "grad_norm": 1.5463366508483887, "learning_rate": 3.449899799599198e-05, "loss": 0.1092, "step": 1559 }, { "epoch": 6.24, "grad_norm": 1.416552186012268, "learning_rate": 3.448897795591183e-05, "loss": 0.1247, "step": 1560 }, { "epoch": 6.244, "grad_norm": 1.3216407299041748, "learning_rate": 3.447895791583167e-05, "loss": 0.1219, "step": 1561 }, { "epoch": 6.248, "grad_norm": 1.2443658113479614, "learning_rate": 3.4468937875751504e-05, "loss": 0.085, "step": 1562 }, { "epoch": 6.252, "grad_norm": 1.3868099451065063, "learning_rate": 3.4458917835671345e-05, "loss": 0.0941, "step": 1563 }, { "epoch": 6.256, "grad_norm": 1.5421968698501587, "learning_rate": 3.4448897795591186e-05, "loss": 0.1034, "step": 1564 }, { "epoch": 6.26, "grad_norm": 1.3406178951263428, "learning_rate": 3.443887775551102e-05, "loss": 0.0902, "step": 1565 }, { "epoch": 6.264, "grad_norm": 1.5938392877578735, "learning_rate": 3.442885771543086e-05, "loss": 0.1159, "step": 1566 }, { "epoch": 6.268, "grad_norm": 1.5965449810028076, "learning_rate": 3.44188376753507e-05, "loss": 0.1174, "step": 1567 }, { "epoch": 6.272, "grad_norm": 1.4595623016357422, "learning_rate": 3.440881763527054e-05, "loss": 0.1078, "step": 1568 }, { "epoch": 6.276, "grad_norm": 1.6079479455947876, "learning_rate": 3.4398797595190386e-05, "loss": 0.1158, "step": 1569 }, { "epoch": 6.28, "grad_norm": 1.078923225402832, "learning_rate": 3.438877755511023e-05, "loss": 0.0722, "step": 1570 }, { "epoch": 6.284, "grad_norm": 1.4052515029907227, "learning_rate": 3.437875751503006e-05, "loss": 0.0988, "step": 1571 }, { "epoch": 6.288, "grad_norm": 1.262578010559082, "learning_rate": 3.43687374749499e-05, "loss": 0.0981, "step": 1572 }, { "epoch": 6.292, "grad_norm": 1.333280324935913, 
"learning_rate": 3.4358717434869744e-05, "loss": 0.11, "step": 1573 }, { "epoch": 6.296, "grad_norm": 1.4792897701263428, "learning_rate": 3.434869739478958e-05, "loss": 0.0854, "step": 1574 }, { "epoch": 6.3, "grad_norm": 1.2670187950134277, "learning_rate": 3.433867735470942e-05, "loss": 0.0929, "step": 1575 }, { "epoch": 6.304, "grad_norm": 1.4768058061599731, "learning_rate": 3.432865731462926e-05, "loss": 0.0961, "step": 1576 }, { "epoch": 6.308, "grad_norm": 1.6261510848999023, "learning_rate": 3.4318637274549096e-05, "loss": 0.1069, "step": 1577 }, { "epoch": 6.312, "grad_norm": 1.4705723524093628, "learning_rate": 3.430861723446894e-05, "loss": 0.0841, "step": 1578 }, { "epoch": 6.316, "grad_norm": 1.6650081872940063, "learning_rate": 3.4298597194388785e-05, "loss": 0.1196, "step": 1579 }, { "epoch": 6.32, "grad_norm": 1.4160841703414917, "learning_rate": 3.428857715430862e-05, "loss": 0.1067, "step": 1580 }, { "epoch": 6.324, "grad_norm": 0.9616687297821045, "learning_rate": 3.427855711422846e-05, "loss": 0.055, "step": 1581 }, { "epoch": 6.328, "grad_norm": 1.5261203050613403, "learning_rate": 3.42685370741483e-05, "loss": 0.115, "step": 1582 }, { "epoch": 6.332, "grad_norm": 1.568793773651123, "learning_rate": 3.425851703406814e-05, "loss": 0.1151, "step": 1583 }, { "epoch": 6.336, "grad_norm": 1.351989507675171, "learning_rate": 3.424849699398798e-05, "loss": 0.1155, "step": 1584 }, { "epoch": 6.34, "grad_norm": 1.3044042587280273, "learning_rate": 3.423847695390782e-05, "loss": 0.0911, "step": 1585 }, { "epoch": 6.344, "grad_norm": 1.4144433736801147, "learning_rate": 3.4228456913827654e-05, "loss": 0.1016, "step": 1586 }, { "epoch": 6.348, "grad_norm": 2.237967014312744, "learning_rate": 3.4218436873747495e-05, "loss": 0.0831, "step": 1587 }, { "epoch": 6.352, "grad_norm": 1.522283911705017, "learning_rate": 3.4208416833667336e-05, "loss": 0.1045, "step": 1588 }, { "epoch": 6.356, "grad_norm": 1.3800389766693115, "learning_rate": 
3.419839679358718e-05, "loss": 0.1121, "step": 1589 }, { "epoch": 6.36, "grad_norm": 0.9566818475723267, "learning_rate": 3.418837675350702e-05, "loss": 0.0592, "step": 1590 }, { "epoch": 6.364, "grad_norm": 1.6034566164016724, "learning_rate": 3.417835671342685e-05, "loss": 0.097, "step": 1591 }, { "epoch": 6.368, "grad_norm": 1.4005167484283447, "learning_rate": 3.4168336673346695e-05, "loss": 0.0934, "step": 1592 }, { "epoch": 6.372, "grad_norm": 1.4644639492034912, "learning_rate": 3.4158316633266536e-05, "loss": 0.0891, "step": 1593 }, { "epoch": 6.376, "grad_norm": 1.265230417251587, "learning_rate": 3.414829659318638e-05, "loss": 0.1015, "step": 1594 }, { "epoch": 6.38, "grad_norm": 1.375074863433838, "learning_rate": 3.413827655310621e-05, "loss": 0.1105, "step": 1595 }, { "epoch": 6.384, "grad_norm": 1.3185374736785889, "learning_rate": 3.412825651302605e-05, "loss": 0.0965, "step": 1596 }, { "epoch": 6.388, "grad_norm": 1.3281744718551636, "learning_rate": 3.4118236472945894e-05, "loss": 0.0881, "step": 1597 }, { "epoch": 6.392, "grad_norm": 1.5773831605911255, "learning_rate": 3.410821643286573e-05, "loss": 0.1217, "step": 1598 }, { "epoch": 6.396, "grad_norm": 1.4592585563659668, "learning_rate": 3.409819639278558e-05, "loss": 0.0941, "step": 1599 }, { "epoch": 6.4, "grad_norm": 1.4319229125976562, "learning_rate": 3.408817635270541e-05, "loss": 0.1064, "step": 1600 }, { "epoch": 6.404, "grad_norm": 1.5549246072769165, "learning_rate": 3.407815631262525e-05, "loss": 0.1062, "step": 1601 }, { "epoch": 6.408, "grad_norm": 1.5339570045471191, "learning_rate": 3.4068136272545094e-05, "loss": 0.1068, "step": 1602 }, { "epoch": 6.412, "grad_norm": 1.4197008609771729, "learning_rate": 3.405811623246493e-05, "loss": 0.093, "step": 1603 }, { "epoch": 6.416, "grad_norm": 1.3577507734298706, "learning_rate": 3.404809619238477e-05, "loss": 0.1006, "step": 1604 }, { "epoch": 6.42, "grad_norm": 0.8569574356079102, "learning_rate": 3.403807615230461e-05, "loss": 
0.0439, "step": 1605 }, { "epoch": 6.424, "grad_norm": 1.6528804302215576, "learning_rate": 3.402805611222445e-05, "loss": 0.1212, "step": 1606 }, { "epoch": 6.428, "grad_norm": 1.4956883192062378, "learning_rate": 3.401803607214429e-05, "loss": 0.0922, "step": 1607 }, { "epoch": 6.432, "grad_norm": 1.6123335361480713, "learning_rate": 3.400801603206413e-05, "loss": 0.1055, "step": 1608 }, { "epoch": 6.436, "grad_norm": 1.539689302444458, "learning_rate": 3.399799599198397e-05, "loss": 0.0866, "step": 1609 }, { "epoch": 6.44, "grad_norm": 1.570948600769043, "learning_rate": 3.398797595190381e-05, "loss": 0.1343, "step": 1610 }, { "epoch": 6.444, "grad_norm": 1.6961643695831299, "learning_rate": 3.397795591182365e-05, "loss": 0.1115, "step": 1611 }, { "epoch": 6.448, "grad_norm": 1.4908407926559448, "learning_rate": 3.3967935871743486e-05, "loss": 0.0983, "step": 1612 }, { "epoch": 6.452, "grad_norm": 1.48050057888031, "learning_rate": 3.395791583166333e-05, "loss": 0.0982, "step": 1613 }, { "epoch": 6.456, "grad_norm": 1.5200427770614624, "learning_rate": 3.394789579158317e-05, "loss": 0.095, "step": 1614 }, { "epoch": 6.46, "grad_norm": 1.004806399345398, "learning_rate": 3.3937875751503003e-05, "loss": 0.0564, "step": 1615 }, { "epoch": 6.464, "grad_norm": 1.4128756523132324, "learning_rate": 3.3927855711422845e-05, "loss": 0.1113, "step": 1616 }, { "epoch": 6.468, "grad_norm": 1.1182926893234253, "learning_rate": 3.3917835671342686e-05, "loss": 0.0857, "step": 1617 }, { "epoch": 6.4719999999999995, "grad_norm": 1.3846279382705688, "learning_rate": 3.390781563126252e-05, "loss": 0.0842, "step": 1618 }, { "epoch": 6.476, "grad_norm": 1.4602874517440796, "learning_rate": 3.389779559118237e-05, "loss": 0.0941, "step": 1619 }, { "epoch": 6.48, "grad_norm": 1.6256506443023682, "learning_rate": 3.388777555110221e-05, "loss": 0.1055, "step": 1620 }, { "epoch": 6.484, "grad_norm": 0.9726806282997131, "learning_rate": 3.3877755511022044e-05, "loss": 0.0599, "step": 1621 
}, { "epoch": 6.4879999999999995, "grad_norm": 2.609361410140991, "learning_rate": 3.3867735470941886e-05, "loss": 0.1239, "step": 1622 }, { "epoch": 6.492, "grad_norm": 1.5318516492843628, "learning_rate": 3.385771543086173e-05, "loss": 0.1655, "step": 1623 }, { "epoch": 6.496, "grad_norm": 1.4403146505355835, "learning_rate": 3.384769539078156e-05, "loss": 0.1011, "step": 1624 }, { "epoch": 6.5, "grad_norm": 1.5402027368545532, "learning_rate": 3.38376753507014e-05, "loss": 0.1085, "step": 1625 }, { "epoch": 6.504, "grad_norm": 1.6094437837600708, "learning_rate": 3.3827655310621244e-05, "loss": 0.1067, "step": 1626 }, { "epoch": 6.508, "grad_norm": 1.4926447868347168, "learning_rate": 3.381763527054108e-05, "loss": 0.0976, "step": 1627 }, { "epoch": 6.5120000000000005, "grad_norm": 1.4129664897918701, "learning_rate": 3.3807615230460927e-05, "loss": 0.0947, "step": 1628 }, { "epoch": 6.516, "grad_norm": 1.3668807744979858, "learning_rate": 3.379759519038077e-05, "loss": 0.1382, "step": 1629 }, { "epoch": 6.52, "grad_norm": 1.3848040103912354, "learning_rate": 3.37875751503006e-05, "loss": 0.1103, "step": 1630 }, { "epoch": 6.524, "grad_norm": 1.2794206142425537, "learning_rate": 3.3777555110220444e-05, "loss": 0.0964, "step": 1631 }, { "epoch": 6.5280000000000005, "grad_norm": 1.6137124300003052, "learning_rate": 3.3767535070140285e-05, "loss": 0.1081, "step": 1632 }, { "epoch": 6.532, "grad_norm": 1.6709508895874023, "learning_rate": 3.375751503006012e-05, "loss": 0.1414, "step": 1633 }, { "epoch": 6.536, "grad_norm": 1.5419001579284668, "learning_rate": 3.374749498997996e-05, "loss": 0.1198, "step": 1634 }, { "epoch": 6.54, "grad_norm": 1.83645761013031, "learning_rate": 3.37374749498998e-05, "loss": 0.1293, "step": 1635 }, { "epoch": 6.5440000000000005, "grad_norm": 1.4904134273529053, "learning_rate": 3.3727454909819637e-05, "loss": 0.1015, "step": 1636 }, { "epoch": 6.548, "grad_norm": 1.4517120122909546, "learning_rate": 3.371743486973948e-05, "loss": 
0.0854, "step": 1637 }, { "epoch": 6.552, "grad_norm": 1.6505271196365356, "learning_rate": 3.3707414829659326e-05, "loss": 0.1111, "step": 1638 }, { "epoch": 6.556, "grad_norm": 2.092888116836548, "learning_rate": 3.369739478957916e-05, "loss": 0.0974, "step": 1639 }, { "epoch": 6.5600000000000005, "grad_norm": 1.2148265838623047, "learning_rate": 3.3687374749499e-05, "loss": 0.1113, "step": 1640 }, { "epoch": 6.564, "grad_norm": 1.4497480392456055, "learning_rate": 3.367735470941884e-05, "loss": 0.1082, "step": 1641 }, { "epoch": 6.568, "grad_norm": 1.4580659866333008, "learning_rate": 3.366733466933868e-05, "loss": 0.1145, "step": 1642 }, { "epoch": 6.572, "grad_norm": 1.3077856302261353, "learning_rate": 3.365731462925852e-05, "loss": 0.1039, "step": 1643 }, { "epoch": 6.576, "grad_norm": 1.3982523679733276, "learning_rate": 3.364729458917836e-05, "loss": 0.1041, "step": 1644 }, { "epoch": 6.58, "grad_norm": 1.5847138166427612, "learning_rate": 3.3637274549098195e-05, "loss": 0.1368, "step": 1645 }, { "epoch": 6.584, "grad_norm": 1.542724847793579, "learning_rate": 3.3627254509018036e-05, "loss": 0.1215, "step": 1646 }, { "epoch": 6.588, "grad_norm": 1.2111552953720093, "learning_rate": 3.361723446893788e-05, "loss": 0.0983, "step": 1647 }, { "epoch": 6.592, "grad_norm": 1.5661391019821167, "learning_rate": 3.360721442885772e-05, "loss": 0.1315, "step": 1648 }, { "epoch": 6.596, "grad_norm": 1.6418293714523315, "learning_rate": 3.359719438877756e-05, "loss": 0.1096, "step": 1649 }, { "epoch": 6.6, "grad_norm": 1.4757168292999268, "learning_rate": 3.35871743486974e-05, "loss": 0.1102, "step": 1650 }, { "epoch": 6.604, "grad_norm": 1.40822434425354, "learning_rate": 3.3577154308617235e-05, "loss": 0.0912, "step": 1651 }, { "epoch": 6.608, "grad_norm": 1.426364541053772, "learning_rate": 3.356713426853708e-05, "loss": 0.0913, "step": 1652 }, { "epoch": 6.612, "grad_norm": 1.2223780155181885, "learning_rate": 3.355711422845692e-05, "loss": 0.0945, "step": 1653 }, { 
"epoch": 6.616, "grad_norm": 1.4718832969665527, "learning_rate": 3.354709418837675e-05, "loss": 0.1092, "step": 1654 }, { "epoch": 6.62, "grad_norm": 1.6908650398254395, "learning_rate": 3.3537074148296594e-05, "loss": 0.1143, "step": 1655 }, { "epoch": 6.624, "grad_norm": 1.5453354120254517, "learning_rate": 3.3527054108216435e-05, "loss": 0.101, "step": 1656 }, { "epoch": 6.628, "grad_norm": 1.1836323738098145, "learning_rate": 3.351703406813627e-05, "loss": 0.0705, "step": 1657 }, { "epoch": 6.632, "grad_norm": 1.5613477230072021, "learning_rate": 3.350701402805612e-05, "loss": 0.109, "step": 1658 }, { "epoch": 6.636, "grad_norm": 1.5052834749221802, "learning_rate": 3.349699398797595e-05, "loss": 0.1057, "step": 1659 }, { "epoch": 6.64, "grad_norm": 1.1341853141784668, "learning_rate": 3.3486973947895793e-05, "loss": 0.0539, "step": 1660 }, { "epoch": 6.644, "grad_norm": 1.6815075874328613, "learning_rate": 3.3476953907815635e-05, "loss": 0.1237, "step": 1661 }, { "epoch": 6.648, "grad_norm": 1.5323514938354492, "learning_rate": 3.3466933867735476e-05, "loss": 0.0888, "step": 1662 }, { "epoch": 6.652, "grad_norm": 1.7266521453857422, "learning_rate": 3.345691382765531e-05, "loss": 0.1308, "step": 1663 }, { "epoch": 6.656, "grad_norm": 1.6895675659179688, "learning_rate": 3.344689378757515e-05, "loss": 0.1054, "step": 1664 }, { "epoch": 6.66, "grad_norm": 1.456880807876587, "learning_rate": 3.343687374749499e-05, "loss": 0.1082, "step": 1665 }, { "epoch": 6.664, "grad_norm": 1.5108088254928589, "learning_rate": 3.342685370741483e-05, "loss": 0.1111, "step": 1666 }, { "epoch": 6.668, "grad_norm": 1.3145025968551636, "learning_rate": 3.341683366733467e-05, "loss": 0.096, "step": 1667 }, { "epoch": 6.672, "grad_norm": 1.4500359296798706, "learning_rate": 3.340681362725451e-05, "loss": 0.0917, "step": 1668 }, { "epoch": 6.676, "grad_norm": 1.5362236499786377, "learning_rate": 3.339679358717435e-05, "loss": 0.1375, "step": 1669 }, { "epoch": 6.68, "grad_norm": 
1.490013599395752, "learning_rate": 3.338677354709419e-05, "loss": 0.1124, "step": 1670 }, { "epoch": 6.684, "grad_norm": 1.6905912160873413, "learning_rate": 3.337675350701403e-05, "loss": 0.1238, "step": 1671 }, { "epoch": 6.688, "grad_norm": 1.4641087055206299, "learning_rate": 3.336673346693387e-05, "loss": 0.1398, "step": 1672 }, { "epoch": 6.692, "grad_norm": 1.4538987874984741, "learning_rate": 3.335671342685371e-05, "loss": 0.1147, "step": 1673 }, { "epoch": 6.696, "grad_norm": 1.6322211027145386, "learning_rate": 3.3346693386773544e-05, "loss": 0.1176, "step": 1674 }, { "epoch": 6.7, "grad_norm": 1.3581397533416748, "learning_rate": 3.3336673346693386e-05, "loss": 0.0936, "step": 1675 }, { "epoch": 6.704, "grad_norm": 1.356339693069458, "learning_rate": 3.332665330661323e-05, "loss": 0.0987, "step": 1676 }, { "epoch": 6.708, "grad_norm": 1.5190908908843994, "learning_rate": 3.331663326653307e-05, "loss": 0.1148, "step": 1677 }, { "epoch": 6.712, "grad_norm": 1.6022852659225464, "learning_rate": 3.330661322645291e-05, "loss": 0.1146, "step": 1678 }, { "epoch": 6.716, "grad_norm": 1.7773820161819458, "learning_rate": 3.329659318637275e-05, "loss": 0.1239, "step": 1679 }, { "epoch": 6.72, "grad_norm": 1.3916137218475342, "learning_rate": 3.3286573146292585e-05, "loss": 0.0931, "step": 1680 }, { "epoch": 6.724, "grad_norm": 1.4522221088409424, "learning_rate": 3.3276553106212426e-05, "loss": 0.1173, "step": 1681 }, { "epoch": 6.728, "grad_norm": 1.3560315370559692, "learning_rate": 3.326653306613227e-05, "loss": 0.1113, "step": 1682 }, { "epoch": 6.732, "grad_norm": 1.4816689491271973, "learning_rate": 3.32565130260521e-05, "loss": 0.1027, "step": 1683 }, { "epoch": 6.736, "grad_norm": 1.453627586364746, "learning_rate": 3.3246492985971944e-05, "loss": 0.1071, "step": 1684 }, { "epoch": 6.74, "grad_norm": 1.4733647108078003, "learning_rate": 3.3236472945891785e-05, "loss": 0.1208, "step": 1685 }, { "epoch": 6.744, "grad_norm": 1.3408794403076172, 
"learning_rate": 3.322645290581162e-05, "loss": 0.1031, "step": 1686 }, { "epoch": 6.748, "grad_norm": 1.5712541341781616, "learning_rate": 3.321643286573147e-05, "loss": 0.1169, "step": 1687 }, { "epoch": 6.752, "grad_norm": 1.2861677408218384, "learning_rate": 3.320641282565131e-05, "loss": 0.0946, "step": 1688 }, { "epoch": 6.756, "grad_norm": 1.5849751234054565, "learning_rate": 3.319639278557114e-05, "loss": 0.0961, "step": 1689 }, { "epoch": 6.76, "grad_norm": 1.468711018562317, "learning_rate": 3.3186372745490984e-05, "loss": 0.1496, "step": 1690 }, { "epoch": 6.764, "grad_norm": 1.6362906694412231, "learning_rate": 3.3176352705410826e-05, "loss": 0.1169, "step": 1691 }, { "epoch": 6.768, "grad_norm": 1.5838176012039185, "learning_rate": 3.316633266533066e-05, "loss": 0.1395, "step": 1692 }, { "epoch": 6.772, "grad_norm": 1.5610008239746094, "learning_rate": 3.31563126252505e-05, "loss": 0.0939, "step": 1693 }, { "epoch": 6.776, "grad_norm": 1.5154725313186646, "learning_rate": 3.314629258517034e-05, "loss": 0.1003, "step": 1694 }, { "epoch": 6.78, "grad_norm": 1.5459731817245483, "learning_rate": 3.313627254509018e-05, "loss": 0.1314, "step": 1695 }, { "epoch": 6.784, "grad_norm": 1.492156744003296, "learning_rate": 3.312625250501002e-05, "loss": 0.1215, "step": 1696 }, { "epoch": 6.788, "grad_norm": 1.5393439531326294, "learning_rate": 3.311623246492987e-05, "loss": 0.1102, "step": 1697 }, { "epoch": 6.792, "grad_norm": 1.3706822395324707, "learning_rate": 3.31062124248497e-05, "loss": 0.1246, "step": 1698 }, { "epoch": 6.796, "grad_norm": 1.3498224020004272, "learning_rate": 3.309619238476954e-05, "loss": 0.1176, "step": 1699 }, { "epoch": 6.8, "grad_norm": 1.5214121341705322, "learning_rate": 3.3086172344689384e-05, "loss": 0.1092, "step": 1700 }, { "epoch": 6.804, "grad_norm": 1.5812162160873413, "learning_rate": 3.307615230460922e-05, "loss": 0.1313, "step": 1701 }, { "epoch": 6.808, "grad_norm": 1.4328696727752686, "learning_rate": 
3.306613226452906e-05, "loss": 0.1188, "step": 1702 }, { "epoch": 6.812, "grad_norm": 1.542195200920105, "learning_rate": 3.30561122244489e-05, "loss": 0.1328, "step": 1703 }, { "epoch": 6.816, "grad_norm": 1.2997112274169922, "learning_rate": 3.3046092184368735e-05, "loss": 0.1103, "step": 1704 }, { "epoch": 6.82, "grad_norm": 1.746984839439392, "learning_rate": 3.303607214428858e-05, "loss": 0.1134, "step": 1705 }, { "epoch": 6.824, "grad_norm": 1.5972540378570557, "learning_rate": 3.302605210420842e-05, "loss": 0.1525, "step": 1706 }, { "epoch": 6.828, "grad_norm": 1.4174765348434448, "learning_rate": 3.301603206412826e-05, "loss": 0.0846, "step": 1707 }, { "epoch": 6.832, "grad_norm": 1.3693246841430664, "learning_rate": 3.30060120240481e-05, "loss": 0.1022, "step": 1708 }, { "epoch": 6.836, "grad_norm": 1.5433459281921387, "learning_rate": 3.299599198396794e-05, "loss": 0.1151, "step": 1709 }, { "epoch": 6.84, "grad_norm": 1.515580415725708, "learning_rate": 3.2985971943887776e-05, "loss": 0.1157, "step": 1710 }, { "epoch": 6.844, "grad_norm": 1.4604809284210205, "learning_rate": 3.297595190380762e-05, "loss": 0.1203, "step": 1711 }, { "epoch": 6.848, "grad_norm": 1.8675997257232666, "learning_rate": 3.296593186372746e-05, "loss": 0.1357, "step": 1712 }, { "epoch": 6.852, "grad_norm": 1.4611036777496338, "learning_rate": 3.295591182364729e-05, "loss": 0.1203, "step": 1713 }, { "epoch": 6.856, "grad_norm": 1.5014901161193848, "learning_rate": 3.2945891783567135e-05, "loss": 0.1151, "step": 1714 }, { "epoch": 6.86, "grad_norm": 1.5301997661590576, "learning_rate": 3.2935871743486976e-05, "loss": 0.1121, "step": 1715 }, { "epoch": 6.864, "grad_norm": 1.5152416229248047, "learning_rate": 3.292585170340681e-05, "loss": 0.0901, "step": 1716 }, { "epoch": 6.868, "grad_norm": 1.5249059200286865, "learning_rate": 3.291583166332666e-05, "loss": 0.1061, "step": 1717 }, { "epoch": 6.872, "grad_norm": 1.729848027229309, "learning_rate": 3.290581162324649e-05, "loss": 
0.1259, "step": 1718 }, { "epoch": 6.876, "grad_norm": 1.510529637336731, "learning_rate": 3.2895791583166334e-05, "loss": 0.11, "step": 1719 }, { "epoch": 6.88, "grad_norm": 1.3543599843978882, "learning_rate": 3.2885771543086176e-05, "loss": 0.1085, "step": 1720 }, { "epoch": 6.884, "grad_norm": 1.5983351469039917, "learning_rate": 3.287575150300602e-05, "loss": 0.144, "step": 1721 }, { "epoch": 6.888, "grad_norm": 1.4449596405029297, "learning_rate": 3.286573146292585e-05, "loss": 0.1132, "step": 1722 }, { "epoch": 6.892, "grad_norm": 1.5098458528518677, "learning_rate": 3.285571142284569e-05, "loss": 0.122, "step": 1723 }, { "epoch": 6.896, "grad_norm": 1.5662916898727417, "learning_rate": 3.2845691382765534e-05, "loss": 0.1112, "step": 1724 }, { "epoch": 6.9, "grad_norm": 1.4259101152420044, "learning_rate": 3.283567134268537e-05, "loss": 0.0982, "step": 1725 }, { "epoch": 6.904, "grad_norm": 1.3410980701446533, "learning_rate": 3.282565130260521e-05, "loss": 0.0893, "step": 1726 }, { "epoch": 6.908, "grad_norm": 1.5725864171981812, "learning_rate": 3.281563126252505e-05, "loss": 0.126, "step": 1727 }, { "epoch": 6.912, "grad_norm": 1.5315643548965454, "learning_rate": 3.280561122244489e-05, "loss": 0.1013, "step": 1728 }, { "epoch": 6.916, "grad_norm": 1.2779252529144287, "learning_rate": 3.2795591182364734e-05, "loss": 0.0934, "step": 1729 }, { "epoch": 6.92, "grad_norm": 1.4510380029678345, "learning_rate": 3.278557114228457e-05, "loss": 0.0971, "step": 1730 }, { "epoch": 6.924, "grad_norm": 1.527717113494873, "learning_rate": 3.277555110220441e-05, "loss": 0.1138, "step": 1731 }, { "epoch": 6.928, "grad_norm": 1.5672757625579834, "learning_rate": 3.276553106212425e-05, "loss": 0.1001, "step": 1732 }, { "epoch": 6.932, "grad_norm": 1.557324767112732, "learning_rate": 3.275551102204409e-05, "loss": 0.1195, "step": 1733 }, { "epoch": 6.936, "grad_norm": 1.4140926599502563, "learning_rate": 3.2745490981963926e-05, "loss": 0.1113, "step": 1734 }, { "epoch": 
6.9399999999999995, "grad_norm": 1.4385108947753906, "learning_rate": 3.273547094188377e-05, "loss": 0.1139, "step": 1735 }, { "epoch": 6.944, "grad_norm": 1.5246384143829346, "learning_rate": 3.272545090180361e-05, "loss": 0.1125, "step": 1736 }, { "epoch": 6.948, "grad_norm": 1.0488942861557007, "learning_rate": 3.271543086172345e-05, "loss": 0.0691, "step": 1737 }, { "epoch": 6.952, "grad_norm": 1.495911717414856, "learning_rate": 3.270541082164329e-05, "loss": 0.0927, "step": 1738 }, { "epoch": 6.9559999999999995, "grad_norm": 1.6264322996139526, "learning_rate": 3.2695390781563126e-05, "loss": 0.1127, "step": 1739 }, { "epoch": 6.96, "grad_norm": 1.3778818845748901, "learning_rate": 3.268537074148297e-05, "loss": 0.1266, "step": 1740 }, { "epoch": 6.964, "grad_norm": 1.5300289392471313, "learning_rate": 3.267535070140281e-05, "loss": 0.1133, "step": 1741 }, { "epoch": 6.968, "grad_norm": 1.5947483777999878, "learning_rate": 3.266533066132264e-05, "loss": 0.1196, "step": 1742 }, { "epoch": 6.9719999999999995, "grad_norm": 1.4825366735458374, "learning_rate": 3.2655310621242484e-05, "loss": 0.1068, "step": 1743 }, { "epoch": 6.976, "grad_norm": 1.7026481628417969, "learning_rate": 3.2645290581162326e-05, "loss": 0.0977, "step": 1744 }, { "epoch": 6.98, "grad_norm": 1.6448338031768799, "learning_rate": 3.263527054108216e-05, "loss": 0.1112, "step": 1745 }, { "epoch": 6.984, "grad_norm": 1.6659256219863892, "learning_rate": 3.262525050100201e-05, "loss": 0.1179, "step": 1746 }, { "epoch": 6.9879999999999995, "grad_norm": 1.652055025100708, "learning_rate": 3.261523046092185e-05, "loss": 0.1535, "step": 1747 }, { "epoch": 6.992, "grad_norm": 1.4880528450012207, "learning_rate": 3.2605210420841684e-05, "loss": 0.067, "step": 1748 }, { "epoch": 6.996, "grad_norm": 1.5849748849868774, "learning_rate": 3.2595190380761525e-05, "loss": 0.1162, "step": 1749 }, { "epoch": 7.0, "grad_norm": 1.6205350160598755, "learning_rate": 3.2585170340681367e-05, "loss": 0.127, "step": 
1750 }, { "epoch": 7.004, "grad_norm": 1.1027536392211914, "learning_rate": 3.25751503006012e-05, "loss": 0.0632, "step": 1751 }, { "epoch": 7.008, "grad_norm": 1.131488561630249, "learning_rate": 3.256513026052104e-05, "loss": 0.0832, "step": 1752 }, { "epoch": 7.012, "grad_norm": 1.1157985925674438, "learning_rate": 3.2555110220440884e-05, "loss": 0.0597, "step": 1753 }, { "epoch": 7.016, "grad_norm": 1.2950737476348877, "learning_rate": 3.254509018036072e-05, "loss": 0.07, "step": 1754 }, { "epoch": 7.02, "grad_norm": 1.0984954833984375, "learning_rate": 3.253507014028056e-05, "loss": 0.0584, "step": 1755 }, { "epoch": 7.024, "grad_norm": 1.1092177629470825, "learning_rate": 3.252505010020041e-05, "loss": 0.0605, "step": 1756 }, { "epoch": 7.028, "grad_norm": 0.8248789310455322, "learning_rate": 3.251503006012024e-05, "loss": 0.029, "step": 1757 }, { "epoch": 7.032, "grad_norm": 1.0940172672271729, "learning_rate": 3.250501002004008e-05, "loss": 0.05, "step": 1758 }, { "epoch": 7.036, "grad_norm": 1.4330635070800781, "learning_rate": 3.2494989979959925e-05, "loss": 0.0788, "step": 1759 }, { "epoch": 7.04, "grad_norm": 1.5703409910202026, "learning_rate": 3.248496993987976e-05, "loss": 0.0811, "step": 1760 }, { "epoch": 7.044, "grad_norm": 1.1306759119033813, "learning_rate": 3.24749498997996e-05, "loss": 0.0555, "step": 1761 }, { "epoch": 7.048, "grad_norm": 1.2541637420654297, "learning_rate": 3.246492985971944e-05, "loss": 0.0572, "step": 1762 }, { "epoch": 7.052, "grad_norm": 1.333361268043518, "learning_rate": 3.2454909819639276e-05, "loss": 0.0642, "step": 1763 }, { "epoch": 7.056, "grad_norm": 1.1626932621002197, "learning_rate": 3.244488977955912e-05, "loss": 0.0489, "step": 1764 }, { "epoch": 7.06, "grad_norm": 1.2295629978179932, "learning_rate": 3.243486973947896e-05, "loss": 0.0544, "step": 1765 }, { "epoch": 7.064, "grad_norm": 1.3224353790283203, "learning_rate": 3.24248496993988e-05, "loss": 0.0667, "step": 1766 }, { "epoch": 7.068, "grad_norm": 
1.2137049436569214, "learning_rate": 3.241482965931864e-05, "loss": 0.0581, "step": 1767 }, { "epoch": 7.072, "grad_norm": 1.1840201616287231, "learning_rate": 3.240480961923848e-05, "loss": 0.0649, "step": 1768 }, { "epoch": 7.076, "grad_norm": 1.3883408308029175, "learning_rate": 3.239478957915832e-05, "loss": 0.0767, "step": 1769 }, { "epoch": 7.08, "grad_norm": 0.8142508268356323, "learning_rate": 3.238476953907816e-05, "loss": 0.0378, "step": 1770 }, { "epoch": 7.084, "grad_norm": 1.2708982229232788, "learning_rate": 3.2374749498998e-05, "loss": 0.0578, "step": 1771 }, { "epoch": 7.088, "grad_norm": 1.1672301292419434, "learning_rate": 3.2364729458917834e-05, "loss": 0.0538, "step": 1772 }, { "epoch": 7.092, "grad_norm": 1.0839089155197144, "learning_rate": 3.2354709418837675e-05, "loss": 0.0452, "step": 1773 }, { "epoch": 7.096, "grad_norm": 1.2967957258224487, "learning_rate": 3.234468937875752e-05, "loss": 0.0676, "step": 1774 }, { "epoch": 7.1, "grad_norm": 1.3709466457366943, "learning_rate": 3.233466933867735e-05, "loss": 0.0607, "step": 1775 }, { "epoch": 7.104, "grad_norm": 1.3387433290481567, "learning_rate": 3.23246492985972e-05, "loss": 0.0577, "step": 1776 }, { "epoch": 7.108, "grad_norm": 1.308927297592163, "learning_rate": 3.231462925851704e-05, "loss": 0.0827, "step": 1777 }, { "epoch": 7.112, "grad_norm": 1.114564061164856, "learning_rate": 3.2304609218436875e-05, "loss": 0.0473, "step": 1778 }, { "epoch": 7.116, "grad_norm": 1.2246047258377075, "learning_rate": 3.2294589178356716e-05, "loss": 0.0585, "step": 1779 }, { "epoch": 7.12, "grad_norm": 1.3446717262268066, "learning_rate": 3.228456913827656e-05, "loss": 0.0696, "step": 1780 }, { "epoch": 7.124, "grad_norm": 1.2419359683990479, "learning_rate": 3.227454909819639e-05, "loss": 0.0715, "step": 1781 }, { "epoch": 7.128, "grad_norm": 1.3638228178024292, "learning_rate": 3.2264529058116233e-05, "loss": 0.0566, "step": 1782 }, { "epoch": 7.132, "grad_norm": 1.1495225429534912, 
"learning_rate": 3.2254509018036075e-05, "loss": 0.057, "step": 1783 }, { "epoch": 7.136, "grad_norm": 1.3355066776275635, "learning_rate": 3.224448897795591e-05, "loss": 0.067, "step": 1784 }, { "epoch": 7.14, "grad_norm": 1.2129656076431274, "learning_rate": 3.223446893787575e-05, "loss": 0.052, "step": 1785 }, { "epoch": 7.144, "grad_norm": 1.1906895637512207, "learning_rate": 3.222444889779559e-05, "loss": 0.0627, "step": 1786 }, { "epoch": 7.148, "grad_norm": 1.0992094278335571, "learning_rate": 3.221442885771543e-05, "loss": 0.0473, "step": 1787 }, { "epoch": 7.152, "grad_norm": 1.235259771347046, "learning_rate": 3.2204408817635274e-05, "loss": 0.0644, "step": 1788 }, { "epoch": 7.156, "grad_norm": 1.3004990816116333, "learning_rate": 3.2194388777555116e-05, "loss": 0.0675, "step": 1789 }, { "epoch": 7.16, "grad_norm": 1.2756885290145874, "learning_rate": 3.218436873747495e-05, "loss": 0.067, "step": 1790 }, { "epoch": 7.164, "grad_norm": 1.1897791624069214, "learning_rate": 3.217434869739479e-05, "loss": 0.0483, "step": 1791 }, { "epoch": 7.168, "grad_norm": 1.2008836269378662, "learning_rate": 3.216432865731463e-05, "loss": 0.0651, "step": 1792 }, { "epoch": 7.172, "grad_norm": 1.4409757852554321, "learning_rate": 3.215430861723447e-05, "loss": 0.0662, "step": 1793 }, { "epoch": 7.176, "grad_norm": 1.4043469429016113, "learning_rate": 3.214428857715431e-05, "loss": 0.0937, "step": 1794 }, { "epoch": 7.18, "grad_norm": 1.6467530727386475, "learning_rate": 3.213426853707415e-05, "loss": 0.0913, "step": 1795 }, { "epoch": 7.184, "grad_norm": 1.4349619150161743, "learning_rate": 3.212424849699399e-05, "loss": 0.0623, "step": 1796 }, { "epoch": 7.188, "grad_norm": 1.2570191621780396, "learning_rate": 3.211422845691383e-05, "loss": 0.0597, "step": 1797 }, { "epoch": 7.192, "grad_norm": 1.2177025079727173, "learning_rate": 3.210420841683367e-05, "loss": 0.0568, "step": 1798 }, { "epoch": 7.196, "grad_norm": 1.1672996282577515, "learning_rate": 
3.209418837675351e-05, "loss": 0.0621, "step": 1799 }, { "epoch": 7.2, "grad_norm": 1.3348926305770874, "learning_rate": 3.208416833667335e-05, "loss": 0.0656, "step": 1800 }, { "epoch": 7.204, "grad_norm": 1.202222466468811, "learning_rate": 3.2074148296593184e-05, "loss": 0.0549, "step": 1801 }, { "epoch": 7.208, "grad_norm": 1.333136796951294, "learning_rate": 3.2064128256513025e-05, "loss": 0.0645, "step": 1802 }, { "epoch": 7.212, "grad_norm": 1.3489561080932617, "learning_rate": 3.2054108216432866e-05, "loss": 0.0583, "step": 1803 }, { "epoch": 7.216, "grad_norm": 1.4054896831512451, "learning_rate": 3.204408817635271e-05, "loss": 0.0581, "step": 1804 }, { "epoch": 7.22, "grad_norm": 1.1050463914871216, "learning_rate": 3.203406813627255e-05, "loss": 0.0552, "step": 1805 }, { "epoch": 7.224, "grad_norm": 0.9208634495735168, "learning_rate": 3.202404809619239e-05, "loss": 0.0349, "step": 1806 }, { "epoch": 7.228, "grad_norm": 1.2044702768325806, "learning_rate": 3.2014028056112225e-05, "loss": 0.0567, "step": 1807 }, { "epoch": 7.232, "grad_norm": 1.2432702779769897, "learning_rate": 3.2004008016032066e-05, "loss": 0.0576, "step": 1808 }, { "epoch": 7.236, "grad_norm": 1.2744909524917603, "learning_rate": 3.199398797595191e-05, "loss": 0.0682, "step": 1809 }, { "epoch": 7.24, "grad_norm": 1.3941128253936768, "learning_rate": 3.198396793587174e-05, "loss": 0.0596, "step": 1810 }, { "epoch": 7.244, "grad_norm": 1.7390588521957397, "learning_rate": 3.197394789579158e-05, "loss": 0.0646, "step": 1811 }, { "epoch": 7.248, "grad_norm": 1.6309089660644531, "learning_rate": 3.1963927855711424e-05, "loss": 0.0645, "step": 1812 }, { "epoch": 7.252, "grad_norm": 1.3789124488830566, "learning_rate": 3.195390781563126e-05, "loss": 0.0684, "step": 1813 }, { "epoch": 7.256, "grad_norm": 1.2757648229599, "learning_rate": 3.19438877755511e-05, "loss": 0.0706, "step": 1814 }, { "epoch": 7.26, "grad_norm": 1.0513287782669067, "learning_rate": 3.193386773547095e-05, "loss": 
0.0525, "step": 1815 }, { "epoch": 7.264, "grad_norm": 1.4275181293487549, "learning_rate": 3.192384769539078e-05, "loss": 0.0563, "step": 1816 }, { "epoch": 7.268, "grad_norm": 1.3130202293395996, "learning_rate": 3.1913827655310624e-05, "loss": 0.0626, "step": 1817 }, { "epoch": 7.272, "grad_norm": 1.3455928564071655, "learning_rate": 3.1903807615230465e-05, "loss": 0.0688, "step": 1818 }, { "epoch": 7.276, "grad_norm": 1.3660778999328613, "learning_rate": 3.18937875751503e-05, "loss": 0.0666, "step": 1819 }, { "epoch": 7.28, "grad_norm": 1.1925787925720215, "learning_rate": 3.188376753507014e-05, "loss": 0.0685, "step": 1820 }, { "epoch": 7.284, "grad_norm": 1.4987269639968872, "learning_rate": 3.187374749498998e-05, "loss": 0.0747, "step": 1821 }, { "epoch": 7.288, "grad_norm": 1.1474624872207642, "learning_rate": 3.186372745490982e-05, "loss": 0.0558, "step": 1822 }, { "epoch": 7.292, "grad_norm": 1.4372210502624512, "learning_rate": 3.185370741482966e-05, "loss": 0.068, "step": 1823 }, { "epoch": 7.296, "grad_norm": 1.5136364698410034, "learning_rate": 3.18436873747495e-05, "loss": 0.0852, "step": 1824 }, { "epoch": 7.3, "grad_norm": 0.8756834268569946, "learning_rate": 3.183366733466934e-05, "loss": 0.0306, "step": 1825 }, { "epoch": 7.304, "grad_norm": 1.6666139364242554, "learning_rate": 3.182364729458918e-05, "loss": 0.0818, "step": 1826 }, { "epoch": 7.308, "grad_norm": 1.2882213592529297, "learning_rate": 3.181362725450902e-05, "loss": 0.0663, "step": 1827 }, { "epoch": 7.312, "grad_norm": 1.2303308248519897, "learning_rate": 3.180360721442886e-05, "loss": 0.0696, "step": 1828 }, { "epoch": 7.316, "grad_norm": 1.3530824184417725, "learning_rate": 3.17935871743487e-05, "loss": 0.0665, "step": 1829 }, { "epoch": 7.32, "grad_norm": 1.375704288482666, "learning_rate": 3.178356713426854e-05, "loss": 0.0606, "step": 1830 }, { "epoch": 7.324, "grad_norm": 1.5587910413742065, "learning_rate": 3.1773547094188375e-05, "loss": 0.0797, "step": 1831 }, { "epoch": 
7.328, "grad_norm": 1.2583500146865845, "learning_rate": 3.1763527054108216e-05, "loss": 0.0551, "step": 1832 }, { "epoch": 7.332, "grad_norm": 1.2027583122253418, "learning_rate": 3.175350701402806e-05, "loss": 0.0578, "step": 1833 }, { "epoch": 7.336, "grad_norm": 0.9741259813308716, "learning_rate": 3.174348697394789e-05, "loss": 0.0362, "step": 1834 }, { "epoch": 7.34, "grad_norm": 1.3720448017120361, "learning_rate": 3.173346693386774e-05, "loss": 0.0693, "step": 1835 }, { "epoch": 7.344, "grad_norm": 1.2734616994857788, "learning_rate": 3.172344689378758e-05, "loss": 0.0569, "step": 1836 }, { "epoch": 7.348, "grad_norm": 1.4324922561645508, "learning_rate": 3.1713426853707416e-05, "loss": 0.0765, "step": 1837 }, { "epoch": 7.352, "grad_norm": 1.2667415142059326, "learning_rate": 3.170340681362726e-05, "loss": 0.0605, "step": 1838 }, { "epoch": 7.356, "grad_norm": 1.3741072416305542, "learning_rate": 3.16933867735471e-05, "loss": 0.0634, "step": 1839 }, { "epoch": 7.36, "grad_norm": 1.2768627405166626, "learning_rate": 3.168336673346693e-05, "loss": 0.0575, "step": 1840 }, { "epoch": 7.364, "grad_norm": 1.393515944480896, "learning_rate": 3.1673346693386774e-05, "loss": 0.0642, "step": 1841 }, { "epoch": 7.368, "grad_norm": 1.6649253368377686, "learning_rate": 3.1663326653306616e-05, "loss": 0.0837, "step": 1842 }, { "epoch": 7.372, "grad_norm": 1.5157742500305176, "learning_rate": 3.165330661322645e-05, "loss": 0.07, "step": 1843 }, { "epoch": 7.376, "grad_norm": 1.4178087711334229, "learning_rate": 3.16432865731463e-05, "loss": 0.0717, "step": 1844 }, { "epoch": 7.38, "grad_norm": 1.1105268001556396, "learning_rate": 3.163326653306614e-05, "loss": 0.0495, "step": 1845 }, { "epoch": 7.384, "grad_norm": 1.3235563039779663, "learning_rate": 3.1623246492985974e-05, "loss": 0.0687, "step": 1846 }, { "epoch": 7.388, "grad_norm": 1.3781366348266602, "learning_rate": 3.1613226452905815e-05, "loss": 0.0741, "step": 1847 }, { "epoch": 7.392, "grad_norm": 
1.330330491065979, "learning_rate": 3.1603206412825656e-05, "loss": 0.061, "step": 1848 }, { "epoch": 7.396, "grad_norm": 1.3802903890609741, "learning_rate": 3.159318637274549e-05, "loss": 0.068, "step": 1849 }, { "epoch": 7.4, "grad_norm": 0.8295315504074097, "learning_rate": 3.158316633266533e-05, "loss": 0.0321, "step": 1850 }, { "epoch": 7.404, "grad_norm": 1.4891018867492676, "learning_rate": 3.1573146292585173e-05, "loss": 0.0585, "step": 1851 }, { "epoch": 7.408, "grad_norm": 1.496936321258545, "learning_rate": 3.156312625250501e-05, "loss": 0.0689, "step": 1852 }, { "epoch": 7.412, "grad_norm": 1.4590027332305908, "learning_rate": 3.155310621242485e-05, "loss": 0.0722, "step": 1853 }, { "epoch": 7.416, "grad_norm": 1.5951083898544312, "learning_rate": 3.154308617234469e-05, "loss": 0.0666, "step": 1854 }, { "epoch": 7.42, "grad_norm": 1.2343343496322632, "learning_rate": 3.153306613226453e-05, "loss": 0.07, "step": 1855 }, { "epoch": 7.424, "grad_norm": 1.3191311359405518, "learning_rate": 3.152304609218437e-05, "loss": 0.0684, "step": 1856 }, { "epoch": 7.428, "grad_norm": 1.3911070823669434, "learning_rate": 3.151302605210421e-05, "loss": 0.0812, "step": 1857 }, { "epoch": 7.432, "grad_norm": 1.2717417478561401, "learning_rate": 3.150300601202405e-05, "loss": 0.0693, "step": 1858 }, { "epoch": 7.436, "grad_norm": 1.1313704252243042, "learning_rate": 3.149298597194389e-05, "loss": 0.0705, "step": 1859 }, { "epoch": 7.44, "grad_norm": 1.0588295459747314, "learning_rate": 3.148296593186373e-05, "loss": 0.0578, "step": 1860 }, { "epoch": 7.444, "grad_norm": 1.2012568712234497, "learning_rate": 3.1472945891783566e-05, "loss": 0.0607, "step": 1861 }, { "epoch": 7.448, "grad_norm": 2.303619861602783, "learning_rate": 3.146292585170341e-05, "loss": 0.0796, "step": 1862 }, { "epoch": 7.452, "grad_norm": 1.1877150535583496, "learning_rate": 3.145290581162325e-05, "loss": 0.0563, "step": 1863 }, { "epoch": 7.456, "grad_norm": 1.2776094675064087, "learning_rate": 
3.144288577154309e-05, "loss": 0.0591, "step": 1864 }, { "epoch": 7.46, "grad_norm": 2.1104819774627686, "learning_rate": 3.143286573146293e-05, "loss": 0.0901, "step": 1865 }, { "epoch": 7.464, "grad_norm": 1.0128964185714722, "learning_rate": 3.1422845691382766e-05, "loss": 0.0475, "step": 1866 }, { "epoch": 7.468, "grad_norm": 1.3107222318649292, "learning_rate": 3.141282565130261e-05, "loss": 0.0709, "step": 1867 }, { "epoch": 7.4719999999999995, "grad_norm": 1.4184036254882812, "learning_rate": 3.140280561122245e-05, "loss": 0.0801, "step": 1868 }, { "epoch": 7.476, "grad_norm": 1.6373494863510132, "learning_rate": 3.139278557114228e-05, "loss": 0.0716, "step": 1869 }, { "epoch": 7.48, "grad_norm": 1.555827260017395, "learning_rate": 3.1382765531062124e-05, "loss": 0.0653, "step": 1870 }, { "epoch": 7.484, "grad_norm": 1.7851310968399048, "learning_rate": 3.1372745490981965e-05, "loss": 0.0892, "step": 1871 }, { "epoch": 7.4879999999999995, "grad_norm": 1.5268523693084717, "learning_rate": 3.13627254509018e-05, "loss": 0.0688, "step": 1872 }, { "epoch": 7.492, "grad_norm": 1.477304220199585, "learning_rate": 3.135270541082164e-05, "loss": 0.0733, "step": 1873 }, { "epoch": 7.496, "grad_norm": 1.2932136058807373, "learning_rate": 3.134268537074149e-05, "loss": 0.0636, "step": 1874 }, { "epoch": 7.5, "grad_norm": 1.4812061786651611, "learning_rate": 3.1332665330661324e-05, "loss": 0.0747, "step": 1875 }, { "epoch": 7.504, "grad_norm": 1.303823471069336, "learning_rate": 3.1322645290581165e-05, "loss": 0.0616, "step": 1876 }, { "epoch": 7.508, "grad_norm": 1.4868488311767578, "learning_rate": 3.1312625250501006e-05, "loss": 0.0898, "step": 1877 }, { "epoch": 7.5120000000000005, "grad_norm": 1.6528791189193726, "learning_rate": 3.130260521042084e-05, "loss": 0.0661, "step": 1878 }, { "epoch": 7.516, "grad_norm": 1.552954077720642, "learning_rate": 3.129258517034068e-05, "loss": 0.0716, "step": 1879 }, { "epoch": 7.52, "grad_norm": 1.2300714254379272, 
"learning_rate": 3.128256513026052e-05, "loss": 0.0625, "step": 1880 }, { "epoch": 7.524, "grad_norm": 1.54698646068573, "learning_rate": 3.127254509018036e-05, "loss": 0.0765, "step": 1881 }, { "epoch": 7.5280000000000005, "grad_norm": 1.5242317914962769, "learning_rate": 3.12625250501002e-05, "loss": 0.0922, "step": 1882 }, { "epoch": 7.532, "grad_norm": 1.4363588094711304, "learning_rate": 3.125250501002004e-05, "loss": 0.0784, "step": 1883 }, { "epoch": 7.536, "grad_norm": 1.3856273889541626, "learning_rate": 3.124248496993988e-05, "loss": 0.0669, "step": 1884 }, { "epoch": 7.54, "grad_norm": 1.296464443206787, "learning_rate": 3.123246492985972e-05, "loss": 0.0713, "step": 1885 }, { "epoch": 7.5440000000000005, "grad_norm": 1.3681565523147583, "learning_rate": 3.1222444889779564e-05, "loss": 0.0678, "step": 1886 }, { "epoch": 7.548, "grad_norm": 1.34380304813385, "learning_rate": 3.12124248496994e-05, "loss": 0.0552, "step": 1887 }, { "epoch": 7.552, "grad_norm": 1.2859309911727905, "learning_rate": 3.120240480961924e-05, "loss": 0.0602, "step": 1888 }, { "epoch": 7.556, "grad_norm": 1.4414995908737183, "learning_rate": 3.119238476953908e-05, "loss": 0.0646, "step": 1889 }, { "epoch": 7.5600000000000005, "grad_norm": 1.549656629562378, "learning_rate": 3.1182364729458916e-05, "loss": 0.0848, "step": 1890 }, { "epoch": 7.564, "grad_norm": 1.274708867073059, "learning_rate": 3.117234468937876e-05, "loss": 0.0634, "step": 1891 }, { "epoch": 7.568, "grad_norm": 2.1878645420074463, "learning_rate": 3.11623246492986e-05, "loss": 0.0593, "step": 1892 }, { "epoch": 7.572, "grad_norm": 1.367893099784851, "learning_rate": 3.115230460921843e-05, "loss": 0.0637, "step": 1893 }, { "epoch": 7.576, "grad_norm": 1.5797450542449951, "learning_rate": 3.114228456913828e-05, "loss": 0.0853, "step": 1894 }, { "epoch": 7.58, "grad_norm": 1.5319100618362427, "learning_rate": 3.113226452905812e-05, "loss": 0.0795, "step": 1895 }, { "epoch": 7.584, "grad_norm": 1.3238308429718018, 
"learning_rate": 3.112224448897796e-05, "loss": 0.0622, "step": 1896 }, { "epoch": 7.588, "grad_norm": 1.4970932006835938, "learning_rate": 3.11122244488978e-05, "loss": 0.0776, "step": 1897 }, { "epoch": 7.592, "grad_norm": 0.7695464491844177, "learning_rate": 3.110220440881764e-05, "loss": 0.0338, "step": 1898 }, { "epoch": 7.596, "grad_norm": 1.3819239139556885, "learning_rate": 3.1092184368737474e-05, "loss": 0.0804, "step": 1899 }, { "epoch": 7.6, "grad_norm": 1.5732004642486572, "learning_rate": 3.1082164328657315e-05, "loss": 0.0869, "step": 1900 }, { "epoch": 7.604, "grad_norm": 1.3113768100738525, "learning_rate": 3.1072144288577156e-05, "loss": 0.0659, "step": 1901 }, { "epoch": 7.608, "grad_norm": 1.4455841779708862, "learning_rate": 3.106212424849699e-05, "loss": 0.0847, "step": 1902 }, { "epoch": 7.612, "grad_norm": 0.8127650618553162, "learning_rate": 3.105210420841684e-05, "loss": 0.044, "step": 1903 }, { "epoch": 7.616, "grad_norm": 1.613440990447998, "learning_rate": 3.104208416833668e-05, "loss": 0.0779, "step": 1904 }, { "epoch": 7.62, "grad_norm": 1.3210656642913818, "learning_rate": 3.1032064128256515e-05, "loss": 0.058, "step": 1905 }, { "epoch": 7.624, "grad_norm": 1.4061453342437744, "learning_rate": 3.1022044088176356e-05, "loss": 0.0759, "step": 1906 }, { "epoch": 7.628, "grad_norm": 1.5052586793899536, "learning_rate": 3.10120240480962e-05, "loss": 0.0785, "step": 1907 }, { "epoch": 7.632, "grad_norm": 1.3261641263961792, "learning_rate": 3.100200400801603e-05, "loss": 0.0922, "step": 1908 }, { "epoch": 7.636, "grad_norm": 1.5588467121124268, "learning_rate": 3.099198396793587e-05, "loss": 0.0703, "step": 1909 }, { "epoch": 7.64, "grad_norm": 1.2506775856018066, "learning_rate": 3.0981963927855714e-05, "loss": 0.0537, "step": 1910 }, { "epoch": 7.644, "grad_norm": 1.6546599864959717, "learning_rate": 3.097194388777555e-05, "loss": 0.0913, "step": 1911 }, { "epoch": 7.648, "grad_norm": 1.600527286529541, "learning_rate": 
3.096192384769539e-05, "loss": 0.0788, "step": 1912 }, { "epoch": 7.652, "grad_norm": 1.5613360404968262, "learning_rate": 3.095190380761523e-05, "loss": 0.068, "step": 1913 }, { "epoch": 7.656, "grad_norm": 1.5077072381973267, "learning_rate": 3.094188376753507e-05, "loss": 0.0729, "step": 1914 }, { "epoch": 7.66, "grad_norm": 1.6048588752746582, "learning_rate": 3.0931863727454914e-05, "loss": 0.0876, "step": 1915 }, { "epoch": 7.664, "grad_norm": 1.4152404069900513, "learning_rate": 3.0921843687374755e-05, "loss": 0.0876, "step": 1916 }, { "epoch": 7.668, "grad_norm": 1.5331817865371704, "learning_rate": 3.091182364729459e-05, "loss": 0.0765, "step": 1917 }, { "epoch": 7.672, "grad_norm": 1.5034297704696655, "learning_rate": 3.090180360721443e-05, "loss": 0.0618, "step": 1918 }, { "epoch": 7.676, "grad_norm": 1.4812408685684204, "learning_rate": 3.089178356713427e-05, "loss": 0.0791, "step": 1919 }, { "epoch": 7.68, "grad_norm": 1.4071189165115356, "learning_rate": 3.088176352705411e-05, "loss": 0.0773, "step": 1920 }, { "epoch": 7.684, "grad_norm": 1.3819324970245361, "learning_rate": 3.087174348697395e-05, "loss": 0.0734, "step": 1921 }, { "epoch": 7.688, "grad_norm": 1.3420048952102661, "learning_rate": 3.086172344689379e-05, "loss": 0.0659, "step": 1922 }, { "epoch": 7.692, "grad_norm": 1.4877381324768066, "learning_rate": 3.085170340681363e-05, "loss": 0.0729, "step": 1923 }, { "epoch": 7.696, "grad_norm": 1.2913259267807007, "learning_rate": 3.084168336673347e-05, "loss": 0.0593, "step": 1924 }, { "epoch": 7.7, "grad_norm": 1.3754169940948486, "learning_rate": 3.0831663326653306e-05, "loss": 0.0668, "step": 1925 }, { "epoch": 7.704, "grad_norm": 1.3683477640151978, "learning_rate": 3.082164328657315e-05, "loss": 0.0736, "step": 1926 }, { "epoch": 7.708, "grad_norm": 1.151803970336914, "learning_rate": 3.081162324649299e-05, "loss": 0.0581, "step": 1927 }, { "epoch": 7.712, "grad_norm": 1.4740242958068848, "learning_rate": 3.0801603206412824e-05, "loss": 
0.0903, "step": 1928 }, { "epoch": 7.716, "grad_norm": 1.588667869567871, "learning_rate": 3.0791583166332665e-05, "loss": 0.083, "step": 1929 }, { "epoch": 7.72, "grad_norm": 1.4795126914978027, "learning_rate": 3.0781563126252506e-05, "loss": 0.0743, "step": 1930 }, { "epoch": 7.724, "grad_norm": 1.3423376083374023, "learning_rate": 3.077154308617235e-05, "loss": 0.0701, "step": 1931 }, { "epoch": 7.728, "grad_norm": 1.1936395168304443, "learning_rate": 3.076152304609218e-05, "loss": 0.0645, "step": 1932 }, { "epoch": 7.732, "grad_norm": 1.1713727712631226, "learning_rate": 3.075150300601203e-05, "loss": 0.0536, "step": 1933 }, { "epoch": 7.736, "grad_norm": 1.3141846656799316, "learning_rate": 3.0741482965931864e-05, "loss": 0.0723, "step": 1934 }, { "epoch": 7.74, "grad_norm": 1.4886362552642822, "learning_rate": 3.0731462925851706e-05, "loss": 0.0902, "step": 1935 }, { "epoch": 7.744, "grad_norm": 1.4828537702560425, "learning_rate": 3.072144288577155e-05, "loss": 0.0807, "step": 1936 }, { "epoch": 7.748, "grad_norm": 1.341299295425415, "learning_rate": 3.071142284569138e-05, "loss": 0.0692, "step": 1937 }, { "epoch": 7.752, "grad_norm": 1.3996098041534424, "learning_rate": 3.070140280561122e-05, "loss": 0.0658, "step": 1938 }, { "epoch": 7.756, "grad_norm": 1.3277019262313843, "learning_rate": 3.0691382765531064e-05, "loss": 0.0825, "step": 1939 }, { "epoch": 7.76, "grad_norm": 1.4799069166183472, "learning_rate": 3.06813627254509e-05, "loss": 0.0631, "step": 1940 }, { "epoch": 7.764, "grad_norm": 1.4274985790252686, "learning_rate": 3.067134268537074e-05, "loss": 0.0756, "step": 1941 }, { "epoch": 7.768, "grad_norm": 1.5825673341751099, "learning_rate": 3.066132264529058e-05, "loss": 0.0992, "step": 1942 }, { "epoch": 7.772, "grad_norm": 1.4629828929901123, "learning_rate": 3.065130260521042e-05, "loss": 0.0893, "step": 1943 }, { "epoch": 7.776, "grad_norm": 1.2774440050125122, "learning_rate": 3.0641282565130264e-05, "loss": 0.059, "step": 1944 }, { 
"epoch": 7.78, "grad_norm": 1.3957229852676392, "learning_rate": 3.0631262525050105e-05, "loss": 0.0699, "step": 1945 }, { "epoch": 7.784, "grad_norm": 1.5899467468261719, "learning_rate": 3.062124248496994e-05, "loss": 0.0865, "step": 1946 }, { "epoch": 7.788, "grad_norm": 1.4467995166778564, "learning_rate": 3.061122244488978e-05, "loss": 0.0765, "step": 1947 }, { "epoch": 7.792, "grad_norm": 1.4088798761367798, "learning_rate": 3.060120240480962e-05, "loss": 0.07, "step": 1948 }, { "epoch": 7.796, "grad_norm": 1.3148294687271118, "learning_rate": 3.0591182364729457e-05, "loss": 0.0845, "step": 1949 }, { "epoch": 7.8, "grad_norm": 1.445083737373352, "learning_rate": 3.05811623246493e-05, "loss": 0.0739, "step": 1950 }, { "epoch": 7.804, "grad_norm": 1.4596608877182007, "learning_rate": 3.057114228456914e-05, "loss": 0.0684, "step": 1951 }, { "epoch": 7.808, "grad_norm": 1.3999098539352417, "learning_rate": 3.0561122244488974e-05, "loss": 0.0638, "step": 1952 }, { "epoch": 7.812, "grad_norm": 1.6097480058670044, "learning_rate": 3.055110220440882e-05, "loss": 0.0812, "step": 1953 }, { "epoch": 7.816, "grad_norm": 1.6041979789733887, "learning_rate": 3.054108216432866e-05, "loss": 0.0989, "step": 1954 }, { "epoch": 7.82, "grad_norm": 1.4978128671646118, "learning_rate": 3.05310621242485e-05, "loss": 0.0799, "step": 1955 }, { "epoch": 7.824, "grad_norm": 1.465773582458496, "learning_rate": 3.052104208416834e-05, "loss": 0.0827, "step": 1956 }, { "epoch": 7.828, "grad_norm": 1.273937463760376, "learning_rate": 3.0511022044088177e-05, "loss": 0.0664, "step": 1957 }, { "epoch": 7.832, "grad_norm": 1.212803602218628, "learning_rate": 3.0501002004008018e-05, "loss": 0.0612, "step": 1958 }, { "epoch": 7.836, "grad_norm": 1.3836429119110107, "learning_rate": 3.0490981963927856e-05, "loss": 0.0711, "step": 1959 }, { "epoch": 7.84, "grad_norm": 1.5588130950927734, "learning_rate": 3.0480961923847694e-05, "loss": 0.0792, "step": 1960 }, { "epoch": 7.844, "grad_norm": 
1.5219345092773438, "learning_rate": 3.0470941883767535e-05, "loss": 0.0826, "step": 1961 }, { "epoch": 7.848, "grad_norm": 1.6055107116699219, "learning_rate": 3.046092184368738e-05, "loss": 0.0766, "step": 1962 }, { "epoch": 7.852, "grad_norm": 1.2326933145523071, "learning_rate": 3.0450901803607218e-05, "loss": 0.0622, "step": 1963 }, { "epoch": 7.856, "grad_norm": 1.6281651258468628, "learning_rate": 3.0440881763527055e-05, "loss": 0.0789, "step": 1964 }, { "epoch": 7.86, "grad_norm": 1.5395461320877075, "learning_rate": 3.0430861723446897e-05, "loss": 0.0793, "step": 1965 }, { "epoch": 7.864, "grad_norm": 1.4532355070114136, "learning_rate": 3.0420841683366735e-05, "loss": 0.0667, "step": 1966 }, { "epoch": 7.868, "grad_norm": 1.4249470233917236, "learning_rate": 3.0410821643286576e-05, "loss": 0.0675, "step": 1967 }, { "epoch": 7.872, "grad_norm": 1.4712321758270264, "learning_rate": 3.0400801603206414e-05, "loss": 0.0563, "step": 1968 }, { "epoch": 7.876, "grad_norm": 1.6727486848831177, "learning_rate": 3.0390781563126252e-05, "loss": 0.0915, "step": 1969 }, { "epoch": 7.88, "grad_norm": 1.4988911151885986, "learning_rate": 3.0380761523046093e-05, "loss": 0.0754, "step": 1970 }, { "epoch": 7.884, "grad_norm": 1.4844058752059937, "learning_rate": 3.037074148296593e-05, "loss": 0.0777, "step": 1971 }, { "epoch": 7.888, "grad_norm": 1.4595845937728882, "learning_rate": 3.0360721442885776e-05, "loss": 0.0714, "step": 1972 }, { "epoch": 7.892, "grad_norm": 1.6251410245895386, "learning_rate": 3.0350701402805613e-05, "loss": 0.0891, "step": 1973 }, { "epoch": 7.896, "grad_norm": 1.4736934900283813, "learning_rate": 3.0340681362725455e-05, "loss": 0.0776, "step": 1974 }, { "epoch": 7.9, "grad_norm": 1.6098906993865967, "learning_rate": 3.0330661322645293e-05, "loss": 0.0738, "step": 1975 }, { "epoch": 7.904, "grad_norm": 1.4233413934707642, "learning_rate": 3.032064128256513e-05, "loss": 0.0681, "step": 1976 }, { "epoch": 7.908, "grad_norm": 1.4190319776535034, 
"learning_rate": 3.0310621242484972e-05, "loss": 0.0941, "step": 1977 }, { "epoch": 7.912, "grad_norm": 1.4623279571533203, "learning_rate": 3.030060120240481e-05, "loss": 0.0736, "step": 1978 }, { "epoch": 7.916, "grad_norm": 1.6991535425186157, "learning_rate": 3.029058116232465e-05, "loss": 0.0896, "step": 1979 }, { "epoch": 7.92, "grad_norm": 1.6135594844818115, "learning_rate": 3.028056112224449e-05, "loss": 0.0841, "step": 1980 }, { "epoch": 7.924, "grad_norm": 1.5118712186813354, "learning_rate": 3.0270541082164327e-05, "loss": 0.0966, "step": 1981 }, { "epoch": 7.928, "grad_norm": 1.3589533567428589, "learning_rate": 3.026052104208417e-05, "loss": 0.0725, "step": 1982 }, { "epoch": 7.932, "grad_norm": 1.5694752931594849, "learning_rate": 3.0250501002004013e-05, "loss": 0.0785, "step": 1983 }, { "epoch": 7.936, "grad_norm": 1.42135751247406, "learning_rate": 3.024048096192385e-05, "loss": 0.0802, "step": 1984 }, { "epoch": 7.9399999999999995, "grad_norm": 1.4186774492263794, "learning_rate": 3.023046092184369e-05, "loss": 0.071, "step": 1985 }, { "epoch": 7.944, "grad_norm": 0.7611780166625977, "learning_rate": 3.022044088176353e-05, "loss": 0.0184, "step": 1986 }, { "epoch": 7.948, "grad_norm": 1.5581532716751099, "learning_rate": 3.0210420841683368e-05, "loss": 0.0853, "step": 1987 }, { "epoch": 7.952, "grad_norm": 1.394500494003296, "learning_rate": 3.0200400801603206e-05, "loss": 0.0915, "step": 1988 }, { "epoch": 7.9559999999999995, "grad_norm": 1.33413565158844, "learning_rate": 3.0190380761523047e-05, "loss": 0.0689, "step": 1989 }, { "epoch": 7.96, "grad_norm": 1.417820692062378, "learning_rate": 3.0180360721442885e-05, "loss": 0.0861, "step": 1990 }, { "epoch": 7.964, "grad_norm": 1.309669852256775, "learning_rate": 3.0170340681362723e-05, "loss": 0.0703, "step": 1991 }, { "epoch": 7.968, "grad_norm": 1.4722129106521606, "learning_rate": 3.0160320641282567e-05, "loss": 0.0636, "step": 1992 }, { "epoch": 7.9719999999999995, "grad_norm": 
1.4907193183898926, "learning_rate": 3.015030060120241e-05, "loss": 0.0699, "step": 1993 }, { "epoch": 7.976, "grad_norm": 1.5335925817489624, "learning_rate": 3.0140280561122247e-05, "loss": 0.0712, "step": 1994 }, { "epoch": 7.98, "grad_norm": 1.2255603075027466, "learning_rate": 3.0130260521042088e-05, "loss": 0.0602, "step": 1995 }, { "epoch": 7.984, "grad_norm": 1.4105703830718994, "learning_rate": 3.0120240480961926e-05, "loss": 0.0731, "step": 1996 }, { "epoch": 7.9879999999999995, "grad_norm": 1.5889180898666382, "learning_rate": 3.0110220440881764e-05, "loss": 0.0752, "step": 1997 }, { "epoch": 7.992, "grad_norm": 1.6140540838241577, "learning_rate": 3.0100200400801605e-05, "loss": 0.0753, "step": 1998 }, { "epoch": 7.996, "grad_norm": 1.826938509941101, "learning_rate": 3.0090180360721443e-05, "loss": 0.0852, "step": 1999 }, { "epoch": 8.0, "grad_norm": 1.0301682949066162, "learning_rate": 3.008016032064128e-05, "loss": 0.045, "step": 2000 }, { "epoch": 8.004, "grad_norm": 0.9636511206626892, "learning_rate": 3.0070140280561122e-05, "loss": 0.0422, "step": 2001 }, { "epoch": 8.008, "grad_norm": 0.9918342232704163, "learning_rate": 3.0060120240480967e-05, "loss": 0.0413, "step": 2002 }, { "epoch": 8.012, "grad_norm": 0.8967450857162476, "learning_rate": 3.0050100200400805e-05, "loss": 0.0343, "step": 2003 }, { "epoch": 8.016, "grad_norm": 1.0246479511260986, "learning_rate": 3.0040080160320642e-05, "loss": 0.0399, "step": 2004 }, { "epoch": 8.02, "grad_norm": 1.0252851247787476, "learning_rate": 3.0030060120240484e-05, "loss": 0.0458, "step": 2005 }, { "epoch": 8.024, "grad_norm": 0.974763810634613, "learning_rate": 3.002004008016032e-05, "loss": 0.0437, "step": 2006 }, { "epoch": 8.028, "grad_norm": 0.9569265842437744, "learning_rate": 3.0010020040080163e-05, "loss": 0.0364, "step": 2007 }, { "epoch": 8.032, "grad_norm": 1.104304313659668, "learning_rate": 3e-05, "loss": 0.0397, "step": 2008 }, { "epoch": 8.036, "grad_norm": 1.042075753211975, 
"learning_rate": 2.998997995991984e-05, "loss": 0.0319, "step": 2009 }, { "epoch": 8.04, "grad_norm": 1.41454017162323, "learning_rate": 2.997995991983968e-05, "loss": 0.0519, "step": 2010 }, { "epoch": 8.044, "grad_norm": 1.2228070497512817, "learning_rate": 2.9969939879759525e-05, "loss": 0.0471, "step": 2011 }, { "epoch": 8.048, "grad_norm": 1.3753093481063843, "learning_rate": 2.9959919839679363e-05, "loss": 0.0426, "step": 2012 }, { "epoch": 8.052, "grad_norm": 1.2249881029129028, "learning_rate": 2.99498997995992e-05, "loss": 0.0375, "step": 2013 }, { "epoch": 8.056, "grad_norm": 1.1894644498825073, "learning_rate": 2.993987975951904e-05, "loss": 0.0388, "step": 2014 }, { "epoch": 8.06, "grad_norm": 1.2309092283248901, "learning_rate": 2.992985971943888e-05, "loss": 0.0403, "step": 2015 }, { "epoch": 8.064, "grad_norm": 1.237890362739563, "learning_rate": 2.9919839679358717e-05, "loss": 0.0414, "step": 2016 }, { "epoch": 8.068, "grad_norm": 1.3843622207641602, "learning_rate": 2.990981963927856e-05, "loss": 0.0462, "step": 2017 }, { "epoch": 8.072, "grad_norm": 1.204715609550476, "learning_rate": 2.9899799599198397e-05, "loss": 0.0455, "step": 2018 }, { "epoch": 8.076, "grad_norm": 1.1877174377441406, "learning_rate": 2.9889779559118235e-05, "loss": 0.0429, "step": 2019 }, { "epoch": 8.08, "grad_norm": 0.9619832634925842, "learning_rate": 2.9879759519038076e-05, "loss": 0.0344, "step": 2020 }, { "epoch": 8.084, "grad_norm": 1.2770909070968628, "learning_rate": 2.986973947895792e-05, "loss": 0.0434, "step": 2021 }, { "epoch": 8.088, "grad_norm": 2.0295021533966064, "learning_rate": 2.985971943887776e-05, "loss": 0.0478, "step": 2022 }, { "epoch": 8.092, "grad_norm": 1.0458402633666992, "learning_rate": 2.98496993987976e-05, "loss": 0.0407, "step": 2023 }, { "epoch": 8.096, "grad_norm": 1.3169829845428467, "learning_rate": 2.9839679358717438e-05, "loss": 0.0383, "step": 2024 }, { "epoch": 8.1, "grad_norm": 1.2271572351455688, "learning_rate": 
2.9829659318637275e-05, "loss": 0.0426, "step": 2025 }, { "epoch": 8.104, "grad_norm": 0.6691854596138, "learning_rate": 2.9819639278557117e-05, "loss": 0.0178, "step": 2026 }, { "epoch": 8.108, "grad_norm": 1.2203044891357422, "learning_rate": 2.9809619238476955e-05, "loss": 0.049, "step": 2027 }, { "epoch": 8.112, "grad_norm": 1.1004842519760132, "learning_rate": 2.9799599198396793e-05, "loss": 0.0413, "step": 2028 }, { "epoch": 8.116, "grad_norm": 1.1500310897827148, "learning_rate": 2.9789579158316634e-05, "loss": 0.052, "step": 2029 }, { "epoch": 8.12, "grad_norm": 1.0582787990570068, "learning_rate": 2.9779559118236472e-05, "loss": 0.0387, "step": 2030 }, { "epoch": 8.124, "grad_norm": 1.1282780170440674, "learning_rate": 2.9769539078156316e-05, "loss": 0.0463, "step": 2031 }, { "epoch": 8.128, "grad_norm": 1.2757964134216309, "learning_rate": 2.9759519038076154e-05, "loss": 0.0458, "step": 2032 }, { "epoch": 8.132, "grad_norm": 1.3332687616348267, "learning_rate": 2.9749498997995996e-05, "loss": 0.0527, "step": 2033 }, { "epoch": 8.136, "grad_norm": 1.269371747970581, "learning_rate": 2.9739478957915833e-05, "loss": 0.0555, "step": 2034 }, { "epoch": 8.14, "grad_norm": 1.179306149482727, "learning_rate": 2.9729458917835675e-05, "loss": 0.0379, "step": 2035 }, { "epoch": 8.144, "grad_norm": 1.3321940898895264, "learning_rate": 2.9719438877755513e-05, "loss": 0.0554, "step": 2036 }, { "epoch": 8.148, "grad_norm": 0.9157149791717529, "learning_rate": 2.970941883767535e-05, "loss": 0.033, "step": 2037 }, { "epoch": 8.152, "grad_norm": 1.1154097318649292, "learning_rate": 2.9699398797595192e-05, "loss": 0.0384, "step": 2038 }, { "epoch": 8.156, "grad_norm": 0.9774281978607178, "learning_rate": 2.968937875751503e-05, "loss": 0.0333, "step": 2039 }, { "epoch": 8.16, "grad_norm": 1.5343128442764282, "learning_rate": 2.9679358717434868e-05, "loss": 0.0353, "step": 2040 }, { "epoch": 8.164, "grad_norm": 1.0981296300888062, "learning_rate": 2.9669338677354712e-05, 
"loss": 0.0372, "step": 2041 }, { "epoch": 8.168, "grad_norm": 1.17277193069458, "learning_rate": 2.9659318637274554e-05, "loss": 0.0456, "step": 2042 }, { "epoch": 8.172, "grad_norm": 1.0405935049057007, "learning_rate": 2.964929859719439e-05, "loss": 0.0431, "step": 2043 }, { "epoch": 8.176, "grad_norm": 1.213351845741272, "learning_rate": 2.963927855711423e-05, "loss": 0.0469, "step": 2044 }, { "epoch": 8.18, "grad_norm": 1.091701865196228, "learning_rate": 2.962925851703407e-05, "loss": 0.0409, "step": 2045 }, { "epoch": 8.184, "grad_norm": 1.140825629234314, "learning_rate": 2.961923847695391e-05, "loss": 0.0399, "step": 2046 }, { "epoch": 8.188, "grad_norm": 1.3966275453567505, "learning_rate": 2.9609218436873746e-05, "loss": 0.047, "step": 2047 }, { "epoch": 8.192, "grad_norm": 1.0607054233551025, "learning_rate": 2.9599198396793588e-05, "loss": 0.0403, "step": 2048 }, { "epoch": 8.196, "grad_norm": 1.3309967517852783, "learning_rate": 2.9589178356713426e-05, "loss": 0.0424, "step": 2049 }, { "epoch": 8.2, "grad_norm": 1.007638931274414, "learning_rate": 2.9579158316633267e-05, "loss": 0.0319, "step": 2050 }, { "epoch": 8.204, "grad_norm": 1.2654892206192017, "learning_rate": 2.956913827655311e-05, "loss": 0.0453, "step": 2051 }, { "epoch": 8.208, "grad_norm": 1.064241647720337, "learning_rate": 2.955911823647295e-05, "loss": 0.0436, "step": 2052 }, { "epoch": 8.212, "grad_norm": 1.1412560939788818, "learning_rate": 2.9549098196392787e-05, "loss": 0.0434, "step": 2053 }, { "epoch": 8.216, "grad_norm": 1.2644426822662354, "learning_rate": 2.953907815631263e-05, "loss": 0.0443, "step": 2054 }, { "epoch": 8.22, "grad_norm": 1.153268814086914, "learning_rate": 2.9529058116232467e-05, "loss": 0.0446, "step": 2055 }, { "epoch": 8.224, "grad_norm": 1.1821683645248413, "learning_rate": 2.9519038076152304e-05, "loss": 0.0403, "step": 2056 }, { "epoch": 8.228, "grad_norm": 1.1083087921142578, "learning_rate": 2.9509018036072146e-05, "loss": 0.0376, "step": 2057 }, { 
"epoch": 8.232, "grad_norm": 1.251597285270691, "learning_rate": 2.9498997995991984e-05, "loss": 0.0458, "step": 2058 }, { "epoch": 8.236, "grad_norm": 1.3240916728973389, "learning_rate": 2.948897795591182e-05, "loss": 0.0505, "step": 2059 }, { "epoch": 8.24, "grad_norm": 1.3797112703323364, "learning_rate": 2.9478957915831663e-05, "loss": 0.0468, "step": 2060 }, { "epoch": 8.244, "grad_norm": 1.4539932012557983, "learning_rate": 2.9468937875751507e-05, "loss": 0.0459, "step": 2061 }, { "epoch": 8.248, "grad_norm": 1.0563139915466309, "learning_rate": 2.9458917835671345e-05, "loss": 0.0375, "step": 2062 }, { "epoch": 8.252, "grad_norm": 1.1842182874679565, "learning_rate": 2.9448897795591183e-05, "loss": 0.0441, "step": 2063 }, { "epoch": 8.256, "grad_norm": 1.2775763273239136, "learning_rate": 2.9438877755511024e-05, "loss": 0.0487, "step": 2064 }, { "epoch": 8.26, "grad_norm": 1.2534736394882202, "learning_rate": 2.9428857715430862e-05, "loss": 0.0389, "step": 2065 }, { "epoch": 8.264, "grad_norm": 1.3463037014007568, "learning_rate": 2.9418837675350704e-05, "loss": 0.053, "step": 2066 }, { "epoch": 8.268, "grad_norm": 0.9820384383201599, "learning_rate": 2.940881763527054e-05, "loss": 0.0467, "step": 2067 }, { "epoch": 8.272, "grad_norm": 1.1850944757461548, "learning_rate": 2.939879759519038e-05, "loss": 0.0398, "step": 2068 }, { "epoch": 8.276, "grad_norm": 0.9997429847717285, "learning_rate": 2.938877755511022e-05, "loss": 0.0343, "step": 2069 }, { "epoch": 8.28, "grad_norm": 1.2655606269836426, "learning_rate": 2.9378757515030065e-05, "loss": 0.0437, "step": 2070 }, { "epoch": 8.284, "grad_norm": 1.0781662464141846, "learning_rate": 2.9368737474949903e-05, "loss": 0.0349, "step": 2071 }, { "epoch": 8.288, "grad_norm": 1.158593773841858, "learning_rate": 2.935871743486974e-05, "loss": 0.046, "step": 2072 }, { "epoch": 8.292, "grad_norm": 1.2248380184173584, "learning_rate": 2.9348697394789582e-05, "loss": 0.0406, "step": 2073 }, { "epoch": 8.296, 
"grad_norm": 1.362268090248108, "learning_rate": 2.933867735470942e-05, "loss": 0.0516, "step": 2074 }, { "epoch": 8.3, "grad_norm": 1.1527179479599, "learning_rate": 2.9328657314629258e-05, "loss": 0.0396, "step": 2075 }, { "epoch": 8.304, "grad_norm": 1.1694166660308838, "learning_rate": 2.93186372745491e-05, "loss": 0.0374, "step": 2076 }, { "epoch": 8.308, "grad_norm": 1.28037428855896, "learning_rate": 2.9308617234468937e-05, "loss": 0.0416, "step": 2077 }, { "epoch": 8.312, "grad_norm": 1.2145733833312988, "learning_rate": 2.929859719438878e-05, "loss": 0.0433, "step": 2078 }, { "epoch": 8.316, "grad_norm": 1.0807759761810303, "learning_rate": 2.9288577154308617e-05, "loss": 0.0364, "step": 2079 }, { "epoch": 8.32, "grad_norm": 1.364870548248291, "learning_rate": 2.927855711422846e-05, "loss": 0.0582, "step": 2080 }, { "epoch": 8.324, "grad_norm": 1.2568074464797974, "learning_rate": 2.92685370741483e-05, "loss": 0.042, "step": 2081 }, { "epoch": 8.328, "grad_norm": 1.4024349451065063, "learning_rate": 2.925851703406814e-05, "loss": 0.0465, "step": 2082 }, { "epoch": 8.332, "grad_norm": 1.2387995719909668, "learning_rate": 2.924849699398798e-05, "loss": 0.0375, "step": 2083 }, { "epoch": 8.336, "grad_norm": 1.2565348148345947, "learning_rate": 2.9238476953907816e-05, "loss": 0.04, "step": 2084 }, { "epoch": 8.34, "grad_norm": 1.384647011756897, "learning_rate": 2.9228456913827658e-05, "loss": 0.0525, "step": 2085 }, { "epoch": 8.344, "grad_norm": 0.9669448137283325, "learning_rate": 2.9218436873747495e-05, "loss": 0.0398, "step": 2086 }, { "epoch": 8.348, "grad_norm": 1.022433876991272, "learning_rate": 2.9208416833667333e-05, "loss": 0.037, "step": 2087 }, { "epoch": 8.352, "grad_norm": 1.2381759881973267, "learning_rate": 2.9198396793587175e-05, "loss": 0.0512, "step": 2088 }, { "epoch": 8.356, "grad_norm": 0.967339813709259, "learning_rate": 2.9188376753507013e-05, "loss": 0.0392, "step": 2089 }, { "epoch": 8.36, "grad_norm": 1.4872891902923584, 
"learning_rate": 2.9178356713426857e-05, "loss": 0.0491, "step": 2090 }, { "epoch": 8.364, "grad_norm": 1.1258385181427002, "learning_rate": 2.9168336673346695e-05, "loss": 0.0395, "step": 2091 }, { "epoch": 8.368, "grad_norm": 1.4340835809707642, "learning_rate": 2.9158316633266536e-05, "loss": 0.0561, "step": 2092 }, { "epoch": 8.372, "grad_norm": 1.2354909181594849, "learning_rate": 2.9148296593186374e-05, "loss": 0.0415, "step": 2093 }, { "epoch": 8.376, "grad_norm": 1.414834976196289, "learning_rate": 2.9138276553106216e-05, "loss": 0.0563, "step": 2094 }, { "epoch": 8.38, "grad_norm": 1.3660783767700195, "learning_rate": 2.9128256513026053e-05, "loss": 0.049, "step": 2095 }, { "epoch": 8.384, "grad_norm": 1.2326569557189941, "learning_rate": 2.911823647294589e-05, "loss": 0.0458, "step": 2096 }, { "epoch": 8.388, "grad_norm": 1.343539834022522, "learning_rate": 2.9108216432865733e-05, "loss": 0.0498, "step": 2097 }, { "epoch": 8.392, "grad_norm": 1.4126864671707153, "learning_rate": 2.909819639278557e-05, "loss": 0.0485, "step": 2098 }, { "epoch": 8.396, "grad_norm": 1.0817058086395264, "learning_rate": 2.908817635270541e-05, "loss": 0.0342, "step": 2099 }, { "epoch": 8.4, "grad_norm": 1.1074159145355225, "learning_rate": 2.9078156312625253e-05, "loss": 0.0371, "step": 2100 }, { "epoch": 8.404, "grad_norm": 1.0859711170196533, "learning_rate": 2.9068136272545094e-05, "loss": 0.0408, "step": 2101 }, { "epoch": 8.408, "grad_norm": 1.2138441801071167, "learning_rate": 2.9058116232464932e-05, "loss": 0.0497, "step": 2102 }, { "epoch": 8.412, "grad_norm": 0.9061650037765503, "learning_rate": 2.904809619238477e-05, "loss": 0.0325, "step": 2103 }, { "epoch": 8.416, "grad_norm": 1.2485976219177246, "learning_rate": 2.903807615230461e-05, "loss": 0.0525, "step": 2104 }, { "epoch": 8.42, "grad_norm": 1.1576125621795654, "learning_rate": 2.902805611222445e-05, "loss": 0.036, "step": 2105 }, { "epoch": 8.424, "grad_norm": 1.201346755027771, "learning_rate": 
2.901803607214429e-05, "loss": 0.0455, "step": 2106 }, { "epoch": 8.428, "grad_norm": 1.1513805389404297, "learning_rate": 2.900801603206413e-05, "loss": 0.0441, "step": 2107 }, { "epoch": 8.432, "grad_norm": 1.2415015697479248, "learning_rate": 2.8997995991983966e-05, "loss": 0.0485, "step": 2108 }, { "epoch": 8.436, "grad_norm": 1.4972500801086426, "learning_rate": 2.8987975951903808e-05, "loss": 0.0563, "step": 2109 }, { "epoch": 8.44, "grad_norm": 1.346211314201355, "learning_rate": 2.8977955911823652e-05, "loss": 0.0537, "step": 2110 }, { "epoch": 8.444, "grad_norm": 1.4355465173721313, "learning_rate": 2.896793587174349e-05, "loss": 0.0446, "step": 2111 }, { "epoch": 8.448, "grad_norm": 1.4319212436676025, "learning_rate": 2.8957915831663328e-05, "loss": 0.0439, "step": 2112 }, { "epoch": 8.452, "grad_norm": 1.0715012550354004, "learning_rate": 2.894789579158317e-05, "loss": 0.0377, "step": 2113 }, { "epoch": 8.456, "grad_norm": 1.2712820768356323, "learning_rate": 2.8937875751503007e-05, "loss": 0.0433, "step": 2114 }, { "epoch": 8.46, "grad_norm": 1.385491132736206, "learning_rate": 2.8927855711422845e-05, "loss": 0.0472, "step": 2115 }, { "epoch": 8.464, "grad_norm": 1.0765126943588257, "learning_rate": 2.8917835671342686e-05, "loss": 0.0392, "step": 2116 }, { "epoch": 8.468, "grad_norm": 0.964982807636261, "learning_rate": 2.8907815631262524e-05, "loss": 0.0364, "step": 2117 }, { "epoch": 8.472, "grad_norm": 1.0207576751708984, "learning_rate": 2.8897795591182362e-05, "loss": 0.0353, "step": 2118 }, { "epoch": 8.475999999999999, "grad_norm": 1.363625407218933, "learning_rate": 2.8887775551102207e-05, "loss": 0.0479, "step": 2119 }, { "epoch": 8.48, "grad_norm": 1.3118845224380493, "learning_rate": 2.8877755511022048e-05, "loss": 0.05, "step": 2120 }, { "epoch": 8.484, "grad_norm": 1.3515814542770386, "learning_rate": 2.8867735470941886e-05, "loss": 0.0449, "step": 2121 }, { "epoch": 8.488, "grad_norm": 1.2568910121917725, "learning_rate": 
2.8857715430861727e-05, "loss": 0.0421, "step": 2122 }, { "epoch": 8.492, "grad_norm": 1.1728308200836182, "learning_rate": 2.8847695390781565e-05, "loss": 0.0402, "step": 2123 }, { "epoch": 8.496, "grad_norm": 1.3032832145690918, "learning_rate": 2.8837675350701403e-05, "loss": 0.0431, "step": 2124 }, { "epoch": 8.5, "grad_norm": 1.277818202972412, "learning_rate": 2.8827655310621244e-05, "loss": 0.0703, "step": 2125 }, { "epoch": 8.504, "grad_norm": 1.165521264076233, "learning_rate": 2.8817635270541082e-05, "loss": 0.0411, "step": 2126 }, { "epoch": 8.508, "grad_norm": 1.190546989440918, "learning_rate": 2.880761523046092e-05, "loss": 0.0426, "step": 2127 }, { "epoch": 8.512, "grad_norm": 1.323736310005188, "learning_rate": 2.879759519038076e-05, "loss": 0.0498, "step": 2128 }, { "epoch": 8.516, "grad_norm": 1.3947193622589111, "learning_rate": 2.8787575150300606e-05, "loss": 0.0562, "step": 2129 }, { "epoch": 8.52, "grad_norm": 1.3394712209701538, "learning_rate": 2.8777555110220444e-05, "loss": 0.0568, "step": 2130 }, { "epoch": 8.524000000000001, "grad_norm": 1.2725778818130493, "learning_rate": 2.8767535070140282e-05, "loss": 0.0453, "step": 2131 }, { "epoch": 8.528, "grad_norm": 0.6503176093101501, "learning_rate": 2.8757515030060123e-05, "loss": 0.0155, "step": 2132 }, { "epoch": 8.532, "grad_norm": 1.112267017364502, "learning_rate": 2.874749498997996e-05, "loss": 0.0351, "step": 2133 }, { "epoch": 8.536, "grad_norm": 1.2222670316696167, "learning_rate": 2.8737474949899802e-05, "loss": 0.04, "step": 2134 }, { "epoch": 8.54, "grad_norm": 1.2434343099594116, "learning_rate": 2.872745490981964e-05, "loss": 0.0432, "step": 2135 }, { "epoch": 8.544, "grad_norm": 1.369353175163269, "learning_rate": 2.8717434869739478e-05, "loss": 0.0541, "step": 2136 }, { "epoch": 8.548, "grad_norm": 1.3330413103103638, "learning_rate": 2.870741482965932e-05, "loss": 0.043, "step": 2137 }, { "epoch": 8.552, "grad_norm": 1.0383684635162354, "learning_rate": 
2.8697394789579157e-05, "loss": 0.0414, "step": 2138 }, { "epoch": 8.556000000000001, "grad_norm": 1.3430683612823486, "learning_rate": 2.8687374749499002e-05, "loss": 0.0558, "step": 2139 }, { "epoch": 8.56, "grad_norm": 1.6391087770462036, "learning_rate": 2.867735470941884e-05, "loss": 0.0523, "step": 2140 }, { "epoch": 8.564, "grad_norm": 1.2342334985733032, "learning_rate": 2.866733466933868e-05, "loss": 0.0495, "step": 2141 }, { "epoch": 8.568, "grad_norm": 1.2186678647994995, "learning_rate": 2.865731462925852e-05, "loss": 0.0468, "step": 2142 }, { "epoch": 8.572, "grad_norm": 1.2047197818756104, "learning_rate": 2.8647294589178357e-05, "loss": 0.0485, "step": 2143 }, { "epoch": 8.576, "grad_norm": 1.139602541923523, "learning_rate": 2.86372745490982e-05, "loss": 0.0398, "step": 2144 }, { "epoch": 8.58, "grad_norm": 1.393538475036621, "learning_rate": 2.8627254509018036e-05, "loss": 0.0499, "step": 2145 }, { "epoch": 8.584, "grad_norm": 0.7064828276634216, "learning_rate": 2.8617234468937874e-05, "loss": 0.0208, "step": 2146 }, { "epoch": 8.588, "grad_norm": 1.0699363946914673, "learning_rate": 2.8607214428857715e-05, "loss": 0.0452, "step": 2147 }, { "epoch": 8.592, "grad_norm": 0.9702959060668945, "learning_rate": 2.8597194388777553e-05, "loss": 0.0258, "step": 2148 }, { "epoch": 8.596, "grad_norm": 1.0714532136917114, "learning_rate": 2.8587174348697398e-05, "loss": 0.0399, "step": 2149 }, { "epoch": 8.6, "grad_norm": 0.7246587872505188, "learning_rate": 2.857715430861724e-05, "loss": 0.0249, "step": 2150 }, { "epoch": 8.604, "grad_norm": 1.354813814163208, "learning_rate": 2.8567134268537077e-05, "loss": 0.0458, "step": 2151 }, { "epoch": 8.608, "grad_norm": 1.3162986040115356, "learning_rate": 2.8557114228456915e-05, "loss": 0.0508, "step": 2152 }, { "epoch": 8.612, "grad_norm": 1.2697739601135254, "learning_rate": 2.8547094188376756e-05, "loss": 0.0472, "step": 2153 }, { "epoch": 8.616, "grad_norm": 1.3663972616195679, "learning_rate": 
2.8537074148296594e-05, "loss": 0.0535, "step": 2154 }, { "epoch": 8.62, "grad_norm": 1.2056630849838257, "learning_rate": 2.8527054108216432e-05, "loss": 0.0419, "step": 2155 }, { "epoch": 8.624, "grad_norm": 1.1879265308380127, "learning_rate": 2.8517034068136273e-05, "loss": 0.042, "step": 2156 }, { "epoch": 8.628, "grad_norm": 1.250275731086731, "learning_rate": 2.850701402805611e-05, "loss": 0.0428, "step": 2157 }, { "epoch": 8.632, "grad_norm": 1.2584730386734009, "learning_rate": 2.849699398797595e-05, "loss": 0.0453, "step": 2158 }, { "epoch": 8.636, "grad_norm": 1.3111329078674316, "learning_rate": 2.8486973947895794e-05, "loss": 0.0501, "step": 2159 }, { "epoch": 8.64, "grad_norm": 1.344029426574707, "learning_rate": 2.8476953907815635e-05, "loss": 0.0479, "step": 2160 }, { "epoch": 8.644, "grad_norm": 1.4813225269317627, "learning_rate": 2.8466933867735473e-05, "loss": 0.0489, "step": 2161 }, { "epoch": 8.648, "grad_norm": 1.3338834047317505, "learning_rate": 2.8456913827655314e-05, "loss": 0.0481, "step": 2162 }, { "epoch": 8.652, "grad_norm": 1.295943260192871, "learning_rate": 2.8446893787575152e-05, "loss": 0.0403, "step": 2163 }, { "epoch": 8.656, "grad_norm": 1.1170644760131836, "learning_rate": 2.843687374749499e-05, "loss": 0.0453, "step": 2164 }, { "epoch": 8.66, "grad_norm": 1.6115245819091797, "learning_rate": 2.842685370741483e-05, "loss": 0.0496, "step": 2165 }, { "epoch": 8.664, "grad_norm": 1.2752050161361694, "learning_rate": 2.841683366733467e-05, "loss": 0.0441, "step": 2166 }, { "epoch": 8.668, "grad_norm": 1.237499475479126, "learning_rate": 2.8406813627254507e-05, "loss": 0.0474, "step": 2167 }, { "epoch": 8.672, "grad_norm": 1.2884820699691772, "learning_rate": 2.839679358717435e-05, "loss": 0.0489, "step": 2168 }, { "epoch": 8.676, "grad_norm": 1.2853950262069702, "learning_rate": 2.8386773547094193e-05, "loss": 0.0514, "step": 2169 }, { "epoch": 8.68, "grad_norm": 1.3021364212036133, "learning_rate": 2.837675350701403e-05, "loss": 
0.0451, "step": 2170 }, { "epoch": 8.684, "grad_norm": 1.1973280906677246, "learning_rate": 2.836673346693387e-05, "loss": 0.046, "step": 2171 }, { "epoch": 8.688, "grad_norm": 0.8755884170532227, "learning_rate": 2.835671342685371e-05, "loss": 0.0345, "step": 2172 }, { "epoch": 8.692, "grad_norm": 1.2425113916397095, "learning_rate": 2.8346693386773548e-05, "loss": 0.0407, "step": 2173 }, { "epoch": 8.696, "grad_norm": 1.3100374937057495, "learning_rate": 2.8336673346693386e-05, "loss": 0.0459, "step": 2174 }, { "epoch": 8.7, "grad_norm": 1.5196589231491089, "learning_rate": 2.8326653306613227e-05, "loss": 0.0492, "step": 2175 }, { "epoch": 8.704, "grad_norm": 1.3412437438964844, "learning_rate": 2.8316633266533065e-05, "loss": 0.0522, "step": 2176 }, { "epoch": 8.708, "grad_norm": 1.1661128997802734, "learning_rate": 2.8306613226452906e-05, "loss": 0.0533, "step": 2177 }, { "epoch": 8.712, "grad_norm": 1.1783884763717651, "learning_rate": 2.829659318637275e-05, "loss": 0.0511, "step": 2178 }, { "epoch": 8.716, "grad_norm": 1.162697434425354, "learning_rate": 2.828657314629259e-05, "loss": 0.0452, "step": 2179 }, { "epoch": 8.72, "grad_norm": 1.5059949159622192, "learning_rate": 2.8276553106212427e-05, "loss": 0.0503, "step": 2180 }, { "epoch": 8.724, "grad_norm": 1.33864426612854, "learning_rate": 2.8266533066132268e-05, "loss": 0.0519, "step": 2181 }, { "epoch": 8.728, "grad_norm": 1.316199779510498, "learning_rate": 2.8256513026052106e-05, "loss": 0.0515, "step": 2182 }, { "epoch": 8.732, "grad_norm": 1.2155145406723022, "learning_rate": 2.8246492985971944e-05, "loss": 0.0425, "step": 2183 }, { "epoch": 8.736, "grad_norm": 1.4967896938323975, "learning_rate": 2.8236472945891785e-05, "loss": 0.0589, "step": 2184 }, { "epoch": 8.74, "grad_norm": 1.188382863998413, "learning_rate": 2.8226452905811623e-05, "loss": 0.0446, "step": 2185 }, { "epoch": 8.744, "grad_norm": 1.3887944221496582, "learning_rate": 2.821643286573146e-05, "loss": 0.0513, "step": 2186 }, { 
"epoch": 8.748, "grad_norm": 1.4028888940811157, "learning_rate": 2.8206412825651302e-05, "loss": 0.0536, "step": 2187 }, { "epoch": 8.752, "grad_norm": 1.4226263761520386, "learning_rate": 2.8196392785571147e-05, "loss": 0.0561, "step": 2188 }, { "epoch": 8.756, "grad_norm": 1.1816654205322266, "learning_rate": 2.8186372745490985e-05, "loss": 0.039, "step": 2189 }, { "epoch": 8.76, "grad_norm": 1.3053758144378662, "learning_rate": 2.8176352705410823e-05, "loss": 0.043, "step": 2190 }, { "epoch": 8.764, "grad_norm": 1.021385669708252, "learning_rate": 2.8166332665330664e-05, "loss": 0.0425, "step": 2191 }, { "epoch": 8.768, "grad_norm": 1.6020197868347168, "learning_rate": 2.8156312625250502e-05, "loss": 0.0488, "step": 2192 }, { "epoch": 8.772, "grad_norm": 1.498152256011963, "learning_rate": 2.8146292585170343e-05, "loss": 0.0512, "step": 2193 }, { "epoch": 8.776, "grad_norm": 1.3913947343826294, "learning_rate": 2.813627254509018e-05, "loss": 0.0509, "step": 2194 }, { "epoch": 8.78, "grad_norm": 1.3088997602462769, "learning_rate": 2.812625250501002e-05, "loss": 0.0483, "step": 2195 }, { "epoch": 8.784, "grad_norm": 0.9024881720542908, "learning_rate": 2.811623246492986e-05, "loss": 0.0297, "step": 2196 }, { "epoch": 8.788, "grad_norm": 1.3125743865966797, "learning_rate": 2.8106212424849698e-05, "loss": 0.0552, "step": 2197 }, { "epoch": 8.792, "grad_norm": 1.094606876373291, "learning_rate": 2.8096192384769543e-05, "loss": 0.0398, "step": 2198 }, { "epoch": 8.796, "grad_norm": 1.1351479291915894, "learning_rate": 2.808617234468938e-05, "loss": 0.0518, "step": 2199 }, { "epoch": 8.8, "grad_norm": 1.218656063079834, "learning_rate": 2.8076152304609222e-05, "loss": 0.0547, "step": 2200 }, { "epoch": 8.804, "grad_norm": 1.1994880437850952, "learning_rate": 2.806613226452906e-05, "loss": 0.0405, "step": 2201 }, { "epoch": 8.808, "grad_norm": 1.25562584400177, "learning_rate": 2.8056112224448898e-05, "loss": 0.0537, "step": 2202 }, { "epoch": 8.812, "grad_norm": 
1.5509350299835205, "learning_rate": 2.804609218436874e-05, "loss": 0.0624, "step": 2203 }, { "epoch": 8.816, "grad_norm": 1.4089373350143433, "learning_rate": 2.8036072144288577e-05, "loss": 0.0627, "step": 2204 }, { "epoch": 8.82, "grad_norm": 1.2322133779525757, "learning_rate": 2.802605210420842e-05, "loss": 0.0432, "step": 2205 }, { "epoch": 8.824, "grad_norm": 1.5502153635025024, "learning_rate": 2.8016032064128256e-05, "loss": 0.0682, "step": 2206 }, { "epoch": 8.828, "grad_norm": 1.2696598768234253, "learning_rate": 2.8006012024048094e-05, "loss": 0.046, "step": 2207 }, { "epoch": 8.832, "grad_norm": 1.2160747051239014, "learning_rate": 2.799599198396794e-05, "loss": 0.0401, "step": 2208 }, { "epoch": 8.836, "grad_norm": 1.38071608543396, "learning_rate": 2.798597194388778e-05, "loss": 0.0495, "step": 2209 }, { "epoch": 8.84, "grad_norm": 1.6408637762069702, "learning_rate": 2.7975951903807618e-05, "loss": 0.0646, "step": 2210 }, { "epoch": 8.844, "grad_norm": 1.340449333190918, "learning_rate": 2.7965931863727456e-05, "loss": 0.0475, "step": 2211 }, { "epoch": 8.848, "grad_norm": 1.3540539741516113, "learning_rate": 2.7955911823647297e-05, "loss": 0.0526, "step": 2212 }, { "epoch": 8.852, "grad_norm": 1.4355782270431519, "learning_rate": 2.7945891783567135e-05, "loss": 0.0478, "step": 2213 }, { "epoch": 8.856, "grad_norm": 1.5981539487838745, "learning_rate": 2.7935871743486973e-05, "loss": 0.0546, "step": 2214 }, { "epoch": 8.86, "grad_norm": 1.3910454511642456, "learning_rate": 2.7925851703406814e-05, "loss": 0.0513, "step": 2215 }, { "epoch": 8.864, "grad_norm": 1.2546508312225342, "learning_rate": 2.7915831663326652e-05, "loss": 0.0506, "step": 2216 }, { "epoch": 8.868, "grad_norm": 1.4212666749954224, "learning_rate": 2.790581162324649e-05, "loss": 0.05, "step": 2217 }, { "epoch": 8.872, "grad_norm": 1.2646417617797852, "learning_rate": 2.7895791583166335e-05, "loss": 0.046, "step": 2218 }, { "epoch": 8.876, "grad_norm": 1.1945741176605225, 
"learning_rate": 2.7885771543086176e-05, "loss": 0.0445, "step": 2219 }, { "epoch": 8.88, "grad_norm": 1.351228952407837, "learning_rate": 2.7875751503006014e-05, "loss": 0.0574, "step": 2220 }, { "epoch": 8.884, "grad_norm": 1.2459324598312378, "learning_rate": 2.7865731462925855e-05, "loss": 0.0479, "step": 2221 }, { "epoch": 8.888, "grad_norm": 1.5551952123641968, "learning_rate": 2.7855711422845693e-05, "loss": 0.0703, "step": 2222 }, { "epoch": 8.892, "grad_norm": 1.3713449239730835, "learning_rate": 2.784569138276553e-05, "loss": 0.049, "step": 2223 }, { "epoch": 8.896, "grad_norm": 1.3457205295562744, "learning_rate": 2.7835671342685372e-05, "loss": 0.0417, "step": 2224 }, { "epoch": 8.9, "grad_norm": 1.2651445865631104, "learning_rate": 2.782565130260521e-05, "loss": 0.0554, "step": 2225 }, { "epoch": 8.904, "grad_norm": 1.3092762231826782, "learning_rate": 2.7815631262525048e-05, "loss": 0.05, "step": 2226 }, { "epoch": 8.908, "grad_norm": 1.3287698030471802, "learning_rate": 2.780561122244489e-05, "loss": 0.0495, "step": 2227 }, { "epoch": 8.912, "grad_norm": 1.1512635946273804, "learning_rate": 2.7795591182364734e-05, "loss": 0.0453, "step": 2228 }, { "epoch": 8.916, "grad_norm": 1.3240880966186523, "learning_rate": 2.7785571142284572e-05, "loss": 0.0417, "step": 2229 }, { "epoch": 8.92, "grad_norm": 1.21474289894104, "learning_rate": 2.777555110220441e-05, "loss": 0.047, "step": 2230 }, { "epoch": 8.924, "grad_norm": 1.4137842655181885, "learning_rate": 2.776553106212425e-05, "loss": 0.0747, "step": 2231 }, { "epoch": 8.928, "grad_norm": 1.2826186418533325, "learning_rate": 2.775551102204409e-05, "loss": 0.0541, "step": 2232 }, { "epoch": 8.932, "grad_norm": 1.2921860218048096, "learning_rate": 2.774549098196393e-05, "loss": 0.0671, "step": 2233 }, { "epoch": 8.936, "grad_norm": 1.2751491069793701, "learning_rate": 2.7735470941883768e-05, "loss": 0.0357, "step": 2234 }, { "epoch": 8.94, "grad_norm": 1.2974969148635864, "learning_rate": 
2.7725450901803606e-05, "loss": 0.05, "step": 2235 }, { "epoch": 8.943999999999999, "grad_norm": 1.132696509361267, "learning_rate": 2.7715430861723447e-05, "loss": 0.0425, "step": 2236 }, { "epoch": 8.948, "grad_norm": 0.6913595199584961, "learning_rate": 2.7705410821643292e-05, "loss": 0.0187, "step": 2237 }, { "epoch": 8.952, "grad_norm": 1.1450154781341553, "learning_rate": 2.769539078156313e-05, "loss": 0.0421, "step": 2238 }, { "epoch": 8.956, "grad_norm": 1.154909372329712, "learning_rate": 2.7685370741482968e-05, "loss": 0.0384, "step": 2239 }, { "epoch": 8.96, "grad_norm": 1.3707146644592285, "learning_rate": 2.767535070140281e-05, "loss": 0.0517, "step": 2240 }, { "epoch": 8.964, "grad_norm": 1.5236717462539673, "learning_rate": 2.7665330661322647e-05, "loss": 0.0684, "step": 2241 }, { "epoch": 8.968, "grad_norm": 1.2843303680419922, "learning_rate": 2.7655310621242485e-05, "loss": 0.0427, "step": 2242 }, { "epoch": 8.972, "grad_norm": 1.1353422403335571, "learning_rate": 2.7645290581162326e-05, "loss": 0.043, "step": 2243 }, { "epoch": 8.975999999999999, "grad_norm": 1.2605583667755127, "learning_rate": 2.7635270541082164e-05, "loss": 0.0486, "step": 2244 }, { "epoch": 8.98, "grad_norm": 1.3495458364486694, "learning_rate": 2.7625250501002002e-05, "loss": 0.0474, "step": 2245 }, { "epoch": 8.984, "grad_norm": 1.3079636096954346, "learning_rate": 2.7615230460921843e-05, "loss": 0.0639, "step": 2246 }, { "epoch": 8.988, "grad_norm": 1.171911358833313, "learning_rate": 2.7605210420841688e-05, "loss": 0.0417, "step": 2247 }, { "epoch": 8.992, "grad_norm": 1.34932541847229, "learning_rate": 2.7595190380761526e-05, "loss": 0.0482, "step": 2248 }, { "epoch": 8.996, "grad_norm": 1.653420329093933, "learning_rate": 2.7585170340681367e-05, "loss": 0.0602, "step": 2249 }, { "epoch": 9.0, "grad_norm": 1.4358410835266113, "learning_rate": 2.7575150300601205e-05, "loss": 0.0582, "step": 2250 }, { "epoch": 9.004, "grad_norm": 0.6799370050430298, "learning_rate": 
2.7565130260521043e-05, "loss": 0.0219, "step": 2251 }, { "epoch": 9.008, "grad_norm": 0.6610779762268066, "learning_rate": 2.7555110220440884e-05, "loss": 0.022, "step": 2252 }, { "epoch": 9.012, "grad_norm": 0.9135136604309082, "learning_rate": 2.7545090180360722e-05, "loss": 0.0262, "step": 2253 }, { "epoch": 9.016, "grad_norm": 0.8344765305519104, "learning_rate": 2.753507014028056e-05, "loss": 0.0254, "step": 2254 }, { "epoch": 9.02, "grad_norm": 0.8494986891746521, "learning_rate": 2.75250501002004e-05, "loss": 0.0273, "step": 2255 }, { "epoch": 9.024, "grad_norm": 0.9093011021614075, "learning_rate": 2.751503006012024e-05, "loss": 0.0281, "step": 2256 }, { "epoch": 9.028, "grad_norm": 0.9227543473243713, "learning_rate": 2.7505010020040084e-05, "loss": 0.0294, "step": 2257 }, { "epoch": 9.032, "grad_norm": 0.6513843536376953, "learning_rate": 2.749498997995992e-05, "loss": 0.0201, "step": 2258 }, { "epoch": 9.036, "grad_norm": 0.8121753334999084, "learning_rate": 2.7484969939879763e-05, "loss": 0.0262, "step": 2259 }, { "epoch": 9.04, "grad_norm": 1.054721713066101, "learning_rate": 2.74749498997996e-05, "loss": 0.0277, "step": 2260 }, { "epoch": 9.044, "grad_norm": 0.8000840544700623, "learning_rate": 2.7464929859719442e-05, "loss": 0.0227, "step": 2261 }, { "epoch": 9.048, "grad_norm": 0.8721778988838196, "learning_rate": 2.745490981963928e-05, "loss": 0.0236, "step": 2262 }, { "epoch": 9.052, "grad_norm": 0.9745427370071411, "learning_rate": 2.7444889779559118e-05, "loss": 0.0236, "step": 2263 }, { "epoch": 9.056, "grad_norm": 0.972322404384613, "learning_rate": 2.743486973947896e-05, "loss": 0.0278, "step": 2264 }, { "epoch": 9.06, "grad_norm": 1.111842155456543, "learning_rate": 2.7424849699398797e-05, "loss": 0.0299, "step": 2265 }, { "epoch": 9.064, "grad_norm": 0.7864890694618225, "learning_rate": 2.7414829659318635e-05, "loss": 0.0243, "step": 2266 }, { "epoch": 9.068, "grad_norm": 1.2372549772262573, "learning_rate": 2.740480961923848e-05, "loss": 
0.043, "step": 2267 }, { "epoch": 9.072, "grad_norm": 0.7905459403991699, "learning_rate": 2.739478957915832e-05, "loss": 0.021, "step": 2268 }, { "epoch": 9.076, "grad_norm": 1.100546956062317, "learning_rate": 2.738476953907816e-05, "loss": 0.0277, "step": 2269 }, { "epoch": 9.08, "grad_norm": 0.8412137031555176, "learning_rate": 2.7374749498997997e-05, "loss": 0.025, "step": 2270 }, { "epoch": 9.084, "grad_norm": 0.9313427805900574, "learning_rate": 2.7364729458917838e-05, "loss": 0.0231, "step": 2271 }, { "epoch": 9.088, "grad_norm": 1.1528956890106201, "learning_rate": 2.7354709418837676e-05, "loss": 0.0338, "step": 2272 }, { "epoch": 9.092, "grad_norm": 0.7774552702903748, "learning_rate": 2.7344689378757514e-05, "loss": 0.0247, "step": 2273 }, { "epoch": 9.096, "grad_norm": 1.030387282371521, "learning_rate": 2.7334669338677355e-05, "loss": 0.0266, "step": 2274 }, { "epoch": 9.1, "grad_norm": 0.8086778521537781, "learning_rate": 2.7324649298597193e-05, "loss": 0.0246, "step": 2275 }, { "epoch": 9.104, "grad_norm": 0.8489255905151367, "learning_rate": 2.7314629258517034e-05, "loss": 0.0249, "step": 2276 }, { "epoch": 9.108, "grad_norm": 0.7739505171775818, "learning_rate": 2.730460921843688e-05, "loss": 0.0257, "step": 2277 }, { "epoch": 9.112, "grad_norm": 0.8986899852752686, "learning_rate": 2.7294589178356717e-05, "loss": 0.0258, "step": 2278 }, { "epoch": 9.116, "grad_norm": 1.1886520385742188, "learning_rate": 2.7284569138276555e-05, "loss": 0.0349, "step": 2279 }, { "epoch": 9.12, "grad_norm": 0.9789061546325684, "learning_rate": 2.7274549098196396e-05, "loss": 0.0277, "step": 2280 }, { "epoch": 9.124, "grad_norm": 1.192771315574646, "learning_rate": 2.7264529058116234e-05, "loss": 0.0336, "step": 2281 }, { "epoch": 9.128, "grad_norm": 1.048073649406433, "learning_rate": 2.7254509018036072e-05, "loss": 0.031, "step": 2282 }, { "epoch": 9.132, "grad_norm": 1.163774013519287, "learning_rate": 2.7244488977955913e-05, "loss": 0.0309, "step": 2283 }, { 
"epoch": 9.136, "grad_norm": 0.8994669914245605, "learning_rate": 2.723446893787575e-05, "loss": 0.0204, "step": 2284 }, { "epoch": 9.14, "grad_norm": 1.123653769493103, "learning_rate": 2.722444889779559e-05, "loss": 0.0325, "step": 2285 }, { "epoch": 9.144, "grad_norm": 0.9699426889419556, "learning_rate": 2.7214428857715433e-05, "loss": 0.0253, "step": 2286 }, { "epoch": 9.148, "grad_norm": 0.8264506459236145, "learning_rate": 2.7204408817635275e-05, "loss": 0.0234, "step": 2287 }, { "epoch": 9.152, "grad_norm": 1.0332759618759155, "learning_rate": 2.7194388777555113e-05, "loss": 0.0315, "step": 2288 }, { "epoch": 9.156, "grad_norm": 1.0331981182098389, "learning_rate": 2.7184368737474954e-05, "loss": 0.0336, "step": 2289 }, { "epoch": 9.16, "grad_norm": 2.753606081008911, "learning_rate": 2.7174348697394792e-05, "loss": 0.0322, "step": 2290 }, { "epoch": 9.164, "grad_norm": 0.8838220834732056, "learning_rate": 2.716432865731463e-05, "loss": 0.0244, "step": 2291 }, { "epoch": 9.168, "grad_norm": 0.9516180753707886, "learning_rate": 2.715430861723447e-05, "loss": 0.0325, "step": 2292 }, { "epoch": 9.172, "grad_norm": 0.9610980153083801, "learning_rate": 2.714428857715431e-05, "loss": 0.0257, "step": 2293 }, { "epoch": 9.176, "grad_norm": 1.0998841524124146, "learning_rate": 2.7134268537074147e-05, "loss": 0.0291, "step": 2294 }, { "epoch": 9.18, "grad_norm": 1.0278939008712769, "learning_rate": 2.7124248496993988e-05, "loss": 0.0308, "step": 2295 }, { "epoch": 9.184, "grad_norm": 1.050432562828064, "learning_rate": 2.7114228456913833e-05, "loss": 0.0362, "step": 2296 }, { "epoch": 9.188, "grad_norm": 1.3083264827728271, "learning_rate": 2.710420841683367e-05, "loss": 0.0347, "step": 2297 }, { "epoch": 9.192, "grad_norm": 1.0661804676055908, "learning_rate": 2.709418837675351e-05, "loss": 0.0317, "step": 2298 }, { "epoch": 9.196, "grad_norm": 0.8684881925582886, "learning_rate": 2.708416833667335e-05, "loss": 0.028, "step": 2299 }, { "epoch": 9.2, "grad_norm": 
0.4513463079929352, "learning_rate": 2.7074148296593188e-05, "loss": 0.0072, "step": 2300 }, { "epoch": 9.204, "grad_norm": 0.9160906076431274, "learning_rate": 2.7064128256513026e-05, "loss": 0.0268, "step": 2301 }, { "epoch": 9.208, "grad_norm": 1.1022686958312988, "learning_rate": 2.7054108216432867e-05, "loss": 0.0261, "step": 2302 }, { "epoch": 9.212, "grad_norm": 1.2009657621383667, "learning_rate": 2.7044088176352705e-05, "loss": 0.0388, "step": 2303 }, { "epoch": 9.216, "grad_norm": 1.1833915710449219, "learning_rate": 2.7034068136272546e-05, "loss": 0.031, "step": 2304 }, { "epoch": 9.22, "grad_norm": 1.0764391422271729, "learning_rate": 2.7024048096192384e-05, "loss": 0.0327, "step": 2305 }, { "epoch": 9.224, "grad_norm": 0.9721907377243042, "learning_rate": 2.701402805611223e-05, "loss": 0.0326, "step": 2306 }, { "epoch": 9.228, "grad_norm": 1.1473323106765747, "learning_rate": 2.7004008016032067e-05, "loss": 0.0345, "step": 2307 }, { "epoch": 9.232, "grad_norm": 1.1332943439483643, "learning_rate": 2.6993987975951908e-05, "loss": 0.0313, "step": 2308 }, { "epoch": 9.236, "grad_norm": 1.263730525970459, "learning_rate": 2.6983967935871746e-05, "loss": 0.0302, "step": 2309 }, { "epoch": 9.24, "grad_norm": 1.0454695224761963, "learning_rate": 2.6973947895791584e-05, "loss": 0.028, "step": 2310 }, { "epoch": 9.244, "grad_norm": 1.1046359539031982, "learning_rate": 2.6963927855711425e-05, "loss": 0.0343, "step": 2311 }, { "epoch": 9.248, "grad_norm": 1.1158086061477661, "learning_rate": 2.6953907815631263e-05, "loss": 0.0298, "step": 2312 }, { "epoch": 9.252, "grad_norm": 1.1569856405258179, "learning_rate": 2.69438877755511e-05, "loss": 0.0281, "step": 2313 }, { "epoch": 9.256, "grad_norm": 0.6049909591674805, "learning_rate": 2.6933867735470942e-05, "loss": 0.0132, "step": 2314 }, { "epoch": 9.26, "grad_norm": 1.2866177558898926, "learning_rate": 2.692384769539078e-05, "loss": 0.0361, "step": 2315 }, { "epoch": 9.264, "grad_norm": 0.9961463809013367, 
"learning_rate": 2.6913827655310625e-05, "loss": 0.0267, "step": 2316 }, { "epoch": 9.268, "grad_norm": 1.083455204963684, "learning_rate": 2.6903807615230462e-05, "loss": 0.0271, "step": 2317 }, { "epoch": 9.272, "grad_norm": 0.8019450306892395, "learning_rate": 2.6893787575150304e-05, "loss": 0.0313, "step": 2318 }, { "epoch": 9.276, "grad_norm": 1.0289325714111328, "learning_rate": 2.688376753507014e-05, "loss": 0.0338, "step": 2319 }, { "epoch": 9.28, "grad_norm": 1.015210509300232, "learning_rate": 2.6873747494989983e-05, "loss": 0.028, "step": 2320 }, { "epoch": 9.284, "grad_norm": 0.8635653257369995, "learning_rate": 2.686372745490982e-05, "loss": 0.0256, "step": 2321 }, { "epoch": 9.288, "grad_norm": 0.9069684743881226, "learning_rate": 2.685370741482966e-05, "loss": 0.0324, "step": 2322 }, { "epoch": 9.292, "grad_norm": 1.1683540344238281, "learning_rate": 2.68436873747495e-05, "loss": 0.0279, "step": 2323 }, { "epoch": 9.296, "grad_norm": 1.084283471107483, "learning_rate": 2.6833667334669338e-05, "loss": 0.033, "step": 2324 }, { "epoch": 9.3, "grad_norm": 1.0750457048416138, "learning_rate": 2.6823647294589176e-05, "loss": 0.026, "step": 2325 }, { "epoch": 9.304, "grad_norm": 1.1331920623779297, "learning_rate": 2.681362725450902e-05, "loss": 0.0282, "step": 2326 }, { "epoch": 9.308, "grad_norm": 0.7947516441345215, "learning_rate": 2.6803607214428862e-05, "loss": 0.0229, "step": 2327 }, { "epoch": 9.312, "grad_norm": 1.2119550704956055, "learning_rate": 2.67935871743487e-05, "loss": 0.0352, "step": 2328 }, { "epoch": 9.316, "grad_norm": 0.9553737640380859, "learning_rate": 2.6783567134268537e-05, "loss": 0.0291, "step": 2329 }, { "epoch": 9.32, "grad_norm": 0.9401413202285767, "learning_rate": 2.677354709418838e-05, "loss": 0.0278, "step": 2330 }, { "epoch": 9.324, "grad_norm": 0.9966760873794556, "learning_rate": 2.6763527054108217e-05, "loss": 0.0247, "step": 2331 }, { "epoch": 9.328, "grad_norm": 1.0120066404342651, "learning_rate": 
2.6753507014028058e-05, "loss": 0.0313, "step": 2332 }, { "epoch": 9.332, "grad_norm": 1.1858829259872437, "learning_rate": 2.6743486973947896e-05, "loss": 0.0358, "step": 2333 }, { "epoch": 9.336, "grad_norm": 1.082135796546936, "learning_rate": 2.6733466933867734e-05, "loss": 0.0311, "step": 2334 }, { "epoch": 9.34, "grad_norm": 0.9590717554092407, "learning_rate": 2.6723446893787575e-05, "loss": 0.0316, "step": 2335 }, { "epoch": 9.344, "grad_norm": 1.2413767576217651, "learning_rate": 2.671342685370742e-05, "loss": 0.0373, "step": 2336 }, { "epoch": 9.348, "grad_norm": 1.1042641401290894, "learning_rate": 2.6703406813627258e-05, "loss": 0.0325, "step": 2337 }, { "epoch": 9.352, "grad_norm": 0.944624125957489, "learning_rate": 2.6693386773547095e-05, "loss": 0.0273, "step": 2338 }, { "epoch": 9.356, "grad_norm": 1.1185686588287354, "learning_rate": 2.6683366733466937e-05, "loss": 0.0377, "step": 2339 }, { "epoch": 9.36, "grad_norm": 1.205579400062561, "learning_rate": 2.6673346693386775e-05, "loss": 0.0345, "step": 2340 }, { "epoch": 9.364, "grad_norm": 1.2945799827575684, "learning_rate": 2.6663326653306613e-05, "loss": 0.0324, "step": 2341 }, { "epoch": 9.368, "grad_norm": 0.9837092161178589, "learning_rate": 2.6653306613226454e-05, "loss": 0.0276, "step": 2342 }, { "epoch": 9.372, "grad_norm": 1.1606407165527344, "learning_rate": 2.6643286573146292e-05, "loss": 0.0331, "step": 2343 }, { "epoch": 9.376, "grad_norm": 1.0431427955627441, "learning_rate": 2.663326653306613e-05, "loss": 0.0295, "step": 2344 }, { "epoch": 9.38, "grad_norm": 1.0889356136322021, "learning_rate": 2.6623246492985974e-05, "loss": 0.0291, "step": 2345 }, { "epoch": 9.384, "grad_norm": 0.9746689200401306, "learning_rate": 2.6613226452905816e-05, "loss": 0.0281, "step": 2346 }, { "epoch": 9.388, "grad_norm": 1.1333224773406982, "learning_rate": 2.6603206412825653e-05, "loss": 0.0306, "step": 2347 }, { "epoch": 9.392, "grad_norm": 1.082342505455017, "learning_rate": 2.6593186372745495e-05, 
"loss": 0.032, "step": 2348 }, { "epoch": 9.396, "grad_norm": 1.0573281049728394, "learning_rate": 2.6583166332665333e-05, "loss": 0.0261, "step": 2349 }, { "epoch": 9.4, "grad_norm": 1.1344096660614014, "learning_rate": 2.657314629258517e-05, "loss": 0.03, "step": 2350 }, { "epoch": 9.404, "grad_norm": 1.1539257764816284, "learning_rate": 2.6563126252505012e-05, "loss": 0.0325, "step": 2351 }, { "epoch": 9.408, "grad_norm": 1.318395733833313, "learning_rate": 2.655310621242485e-05, "loss": 0.0383, "step": 2352 }, { "epoch": 9.412, "grad_norm": 0.6710442900657654, "learning_rate": 2.6543086172344688e-05, "loss": 0.0202, "step": 2353 }, { "epoch": 9.416, "grad_norm": 1.0864232778549194, "learning_rate": 2.653306613226453e-05, "loss": 0.0286, "step": 2354 }, { "epoch": 9.42, "grad_norm": 1.046763300895691, "learning_rate": 2.6523046092184374e-05, "loss": 0.0292, "step": 2355 }, { "epoch": 9.424, "grad_norm": 1.1092429161071777, "learning_rate": 2.651302605210421e-05, "loss": 0.0308, "step": 2356 }, { "epoch": 9.428, "grad_norm": 1.0427887439727783, "learning_rate": 2.650300601202405e-05, "loss": 0.0282, "step": 2357 }, { "epoch": 9.432, "grad_norm": 1.171705722808838, "learning_rate": 2.649298597194389e-05, "loss": 0.0326, "step": 2358 }, { "epoch": 9.436, "grad_norm": 0.982445240020752, "learning_rate": 2.648296593186373e-05, "loss": 0.0275, "step": 2359 }, { "epoch": 9.44, "grad_norm": 0.9876815676689148, "learning_rate": 2.647294589178357e-05, "loss": 0.0366, "step": 2360 }, { "epoch": 9.444, "grad_norm": 0.9229361414909363, "learning_rate": 2.6462925851703408e-05, "loss": 0.0284, "step": 2361 }, { "epoch": 9.448, "grad_norm": 0.9455786943435669, "learning_rate": 2.6452905811623246e-05, "loss": 0.0279, "step": 2362 }, { "epoch": 9.452, "grad_norm": 0.5432707071304321, "learning_rate": 2.6442885771543087e-05, "loss": 0.0153, "step": 2363 }, { "epoch": 9.456, "grad_norm": 1.1233261823654175, "learning_rate": 2.6432865731462925e-05, "loss": 0.0322, "step": 2364 }, { 
"epoch": 9.46, "grad_norm": 1.0408973693847656, "learning_rate": 2.642284569138277e-05, "loss": 0.0334, "step": 2365 }, { "epoch": 9.464, "grad_norm": 1.3444018363952637, "learning_rate": 2.6412825651302607e-05, "loss": 0.0354, "step": 2366 }, { "epoch": 9.468, "grad_norm": 1.0790801048278809, "learning_rate": 2.640280561122245e-05, "loss": 0.0291, "step": 2367 }, { "epoch": 9.472, "grad_norm": 1.0651098489761353, "learning_rate": 2.6392785571142287e-05, "loss": 0.0412, "step": 2368 }, { "epoch": 9.475999999999999, "grad_norm": 1.0434693098068237, "learning_rate": 2.6382765531062124e-05, "loss": 0.0287, "step": 2369 }, { "epoch": 9.48, "grad_norm": 1.1507903337478638, "learning_rate": 2.6372745490981966e-05, "loss": 0.032, "step": 2370 }, { "epoch": 9.484, "grad_norm": 1.0209254026412964, "learning_rate": 2.6362725450901804e-05, "loss": 0.0312, "step": 2371 }, { "epoch": 9.488, "grad_norm": 1.2612141370773315, "learning_rate": 2.635270541082164e-05, "loss": 0.038, "step": 2372 }, { "epoch": 9.492, "grad_norm": 0.9201658368110657, "learning_rate": 2.6342685370741483e-05, "loss": 0.0302, "step": 2373 }, { "epoch": 9.496, "grad_norm": 1.2548552751541138, "learning_rate": 2.633266533066132e-05, "loss": 0.0356, "step": 2374 }, { "epoch": 9.5, "grad_norm": 0.9841686487197876, "learning_rate": 2.6322645290581165e-05, "loss": 0.0262, "step": 2375 }, { "epoch": 9.504, "grad_norm": 1.228925108909607, "learning_rate": 2.6312625250501007e-05, "loss": 0.0354, "step": 2376 }, { "epoch": 9.508, "grad_norm": 1.1918587684631348, "learning_rate": 2.6302605210420845e-05, "loss": 0.0334, "step": 2377 }, { "epoch": 9.512, "grad_norm": 1.1744786500930786, "learning_rate": 2.6292585170340682e-05, "loss": 0.0391, "step": 2378 }, { "epoch": 9.516, "grad_norm": 1.0026384592056274, "learning_rate": 2.6282565130260524e-05, "loss": 0.0361, "step": 2379 }, { "epoch": 9.52, "grad_norm": 1.3792084455490112, "learning_rate": 2.627254509018036e-05, "loss": 0.0323, "step": 2380 }, { "epoch": 
9.524000000000001, "grad_norm": 0.7252622842788696, "learning_rate": 2.62625250501002e-05, "loss": 0.0176, "step": 2381 }, { "epoch": 9.528, "grad_norm": 1.0033866167068481, "learning_rate": 2.625250501002004e-05, "loss": 0.0307, "step": 2382 }, { "epoch": 9.532, "grad_norm": 1.1288398504257202, "learning_rate": 2.624248496993988e-05, "loss": 0.0363, "step": 2383 }, { "epoch": 9.536, "grad_norm": 1.11295485496521, "learning_rate": 2.6232464929859717e-05, "loss": 0.0359, "step": 2384 }, { "epoch": 9.54, "grad_norm": 0.9084358215332031, "learning_rate": 2.622244488977956e-05, "loss": 0.026, "step": 2385 }, { "epoch": 9.544, "grad_norm": 1.091408371925354, "learning_rate": 2.6212424849699403e-05, "loss": 0.0328, "step": 2386 }, { "epoch": 9.548, "grad_norm": 1.1012197732925415, "learning_rate": 2.620240480961924e-05, "loss": 0.0337, "step": 2387 }, { "epoch": 9.552, "grad_norm": 1.2079898118972778, "learning_rate": 2.619238476953908e-05, "loss": 0.0318, "step": 2388 }, { "epoch": 9.556000000000001, "grad_norm": 0.6078687310218811, "learning_rate": 2.618236472945892e-05, "loss": 0.0192, "step": 2389 }, { "epoch": 9.56, "grad_norm": 0.9216514825820923, "learning_rate": 2.6172344689378757e-05, "loss": 0.0291, "step": 2390 }, { "epoch": 9.564, "grad_norm": 0.9543266892433167, "learning_rate": 2.61623246492986e-05, "loss": 0.0283, "step": 2391 }, { "epoch": 9.568, "grad_norm": 1.2358766794204712, "learning_rate": 2.6152304609218437e-05, "loss": 0.0338, "step": 2392 }, { "epoch": 9.572, "grad_norm": 1.0046981573104858, "learning_rate": 2.6142284569138275e-05, "loss": 0.0234, "step": 2393 }, { "epoch": 9.576, "grad_norm": 1.3283228874206543, "learning_rate": 2.6132264529058116e-05, "loss": 0.0488, "step": 2394 }, { "epoch": 9.58, "grad_norm": 0.8663226962089539, "learning_rate": 2.612224448897796e-05, "loss": 0.0219, "step": 2395 }, { "epoch": 9.584, "grad_norm": 1.2725272178649902, "learning_rate": 2.61122244488978e-05, "loss": 0.037, "step": 2396 }, { "epoch": 9.588, 
"grad_norm": 1.0414756536483765, "learning_rate": 2.6102204408817636e-05, "loss": 0.0337, "step": 2397 }, { "epoch": 9.592, "grad_norm": 1.0537278652191162, "learning_rate": 2.6092184368737478e-05, "loss": 0.0296, "step": 2398 }, { "epoch": 9.596, "grad_norm": 1.2478158473968506, "learning_rate": 2.6082164328657315e-05, "loss": 0.0344, "step": 2399 }, { "epoch": 9.6, "grad_norm": 1.1711362600326538, "learning_rate": 2.6072144288577153e-05, "loss": 0.036, "step": 2400 }, { "epoch": 9.604, "grad_norm": 1.2118823528289795, "learning_rate": 2.6062124248496995e-05, "loss": 0.0284, "step": 2401 }, { "epoch": 9.608, "grad_norm": 1.1261894702911377, "learning_rate": 2.6052104208416833e-05, "loss": 0.0339, "step": 2402 }, { "epoch": 9.612, "grad_norm": 1.8169751167297363, "learning_rate": 2.6042084168336674e-05, "loss": 0.0326, "step": 2403 }, { "epoch": 9.616, "grad_norm": 1.1546857357025146, "learning_rate": 2.603206412825652e-05, "loss": 0.0358, "step": 2404 }, { "epoch": 9.62, "grad_norm": 1.1949758529663086, "learning_rate": 2.6022044088176356e-05, "loss": 0.0337, "step": 2405 }, { "epoch": 9.624, "grad_norm": 0.9737666249275208, "learning_rate": 2.6012024048096194e-05, "loss": 0.0294, "step": 2406 }, { "epoch": 9.628, "grad_norm": 1.1526328325271606, "learning_rate": 2.6002004008016036e-05, "loss": 0.0343, "step": 2407 }, { "epoch": 9.632, "grad_norm": 1.266064167022705, "learning_rate": 2.5991983967935873e-05, "loss": 0.0309, "step": 2408 }, { "epoch": 9.636, "grad_norm": 1.0405514240264893, "learning_rate": 2.598196392785571e-05, "loss": 0.0325, "step": 2409 }, { "epoch": 9.64, "grad_norm": 1.1470423936843872, "learning_rate": 2.5971943887775553e-05, "loss": 0.0315, "step": 2410 }, { "epoch": 9.644, "grad_norm": 1.1034044027328491, "learning_rate": 2.596192384769539e-05, "loss": 0.0325, "step": 2411 }, { "epoch": 9.648, "grad_norm": 1.101983666419983, "learning_rate": 2.595190380761523e-05, "loss": 0.0281, "step": 2412 }, { "epoch": 9.652, "grad_norm": 
1.0074468851089478, "learning_rate": 2.594188376753507e-05, "loss": 0.0311, "step": 2413 }, { "epoch": 9.656, "grad_norm": 1.2595995664596558, "learning_rate": 2.5931863727454914e-05, "loss": 0.0396, "step": 2414 }, { "epoch": 9.66, "grad_norm": 1.195958137512207, "learning_rate": 2.5921843687374752e-05, "loss": 0.0329, "step": 2415 }, { "epoch": 9.664, "grad_norm": 1.3391655683517456, "learning_rate": 2.5911823647294594e-05, "loss": 0.0402, "step": 2416 }, { "epoch": 9.668, "grad_norm": 1.2114450931549072, "learning_rate": 2.590180360721443e-05, "loss": 0.0401, "step": 2417 }, { "epoch": 9.672, "grad_norm": 1.2215373516082764, "learning_rate": 2.589178356713427e-05, "loss": 0.0325, "step": 2418 }, { "epoch": 9.676, "grad_norm": 1.1048833131790161, "learning_rate": 2.588176352705411e-05, "loss": 0.0337, "step": 2419 }, { "epoch": 9.68, "grad_norm": 0.823464035987854, "learning_rate": 2.587174348697395e-05, "loss": 0.0204, "step": 2420 }, { "epoch": 9.684, "grad_norm": 1.2144451141357422, "learning_rate": 2.5861723446893786e-05, "loss": 0.0337, "step": 2421 }, { "epoch": 9.688, "grad_norm": 0.928974986076355, "learning_rate": 2.5851703406813628e-05, "loss": 0.0276, "step": 2422 }, { "epoch": 9.692, "grad_norm": 1.3975306749343872, "learning_rate": 2.5841683366733466e-05, "loss": 0.041, "step": 2423 }, { "epoch": 9.696, "grad_norm": 1.0173219442367554, "learning_rate": 2.583166332665331e-05, "loss": 0.0308, "step": 2424 }, { "epoch": 9.7, "grad_norm": 0.8563686013221741, "learning_rate": 2.5821643286573148e-05, "loss": 0.0308, "step": 2425 }, { "epoch": 9.704, "grad_norm": 0.930245041847229, "learning_rate": 2.581162324649299e-05, "loss": 0.0298, "step": 2426 }, { "epoch": 9.708, "grad_norm": 1.0278750658035278, "learning_rate": 2.5801603206412827e-05, "loss": 0.0298, "step": 2427 }, { "epoch": 9.712, "grad_norm": 0.9554607272148132, "learning_rate": 2.5791583166332665e-05, "loss": 0.0321, "step": 2428 }, { "epoch": 9.716, "grad_norm": 0.9845343828201294, 
"learning_rate": 2.5781563126252507e-05, "loss": 0.0365, "step": 2429 }, { "epoch": 9.72, "grad_norm": 0.959926187992096, "learning_rate": 2.5771543086172344e-05, "loss": 0.0323, "step": 2430 }, { "epoch": 9.724, "grad_norm": 1.3230634927749634, "learning_rate": 2.5761523046092186e-05, "loss": 0.0397, "step": 2431 }, { "epoch": 9.728, "grad_norm": 1.1665371656417847, "learning_rate": 2.5751503006012024e-05, "loss": 0.0299, "step": 2432 }, { "epoch": 9.732, "grad_norm": 1.1947264671325684, "learning_rate": 2.574148296593186e-05, "loss": 0.0344, "step": 2433 }, { "epoch": 9.736, "grad_norm": 1.088255763053894, "learning_rate": 2.5731462925851706e-05, "loss": 0.0308, "step": 2434 }, { "epoch": 9.74, "grad_norm": 1.1948275566101074, "learning_rate": 2.5721442885771547e-05, "loss": 0.0433, "step": 2435 }, { "epoch": 9.744, "grad_norm": 0.96909099817276, "learning_rate": 2.5711422845691385e-05, "loss": 0.0311, "step": 2436 }, { "epoch": 9.748, "grad_norm": 1.1078877449035645, "learning_rate": 2.5701402805611223e-05, "loss": 0.033, "step": 2437 }, { "epoch": 9.752, "grad_norm": 1.0920991897583008, "learning_rate": 2.5691382765531065e-05, "loss": 0.0345, "step": 2438 }, { "epoch": 9.756, "grad_norm": 1.28229558467865, "learning_rate": 2.5681362725450902e-05, "loss": 0.0386, "step": 2439 }, { "epoch": 9.76, "grad_norm": 1.235954761505127, "learning_rate": 2.567134268537074e-05, "loss": 0.037, "step": 2440 }, { "epoch": 9.764, "grad_norm": 1.089188575744629, "learning_rate": 2.566132264529058e-05, "loss": 0.0308, "step": 2441 }, { "epoch": 9.768, "grad_norm": 1.021936058998108, "learning_rate": 2.565130260521042e-05, "loss": 0.0275, "step": 2442 }, { "epoch": 9.772, "grad_norm": 1.178109049797058, "learning_rate": 2.564128256513026e-05, "loss": 0.0328, "step": 2443 }, { "epoch": 9.776, "grad_norm": 1.2006083726882935, "learning_rate": 2.5631262525050102e-05, "loss": 0.0372, "step": 2444 }, { "epoch": 9.78, "grad_norm": 1.1977381706237793, "learning_rate": 
2.5621242484969943e-05, "loss": 0.0331, "step": 2445 }, { "epoch": 9.784, "grad_norm": 1.240316390991211, "learning_rate": 2.561122244488978e-05, "loss": 0.0335, "step": 2446 }, { "epoch": 9.788, "grad_norm": 1.1987797021865845, "learning_rate": 2.5601202404809622e-05, "loss": 0.0341, "step": 2447 }, { "epoch": 9.792, "grad_norm": 1.0871502161026, "learning_rate": 2.559118236472946e-05, "loss": 0.0343, "step": 2448 }, { "epoch": 9.796, "grad_norm": 0.9493705034255981, "learning_rate": 2.5581162324649298e-05, "loss": 0.0262, "step": 2449 }, { "epoch": 9.8, "grad_norm": 1.1285532712936401, "learning_rate": 2.557114228456914e-05, "loss": 0.0334, "step": 2450 }, { "epoch": 9.804, "grad_norm": 0.9567793011665344, "learning_rate": 2.5561122244488977e-05, "loss": 0.0291, "step": 2451 }, { "epoch": 9.808, "grad_norm": 1.1705495119094849, "learning_rate": 2.5551102204408815e-05, "loss": 0.0334, "step": 2452 }, { "epoch": 9.812, "grad_norm": 1.1043469905853271, "learning_rate": 2.554108216432866e-05, "loss": 0.0329, "step": 2453 }, { "epoch": 9.816, "grad_norm": 1.5261973142623901, "learning_rate": 2.55310621242485e-05, "loss": 0.04, "step": 2454 }, { "epoch": 9.82, "grad_norm": 1.164994716644287, "learning_rate": 2.552104208416834e-05, "loss": 0.0349, "step": 2455 }, { "epoch": 9.824, "grad_norm": 0.960659384727478, "learning_rate": 2.5511022044088177e-05, "loss": 0.0251, "step": 2456 }, { "epoch": 9.828, "grad_norm": 1.2397003173828125, "learning_rate": 2.550100200400802e-05, "loss": 0.0379, "step": 2457 }, { "epoch": 9.832, "grad_norm": 1.0921655893325806, "learning_rate": 2.5490981963927856e-05, "loss": 0.0377, "step": 2458 }, { "epoch": 9.836, "grad_norm": 1.165229320526123, "learning_rate": 2.5480961923847698e-05, "loss": 0.032, "step": 2459 }, { "epoch": 9.84, "grad_norm": 1.2369738817214966, "learning_rate": 2.5470941883767535e-05, "loss": 0.0372, "step": 2460 }, { "epoch": 9.844, "grad_norm": 1.1303399801254272, "learning_rate": 2.5460921843687373e-05, "loss": 
0.031, "step": 2461 }, { "epoch": 9.848, "grad_norm": 1.3156812191009521, "learning_rate": 2.5450901803607215e-05, "loss": 0.0334, "step": 2462 }, { "epoch": 9.852, "grad_norm": 1.2975099086761475, "learning_rate": 2.544088176352706e-05, "loss": 0.0342, "step": 2463 }, { "epoch": 9.856, "grad_norm": 1.2751260995864868, "learning_rate": 2.5430861723446897e-05, "loss": 0.0391, "step": 2464 }, { "epoch": 9.86, "grad_norm": 1.1819888353347778, "learning_rate": 2.5420841683366735e-05, "loss": 0.0326, "step": 2465 }, { "epoch": 9.864, "grad_norm": 1.6855754852294922, "learning_rate": 2.5410821643286576e-05, "loss": 0.0476, "step": 2466 }, { "epoch": 9.868, "grad_norm": 1.1449778079986572, "learning_rate": 2.5400801603206414e-05, "loss": 0.0332, "step": 2467 }, { "epoch": 9.872, "grad_norm": 1.15505850315094, "learning_rate": 2.5390781563126252e-05, "loss": 0.0334, "step": 2468 }, { "epoch": 9.876, "grad_norm": 1.2710686922073364, "learning_rate": 2.5380761523046093e-05, "loss": 0.0317, "step": 2469 }, { "epoch": 9.88, "grad_norm": 1.2653512954711914, "learning_rate": 2.537074148296593e-05, "loss": 0.0387, "step": 2470 }, { "epoch": 9.884, "grad_norm": 1.1764731407165527, "learning_rate": 2.536072144288577e-05, "loss": 0.0375, "step": 2471 }, { "epoch": 9.888, "grad_norm": 0.7444102764129639, "learning_rate": 2.535070140280561e-05, "loss": 0.0182, "step": 2472 }, { "epoch": 9.892, "grad_norm": 1.2663006782531738, "learning_rate": 2.5340681362725455e-05, "loss": 0.034, "step": 2473 }, { "epoch": 9.896, "grad_norm": 1.14084792137146, "learning_rate": 2.5330661322645293e-05, "loss": 0.0365, "step": 2474 }, { "epoch": 9.9, "grad_norm": 1.0226420164108276, "learning_rate": 2.5320641282565134e-05, "loss": 0.0301, "step": 2475 }, { "epoch": 9.904, "grad_norm": 1.0205984115600586, "learning_rate": 2.5310621242484972e-05, "loss": 0.0305, "step": 2476 }, { "epoch": 9.908, "grad_norm": 1.1715893745422363, "learning_rate": 2.530060120240481e-05, "loss": 0.0322, "step": 2477 }, { 
"epoch": 9.912, "grad_norm": 1.4050195217132568, "learning_rate": 2.529058116232465e-05, "loss": 0.0434, "step": 2478 }, { "epoch": 9.916, "grad_norm": 1.1078695058822632, "learning_rate": 2.528056112224449e-05, "loss": 0.0311, "step": 2479 }, { "epoch": 9.92, "grad_norm": 1.0815905332565308, "learning_rate": 2.5270541082164327e-05, "loss": 0.0372, "step": 2480 }, { "epoch": 9.924, "grad_norm": 1.1205084323883057, "learning_rate": 2.526052104208417e-05, "loss": 0.0309, "step": 2481 }, { "epoch": 9.928, "grad_norm": 1.123581051826477, "learning_rate": 2.5250501002004006e-05, "loss": 0.0332, "step": 2482 }, { "epoch": 9.932, "grad_norm": 1.1913464069366455, "learning_rate": 2.524048096192385e-05, "loss": 0.0344, "step": 2483 }, { "epoch": 9.936, "grad_norm": 1.033087134361267, "learning_rate": 2.523046092184369e-05, "loss": 0.0326, "step": 2484 }, { "epoch": 9.94, "grad_norm": 1.0734940767288208, "learning_rate": 2.522044088176353e-05, "loss": 0.0364, "step": 2485 }, { "epoch": 9.943999999999999, "grad_norm": 1.0637986660003662, "learning_rate": 2.5210420841683368e-05, "loss": 0.038, "step": 2486 }, { "epoch": 9.948, "grad_norm": 1.1711645126342773, "learning_rate": 2.520040080160321e-05, "loss": 0.0343, "step": 2487 }, { "epoch": 9.952, "grad_norm": 1.1081736087799072, "learning_rate": 2.5190380761523047e-05, "loss": 0.032, "step": 2488 }, { "epoch": 9.956, "grad_norm": 1.0930702686309814, "learning_rate": 2.5180360721442885e-05, "loss": 0.0389, "step": 2489 }, { "epoch": 9.96, "grad_norm": 1.1130561828613281, "learning_rate": 2.5170340681362726e-05, "loss": 0.0332, "step": 2490 }, { "epoch": 9.964, "grad_norm": 1.0953967571258545, "learning_rate": 2.5160320641282564e-05, "loss": 0.0427, "step": 2491 }, { "epoch": 9.968, "grad_norm": 1.0438133478164673, "learning_rate": 2.5150300601202402e-05, "loss": 0.0337, "step": 2492 }, { "epoch": 9.972, "grad_norm": 1.1139825582504272, "learning_rate": 2.5140280561122247e-05, "loss": 0.0326, "step": 2493 }, { "epoch": 
9.975999999999999, "grad_norm": 1.1483579874038696, "learning_rate": 2.5130260521042088e-05, "loss": 0.0321, "step": 2494 }, { "epoch": 9.98, "grad_norm": 0.9585171937942505, "learning_rate": 2.5120240480961926e-05, "loss": 0.0264, "step": 2495 }, { "epoch": 9.984, "grad_norm": 1.2960076332092285, "learning_rate": 2.5110220440881764e-05, "loss": 0.0475, "step": 2496 }, { "epoch": 9.988, "grad_norm": 1.278764009475708, "learning_rate": 2.5100200400801605e-05, "loss": 0.0376, "step": 2497 }, { "epoch": 9.992, "grad_norm": 1.109755039215088, "learning_rate": 2.5090180360721443e-05, "loss": 0.0304, "step": 2498 }, { "epoch": 9.996, "grad_norm": 1.2089651823043823, "learning_rate": 2.508016032064128e-05, "loss": 0.0345, "step": 2499 }, { "epoch": 10.0, "grad_norm": 1.3059208393096924, "learning_rate": 2.5070140280561122e-05, "loss": 0.044, "step": 2500 }, { "epoch": 10.004, "grad_norm": 0.838786780834198, "learning_rate": 2.506012024048096e-05, "loss": 0.0223, "step": 2501 }, { "epoch": 10.008, "grad_norm": 0.9346208572387695, "learning_rate": 2.50501002004008e-05, "loss": 0.0308, "step": 2502 }, { "epoch": 10.012, "grad_norm": 0.627692699432373, "learning_rate": 2.5040080160320646e-05, "loss": 0.0171, "step": 2503 }, { "epoch": 10.016, "grad_norm": 0.789028525352478, "learning_rate": 2.5030060120240484e-05, "loss": 0.0211, "step": 2504 }, { "epoch": 10.02, "grad_norm": 0.8333077430725098, "learning_rate": 2.5020040080160322e-05, "loss": 0.0253, "step": 2505 }, { "epoch": 10.024, "grad_norm": 0.7909857630729675, "learning_rate": 2.5010020040080163e-05, "loss": 0.0231, "step": 2506 }, { "epoch": 10.028, "grad_norm": 0.9159106016159058, "learning_rate": 2.5e-05, "loss": 0.0219, "step": 2507 }, { "epoch": 10.032, "grad_norm": 0.5722668766975403, "learning_rate": 2.498997995991984e-05, "loss": 0.0185, "step": 2508 }, { "epoch": 10.036, "grad_norm": 0.8425304889678955, "learning_rate": 2.497995991983968e-05, "loss": 0.0223, "step": 2509 }, { "epoch": 10.04, "grad_norm": 
0.9075009226799011, "learning_rate": 2.496993987975952e-05, "loss": 0.0228, "step": 2510 }, { "epoch": 10.044, "grad_norm": 0.7858025431632996, "learning_rate": 2.495991983967936e-05, "loss": 0.0196, "step": 2511 }, { "epoch": 10.048, "grad_norm": 0.6345151662826538, "learning_rate": 2.49498997995992e-05, "loss": 0.0161, "step": 2512 }, { "epoch": 10.052, "grad_norm": 0.6853556036949158, "learning_rate": 2.493987975951904e-05, "loss": 0.0174, "step": 2513 }, { "epoch": 10.056, "grad_norm": 0.7381585240364075, "learning_rate": 2.4929859719438877e-05, "loss": 0.0182, "step": 2514 }, { "epoch": 10.06, "grad_norm": 1.0127655267715454, "learning_rate": 2.491983967935872e-05, "loss": 0.0226, "step": 2515 }, { "epoch": 10.064, "grad_norm": 1.03529691696167, "learning_rate": 2.490981963927856e-05, "loss": 0.0265, "step": 2516 }, { "epoch": 10.068, "grad_norm": 1.1951006650924683, "learning_rate": 2.4899799599198397e-05, "loss": 0.0231, "step": 2517 }, { "epoch": 10.072, "grad_norm": 0.7003726959228516, "learning_rate": 2.488977955911824e-05, "loss": 0.0186, "step": 2518 }, { "epoch": 10.076, "grad_norm": 0.7388056516647339, "learning_rate": 2.4879759519038076e-05, "loss": 0.0185, "step": 2519 }, { "epoch": 10.08, "grad_norm": 0.766671895980835, "learning_rate": 2.4869739478957918e-05, "loss": 0.0196, "step": 2520 }, { "epoch": 10.084, "grad_norm": 0.6915393471717834, "learning_rate": 2.4859719438877755e-05, "loss": 0.0161, "step": 2521 }, { "epoch": 10.088, "grad_norm": 0.831326961517334, "learning_rate": 2.4849699398797597e-05, "loss": 0.0222, "step": 2522 }, { "epoch": 10.092, "grad_norm": 0.8071035146713257, "learning_rate": 2.4839679358717435e-05, "loss": 0.0198, "step": 2523 }, { "epoch": 10.096, "grad_norm": 0.8890426754951477, "learning_rate": 2.4829659318637276e-05, "loss": 0.0222, "step": 2524 }, { "epoch": 10.1, "grad_norm": 0.8120467066764832, "learning_rate": 2.4819639278557117e-05, "loss": 0.0193, "step": 2525 }, { "epoch": 10.104, "grad_norm": 
0.7911978960037231, "learning_rate": 2.4809619238476955e-05, "loss": 0.0228, "step": 2526 }, { "epoch": 10.108, "grad_norm": 0.82595294713974, "learning_rate": 2.4799599198396793e-05, "loss": 0.0213, "step": 2527 }, { "epoch": 10.112, "grad_norm": 0.40247252583503723, "learning_rate": 2.4789579158316634e-05, "loss": 0.0098, "step": 2528 }, { "epoch": 10.116, "grad_norm": 1.0697221755981445, "learning_rate": 2.4779559118236472e-05, "loss": 0.0252, "step": 2529 }, { "epoch": 10.12, "grad_norm": 0.8177196979522705, "learning_rate": 2.4769539078156313e-05, "loss": 0.0214, "step": 2530 }, { "epoch": 10.124, "grad_norm": 0.6448612213134766, "learning_rate": 2.4759519038076155e-05, "loss": 0.0163, "step": 2531 }, { "epoch": 10.128, "grad_norm": 0.7582941055297852, "learning_rate": 2.4749498997995993e-05, "loss": 0.0191, "step": 2532 }, { "epoch": 10.132, "grad_norm": 0.6828077435493469, "learning_rate": 2.473947895791583e-05, "loss": 0.0174, "step": 2533 }, { "epoch": 10.136, "grad_norm": 0.7914385199546814, "learning_rate": 2.4729458917835672e-05, "loss": 0.0198, "step": 2534 }, { "epoch": 10.14, "grad_norm": 0.6168143153190613, "learning_rate": 2.4719438877755513e-05, "loss": 0.0171, "step": 2535 }, { "epoch": 10.144, "grad_norm": 0.7246816158294678, "learning_rate": 2.470941883767535e-05, "loss": 0.0207, "step": 2536 }, { "epoch": 10.148, "grad_norm": 1.0454907417297363, "learning_rate": 2.4699398797595192e-05, "loss": 0.025, "step": 2537 }, { "epoch": 10.152, "grad_norm": 0.9220485091209412, "learning_rate": 2.468937875751503e-05, "loss": 0.0234, "step": 2538 }, { "epoch": 10.156, "grad_norm": 0.8142296671867371, "learning_rate": 2.467935871743487e-05, "loss": 0.0197, "step": 2539 }, { "epoch": 10.16, "grad_norm": 0.7292171716690063, "learning_rate": 2.4669338677354713e-05, "loss": 0.0197, "step": 2540 }, { "epoch": 10.164, "grad_norm": 0.8474435210227966, "learning_rate": 2.465931863727455e-05, "loss": 0.0199, "step": 2541 }, { "epoch": 10.168, "grad_norm": 
0.677284836769104, "learning_rate": 2.464929859719439e-05, "loss": 0.0161, "step": 2542 }, { "epoch": 10.172, "grad_norm": 0.8956500887870789, "learning_rate": 2.463927855711423e-05, "loss": 0.0233, "step": 2543 }, { "epoch": 10.176, "grad_norm": 1.009035348892212, "learning_rate": 2.462925851703407e-05, "loss": 0.0243, "step": 2544 }, { "epoch": 10.18, "grad_norm": 0.867087721824646, "learning_rate": 2.461923847695391e-05, "loss": 0.0201, "step": 2545 }, { "epoch": 10.184, "grad_norm": 1.1653364896774292, "learning_rate": 2.460921843687375e-05, "loss": 0.0296, "step": 2546 }, { "epoch": 10.188, "grad_norm": 0.7345436215400696, "learning_rate": 2.4599198396793588e-05, "loss": 0.0194, "step": 2547 }, { "epoch": 10.192, "grad_norm": 0.9385821223258972, "learning_rate": 2.4589178356713426e-05, "loss": 0.0223, "step": 2548 }, { "epoch": 10.196, "grad_norm": 0.6956146359443665, "learning_rate": 2.4579158316633267e-05, "loss": 0.0169, "step": 2549 }, { "epoch": 10.2, "grad_norm": 0.7195176482200623, "learning_rate": 2.456913827655311e-05, "loss": 0.0189, "step": 2550 }, { "epoch": 10.204, "grad_norm": 1.0395700931549072, "learning_rate": 2.4559118236472946e-05, "loss": 0.0214, "step": 2551 }, { "epoch": 10.208, "grad_norm": 1.0134451389312744, "learning_rate": 2.4549098196392788e-05, "loss": 0.0218, "step": 2552 }, { "epoch": 10.212, "grad_norm": 1.1786450147628784, "learning_rate": 2.4539078156312626e-05, "loss": 0.0177, "step": 2553 }, { "epoch": 10.216, "grad_norm": 0.8216089606285095, "learning_rate": 2.4529058116232467e-05, "loss": 0.0197, "step": 2554 }, { "epoch": 10.22, "grad_norm": 0.5676589608192444, "learning_rate": 2.4519038076152305e-05, "loss": 0.0171, "step": 2555 }, { "epoch": 10.224, "grad_norm": 0.8638578653335571, "learning_rate": 2.4509018036072146e-05, "loss": 0.0225, "step": 2556 }, { "epoch": 10.228, "grad_norm": 0.9760416746139526, "learning_rate": 2.4498997995991984e-05, "loss": 0.0222, "step": 2557 }, { "epoch": 10.232, "grad_norm": 
0.8012329339981079, "learning_rate": 2.4488977955911825e-05, "loss": 0.0164, "step": 2558 }, { "epoch": 10.236, "grad_norm": 0.6655237078666687, "learning_rate": 2.4478957915831667e-05, "loss": 0.0189, "step": 2559 }, { "epoch": 10.24, "grad_norm": 1.0602320432662964, "learning_rate": 2.4468937875751504e-05, "loss": 0.0271, "step": 2560 }, { "epoch": 10.244, "grad_norm": 0.8225724697113037, "learning_rate": 2.4458917835671342e-05, "loss": 0.0207, "step": 2561 }, { "epoch": 10.248, "grad_norm": 0.783803403377533, "learning_rate": 2.4448897795591184e-05, "loss": 0.0186, "step": 2562 }, { "epoch": 10.252, "grad_norm": 0.704971969127655, "learning_rate": 2.443887775551102e-05, "loss": 0.0179, "step": 2563 }, { "epoch": 10.256, "grad_norm": 0.8094855546951294, "learning_rate": 2.4428857715430863e-05, "loss": 0.0209, "step": 2564 }, { "epoch": 10.26, "grad_norm": 0.8836872577667236, "learning_rate": 2.4418837675350704e-05, "loss": 0.0192, "step": 2565 }, { "epoch": 10.264, "grad_norm": 0.901753842830658, "learning_rate": 2.4408817635270542e-05, "loss": 0.0241, "step": 2566 }, { "epoch": 10.268, "grad_norm": 0.7555407881736755, "learning_rate": 2.439879759519038e-05, "loss": 0.0191, "step": 2567 }, { "epoch": 10.272, "grad_norm": 0.8224442005157471, "learning_rate": 2.438877755511022e-05, "loss": 0.0209, "step": 2568 }, { "epoch": 10.276, "grad_norm": 0.9603815078735352, "learning_rate": 2.4378757515030062e-05, "loss": 0.0205, "step": 2569 }, { "epoch": 10.28, "grad_norm": 0.7587599158287048, "learning_rate": 2.43687374749499e-05, "loss": 0.0165, "step": 2570 }, { "epoch": 10.284, "grad_norm": 0.8181880712509155, "learning_rate": 2.435871743486974e-05, "loss": 0.0217, "step": 2571 }, { "epoch": 10.288, "grad_norm": 0.8437207341194153, "learning_rate": 2.434869739478958e-05, "loss": 0.0209, "step": 2572 }, { "epoch": 10.292, "grad_norm": 0.9303336143493652, "learning_rate": 2.4338677354709417e-05, "loss": 0.0271, "step": 2573 }, { "epoch": 10.296, "grad_norm": 
1.0167421102523804, "learning_rate": 2.4328657314629262e-05, "loss": 0.0206, "step": 2574 }, { "epoch": 10.3, "grad_norm": 0.9930081367492676, "learning_rate": 2.43186372745491e-05, "loss": 0.0221, "step": 2575 }, { "epoch": 10.304, "grad_norm": 0.6778939962387085, "learning_rate": 2.4308617234468938e-05, "loss": 0.0205, "step": 2576 }, { "epoch": 10.308, "grad_norm": 0.8133848309516907, "learning_rate": 2.429859719438878e-05, "loss": 0.0196, "step": 2577 }, { "epoch": 10.312, "grad_norm": 0.7504951357841492, "learning_rate": 2.4288577154308617e-05, "loss": 0.0173, "step": 2578 }, { "epoch": 10.316, "grad_norm": 1.1020333766937256, "learning_rate": 2.427855711422846e-05, "loss": 0.0258, "step": 2579 }, { "epoch": 10.32, "grad_norm": 0.807967483997345, "learning_rate": 2.42685370741483e-05, "loss": 0.0199, "step": 2580 }, { "epoch": 10.324, "grad_norm": 0.9082990884780884, "learning_rate": 2.4258517034068138e-05, "loss": 0.0199, "step": 2581 }, { "epoch": 10.328, "grad_norm": 0.8345447182655334, "learning_rate": 2.4248496993987975e-05, "loss": 0.021, "step": 2582 }, { "epoch": 10.332, "grad_norm": 0.9574979543685913, "learning_rate": 2.4238476953907817e-05, "loss": 0.0243, "step": 2583 }, { "epoch": 10.336, "grad_norm": 0.8273745775222778, "learning_rate": 2.4228456913827658e-05, "loss": 0.02, "step": 2584 }, { "epoch": 10.34, "grad_norm": 0.695869505405426, "learning_rate": 2.4218436873747496e-05, "loss": 0.0154, "step": 2585 }, { "epoch": 10.344, "grad_norm": 0.8283644914627075, "learning_rate": 2.4208416833667337e-05, "loss": 0.0204, "step": 2586 }, { "epoch": 10.348, "grad_norm": 1.0260083675384521, "learning_rate": 2.4198396793587175e-05, "loss": 0.0259, "step": 2587 }, { "epoch": 10.352, "grad_norm": 0.9744479656219482, "learning_rate": 2.4188376753507013e-05, "loss": 0.0243, "step": 2588 }, { "epoch": 10.356, "grad_norm": 0.937955379486084, "learning_rate": 2.4178356713426854e-05, "loss": 0.0223, "step": 2589 }, { "epoch": 10.36, "grad_norm": 
1.1364651918411255, "learning_rate": 2.4168336673346696e-05, "loss": 0.0244, "step": 2590 }, { "epoch": 10.364, "grad_norm": 0.9114662408828735, "learning_rate": 2.4158316633266533e-05, "loss": 0.027, "step": 2591 }, { "epoch": 10.368, "grad_norm": 0.7826032638549805, "learning_rate": 2.4148296593186375e-05, "loss": 0.0192, "step": 2592 }, { "epoch": 10.372, "grad_norm": 0.45732957124710083, "learning_rate": 2.4138276553106216e-05, "loss": 0.0133, "step": 2593 }, { "epoch": 10.376, "grad_norm": 0.6898213624954224, "learning_rate": 2.4128256513026054e-05, "loss": 0.0214, "step": 2594 }, { "epoch": 10.38, "grad_norm": 0.7986359596252441, "learning_rate": 2.4118236472945892e-05, "loss": 0.0184, "step": 2595 }, { "epoch": 10.384, "grad_norm": 0.9170551896095276, "learning_rate": 2.4108216432865733e-05, "loss": 0.0215, "step": 2596 }, { "epoch": 10.388, "grad_norm": 0.8053649067878723, "learning_rate": 2.409819639278557e-05, "loss": 0.0213, "step": 2597 }, { "epoch": 10.392, "grad_norm": 0.8867815136909485, "learning_rate": 2.4088176352705412e-05, "loss": 0.0226, "step": 2598 }, { "epoch": 10.396, "grad_norm": 0.8279624581336975, "learning_rate": 2.4078156312625254e-05, "loss": 0.0209, "step": 2599 }, { "epoch": 10.4, "grad_norm": 0.6529654264450073, "learning_rate": 2.406813627254509e-05, "loss": 0.0134, "step": 2600 }, { "epoch": 10.404, "grad_norm": 0.8741607666015625, "learning_rate": 2.405811623246493e-05, "loss": 0.0199, "step": 2601 }, { "epoch": 10.408, "grad_norm": 0.9696851968765259, "learning_rate": 2.404809619238477e-05, "loss": 0.0212, "step": 2602 }, { "epoch": 10.412, "grad_norm": 0.7984656095504761, "learning_rate": 2.4038076152304612e-05, "loss": 0.0197, "step": 2603 }, { "epoch": 10.416, "grad_norm": 0.7449184656143188, "learning_rate": 2.402805611222445e-05, "loss": 0.0203, "step": 2604 }, { "epoch": 10.42, "grad_norm": 0.7866941094398499, "learning_rate": 2.401803607214429e-05, "loss": 0.0214, "step": 2605 }, { "epoch": 10.424, "grad_norm": 
0.802543580532074, "learning_rate": 2.400801603206413e-05, "loss": 0.0216, "step": 2606 }, { "epoch": 10.428, "grad_norm": 0.7483913898468018, "learning_rate": 2.3997995991983967e-05, "loss": 0.0205, "step": 2607 }, { "epoch": 10.432, "grad_norm": 0.8060435652732849, "learning_rate": 2.398797595190381e-05, "loss": 0.0199, "step": 2608 }, { "epoch": 10.436, "grad_norm": 0.7773402333259583, "learning_rate": 2.397795591182365e-05, "loss": 0.0184, "step": 2609 }, { "epoch": 10.44, "grad_norm": 0.7629860639572144, "learning_rate": 2.3967935871743487e-05, "loss": 0.0191, "step": 2610 }, { "epoch": 10.444, "grad_norm": 1.0891491174697876, "learning_rate": 2.395791583166333e-05, "loss": 0.0267, "step": 2611 }, { "epoch": 10.448, "grad_norm": 0.8873718976974487, "learning_rate": 2.3947895791583166e-05, "loss": 0.023, "step": 2612 }, { "epoch": 10.452, "grad_norm": 0.9421393871307373, "learning_rate": 2.3937875751503008e-05, "loss": 0.0231, "step": 2613 }, { "epoch": 10.456, "grad_norm": 0.7414140105247498, "learning_rate": 2.392785571142285e-05, "loss": 0.0202, "step": 2614 }, { "epoch": 10.46, "grad_norm": 0.9896325469017029, "learning_rate": 2.3917835671342687e-05, "loss": 0.0234, "step": 2615 }, { "epoch": 10.464, "grad_norm": 0.7930034399032593, "learning_rate": 2.3907815631262525e-05, "loss": 0.021, "step": 2616 }, { "epoch": 10.468, "grad_norm": 0.9249135851860046, "learning_rate": 2.3897795591182366e-05, "loss": 0.0225, "step": 2617 }, { "epoch": 10.472, "grad_norm": 0.837981641292572, "learning_rate": 2.3887775551102207e-05, "loss": 0.0232, "step": 2618 }, { "epoch": 10.475999999999999, "grad_norm": 1.0759623050689697, "learning_rate": 2.3877755511022045e-05, "loss": 0.0241, "step": 2619 }, { "epoch": 10.48, "grad_norm": 0.8368483781814575, "learning_rate": 2.3867735470941887e-05, "loss": 0.0222, "step": 2620 }, { "epoch": 10.484, "grad_norm": 0.9433078169822693, "learning_rate": 2.3857715430861724e-05, "loss": 0.0259, "step": 2621 }, { "epoch": 10.488, "grad_norm": 
0.9696186780929565, "learning_rate": 2.3847695390781562e-05, "loss": 0.0223, "step": 2622 }, { "epoch": 10.492, "grad_norm": 1.07587468624115, "learning_rate": 2.3837675350701404e-05, "loss": 0.0252, "step": 2623 }, { "epoch": 10.496, "grad_norm": 0.7479248642921448, "learning_rate": 2.3827655310621245e-05, "loss": 0.0196, "step": 2624 }, { "epoch": 10.5, "grad_norm": 0.678727924823761, "learning_rate": 2.3817635270541083e-05, "loss": 0.0215, "step": 2625 }, { "epoch": 10.504, "grad_norm": 0.7351118326187134, "learning_rate": 2.380761523046092e-05, "loss": 0.0214, "step": 2626 }, { "epoch": 10.508, "grad_norm": 1.21925687789917, "learning_rate": 2.3797595190380762e-05, "loss": 0.0252, "step": 2627 }, { "epoch": 10.512, "grad_norm": 0.9814172983169556, "learning_rate": 2.3787575150300603e-05, "loss": 0.0235, "step": 2628 }, { "epoch": 10.516, "grad_norm": 0.880654513835907, "learning_rate": 2.377755511022044e-05, "loss": 0.0215, "step": 2629 }, { "epoch": 10.52, "grad_norm": 0.9024983644485474, "learning_rate": 2.3767535070140282e-05, "loss": 0.0229, "step": 2630 }, { "epoch": 10.524000000000001, "grad_norm": 1.2271602153778076, "learning_rate": 2.375751503006012e-05, "loss": 0.0266, "step": 2631 }, { "epoch": 10.528, "grad_norm": 1.0690782070159912, "learning_rate": 2.3747494989979958e-05, "loss": 0.0242, "step": 2632 }, { "epoch": 10.532, "grad_norm": 0.753580629825592, "learning_rate": 2.3737474949899803e-05, "loss": 0.0183, "step": 2633 }, { "epoch": 10.536, "grad_norm": 0.7326211333274841, "learning_rate": 2.372745490981964e-05, "loss": 0.0209, "step": 2634 }, { "epoch": 10.54, "grad_norm": 0.9598518013954163, "learning_rate": 2.371743486973948e-05, "loss": 0.0246, "step": 2635 }, { "epoch": 10.544, "grad_norm": 0.7582380771636963, "learning_rate": 2.370741482965932e-05, "loss": 0.0197, "step": 2636 }, { "epoch": 10.548, "grad_norm": 0.7412502765655518, "learning_rate": 2.3697394789579158e-05, "loss": 0.0207, "step": 2637 }, { "epoch": 10.552, "grad_norm": 
0.8043134808540344, "learning_rate": 2.3687374749499e-05, "loss": 0.0207, "step": 2638 }, { "epoch": 10.556000000000001, "grad_norm": 1.084338665008545, "learning_rate": 2.367735470941884e-05, "loss": 0.0292, "step": 2639 }, { "epoch": 10.56, "grad_norm": 0.871437668800354, "learning_rate": 2.366733466933868e-05, "loss": 0.0261, "step": 2640 }, { "epoch": 10.564, "grad_norm": 0.8200016617774963, "learning_rate": 2.3657314629258516e-05, "loss": 0.0198, "step": 2641 }, { "epoch": 10.568, "grad_norm": 0.9645569920539856, "learning_rate": 2.3647294589178358e-05, "loss": 0.0223, "step": 2642 }, { "epoch": 10.572, "grad_norm": 1.119446873664856, "learning_rate": 2.36372745490982e-05, "loss": 0.0252, "step": 2643 }, { "epoch": 10.576, "grad_norm": 1.2717159986495972, "learning_rate": 2.3627254509018037e-05, "loss": 0.0264, "step": 2644 }, { "epoch": 10.58, "grad_norm": 0.7650407552719116, "learning_rate": 2.3617234468937878e-05, "loss": 0.0204, "step": 2645 }, { "epoch": 10.584, "grad_norm": 0.48650267720222473, "learning_rate": 2.3607214428857716e-05, "loss": 0.011, "step": 2646 }, { "epoch": 10.588, "grad_norm": 1.0167272090911865, "learning_rate": 2.3597194388777557e-05, "loss": 0.0281, "step": 2647 }, { "epoch": 10.592, "grad_norm": 1.0820997953414917, "learning_rate": 2.3587174348697395e-05, "loss": 0.029, "step": 2648 }, { "epoch": 10.596, "grad_norm": 0.4999312460422516, "learning_rate": 2.3577154308617236e-05, "loss": 0.0094, "step": 2649 }, { "epoch": 10.6, "grad_norm": 0.9790526628494263, "learning_rate": 2.3567134268537074e-05, "loss": 0.0226, "step": 2650 }, { "epoch": 10.604, "grad_norm": 0.9920722842216492, "learning_rate": 2.3557114228456916e-05, "loss": 0.0232, "step": 2651 }, { "epoch": 10.608, "grad_norm": 0.6009891033172607, "learning_rate": 2.3547094188376757e-05, "loss": 0.0203, "step": 2652 }, { "epoch": 10.612, "grad_norm": 0.7871741056442261, "learning_rate": 2.3537074148296595e-05, "loss": 0.0215, "step": 2653 }, { "epoch": 10.616, "grad_norm": 
1.217743992805481, "learning_rate": 2.3527054108216433e-05, "loss": 0.0251, "step": 2654 }, { "epoch": 10.62, "grad_norm": 1.0484158992767334, "learning_rate": 2.3517034068136274e-05, "loss": 0.0261, "step": 2655 }, { "epoch": 10.624, "grad_norm": 1.014896273612976, "learning_rate": 2.3507014028056112e-05, "loss": 0.0233, "step": 2656 }, { "epoch": 10.628, "grad_norm": 1.0810414552688599, "learning_rate": 2.3496993987975953e-05, "loss": 0.0216, "step": 2657 }, { "epoch": 10.632, "grad_norm": 0.7048183083534241, "learning_rate": 2.3486973947895794e-05, "loss": 0.0145, "step": 2658 }, { "epoch": 10.636, "grad_norm": 1.054413914680481, "learning_rate": 2.3476953907815632e-05, "loss": 0.0252, "step": 2659 }, { "epoch": 10.64, "grad_norm": 0.8256332874298096, "learning_rate": 2.346693386773547e-05, "loss": 0.021, "step": 2660 }, { "epoch": 10.644, "grad_norm": 0.9366419911384583, "learning_rate": 2.345691382765531e-05, "loss": 0.0245, "step": 2661 }, { "epoch": 10.648, "grad_norm": 0.9146870970726013, "learning_rate": 2.3446893787575153e-05, "loss": 0.0218, "step": 2662 }, { "epoch": 10.652, "grad_norm": 0.9589238166809082, "learning_rate": 2.343687374749499e-05, "loss": 0.0272, "step": 2663 }, { "epoch": 10.656, "grad_norm": 0.7012674808502197, "learning_rate": 2.3426853707414832e-05, "loss": 0.0205, "step": 2664 }, { "epoch": 10.66, "grad_norm": 0.872169554233551, "learning_rate": 2.341683366733467e-05, "loss": 0.0212, "step": 2665 }, { "epoch": 10.664, "grad_norm": 1.1105194091796875, "learning_rate": 2.3406813627254508e-05, "loss": 0.0236, "step": 2666 }, { "epoch": 10.668, "grad_norm": 0.9917757511138916, "learning_rate": 2.3396793587174352e-05, "loss": 0.0274, "step": 2667 }, { "epoch": 10.672, "grad_norm": 0.7267541289329529, "learning_rate": 2.338677354709419e-05, "loss": 0.0186, "step": 2668 }, { "epoch": 10.676, "grad_norm": 0.8156752586364746, "learning_rate": 2.3376753507014028e-05, "loss": 0.0192, "step": 2669 }, { "epoch": 10.68, "grad_norm": 
0.8577165603637695, "learning_rate": 2.336673346693387e-05, "loss": 0.0235, "step": 2670 }, { "epoch": 10.684, "grad_norm": 0.5355784893035889, "learning_rate": 2.3356713426853707e-05, "loss": 0.0113, "step": 2671 }, { "epoch": 10.688, "grad_norm": 1.0173070430755615, "learning_rate": 2.334669338677355e-05, "loss": 0.0263, "step": 2672 }, { "epoch": 10.692, "grad_norm": 0.8700453639030457, "learning_rate": 2.333667334669339e-05, "loss": 0.0253, "step": 2673 }, { "epoch": 10.696, "grad_norm": 0.8330980539321899, "learning_rate": 2.3326653306613228e-05, "loss": 0.0198, "step": 2674 }, { "epoch": 10.7, "grad_norm": 0.7934946417808533, "learning_rate": 2.3316633266533066e-05, "loss": 0.0204, "step": 2675 }, { "epoch": 10.704, "grad_norm": 0.7809090614318848, "learning_rate": 2.3306613226452907e-05, "loss": 0.026, "step": 2676 }, { "epoch": 10.708, "grad_norm": 0.8526442646980286, "learning_rate": 2.3296593186372748e-05, "loss": 0.0208, "step": 2677 }, { "epoch": 10.712, "grad_norm": 0.9328563213348389, "learning_rate": 2.3286573146292586e-05, "loss": 0.0229, "step": 2678 }, { "epoch": 10.716, "grad_norm": 0.767686665058136, "learning_rate": 2.3276553106212427e-05, "loss": 0.021, "step": 2679 }, { "epoch": 10.72, "grad_norm": 1.204643726348877, "learning_rate": 2.3266533066132265e-05, "loss": 0.0283, "step": 2680 }, { "epoch": 10.724, "grad_norm": 0.8405128121376038, "learning_rate": 2.3256513026052103e-05, "loss": 0.0216, "step": 2681 }, { "epoch": 10.728, "grad_norm": 0.9929158091545105, "learning_rate": 2.3246492985971944e-05, "loss": 0.0236, "step": 2682 }, { "epoch": 10.732, "grad_norm": 1.1628848314285278, "learning_rate": 2.3236472945891786e-05, "loss": 0.0234, "step": 2683 }, { "epoch": 10.736, "grad_norm": 1.0076568126678467, "learning_rate": 2.3226452905811624e-05, "loss": 0.0232, "step": 2684 }, { "epoch": 10.74, "grad_norm": 0.8345778584480286, "learning_rate": 2.3216432865731465e-05, "loss": 0.0221, "step": 2685 }, { "epoch": 10.744, "grad_norm": 
0.8815706968307495, "learning_rate": 2.3206412825651303e-05, "loss": 0.0255, "step": 2686 }, { "epoch": 10.748, "grad_norm": 1.1422778367996216, "learning_rate": 2.3196392785571144e-05, "loss": 0.0309, "step": 2687 }, { "epoch": 10.752, "grad_norm": 0.9792990684509277, "learning_rate": 2.3186372745490982e-05, "loss": 0.0271, "step": 2688 }, { "epoch": 10.756, "grad_norm": 0.972955048084259, "learning_rate": 2.3176352705410823e-05, "loss": 0.0263, "step": 2689 }, { "epoch": 10.76, "grad_norm": 0.9153329133987427, "learning_rate": 2.316633266533066e-05, "loss": 0.0213, "step": 2690 }, { "epoch": 10.764, "grad_norm": 1.1069750785827637, "learning_rate": 2.3156312625250502e-05, "loss": 0.028, "step": 2691 }, { "epoch": 10.768, "grad_norm": 1.0217959880828857, "learning_rate": 2.3146292585170344e-05, "loss": 0.0308, "step": 2692 }, { "epoch": 10.772, "grad_norm": 0.8818560838699341, "learning_rate": 2.313627254509018e-05, "loss": 0.0241, "step": 2693 }, { "epoch": 10.776, "grad_norm": 1.0862679481506348, "learning_rate": 2.312625250501002e-05, "loss": 0.0253, "step": 2694 }, { "epoch": 10.78, "grad_norm": 0.8923512697219849, "learning_rate": 2.311623246492986e-05, "loss": 0.0232, "step": 2695 }, { "epoch": 10.784, "grad_norm": 0.8306083679199219, "learning_rate": 2.31062124248497e-05, "loss": 0.0169, "step": 2696 }, { "epoch": 10.788, "grad_norm": 0.904339075088501, "learning_rate": 2.309619238476954e-05, "loss": 0.0221, "step": 2697 }, { "epoch": 10.792, "grad_norm": 1.143362283706665, "learning_rate": 2.308617234468938e-05, "loss": 0.0272, "step": 2698 }, { "epoch": 10.796, "grad_norm": 0.9651113152503967, "learning_rate": 2.307615230460922e-05, "loss": 0.0271, "step": 2699 }, { "epoch": 10.8, "grad_norm": 1.1126807928085327, "learning_rate": 2.3066132264529057e-05, "loss": 0.0264, "step": 2700 }, { "epoch": 10.804, "grad_norm": 1.1432498693466187, "learning_rate": 2.30561122244489e-05, "loss": 0.0294, "step": 2701 }, { "epoch": 10.808, "grad_norm": 0.907027542591095, 
"learning_rate": 2.304609218436874e-05, "loss": 0.0257, "step": 2702 }, { "epoch": 10.812, "grad_norm": 1.038845181465149, "learning_rate": 2.3036072144288577e-05, "loss": 0.0269, "step": 2703 }, { "epoch": 10.816, "grad_norm": 0.6149145364761353, "learning_rate": 2.302605210420842e-05, "loss": 0.012, "step": 2704 }, { "epoch": 10.82, "grad_norm": 0.7798912525177002, "learning_rate": 2.3016032064128257e-05, "loss": 0.0213, "step": 2705 }, { "epoch": 10.824, "grad_norm": 0.8147614002227783, "learning_rate": 2.3006012024048098e-05, "loss": 0.0209, "step": 2706 }, { "epoch": 10.828, "grad_norm": 1.100545048713684, "learning_rate": 2.299599198396794e-05, "loss": 0.0239, "step": 2707 }, { "epoch": 10.832, "grad_norm": 0.9930369853973389, "learning_rate": 2.2985971943887777e-05, "loss": 0.0247, "step": 2708 }, { "epoch": 10.836, "grad_norm": 0.8570621609687805, "learning_rate": 2.2975951903807615e-05, "loss": 0.0212, "step": 2709 }, { "epoch": 10.84, "grad_norm": 0.8312192559242249, "learning_rate": 2.2965931863727456e-05, "loss": 0.0168, "step": 2710 }, { "epoch": 10.844, "grad_norm": 1.1362755298614502, "learning_rate": 2.2955911823647298e-05, "loss": 0.0303, "step": 2711 }, { "epoch": 10.848, "grad_norm": 1.1022303104400635, "learning_rate": 2.2945891783567135e-05, "loss": 0.0293, "step": 2712 }, { "epoch": 10.852, "grad_norm": 1.1668660640716553, "learning_rate": 2.2935871743486977e-05, "loss": 0.0259, "step": 2713 }, { "epoch": 10.856, "grad_norm": 1.326250672340393, "learning_rate": 2.2925851703406815e-05, "loss": 0.0259, "step": 2714 }, { "epoch": 10.86, "grad_norm": 0.9431809782981873, "learning_rate": 2.2915831663326653e-05, "loss": 0.0223, "step": 2715 }, { "epoch": 10.864, "grad_norm": 1.0126006603240967, "learning_rate": 2.2905811623246494e-05, "loss": 0.0231, "step": 2716 }, { "epoch": 10.868, "grad_norm": 0.9708862900733948, "learning_rate": 2.2895791583166335e-05, "loss": 0.0237, "step": 2717 }, { "epoch": 10.872, "grad_norm": 0.9651861786842346, 
"learning_rate": 2.2885771543086173e-05, "loss": 0.0229, "step": 2718 }, { "epoch": 10.876, "grad_norm": 0.8365417718887329, "learning_rate": 2.2875751503006014e-05, "loss": 0.0233, "step": 2719 }, { "epoch": 10.88, "grad_norm": 0.8717615008354187, "learning_rate": 2.2865731462925852e-05, "loss": 0.022, "step": 2720 }, { "epoch": 10.884, "grad_norm": 1.1816632747650146, "learning_rate": 2.2855711422845693e-05, "loss": 0.0273, "step": 2721 }, { "epoch": 10.888, "grad_norm": 0.9528033137321472, "learning_rate": 2.284569138276553e-05, "loss": 0.0229, "step": 2722 }, { "epoch": 10.892, "grad_norm": 1.0653725862503052, "learning_rate": 2.2835671342685373e-05, "loss": 0.027, "step": 2723 }, { "epoch": 10.896, "grad_norm": 0.5483238101005554, "learning_rate": 2.282565130260521e-05, "loss": 0.013, "step": 2724 }, { "epoch": 10.9, "grad_norm": 1.0084025859832764, "learning_rate": 2.281563126252505e-05, "loss": 0.0233, "step": 2725 }, { "epoch": 10.904, "grad_norm": 1.0322855710983276, "learning_rate": 2.2805611222444893e-05, "loss": 0.0251, "step": 2726 }, { "epoch": 10.908, "grad_norm": 0.9313704967498779, "learning_rate": 2.279559118236473e-05, "loss": 0.0234, "step": 2727 }, { "epoch": 10.912, "grad_norm": 1.1278380155563354, "learning_rate": 2.278557114228457e-05, "loss": 0.0299, "step": 2728 }, { "epoch": 10.916, "grad_norm": 1.118483304977417, "learning_rate": 2.277555110220441e-05, "loss": 0.0252, "step": 2729 }, { "epoch": 10.92, "grad_norm": 0.8251449465751648, "learning_rate": 2.2765531062124248e-05, "loss": 0.0198, "step": 2730 }, { "epoch": 10.924, "grad_norm": 1.0784047842025757, "learning_rate": 2.275551102204409e-05, "loss": 0.0305, "step": 2731 }, { "epoch": 10.928, "grad_norm": 1.0076247453689575, "learning_rate": 2.274549098196393e-05, "loss": 0.028, "step": 2732 }, { "epoch": 10.932, "grad_norm": 1.17436683177948, "learning_rate": 2.273547094188377e-05, "loss": 0.0264, "step": 2733 }, { "epoch": 10.936, "grad_norm": 1.0806264877319336, "learning_rate": 
2.2725450901803606e-05, "loss": 0.025, "step": 2734 }, { "epoch": 10.94, "grad_norm": 0.8272656202316284, "learning_rate": 2.2715430861723448e-05, "loss": 0.0243, "step": 2735 }, { "epoch": 10.943999999999999, "grad_norm": 1.1387969255447388, "learning_rate": 2.270541082164329e-05, "loss": 0.0316, "step": 2736 }, { "epoch": 10.948, "grad_norm": 0.8464391231536865, "learning_rate": 2.2695390781563127e-05, "loss": 0.0245, "step": 2737 }, { "epoch": 10.952, "grad_norm": 0.8681789636611938, "learning_rate": 2.2685370741482968e-05, "loss": 0.0247, "step": 2738 }, { "epoch": 10.956, "grad_norm": 0.7716442942619324, "learning_rate": 2.2675350701402806e-05, "loss": 0.0196, "step": 2739 }, { "epoch": 10.96, "grad_norm": 0.8722916841506958, "learning_rate": 2.2665330661322644e-05, "loss": 0.0243, "step": 2740 }, { "epoch": 10.964, "grad_norm": 0.9783482551574707, "learning_rate": 2.265531062124249e-05, "loss": 0.025, "step": 2741 }, { "epoch": 10.968, "grad_norm": 0.7285762429237366, "learning_rate": 2.2645290581162327e-05, "loss": 0.023, "step": 2742 }, { "epoch": 10.972, "grad_norm": 1.012169599533081, "learning_rate": 2.2635270541082164e-05, "loss": 0.0263, "step": 2743 }, { "epoch": 10.975999999999999, "grad_norm": 1.017822265625, "learning_rate": 2.2625250501002006e-05, "loss": 0.0249, "step": 2744 }, { "epoch": 10.98, "grad_norm": 0.8947110772132874, "learning_rate": 2.2615230460921844e-05, "loss": 0.0242, "step": 2745 }, { "epoch": 10.984, "grad_norm": 0.8670669198036194, "learning_rate": 2.2605210420841685e-05, "loss": 0.0209, "step": 2746 }, { "epoch": 10.988, "grad_norm": 1.1200064420700073, "learning_rate": 2.2595190380761526e-05, "loss": 0.0294, "step": 2747 }, { "epoch": 10.992, "grad_norm": 0.7422370910644531, "learning_rate": 2.2585170340681364e-05, "loss": 0.0218, "step": 2748 }, { "epoch": 10.996, "grad_norm": 1.087212324142456, "learning_rate": 2.2575150300601202e-05, "loss": 0.0252, "step": 2749 }, { "epoch": 11.0, "grad_norm": 0.9888896942138672, 
"learning_rate": 2.2565130260521043e-05, "loss": 0.0232, "step": 2750 }, { "epoch": 11.004, "grad_norm": 0.7371053695678711, "learning_rate": 2.2555110220440885e-05, "loss": 0.0197, "step": 2751 }, { "epoch": 11.008, "grad_norm": 0.7480749487876892, "learning_rate": 2.2545090180360722e-05, "loss": 0.0151, "step": 2752 }, { "epoch": 11.012, "grad_norm": 0.47101399302482605, "learning_rate": 2.253507014028056e-05, "loss": 0.0126, "step": 2753 }, { "epoch": 11.016, "grad_norm": 0.5534325242042542, "learning_rate": 2.25250501002004e-05, "loss": 0.0152, "step": 2754 }, { "epoch": 11.02, "grad_norm": 0.6473478078842163, "learning_rate": 2.251503006012024e-05, "loss": 0.0152, "step": 2755 }, { "epoch": 11.024, "grad_norm": 0.6150280833244324, "learning_rate": 2.250501002004008e-05, "loss": 0.0164, "step": 2756 }, { "epoch": 11.028, "grad_norm": 0.6889513731002808, "learning_rate": 2.2494989979959922e-05, "loss": 0.0138, "step": 2757 }, { "epoch": 11.032, "grad_norm": 0.7769390344619751, "learning_rate": 2.248496993987976e-05, "loss": 0.0163, "step": 2758 }, { "epoch": 11.036, "grad_norm": 0.7019205093383789, "learning_rate": 2.2474949899799598e-05, "loss": 0.0157, "step": 2759 }, { "epoch": 11.04, "grad_norm": 0.966162383556366, "learning_rate": 2.2464929859719443e-05, "loss": 0.0205, "step": 2760 }, { "epoch": 11.044, "grad_norm": 0.7418727278709412, "learning_rate": 2.245490981963928e-05, "loss": 0.0172, "step": 2761 }, { "epoch": 11.048, "grad_norm": 0.5667153596878052, "learning_rate": 2.244488977955912e-05, "loss": 0.0099, "step": 2762 }, { "epoch": 11.052, "grad_norm": 0.9118229150772095, "learning_rate": 2.243486973947896e-05, "loss": 0.0182, "step": 2763 }, { "epoch": 11.056, "grad_norm": 0.8428515195846558, "learning_rate": 2.2424849699398797e-05, "loss": 0.018, "step": 2764 }, { "epoch": 11.06, "grad_norm": 0.5862798094749451, "learning_rate": 2.241482965931864e-05, "loss": 0.015, "step": 2765 }, { "epoch": 11.064, "grad_norm": 0.6175693273544312, 
"learning_rate": 2.240480961923848e-05, "loss": 0.0171, "step": 2766 }, { "epoch": 11.068, "grad_norm": 0.7874165177345276, "learning_rate": 2.2394789579158318e-05, "loss": 0.0183, "step": 2767 }, { "epoch": 11.072, "grad_norm": 0.9282909631729126, "learning_rate": 2.2384769539078156e-05, "loss": 0.0184, "step": 2768 }, { "epoch": 11.076, "grad_norm": 0.7215555906295776, "learning_rate": 2.2374749498997997e-05, "loss": 0.0185, "step": 2769 }, { "epoch": 11.08, "grad_norm": 0.8601921200752258, "learning_rate": 2.236472945891784e-05, "loss": 0.0185, "step": 2770 }, { "epoch": 11.084, "grad_norm": 0.9709322452545166, "learning_rate": 2.2354709418837676e-05, "loss": 0.0185, "step": 2771 }, { "epoch": 11.088, "grad_norm": 0.799983561038971, "learning_rate": 2.2344689378757518e-05, "loss": 0.0164, "step": 2772 }, { "epoch": 11.092, "grad_norm": 0.47060427069664, "learning_rate": 2.2334669338677355e-05, "loss": 0.0136, "step": 2773 }, { "epoch": 11.096, "grad_norm": 0.8248326182365417, "learning_rate": 2.2324649298597193e-05, "loss": 0.018, "step": 2774 }, { "epoch": 11.1, "grad_norm": 0.7863957285881042, "learning_rate": 2.2314629258517035e-05, "loss": 0.0199, "step": 2775 }, { "epoch": 11.104, "grad_norm": 0.584855318069458, "learning_rate": 2.2304609218436876e-05, "loss": 0.0125, "step": 2776 }, { "epoch": 11.108, "grad_norm": 0.7124798893928528, "learning_rate": 2.2294589178356714e-05, "loss": 0.0128, "step": 2777 }, { "epoch": 11.112, "grad_norm": 0.6670029759407043, "learning_rate": 2.2284569138276555e-05, "loss": 0.0167, "step": 2778 }, { "epoch": 11.116, "grad_norm": 0.7364521026611328, "learning_rate": 2.2274549098196393e-05, "loss": 0.0155, "step": 2779 }, { "epoch": 11.12, "grad_norm": 0.7299067974090576, "learning_rate": 2.2264529058116234e-05, "loss": 0.0162, "step": 2780 }, { "epoch": 11.124, "grad_norm": 0.5108133554458618, "learning_rate": 2.2254509018036072e-05, "loss": 0.0162, "step": 2781 }, { "epoch": 11.128, "grad_norm": 0.8171691298484802, 
"learning_rate": 2.2244488977955913e-05, "loss": 0.0185, "step": 2782 }, { "epoch": 11.132, "grad_norm": 0.7556437253952026, "learning_rate": 2.223446893787575e-05, "loss": 0.0181, "step": 2783 }, { "epoch": 11.136, "grad_norm": 0.8455727100372314, "learning_rate": 2.2224448897795593e-05, "loss": 0.0158, "step": 2784 }, { "epoch": 11.14, "grad_norm": 0.7372032999992371, "learning_rate": 2.2214428857715434e-05, "loss": 0.0168, "step": 2785 }, { "epoch": 11.144, "grad_norm": 0.9267023801803589, "learning_rate": 2.2204408817635272e-05, "loss": 0.0162, "step": 2786 }, { "epoch": 11.148, "grad_norm": 0.7094591856002808, "learning_rate": 2.219438877755511e-05, "loss": 0.0167, "step": 2787 }, { "epoch": 11.152, "grad_norm": 0.8599808812141418, "learning_rate": 2.218436873747495e-05, "loss": 0.0223, "step": 2788 }, { "epoch": 11.156, "grad_norm": 0.6255855560302734, "learning_rate": 2.217434869739479e-05, "loss": 0.014, "step": 2789 }, { "epoch": 11.16, "grad_norm": 0.5172903537750244, "learning_rate": 2.216432865731463e-05, "loss": 0.0137, "step": 2790 }, { "epoch": 11.164, "grad_norm": 0.971419095993042, "learning_rate": 2.215430861723447e-05, "loss": 0.0165, "step": 2791 }, { "epoch": 11.168, "grad_norm": 0.8272615671157837, "learning_rate": 2.214428857715431e-05, "loss": 0.0165, "step": 2792 }, { "epoch": 11.172, "grad_norm": 0.8575184345245361, "learning_rate": 2.2134268537074147e-05, "loss": 0.0194, "step": 2793 }, { "epoch": 11.176, "grad_norm": 0.54851895570755, "learning_rate": 2.212424849699399e-05, "loss": 0.0147, "step": 2794 }, { "epoch": 11.18, "grad_norm": 0.6194243431091309, "learning_rate": 2.211422845691383e-05, "loss": 0.0155, "step": 2795 }, { "epoch": 11.184, "grad_norm": 0.6905847191810608, "learning_rate": 2.2104208416833668e-05, "loss": 0.0166, "step": 2796 }, { "epoch": 11.188, "grad_norm": 0.7952155470848083, "learning_rate": 2.209418837675351e-05, "loss": 0.0172, "step": 2797 }, { "epoch": 11.192, "grad_norm": 0.5479825735092163, "learning_rate": 
2.2084168336673347e-05, "loss": 0.0138, "step": 2798 }, { "epoch": 11.196, "grad_norm": 0.7587847709655762, "learning_rate": 2.2074148296593185e-05, "loss": 0.0174, "step": 2799 }, { "epoch": 11.2, "grad_norm": 0.6983550786972046, "learning_rate": 2.206412825651303e-05, "loss": 0.0158, "step": 2800 }, { "epoch": 11.204, "grad_norm": 0.7654238939285278, "learning_rate": 2.2054108216432867e-05, "loss": 0.0162, "step": 2801 }, { "epoch": 11.208, "grad_norm": 0.7754288911819458, "learning_rate": 2.2044088176352705e-05, "loss": 0.016, "step": 2802 }, { "epoch": 11.212, "grad_norm": 0.8081237077713013, "learning_rate": 2.2034068136272547e-05, "loss": 0.0176, "step": 2803 }, { "epoch": 11.216, "grad_norm": 0.6752600073814392, "learning_rate": 2.2024048096192384e-05, "loss": 0.0147, "step": 2804 }, { "epoch": 11.22, "grad_norm": 0.6993342041969299, "learning_rate": 2.2014028056112226e-05, "loss": 0.0159, "step": 2805 }, { "epoch": 11.224, "grad_norm": 0.6669374108314514, "learning_rate": 2.2004008016032067e-05, "loss": 0.0163, "step": 2806 }, { "epoch": 11.228, "grad_norm": 0.6162493228912354, "learning_rate": 2.1993987975951905e-05, "loss": 0.0168, "step": 2807 }, { "epoch": 11.232, "grad_norm": 1.0216128826141357, "learning_rate": 2.1983967935871743e-05, "loss": 0.0179, "step": 2808 }, { "epoch": 11.236, "grad_norm": 0.8339212536811829, "learning_rate": 2.1973947895791584e-05, "loss": 0.0161, "step": 2809 }, { "epoch": 11.24, "grad_norm": 0.6215983033180237, "learning_rate": 2.1963927855711425e-05, "loss": 0.0166, "step": 2810 }, { "epoch": 11.244, "grad_norm": 0.5474804639816284, "learning_rate": 2.1953907815631263e-05, "loss": 0.0149, "step": 2811 }, { "epoch": 11.248, "grad_norm": 0.5640460252761841, "learning_rate": 2.1943887775551105e-05, "loss": 0.0132, "step": 2812 }, { "epoch": 11.252, "grad_norm": 0.8308565616607666, "learning_rate": 2.1933867735470942e-05, "loss": 0.0185, "step": 2813 }, { "epoch": 11.256, "grad_norm": 1.0242302417755127, "learning_rate": 
2.1923847695390784e-05, "loss": 0.0197, "step": 2814 }, { "epoch": 11.26, "grad_norm": 0.8074948787689209, "learning_rate": 2.191382765531062e-05, "loss": 0.0146, "step": 2815 }, { "epoch": 11.264, "grad_norm": 0.5657005906105042, "learning_rate": 2.1903807615230463e-05, "loss": 0.0135, "step": 2816 }, { "epoch": 11.268, "grad_norm": 0.8721190690994263, "learning_rate": 2.18937875751503e-05, "loss": 0.0193, "step": 2817 }, { "epoch": 11.272, "grad_norm": 0.5107552409172058, "learning_rate": 2.1883767535070142e-05, "loss": 0.0158, "step": 2818 }, { "epoch": 11.276, "grad_norm": 0.747622549533844, "learning_rate": 2.1873747494989983e-05, "loss": 0.0152, "step": 2819 }, { "epoch": 11.28, "grad_norm": 0.7449367046356201, "learning_rate": 2.186372745490982e-05, "loss": 0.0163, "step": 2820 }, { "epoch": 11.284, "grad_norm": 0.47581028938293457, "learning_rate": 2.185370741482966e-05, "loss": 0.0142, "step": 2821 }, { "epoch": 11.288, "grad_norm": 0.5532326698303223, "learning_rate": 2.18436873747495e-05, "loss": 0.014, "step": 2822 }, { "epoch": 11.292, "grad_norm": 0.7616000175476074, "learning_rate": 2.1833667334669338e-05, "loss": 0.0169, "step": 2823 }, { "epoch": 11.296, "grad_norm": 0.4382837414741516, "learning_rate": 2.182364729458918e-05, "loss": 0.0098, "step": 2824 }, { "epoch": 11.3, "grad_norm": 0.6851035952568054, "learning_rate": 2.181362725450902e-05, "loss": 0.0172, "step": 2825 }, { "epoch": 11.304, "grad_norm": 0.7380544543266296, "learning_rate": 2.180360721442886e-05, "loss": 0.0176, "step": 2826 }, { "epoch": 11.308, "grad_norm": 0.4597725570201874, "learning_rate": 2.1793587174348697e-05, "loss": 0.0134, "step": 2827 }, { "epoch": 11.312, "grad_norm": 0.40633803606033325, "learning_rate": 2.1783567134268538e-05, "loss": 0.0129, "step": 2828 }, { "epoch": 11.316, "grad_norm": 0.9875346422195435, "learning_rate": 2.177354709418838e-05, "loss": 0.0103, "step": 2829 }, { "epoch": 11.32, "grad_norm": 0.9050310254096985, "learning_rate": 
2.1763527054108217e-05, "loss": 0.0154, "step": 2830 }, { "epoch": 11.324, "grad_norm": 0.7824398279190063, "learning_rate": 2.175350701402806e-05, "loss": 0.0186, "step": 2831 }, { "epoch": 11.328, "grad_norm": 0.9175853729248047, "learning_rate": 2.1743486973947896e-05, "loss": 0.018, "step": 2832 }, { "epoch": 11.332, "grad_norm": 0.75944584608078, "learning_rate": 2.1733466933867734e-05, "loss": 0.0188, "step": 2833 }, { "epoch": 11.336, "grad_norm": 0.7953121662139893, "learning_rate": 2.172344689378758e-05, "loss": 0.0164, "step": 2834 }, { "epoch": 11.34, "grad_norm": 0.662932276725769, "learning_rate": 2.1713426853707417e-05, "loss": 0.016, "step": 2835 }, { "epoch": 11.344, "grad_norm": 0.6539945602416992, "learning_rate": 2.1703406813627255e-05, "loss": 0.0145, "step": 2836 }, { "epoch": 11.348, "grad_norm": 0.8975854516029358, "learning_rate": 2.1693386773547096e-05, "loss": 0.0186, "step": 2837 }, { "epoch": 11.352, "grad_norm": 0.8039414882659912, "learning_rate": 2.1683366733466934e-05, "loss": 0.0185, "step": 2838 }, { "epoch": 11.356, "grad_norm": 0.891394853591919, "learning_rate": 2.1673346693386775e-05, "loss": 0.018, "step": 2839 }, { "epoch": 11.36, "grad_norm": 0.6643425226211548, "learning_rate": 2.1663326653306616e-05, "loss": 0.0151, "step": 2840 }, { "epoch": 11.364, "grad_norm": 0.6176607012748718, "learning_rate": 2.1653306613226454e-05, "loss": 0.0145, "step": 2841 }, { "epoch": 11.368, "grad_norm": 0.4472271203994751, "learning_rate": 2.1643286573146292e-05, "loss": 0.0092, "step": 2842 }, { "epoch": 11.372, "grad_norm": 0.7300881743431091, "learning_rate": 2.1633266533066133e-05, "loss": 0.0182, "step": 2843 }, { "epoch": 11.376, "grad_norm": 0.5038206577301025, "learning_rate": 2.1623246492985975e-05, "loss": 0.0171, "step": 2844 }, { "epoch": 11.38, "grad_norm": 0.8443142175674438, "learning_rate": 2.1613226452905813e-05, "loss": 0.0155, "step": 2845 }, { "epoch": 11.384, "grad_norm": 0.7423885464668274, "learning_rate": 
2.1603206412825654e-05, "loss": 0.0155, "step": 2846 }, { "epoch": 11.388, "grad_norm": 0.6411489248275757, "learning_rate": 2.1593186372745492e-05, "loss": 0.0157, "step": 2847 }, { "epoch": 11.392, "grad_norm": 0.7509790062904358, "learning_rate": 2.158316633266533e-05, "loss": 0.0163, "step": 2848 }, { "epoch": 11.396, "grad_norm": 0.7018547058105469, "learning_rate": 2.157314629258517e-05, "loss": 0.019, "step": 2849 }, { "epoch": 11.4, "grad_norm": 0.8679090738296509, "learning_rate": 2.1563126252505012e-05, "loss": 0.0145, "step": 2850 }, { "epoch": 11.404, "grad_norm": 0.8156054019927979, "learning_rate": 2.155310621242485e-05, "loss": 0.0157, "step": 2851 }, { "epoch": 11.408, "grad_norm": 0.7699236869812012, "learning_rate": 2.1543086172344688e-05, "loss": 0.0169, "step": 2852 }, { "epoch": 11.412, "grad_norm": 0.9097781777381897, "learning_rate": 2.153306613226453e-05, "loss": 0.014, "step": 2853 }, { "epoch": 11.416, "grad_norm": 0.3984467387199402, "learning_rate": 2.152304609218437e-05, "loss": 0.0126, "step": 2854 }, { "epoch": 11.42, "grad_norm": 0.8444249033927917, "learning_rate": 2.151302605210421e-05, "loss": 0.0182, "step": 2855 }, { "epoch": 11.424, "grad_norm": 0.9840474724769592, "learning_rate": 2.150300601202405e-05, "loss": 0.0199, "step": 2856 }, { "epoch": 11.428, "grad_norm": 0.778589129447937, "learning_rate": 2.1492985971943888e-05, "loss": 0.015, "step": 2857 }, { "epoch": 11.432, "grad_norm": 0.5163719654083252, "learning_rate": 2.1482965931863726e-05, "loss": 0.0148, "step": 2858 }, { "epoch": 11.436, "grad_norm": 0.6548240780830383, "learning_rate": 2.147294589178357e-05, "loss": 0.0159, "step": 2859 }, { "epoch": 11.44, "grad_norm": 0.6455692648887634, "learning_rate": 2.1462925851703408e-05, "loss": 0.0169, "step": 2860 }, { "epoch": 11.444, "grad_norm": 0.7446917295455933, "learning_rate": 2.1452905811623246e-05, "loss": 0.0189, "step": 2861 }, { "epoch": 11.448, "grad_norm": 0.6257652640342712, "learning_rate": 
2.1442885771543087e-05, "loss": 0.0153, "step": 2862 }, { "epoch": 11.452, "grad_norm": 0.9588707685470581, "learning_rate": 2.1432865731462925e-05, "loss": 0.0209, "step": 2863 }, { "epoch": 11.456, "grad_norm": 0.825225830078125, "learning_rate": 2.1422845691382767e-05, "loss": 0.0163, "step": 2864 }, { "epoch": 11.46, "grad_norm": 0.8426507711410522, "learning_rate": 2.1412825651302608e-05, "loss": 0.0208, "step": 2865 }, { "epoch": 11.464, "grad_norm": 0.6643405556678772, "learning_rate": 2.1402805611222446e-05, "loss": 0.0169, "step": 2866 }, { "epoch": 11.468, "grad_norm": 0.604695737361908, "learning_rate": 2.1392785571142284e-05, "loss": 0.0145, "step": 2867 }, { "epoch": 11.472, "grad_norm": 0.6502740383148193, "learning_rate": 2.1382765531062128e-05, "loss": 0.0152, "step": 2868 }, { "epoch": 11.475999999999999, "grad_norm": 0.47215506434440613, "learning_rate": 2.1372745490981966e-05, "loss": 0.0089, "step": 2869 }, { "epoch": 11.48, "grad_norm": 0.597215473651886, "learning_rate": 2.1362725450901804e-05, "loss": 0.0157, "step": 2870 }, { "epoch": 11.484, "grad_norm": 0.5259137153625488, "learning_rate": 2.1352705410821645e-05, "loss": 0.0144, "step": 2871 }, { "epoch": 11.488, "grad_norm": 0.680182695388794, "learning_rate": 2.1342685370741483e-05, "loss": 0.0183, "step": 2872 }, { "epoch": 11.492, "grad_norm": 0.8447750210762024, "learning_rate": 2.1332665330661324e-05, "loss": 0.0199, "step": 2873 }, { "epoch": 11.496, "grad_norm": 0.8049308657646179, "learning_rate": 2.1322645290581166e-05, "loss": 0.0175, "step": 2874 }, { "epoch": 11.5, "grad_norm": 0.916851818561554, "learning_rate": 2.1312625250501004e-05, "loss": 0.0191, "step": 2875 }, { "epoch": 11.504, "grad_norm": 0.7663381695747375, "learning_rate": 2.130260521042084e-05, "loss": 0.0162, "step": 2876 }, { "epoch": 11.508, "grad_norm": 0.639718770980835, "learning_rate": 2.1292585170340683e-05, "loss": 0.0128, "step": 2877 }, { "epoch": 11.512, "grad_norm": 0.7572237849235535, 
"learning_rate": 2.1282565130260524e-05, "loss": 0.0167, "step": 2878 }, { "epoch": 11.516, "grad_norm": 0.8541759252548218, "learning_rate": 2.1272545090180362e-05, "loss": 0.0182, "step": 2879 }, { "epoch": 11.52, "grad_norm": 0.8712204098701477, "learning_rate": 2.12625250501002e-05, "loss": 0.02, "step": 2880 }, { "epoch": 11.524000000000001, "grad_norm": 0.7454221248626709, "learning_rate": 2.125250501002004e-05, "loss": 0.0155, "step": 2881 }, { "epoch": 11.528, "grad_norm": 0.47309982776641846, "learning_rate": 2.124248496993988e-05, "loss": 0.0129, "step": 2882 }, { "epoch": 11.532, "grad_norm": 1.0623902082443237, "learning_rate": 2.123246492985972e-05, "loss": 0.0201, "step": 2883 }, { "epoch": 11.536, "grad_norm": 0.600472092628479, "learning_rate": 2.122244488977956e-05, "loss": 0.0149, "step": 2884 }, { "epoch": 11.54, "grad_norm": 0.9830151200294495, "learning_rate": 2.12124248496994e-05, "loss": 0.0171, "step": 2885 }, { "epoch": 11.544, "grad_norm": 0.49249526858329773, "learning_rate": 2.1202404809619237e-05, "loss": 0.0116, "step": 2886 }, { "epoch": 11.548, "grad_norm": 0.7066675424575806, "learning_rate": 2.119238476953908e-05, "loss": 0.0167, "step": 2887 }, { "epoch": 11.552, "grad_norm": 0.9759204983711243, "learning_rate": 2.118236472945892e-05, "loss": 0.0221, "step": 2888 }, { "epoch": 11.556000000000001, "grad_norm": 0.6902356743812561, "learning_rate": 2.1172344689378758e-05, "loss": 0.0154, "step": 2889 }, { "epoch": 11.56, "grad_norm": 0.7708073854446411, "learning_rate": 2.11623246492986e-05, "loss": 0.0142, "step": 2890 }, { "epoch": 11.564, "grad_norm": 0.6646739840507507, "learning_rate": 2.1152304609218437e-05, "loss": 0.0132, "step": 2891 }, { "epoch": 11.568, "grad_norm": 0.5692991614341736, "learning_rate": 2.1142284569138275e-05, "loss": 0.0158, "step": 2892 }, { "epoch": 11.572, "grad_norm": 0.683018147945404, "learning_rate": 2.113226452905812e-05, "loss": 0.0158, "step": 2893 }, { "epoch": 11.576, "grad_norm": 
0.6154299974441528, "learning_rate": 2.1122244488977958e-05, "loss": 0.0185, "step": 2894 }, { "epoch": 11.58, "grad_norm": 0.9322575926780701, "learning_rate": 2.1112224448897795e-05, "loss": 0.0197, "step": 2895 }, { "epoch": 11.584, "grad_norm": 0.6337608098983765, "learning_rate": 2.1102204408817637e-05, "loss": 0.0163, "step": 2896 }, { "epoch": 11.588, "grad_norm": 0.8087349534034729, "learning_rate": 2.1092184368737475e-05, "loss": 0.017, "step": 2897 }, { "epoch": 11.592, "grad_norm": 0.5522025227546692, "learning_rate": 2.1082164328657316e-05, "loss": 0.0155, "step": 2898 }, { "epoch": 11.596, "grad_norm": 0.5902281403541565, "learning_rate": 2.1072144288577157e-05, "loss": 0.017, "step": 2899 }, { "epoch": 11.6, "grad_norm": 0.5093696713447571, "learning_rate": 2.1062124248496995e-05, "loss": 0.0154, "step": 2900 }, { "epoch": 11.604, "grad_norm": 0.602093517780304, "learning_rate": 2.1052104208416833e-05, "loss": 0.0163, "step": 2901 }, { "epoch": 11.608, "grad_norm": 0.4907231628894806, "learning_rate": 2.1042084168336674e-05, "loss": 0.0105, "step": 2902 }, { "epoch": 11.612, "grad_norm": 0.6661096811294556, "learning_rate": 2.1032064128256516e-05, "loss": 0.0167, "step": 2903 }, { "epoch": 11.616, "grad_norm": 1.0688780546188354, "learning_rate": 2.1022044088176353e-05, "loss": 0.021, "step": 2904 }, { "epoch": 11.62, "grad_norm": 0.7456181645393372, "learning_rate": 2.1012024048096195e-05, "loss": 0.0168, "step": 2905 }, { "epoch": 11.624, "grad_norm": 0.7473933696746826, "learning_rate": 2.1002004008016033e-05, "loss": 0.0162, "step": 2906 }, { "epoch": 11.628, "grad_norm": 0.539610743522644, "learning_rate": 2.099198396793587e-05, "loss": 0.0148, "step": 2907 }, { "epoch": 11.632, "grad_norm": 1.1146992444992065, "learning_rate": 2.0981963927855712e-05, "loss": 0.0239, "step": 2908 }, { "epoch": 11.636, "grad_norm": 0.7290841341018677, "learning_rate": 2.0971943887775553e-05, "loss": 0.0145, "step": 2909 }, { "epoch": 11.64, "grad_norm": 
0.6614787578582764, "learning_rate": 2.096192384769539e-05, "loss": 0.0169, "step": 2910 }, { "epoch": 11.644, "grad_norm": 0.8798813819885254, "learning_rate": 2.0951903807615232e-05, "loss": 0.0196, "step": 2911 }, { "epoch": 11.648, "grad_norm": 0.5717126727104187, "learning_rate": 2.094188376753507e-05, "loss": 0.0155, "step": 2912 }, { "epoch": 11.652, "grad_norm": 1.0631353855133057, "learning_rate": 2.093186372745491e-05, "loss": 0.0191, "step": 2913 }, { "epoch": 11.656, "grad_norm": 0.7153030633926392, "learning_rate": 2.092184368737475e-05, "loss": 0.0173, "step": 2914 }, { "epoch": 11.66, "grad_norm": 0.8468221426010132, "learning_rate": 2.091182364729459e-05, "loss": 0.0203, "step": 2915 }, { "epoch": 11.664, "grad_norm": 0.6937319040298462, "learning_rate": 2.090180360721443e-05, "loss": 0.016, "step": 2916 }, { "epoch": 11.668, "grad_norm": 0.7014555931091309, "learning_rate": 2.089178356713427e-05, "loss": 0.0207, "step": 2917 }, { "epoch": 11.672, "grad_norm": 0.7516029477119446, "learning_rate": 2.088176352705411e-05, "loss": 0.016, "step": 2918 }, { "epoch": 11.676, "grad_norm": 0.8219577074050903, "learning_rate": 2.087174348697395e-05, "loss": 0.0167, "step": 2919 }, { "epoch": 11.68, "grad_norm": 0.5712000131607056, "learning_rate": 2.0861723446893787e-05, "loss": 0.0175, "step": 2920 }, { "epoch": 11.684, "grad_norm": 0.9513866305351257, "learning_rate": 2.0851703406813628e-05, "loss": 0.0197, "step": 2921 }, { "epoch": 11.688, "grad_norm": 0.7864239811897278, "learning_rate": 2.0841683366733466e-05, "loss": 0.0175, "step": 2922 }, { "epoch": 11.692, "grad_norm": 0.7664267420768738, "learning_rate": 2.0831663326653307e-05, "loss": 0.0189, "step": 2923 }, { "epoch": 11.696, "grad_norm": 0.6849053502082825, "learning_rate": 2.082164328657315e-05, "loss": 0.0175, "step": 2924 }, { "epoch": 11.7, "grad_norm": 0.7375827431678772, "learning_rate": 2.0811623246492986e-05, "loss": 0.0177, "step": 2925 }, { "epoch": 11.704, "grad_norm": 
0.661365807056427, "learning_rate": 2.0801603206412824e-05, "loss": 0.0154, "step": 2926 }, { "epoch": 11.708, "grad_norm": 0.5127745866775513, "learning_rate": 2.079158316633267e-05, "loss": 0.014, "step": 2927 }, { "epoch": 11.712, "grad_norm": 0.6667503714561462, "learning_rate": 2.0781563126252507e-05, "loss": 0.0158, "step": 2928 }, { "epoch": 11.716, "grad_norm": 0.6455088257789612, "learning_rate": 2.0771543086172345e-05, "loss": 0.0156, "step": 2929 }, { "epoch": 11.72, "grad_norm": 0.683720588684082, "learning_rate": 2.0761523046092186e-05, "loss": 0.015, "step": 2930 }, { "epoch": 11.724, "grad_norm": 0.6581544280052185, "learning_rate": 2.0751503006012024e-05, "loss": 0.0158, "step": 2931 }, { "epoch": 11.728, "grad_norm": 0.5899677276611328, "learning_rate": 2.0741482965931865e-05, "loss": 0.0159, "step": 2932 }, { "epoch": 11.732, "grad_norm": 0.6744598746299744, "learning_rate": 2.0731462925851707e-05, "loss": 0.0157, "step": 2933 }, { "epoch": 11.736, "grad_norm": 0.8028687834739685, "learning_rate": 2.0721442885771544e-05, "loss": 0.0172, "step": 2934 }, { "epoch": 11.74, "grad_norm": 0.8576624393463135, "learning_rate": 2.0711422845691382e-05, "loss": 0.0201, "step": 2935 }, { "epoch": 11.744, "grad_norm": 0.8122572302818298, "learning_rate": 2.0701402805611224e-05, "loss": 0.0187, "step": 2936 }, { "epoch": 11.748, "grad_norm": 0.5539607405662537, "learning_rate": 2.0691382765531065e-05, "loss": 0.0155, "step": 2937 }, { "epoch": 11.752, "grad_norm": 0.6936144232749939, "learning_rate": 2.0681362725450903e-05, "loss": 0.017, "step": 2938 }, { "epoch": 11.756, "grad_norm": 0.8226979970932007, "learning_rate": 2.0671342685370744e-05, "loss": 0.0196, "step": 2939 }, { "epoch": 11.76, "grad_norm": 0.49609795212745667, "learning_rate": 2.0661322645290582e-05, "loss": 0.014, "step": 2940 }, { "epoch": 11.764, "grad_norm": 0.8211017847061157, "learning_rate": 2.065130260521042e-05, "loss": 0.0176, "step": 2941 }, { "epoch": 11.768, "grad_norm": 
0.6408782601356506, "learning_rate": 2.064128256513026e-05, "loss": 0.0166, "step": 2942 }, { "epoch": 11.772, "grad_norm": 0.721869945526123, "learning_rate": 2.0631262525050102e-05, "loss": 0.0168, "step": 2943 }, { "epoch": 11.776, "grad_norm": 0.7406426668167114, "learning_rate": 2.062124248496994e-05, "loss": 0.0172, "step": 2944 }, { "epoch": 11.78, "grad_norm": 0.9095814228057861, "learning_rate": 2.061122244488978e-05, "loss": 0.0199, "step": 2945 }, { "epoch": 11.784, "grad_norm": 0.6619588136672974, "learning_rate": 2.060120240480962e-05, "loss": 0.0193, "step": 2946 }, { "epoch": 11.788, "grad_norm": 0.8828310966491699, "learning_rate": 2.059118236472946e-05, "loss": 0.0172, "step": 2947 }, { "epoch": 11.792, "grad_norm": 0.6653797626495361, "learning_rate": 2.05811623246493e-05, "loss": 0.0158, "step": 2948 }, { "epoch": 11.796, "grad_norm": 0.78447425365448, "learning_rate": 2.057114228456914e-05, "loss": 0.016, "step": 2949 }, { "epoch": 11.8, "grad_norm": 0.7116568088531494, "learning_rate": 2.0561122244488978e-05, "loss": 0.0183, "step": 2950 }, { "epoch": 11.804, "grad_norm": 0.5957525372505188, "learning_rate": 2.055110220440882e-05, "loss": 0.0165, "step": 2951 }, { "epoch": 11.808, "grad_norm": 0.6143118143081665, "learning_rate": 2.054108216432866e-05, "loss": 0.0168, "step": 2952 }, { "epoch": 11.812, "grad_norm": 0.8501677513122559, "learning_rate": 2.05310621242485e-05, "loss": 0.0147, "step": 2953 }, { "epoch": 11.816, "grad_norm": 0.8333978056907654, "learning_rate": 2.0521042084168336e-05, "loss": 0.0178, "step": 2954 }, { "epoch": 11.82, "grad_norm": 0.6845681667327881, "learning_rate": 2.0511022044088178e-05, "loss": 0.0186, "step": 2955 }, { "epoch": 11.824, "grad_norm": 0.6612383127212524, "learning_rate": 2.0501002004008015e-05, "loss": 0.0161, "step": 2956 }, { "epoch": 11.828, "grad_norm": 0.3920148015022278, "learning_rate": 2.0490981963927857e-05, "loss": 0.0082, "step": 2957 }, { "epoch": 11.832, "grad_norm": 0.9177662134170532, 
"learning_rate": 2.0480961923847698e-05, "loss": 0.0208, "step": 2958 }, { "epoch": 11.836, "grad_norm": 0.635342538356781, "learning_rate": 2.0470941883767536e-05, "loss": 0.0139, "step": 2959 }, { "epoch": 11.84, "grad_norm": 0.9092418551445007, "learning_rate": 2.0460921843687374e-05, "loss": 0.0228, "step": 2960 }, { "epoch": 11.844, "grad_norm": 0.7522537112236023, "learning_rate": 2.0450901803607215e-05, "loss": 0.0165, "step": 2961 }, { "epoch": 11.848, "grad_norm": 0.8673317432403564, "learning_rate": 2.0440881763527056e-05, "loss": 0.0196, "step": 2962 }, { "epoch": 11.852, "grad_norm": 0.5442277193069458, "learning_rate": 2.0430861723446894e-05, "loss": 0.015, "step": 2963 }, { "epoch": 11.856, "grad_norm": 0.8119293451309204, "learning_rate": 2.0420841683366736e-05, "loss": 0.0189, "step": 2964 }, { "epoch": 11.86, "grad_norm": 0.8945901989936829, "learning_rate": 2.0410821643286573e-05, "loss": 0.0165, "step": 2965 }, { "epoch": 11.864, "grad_norm": 0.6969497203826904, "learning_rate": 2.040080160320641e-05, "loss": 0.0157, "step": 2966 }, { "epoch": 11.868, "grad_norm": 0.700520396232605, "learning_rate": 2.0390781563126256e-05, "loss": 0.0168, "step": 2967 }, { "epoch": 11.872, "grad_norm": 0.679799497127533, "learning_rate": 2.0380761523046094e-05, "loss": 0.016, "step": 2968 }, { "epoch": 11.876, "grad_norm": 0.6355128884315491, "learning_rate": 2.0370741482965932e-05, "loss": 0.0156, "step": 2969 }, { "epoch": 11.88, "grad_norm": 0.7955642342567444, "learning_rate": 2.0360721442885773e-05, "loss": 0.0174, "step": 2970 }, { "epoch": 11.884, "grad_norm": 0.8534349203109741, "learning_rate": 2.035070140280561e-05, "loss": 0.0172, "step": 2971 }, { "epoch": 11.888, "grad_norm": 0.9360595941543579, "learning_rate": 2.0340681362725452e-05, "loss": 0.0177, "step": 2972 }, { "epoch": 11.892, "grad_norm": 0.7169991731643677, "learning_rate": 2.0330661322645294e-05, "loss": 0.0197, "step": 2973 }, { "epoch": 11.896, "grad_norm": 0.853161096572876, 
"learning_rate": 2.032064128256513e-05, "loss": 0.0197, "step": 2974 }, { "epoch": 11.9, "grad_norm": 0.714497447013855, "learning_rate": 2.031062124248497e-05, "loss": 0.0158, "step": 2975 }, { "epoch": 11.904, "grad_norm": 0.8602288365364075, "learning_rate": 2.030060120240481e-05, "loss": 0.0176, "step": 2976 }, { "epoch": 11.908, "grad_norm": 0.7017549276351929, "learning_rate": 2.0290581162324652e-05, "loss": 0.0171, "step": 2977 }, { "epoch": 11.912, "grad_norm": 0.7314296960830688, "learning_rate": 2.028056112224449e-05, "loss": 0.0177, "step": 2978 }, { "epoch": 11.916, "grad_norm": 0.5251822471618652, "learning_rate": 2.0270541082164328e-05, "loss": 0.0145, "step": 2979 }, { "epoch": 11.92, "grad_norm": 0.5692286491394043, "learning_rate": 2.026052104208417e-05, "loss": 0.0152, "step": 2980 }, { "epoch": 11.924, "grad_norm": 0.9542722105979919, "learning_rate": 2.025050100200401e-05, "loss": 0.019, "step": 2981 }, { "epoch": 11.928, "grad_norm": 0.4988907277584076, "learning_rate": 2.0240480961923848e-05, "loss": 0.0148, "step": 2982 }, { "epoch": 11.932, "grad_norm": 0.3393643796443939, "learning_rate": 2.023046092184369e-05, "loss": 0.0062, "step": 2983 }, { "epoch": 11.936, "grad_norm": 0.6072190999984741, "learning_rate": 2.0220440881763527e-05, "loss": 0.0157, "step": 2984 }, { "epoch": 11.94, "grad_norm": 0.7978100776672363, "learning_rate": 2.0210420841683365e-05, "loss": 0.0168, "step": 2985 }, { "epoch": 11.943999999999999, "grad_norm": 0.6525113582611084, "learning_rate": 2.020040080160321e-05, "loss": 0.0162, "step": 2986 }, { "epoch": 11.948, "grad_norm": 0.7721410989761353, "learning_rate": 2.0190380761523048e-05, "loss": 0.0168, "step": 2987 }, { "epoch": 11.952, "grad_norm": 0.9068525433540344, "learning_rate": 2.0180360721442886e-05, "loss": 0.019, "step": 2988 }, { "epoch": 11.956, "grad_norm": 0.6718014478683472, "learning_rate": 2.0170340681362727e-05, "loss": 0.0163, "step": 2989 }, { "epoch": 11.96, "grad_norm": 0.8195679187774658, 
"learning_rate": 2.0160320641282565e-05, "loss": 0.0176, "step": 2990 }, { "epoch": 11.964, "grad_norm": 0.6604552865028381, "learning_rate": 2.0150300601202406e-05, "loss": 0.0153, "step": 2991 }, { "epoch": 11.968, "grad_norm": 0.7295999526977539, "learning_rate": 2.0140280561122247e-05, "loss": 0.0154, "step": 2992 }, { "epoch": 11.972, "grad_norm": 0.9136268496513367, "learning_rate": 2.0130260521042085e-05, "loss": 0.0192, "step": 2993 }, { "epoch": 11.975999999999999, "grad_norm": 0.8510957956314087, "learning_rate": 2.0120240480961923e-05, "loss": 0.0182, "step": 2994 }, { "epoch": 11.98, "grad_norm": 0.9759835600852966, "learning_rate": 2.0110220440881764e-05, "loss": 0.0199, "step": 2995 }, { "epoch": 11.984, "grad_norm": 0.5948774814605713, "learning_rate": 2.0100200400801606e-05, "loss": 0.0156, "step": 2996 }, { "epoch": 11.988, "grad_norm": 0.6279386281967163, "learning_rate": 2.0090180360721444e-05, "loss": 0.0171, "step": 2997 }, { "epoch": 11.992, "grad_norm": 0.4541473686695099, "learning_rate": 2.0080160320641285e-05, "loss": 0.0098, "step": 2998 }, { "epoch": 11.996, "grad_norm": 0.7771435379981995, "learning_rate": 2.0070140280561123e-05, "loss": 0.0174, "step": 2999 }, { "epoch": 12.0, "grad_norm": 0.8217807412147522, "learning_rate": 2.006012024048096e-05, "loss": 0.021, "step": 3000 }, { "epoch": 12.004, "grad_norm": 0.6948625445365906, "learning_rate": 2.0050100200400805e-05, "loss": 0.0124, "step": 3001 }, { "epoch": 12.008, "grad_norm": 0.4800888001918793, "learning_rate": 2.0040080160320643e-05, "loss": 0.0112, "step": 3002 }, { "epoch": 12.012, "grad_norm": 0.5231291055679321, "learning_rate": 2.003006012024048e-05, "loss": 0.0113, "step": 3003 }, { "epoch": 12.016, "grad_norm": 0.35470184683799744, "learning_rate": 2.0020040080160322e-05, "loss": 0.0113, "step": 3004 }, { "epoch": 12.02, "grad_norm": 0.5849162936210632, "learning_rate": 2.001002004008016e-05, "loss": 0.0122, "step": 3005 }, { "epoch": 12.024, "grad_norm": 
0.5772752165794373, "learning_rate": 2e-05, "loss": 0.0126, "step": 3006 }, { "epoch": 12.028, "grad_norm": 0.39507535099983215, "learning_rate": 1.998997995991984e-05, "loss": 0.0104, "step": 3007 }, { "epoch": 12.032, "grad_norm": 0.5409855842590332, "learning_rate": 1.997995991983968e-05, "loss": 0.0125, "step": 3008 }, { "epoch": 12.036, "grad_norm": 0.5285557508468628, "learning_rate": 1.996993987975952e-05, "loss": 0.014, "step": 3009 }, { "epoch": 12.04, "grad_norm": 0.520379364490509, "learning_rate": 1.995991983967936e-05, "loss": 0.0131, "step": 3010 }, { "epoch": 12.044, "grad_norm": 0.5537810325622559, "learning_rate": 1.99498997995992e-05, "loss": 0.0127, "step": 3011 }, { "epoch": 12.048, "grad_norm": 0.4161956310272217, "learning_rate": 1.993987975951904e-05, "loss": 0.0123, "step": 3012 }, { "epoch": 12.052, "grad_norm": 0.2786479890346527, "learning_rate": 1.9929859719438877e-05, "loss": 0.0063, "step": 3013 }, { "epoch": 12.056, "grad_norm": 0.6354433298110962, "learning_rate": 1.991983967935872e-05, "loss": 0.0134, "step": 3014 }, { "epoch": 12.06, "grad_norm": 0.4777648150920868, "learning_rate": 1.9909819639278556e-05, "loss": 0.0113, "step": 3015 }, { "epoch": 12.064, "grad_norm": 0.622520923614502, "learning_rate": 1.9899799599198398e-05, "loss": 0.0121, "step": 3016 }, { "epoch": 12.068, "grad_norm": 0.4313918650150299, "learning_rate": 1.988977955911824e-05, "loss": 0.0111, "step": 3017 }, { "epoch": 12.072, "grad_norm": 0.35469359159469604, "learning_rate": 1.9879759519038077e-05, "loss": 0.0076, "step": 3018 }, { "epoch": 12.076, "grad_norm": 0.6942519545555115, "learning_rate": 1.9869739478957915e-05, "loss": 0.0121, "step": 3019 }, { "epoch": 12.08, "grad_norm": 0.5710741281509399, "learning_rate": 1.9859719438877756e-05, "loss": 0.0127, "step": 3020 }, { "epoch": 12.084, "grad_norm": 0.3416895270347595, "learning_rate": 1.9849699398797597e-05, "loss": 0.0116, "step": 3021 }, { "epoch": 12.088, "grad_norm": 0.5317337512969971, 
"learning_rate": 1.9839679358717435e-05, "loss": 0.0127, "step": 3022 }, { "epoch": 12.092, "grad_norm": 0.45818451046943665, "learning_rate": 1.9829659318637276e-05, "loss": 0.012, "step": 3023 }, { "epoch": 12.096, "grad_norm": 0.564237654209137, "learning_rate": 1.9819639278557114e-05, "loss": 0.0112, "step": 3024 }, { "epoch": 12.1, "grad_norm": 0.5866971611976624, "learning_rate": 1.9809619238476952e-05, "loss": 0.0153, "step": 3025 }, { "epoch": 12.104, "grad_norm": 0.44592806696891785, "learning_rate": 1.9799599198396797e-05, "loss": 0.0105, "step": 3026 }, { "epoch": 12.108, "grad_norm": 0.3792635500431061, "learning_rate": 1.9789579158316635e-05, "loss": 0.0102, "step": 3027 }, { "epoch": 12.112, "grad_norm": 0.5635868906974792, "learning_rate": 1.9779559118236473e-05, "loss": 0.0113, "step": 3028 }, { "epoch": 12.116, "grad_norm": 0.6451796293258667, "learning_rate": 1.9769539078156314e-05, "loss": 0.012, "step": 3029 }, { "epoch": 12.12, "grad_norm": 0.35301733016967773, "learning_rate": 1.9759519038076152e-05, "loss": 0.0103, "step": 3030 }, { "epoch": 12.124, "grad_norm": 0.4701129198074341, "learning_rate": 1.9749498997995993e-05, "loss": 0.0113, "step": 3031 }, { "epoch": 12.128, "grad_norm": 0.39844104647636414, "learning_rate": 1.9739478957915834e-05, "loss": 0.011, "step": 3032 }, { "epoch": 12.132, "grad_norm": 0.5066357254981995, "learning_rate": 1.9729458917835672e-05, "loss": 0.0114, "step": 3033 }, { "epoch": 12.136, "grad_norm": 0.3592749536037445, "learning_rate": 1.971943887775551e-05, "loss": 0.0109, "step": 3034 }, { "epoch": 12.14, "grad_norm": 0.5839434266090393, "learning_rate": 1.970941883767535e-05, "loss": 0.0137, "step": 3035 }, { "epoch": 12.144, "grad_norm": 0.35629791021347046, "learning_rate": 1.9699398797595193e-05, "loss": 0.0108, "step": 3036 }, { "epoch": 12.148, "grad_norm": 0.474680095911026, "learning_rate": 1.968937875751503e-05, "loss": 0.0099, "step": 3037 }, { "epoch": 12.152, "grad_norm": 0.4389362335205078, 
"learning_rate": 1.9679358717434872e-05, "loss": 0.0121, "step": 3038 }, { "epoch": 12.156, "grad_norm": 0.5454397201538086, "learning_rate": 1.966933867735471e-05, "loss": 0.012, "step": 3039 }, { "epoch": 12.16, "grad_norm": 0.4310266375541687, "learning_rate": 1.965931863727455e-05, "loss": 0.0124, "step": 3040 }, { "epoch": 12.164, "grad_norm": 0.38319042325019836, "learning_rate": 1.964929859719439e-05, "loss": 0.0115, "step": 3041 }, { "epoch": 12.168, "grad_norm": 0.5554765462875366, "learning_rate": 1.963927855711423e-05, "loss": 0.0123, "step": 3042 }, { "epoch": 12.172, "grad_norm": 0.43762677907943726, "learning_rate": 1.9629258517034068e-05, "loss": 0.0126, "step": 3043 }, { "epoch": 12.176, "grad_norm": 0.6405286192893982, "learning_rate": 1.961923847695391e-05, "loss": 0.0109, "step": 3044 }, { "epoch": 12.18, "grad_norm": 0.4233160614967346, "learning_rate": 1.960921843687375e-05, "loss": 0.0131, "step": 3045 }, { "epoch": 12.184, "grad_norm": 0.4447900652885437, "learning_rate": 1.959919839679359e-05, "loss": 0.012, "step": 3046 }, { "epoch": 12.188, "grad_norm": 0.44814789295196533, "learning_rate": 1.9589178356713426e-05, "loss": 0.0134, "step": 3047 }, { "epoch": 12.192, "grad_norm": 0.3883339762687683, "learning_rate": 1.9579158316633268e-05, "loss": 0.0127, "step": 3048 }, { "epoch": 12.196, "grad_norm": 0.476978063583374, "learning_rate": 1.9569138276553106e-05, "loss": 0.0114, "step": 3049 }, { "epoch": 12.2, "grad_norm": 0.41597089171409607, "learning_rate": 1.9559118236472947e-05, "loss": 0.0108, "step": 3050 }, { "epoch": 12.204, "grad_norm": 0.3592725992202759, "learning_rate": 1.9549098196392788e-05, "loss": 0.0114, "step": 3051 }, { "epoch": 12.208, "grad_norm": 0.5016531348228455, "learning_rate": 1.9539078156312626e-05, "loss": 0.0128, "step": 3052 }, { "epoch": 12.212, "grad_norm": 0.46499642729759216, "learning_rate": 1.9529058116232464e-05, "loss": 0.01, "step": 3053 }, { "epoch": 12.216, "grad_norm": 0.6043004989624023, 
"learning_rate": 1.9519038076152305e-05, "loss": 0.0102, "step": 3054 }, { "epoch": 12.22, "grad_norm": 0.48706644773483276, "learning_rate": 1.9509018036072147e-05, "loss": 0.0126, "step": 3055 }, { "epoch": 12.224, "grad_norm": 0.7386260032653809, "learning_rate": 1.9498997995991984e-05, "loss": 0.0117, "step": 3056 }, { "epoch": 12.228, "grad_norm": 0.6791260838508606, "learning_rate": 1.9488977955911826e-05, "loss": 0.0133, "step": 3057 }, { "epoch": 12.232, "grad_norm": 0.38665708899497986, "learning_rate": 1.9478957915831664e-05, "loss": 0.012, "step": 3058 }, { "epoch": 12.236, "grad_norm": 0.6571320295333862, "learning_rate": 1.94689378757515e-05, "loss": 0.0113, "step": 3059 }, { "epoch": 12.24, "grad_norm": 0.5152968168258667, "learning_rate": 1.9458917835671346e-05, "loss": 0.0121, "step": 3060 }, { "epoch": 12.244, "grad_norm": 1.3934143781661987, "learning_rate": 1.9448897795591184e-05, "loss": 0.0139, "step": 3061 }, { "epoch": 12.248, "grad_norm": 0.47328341007232666, "learning_rate": 1.9438877755511022e-05, "loss": 0.0149, "step": 3062 }, { "epoch": 12.252, "grad_norm": 0.24610672891139984, "learning_rate": 1.9428857715430863e-05, "loss": 0.0089, "step": 3063 }, { "epoch": 12.256, "grad_norm": 0.6528359055519104, "learning_rate": 1.94188376753507e-05, "loss": 0.0132, "step": 3064 }, { "epoch": 12.26, "grad_norm": 0.5463185906410217, "learning_rate": 1.9408817635270542e-05, "loss": 0.0133, "step": 3065 }, { "epoch": 12.264, "grad_norm": 0.40765008330345154, "learning_rate": 1.9398797595190384e-05, "loss": 0.0109, "step": 3066 }, { "epoch": 12.268, "grad_norm": 0.6481624841690063, "learning_rate": 1.938877755511022e-05, "loss": 0.0168, "step": 3067 }, { "epoch": 12.272, "grad_norm": 0.4352094829082489, "learning_rate": 1.937875751503006e-05, "loss": 0.0118, "step": 3068 }, { "epoch": 12.276, "grad_norm": 0.3659978210926056, "learning_rate": 1.93687374749499e-05, "loss": 0.0123, "step": 3069 }, { "epoch": 12.28, "grad_norm": 0.5505935549736023, 
"learning_rate": 1.9358717434869742e-05, "loss": 0.0125, "step": 3070 }, { "epoch": 12.284, "grad_norm": 0.5281729102134705, "learning_rate": 1.934869739478958e-05, "loss": 0.0158, "step": 3071 }, { "epoch": 12.288, "grad_norm": 0.42845743894577026, "learning_rate": 1.933867735470942e-05, "loss": 0.0114, "step": 3072 }, { "epoch": 12.292, "grad_norm": 0.3950668275356293, "learning_rate": 1.932865731462926e-05, "loss": 0.0115, "step": 3073 }, { "epoch": 12.296, "grad_norm": 0.6833834052085876, "learning_rate": 1.9318637274549097e-05, "loss": 0.0145, "step": 3074 }, { "epoch": 12.3, "grad_norm": 0.3856029808521271, "learning_rate": 1.930861723446894e-05, "loss": 0.0132, "step": 3075 }, { "epoch": 12.304, "grad_norm": 0.4963999092578888, "learning_rate": 1.929859719438878e-05, "loss": 0.0123, "step": 3076 }, { "epoch": 12.308, "grad_norm": 0.46567070484161377, "learning_rate": 1.9288577154308618e-05, "loss": 0.0106, "step": 3077 }, { "epoch": 12.312, "grad_norm": 0.5482111573219299, "learning_rate": 1.927855711422846e-05, "loss": 0.0145, "step": 3078 }, { "epoch": 12.316, "grad_norm": 0.5668717622756958, "learning_rate": 1.9268537074148297e-05, "loss": 0.0119, "step": 3079 }, { "epoch": 12.32, "grad_norm": 0.4780943691730499, "learning_rate": 1.9258517034068138e-05, "loss": 0.0131, "step": 3080 }, { "epoch": 12.324, "grad_norm": 0.5395445227622986, "learning_rate": 1.9248496993987976e-05, "loss": 0.0119, "step": 3081 }, { "epoch": 12.328, "grad_norm": 0.346057265996933, "learning_rate": 1.9238476953907817e-05, "loss": 0.011, "step": 3082 }, { "epoch": 12.332, "grad_norm": 0.37034234404563904, "learning_rate": 1.9228456913827655e-05, "loss": 0.0119, "step": 3083 }, { "epoch": 12.336, "grad_norm": 0.3500770032405853, "learning_rate": 1.9218436873747493e-05, "loss": 0.0125, "step": 3084 }, { "epoch": 12.34, "grad_norm": 0.6100295782089233, "learning_rate": 1.9208416833667338e-05, "loss": 0.0154, "step": 3085 }, { "epoch": 12.344, "grad_norm": 0.5396653413772583, 
"learning_rate": 1.9198396793587175e-05, "loss": 0.0139, "step": 3086 }, { "epoch": 12.348, "grad_norm": 0.5847299695014954, "learning_rate": 1.9188376753507013e-05, "loss": 0.0119, "step": 3087 }, { "epoch": 12.352, "grad_norm": 0.38033461570739746, "learning_rate": 1.9178356713426855e-05, "loss": 0.012, "step": 3088 }, { "epoch": 12.356, "grad_norm": 0.4676550030708313, "learning_rate": 1.9168336673346693e-05, "loss": 0.0131, "step": 3089 }, { "epoch": 12.36, "grad_norm": 0.40676653385162354, "learning_rate": 1.9158316633266534e-05, "loss": 0.0128, "step": 3090 }, { "epoch": 12.364, "grad_norm": 0.4821133017539978, "learning_rate": 1.9148296593186375e-05, "loss": 0.0127, "step": 3091 }, { "epoch": 12.368, "grad_norm": 0.3716961145401001, "learning_rate": 1.9138276553106213e-05, "loss": 0.0123, "step": 3092 }, { "epoch": 12.372, "grad_norm": 0.4660569131374359, "learning_rate": 1.912825651302605e-05, "loss": 0.0112, "step": 3093 }, { "epoch": 12.376, "grad_norm": 0.579636812210083, "learning_rate": 1.9118236472945896e-05, "loss": 0.0129, "step": 3094 }, { "epoch": 12.38, "grad_norm": 0.373367577791214, "learning_rate": 1.9108216432865733e-05, "loss": 0.0112, "step": 3095 }, { "epoch": 12.384, "grad_norm": 0.41835731267929077, "learning_rate": 1.909819639278557e-05, "loss": 0.0079, "step": 3096 }, { "epoch": 12.388, "grad_norm": 0.43698638677597046, "learning_rate": 1.9088176352705413e-05, "loss": 0.0112, "step": 3097 }, { "epoch": 12.392, "grad_norm": 0.7125802636146545, "learning_rate": 1.907815631262525e-05, "loss": 0.0122, "step": 3098 }, { "epoch": 12.396, "grad_norm": 0.6462778449058533, "learning_rate": 1.9068136272545092e-05, "loss": 0.0124, "step": 3099 }, { "epoch": 12.4, "grad_norm": 0.3642807900905609, "learning_rate": 1.9058116232464933e-05, "loss": 0.0109, "step": 3100 }, { "epoch": 12.404, "grad_norm": 0.44956448674201965, "learning_rate": 1.904809619238477e-05, "loss": 0.0127, "step": 3101 }, { "epoch": 12.408, "grad_norm": 0.398598849773407, 
"learning_rate": 1.903807615230461e-05, "loss": 0.0076, "step": 3102 }, { "epoch": 12.412, "grad_norm": 0.8257546424865723, "learning_rate": 1.902805611222445e-05, "loss": 0.0131, "step": 3103 }, { "epoch": 12.416, "grad_norm": 0.5510148406028748, "learning_rate": 1.901803607214429e-05, "loss": 0.0134, "step": 3104 }, { "epoch": 12.42, "grad_norm": 0.415303111076355, "learning_rate": 1.900801603206413e-05, "loss": 0.0125, "step": 3105 }, { "epoch": 12.424, "grad_norm": 0.36105599999427795, "learning_rate": 1.8997995991983967e-05, "loss": 0.0126, "step": 3106 }, { "epoch": 12.428, "grad_norm": 0.45794200897216797, "learning_rate": 1.898797595190381e-05, "loss": 0.0139, "step": 3107 }, { "epoch": 12.432, "grad_norm": 0.5848199129104614, "learning_rate": 1.8977955911823646e-05, "loss": 0.0119, "step": 3108 }, { "epoch": 12.436, "grad_norm": 0.528476893901825, "learning_rate": 1.8967935871743488e-05, "loss": 0.013, "step": 3109 }, { "epoch": 12.44, "grad_norm": 0.43130603432655334, "learning_rate": 1.895791583166333e-05, "loss": 0.0125, "step": 3110 }, { "epoch": 12.444, "grad_norm": 0.5508957505226135, "learning_rate": 1.8947895791583167e-05, "loss": 0.014, "step": 3111 }, { "epoch": 12.448, "grad_norm": 0.5716979503631592, "learning_rate": 1.8937875751503005e-05, "loss": 0.0123, "step": 3112 }, { "epoch": 12.452, "grad_norm": 0.5015970468521118, "learning_rate": 1.8927855711422846e-05, "loss": 0.0122, "step": 3113 }, { "epoch": 12.456, "grad_norm": 0.379427045583725, "learning_rate": 1.8917835671342687e-05, "loss": 0.0117, "step": 3114 }, { "epoch": 12.46, "grad_norm": 0.3971169888973236, "learning_rate": 1.8907815631262525e-05, "loss": 0.0131, "step": 3115 }, { "epoch": 12.464, "grad_norm": 0.540484607219696, "learning_rate": 1.8897795591182367e-05, "loss": 0.012, "step": 3116 }, { "epoch": 12.468, "grad_norm": 0.6403137445449829, "learning_rate": 1.8887775551102204e-05, "loss": 0.0119, "step": 3117 }, { "epoch": 12.472, "grad_norm": 0.4189258813858032, 
"learning_rate": 1.8877755511022042e-05, "loss": 0.0121, "step": 3118 }, { "epoch": 12.475999999999999, "grad_norm": 0.731610119342804, "learning_rate": 1.8867735470941887e-05, "loss": 0.0126, "step": 3119 }, { "epoch": 12.48, "grad_norm": 0.5180526375770569, "learning_rate": 1.8857715430861725e-05, "loss": 0.0113, "step": 3120 }, { "epoch": 12.484, "grad_norm": 0.45408037304878235, "learning_rate": 1.8847695390781563e-05, "loss": 0.0114, "step": 3121 }, { "epoch": 12.488, "grad_norm": 0.5250622034072876, "learning_rate": 1.8837675350701404e-05, "loss": 0.0125, "step": 3122 }, { "epoch": 12.492, "grad_norm": 0.41708803176879883, "learning_rate": 1.8827655310621242e-05, "loss": 0.0108, "step": 3123 }, { "epoch": 12.496, "grad_norm": 0.4630829989910126, "learning_rate": 1.8817635270541083e-05, "loss": 0.014, "step": 3124 }, { "epoch": 12.5, "grad_norm": 0.5149716138839722, "learning_rate": 1.8807615230460925e-05, "loss": 0.0145, "step": 3125 }, { "epoch": 12.504, "grad_norm": 0.4337029457092285, "learning_rate": 1.8797595190380762e-05, "loss": 0.0129, "step": 3126 }, { "epoch": 12.508, "grad_norm": 0.5135932564735413, "learning_rate": 1.87875751503006e-05, "loss": 0.0123, "step": 3127 }, { "epoch": 12.512, "grad_norm": 0.5922970771789551, "learning_rate": 1.877755511022044e-05, "loss": 0.0142, "step": 3128 }, { "epoch": 12.516, "grad_norm": 0.6139187216758728, "learning_rate": 1.8767535070140283e-05, "loss": 0.0155, "step": 3129 }, { "epoch": 12.52, "grad_norm": 0.34099677205085754, "learning_rate": 1.875751503006012e-05, "loss": 0.0114, "step": 3130 }, { "epoch": 12.524000000000001, "grad_norm": 0.35690101981163025, "learning_rate": 1.8747494989979962e-05, "loss": 0.0115, "step": 3131 }, { "epoch": 12.528, "grad_norm": 0.4974209666252136, "learning_rate": 1.87374749498998e-05, "loss": 0.0166, "step": 3132 }, { "epoch": 12.532, "grad_norm": 0.47665518522262573, "learning_rate": 1.8727454909819638e-05, "loss": 0.0135, "step": 3133 }, { "epoch": 12.536, "grad_norm": 
0.45617246627807617, "learning_rate": 1.871743486973948e-05, "loss": 0.0135, "step": 3134 }, { "epoch": 12.54, "grad_norm": 0.4416828453540802, "learning_rate": 1.870741482965932e-05, "loss": 0.0118, "step": 3135 }, { "epoch": 12.544, "grad_norm": 0.4680028259754181, "learning_rate": 1.869739478957916e-05, "loss": 0.0128, "step": 3136 }, { "epoch": 12.548, "grad_norm": 0.4106837809085846, "learning_rate": 1.8687374749499e-05, "loss": 0.0121, "step": 3137 }, { "epoch": 12.552, "grad_norm": 0.53117835521698, "learning_rate": 1.8677354709418837e-05, "loss": 0.0121, "step": 3138 }, { "epoch": 12.556000000000001, "grad_norm": 0.43182119727134705, "learning_rate": 1.866733466933868e-05, "loss": 0.0123, "step": 3139 }, { "epoch": 12.56, "grad_norm": 0.7537881731987, "learning_rate": 1.8657314629258517e-05, "loss": 0.0145, "step": 3140 }, { "epoch": 12.564, "grad_norm": 0.3404240906238556, "learning_rate": 1.8647294589178358e-05, "loss": 0.0115, "step": 3141 }, { "epoch": 12.568, "grad_norm": 0.6614292860031128, "learning_rate": 1.8637274549098196e-05, "loss": 0.0117, "step": 3142 }, { "epoch": 12.572, "grad_norm": 0.46803048253059387, "learning_rate": 1.8627254509018037e-05, "loss": 0.0117, "step": 3143 }, { "epoch": 12.576, "grad_norm": 0.3678813874721527, "learning_rate": 1.861723446893788e-05, "loss": 0.0115, "step": 3144 }, { "epoch": 12.58, "grad_norm": 0.30099377036094666, "learning_rate": 1.8607214428857716e-05, "loss": 0.0103, "step": 3145 }, { "epoch": 12.584, "grad_norm": 0.47530147433280945, "learning_rate": 1.8597194388777554e-05, "loss": 0.013, "step": 3146 }, { "epoch": 12.588, "grad_norm": 1.0010900497436523, "learning_rate": 1.8587174348697395e-05, "loss": 0.0121, "step": 3147 }, { "epoch": 12.592, "grad_norm": 0.3169981837272644, "learning_rate": 1.8577154308617237e-05, "loss": 0.0113, "step": 3148 }, { "epoch": 12.596, "grad_norm": 0.4307844936847687, "learning_rate": 1.8567134268537075e-05, "loss": 0.0123, "step": 3149 }, { "epoch": 12.6, "grad_norm": 
0.5654677748680115, "learning_rate": 1.8557114228456916e-05, "loss": 0.0135, "step": 3150 }, { "epoch": 12.604, "grad_norm": 0.432781457901001, "learning_rate": 1.8547094188376754e-05, "loss": 0.0112, "step": 3151 }, { "epoch": 12.608, "grad_norm": 0.6548603177070618, "learning_rate": 1.8537074148296592e-05, "loss": 0.013, "step": 3152 }, { "epoch": 12.612, "grad_norm": 0.5078880786895752, "learning_rate": 1.8527054108216436e-05, "loss": 0.0114, "step": 3153 }, { "epoch": 12.616, "grad_norm": 0.6445592641830444, "learning_rate": 1.8517034068136274e-05, "loss": 0.0132, "step": 3154 }, { "epoch": 12.62, "grad_norm": 0.29496875405311584, "learning_rate": 1.8507014028056112e-05, "loss": 0.0109, "step": 3155 }, { "epoch": 12.624, "grad_norm": 1.0399456024169922, "learning_rate": 1.8496993987975953e-05, "loss": 0.0156, "step": 3156 }, { "epoch": 12.628, "grad_norm": 0.5184885263442993, "learning_rate": 1.848697394789579e-05, "loss": 0.011, "step": 3157 }, { "epoch": 12.632, "grad_norm": 0.251708984375, "learning_rate": 1.8476953907815633e-05, "loss": 0.0064, "step": 3158 }, { "epoch": 12.636, "grad_norm": 0.5945011377334595, "learning_rate": 1.8466933867735474e-05, "loss": 0.0151, "step": 3159 }, { "epoch": 12.64, "grad_norm": 0.3488098382949829, "learning_rate": 1.8456913827655312e-05, "loss": 0.0112, "step": 3160 }, { "epoch": 12.644, "grad_norm": 0.46110039949417114, "learning_rate": 1.844689378757515e-05, "loss": 0.0123, "step": 3161 }, { "epoch": 12.648, "grad_norm": 0.5689302682876587, "learning_rate": 1.843687374749499e-05, "loss": 0.012, "step": 3162 }, { "epoch": 12.652, "grad_norm": 0.4283682703971863, "learning_rate": 1.8426853707414832e-05, "loss": 0.0115, "step": 3163 }, { "epoch": 12.656, "grad_norm": 0.5536277890205383, "learning_rate": 1.841683366733467e-05, "loss": 0.0108, "step": 3164 }, { "epoch": 12.66, "grad_norm": 0.5680063366889954, "learning_rate": 1.840681362725451e-05, "loss": 0.014, "step": 3165 }, { "epoch": 12.664, "grad_norm": 
0.42657792568206787, "learning_rate": 1.839679358717435e-05, "loss": 0.013, "step": 3166 }, { "epoch": 12.668, "grad_norm": 0.33542749285697937, "learning_rate": 1.8386773547094187e-05, "loss": 0.0123, "step": 3167 }, { "epoch": 12.672, "grad_norm": 0.7784429788589478, "learning_rate": 1.837675350701403e-05, "loss": 0.0186, "step": 3168 }, { "epoch": 12.676, "grad_norm": 0.35441887378692627, "learning_rate": 1.836673346693387e-05, "loss": 0.0113, "step": 3169 }, { "epoch": 12.68, "grad_norm": 0.6247470378875732, "learning_rate": 1.8356713426853708e-05, "loss": 0.0117, "step": 3170 }, { "epoch": 12.684, "grad_norm": 0.7267860770225525, "learning_rate": 1.834669338677355e-05, "loss": 0.0126, "step": 3171 }, { "epoch": 12.688, "grad_norm": 0.40373706817626953, "learning_rate": 1.8336673346693387e-05, "loss": 0.0124, "step": 3172 }, { "epoch": 12.692, "grad_norm": 0.38257259130477905, "learning_rate": 1.8326653306613228e-05, "loss": 0.0122, "step": 3173 }, { "epoch": 12.696, "grad_norm": 0.3000936210155487, "learning_rate": 1.8316633266533066e-05, "loss": 0.0107, "step": 3174 }, { "epoch": 12.7, "grad_norm": 0.5243247747421265, "learning_rate": 1.8306613226452907e-05, "loss": 0.012, "step": 3175 }, { "epoch": 12.704, "grad_norm": 0.6114549040794373, "learning_rate": 1.8296593186372745e-05, "loss": 0.0106, "step": 3176 }, { "epoch": 12.708, "grad_norm": 0.9472494721412659, "learning_rate": 1.8286573146292587e-05, "loss": 0.0164, "step": 3177 }, { "epoch": 12.712, "grad_norm": 0.5240547060966492, "learning_rate": 1.8276553106212428e-05, "loss": 0.012, "step": 3178 }, { "epoch": 12.716, "grad_norm": 0.6829943656921387, "learning_rate": 1.8266533066132266e-05, "loss": 0.0123, "step": 3179 }, { "epoch": 12.72, "grad_norm": 0.4161919951438904, "learning_rate": 1.8256513026052104e-05, "loss": 0.0115, "step": 3180 }, { "epoch": 12.724, "grad_norm": 0.2771739959716797, "learning_rate": 1.8246492985971945e-05, "loss": 0.0073, "step": 3181 }, { "epoch": 12.728, "grad_norm": 
0.40933287143707275, "learning_rate": 1.8236472945891783e-05, "loss": 0.0119, "step": 3182 }, { "epoch": 12.732, "grad_norm": 0.44479766488075256, "learning_rate": 1.8226452905811624e-05, "loss": 0.0143, "step": 3183 }, { "epoch": 12.736, "grad_norm": 0.32802703976631165, "learning_rate": 1.8216432865731465e-05, "loss": 0.0102, "step": 3184 }, { "epoch": 12.74, "grad_norm": 0.43562448024749756, "learning_rate": 1.8206412825651303e-05, "loss": 0.0125, "step": 3185 }, { "epoch": 12.744, "grad_norm": 0.2993567883968353, "learning_rate": 1.819639278557114e-05, "loss": 0.0064, "step": 3186 }, { "epoch": 12.748, "grad_norm": 0.45686689019203186, "learning_rate": 1.8186372745490982e-05, "loss": 0.0137, "step": 3187 }, { "epoch": 12.752, "grad_norm": 0.6637890934944153, "learning_rate": 1.8176352705410824e-05, "loss": 0.0117, "step": 3188 }, { "epoch": 12.756, "grad_norm": 0.4745820462703705, "learning_rate": 1.816633266533066e-05, "loss": 0.0127, "step": 3189 }, { "epoch": 12.76, "grad_norm": 0.5020652413368225, "learning_rate": 1.8156312625250503e-05, "loss": 0.0141, "step": 3190 }, { "epoch": 12.764, "grad_norm": 0.49237433075904846, "learning_rate": 1.814629258517034e-05, "loss": 0.0151, "step": 3191 }, { "epoch": 12.768, "grad_norm": 0.6075673699378967, "learning_rate": 1.813627254509018e-05, "loss": 0.0133, "step": 3192 }, { "epoch": 12.772, "grad_norm": 0.5600119233131409, "learning_rate": 1.8126252505010023e-05, "loss": 0.0158, "step": 3193 }, { "epoch": 12.776, "grad_norm": 0.4790187478065491, "learning_rate": 1.811623246492986e-05, "loss": 0.0146, "step": 3194 }, { "epoch": 12.78, "grad_norm": 0.43495261669158936, "learning_rate": 1.81062124248497e-05, "loss": 0.0118, "step": 3195 }, { "epoch": 12.784, "grad_norm": 0.36963024735450745, "learning_rate": 1.809619238476954e-05, "loss": 0.0116, "step": 3196 }, { "epoch": 12.788, "grad_norm": 0.49386468529701233, "learning_rate": 1.8086172344689378e-05, "loss": 0.0144, "step": 3197 }, { "epoch": 12.792, "grad_norm": 
0.6671590805053711, "learning_rate": 1.807615230460922e-05, "loss": 0.013, "step": 3198 }, { "epoch": 12.796, "grad_norm": 0.39627164602279663, "learning_rate": 1.806613226452906e-05, "loss": 0.0122, "step": 3199 }, { "epoch": 12.8, "grad_norm": 0.5293060541152954, "learning_rate": 1.80561122244489e-05, "loss": 0.0139, "step": 3200 }, { "epoch": 12.804, "grad_norm": 0.4009303152561188, "learning_rate": 1.8046092184368737e-05, "loss": 0.0131, "step": 3201 }, { "epoch": 12.808, "grad_norm": 0.4425906240940094, "learning_rate": 1.8036072144288578e-05, "loss": 0.0124, "step": 3202 }, { "epoch": 12.812, "grad_norm": 0.38386768102645874, "learning_rate": 1.802605210420842e-05, "loss": 0.0126, "step": 3203 }, { "epoch": 12.816, "grad_norm": 0.48892340064048767, "learning_rate": 1.8016032064128257e-05, "loss": 0.0118, "step": 3204 }, { "epoch": 12.82, "grad_norm": 0.44380101561546326, "learning_rate": 1.80060120240481e-05, "loss": 0.0129, "step": 3205 }, { "epoch": 12.824, "grad_norm": 0.3459862172603607, "learning_rate": 1.7995991983967936e-05, "loss": 0.0119, "step": 3206 }, { "epoch": 12.828, "grad_norm": 0.41011375188827515, "learning_rate": 1.7985971943887778e-05, "loss": 0.0119, "step": 3207 }, { "epoch": 12.832, "grad_norm": 0.5159206986427307, "learning_rate": 1.7975951903807615e-05, "loss": 0.0167, "step": 3208 }, { "epoch": 12.836, "grad_norm": 0.22304032742977142, "learning_rate": 1.7965931863727457e-05, "loss": 0.0066, "step": 3209 }, { "epoch": 12.84, "grad_norm": 0.3644813597202301, "learning_rate": 1.7955911823647295e-05, "loss": 0.0084, "step": 3210 }, { "epoch": 12.844, "grad_norm": 0.5323387384414673, "learning_rate": 1.7945891783567133e-05, "loss": 0.0146, "step": 3211 }, { "epoch": 12.848, "grad_norm": 0.647765576839447, "learning_rate": 1.7935871743486977e-05, "loss": 0.0127, "step": 3212 }, { "epoch": 12.852, "grad_norm": 0.4638051688671112, "learning_rate": 1.7925851703406815e-05, "loss": 0.0083, "step": 3213 }, { "epoch": 12.856, "grad_norm": 
0.7238399982452393, "learning_rate": 1.7915831663326653e-05, "loss": 0.0136, "step": 3214 }, { "epoch": 12.86, "grad_norm": 0.7715800404548645, "learning_rate": 1.7905811623246494e-05, "loss": 0.0149, "step": 3215 }, { "epoch": 12.864, "grad_norm": 0.7044159173965454, "learning_rate": 1.7895791583166332e-05, "loss": 0.0154, "step": 3216 }, { "epoch": 12.868, "grad_norm": 0.45285147428512573, "learning_rate": 1.7885771543086173e-05, "loss": 0.0113, "step": 3217 }, { "epoch": 12.872, "grad_norm": 0.42862364649772644, "learning_rate": 1.7875751503006015e-05, "loss": 0.0122, "step": 3218 }, { "epoch": 12.876, "grad_norm": 0.4750185012817383, "learning_rate": 1.7865731462925853e-05, "loss": 0.0129, "step": 3219 }, { "epoch": 12.88, "grad_norm": 0.4365493059158325, "learning_rate": 1.785571142284569e-05, "loss": 0.0143, "step": 3220 }, { "epoch": 12.884, "grad_norm": 0.7560842633247375, "learning_rate": 1.7845691382765532e-05, "loss": 0.0162, "step": 3221 }, { "epoch": 12.888, "grad_norm": 0.21909433603286743, "learning_rate": 1.7835671342685373e-05, "loss": 0.0064, "step": 3222 }, { "epoch": 12.892, "grad_norm": 0.6783998608589172, "learning_rate": 1.782565130260521e-05, "loss": 0.0135, "step": 3223 }, { "epoch": 12.896, "grad_norm": 0.3435244560241699, "learning_rate": 1.7815631262525052e-05, "loss": 0.0119, "step": 3224 }, { "epoch": 12.9, "grad_norm": 0.20075704157352448, "learning_rate": 1.780561122244489e-05, "loss": 0.0071, "step": 3225 }, { "epoch": 12.904, "grad_norm": 0.44066622853279114, "learning_rate": 1.7795591182364728e-05, "loss": 0.0124, "step": 3226 }, { "epoch": 12.908, "grad_norm": 0.3443538248538971, "learning_rate": 1.7785571142284573e-05, "loss": 0.0116, "step": 3227 }, { "epoch": 12.912, "grad_norm": 0.3867914378643036, "learning_rate": 1.777555110220441e-05, "loss": 0.0131, "step": 3228 }, { "epoch": 12.916, "grad_norm": 0.44781914353370667, "learning_rate": 1.776553106212425e-05, "loss": 0.0138, "step": 3229 }, { "epoch": 12.92, "grad_norm": 
0.4673486649990082, "learning_rate": 1.775551102204409e-05, "loss": 0.0128, "step": 3230 }, { "epoch": 12.924, "grad_norm": 0.8357963562011719, "learning_rate": 1.7745490981963928e-05, "loss": 0.0163, "step": 3231 }, { "epoch": 12.928, "grad_norm": 0.52073073387146, "learning_rate": 1.773547094188377e-05, "loss": 0.0121, "step": 3232 }, { "epoch": 12.932, "grad_norm": 0.620764970779419, "learning_rate": 1.7725450901803607e-05, "loss": 0.0132, "step": 3233 }, { "epoch": 12.936, "grad_norm": 0.5402158498764038, "learning_rate": 1.7715430861723448e-05, "loss": 0.0147, "step": 3234 }, { "epoch": 12.94, "grad_norm": 0.6234936118125916, "learning_rate": 1.7705410821643286e-05, "loss": 0.0142, "step": 3235 }, { "epoch": 12.943999999999999, "grad_norm": 0.7595784664154053, "learning_rate": 1.7695390781563127e-05, "loss": 0.017, "step": 3236 }, { "epoch": 12.948, "grad_norm": 0.7245892882347107, "learning_rate": 1.768537074148297e-05, "loss": 0.014, "step": 3237 }, { "epoch": 12.952, "grad_norm": 0.6920550465583801, "learning_rate": 1.7675350701402807e-05, "loss": 0.0126, "step": 3238 }, { "epoch": 12.956, "grad_norm": 0.6069607734680176, "learning_rate": 1.7665330661322644e-05, "loss": 0.014, "step": 3239 }, { "epoch": 12.96, "grad_norm": 0.40507733821868896, "learning_rate": 1.7655310621242486e-05, "loss": 0.0138, "step": 3240 }, { "epoch": 12.964, "grad_norm": 0.40632104873657227, "learning_rate": 1.7645290581162324e-05, "loss": 0.0138, "step": 3241 }, { "epoch": 12.968, "grad_norm": 0.6928650140762329, "learning_rate": 1.7635270541082165e-05, "loss": 0.016, "step": 3242 }, { "epoch": 12.972, "grad_norm": 0.5469399094581604, "learning_rate": 1.7625250501002006e-05, "loss": 0.0149, "step": 3243 }, { "epoch": 12.975999999999999, "grad_norm": 0.44268378615379333, "learning_rate": 1.7615230460921844e-05, "loss": 0.013, "step": 3244 }, { "epoch": 12.98, "grad_norm": 0.40536338090896606, "learning_rate": 1.7605210420841682e-05, "loss": 0.012, "step": 3245 }, { "epoch": 12.984, 
"grad_norm": 0.4465670585632324, "learning_rate": 1.7595190380761523e-05, "loss": 0.0125, "step": 3246 }, { "epoch": 12.988, "grad_norm": 0.9920943975448608, "learning_rate": 1.7585170340681365e-05, "loss": 0.014, "step": 3247 }, { "epoch": 12.992, "grad_norm": 0.26608768105506897, "learning_rate": 1.7575150300601202e-05, "loss": 0.0072, "step": 3248 }, { "epoch": 12.996, "grad_norm": 0.48054268956184387, "learning_rate": 1.7565130260521044e-05, "loss": 0.0135, "step": 3249 }, { "epoch": 13.0, "grad_norm": 0.7327283024787903, "learning_rate": 1.755511022044088e-05, "loss": 0.0247, "step": 3250 }, { "epoch": 13.004, "grad_norm": 0.27949610352516174, "learning_rate": 1.754509018036072e-05, "loss": 0.0092, "step": 3251 }, { "epoch": 13.008, "grad_norm": 0.5924502015113831, "learning_rate": 1.7535070140280564e-05, "loss": 0.0124, "step": 3252 }, { "epoch": 13.012, "grad_norm": 0.2856701612472534, "learning_rate": 1.7525050100200402e-05, "loss": 0.0088, "step": 3253 }, { "epoch": 13.016, "grad_norm": 0.6892845034599304, "learning_rate": 1.751503006012024e-05, "loss": 0.0101, "step": 3254 }, { "epoch": 13.02, "grad_norm": 0.28304752707481384, "learning_rate": 1.750501002004008e-05, "loss": 0.0093, "step": 3255 }, { "epoch": 13.024, "grad_norm": 0.44429293274879456, "learning_rate": 1.7494989979959922e-05, "loss": 0.0106, "step": 3256 }, { "epoch": 13.028, "grad_norm": 0.4208962917327881, "learning_rate": 1.748496993987976e-05, "loss": 0.011, "step": 3257 }, { "epoch": 13.032, "grad_norm": 0.24816329777240753, "learning_rate": 1.74749498997996e-05, "loss": 0.0091, "step": 3258 }, { "epoch": 13.036, "grad_norm": 0.3193182647228241, "learning_rate": 1.746492985971944e-05, "loss": 0.0109, "step": 3259 }, { "epoch": 13.04, "grad_norm": 0.8636166453361511, "learning_rate": 1.7454909819639277e-05, "loss": 0.0097, "step": 3260 }, { "epoch": 13.044, "grad_norm": 0.26683780550956726, "learning_rate": 1.744488977955912e-05, "loss": 0.0095, "step": 3261 }, { "epoch": 13.048, 
"grad_norm": 0.5071774125099182, "learning_rate": 1.743486973947896e-05, "loss": 0.0126, "step": 3262 }, { "epoch": 13.052, "grad_norm": 0.21876634657382965, "learning_rate": 1.7424849699398798e-05, "loss": 0.0094, "step": 3263 }, { "epoch": 13.056, "grad_norm": 0.18555937707424164, "learning_rate": 1.741482965931864e-05, "loss": 0.0059, "step": 3264 }, { "epoch": 13.06, "grad_norm": 0.2228597104549408, "learning_rate": 1.7404809619238477e-05, "loss": 0.0094, "step": 3265 }, { "epoch": 13.064, "grad_norm": 0.21233904361724854, "learning_rate": 1.739478957915832e-05, "loss": 0.0082, "step": 3266 }, { "epoch": 13.068, "grad_norm": 0.2386617809534073, "learning_rate": 1.7384769539078156e-05, "loss": 0.0094, "step": 3267 }, { "epoch": 13.072, "grad_norm": 0.37460488080978394, "learning_rate": 1.7374749498997998e-05, "loss": 0.0108, "step": 3268 }, { "epoch": 13.076, "grad_norm": 0.2160538136959076, "learning_rate": 1.7364729458917835e-05, "loss": 0.0094, "step": 3269 }, { "epoch": 13.08, "grad_norm": 0.2701886296272278, "learning_rate": 1.7354709418837677e-05, "loss": 0.0065, "step": 3270 }, { "epoch": 13.084, "grad_norm": 0.3855418264865875, "learning_rate": 1.7344689378757518e-05, "loss": 0.0096, "step": 3271 }, { "epoch": 13.088, "grad_norm": 0.31014111638069153, "learning_rate": 1.7334669338677356e-05, "loss": 0.0095, "step": 3272 }, { "epoch": 13.092, "grad_norm": 0.24090217053890228, "learning_rate": 1.7324649298597194e-05, "loss": 0.0095, "step": 3273 }, { "epoch": 13.096, "grad_norm": 0.2191203087568283, "learning_rate": 1.7314629258517035e-05, "loss": 0.0093, "step": 3274 }, { "epoch": 13.1, "grad_norm": 0.4989674389362335, "learning_rate": 1.7304609218436873e-05, "loss": 0.0121, "step": 3275 }, { "epoch": 13.104, "grad_norm": 0.3217163383960724, "learning_rate": 1.7294589178356714e-05, "loss": 0.011, "step": 3276 }, { "epoch": 13.108, "grad_norm": 0.3589065968990326, "learning_rate": 1.7284569138276556e-05, "loss": 0.0101, "step": 3277 }, { "epoch": 13.112, 
"grad_norm": 0.3767361640930176, "learning_rate": 1.7274549098196393e-05, "loss": 0.0098, "step": 3278 }, { "epoch": 13.116, "grad_norm": 0.3067415952682495, "learning_rate": 1.726452905811623e-05, "loss": 0.0095, "step": 3279 }, { "epoch": 13.12, "grad_norm": 0.32393768429756165, "learning_rate": 1.7254509018036073e-05, "loss": 0.0106, "step": 3280 }, { "epoch": 13.124, "grad_norm": 0.579896867275238, "learning_rate": 1.7244488977955914e-05, "loss": 0.0111, "step": 3281 }, { "epoch": 13.128, "grad_norm": 0.16686947643756866, "learning_rate": 1.7234468937875752e-05, "loss": 0.006, "step": 3282 }, { "epoch": 13.132, "grad_norm": 0.44065243005752563, "learning_rate": 1.7224448897795593e-05, "loss": 0.0106, "step": 3283 }, { "epoch": 13.136, "grad_norm": 0.3439742922782898, "learning_rate": 1.721442885771543e-05, "loss": 0.0118, "step": 3284 }, { "epoch": 13.14, "grad_norm": 0.35956379771232605, "learning_rate": 1.720440881763527e-05, "loss": 0.0101, "step": 3285 }, { "epoch": 13.144, "grad_norm": 0.33086642622947693, "learning_rate": 1.7194388777555114e-05, "loss": 0.0103, "step": 3286 }, { "epoch": 13.148, "grad_norm": 0.3616533577442169, "learning_rate": 1.718436873747495e-05, "loss": 0.0094, "step": 3287 }, { "epoch": 13.152, "grad_norm": 0.3333515524864197, "learning_rate": 1.717434869739479e-05, "loss": 0.0093, "step": 3288 }, { "epoch": 13.156, "grad_norm": 0.3492039144039154, "learning_rate": 1.716432865731463e-05, "loss": 0.011, "step": 3289 }, { "epoch": 13.16, "grad_norm": 0.29600805044174194, "learning_rate": 1.715430861723447e-05, "loss": 0.0117, "step": 3290 }, { "epoch": 13.164, "grad_norm": 0.3409759998321533, "learning_rate": 1.714428857715431e-05, "loss": 0.01, "step": 3291 }, { "epoch": 13.168, "grad_norm": 0.237446591258049, "learning_rate": 1.713426853707415e-05, "loss": 0.0088, "step": 3292 }, { "epoch": 13.172, "grad_norm": 0.3831733763217926, "learning_rate": 1.712424849699399e-05, "loss": 0.0099, "step": 3293 }, { "epoch": 13.176, "grad_norm": 
0.3086282014846802, "learning_rate": 1.7114228456913827e-05, "loss": 0.0101, "step": 3294 }, { "epoch": 13.18, "grad_norm": 0.2323417216539383, "learning_rate": 1.7104208416833668e-05, "loss": 0.01, "step": 3295 }, { "epoch": 13.184, "grad_norm": 0.2264842838048935, "learning_rate": 1.709418837675351e-05, "loss": 0.0089, "step": 3296 }, { "epoch": 13.188, "grad_norm": 0.18145760893821716, "learning_rate": 1.7084168336673347e-05, "loss": 0.0087, "step": 3297 }, { "epoch": 13.192, "grad_norm": 0.21976445615291595, "learning_rate": 1.707414829659319e-05, "loss": 0.0085, "step": 3298 }, { "epoch": 13.196, "grad_norm": 0.2721193730831146, "learning_rate": 1.7064128256513026e-05, "loss": 0.0092, "step": 3299 }, { "epoch": 13.2, "grad_norm": 0.2802695035934448, "learning_rate": 1.7054108216432864e-05, "loss": 0.0104, "step": 3300 }, { "epoch": 13.204, "grad_norm": 0.48233532905578613, "learning_rate": 1.7044088176352706e-05, "loss": 0.0097, "step": 3301 }, { "epoch": 13.208, "grad_norm": 0.2295309603214264, "learning_rate": 1.7034068136272547e-05, "loss": 0.0094, "step": 3302 }, { "epoch": 13.212, "grad_norm": 0.32852253317832947, "learning_rate": 1.7024048096192385e-05, "loss": 0.0102, "step": 3303 }, { "epoch": 13.216, "grad_norm": 0.30892303586006165, "learning_rate": 1.7014028056112226e-05, "loss": 0.01, "step": 3304 }, { "epoch": 13.22, "grad_norm": 0.46433284878730774, "learning_rate": 1.7004008016032064e-05, "loss": 0.0097, "step": 3305 }, { "epoch": 13.224, "grad_norm": 0.2719483971595764, "learning_rate": 1.6993987975951905e-05, "loss": 0.0094, "step": 3306 }, { "epoch": 13.228, "grad_norm": 0.1855124831199646, "learning_rate": 1.6983967935871743e-05, "loss": 0.0085, "step": 3307 }, { "epoch": 13.232, "grad_norm": 0.3346885144710541, "learning_rate": 1.6973947895791584e-05, "loss": 0.0109, "step": 3308 }, { "epoch": 13.236, "grad_norm": 0.4800672233104706, "learning_rate": 1.6963927855711422e-05, "loss": 0.0123, "step": 3309 }, { "epoch": 13.24, "grad_norm": 
0.3444778025150299, "learning_rate": 1.695390781563126e-05, "loss": 0.0103, "step": 3310 }, { "epoch": 13.244, "grad_norm": 0.2996631860733032, "learning_rate": 1.6943887775551105e-05, "loss": 0.0092, "step": 3311 }, { "epoch": 13.248, "grad_norm": 0.2334967851638794, "learning_rate": 1.6933867735470943e-05, "loss": 0.0093, "step": 3312 }, { "epoch": 13.252, "grad_norm": 0.20295573770999908, "learning_rate": 1.692384769539078e-05, "loss": 0.0086, "step": 3313 }, { "epoch": 13.256, "grad_norm": 0.242875874042511, "learning_rate": 1.6913827655310622e-05, "loss": 0.0092, "step": 3314 }, { "epoch": 13.26, "grad_norm": 0.41469478607177734, "learning_rate": 1.6903807615230463e-05, "loss": 0.0112, "step": 3315 }, { "epoch": 13.264, "grad_norm": 0.2873092591762543, "learning_rate": 1.68937875751503e-05, "loss": 0.01, "step": 3316 }, { "epoch": 13.268, "grad_norm": 0.16174815595149994, "learning_rate": 1.6883767535070142e-05, "loss": 0.005, "step": 3317 }, { "epoch": 13.272, "grad_norm": 0.5048874020576477, "learning_rate": 1.687374749498998e-05, "loss": 0.012, "step": 3318 }, { "epoch": 13.276, "grad_norm": 0.3545554578304291, "learning_rate": 1.6863727454909818e-05, "loss": 0.0095, "step": 3319 }, { "epoch": 13.28, "grad_norm": 0.40919992327690125, "learning_rate": 1.6853707414829663e-05, "loss": 0.0111, "step": 3320 }, { "epoch": 13.284, "grad_norm": 0.2165553867816925, "learning_rate": 1.68436873747495e-05, "loss": 0.0084, "step": 3321 }, { "epoch": 13.288, "grad_norm": 0.369000107049942, "learning_rate": 1.683366733466934e-05, "loss": 0.0117, "step": 3322 }, { "epoch": 13.292, "grad_norm": 0.3298990726470947, "learning_rate": 1.682364729458918e-05, "loss": 0.0087, "step": 3323 }, { "epoch": 13.296, "grad_norm": 0.37005317211151123, "learning_rate": 1.6813627254509018e-05, "loss": 0.0103, "step": 3324 }, { "epoch": 13.3, "grad_norm": 0.5654655694961548, "learning_rate": 1.680360721442886e-05, "loss": 0.0102, "step": 3325 }, { "epoch": 13.304, "grad_norm": 
0.7227338552474976, "learning_rate": 1.67935871743487e-05, "loss": 0.0166, "step": 3326 }, { "epoch": 13.308, "grad_norm": 0.443639874458313, "learning_rate": 1.678356713426854e-05, "loss": 0.0111, "step": 3327 }, { "epoch": 13.312, "grad_norm": 0.25988972187042236, "learning_rate": 1.6773547094188376e-05, "loss": 0.0093, "step": 3328 }, { "epoch": 13.316, "grad_norm": 0.3140908181667328, "learning_rate": 1.6763527054108218e-05, "loss": 0.0094, "step": 3329 }, { "epoch": 13.32, "grad_norm": 0.23378266394138336, "learning_rate": 1.675350701402806e-05, "loss": 0.0096, "step": 3330 }, { "epoch": 13.324, "grad_norm": 0.31349173188209534, "learning_rate": 1.6743486973947897e-05, "loss": 0.0071, "step": 3331 }, { "epoch": 13.328, "grad_norm": 0.3660728633403778, "learning_rate": 1.6733466933867738e-05, "loss": 0.0111, "step": 3332 }, { "epoch": 13.332, "grad_norm": 0.4420227110385895, "learning_rate": 1.6723446893787576e-05, "loss": 0.0137, "step": 3333 }, { "epoch": 13.336, "grad_norm": 0.2265954315662384, "learning_rate": 1.6713426853707414e-05, "loss": 0.0093, "step": 3334 }, { "epoch": 13.34, "grad_norm": 0.31898754835128784, "learning_rate": 1.6703406813627255e-05, "loss": 0.0103, "step": 3335 }, { "epoch": 13.344, "grad_norm": 0.34343066811561584, "learning_rate": 1.6693386773547096e-05, "loss": 0.0112, "step": 3336 }, { "epoch": 13.348, "grad_norm": 0.27078312635421753, "learning_rate": 1.6683366733466934e-05, "loss": 0.01, "step": 3337 }, { "epoch": 13.352, "grad_norm": 0.2199242115020752, "learning_rate": 1.6673346693386772e-05, "loss": 0.011, "step": 3338 }, { "epoch": 13.356, "grad_norm": 0.25304946303367615, "learning_rate": 1.6663326653306613e-05, "loss": 0.0101, "step": 3339 }, { "epoch": 13.36, "grad_norm": 0.3057776689529419, "learning_rate": 1.6653306613226455e-05, "loss": 0.0099, "step": 3340 }, { "epoch": 13.364, "grad_norm": 0.17325368523597717, "learning_rate": 1.6643286573146293e-05, "loss": 0.0087, "step": 3341 }, { "epoch": 13.368, "grad_norm": 
0.2971237599849701, "learning_rate": 1.6633266533066134e-05, "loss": 0.0109, "step": 3342 }, { "epoch": 13.372, "grad_norm": 0.2874193787574768, "learning_rate": 1.6623246492985972e-05, "loss": 0.0099, "step": 3343 }, { "epoch": 13.376, "grad_norm": 0.3510386347770691, "learning_rate": 1.661322645290581e-05, "loss": 0.0119, "step": 3344 }, { "epoch": 13.38, "grad_norm": 0.4247153401374817, "learning_rate": 1.6603206412825654e-05, "loss": 0.0138, "step": 3345 }, { "epoch": 13.384, "grad_norm": 0.2908627986907959, "learning_rate": 1.6593186372745492e-05, "loss": 0.0103, "step": 3346 }, { "epoch": 13.388, "grad_norm": 0.2727614939212799, "learning_rate": 1.658316633266533e-05, "loss": 0.0101, "step": 3347 }, { "epoch": 13.392, "grad_norm": 0.3280937969684601, "learning_rate": 1.657314629258517e-05, "loss": 0.0103, "step": 3348 }, { "epoch": 13.396, "grad_norm": 0.2449714094400406, "learning_rate": 1.656312625250501e-05, "loss": 0.0104, "step": 3349 }, { "epoch": 13.4, "grad_norm": 0.24496877193450928, "learning_rate": 1.655310621242485e-05, "loss": 0.0099, "step": 3350 }, { "epoch": 13.404, "grad_norm": 0.4488162398338318, "learning_rate": 1.6543086172344692e-05, "loss": 0.0127, "step": 3351 }, { "epoch": 13.408, "grad_norm": 0.3072626292705536, "learning_rate": 1.653306613226453e-05, "loss": 0.0105, "step": 3352 }, { "epoch": 13.412, "grad_norm": 0.27603739500045776, "learning_rate": 1.6523046092184368e-05, "loss": 0.0103, "step": 3353 }, { "epoch": 13.416, "grad_norm": 0.3227679431438446, "learning_rate": 1.651302605210421e-05, "loss": 0.0104, "step": 3354 }, { "epoch": 13.42, "grad_norm": 0.23910482227802277, "learning_rate": 1.650300601202405e-05, "loss": 0.0101, "step": 3355 }, { "epoch": 13.424, "grad_norm": 0.4852541387081146, "learning_rate": 1.6492985971943888e-05, "loss": 0.0118, "step": 3356 }, { "epoch": 13.428, "grad_norm": 0.3126225471496582, "learning_rate": 1.648296593186373e-05, "loss": 0.0103, "step": 3357 }, { "epoch": 13.432, "grad_norm": 
0.2821721136569977, "learning_rate": 1.6472945891783567e-05, "loss": 0.0098, "step": 3358 }, { "epoch": 13.436, "grad_norm": 0.3140818774700165, "learning_rate": 1.6462925851703405e-05, "loss": 0.0103, "step": 3359 }, { "epoch": 13.44, "grad_norm": 0.3383825719356537, "learning_rate": 1.6452905811623246e-05, "loss": 0.0112, "step": 3360 }, { "epoch": 13.444, "grad_norm": 0.3161516487598419, "learning_rate": 1.6442885771543088e-05, "loss": 0.0109, "step": 3361 }, { "epoch": 13.448, "grad_norm": 0.24821779131889343, "learning_rate": 1.6432865731462926e-05, "loss": 0.0093, "step": 3362 }, { "epoch": 13.452, "grad_norm": 0.3279327154159546, "learning_rate": 1.6422845691382767e-05, "loss": 0.0098, "step": 3363 }, { "epoch": 13.456, "grad_norm": 0.3004770874977112, "learning_rate": 1.6412825651302605e-05, "loss": 0.0101, "step": 3364 }, { "epoch": 13.46, "grad_norm": 0.26334452629089355, "learning_rate": 1.6402805611222446e-05, "loss": 0.0094, "step": 3365 }, { "epoch": 13.464, "grad_norm": 0.2865053713321686, "learning_rate": 1.6392785571142284e-05, "loss": 0.0107, "step": 3366 }, { "epoch": 13.468, "grad_norm": 0.3839758038520813, "learning_rate": 1.6382765531062125e-05, "loss": 0.0103, "step": 3367 }, { "epoch": 13.472, "grad_norm": 0.48596179485321045, "learning_rate": 1.6372745490981963e-05, "loss": 0.0097, "step": 3368 }, { "epoch": 13.475999999999999, "grad_norm": 0.22549477219581604, "learning_rate": 1.6362725450901804e-05, "loss": 0.0085, "step": 3369 }, { "epoch": 13.48, "grad_norm": 0.3293830156326294, "learning_rate": 1.6352705410821646e-05, "loss": 0.0096, "step": 3370 }, { "epoch": 13.484, "grad_norm": 0.2493169605731964, "learning_rate": 1.6342685370741484e-05, "loss": 0.0093, "step": 3371 }, { "epoch": 13.488, "grad_norm": 0.4540705978870392, "learning_rate": 1.633266533066132e-05, "loss": 0.0115, "step": 3372 }, { "epoch": 13.492, "grad_norm": 0.2564983665943146, "learning_rate": 1.6322645290581163e-05, "loss": 0.0096, "step": 3373 }, { "epoch": 13.496, 
"grad_norm": 0.26169297099113464, "learning_rate": 1.6312625250501004e-05, "loss": 0.0099, "step": 3374 }, { "epoch": 13.5, "grad_norm": 0.31739065051078796, "learning_rate": 1.6302605210420842e-05, "loss": 0.0097, "step": 3375 }, { "epoch": 13.504, "grad_norm": 0.2799374759197235, "learning_rate": 1.6292585170340683e-05, "loss": 0.0099, "step": 3376 }, { "epoch": 13.508, "grad_norm": 0.22275328636169434, "learning_rate": 1.628256513026052e-05, "loss": 0.0095, "step": 3377 }, { "epoch": 13.512, "grad_norm": 0.37261059880256653, "learning_rate": 1.627254509018036e-05, "loss": 0.0096, "step": 3378 }, { "epoch": 13.516, "grad_norm": 0.23295381665229797, "learning_rate": 1.6262525050100204e-05, "loss": 0.0089, "step": 3379 }, { "epoch": 13.52, "grad_norm": 0.23815542459487915, "learning_rate": 1.625250501002004e-05, "loss": 0.0101, "step": 3380 }, { "epoch": 13.524000000000001, "grad_norm": 0.27734702825546265, "learning_rate": 1.624248496993988e-05, "loss": 0.01, "step": 3381 }, { "epoch": 13.528, "grad_norm": 0.2713322937488556, "learning_rate": 1.623246492985972e-05, "loss": 0.0092, "step": 3382 }, { "epoch": 13.532, "grad_norm": 0.4412878751754761, "learning_rate": 1.622244488977956e-05, "loss": 0.0113, "step": 3383 }, { "epoch": 13.536, "grad_norm": 0.3151780068874359, "learning_rate": 1.62124248496994e-05, "loss": 0.0106, "step": 3384 }, { "epoch": 13.54, "grad_norm": 0.3254270851612091, "learning_rate": 1.620240480961924e-05, "loss": 0.0102, "step": 3385 }, { "epoch": 13.544, "grad_norm": 0.3681427240371704, "learning_rate": 1.619238476953908e-05, "loss": 0.0116, "step": 3386 }, { "epoch": 13.548, "grad_norm": 0.3805301785469055, "learning_rate": 1.6182364729458917e-05, "loss": 0.009, "step": 3387 }, { "epoch": 13.552, "grad_norm": 0.2876235246658325, "learning_rate": 1.617234468937876e-05, "loss": 0.0103, "step": 3388 }, { "epoch": 13.556000000000001, "grad_norm": 0.3005918264389038, "learning_rate": 1.61623246492986e-05, "loss": 0.0099, "step": 3389 }, { 
"epoch": 13.56, "grad_norm": 0.382587730884552, "learning_rate": 1.6152304609218438e-05, "loss": 0.0108, "step": 3390 }, { "epoch": 13.564, "grad_norm": 0.4565657377243042, "learning_rate": 1.614228456913828e-05, "loss": 0.01, "step": 3391 }, { "epoch": 13.568, "grad_norm": 0.14987795054912567, "learning_rate": 1.6132264529058117e-05, "loss": 0.0059, "step": 3392 }, { "epoch": 13.572, "grad_norm": 0.21870999038219452, "learning_rate": 1.6122244488977955e-05, "loss": 0.009, "step": 3393 }, { "epoch": 13.576, "grad_norm": 0.5497450232505798, "learning_rate": 1.6112224448897796e-05, "loss": 0.0117, "step": 3394 }, { "epoch": 13.58, "grad_norm": 0.35143905878067017, "learning_rate": 1.6102204408817637e-05, "loss": 0.0111, "step": 3395 }, { "epoch": 13.584, "grad_norm": 0.27477556467056274, "learning_rate": 1.6092184368737475e-05, "loss": 0.0105, "step": 3396 }, { "epoch": 13.588, "grad_norm": 0.29407718777656555, "learning_rate": 1.6082164328657316e-05, "loss": 0.0095, "step": 3397 }, { "epoch": 13.592, "grad_norm": 0.1857583224773407, "learning_rate": 1.6072144288577154e-05, "loss": 0.0092, "step": 3398 }, { "epoch": 13.596, "grad_norm": 0.4153088927268982, "learning_rate": 1.6062124248496996e-05, "loss": 0.0096, "step": 3399 }, { "epoch": 13.6, "grad_norm": 0.28967979550361633, "learning_rate": 1.6052104208416833e-05, "loss": 0.0107, "step": 3400 }, { "epoch": 13.604, "grad_norm": 0.24863949418067932, "learning_rate": 1.6042084168336675e-05, "loss": 0.0103, "step": 3401 }, { "epoch": 13.608, "grad_norm": 0.3102586269378662, "learning_rate": 1.6032064128256513e-05, "loss": 0.0103, "step": 3402 }, { "epoch": 13.612, "grad_norm": 0.361162394285202, "learning_rate": 1.6022044088176354e-05, "loss": 0.0125, "step": 3403 }, { "epoch": 13.616, "grad_norm": 0.40168410539627075, "learning_rate": 1.6012024048096195e-05, "loss": 0.0114, "step": 3404 }, { "epoch": 13.62, "grad_norm": 0.29243239760398865, "learning_rate": 1.6002004008016033e-05, "loss": 0.0055, "step": 3405 }, { 
"epoch": 13.624, "grad_norm": 0.26829758286476135, "learning_rate": 1.599198396793587e-05, "loss": 0.0112, "step": 3406 }, { "epoch": 13.628, "grad_norm": 0.40442490577697754, "learning_rate": 1.5981963927855712e-05, "loss": 0.0115, "step": 3407 }, { "epoch": 13.632, "grad_norm": 0.31696653366088867, "learning_rate": 1.597194388777555e-05, "loss": 0.0109, "step": 3408 }, { "epoch": 13.636, "grad_norm": 0.44148167967796326, "learning_rate": 1.596192384769539e-05, "loss": 0.0107, "step": 3409 }, { "epoch": 13.64, "grad_norm": 0.2826448082923889, "learning_rate": 1.5951903807615233e-05, "loss": 0.0107, "step": 3410 }, { "epoch": 13.644, "grad_norm": 0.524307906627655, "learning_rate": 1.594188376753507e-05, "loss": 0.0111, "step": 3411 }, { "epoch": 13.648, "grad_norm": 0.30534476041793823, "learning_rate": 1.593186372745491e-05, "loss": 0.0106, "step": 3412 }, { "epoch": 13.652, "grad_norm": 0.3145725727081299, "learning_rate": 1.592184368737475e-05, "loss": 0.0098, "step": 3413 }, { "epoch": 13.656, "grad_norm": 0.21220719814300537, "learning_rate": 1.591182364729459e-05, "loss": 0.0111, "step": 3414 }, { "epoch": 13.66, "grad_norm": 0.37001675367355347, "learning_rate": 1.590180360721443e-05, "loss": 0.0102, "step": 3415 }, { "epoch": 13.664, "grad_norm": 0.2675144672393799, "learning_rate": 1.589178356713427e-05, "loss": 0.0104, "step": 3416 }, { "epoch": 13.668, "grad_norm": 0.4971553683280945, "learning_rate": 1.5881763527054108e-05, "loss": 0.0144, "step": 3417 }, { "epoch": 13.672, "grad_norm": 0.32960245013237, "learning_rate": 1.5871743486973946e-05, "loss": 0.0096, "step": 3418 }, { "epoch": 13.676, "grad_norm": 0.1954725980758667, "learning_rate": 1.586172344689379e-05, "loss": 0.0084, "step": 3419 }, { "epoch": 13.68, "grad_norm": 0.338614821434021, "learning_rate": 1.585170340681363e-05, "loss": 0.012, "step": 3420 }, { "epoch": 13.684, "grad_norm": 0.27286550402641296, "learning_rate": 1.5841683366733466e-05, "loss": 0.0105, "step": 3421 }, { "epoch": 
13.688, "grad_norm": 0.3043253421783447, "learning_rate": 1.5831663326653308e-05, "loss": 0.0111, "step": 3422 }, { "epoch": 13.692, "grad_norm": 0.608754575252533, "learning_rate": 1.582164328657315e-05, "loss": 0.0122, "step": 3423 }, { "epoch": 13.696, "grad_norm": 0.44034209847450256, "learning_rate": 1.5811623246492987e-05, "loss": 0.0103, "step": 3424 }, { "epoch": 13.7, "grad_norm": 0.3596172630786896, "learning_rate": 1.5801603206412828e-05, "loss": 0.0101, "step": 3425 }, { "epoch": 13.704, "grad_norm": 0.1947639137506485, "learning_rate": 1.5791583166332666e-05, "loss": 0.0086, "step": 3426 }, { "epoch": 13.708, "grad_norm": 0.595519483089447, "learning_rate": 1.5781563126252504e-05, "loss": 0.0113, "step": 3427 }, { "epoch": 13.712, "grad_norm": 0.30522915720939636, "learning_rate": 1.5771543086172345e-05, "loss": 0.011, "step": 3428 }, { "epoch": 13.716, "grad_norm": 0.2296917587518692, "learning_rate": 1.5761523046092187e-05, "loss": 0.01, "step": 3429 }, { "epoch": 13.72, "grad_norm": 0.5215935707092285, "learning_rate": 1.5751503006012024e-05, "loss": 0.0099, "step": 3430 }, { "epoch": 13.724, "grad_norm": 0.22468657791614532, "learning_rate": 1.5741482965931866e-05, "loss": 0.0108, "step": 3431 }, { "epoch": 13.728, "grad_norm": 0.25664982199668884, "learning_rate": 1.5731462925851704e-05, "loss": 0.0109, "step": 3432 }, { "epoch": 13.732, "grad_norm": 0.2053966224193573, "learning_rate": 1.5721442885771545e-05, "loss": 0.0096, "step": 3433 }, { "epoch": 13.736, "grad_norm": 0.2271842062473297, "learning_rate": 1.5711422845691383e-05, "loss": 0.0092, "step": 3434 }, { "epoch": 13.74, "grad_norm": 0.41068193316459656, "learning_rate": 1.5701402805611224e-05, "loss": 0.0119, "step": 3435 }, { "epoch": 13.744, "grad_norm": 0.2704119086265564, "learning_rate": 1.5691382765531062e-05, "loss": 0.0096, "step": 3436 }, { "epoch": 13.748, "grad_norm": 0.37335240840911865, "learning_rate": 1.56813627254509e-05, "loss": 0.0106, "step": 3437 }, { "epoch": 
13.752, "grad_norm": 0.6146287322044373, "learning_rate": 1.5671342685370745e-05, "loss": 0.0117, "step": 3438 }, { "epoch": 13.756, "grad_norm": 0.22417746484279633, "learning_rate": 1.5661322645290582e-05, "loss": 0.0061, "step": 3439 }, { "epoch": 13.76, "grad_norm": 0.38575106859207153, "learning_rate": 1.565130260521042e-05, "loss": 0.0108, "step": 3440 }, { "epoch": 13.764, "grad_norm": 0.2831723690032959, "learning_rate": 1.564128256513026e-05, "loss": 0.0103, "step": 3441 }, { "epoch": 13.768, "grad_norm": 0.39680731296539307, "learning_rate": 1.56312625250501e-05, "loss": 0.0117, "step": 3442 }, { "epoch": 13.772, "grad_norm": 0.5540252327919006, "learning_rate": 1.562124248496994e-05, "loss": 0.0123, "step": 3443 }, { "epoch": 13.776, "grad_norm": 0.34284451603889465, "learning_rate": 1.5611222444889782e-05, "loss": 0.011, "step": 3444 }, { "epoch": 13.78, "grad_norm": 0.23013213276863098, "learning_rate": 1.560120240480962e-05, "loss": 0.011, "step": 3445 }, { "epoch": 13.784, "grad_norm": 0.583663284778595, "learning_rate": 1.5591182364729458e-05, "loss": 0.0121, "step": 3446 }, { "epoch": 13.788, "grad_norm": 0.2305268794298172, "learning_rate": 1.55811623246493e-05, "loss": 0.0098, "step": 3447 }, { "epoch": 13.792, "grad_norm": 0.2994031608104706, "learning_rate": 1.557114228456914e-05, "loss": 0.0102, "step": 3448 }, { "epoch": 13.796, "grad_norm": 0.26790478825569153, "learning_rate": 1.556112224448898e-05, "loss": 0.009, "step": 3449 }, { "epoch": 13.8, "grad_norm": 0.2465956211090088, "learning_rate": 1.555110220440882e-05, "loss": 0.0106, "step": 3450 }, { "epoch": 13.804, "grad_norm": 0.40356552600860596, "learning_rate": 1.5541082164328658e-05, "loss": 0.0105, "step": 3451 }, { "epoch": 13.808, "grad_norm": 0.2311774492263794, "learning_rate": 1.5531062124248495e-05, "loss": 0.0094, "step": 3452 }, { "epoch": 13.812, "grad_norm": 0.32201912999153137, "learning_rate": 1.552104208416834e-05, "loss": 0.012, "step": 3453 }, { "epoch": 13.816, 
"grad_norm": 0.35122230648994446, "learning_rate": 1.5511022044088178e-05, "loss": 0.0101, "step": 3454 }, { "epoch": 13.82, "grad_norm": 0.3138692378997803, "learning_rate": 1.5501002004008016e-05, "loss": 0.0109, "step": 3455 }, { "epoch": 13.824, "grad_norm": 0.3369031846523285, "learning_rate": 1.5490981963927857e-05, "loss": 0.0101, "step": 3456 }, { "epoch": 13.828, "grad_norm": 0.31963375210762024, "learning_rate": 1.5480961923847695e-05, "loss": 0.011, "step": 3457 }, { "epoch": 13.832, "grad_norm": 0.42217960953712463, "learning_rate": 1.5470941883767536e-05, "loss": 0.0111, "step": 3458 }, { "epoch": 13.836, "grad_norm": 0.39974090456962585, "learning_rate": 1.5460921843687378e-05, "loss": 0.0114, "step": 3459 }, { "epoch": 13.84, "grad_norm": 0.540280818939209, "learning_rate": 1.5450901803607216e-05, "loss": 0.0129, "step": 3460 }, { "epoch": 13.844, "grad_norm": 0.2044568657875061, "learning_rate": 1.5440881763527053e-05, "loss": 0.0095, "step": 3461 }, { "epoch": 13.848, "grad_norm": 0.376753032207489, "learning_rate": 1.5430861723446895e-05, "loss": 0.0115, "step": 3462 }, { "epoch": 13.852, "grad_norm": 0.40467560291290283, "learning_rate": 1.5420841683366736e-05, "loss": 0.0122, "step": 3463 }, { "epoch": 13.856, "grad_norm": 0.2836839556694031, "learning_rate": 1.5410821643286574e-05, "loss": 0.0098, "step": 3464 }, { "epoch": 13.86, "grad_norm": 0.5280314683914185, "learning_rate": 1.5400801603206412e-05, "loss": 0.0105, "step": 3465 }, { "epoch": 13.864, "grad_norm": 0.24809741973876953, "learning_rate": 1.5390781563126253e-05, "loss": 0.0101, "step": 3466 }, { "epoch": 13.868, "grad_norm": 0.3241751790046692, "learning_rate": 1.538076152304609e-05, "loss": 0.0111, "step": 3467 }, { "epoch": 13.872, "grad_norm": 0.2702431380748749, "learning_rate": 1.5370741482965932e-05, "loss": 0.0112, "step": 3468 }, { "epoch": 13.876, "grad_norm": 0.4810154139995575, "learning_rate": 1.5360721442885773e-05, "loss": 0.0118, "step": 3469 }, { "epoch": 13.88, 
"grad_norm": 0.3676247000694275, "learning_rate": 1.535070140280561e-05, "loss": 0.0104, "step": 3470 }, { "epoch": 13.884, "grad_norm": 0.27984535694122314, "learning_rate": 1.534068136272545e-05, "loss": 0.0099, "step": 3471 }, { "epoch": 13.888, "grad_norm": 0.20753216743469238, "learning_rate": 1.533066132264529e-05, "loss": 0.01, "step": 3472 }, { "epoch": 13.892, "grad_norm": 0.22736266255378723, "learning_rate": 1.5320641282565132e-05, "loss": 0.0094, "step": 3473 }, { "epoch": 13.896, "grad_norm": 0.2514065206050873, "learning_rate": 1.531062124248497e-05, "loss": 0.0095, "step": 3474 }, { "epoch": 13.9, "grad_norm": 0.39425936341285706, "learning_rate": 1.530060120240481e-05, "loss": 0.011, "step": 3475 }, { "epoch": 13.904, "grad_norm": 0.3547126054763794, "learning_rate": 1.529058116232465e-05, "loss": 0.01, "step": 3476 }, { "epoch": 13.908, "grad_norm": 0.20616790652275085, "learning_rate": 1.5280561122244487e-05, "loss": 0.0067, "step": 3477 }, { "epoch": 13.912, "grad_norm": 0.2141866683959961, "learning_rate": 1.527054108216433e-05, "loss": 0.0094, "step": 3478 }, { "epoch": 13.916, "grad_norm": 0.3642483055591583, "learning_rate": 1.526052104208417e-05, "loss": 0.01, "step": 3479 }, { "epoch": 13.92, "grad_norm": 0.24967001378536224, "learning_rate": 1.5250501002004009e-05, "loss": 0.0115, "step": 3480 }, { "epoch": 13.924, "grad_norm": 0.31511619687080383, "learning_rate": 1.5240480961923847e-05, "loss": 0.009, "step": 3481 }, { "epoch": 13.928, "grad_norm": 0.5050347447395325, "learning_rate": 1.523046092184369e-05, "loss": 0.0128, "step": 3482 }, { "epoch": 13.932, "grad_norm": 0.3271845579147339, "learning_rate": 1.5220440881763528e-05, "loss": 0.0102, "step": 3483 }, { "epoch": 13.936, "grad_norm": 0.49033862352371216, "learning_rate": 1.5210420841683367e-05, "loss": 0.0095, "step": 3484 }, { "epoch": 13.94, "grad_norm": 0.34133315086364746, "learning_rate": 1.5200400801603207e-05, "loss": 0.0106, "step": 3485 }, { "epoch": 13.943999999999999, 
"grad_norm": 0.29839426279067993, "learning_rate": 1.5190380761523047e-05, "loss": 0.0096, "step": 3486 }, { "epoch": 13.948, "grad_norm": 0.21854914724826813, "learning_rate": 1.5180360721442888e-05, "loss": 0.0096, "step": 3487 }, { "epoch": 13.952, "grad_norm": 0.41724851727485657, "learning_rate": 1.5170340681362727e-05, "loss": 0.012, "step": 3488 }, { "epoch": 13.956, "grad_norm": 0.09979083389043808, "learning_rate": 1.5160320641282565e-05, "loss": 0.0041, "step": 3489 }, { "epoch": 13.96, "grad_norm": 0.1923419088125229, "learning_rate": 1.5150300601202405e-05, "loss": 0.0062, "step": 3490 }, { "epoch": 13.964, "grad_norm": 0.2782224416732788, "learning_rate": 1.5140280561122244e-05, "loss": 0.0104, "step": 3491 }, { "epoch": 13.968, "grad_norm": 0.3307152986526489, "learning_rate": 1.5130260521042086e-05, "loss": 0.0108, "step": 3492 }, { "epoch": 13.972, "grad_norm": 0.247494637966156, "learning_rate": 1.5120240480961925e-05, "loss": 0.006, "step": 3493 }, { "epoch": 13.975999999999999, "grad_norm": 0.27174144983291626, "learning_rate": 1.5110220440881765e-05, "loss": 0.0081, "step": 3494 }, { "epoch": 13.98, "grad_norm": 0.26648882031440735, "learning_rate": 1.5100200400801603e-05, "loss": 0.0101, "step": 3495 }, { "epoch": 13.984, "grad_norm": 0.2658537030220032, "learning_rate": 1.5090180360721442e-05, "loss": 0.0098, "step": 3496 }, { "epoch": 13.988, "grad_norm": 0.2758139371871948, "learning_rate": 1.5080160320641284e-05, "loss": 0.0095, "step": 3497 }, { "epoch": 13.992, "grad_norm": 0.3168604075908661, "learning_rate": 1.5070140280561123e-05, "loss": 0.0106, "step": 3498 }, { "epoch": 13.996, "grad_norm": 0.3637627065181732, "learning_rate": 1.5060120240480963e-05, "loss": 0.012, "step": 3499 }, { "epoch": 14.0, "grad_norm": 0.3541504144668579, "learning_rate": 1.5050100200400802e-05, "loss": 0.0065, "step": 3500 }, { "epoch": 14.004, "grad_norm": 0.15899381041526794, "learning_rate": 1.504008016032064e-05, "loss": 0.008, "step": 3501 }, { 
"epoch": 14.008, "grad_norm": 0.18751265108585358, "learning_rate": 1.5030060120240483e-05, "loss": 0.0079, "step": 3502 }, { "epoch": 14.012, "grad_norm": 0.1956561952829361, "learning_rate": 1.5020040080160321e-05, "loss": 0.0084, "step": 3503 }, { "epoch": 14.016, "grad_norm": 0.17938251793384552, "learning_rate": 1.501002004008016e-05, "loss": 0.0093, "step": 3504 }, { "epoch": 14.02, "grad_norm": 0.2153725028038025, "learning_rate": 1.5e-05, "loss": 0.0086, "step": 3505 }, { "epoch": 14.024, "grad_norm": 0.08399572223424911, "learning_rate": 1.498997995991984e-05, "loss": 0.0041, "step": 3506 }, { "epoch": 14.028, "grad_norm": 0.20708952844142914, "learning_rate": 1.4979959919839681e-05, "loss": 0.0078, "step": 3507 }, { "epoch": 14.032, "grad_norm": 0.18916714191436768, "learning_rate": 1.496993987975952e-05, "loss": 0.0089, "step": 3508 }, { "epoch": 14.036, "grad_norm": 0.14934112131595612, "learning_rate": 1.4959919839679359e-05, "loss": 0.0076, "step": 3509 }, { "epoch": 14.04, "grad_norm": 0.2596082389354706, "learning_rate": 1.4949899799599198e-05, "loss": 0.0092, "step": 3510 }, { "epoch": 14.044, "grad_norm": 0.16445767879486084, "learning_rate": 1.4939879759519038e-05, "loss": 0.0054, "step": 3511 }, { "epoch": 14.048, "grad_norm": 0.2873588800430298, "learning_rate": 1.492985971943888e-05, "loss": 0.0098, "step": 3512 }, { "epoch": 14.052, "grad_norm": 0.19016903638839722, "learning_rate": 1.4919839679358719e-05, "loss": 0.0082, "step": 3513 }, { "epoch": 14.056, "grad_norm": 0.1945473998785019, "learning_rate": 1.4909819639278558e-05, "loss": 0.0084, "step": 3514 }, { "epoch": 14.06, "grad_norm": 0.5994365215301514, "learning_rate": 1.4899799599198396e-05, "loss": 0.0127, "step": 3515 }, { "epoch": 14.064, "grad_norm": 0.2284667044878006, "learning_rate": 1.4889779559118236e-05, "loss": 0.0081, "step": 3516 }, { "epoch": 14.068, "grad_norm": 0.1868845373392105, "learning_rate": 1.4879759519038077e-05, "loss": 0.0087, "step": 3517 }, { "epoch": 
14.072, "grad_norm": 0.25140196084976196, "learning_rate": 1.4869739478957917e-05, "loss": 0.0087, "step": 3518 }, { "epoch": 14.076, "grad_norm": 0.30023810267448425, "learning_rate": 1.4859719438877756e-05, "loss": 0.0098, "step": 3519 }, { "epoch": 14.08, "grad_norm": 0.3479897379875183, "learning_rate": 1.4849699398797596e-05, "loss": 0.0087, "step": 3520 }, { "epoch": 14.084, "grad_norm": 0.33798903226852417, "learning_rate": 1.4839679358717434e-05, "loss": 0.0078, "step": 3521 }, { "epoch": 14.088, "grad_norm": 0.21001707017421722, "learning_rate": 1.4829659318637277e-05, "loss": 0.0056, "step": 3522 }, { "epoch": 14.092, "grad_norm": 0.3976731598377228, "learning_rate": 1.4819639278557115e-05, "loss": 0.0098, "step": 3523 }, { "epoch": 14.096, "grad_norm": 0.26884251832962036, "learning_rate": 1.4809619238476954e-05, "loss": 0.008, "step": 3524 }, { "epoch": 14.1, "grad_norm": 0.16603799164295197, "learning_rate": 1.4799599198396794e-05, "loss": 0.0081, "step": 3525 }, { "epoch": 14.104, "grad_norm": 0.25354495644569397, "learning_rate": 1.4789579158316633e-05, "loss": 0.0085, "step": 3526 }, { "epoch": 14.108, "grad_norm": 0.49945899844169617, "learning_rate": 1.4779559118236475e-05, "loss": 0.0098, "step": 3527 }, { "epoch": 14.112, "grad_norm": 0.15931391716003418, "learning_rate": 1.4769539078156314e-05, "loss": 0.0051, "step": 3528 }, { "epoch": 14.116, "grad_norm": 0.2964676022529602, "learning_rate": 1.4759519038076152e-05, "loss": 0.0096, "step": 3529 }, { "epoch": 14.12, "grad_norm": 0.2884165048599243, "learning_rate": 1.4749498997995992e-05, "loss": 0.008, "step": 3530 }, { "epoch": 14.124, "grad_norm": 1.5515947341918945, "learning_rate": 1.4739478957915831e-05, "loss": 0.0073, "step": 3531 }, { "epoch": 14.128, "grad_norm": 0.2911422550678253, "learning_rate": 1.4729458917835673e-05, "loss": 0.0107, "step": 3532 }, { "epoch": 14.132, "grad_norm": 0.23672667145729065, "learning_rate": 1.4719438877755512e-05, "loss": 0.009, "step": 3533 }, { 
"epoch": 14.136, "grad_norm": 0.4554106593132019, "learning_rate": 1.4709418837675352e-05, "loss": 0.0095, "step": 3534 }, { "epoch": 14.14, "grad_norm": 0.27175405621528625, "learning_rate": 1.469939879759519e-05, "loss": 0.0049, "step": 3535 }, { "epoch": 14.144, "grad_norm": 0.3456001281738281, "learning_rate": 1.4689378757515033e-05, "loss": 0.0107, "step": 3536 }, { "epoch": 14.148, "grad_norm": 0.2605019509792328, "learning_rate": 1.467935871743487e-05, "loss": 0.0064, "step": 3537 }, { "epoch": 14.152, "grad_norm": 0.26379692554473877, "learning_rate": 1.466933867735471e-05, "loss": 0.0084, "step": 3538 }, { "epoch": 14.156, "grad_norm": 0.23636199533939362, "learning_rate": 1.465931863727455e-05, "loss": 0.0082, "step": 3539 }, { "epoch": 14.16, "grad_norm": 0.1825660616159439, "learning_rate": 1.464929859719439e-05, "loss": 0.008, "step": 3540 }, { "epoch": 14.164, "grad_norm": 0.21577121317386627, "learning_rate": 1.463927855711423e-05, "loss": 0.0088, "step": 3541 }, { "epoch": 14.168, "grad_norm": 0.18342304229736328, "learning_rate": 1.462925851703407e-05, "loss": 0.008, "step": 3542 }, { "epoch": 14.172, "grad_norm": 0.16573582589626312, "learning_rate": 1.4619238476953908e-05, "loss": 0.0082, "step": 3543 }, { "epoch": 14.176, "grad_norm": 0.20480182766914368, "learning_rate": 1.4609218436873748e-05, "loss": 0.0081, "step": 3544 }, { "epoch": 14.18, "grad_norm": 0.19577351212501526, "learning_rate": 1.4599198396793587e-05, "loss": 0.0093, "step": 3545 }, { "epoch": 14.184, "grad_norm": 0.23374129831790924, "learning_rate": 1.4589178356713429e-05, "loss": 0.0082, "step": 3546 }, { "epoch": 14.188, "grad_norm": 0.22724345326423645, "learning_rate": 1.4579158316633268e-05, "loss": 0.0089, "step": 3547 }, { "epoch": 14.192, "grad_norm": 0.31487712264060974, "learning_rate": 1.4569138276553108e-05, "loss": 0.0082, "step": 3548 }, { "epoch": 14.196, "grad_norm": 0.2387700378894806, "learning_rate": 1.4559118236472946e-05, "loss": 0.0093, "step": 3549 }, { 
"epoch": 14.2, "grad_norm": 0.24899888038635254, "learning_rate": 1.4549098196392785e-05, "loss": 0.0086, "step": 3550 }, { "epoch": 14.204, "grad_norm": 0.21508151292800903, "learning_rate": 1.4539078156312627e-05, "loss": 0.0093, "step": 3551 }, { "epoch": 14.208, "grad_norm": 0.2187836617231369, "learning_rate": 1.4529058116232466e-05, "loss": 0.0084, "step": 3552 }, { "epoch": 14.212, "grad_norm": 0.2252088487148285, "learning_rate": 1.4519038076152306e-05, "loss": 0.0078, "step": 3553 }, { "epoch": 14.216, "grad_norm": 0.22216878831386566, "learning_rate": 1.4509018036072145e-05, "loss": 0.0088, "step": 3554 }, { "epoch": 14.22, "grad_norm": 0.27992987632751465, "learning_rate": 1.4498997995991983e-05, "loss": 0.0082, "step": 3555 }, { "epoch": 14.224, "grad_norm": 0.21498125791549683, "learning_rate": 1.4488977955911826e-05, "loss": 0.0088, "step": 3556 }, { "epoch": 14.228, "grad_norm": 0.15717551112174988, "learning_rate": 1.4478957915831664e-05, "loss": 0.0074, "step": 3557 }, { "epoch": 14.232, "grad_norm": 0.22314034402370453, "learning_rate": 1.4468937875751504e-05, "loss": 0.0082, "step": 3558 }, { "epoch": 14.236, "grad_norm": 0.28885912895202637, "learning_rate": 1.4458917835671343e-05, "loss": 0.0101, "step": 3559 }, { "epoch": 14.24, "grad_norm": 0.22253185510635376, "learning_rate": 1.4448897795591181e-05, "loss": 0.0084, "step": 3560 }, { "epoch": 14.244, "grad_norm": 0.22679786384105682, "learning_rate": 1.4438877755511024e-05, "loss": 0.0097, "step": 3561 }, { "epoch": 14.248, "grad_norm": 0.22354552149772644, "learning_rate": 1.4428857715430864e-05, "loss": 0.0085, "step": 3562 }, { "epoch": 14.252, "grad_norm": 0.24927063286304474, "learning_rate": 1.4418837675350702e-05, "loss": 0.0096, "step": 3563 }, { "epoch": 14.256, "grad_norm": 0.20839311182498932, "learning_rate": 1.4408817635270541e-05, "loss": 0.0084, "step": 3564 }, { "epoch": 14.26, "grad_norm": 0.15880200266838074, "learning_rate": 1.439879759519038e-05, "loss": 0.0084, "step": 
3565 }, { "epoch": 14.264, "grad_norm": 0.1647965908050537, "learning_rate": 1.4388777555110222e-05, "loss": 0.0077, "step": 3566 }, { "epoch": 14.268, "grad_norm": 0.20082905888557434, "learning_rate": 1.4378757515030062e-05, "loss": 0.0092, "step": 3567 }, { "epoch": 14.272, "grad_norm": 0.49212315678596497, "learning_rate": 1.4368737474949901e-05, "loss": 0.012, "step": 3568 }, { "epoch": 14.276, "grad_norm": 0.20353491604328156, "learning_rate": 1.4358717434869739e-05, "loss": 0.0085, "step": 3569 }, { "epoch": 14.28, "grad_norm": 0.18611657619476318, "learning_rate": 1.4348697394789579e-05, "loss": 0.0086, "step": 3570 }, { "epoch": 14.284, "grad_norm": 0.2258971929550171, "learning_rate": 1.433867735470942e-05, "loss": 0.0081, "step": 3571 }, { "epoch": 14.288, "grad_norm": 0.21564999222755432, "learning_rate": 1.432865731462926e-05, "loss": 0.0076, "step": 3572 }, { "epoch": 14.292, "grad_norm": 0.17343463003635406, "learning_rate": 1.43186372745491e-05, "loss": 0.0085, "step": 3573 }, { "epoch": 14.296, "grad_norm": 0.367103636264801, "learning_rate": 1.4308617234468937e-05, "loss": 0.01, "step": 3574 }, { "epoch": 14.3, "grad_norm": 0.21690668165683746, "learning_rate": 1.4298597194388777e-05, "loss": 0.0094, "step": 3575 }, { "epoch": 14.304, "grad_norm": 0.15850843489170074, "learning_rate": 1.428857715430862e-05, "loss": 0.0075, "step": 3576 }, { "epoch": 14.308, "grad_norm": 0.31984981894493103, "learning_rate": 1.4278557114228458e-05, "loss": 0.0101, "step": 3577 }, { "epoch": 14.312, "grad_norm": 0.45315784215927124, "learning_rate": 1.4268537074148297e-05, "loss": 0.0115, "step": 3578 }, { "epoch": 14.316, "grad_norm": 0.207364022731781, "learning_rate": 1.4258517034068137e-05, "loss": 0.0086, "step": 3579 }, { "epoch": 14.32, "grad_norm": 0.18384985625743866, "learning_rate": 1.4248496993987975e-05, "loss": 0.0086, "step": 3580 }, { "epoch": 14.324, "grad_norm": 0.23308269679546356, "learning_rate": 1.4238476953907818e-05, "loss": 0.0089, "step": 
3581 }, { "epoch": 14.328, "grad_norm": 0.3302237093448639, "learning_rate": 1.4228456913827657e-05, "loss": 0.0093, "step": 3582 }, { "epoch": 14.332, "grad_norm": 0.23539598286151886, "learning_rate": 1.4218436873747495e-05, "loss": 0.0089, "step": 3583 }, { "epoch": 14.336, "grad_norm": 0.16788506507873535, "learning_rate": 1.4208416833667335e-05, "loss": 0.0072, "step": 3584 }, { "epoch": 14.34, "grad_norm": 0.20347057282924652, "learning_rate": 1.4198396793587174e-05, "loss": 0.0087, "step": 3585 }, { "epoch": 14.344, "grad_norm": 0.2108476608991623, "learning_rate": 1.4188376753507016e-05, "loss": 0.0092, "step": 3586 }, { "epoch": 14.348, "grad_norm": 0.24422381818294525, "learning_rate": 1.4178356713426855e-05, "loss": 0.0096, "step": 3587 }, { "epoch": 14.352, "grad_norm": 0.16197939217090607, "learning_rate": 1.4168336673346693e-05, "loss": 0.0081, "step": 3588 }, { "epoch": 14.356, "grad_norm": 0.20140737295150757, "learning_rate": 1.4158316633266533e-05, "loss": 0.0085, "step": 3589 }, { "epoch": 14.36, "grad_norm": 0.27182310819625854, "learning_rate": 1.4148296593186376e-05, "loss": 0.0097, "step": 3590 }, { "epoch": 14.364, "grad_norm": 0.1505798101425171, "learning_rate": 1.4138276553106213e-05, "loss": 0.0081, "step": 3591 }, { "epoch": 14.368, "grad_norm": 0.17416727542877197, "learning_rate": 1.4128256513026053e-05, "loss": 0.0081, "step": 3592 }, { "epoch": 14.372, "grad_norm": 0.21804209053516388, "learning_rate": 1.4118236472945893e-05, "loss": 0.0083, "step": 3593 }, { "epoch": 14.376, "grad_norm": 0.36001965403556824, "learning_rate": 1.410821643286573e-05, "loss": 0.0104, "step": 3594 }, { "epoch": 14.38, "grad_norm": 0.20571143925189972, "learning_rate": 1.4098196392785574e-05, "loss": 0.0082, "step": 3595 }, { "epoch": 14.384, "grad_norm": 0.19949229061603546, "learning_rate": 1.4088176352705411e-05, "loss": 0.0089, "step": 3596 }, { "epoch": 14.388, "grad_norm": 0.667675256729126, "learning_rate": 1.4078156312625251e-05, "loss": 0.0088, 
"step": 3597 }, { "epoch": 14.392, "grad_norm": 0.167070671916008, "learning_rate": 1.406813627254509e-05, "loss": 0.0087, "step": 3598 }, { "epoch": 14.396, "grad_norm": 0.3734051585197449, "learning_rate": 1.405811623246493e-05, "loss": 0.0095, "step": 3599 }, { "epoch": 14.4, "grad_norm": 0.28370603919029236, "learning_rate": 1.4048096192384771e-05, "loss": 0.0088, "step": 3600 }, { "epoch": 14.404, "grad_norm": 0.30198901891708374, "learning_rate": 1.4038076152304611e-05, "loss": 0.0098, "step": 3601 }, { "epoch": 14.408, "grad_norm": 0.23166725039482117, "learning_rate": 1.4028056112224449e-05, "loss": 0.008, "step": 3602 }, { "epoch": 14.412, "grad_norm": 0.19708774983882904, "learning_rate": 1.4018036072144289e-05, "loss": 0.009, "step": 3603 }, { "epoch": 14.416, "grad_norm": 0.30307242274284363, "learning_rate": 1.4008016032064128e-05, "loss": 0.0101, "step": 3604 }, { "epoch": 14.42, "grad_norm": 0.16705302894115448, "learning_rate": 1.399799599198397e-05, "loss": 0.0076, "step": 3605 }, { "epoch": 14.424, "grad_norm": 0.1781936138868332, "learning_rate": 1.3987975951903809e-05, "loss": 0.0087, "step": 3606 }, { "epoch": 14.428, "grad_norm": 0.2449142336845398, "learning_rate": 1.3977955911823649e-05, "loss": 0.0104, "step": 3607 }, { "epoch": 14.432, "grad_norm": 0.16610880196094513, "learning_rate": 1.3967935871743486e-05, "loss": 0.0082, "step": 3608 }, { "epoch": 14.436, "grad_norm": 0.26270821690559387, "learning_rate": 1.3957915831663326e-05, "loss": 0.0096, "step": 3609 }, { "epoch": 14.44, "grad_norm": 0.18053866922855377, "learning_rate": 1.3947895791583167e-05, "loss": 0.0055, "step": 3610 }, { "epoch": 14.444, "grad_norm": 0.17106251418590546, "learning_rate": 1.3937875751503007e-05, "loss": 0.0085, "step": 3611 }, { "epoch": 14.448, "grad_norm": 1.2041096687316895, "learning_rate": 1.3927855711422847e-05, "loss": 0.0099, "step": 3612 }, { "epoch": 14.452, "grad_norm": 0.2078244984149933, "learning_rate": 1.3917835671342686e-05, "loss": 0.0093, 
"step": 3613 }, { "epoch": 14.456, "grad_norm": 0.2774932384490967, "learning_rate": 1.3907815631262524e-05, "loss": 0.0095, "step": 3614 }, { "epoch": 14.46, "grad_norm": 0.17771194875240326, "learning_rate": 1.3897795591182367e-05, "loss": 0.0075, "step": 3615 }, { "epoch": 14.464, "grad_norm": 0.17751413583755493, "learning_rate": 1.3887775551102205e-05, "loss": 0.0086, "step": 3616 }, { "epoch": 14.468, "grad_norm": 0.289604514837265, "learning_rate": 1.3877755511022044e-05, "loss": 0.0094, "step": 3617 }, { "epoch": 14.472, "grad_norm": 0.1762779951095581, "learning_rate": 1.3867735470941884e-05, "loss": 0.0083, "step": 3618 }, { "epoch": 14.475999999999999, "grad_norm": 0.2782403528690338, "learning_rate": 1.3857715430861724e-05, "loss": 0.0064, "step": 3619 }, { "epoch": 14.48, "grad_norm": 0.22804078459739685, "learning_rate": 1.3847695390781565e-05, "loss": 0.0094, "step": 3620 }, { "epoch": 14.484, "grad_norm": 0.29062676429748535, "learning_rate": 1.3837675350701405e-05, "loss": 0.0109, "step": 3621 }, { "epoch": 14.488, "grad_norm": 0.2231641560792923, "learning_rate": 1.3827655310621242e-05, "loss": 0.0086, "step": 3622 }, { "epoch": 14.492, "grad_norm": 0.1812341958284378, "learning_rate": 1.3817635270541082e-05, "loss": 0.0088, "step": 3623 }, { "epoch": 14.496, "grad_norm": 0.1771288812160492, "learning_rate": 1.3807615230460922e-05, "loss": 0.009, "step": 3624 }, { "epoch": 14.5, "grad_norm": 0.23491171002388, "learning_rate": 1.3797595190380763e-05, "loss": 0.0091, "step": 3625 }, { "epoch": 14.504, "grad_norm": 0.20434904098510742, "learning_rate": 1.3787575150300602e-05, "loss": 0.0086, "step": 3626 }, { "epoch": 14.508, "grad_norm": 0.16007202863693237, "learning_rate": 1.3777555110220442e-05, "loss": 0.0075, "step": 3627 }, { "epoch": 14.512, "grad_norm": 0.21671073138713837, "learning_rate": 1.376753507014028e-05, "loss": 0.0085, "step": 3628 }, { "epoch": 14.516, "grad_norm": 0.23178523778915405, "learning_rate": 1.375751503006012e-05, 
"loss": 0.009, "step": 3629 }, { "epoch": 14.52, "grad_norm": 0.2952883541584015, "learning_rate": 1.374749498997996e-05, "loss": 0.0104, "step": 3630 }, { "epoch": 14.524000000000001, "grad_norm": 0.22766168415546417, "learning_rate": 1.37374749498998e-05, "loss": 0.0082, "step": 3631 }, { "epoch": 14.528, "grad_norm": 0.17687074840068817, "learning_rate": 1.372745490981964e-05, "loss": 0.0077, "step": 3632 }, { "epoch": 14.532, "grad_norm": 0.2766241133213043, "learning_rate": 1.371743486973948e-05, "loss": 0.0097, "step": 3633 }, { "epoch": 14.536, "grad_norm": 0.2916935980319977, "learning_rate": 1.3707414829659317e-05, "loss": 0.011, "step": 3634 }, { "epoch": 14.54, "grad_norm": 0.4812479615211487, "learning_rate": 1.369739478957916e-05, "loss": 0.0104, "step": 3635 }, { "epoch": 14.544, "grad_norm": 0.20633623003959656, "learning_rate": 1.3687374749498998e-05, "loss": 0.008, "step": 3636 }, { "epoch": 14.548, "grad_norm": 0.20415335893630981, "learning_rate": 1.3677354709418838e-05, "loss": 0.0083, "step": 3637 }, { "epoch": 14.552, "grad_norm": 0.23814812302589417, "learning_rate": 1.3667334669338678e-05, "loss": 0.0095, "step": 3638 }, { "epoch": 14.556000000000001, "grad_norm": 0.33633625507354736, "learning_rate": 1.3657314629258517e-05, "loss": 0.0099, "step": 3639 }, { "epoch": 14.56, "grad_norm": 0.3303033113479614, "learning_rate": 1.3647294589178358e-05, "loss": 0.0094, "step": 3640 }, { "epoch": 14.564, "grad_norm": 0.2839348316192627, "learning_rate": 1.3637274549098198e-05, "loss": 0.0103, "step": 3641 }, { "epoch": 14.568, "grad_norm": 0.2589602470397949, "learning_rate": 1.3627254509018036e-05, "loss": 0.0086, "step": 3642 }, { "epoch": 14.572, "grad_norm": 0.17141884565353394, "learning_rate": 1.3617234468937875e-05, "loss": 0.0085, "step": 3643 }, { "epoch": 14.576, "grad_norm": 0.20422735810279846, "learning_rate": 1.3607214428857717e-05, "loss": 0.0087, "step": 3644 }, { "epoch": 14.58, "grad_norm": 0.36146315932273865, "learning_rate": 
1.3597194388777556e-05, "loss": 0.0103, "step": 3645 }, { "epoch": 14.584, "grad_norm": 0.3737517297267914, "learning_rate": 1.3587174348697396e-05, "loss": 0.0085, "step": 3646 }, { "epoch": 14.588, "grad_norm": 0.20324692130088806, "learning_rate": 1.3577154308617236e-05, "loss": 0.0083, "step": 3647 }, { "epoch": 14.592, "grad_norm": 0.21344539523124695, "learning_rate": 1.3567134268537073e-05, "loss": 0.009, "step": 3648 }, { "epoch": 14.596, "grad_norm": 0.3655132055282593, "learning_rate": 1.3557114228456916e-05, "loss": 0.011, "step": 3649 }, { "epoch": 14.6, "grad_norm": 0.20782031118869781, "learning_rate": 1.3547094188376754e-05, "loss": 0.0089, "step": 3650 }, { "epoch": 14.604, "grad_norm": 0.28657761216163635, "learning_rate": 1.3537074148296594e-05, "loss": 0.0093, "step": 3651 }, { "epoch": 14.608, "grad_norm": 0.25337931513786316, "learning_rate": 1.3527054108216433e-05, "loss": 0.0093, "step": 3652 }, { "epoch": 14.612, "grad_norm": 0.36946016550064087, "learning_rate": 1.3517034068136273e-05, "loss": 0.0113, "step": 3653 }, { "epoch": 14.616, "grad_norm": 0.18227268755435944, "learning_rate": 1.3507014028056114e-05, "loss": 0.0084, "step": 3654 }, { "epoch": 14.62, "grad_norm": 0.1899784952402115, "learning_rate": 1.3496993987975954e-05, "loss": 0.0085, "step": 3655 }, { "epoch": 14.624, "grad_norm": 0.16006070375442505, "learning_rate": 1.3486973947895792e-05, "loss": 0.0086, "step": 3656 }, { "epoch": 14.628, "grad_norm": 0.2663494348526001, "learning_rate": 1.3476953907815631e-05, "loss": 0.0104, "step": 3657 }, { "epoch": 14.632, "grad_norm": 0.19320881366729736, "learning_rate": 1.3466933867735471e-05, "loss": 0.0092, "step": 3658 }, { "epoch": 14.636, "grad_norm": 0.5768705010414124, "learning_rate": 1.3456913827655312e-05, "loss": 0.0127, "step": 3659 }, { "epoch": 14.64, "grad_norm": 0.15998607873916626, "learning_rate": 1.3446893787575152e-05, "loss": 0.0089, "step": 3660 }, { "epoch": 14.644, "grad_norm": 0.21522219479084015, 
"learning_rate": 1.3436873747494991e-05, "loss": 0.0079, "step": 3661 }, { "epoch": 14.648, "grad_norm": 0.2248634248971939, "learning_rate": 1.342685370741483e-05, "loss": 0.0098, "step": 3662 }, { "epoch": 14.652, "grad_norm": 0.23990559577941895, "learning_rate": 1.3416833667334669e-05, "loss": 0.0096, "step": 3663 }, { "epoch": 14.656, "grad_norm": 0.18346965312957764, "learning_rate": 1.340681362725451e-05, "loss": 0.0081, "step": 3664 }, { "epoch": 14.66, "grad_norm": 0.46722784638404846, "learning_rate": 1.339679358717435e-05, "loss": 0.0119, "step": 3665 }, { "epoch": 14.664, "grad_norm": 0.19062618911266327, "learning_rate": 1.338677354709419e-05, "loss": 0.0082, "step": 3666 }, { "epoch": 14.668, "grad_norm": 0.40659964084625244, "learning_rate": 1.3376753507014029e-05, "loss": 0.0097, "step": 3667 }, { "epoch": 14.672, "grad_norm": 0.22992847859859467, "learning_rate": 1.3366733466933867e-05, "loss": 0.0094, "step": 3668 }, { "epoch": 14.676, "grad_norm": 0.3991800844669342, "learning_rate": 1.335671342685371e-05, "loss": 0.0129, "step": 3669 }, { "epoch": 14.68, "grad_norm": 0.1878584921360016, "learning_rate": 1.3346693386773548e-05, "loss": 0.0085, "step": 3670 }, { "epoch": 14.684, "grad_norm": 0.1736784130334854, "learning_rate": 1.3336673346693387e-05, "loss": 0.0094, "step": 3671 }, { "epoch": 14.688, "grad_norm": 0.18842560052871704, "learning_rate": 1.3326653306613227e-05, "loss": 0.0085, "step": 3672 }, { "epoch": 14.692, "grad_norm": 0.2932375967502594, "learning_rate": 1.3316633266533065e-05, "loss": 0.0094, "step": 3673 }, { "epoch": 14.696, "grad_norm": 0.5152665376663208, "learning_rate": 1.3306613226452908e-05, "loss": 0.0096, "step": 3674 }, { "epoch": 14.7, "grad_norm": 0.34712889790534973, "learning_rate": 1.3296593186372747e-05, "loss": 0.0092, "step": 3675 }, { "epoch": 14.704, "grad_norm": 0.18002943694591522, "learning_rate": 1.3286573146292585e-05, "loss": 0.0089, "step": 3676 }, { "epoch": 14.708, "grad_norm": 
0.26756277680397034, "learning_rate": 1.3276553106212425e-05, "loss": 0.0092, "step": 3677 }, { "epoch": 14.712, "grad_norm": 0.24038799107074738, "learning_rate": 1.3266533066132264e-05, "loss": 0.01, "step": 3678 }, { "epoch": 14.716, "grad_norm": 0.526962161064148, "learning_rate": 1.3256513026052106e-05, "loss": 0.0112, "step": 3679 }, { "epoch": 14.72, "grad_norm": 0.184894397854805, "learning_rate": 1.3246492985971945e-05, "loss": 0.008, "step": 3680 }, { "epoch": 14.724, "grad_norm": 0.2070516049861908, "learning_rate": 1.3236472945891785e-05, "loss": 0.0087, "step": 3681 }, { "epoch": 14.728, "grad_norm": 0.23932699859142303, "learning_rate": 1.3226452905811623e-05, "loss": 0.0099, "step": 3682 }, { "epoch": 14.732, "grad_norm": 0.24801121652126312, "learning_rate": 1.3216432865731462e-05, "loss": 0.0094, "step": 3683 }, { "epoch": 14.736, "grad_norm": 0.21665263175964355, "learning_rate": 1.3206412825651304e-05, "loss": 0.0094, "step": 3684 }, { "epoch": 14.74, "grad_norm": 0.19703127443790436, "learning_rate": 1.3196392785571143e-05, "loss": 0.008, "step": 3685 }, { "epoch": 14.744, "grad_norm": 0.40190309286117554, "learning_rate": 1.3186372745490983e-05, "loss": 0.0089, "step": 3686 }, { "epoch": 14.748, "grad_norm": 0.2351587414741516, "learning_rate": 1.317635270541082e-05, "loss": 0.0086, "step": 3687 }, { "epoch": 14.752, "grad_norm": 0.26315975189208984, "learning_rate": 1.316633266533066e-05, "loss": 0.01, "step": 3688 }, { "epoch": 14.756, "grad_norm": 0.33228597044944763, "learning_rate": 1.3156312625250503e-05, "loss": 0.0115, "step": 3689 }, { "epoch": 14.76, "grad_norm": 0.2187896966934204, "learning_rate": 1.3146292585170341e-05, "loss": 0.0086, "step": 3690 }, { "epoch": 14.764, "grad_norm": 0.2327001541852951, "learning_rate": 1.313627254509018e-05, "loss": 0.0096, "step": 3691 }, { "epoch": 14.768, "grad_norm": 0.1458553969860077, "learning_rate": 1.312625250501002e-05, "loss": 0.0083, "step": 3692 }, { "epoch": 14.772, "grad_norm": 
0.2837975323200226, "learning_rate": 1.3116232464929858e-05, "loss": 0.0096, "step": 3693 }, { "epoch": 14.776, "grad_norm": 0.2698477804660797, "learning_rate": 1.3106212424849701e-05, "loss": 0.0095, "step": 3694 }, { "epoch": 14.78, "grad_norm": 0.6040169596672058, "learning_rate": 1.309619238476954e-05, "loss": 0.0119, "step": 3695 }, { "epoch": 14.784, "grad_norm": 0.14232851564884186, "learning_rate": 1.3086172344689379e-05, "loss": 0.0085, "step": 3696 }, { "epoch": 14.788, "grad_norm": 0.24106432497501373, "learning_rate": 1.3076152304609218e-05, "loss": 0.0088, "step": 3697 }, { "epoch": 14.792, "grad_norm": 0.2642269432544708, "learning_rate": 1.3066132264529058e-05, "loss": 0.0098, "step": 3698 }, { "epoch": 14.796, "grad_norm": 0.23594169318675995, "learning_rate": 1.30561122244489e-05, "loss": 0.0092, "step": 3699 }, { "epoch": 14.8, "grad_norm": 0.24775654077529907, "learning_rate": 1.3046092184368739e-05, "loss": 0.0092, "step": 3700 }, { "epoch": 14.804, "grad_norm": 0.2097250074148178, "learning_rate": 1.3036072144288577e-05, "loss": 0.0096, "step": 3701 }, { "epoch": 14.808, "grad_norm": 0.2012004554271698, "learning_rate": 1.3026052104208416e-05, "loss": 0.0092, "step": 3702 }, { "epoch": 14.812, "grad_norm": 0.20894677937030792, "learning_rate": 1.301603206412826e-05, "loss": 0.0093, "step": 3703 }, { "epoch": 14.816, "grad_norm": 0.23382696509361267, "learning_rate": 1.3006012024048097e-05, "loss": 0.009, "step": 3704 }, { "epoch": 14.82, "grad_norm": 0.19932790100574493, "learning_rate": 1.2995991983967937e-05, "loss": 0.0103, "step": 3705 }, { "epoch": 14.824, "grad_norm": 0.28219786286354065, "learning_rate": 1.2985971943887776e-05, "loss": 0.01, "step": 3706 }, { "epoch": 14.828, "grad_norm": 0.2530341148376465, "learning_rate": 1.2975951903807614e-05, "loss": 0.0092, "step": 3707 }, { "epoch": 14.832, "grad_norm": 0.26421231031417847, "learning_rate": 1.2965931863727457e-05, "loss": 0.0097, "step": 3708 }, { "epoch": 14.836, "grad_norm": 
0.1763448268175125, "learning_rate": 1.2955911823647297e-05, "loss": 0.0084, "step": 3709 }, { "epoch": 14.84, "grad_norm": 0.26100030541419983, "learning_rate": 1.2945891783567135e-05, "loss": 0.0102, "step": 3710 }, { "epoch": 14.844, "grad_norm": 0.22259047627449036, "learning_rate": 1.2935871743486974e-05, "loss": 0.0092, "step": 3711 }, { "epoch": 14.848, "grad_norm": 0.19816388189792633, "learning_rate": 1.2925851703406814e-05, "loss": 0.0059, "step": 3712 }, { "epoch": 14.852, "grad_norm": 0.270816832780838, "learning_rate": 1.2915831663326655e-05, "loss": 0.01, "step": 3713 }, { "epoch": 14.856, "grad_norm": 0.25951826572418213, "learning_rate": 1.2905811623246495e-05, "loss": 0.0098, "step": 3714 }, { "epoch": 14.86, "grad_norm": 0.28285837173461914, "learning_rate": 1.2895791583166333e-05, "loss": 0.0093, "step": 3715 }, { "epoch": 14.864, "grad_norm": 0.42073768377304077, "learning_rate": 1.2885771543086172e-05, "loss": 0.0108, "step": 3716 }, { "epoch": 14.868, "grad_norm": 0.506669282913208, "learning_rate": 1.2875751503006012e-05, "loss": 0.0094, "step": 3717 }, { "epoch": 14.872, "grad_norm": 0.18972696363925934, "learning_rate": 1.2865731462925853e-05, "loss": 0.009, "step": 3718 }, { "epoch": 14.876, "grad_norm": 0.17668847739696503, "learning_rate": 1.2855711422845693e-05, "loss": 0.0082, "step": 3719 }, { "epoch": 14.88, "grad_norm": 0.30280017852783203, "learning_rate": 1.2845691382765532e-05, "loss": 0.0109, "step": 3720 }, { "epoch": 14.884, "grad_norm": 0.21172870695590973, "learning_rate": 1.283567134268537e-05, "loss": 0.0094, "step": 3721 }, { "epoch": 14.888, "grad_norm": 0.17289844155311584, "learning_rate": 1.282565130260521e-05, "loss": 0.0074, "step": 3722 }, { "epoch": 14.892, "grad_norm": 0.17328234016895294, "learning_rate": 1.2815631262525051e-05, "loss": 0.0082, "step": 3723 }, { "epoch": 14.896, "grad_norm": 0.2376927137374878, "learning_rate": 1.280561122244489e-05, "loss": 0.01, "step": 3724 }, { "epoch": 14.9, "grad_norm": 
0.21705985069274902, "learning_rate": 1.279559118236473e-05, "loss": 0.0086, "step": 3725 }, { "epoch": 14.904, "grad_norm": 0.272956520318985, "learning_rate": 1.278557114228457e-05, "loss": 0.0092, "step": 3726 }, { "epoch": 14.908, "grad_norm": 0.18439574539661407, "learning_rate": 1.2775551102204408e-05, "loss": 0.0084, "step": 3727 }, { "epoch": 14.912, "grad_norm": 0.19920729100704193, "learning_rate": 1.276553106212425e-05, "loss": 0.0091, "step": 3728 }, { "epoch": 14.916, "grad_norm": 0.2559821605682373, "learning_rate": 1.2755511022044089e-05, "loss": 0.0091, "step": 3729 }, { "epoch": 14.92, "grad_norm": 0.23575206100940704, "learning_rate": 1.2745490981963928e-05, "loss": 0.0084, "step": 3730 }, { "epoch": 14.924, "grad_norm": 0.2531851828098297, "learning_rate": 1.2735470941883768e-05, "loss": 0.0089, "step": 3731 }, { "epoch": 14.928, "grad_norm": 0.21447163820266724, "learning_rate": 1.2725450901803607e-05, "loss": 0.0099, "step": 3732 }, { "epoch": 14.932, "grad_norm": 0.23144927620887756, "learning_rate": 1.2715430861723449e-05, "loss": 0.0098, "step": 3733 }, { "epoch": 14.936, "grad_norm": 0.16847439110279083, "learning_rate": 1.2705410821643288e-05, "loss": 0.0083, "step": 3734 }, { "epoch": 14.94, "grad_norm": 0.25669029355049133, "learning_rate": 1.2695390781563126e-05, "loss": 0.0097, "step": 3735 }, { "epoch": 14.943999999999999, "grad_norm": 0.3748454451560974, "learning_rate": 1.2685370741482966e-05, "loss": 0.0097, "step": 3736 }, { "epoch": 14.948, "grad_norm": 0.2604009509086609, "learning_rate": 1.2675350701402805e-05, "loss": 0.0094, "step": 3737 }, { "epoch": 14.952, "grad_norm": 0.09197622537612915, "learning_rate": 1.2665330661322647e-05, "loss": 0.004, "step": 3738 }, { "epoch": 14.956, "grad_norm": 0.2926792800426483, "learning_rate": 1.2655310621242486e-05, "loss": 0.0103, "step": 3739 }, { "epoch": 14.96, "grad_norm": 0.19025073945522308, "learning_rate": 1.2645290581162326e-05, "loss": 0.009, "step": 3740 }, { "epoch": 14.964, 
"grad_norm": 0.17044049501419067, "learning_rate": 1.2635270541082164e-05, "loss": 0.006, "step": 3741 }, { "epoch": 14.968, "grad_norm": 0.16821922361850739, "learning_rate": 1.2625250501002003e-05, "loss": 0.006, "step": 3742 }, { "epoch": 14.972, "grad_norm": 0.2913142740726471, "learning_rate": 1.2615230460921844e-05, "loss": 0.0097, "step": 3743 }, { "epoch": 14.975999999999999, "grad_norm": 0.48879778385162354, "learning_rate": 1.2605210420841684e-05, "loss": 0.0101, "step": 3744 }, { "epoch": 14.98, "grad_norm": 0.2455979734659195, "learning_rate": 1.2595190380761524e-05, "loss": 0.009, "step": 3745 }, { "epoch": 14.984, "grad_norm": 0.20019696652889252, "learning_rate": 1.2585170340681363e-05, "loss": 0.009, "step": 3746 }, { "epoch": 14.988, "grad_norm": 0.30625882744789124, "learning_rate": 1.2575150300601201e-05, "loss": 0.0101, "step": 3747 }, { "epoch": 14.992, "grad_norm": 0.18524271249771118, "learning_rate": 1.2565130260521044e-05, "loss": 0.0093, "step": 3748 }, { "epoch": 14.996, "grad_norm": 0.33599841594696045, "learning_rate": 1.2555110220440882e-05, "loss": 0.0118, "step": 3749 }, { "epoch": 15.0, "grad_norm": 0.17473024129867554, "learning_rate": 1.2545090180360722e-05, "loss": 0.006, "step": 3750 }, { "epoch": 15.004, "grad_norm": 0.19427764415740967, "learning_rate": 1.2535070140280561e-05, "loss": 0.0093, "step": 3751 }, { "epoch": 15.008, "grad_norm": 0.1349514275789261, "learning_rate": 1.25250501002004e-05, "loss": 0.0072, "step": 3752 }, { "epoch": 15.012, "grad_norm": 0.11184310913085938, "learning_rate": 1.2515030060120242e-05, "loss": 0.0056, "step": 3753 }, { "epoch": 15.016, "grad_norm": 0.22410862147808075, "learning_rate": 1.2505010020040082e-05, "loss": 0.0085, "step": 3754 }, { "epoch": 15.02, "grad_norm": 0.20344886183738708, "learning_rate": 1.249498997995992e-05, "loss": 0.0077, "step": 3755 }, { "epoch": 15.024, "grad_norm": 0.20372788608074188, "learning_rate": 1.248496993987976e-05, "loss": 0.0083, "step": 3756 }, { 
"epoch": 15.028, "grad_norm": 0.31820449233055115, "learning_rate": 1.24749498997996e-05, "loss": 0.0081, "step": 3757 }, { "epoch": 15.032, "grad_norm": 0.2090875804424286, "learning_rate": 1.2464929859719438e-05, "loss": 0.0087, "step": 3758 }, { "epoch": 15.036, "grad_norm": 0.15436923503875732, "learning_rate": 1.245490981963928e-05, "loss": 0.0069, "step": 3759 }, { "epoch": 15.04, "grad_norm": 0.17816013097763062, "learning_rate": 1.244488977955912e-05, "loss": 0.0087, "step": 3760 }, { "epoch": 15.044, "grad_norm": 0.14754058420658112, "learning_rate": 1.2434869739478959e-05, "loss": 0.0072, "step": 3761 }, { "epoch": 15.048, "grad_norm": 0.29081836342811584, "learning_rate": 1.2424849699398798e-05, "loss": 0.0085, "step": 3762 }, { "epoch": 15.052, "grad_norm": 0.14590558409690857, "learning_rate": 1.2414829659318638e-05, "loss": 0.0067, "step": 3763 }, { "epoch": 15.056, "grad_norm": 0.1646532565355301, "learning_rate": 1.2404809619238478e-05, "loss": 0.0072, "step": 3764 }, { "epoch": 15.06, "grad_norm": 0.3623543977737427, "learning_rate": 1.2394789579158317e-05, "loss": 0.0097, "step": 3765 }, { "epoch": 15.064, "grad_norm": 0.074567049741745, "learning_rate": 1.2384769539078157e-05, "loss": 0.0032, "step": 3766 }, { "epoch": 15.068, "grad_norm": 0.25378715991973877, "learning_rate": 1.2374749498997996e-05, "loss": 0.0085, "step": 3767 }, { "epoch": 15.072, "grad_norm": 0.14567115902900696, "learning_rate": 1.2364729458917836e-05, "loss": 0.0076, "step": 3768 }, { "epoch": 15.076, "grad_norm": 0.13616079092025757, "learning_rate": 1.2354709418837675e-05, "loss": 0.0077, "step": 3769 }, { "epoch": 15.08, "grad_norm": 0.13720935583114624, "learning_rate": 1.2344689378757515e-05, "loss": 0.0076, "step": 3770 }, { "epoch": 15.084, "grad_norm": 0.13663603365421295, "learning_rate": 1.2334669338677356e-05, "loss": 0.0045, "step": 3771 }, { "epoch": 15.088, "grad_norm": 0.16399826109409332, "learning_rate": 1.2324649298597194e-05, "loss": 0.0079, "step": 3772 
}, { "epoch": 15.092, "grad_norm": 0.25261420011520386, "learning_rate": 1.2314629258517036e-05, "loss": 0.0087, "step": 3773 }, { "epoch": 15.096, "grad_norm": 0.17582829296588898, "learning_rate": 1.2304609218436875e-05, "loss": 0.0068, "step": 3774 }, { "epoch": 15.1, "grad_norm": 0.18607257306575775, "learning_rate": 1.2294589178356713e-05, "loss": 0.0079, "step": 3775 }, { "epoch": 15.104, "grad_norm": 0.19465355575084686, "learning_rate": 1.2284569138276554e-05, "loss": 0.0067, "step": 3776 }, { "epoch": 15.108, "grad_norm": 0.28906580805778503, "learning_rate": 1.2274549098196394e-05, "loss": 0.0092, "step": 3777 }, { "epoch": 15.112, "grad_norm": 0.2137903869152069, "learning_rate": 1.2264529058116233e-05, "loss": 0.0084, "step": 3778 }, { "epoch": 15.116, "grad_norm": 0.18388208746910095, "learning_rate": 1.2254509018036073e-05, "loss": 0.0075, "step": 3779 }, { "epoch": 15.12, "grad_norm": 0.237356498837471, "learning_rate": 1.2244488977955913e-05, "loss": 0.008, "step": 3780 }, { "epoch": 15.124, "grad_norm": 0.22582365572452545, "learning_rate": 1.2234468937875752e-05, "loss": 0.0094, "step": 3781 }, { "epoch": 15.128, "grad_norm": 0.150029718875885, "learning_rate": 1.2224448897795592e-05, "loss": 0.008, "step": 3782 }, { "epoch": 15.132, "grad_norm": 0.2120167315006256, "learning_rate": 1.2214428857715431e-05, "loss": 0.0091, "step": 3783 }, { "epoch": 15.136, "grad_norm": 0.18410436809062958, "learning_rate": 1.2204408817635271e-05, "loss": 0.0073, "step": 3784 }, { "epoch": 15.14, "grad_norm": 0.18777211010456085, "learning_rate": 1.219438877755511e-05, "loss": 0.0084, "step": 3785 }, { "epoch": 15.144, "grad_norm": 0.1506580412387848, "learning_rate": 1.218436873747495e-05, "loss": 0.0077, "step": 3786 }, { "epoch": 15.148, "grad_norm": 0.1882610023021698, "learning_rate": 1.217434869739479e-05, "loss": 0.0072, "step": 3787 }, { "epoch": 15.152, "grad_norm": 0.14596189558506012, "learning_rate": 1.2164328657314631e-05, "loss": 0.0074, "step": 3788 
}, { "epoch": 15.156, "grad_norm": 0.14775827527046204, "learning_rate": 1.2154308617234469e-05, "loss": 0.0074, "step": 3789 }, { "epoch": 15.16, "grad_norm": 0.1532842367887497, "learning_rate": 1.2144288577154309e-05, "loss": 0.0071, "step": 3790 }, { "epoch": 15.164, "grad_norm": 0.18244397640228271, "learning_rate": 1.213426853707415e-05, "loss": 0.0085, "step": 3791 }, { "epoch": 15.168, "grad_norm": 0.17389196157455444, "learning_rate": 1.2124248496993988e-05, "loss": 0.0081, "step": 3792 }, { "epoch": 15.172, "grad_norm": 0.24255871772766113, "learning_rate": 1.2114228456913829e-05, "loss": 0.0095, "step": 3793 }, { "epoch": 15.176, "grad_norm": 0.13241663575172424, "learning_rate": 1.2104208416833669e-05, "loss": 0.0066, "step": 3794 }, { "epoch": 15.18, "grad_norm": 0.2261979728937149, "learning_rate": 1.2094188376753506e-05, "loss": 0.0105, "step": 3795 }, { "epoch": 15.184, "grad_norm": 0.1342063695192337, "learning_rate": 1.2084168336673348e-05, "loss": 0.0073, "step": 3796 }, { "epoch": 15.188, "grad_norm": 0.18595094978809357, "learning_rate": 1.2074148296593187e-05, "loss": 0.0078, "step": 3797 }, { "epoch": 15.192, "grad_norm": 0.21512697637081146, "learning_rate": 1.2064128256513027e-05, "loss": 0.0083, "step": 3798 }, { "epoch": 15.196, "grad_norm": 0.19803418219089508, "learning_rate": 1.2054108216432867e-05, "loss": 0.0079, "step": 3799 }, { "epoch": 15.2, "grad_norm": 0.2116500288248062, "learning_rate": 1.2044088176352706e-05, "loss": 0.0083, "step": 3800 }, { "epoch": 15.204, "grad_norm": 0.18894079327583313, "learning_rate": 1.2034068136272546e-05, "loss": 0.0075, "step": 3801 }, { "epoch": 15.208, "grad_norm": 0.30808115005493164, "learning_rate": 1.2024048096192385e-05, "loss": 0.0085, "step": 3802 }, { "epoch": 15.212, "grad_norm": 0.17777995765209198, "learning_rate": 1.2014028056112225e-05, "loss": 0.0078, "step": 3803 }, { "epoch": 15.216, "grad_norm": 0.28307095170021057, "learning_rate": 1.2004008016032064e-05, "loss": 0.0089, 
"step": 3804 }, { "epoch": 15.22, "grad_norm": 0.2219671756029129, "learning_rate": 1.1993987975951906e-05, "loss": 0.009, "step": 3805 }, { "epoch": 15.224, "grad_norm": 0.1308339238166809, "learning_rate": 1.1983967935871744e-05, "loss": 0.008, "step": 3806 }, { "epoch": 15.228, "grad_norm": 0.1603306233882904, "learning_rate": 1.1973947895791583e-05, "loss": 0.0073, "step": 3807 }, { "epoch": 15.232, "grad_norm": 0.21934445202350616, "learning_rate": 1.1963927855711425e-05, "loss": 0.0089, "step": 3808 }, { "epoch": 15.236, "grad_norm": 0.16111411154270172, "learning_rate": 1.1953907815631262e-05, "loss": 0.0081, "step": 3809 }, { "epoch": 15.24, "grad_norm": 0.1615634709596634, "learning_rate": 1.1943887775551104e-05, "loss": 0.0078, "step": 3810 }, { "epoch": 15.244, "grad_norm": 0.1142813116312027, "learning_rate": 1.1933867735470943e-05, "loss": 0.0042, "step": 3811 }, { "epoch": 15.248, "grad_norm": 0.20265451073646545, "learning_rate": 1.1923847695390781e-05, "loss": 0.0075, "step": 3812 }, { "epoch": 15.252, "grad_norm": 0.17693500220775604, "learning_rate": 1.1913827655310622e-05, "loss": 0.0091, "step": 3813 }, { "epoch": 15.256, "grad_norm": 0.16070616245269775, "learning_rate": 1.190380761523046e-05, "loss": 0.0084, "step": 3814 }, { "epoch": 15.26, "grad_norm": 0.2477542757987976, "learning_rate": 1.1893787575150302e-05, "loss": 0.0087, "step": 3815 }, { "epoch": 15.264, "grad_norm": 0.2083113044500351, "learning_rate": 1.1883767535070141e-05, "loss": 0.0086, "step": 3816 }, { "epoch": 15.268, "grad_norm": 0.20015212893486023, "learning_rate": 1.1873747494989979e-05, "loss": 0.0087, "step": 3817 }, { "epoch": 15.272, "grad_norm": 0.1395539939403534, "learning_rate": 1.186372745490982e-05, "loss": 0.0073, "step": 3818 }, { "epoch": 15.276, "grad_norm": 0.1408277451992035, "learning_rate": 1.185370741482966e-05, "loss": 0.0081, "step": 3819 }, { "epoch": 15.28, "grad_norm": 0.20688875019550323, "learning_rate": 1.18436873747495e-05, "loss": 0.0082, 
"step": 3820 }, { "epoch": 15.284, "grad_norm": 0.21537630259990692, "learning_rate": 1.183366733466934e-05, "loss": 0.0085, "step": 3821 }, { "epoch": 15.288, "grad_norm": 0.22216440737247467, "learning_rate": 1.1823647294589179e-05, "loss": 0.0074, "step": 3822 }, { "epoch": 15.292, "grad_norm": 0.21135936677455902, "learning_rate": 1.1813627254509018e-05, "loss": 0.0091, "step": 3823 }, { "epoch": 15.296, "grad_norm": 0.23614716529846191, "learning_rate": 1.1803607214428858e-05, "loss": 0.0082, "step": 3824 }, { "epoch": 15.3, "grad_norm": 0.16992735862731934, "learning_rate": 1.1793587174348698e-05, "loss": 0.0074, "step": 3825 }, { "epoch": 15.304, "grad_norm": 0.21491000056266785, "learning_rate": 1.1783567134268537e-05, "loss": 0.0073, "step": 3826 }, { "epoch": 15.308, "grad_norm": 0.16719304025173187, "learning_rate": 1.1773547094188378e-05, "loss": 0.0069, "step": 3827 }, { "epoch": 15.312, "grad_norm": 0.2545108497142792, "learning_rate": 1.1763527054108216e-05, "loss": 0.0094, "step": 3828 }, { "epoch": 15.316, "grad_norm": 0.16995342075824738, "learning_rate": 1.1753507014028056e-05, "loss": 0.0072, "step": 3829 }, { "epoch": 15.32, "grad_norm": 0.16017405688762665, "learning_rate": 1.1743486973947897e-05, "loss": 0.007, "step": 3830 }, { "epoch": 15.324, "grad_norm": 0.2081645429134369, "learning_rate": 1.1733466933867735e-05, "loss": 0.009, "step": 3831 }, { "epoch": 15.328, "grad_norm": 0.1570480614900589, "learning_rate": 1.1723446893787576e-05, "loss": 0.0075, "step": 3832 }, { "epoch": 15.332, "grad_norm": 0.23571652173995972, "learning_rate": 1.1713426853707416e-05, "loss": 0.0095, "step": 3833 }, { "epoch": 15.336, "grad_norm": 0.2799575626850128, "learning_rate": 1.1703406813627254e-05, "loss": 0.0107, "step": 3834 }, { "epoch": 15.34, "grad_norm": 0.22484329342842102, "learning_rate": 1.1693386773547095e-05, "loss": 0.0087, "step": 3835 }, { "epoch": 15.344, "grad_norm": 0.2471844106912613, "learning_rate": 1.1683366733466935e-05, "loss": 
0.009, "step": 3836 }, { "epoch": 15.348, "grad_norm": 0.21544387936592102, "learning_rate": 1.1673346693386774e-05, "loss": 0.0083, "step": 3837 }, { "epoch": 15.352, "grad_norm": 0.21488280594348907, "learning_rate": 1.1663326653306614e-05, "loss": 0.0088, "step": 3838 }, { "epoch": 15.356, "grad_norm": 0.12035142630338669, "learning_rate": 1.1653306613226453e-05, "loss": 0.0068, "step": 3839 }, { "epoch": 15.36, "grad_norm": 0.20783907175064087, "learning_rate": 1.1643286573146293e-05, "loss": 0.0079, "step": 3840 }, { "epoch": 15.364, "grad_norm": 0.15776130557060242, "learning_rate": 1.1633266533066133e-05, "loss": 0.0078, "step": 3841 }, { "epoch": 15.368, "grad_norm": 0.1693926751613617, "learning_rate": 1.1623246492985972e-05, "loss": 0.0085, "step": 3842 }, { "epoch": 15.372, "grad_norm": 0.20709924399852753, "learning_rate": 1.1613226452905812e-05, "loss": 0.0088, "step": 3843 }, { "epoch": 15.376, "grad_norm": 0.33956825733184814, "learning_rate": 1.1603206412825651e-05, "loss": 0.0091, "step": 3844 }, { "epoch": 15.38, "grad_norm": 0.20944558084011078, "learning_rate": 1.1593186372745491e-05, "loss": 0.0087, "step": 3845 }, { "epoch": 15.384, "grad_norm": 0.16297529637813568, "learning_rate": 1.158316633266533e-05, "loss": 0.0078, "step": 3846 }, { "epoch": 15.388, "grad_norm": 0.2716394364833832, "learning_rate": 1.1573146292585172e-05, "loss": 0.0089, "step": 3847 }, { "epoch": 15.392, "grad_norm": 0.15041834115982056, "learning_rate": 1.156312625250501e-05, "loss": 0.0076, "step": 3848 }, { "epoch": 15.396, "grad_norm": 0.16877737641334534, "learning_rate": 1.155310621242485e-05, "loss": 0.0074, "step": 3849 }, { "epoch": 15.4, "grad_norm": 0.2795097231864929, "learning_rate": 1.154308617234469e-05, "loss": 0.0096, "step": 3850 }, { "epoch": 15.404, "grad_norm": 0.13509052991867065, "learning_rate": 1.1533066132264529e-05, "loss": 0.0078, "step": 3851 }, { "epoch": 15.408, "grad_norm": 0.2292792648077011, "learning_rate": 1.152304609218437e-05, 
"loss": 0.0084, "step": 3852 }, { "epoch": 15.412, "grad_norm": 0.19881927967071533, "learning_rate": 1.151302605210421e-05, "loss": 0.0076, "step": 3853 }, { "epoch": 15.416, "grad_norm": 0.2702074646949768, "learning_rate": 1.1503006012024049e-05, "loss": 0.0095, "step": 3854 }, { "epoch": 15.42, "grad_norm": 0.2394787222146988, "learning_rate": 1.1492985971943889e-05, "loss": 0.0099, "step": 3855 }, { "epoch": 15.424, "grad_norm": 0.13445042073726654, "learning_rate": 1.1482965931863728e-05, "loss": 0.0073, "step": 3856 }, { "epoch": 15.428, "grad_norm": 0.2362317591905594, "learning_rate": 1.1472945891783568e-05, "loss": 0.0074, "step": 3857 }, { "epoch": 15.432, "grad_norm": 0.14894196391105652, "learning_rate": 1.1462925851703407e-05, "loss": 0.0086, "step": 3858 }, { "epoch": 15.436, "grad_norm": 0.17250634729862213, "learning_rate": 1.1452905811623247e-05, "loss": 0.009, "step": 3859 }, { "epoch": 15.44, "grad_norm": 0.197704017162323, "learning_rate": 1.1442885771543087e-05, "loss": 0.0094, "step": 3860 }, { "epoch": 15.444, "grad_norm": 0.23201380670070648, "learning_rate": 1.1432865731462926e-05, "loss": 0.009, "step": 3861 }, { "epoch": 15.448, "grad_norm": 0.26006773114204407, "learning_rate": 1.1422845691382766e-05, "loss": 0.009, "step": 3862 }, { "epoch": 15.452, "grad_norm": 0.21038317680358887, "learning_rate": 1.1412825651302605e-05, "loss": 0.0085, "step": 3863 }, { "epoch": 15.456, "grad_norm": 0.2623082399368286, "learning_rate": 1.1402805611222447e-05, "loss": 0.0097, "step": 3864 }, { "epoch": 15.46, "grad_norm": 0.17351366579532623, "learning_rate": 1.1392785571142284e-05, "loss": 0.0081, "step": 3865 }, { "epoch": 15.464, "grad_norm": 0.1778712272644043, "learning_rate": 1.1382765531062124e-05, "loss": 0.0085, "step": 3866 }, { "epoch": 15.468, "grad_norm": 0.16661031544208527, "learning_rate": 1.1372745490981965e-05, "loss": 0.008, "step": 3867 }, { "epoch": 15.472, "grad_norm": 0.1771150380373001, "learning_rate": 1.1362725450901803e-05, 
"loss": 0.0084, "step": 3868 }, { "epoch": 15.475999999999999, "grad_norm": 0.2267809361219406, "learning_rate": 1.1352705410821645e-05, "loss": 0.0086, "step": 3869 }, { "epoch": 15.48, "grad_norm": 0.26965567469596863, "learning_rate": 1.1342685370741484e-05, "loss": 0.0091, "step": 3870 }, { "epoch": 15.484, "grad_norm": 0.1582583636045456, "learning_rate": 1.1332665330661322e-05, "loss": 0.0056, "step": 3871 }, { "epoch": 15.488, "grad_norm": 0.24081505835056305, "learning_rate": 1.1322645290581163e-05, "loss": 0.0089, "step": 3872 }, { "epoch": 15.492, "grad_norm": 0.12798239290714264, "learning_rate": 1.1312625250501003e-05, "loss": 0.0049, "step": 3873 }, { "epoch": 15.496, "grad_norm": 0.14275750517845154, "learning_rate": 1.1302605210420842e-05, "loss": 0.0083, "step": 3874 }, { "epoch": 15.5, "grad_norm": 0.22652091085910797, "learning_rate": 1.1292585170340682e-05, "loss": 0.0089, "step": 3875 }, { "epoch": 15.504, "grad_norm": 0.18880586326122284, "learning_rate": 1.1282565130260522e-05, "loss": 0.0083, "step": 3876 }, { "epoch": 15.508, "grad_norm": 0.23349466919898987, "learning_rate": 1.1272545090180361e-05, "loss": 0.0088, "step": 3877 }, { "epoch": 15.512, "grad_norm": 0.2626950740814209, "learning_rate": 1.12625250501002e-05, "loss": 0.0096, "step": 3878 }, { "epoch": 15.516, "grad_norm": 0.1874002069234848, "learning_rate": 1.125250501002004e-05, "loss": 0.0085, "step": 3879 }, { "epoch": 15.52, "grad_norm": 0.2023652046918869, "learning_rate": 1.124248496993988e-05, "loss": 0.0085, "step": 3880 }, { "epoch": 15.524000000000001, "grad_norm": 0.3214608132839203, "learning_rate": 1.1232464929859721e-05, "loss": 0.0097, "step": 3881 }, { "epoch": 15.528, "grad_norm": 0.13317032158374786, "learning_rate": 1.122244488977956e-05, "loss": 0.0071, "step": 3882 }, { "epoch": 15.532, "grad_norm": 0.16016507148742676, "learning_rate": 1.1212424849699399e-05, "loss": 0.0081, "step": 3883 }, { "epoch": 15.536, "grad_norm": 0.2174006998538971, "learning_rate": 
1.120240480961924e-05, "loss": 0.008, "step": 3884 }, { "epoch": 15.54, "grad_norm": 0.28932812809944153, "learning_rate": 1.1192384769539078e-05, "loss": 0.0095, "step": 3885 }, { "epoch": 15.544, "grad_norm": 0.20081889629364014, "learning_rate": 1.118236472945892e-05, "loss": 0.0083, "step": 3886 }, { "epoch": 15.548, "grad_norm": 0.2505418658256531, "learning_rate": 1.1172344689378759e-05, "loss": 0.0095, "step": 3887 }, { "epoch": 15.552, "grad_norm": 0.19087974727153778, "learning_rate": 1.1162324649298597e-05, "loss": 0.0074, "step": 3888 }, { "epoch": 15.556000000000001, "grad_norm": 0.2052469551563263, "learning_rate": 1.1152304609218438e-05, "loss": 0.0082, "step": 3889 }, { "epoch": 15.56, "grad_norm": 0.24596913158893585, "learning_rate": 1.1142284569138278e-05, "loss": 0.0084, "step": 3890 }, { "epoch": 15.564, "grad_norm": 0.46544262766838074, "learning_rate": 1.1132264529058117e-05, "loss": 0.0084, "step": 3891 }, { "epoch": 15.568, "grad_norm": 0.20327705144882202, "learning_rate": 1.1122244488977957e-05, "loss": 0.0079, "step": 3892 }, { "epoch": 15.572, "grad_norm": 0.21792593598365784, "learning_rate": 1.1112224448897796e-05, "loss": 0.0094, "step": 3893 }, { "epoch": 15.576, "grad_norm": 0.1825850009918213, "learning_rate": 1.1102204408817636e-05, "loss": 0.0079, "step": 3894 }, { "epoch": 15.58, "grad_norm": 0.18869619071483612, "learning_rate": 1.1092184368737475e-05, "loss": 0.0078, "step": 3895 }, { "epoch": 15.584, "grad_norm": 0.26426199078559875, "learning_rate": 1.1082164328657315e-05, "loss": 0.0112, "step": 3896 }, { "epoch": 15.588, "grad_norm": 0.20740513503551483, "learning_rate": 1.1072144288577155e-05, "loss": 0.0094, "step": 3897 }, { "epoch": 15.592, "grad_norm": 0.16762283444404602, "learning_rate": 1.1062124248496994e-05, "loss": 0.0079, "step": 3898 }, { "epoch": 15.596, "grad_norm": 0.28626206517219543, "learning_rate": 1.1052104208416834e-05, "loss": 0.0091, "step": 3899 }, { "epoch": 15.6, "grad_norm": 0.1966615617275238, 
"learning_rate": 1.1042084168336673e-05, "loss": 0.0076, "step": 3900 }, { "epoch": 15.604, "grad_norm": 0.2372523993253708, "learning_rate": 1.1032064128256515e-05, "loss": 0.0089, "step": 3901 }, { "epoch": 15.608, "grad_norm": 0.11428368836641312, "learning_rate": 1.1022044088176353e-05, "loss": 0.0048, "step": 3902 }, { "epoch": 15.612, "grad_norm": 0.20153571665287018, "learning_rate": 1.1012024048096192e-05, "loss": 0.0087, "step": 3903 }, { "epoch": 15.616, "grad_norm": 0.2549794614315033, "learning_rate": 1.1002004008016033e-05, "loss": 0.0087, "step": 3904 }, { "epoch": 15.62, "grad_norm": 0.11948023736476898, "learning_rate": 1.0991983967935871e-05, "loss": 0.0052, "step": 3905 }, { "epoch": 15.624, "grad_norm": 0.1888275444507599, "learning_rate": 1.0981963927855713e-05, "loss": 0.0085, "step": 3906 }, { "epoch": 15.628, "grad_norm": 0.38151490688323975, "learning_rate": 1.0971943887775552e-05, "loss": 0.009, "step": 3907 }, { "epoch": 15.632, "grad_norm": 0.12330213189125061, "learning_rate": 1.0961923847695392e-05, "loss": 0.0047, "step": 3908 }, { "epoch": 15.636, "grad_norm": 0.20336763560771942, "learning_rate": 1.0951903807615231e-05, "loss": 0.0083, "step": 3909 }, { "epoch": 15.64, "grad_norm": 0.26230552792549133, "learning_rate": 1.0941883767535071e-05, "loss": 0.0104, "step": 3910 }, { "epoch": 15.644, "grad_norm": 0.2377796471118927, "learning_rate": 1.093186372745491e-05, "loss": 0.0086, "step": 3911 }, { "epoch": 15.648, "grad_norm": 0.17421042919158936, "learning_rate": 1.092184368737475e-05, "loss": 0.0084, "step": 3912 }, { "epoch": 15.652, "grad_norm": 0.21408884227275848, "learning_rate": 1.091182364729459e-05, "loss": 0.0089, "step": 3913 }, { "epoch": 15.656, "grad_norm": 0.15429489314556122, "learning_rate": 1.090180360721443e-05, "loss": 0.0085, "step": 3914 }, { "epoch": 15.66, "grad_norm": 0.15924520790576935, "learning_rate": 1.0891783567134269e-05, "loss": 0.0055, "step": 3915 }, { "epoch": 15.664, "grad_norm": 
0.3604001998901367, "learning_rate": 1.0881763527054109e-05, "loss": 0.0098, "step": 3916 }, { "epoch": 15.668, "grad_norm": 0.1900898814201355, "learning_rate": 1.0871743486973948e-05, "loss": 0.0084, "step": 3917 }, { "epoch": 15.672, "grad_norm": 0.2091568261384964, "learning_rate": 1.086172344689379e-05, "loss": 0.0091, "step": 3918 }, { "epoch": 15.676, "grad_norm": 0.32461073994636536, "learning_rate": 1.0851703406813627e-05, "loss": 0.0093, "step": 3919 }, { "epoch": 15.68, "grad_norm": 0.34765294194221497, "learning_rate": 1.0841683366733467e-05, "loss": 0.0097, "step": 3920 }, { "epoch": 15.684, "grad_norm": 0.1439502090215683, "learning_rate": 1.0831663326653308e-05, "loss": 0.0073, "step": 3921 }, { "epoch": 15.688, "grad_norm": 0.11668162047863007, "learning_rate": 1.0821643286573146e-05, "loss": 0.0078, "step": 3922 }, { "epoch": 15.692, "grad_norm": 0.16795441508293152, "learning_rate": 1.0811623246492987e-05, "loss": 0.0083, "step": 3923 }, { "epoch": 15.696, "grad_norm": 0.32226186990737915, "learning_rate": 1.0801603206412827e-05, "loss": 0.0103, "step": 3924 }, { "epoch": 15.7, "grad_norm": 0.13408587872982025, "learning_rate": 1.0791583166332665e-05, "loss": 0.005, "step": 3925 }, { "epoch": 15.704, "grad_norm": 0.12357212603092194, "learning_rate": 1.0781563126252506e-05, "loss": 0.0058, "step": 3926 }, { "epoch": 15.708, "grad_norm": 0.11612733453512192, "learning_rate": 1.0771543086172344e-05, "loss": 0.0045, "step": 3927 }, { "epoch": 15.712, "grad_norm": 0.23471392691135406, "learning_rate": 1.0761523046092185e-05, "loss": 0.0087, "step": 3928 }, { "epoch": 15.716, "grad_norm": 0.1405946910381317, "learning_rate": 1.0751503006012025e-05, "loss": 0.008, "step": 3929 }, { "epoch": 15.72, "grad_norm": 0.168296217918396, "learning_rate": 1.0741482965931863e-05, "loss": 0.0079, "step": 3930 }, { "epoch": 15.724, "grad_norm": 0.3210127353668213, "learning_rate": 1.0731462925851704e-05, "loss": 0.0106, "step": 3931 }, { "epoch": 15.728, 
"grad_norm": 0.17187538743019104, "learning_rate": 1.0721442885771544e-05, "loss": 0.0085, "step": 3932 }, { "epoch": 15.732, "grad_norm": 0.15461215376853943, "learning_rate": 1.0711422845691383e-05, "loss": 0.009, "step": 3933 }, { "epoch": 15.736, "grad_norm": 0.15376779437065125, "learning_rate": 1.0701402805611223e-05, "loss": 0.0082, "step": 3934 }, { "epoch": 15.74, "grad_norm": 0.3120775818824768, "learning_rate": 1.0691382765531064e-05, "loss": 0.0087, "step": 3935 }, { "epoch": 15.744, "grad_norm": 0.16850018501281738, "learning_rate": 1.0681362725450902e-05, "loss": 0.0085, "step": 3936 }, { "epoch": 15.748, "grad_norm": 0.21848592162132263, "learning_rate": 1.0671342685370742e-05, "loss": 0.0083, "step": 3937 }, { "epoch": 15.752, "grad_norm": 0.16288349032402039, "learning_rate": 1.0661322645290583e-05, "loss": 0.0073, "step": 3938 }, { "epoch": 15.756, "grad_norm": 0.24063876271247864, "learning_rate": 1.065130260521042e-05, "loss": 0.008, "step": 3939 }, { "epoch": 15.76, "grad_norm": 0.20195181667804718, "learning_rate": 1.0641282565130262e-05, "loss": 0.0079, "step": 3940 }, { "epoch": 15.764, "grad_norm": 0.16952313482761383, "learning_rate": 1.06312625250501e-05, "loss": 0.0082, "step": 3941 }, { "epoch": 15.768, "grad_norm": 0.23509082198143005, "learning_rate": 1.062124248496994e-05, "loss": 0.009, "step": 3942 }, { "epoch": 15.772, "grad_norm": 0.1740461140871048, "learning_rate": 1.061122244488978e-05, "loss": 0.0081, "step": 3943 }, { "epoch": 15.776, "grad_norm": 0.23396913707256317, "learning_rate": 1.0601202404809619e-05, "loss": 0.0093, "step": 3944 }, { "epoch": 15.78, "grad_norm": 0.1290149837732315, "learning_rate": 1.059118236472946e-05, "loss": 0.0071, "step": 3945 }, { "epoch": 15.784, "grad_norm": 0.19442108273506165, "learning_rate": 1.05811623246493e-05, "loss": 0.0086, "step": 3946 }, { "epoch": 15.788, "grad_norm": 0.24171869456768036, "learning_rate": 1.0571142284569137e-05, "loss": 0.0085, "step": 3947 }, { "epoch": 15.792, 
"grad_norm": 0.17903035879135132, "learning_rate": 1.0561122244488979e-05, "loss": 0.0082, "step": 3948 }, { "epoch": 15.796, "grad_norm": 0.23411180078983307, "learning_rate": 1.0551102204408818e-05, "loss": 0.0091, "step": 3949 }, { "epoch": 15.8, "grad_norm": 0.6385198831558228, "learning_rate": 1.0541082164328658e-05, "loss": 0.0077, "step": 3950 }, { "epoch": 15.804, "grad_norm": 0.201161727309227, "learning_rate": 1.0531062124248498e-05, "loss": 0.0102, "step": 3951 }, { "epoch": 15.808, "grad_norm": 0.17722225189208984, "learning_rate": 1.0521042084168337e-05, "loss": 0.0083, "step": 3952 }, { "epoch": 15.812, "grad_norm": 0.16090677678585052, "learning_rate": 1.0511022044088177e-05, "loss": 0.0084, "step": 3953 }, { "epoch": 15.816, "grad_norm": 0.1804235279560089, "learning_rate": 1.0501002004008016e-05, "loss": 0.0088, "step": 3954 }, { "epoch": 15.82, "grad_norm": 0.18935038149356842, "learning_rate": 1.0490981963927856e-05, "loss": 0.0093, "step": 3955 }, { "epoch": 15.824, "grad_norm": 0.1825006753206253, "learning_rate": 1.0480961923847695e-05, "loss": 0.0093, "step": 3956 }, { "epoch": 15.828, "grad_norm": 0.24779827892780304, "learning_rate": 1.0470941883767535e-05, "loss": 0.01, "step": 3957 }, { "epoch": 15.832, "grad_norm": 0.26754239201545715, "learning_rate": 1.0460921843687375e-05, "loss": 0.0092, "step": 3958 }, { "epoch": 15.836, "grad_norm": 0.20227602124214172, "learning_rate": 1.0450901803607214e-05, "loss": 0.0086, "step": 3959 }, { "epoch": 15.84, "grad_norm": 0.28584498167037964, "learning_rate": 1.0440881763527056e-05, "loss": 0.0108, "step": 3960 }, { "epoch": 15.844, "grad_norm": 0.23494118452072144, "learning_rate": 1.0430861723446893e-05, "loss": 0.0086, "step": 3961 }, { "epoch": 15.848, "grad_norm": 0.23064729571342468, "learning_rate": 1.0420841683366733e-05, "loss": 0.0093, "step": 3962 }, { "epoch": 15.852, "grad_norm": 0.3154512643814087, "learning_rate": 1.0410821643286574e-05, "loss": 0.0097, "step": 3963 }, { "epoch": 
15.856, "grad_norm": 0.18013764917850494, "learning_rate": 1.0400801603206412e-05, "loss": 0.0083, "step": 3964 }, { "epoch": 15.86, "grad_norm": 0.1942710429430008, "learning_rate": 1.0390781563126253e-05, "loss": 0.0089, "step": 3965 }, { "epoch": 15.864, "grad_norm": 0.24880248308181763, "learning_rate": 1.0380761523046093e-05, "loss": 0.0087, "step": 3966 }, { "epoch": 15.868, "grad_norm": 0.20355558395385742, "learning_rate": 1.0370741482965933e-05, "loss": 0.008, "step": 3967 }, { "epoch": 15.872, "grad_norm": 0.2536620795726776, "learning_rate": 1.0360721442885772e-05, "loss": 0.0085, "step": 3968 }, { "epoch": 15.876, "grad_norm": 0.17376026511192322, "learning_rate": 1.0350701402805612e-05, "loss": 0.008, "step": 3969 }, { "epoch": 15.88, "grad_norm": 0.2233988642692566, "learning_rate": 1.0340681362725451e-05, "loss": 0.0091, "step": 3970 }, { "epoch": 15.884, "grad_norm": 0.1942223310470581, "learning_rate": 1.0330661322645291e-05, "loss": 0.0092, "step": 3971 }, { "epoch": 15.888, "grad_norm": 0.1970679759979248, "learning_rate": 1.032064128256513e-05, "loss": 0.0083, "step": 3972 }, { "epoch": 15.892, "grad_norm": 0.14123409986495972, "learning_rate": 1.031062124248497e-05, "loss": 0.0077, "step": 3973 }, { "epoch": 15.896, "grad_norm": 0.1591396927833557, "learning_rate": 1.030060120240481e-05, "loss": 0.008, "step": 3974 }, { "epoch": 15.9, "grad_norm": 0.1460416465997696, "learning_rate": 1.029058116232465e-05, "loss": 0.0079, "step": 3975 }, { "epoch": 15.904, "grad_norm": 0.1960773915052414, "learning_rate": 1.0280561122244489e-05, "loss": 0.0085, "step": 3976 }, { "epoch": 15.908, "grad_norm": 0.16314558684825897, "learning_rate": 1.027054108216433e-05, "loss": 0.0077, "step": 3977 }, { "epoch": 15.912, "grad_norm": 0.3076308071613312, "learning_rate": 1.0260521042084168e-05, "loss": 0.0098, "step": 3978 }, { "epoch": 15.916, "grad_norm": 0.8374438285827637, "learning_rate": 1.0250501002004008e-05, "loss": 0.0114, "step": 3979 }, { "epoch": 
15.92, "grad_norm": 0.31852617859840393, "learning_rate": 1.0240480961923849e-05, "loss": 0.0107, "step": 3980 }, { "epoch": 15.924, "grad_norm": 0.19266337156295776, "learning_rate": 1.0230460921843687e-05, "loss": 0.0084, "step": 3981 }, { "epoch": 15.928, "grad_norm": 0.20687268674373627, "learning_rate": 1.0220440881763528e-05, "loss": 0.0089, "step": 3982 }, { "epoch": 15.932, "grad_norm": 0.17380601167678833, "learning_rate": 1.0210420841683368e-05, "loss": 0.0083, "step": 3983 }, { "epoch": 15.936, "grad_norm": 0.2257053554058075, "learning_rate": 1.0200400801603206e-05, "loss": 0.0084, "step": 3984 }, { "epoch": 15.94, "grad_norm": 0.23071831464767456, "learning_rate": 1.0190380761523047e-05, "loss": 0.0102, "step": 3985 }, { "epoch": 15.943999999999999, "grad_norm": 0.21640700101852417, "learning_rate": 1.0180360721442887e-05, "loss": 0.0085, "step": 3986 }, { "epoch": 15.948, "grad_norm": 0.17279183864593506, "learning_rate": 1.0170340681362726e-05, "loss": 0.0082, "step": 3987 }, { "epoch": 15.952, "grad_norm": 0.15583649277687073, "learning_rate": 1.0160320641282566e-05, "loss": 0.0082, "step": 3988 }, { "epoch": 15.956, "grad_norm": 0.30920568108558655, "learning_rate": 1.0150300601202405e-05, "loss": 0.0101, "step": 3989 }, { "epoch": 15.96, "grad_norm": 0.23646380007266998, "learning_rate": 1.0140280561122245e-05, "loss": 0.0092, "step": 3990 }, { "epoch": 15.964, "grad_norm": 0.16509555280208588, "learning_rate": 1.0130260521042084e-05, "loss": 0.0079, "step": 3991 }, { "epoch": 15.968, "grad_norm": 0.23459596931934357, "learning_rate": 1.0120240480961924e-05, "loss": 0.0088, "step": 3992 }, { "epoch": 15.972, "grad_norm": 0.1768113523721695, "learning_rate": 1.0110220440881764e-05, "loss": 0.0088, "step": 3993 }, { "epoch": 15.975999999999999, "grad_norm": 0.22669987380504608, "learning_rate": 1.0100200400801605e-05, "loss": 0.0087, "step": 3994 }, { "epoch": 15.98, "grad_norm": 0.1484929621219635, "learning_rate": 1.0090180360721443e-05, "loss": 
0.0078, "step": 3995 }, { "epoch": 15.984, "grad_norm": 0.16619373857975006, "learning_rate": 1.0080160320641282e-05, "loss": 0.0083, "step": 3996 }, { "epoch": 15.988, "grad_norm": 0.14768655598163605, "learning_rate": 1.0070140280561124e-05, "loss": 0.007, "step": 3997 }, { "epoch": 15.992, "grad_norm": 0.169325053691864, "learning_rate": 1.0060120240480962e-05, "loss": 0.0081, "step": 3998 }, { "epoch": 15.996, "grad_norm": 0.22410063445568085, "learning_rate": 1.0050100200400803e-05, "loss": 0.0086, "step": 3999 }, { "epoch": 16.0, "grad_norm": 0.25347474217414856, "learning_rate": 1.0040080160320642e-05, "loss": 0.0092, "step": 4000 }, { "epoch": 16.004, "grad_norm": 0.1537349820137024, "learning_rate": 1.003006012024048e-05, "loss": 0.0079, "step": 4001 }, { "epoch": 16.008, "grad_norm": 0.16942495107650757, "learning_rate": 1.0020040080160322e-05, "loss": 0.0078, "step": 4002 }, { "epoch": 16.012, "grad_norm": 0.15320508182048798, "learning_rate": 1.0010020040080161e-05, "loss": 0.0078, "step": 4003 }, { "epoch": 16.016, "grad_norm": 0.2386036515235901, "learning_rate": 1e-05, "loss": 0.0079, "step": 4004 }, { "epoch": 16.02, "grad_norm": 0.14939050376415253, "learning_rate": 9.98997995991984e-06, "loss": 0.008, "step": 4005 }, { "epoch": 16.024, "grad_norm": 0.14771635830402374, "learning_rate": 9.97995991983968e-06, "loss": 0.0074, "step": 4006 }, { "epoch": 16.028, "grad_norm": 0.12847012281417847, "learning_rate": 9.96993987975952e-06, "loss": 0.0073, "step": 4007 }, { "epoch": 16.032, "grad_norm": 0.2023051530122757, "learning_rate": 9.95991983967936e-06, "loss": 0.0075, "step": 4008 }, { "epoch": 16.036, "grad_norm": 0.1175389364361763, "learning_rate": 9.949899799599199e-06, "loss": 0.006, "step": 4009 }, { "epoch": 16.04, "grad_norm": 0.2609016001224518, "learning_rate": 9.939879759519038e-06, "loss": 0.0071, "step": 4010 }, { "epoch": 16.044, "grad_norm": 0.1463293582201004, "learning_rate": 9.929859719438878e-06, "loss": 0.0071, "step": 4011 }, { 
"epoch": 16.048, "grad_norm": 0.15348109602928162, "learning_rate": 9.919839679358718e-06, "loss": 0.0076, "step": 4012 }, { "epoch": 16.052, "grad_norm": 0.15261180698871613, "learning_rate": 9.909819639278557e-06, "loss": 0.0072, "step": 4013 }, { "epoch": 16.056, "grad_norm": 0.21835850179195404, "learning_rate": 9.899799599198398e-06, "loss": 0.007, "step": 4014 }, { "epoch": 16.06, "grad_norm": 0.1864071786403656, "learning_rate": 9.889779559118236e-06, "loss": 0.0075, "step": 4015 }, { "epoch": 16.064, "grad_norm": 0.23007088899612427, "learning_rate": 9.879759519038076e-06, "loss": 0.0098, "step": 4016 }, { "epoch": 16.068, "grad_norm": 0.12705695629119873, "learning_rate": 9.869739478957917e-06, "loss": 0.0073, "step": 4017 }, { "epoch": 16.072, "grad_norm": 0.16708585619926453, "learning_rate": 9.859719438877755e-06, "loss": 0.0071, "step": 4018 }, { "epoch": 16.076, "grad_norm": 0.1379920393228531, "learning_rate": 9.849699398797596e-06, "loss": 0.0075, "step": 4019 }, { "epoch": 16.08, "grad_norm": 0.16059952974319458, "learning_rate": 9.839679358717436e-06, "loss": 0.0077, "step": 4020 }, { "epoch": 16.084, "grad_norm": 0.14642521739006042, "learning_rate": 9.829659318637276e-06, "loss": 0.0081, "step": 4021 }, { "epoch": 16.088, "grad_norm": 0.15344499051570892, "learning_rate": 9.819639278557115e-06, "loss": 0.0079, "step": 4022 }, { "epoch": 16.092, "grad_norm": 0.17185145616531372, "learning_rate": 9.809619238476955e-06, "loss": 0.0071, "step": 4023 }, { "epoch": 16.096, "grad_norm": 0.14292530715465546, "learning_rate": 9.799599198396794e-06, "loss": 0.0074, "step": 4024 }, { "epoch": 16.1, "grad_norm": 0.15338358283042908, "learning_rate": 9.789579158316634e-06, "loss": 0.007, "step": 4025 }, { "epoch": 16.104, "grad_norm": 0.1363472044467926, "learning_rate": 9.779559118236473e-06, "loss": 0.0065, "step": 4026 }, { "epoch": 16.108, "grad_norm": 0.18110625445842743, "learning_rate": 9.769539078156313e-06, "loss": 0.0075, "step": 4027 }, { "epoch": 
16.112, "grad_norm": 0.24270915985107422, "learning_rate": 9.759519038076153e-06, "loss": 0.0073, "step": 4028 }, { "epoch": 16.116, "grad_norm": 0.11325805634260178, "learning_rate": 9.749498997995992e-06, "loss": 0.0045, "step": 4029 }, { "epoch": 16.12, "grad_norm": 0.6353485584259033, "learning_rate": 9.739478957915832e-06, "loss": 0.0083, "step": 4030 }, { "epoch": 16.124, "grad_norm": 0.20861081779003143, "learning_rate": 9.729458917835673e-06, "loss": 0.008, "step": 4031 }, { "epoch": 16.128, "grad_norm": 0.1678023636341095, "learning_rate": 9.719438877755511e-06, "loss": 0.0075, "step": 4032 }, { "epoch": 16.132, "grad_norm": 0.22754140198230743, "learning_rate": 9.70941883767535e-06, "loss": 0.0077, "step": 4033 }, { "epoch": 16.136, "grad_norm": 0.16630183160305023, "learning_rate": 9.699398797595192e-06, "loss": 0.0082, "step": 4034 }, { "epoch": 16.14, "grad_norm": 0.1556076854467392, "learning_rate": 9.68937875751503e-06, "loss": 0.0075, "step": 4035 }, { "epoch": 16.144, "grad_norm": 0.12683729827404022, "learning_rate": 9.679358717434871e-06, "loss": 0.0068, "step": 4036 }, { "epoch": 16.148, "grad_norm": 0.17109414935112, "learning_rate": 9.66933867735471e-06, "loss": 0.0085, "step": 4037 }, { "epoch": 16.152, "grad_norm": 0.20201754570007324, "learning_rate": 9.659318637274549e-06, "loss": 0.0071, "step": 4038 }, { "epoch": 16.156, "grad_norm": 0.30220654606819153, "learning_rate": 9.64929859719439e-06, "loss": 0.0078, "step": 4039 }, { "epoch": 16.16, "grad_norm": 0.17886076867580414, "learning_rate": 9.63927855711423e-06, "loss": 0.0081, "step": 4040 }, { "epoch": 16.164, "grad_norm": 0.14879950881004333, "learning_rate": 9.629258517034069e-06, "loss": 0.0073, "step": 4041 }, { "epoch": 16.168, "grad_norm": 0.21824462711811066, "learning_rate": 9.619238476953909e-06, "loss": 0.0055, "step": 4042 }, { "epoch": 16.172, "grad_norm": 0.2450965940952301, "learning_rate": 9.609218436873746e-06, "loss": 0.0088, "step": 4043 }, { "epoch": 16.176, 
"grad_norm": 0.16798727214336395, "learning_rate": 9.599198396793588e-06, "loss": 0.008, "step": 4044 }, { "epoch": 16.18, "grad_norm": 0.21162693202495575, "learning_rate": 9.589178356713427e-06, "loss": 0.0072, "step": 4045 }, { "epoch": 16.184, "grad_norm": 0.1760333925485611, "learning_rate": 9.579158316633267e-06, "loss": 0.0073, "step": 4046 }, { "epoch": 16.188, "grad_norm": 0.2047117054462433, "learning_rate": 9.569138276553107e-06, "loss": 0.008, "step": 4047 }, { "epoch": 16.192, "grad_norm": 0.18758143484592438, "learning_rate": 9.559118236472948e-06, "loss": 0.0069, "step": 4048 }, { "epoch": 16.196, "grad_norm": 0.2109622359275818, "learning_rate": 9.549098196392786e-06, "loss": 0.0079, "step": 4049 }, { "epoch": 16.2, "grad_norm": 0.16541019082069397, "learning_rate": 9.539078156312625e-06, "loss": 0.0065, "step": 4050 }, { "epoch": 16.204, "grad_norm": 0.12194262444972992, "learning_rate": 9.529058116232467e-06, "loss": 0.0067, "step": 4051 }, { "epoch": 16.208, "grad_norm": 0.5753403306007385, "learning_rate": 9.519038076152304e-06, "loss": 0.0117, "step": 4052 }, { "epoch": 16.212, "grad_norm": 0.2605521082878113, "learning_rate": 9.509018036072146e-06, "loss": 0.0094, "step": 4053 }, { "epoch": 16.216, "grad_norm": 0.2662070393562317, "learning_rate": 9.498997995991984e-06, "loss": 0.0089, "step": 4054 }, { "epoch": 16.22, "grad_norm": 0.17253991961479187, "learning_rate": 9.488977955911823e-06, "loss": 0.0064, "step": 4055 }, { "epoch": 16.224, "grad_norm": 0.1860446333885193, "learning_rate": 9.478957915831665e-06, "loss": 0.0077, "step": 4056 }, { "epoch": 16.228, "grad_norm": 0.1807558834552765, "learning_rate": 9.468937875751502e-06, "loss": 0.0084, "step": 4057 }, { "epoch": 16.232, "grad_norm": 0.2518766224384308, "learning_rate": 9.458917835671344e-06, "loss": 0.0085, "step": 4058 }, { "epoch": 16.236, "grad_norm": 0.18722839653491974, "learning_rate": 9.448897795591183e-06, "loss": 0.0073, "step": 4059 }, { "epoch": 16.24, "grad_norm": 
0.16822689771652222, "learning_rate": 9.438877755511021e-06, "loss": 0.0072, "step": 4060 }, { "epoch": 16.244, "grad_norm": 0.26538798213005066, "learning_rate": 9.428857715430862e-06, "loss": 0.0082, "step": 4061 }, { "epoch": 16.248, "grad_norm": 0.24949723482131958, "learning_rate": 9.418837675350702e-06, "loss": 0.009, "step": 4062 }, { "epoch": 16.252, "grad_norm": 0.1420658975839615, "learning_rate": 9.408817635270542e-06, "loss": 0.0066, "step": 4063 }, { "epoch": 16.256, "grad_norm": 0.1801360845565796, "learning_rate": 9.398797595190381e-06, "loss": 0.0076, "step": 4064 }, { "epoch": 16.26, "grad_norm": 0.16072385013103485, "learning_rate": 9.38877755511022e-06, "loss": 0.0074, "step": 4065 }, { "epoch": 16.264, "grad_norm": 0.1723981499671936, "learning_rate": 9.37875751503006e-06, "loss": 0.0079, "step": 4066 }, { "epoch": 16.268, "grad_norm": 0.20122207701206207, "learning_rate": 9.3687374749499e-06, "loss": 0.0085, "step": 4067 }, { "epoch": 16.272, "grad_norm": 0.16807159781455994, "learning_rate": 9.35871743486974e-06, "loss": 0.0081, "step": 4068 }, { "epoch": 16.276, "grad_norm": 0.21395786106586456, "learning_rate": 9.34869739478958e-06, "loss": 0.0084, "step": 4069 }, { "epoch": 16.28, "grad_norm": 0.1668098419904709, "learning_rate": 9.338677354709419e-06, "loss": 0.0071, "step": 4070 }, { "epoch": 16.284, "grad_norm": 0.1805601865053177, "learning_rate": 9.328657314629258e-06, "loss": 0.0071, "step": 4071 }, { "epoch": 16.288, "grad_norm": 0.2028336524963379, "learning_rate": 9.318637274549098e-06, "loss": 0.0074, "step": 4072 }, { "epoch": 16.292, "grad_norm": 0.20740242302417755, "learning_rate": 9.30861723446894e-06, "loss": 0.0076, "step": 4073 }, { "epoch": 16.296, "grad_norm": 0.21578271687030792, "learning_rate": 9.298597194388777e-06, "loss": 0.0082, "step": 4074 }, { "epoch": 16.3, "grad_norm": 0.16717369854450226, "learning_rate": 9.288577154308618e-06, "loss": 0.0076, "step": 4075 }, { "epoch": 16.304, "grad_norm": 
0.1616104692220688, "learning_rate": 9.278557114228458e-06, "loss": 0.008, "step": 4076 }, { "epoch": 16.308, "grad_norm": 0.16851823031902313, "learning_rate": 9.268537074148296e-06, "loss": 0.0072, "step": 4077 }, { "epoch": 16.312, "grad_norm": 0.18806764483451843, "learning_rate": 9.258517034068137e-06, "loss": 0.008, "step": 4078 }, { "epoch": 16.316, "grad_norm": 0.21248526871204376, "learning_rate": 9.248496993987977e-06, "loss": 0.0076, "step": 4079 }, { "epoch": 16.32, "grad_norm": 0.2105952352285385, "learning_rate": 9.238476953907816e-06, "loss": 0.009, "step": 4080 }, { "epoch": 16.324, "grad_norm": 0.1982223242521286, "learning_rate": 9.228456913827656e-06, "loss": 0.0071, "step": 4081 }, { "epoch": 16.328, "grad_norm": 0.13595516979694366, "learning_rate": 9.218436873747496e-06, "loss": 0.0075, "step": 4082 }, { "epoch": 16.332, "grad_norm": 0.16603784263134003, "learning_rate": 9.208416833667335e-06, "loss": 0.0079, "step": 4083 }, { "epoch": 16.336, "grad_norm": 0.12831614911556244, "learning_rate": 9.198396793587175e-06, "loss": 0.0063, "step": 4084 }, { "epoch": 16.34, "grad_norm": 0.1970067024230957, "learning_rate": 9.188376753507014e-06, "loss": 0.0067, "step": 4085 }, { "epoch": 16.344, "grad_norm": 0.1089383140206337, "learning_rate": 9.178356713426854e-06, "loss": 0.0045, "step": 4086 }, { "epoch": 16.348, "grad_norm": 0.2165975272655487, "learning_rate": 9.168336673346693e-06, "loss": 0.008, "step": 4087 }, { "epoch": 16.352, "grad_norm": 0.1895694136619568, "learning_rate": 9.158316633266533e-06, "loss": 0.007, "step": 4088 }, { "epoch": 16.356, "grad_norm": 0.2537732720375061, "learning_rate": 9.148296593186373e-06, "loss": 0.0093, "step": 4089 }, { "epoch": 16.36, "grad_norm": 0.16854915022850037, "learning_rate": 9.138276553106214e-06, "loss": 0.0087, "step": 4090 }, { "epoch": 16.364, "grad_norm": 0.1767825335264206, "learning_rate": 9.128256513026052e-06, "loss": 0.0075, "step": 4091 }, { "epoch": 16.368, "grad_norm": 
0.19032791256904602, "learning_rate": 9.118236472945891e-06, "loss": 0.0078, "step": 4092 }, { "epoch": 16.372, "grad_norm": 0.21050649881362915, "learning_rate": 9.108216432865733e-06, "loss": 0.0083, "step": 4093 }, { "epoch": 16.376, "grad_norm": 0.1695554405450821, "learning_rate": 9.09819639278557e-06, "loss": 0.0079, "step": 4094 }, { "epoch": 16.38, "grad_norm": 0.15526901185512543, "learning_rate": 9.088176352705412e-06, "loss": 0.0084, "step": 4095 }, { "epoch": 16.384, "grad_norm": 0.18327704071998596, "learning_rate": 9.078156312625251e-06, "loss": 0.0076, "step": 4096 }, { "epoch": 16.388, "grad_norm": 0.1327105313539505, "learning_rate": 9.06813627254509e-06, "loss": 0.007, "step": 4097 }, { "epoch": 16.392, "grad_norm": 0.17368905246257782, "learning_rate": 9.05811623246493e-06, "loss": 0.0078, "step": 4098 }, { "epoch": 16.396, "grad_norm": 0.2445645034313202, "learning_rate": 9.04809619238477e-06, "loss": 0.0084, "step": 4099 }, { "epoch": 16.4, "grad_norm": 0.1971755027770996, "learning_rate": 9.03807615230461e-06, "loss": 0.0084, "step": 4100 }, { "epoch": 16.404, "grad_norm": 0.22356528043746948, "learning_rate": 9.02805611222445e-06, "loss": 0.0085, "step": 4101 }, { "epoch": 16.408, "grad_norm": 0.2354351133108139, "learning_rate": 9.018036072144289e-06, "loss": 0.0088, "step": 4102 }, { "epoch": 16.412, "grad_norm": 0.12401222437620163, "learning_rate": 9.008016032064129e-06, "loss": 0.0047, "step": 4103 }, { "epoch": 16.416, "grad_norm": 0.10349331051111221, "learning_rate": 8.997995991983968e-06, "loss": 0.005, "step": 4104 }, { "epoch": 16.42, "grad_norm": 0.16208389401435852, "learning_rate": 8.987975951903808e-06, "loss": 0.0084, "step": 4105 }, { "epoch": 16.424, "grad_norm": 0.243452250957489, "learning_rate": 8.977955911823647e-06, "loss": 0.0094, "step": 4106 }, { "epoch": 16.428, "grad_norm": 0.142629936337471, "learning_rate": 8.967935871743489e-06, "loss": 0.0075, "step": 4107 }, { "epoch": 16.432, "grad_norm": 0.15112538635730743, 
"learning_rate": 8.957915831663327e-06, "loss": 0.0075, "step": 4108 }, { "epoch": 16.436, "grad_norm": 0.15232206881046295, "learning_rate": 8.947895791583166e-06, "loss": 0.004, "step": 4109 }, { "epoch": 16.44, "grad_norm": 0.20458529889583588, "learning_rate": 8.937875751503007e-06, "loss": 0.0081, "step": 4110 }, { "epoch": 16.444, "grad_norm": 0.1483834683895111, "learning_rate": 8.927855711422845e-06, "loss": 0.0086, "step": 4111 }, { "epoch": 16.448, "grad_norm": 0.13278226554393768, "learning_rate": 8.917835671342687e-06, "loss": 0.0067, "step": 4112 }, { "epoch": 16.452, "grad_norm": 0.16911619901657104, "learning_rate": 8.907815631262526e-06, "loss": 0.0074, "step": 4113 }, { "epoch": 16.456, "grad_norm": 0.2102874517440796, "learning_rate": 8.897795591182364e-06, "loss": 0.0089, "step": 4114 }, { "epoch": 16.46, "grad_norm": 0.2927187383174896, "learning_rate": 8.887775551102205e-06, "loss": 0.0083, "step": 4115 }, { "epoch": 16.464, "grad_norm": 0.13784025609493256, "learning_rate": 8.877755511022045e-06, "loss": 0.0079, "step": 4116 }, { "epoch": 16.468, "grad_norm": 0.15802910923957825, "learning_rate": 8.867735470941884e-06, "loss": 0.0077, "step": 4117 }, { "epoch": 16.472, "grad_norm": 0.15576040744781494, "learning_rate": 8.857715430861724e-06, "loss": 0.007, "step": 4118 }, { "epoch": 16.476, "grad_norm": 0.1388392150402069, "learning_rate": 8.847695390781564e-06, "loss": 0.0073, "step": 4119 }, { "epoch": 16.48, "grad_norm": 0.14177972078323364, "learning_rate": 8.837675350701403e-06, "loss": 0.0075, "step": 4120 }, { "epoch": 16.484, "grad_norm": 0.19317297637462616, "learning_rate": 8.827655310621243e-06, "loss": 0.0072, "step": 4121 }, { "epoch": 16.488, "grad_norm": 0.15589751303195953, "learning_rate": 8.817635270541082e-06, "loss": 0.0079, "step": 4122 }, { "epoch": 16.492, "grad_norm": 0.23068901896476746, "learning_rate": 8.807615230460922e-06, "loss": 0.0078, "step": 4123 }, { "epoch": 16.496, "grad_norm": 0.2934131920337677, 
"learning_rate": 8.797595190380762e-06, "loss": 0.008, "step": 4124 }, { "epoch": 16.5, "grad_norm": 0.20778466761112213, "learning_rate": 8.787575150300601e-06, "loss": 0.0087, "step": 4125 }, { "epoch": 16.504, "grad_norm": 0.1556091457605362, "learning_rate": 8.77755511022044e-06, "loss": 0.0071, "step": 4126 }, { "epoch": 16.508, "grad_norm": 0.1435934156179428, "learning_rate": 8.767535070140282e-06, "loss": 0.0078, "step": 4127 }, { "epoch": 16.512, "grad_norm": 0.18272621929645538, "learning_rate": 8.75751503006012e-06, "loss": 0.0082, "step": 4128 }, { "epoch": 16.516, "grad_norm": 0.18199694156646729, "learning_rate": 8.747494989979961e-06, "loss": 0.0073, "step": 4129 }, { "epoch": 16.52, "grad_norm": 0.18646559119224548, "learning_rate": 8.7374749498998e-06, "loss": 0.0071, "step": 4130 }, { "epoch": 16.524, "grad_norm": 0.20238129794597626, "learning_rate": 8.727454909819639e-06, "loss": 0.0087, "step": 4131 }, { "epoch": 16.528, "grad_norm": 0.17325668036937714, "learning_rate": 8.71743486973948e-06, "loss": 0.0049, "step": 4132 }, { "epoch": 16.532, "grad_norm": 0.21818231046199799, "learning_rate": 8.70741482965932e-06, "loss": 0.008, "step": 4133 }, { "epoch": 16.536, "grad_norm": 0.12345243245363235, "learning_rate": 8.69739478957916e-06, "loss": 0.0066, "step": 4134 }, { "epoch": 16.54, "grad_norm": 0.21801672875881195, "learning_rate": 8.687374749498999e-06, "loss": 0.009, "step": 4135 }, { "epoch": 16.544, "grad_norm": 0.22668403387069702, "learning_rate": 8.677354709418838e-06, "loss": 0.0087, "step": 4136 }, { "epoch": 16.548000000000002, "grad_norm": 0.11883547157049179, "learning_rate": 8.667334669338678e-06, "loss": 0.0069, "step": 4137 }, { "epoch": 16.552, "grad_norm": 0.3186958134174347, "learning_rate": 8.657314629258518e-06, "loss": 0.0092, "step": 4138 }, { "epoch": 16.556, "grad_norm": 0.1984918713569641, "learning_rate": 8.647294589178357e-06, "loss": 0.0085, "step": 4139 }, { "epoch": 16.56, "grad_norm": 0.2220780849456787, 
"learning_rate": 8.637274549098197e-06, "loss": 0.0093, "step": 4140 }, { "epoch": 16.564, "grad_norm": 0.11941434442996979, "learning_rate": 8.627254509018036e-06, "loss": 0.0067, "step": 4141 }, { "epoch": 16.568, "grad_norm": 0.1803411990404129, "learning_rate": 8.617234468937876e-06, "loss": 0.0081, "step": 4142 }, { "epoch": 16.572, "grad_norm": 0.2058972716331482, "learning_rate": 8.607214428857715e-06, "loss": 0.0075, "step": 4143 }, { "epoch": 16.576, "grad_norm": 0.23917993903160095, "learning_rate": 8.597194388777557e-06, "loss": 0.0091, "step": 4144 }, { "epoch": 16.58, "grad_norm": 0.1594621241092682, "learning_rate": 8.587174348697395e-06, "loss": 0.0082, "step": 4145 }, { "epoch": 16.584, "grad_norm": 0.08639764040708542, "learning_rate": 8.577154308617234e-06, "loss": 0.004, "step": 4146 }, { "epoch": 16.588, "grad_norm": 0.2543061673641205, "learning_rate": 8.567134268537076e-06, "loss": 0.009, "step": 4147 }, { "epoch": 16.592, "grad_norm": 0.2126331478357315, "learning_rate": 8.557114228456913e-06, "loss": 0.0082, "step": 4148 }, { "epoch": 16.596, "grad_norm": 0.2556276023387909, "learning_rate": 8.547094188376755e-06, "loss": 0.0088, "step": 4149 }, { "epoch": 16.6, "grad_norm": 0.14997954666614532, "learning_rate": 8.537074148296594e-06, "loss": 0.008, "step": 4150 }, { "epoch": 16.604, "grad_norm": 0.18085747957229614, "learning_rate": 8.527054108216432e-06, "loss": 0.008, "step": 4151 }, { "epoch": 16.608, "grad_norm": 0.22281257808208466, "learning_rate": 8.517034068136273e-06, "loss": 0.0076, "step": 4152 }, { "epoch": 16.612, "grad_norm": 0.11172794550657272, "learning_rate": 8.507014028056113e-06, "loss": 0.0052, "step": 4153 }, { "epoch": 16.616, "grad_norm": 0.2109166383743286, "learning_rate": 8.496993987975953e-06, "loss": 0.0081, "step": 4154 }, { "epoch": 16.62, "grad_norm": 0.19735772907733917, "learning_rate": 8.486973947895792e-06, "loss": 0.0084, "step": 4155 }, { "epoch": 16.624, "grad_norm": 0.1670539826154709, 
"learning_rate": 8.47695390781563e-06, "loss": 0.0076, "step": 4156 }, { "epoch": 16.628, "grad_norm": 0.26279884576797485, "learning_rate": 8.466933867735471e-06, "loss": 0.0094, "step": 4157 }, { "epoch": 16.632, "grad_norm": 0.24270443618297577, "learning_rate": 8.456913827655311e-06, "loss": 0.0088, "step": 4158 }, { "epoch": 16.636, "grad_norm": 0.2702789604663849, "learning_rate": 8.44689378757515e-06, "loss": 0.0094, "step": 4159 }, { "epoch": 16.64, "grad_norm": 0.13629254698753357, "learning_rate": 8.43687374749499e-06, "loss": 0.007, "step": 4160 }, { "epoch": 16.644, "grad_norm": 0.1675589680671692, "learning_rate": 8.426853707414831e-06, "loss": 0.0093, "step": 4161 }, { "epoch": 16.648, "grad_norm": 0.10212656110525131, "learning_rate": 8.41683366733467e-06, "loss": 0.0048, "step": 4162 }, { "epoch": 16.652, "grad_norm": 0.39032217860221863, "learning_rate": 8.406813627254509e-06, "loss": 0.0074, "step": 4163 }, { "epoch": 16.656, "grad_norm": 0.20752616226673126, "learning_rate": 8.39679358717435e-06, "loss": 0.0095, "step": 4164 }, { "epoch": 16.66, "grad_norm": 0.15666551887989044, "learning_rate": 8.386773547094188e-06, "loss": 0.0083, "step": 4165 }, { "epoch": 16.664, "grad_norm": 0.17233043909072876, "learning_rate": 8.37675350701403e-06, "loss": 0.0076, "step": 4166 }, { "epoch": 16.668, "grad_norm": 0.17786544561386108, "learning_rate": 8.366733466933869e-06, "loss": 0.008, "step": 4167 }, { "epoch": 16.672, "grad_norm": 0.14327113330364227, "learning_rate": 8.356713426853707e-06, "loss": 0.0077, "step": 4168 }, { "epoch": 16.676, "grad_norm": 0.16869743168354034, "learning_rate": 8.346693386773548e-06, "loss": 0.0074, "step": 4169 }, { "epoch": 16.68, "grad_norm": 0.15478970110416412, "learning_rate": 8.336673346693386e-06, "loss": 0.0077, "step": 4170 }, { "epoch": 16.684, "grad_norm": 0.29708924889564514, "learning_rate": 8.326653306613227e-06, "loss": 0.009, "step": 4171 }, { "epoch": 16.688, "grad_norm": 0.1700715571641922, 
"learning_rate": 8.316633266533067e-06, "loss": 0.0086, "step": 4172 }, { "epoch": 16.692, "grad_norm": 0.16658703982830048, "learning_rate": 8.306613226452905e-06, "loss": 0.0086, "step": 4173 }, { "epoch": 16.696, "grad_norm": 0.18442490696907043, "learning_rate": 8.296593186372746e-06, "loss": 0.0084, "step": 4174 }, { "epoch": 16.7, "grad_norm": 0.2534857392311096, "learning_rate": 8.286573146292586e-06, "loss": 0.0094, "step": 4175 }, { "epoch": 16.704, "grad_norm": 0.193229541182518, "learning_rate": 8.276553106212425e-06, "loss": 0.0084, "step": 4176 }, { "epoch": 16.708, "grad_norm": 0.14113031327724457, "learning_rate": 8.266533066132265e-06, "loss": 0.0072, "step": 4177 }, { "epoch": 16.712, "grad_norm": 0.1453218162059784, "learning_rate": 8.256513026052104e-06, "loss": 0.0083, "step": 4178 }, { "epoch": 16.716, "grad_norm": 0.17128872871398926, "learning_rate": 8.246492985971944e-06, "loss": 0.0084, "step": 4179 }, { "epoch": 16.72, "grad_norm": 0.17818258702754974, "learning_rate": 8.236472945891784e-06, "loss": 0.0067, "step": 4180 }, { "epoch": 16.724, "grad_norm": 0.1808307021856308, "learning_rate": 8.226452905811623e-06, "loss": 0.0084, "step": 4181 }, { "epoch": 16.728, "grad_norm": 0.11610844731330872, "learning_rate": 8.216432865731463e-06, "loss": 0.0048, "step": 4182 }, { "epoch": 16.732, "grad_norm": 0.25724536180496216, "learning_rate": 8.206412825651302e-06, "loss": 0.0097, "step": 4183 }, { "epoch": 16.736, "grad_norm": 0.26306334137916565, "learning_rate": 8.196392785571142e-06, "loss": 0.0093, "step": 4184 }, { "epoch": 16.74, "grad_norm": 0.17018313705921173, "learning_rate": 8.186372745490982e-06, "loss": 0.0073, "step": 4185 }, { "epoch": 16.744, "grad_norm": 0.20504972338676453, "learning_rate": 8.176352705410823e-06, "loss": 0.0085, "step": 4186 }, { "epoch": 16.748, "grad_norm": 0.19006863236427307, "learning_rate": 8.16633266533066e-06, "loss": 0.0079, "step": 4187 }, { "epoch": 16.752, "grad_norm": 0.22111523151397705, 
"learning_rate": 8.156312625250502e-06, "loss": 0.009, "step": 4188 }, { "epoch": 16.756, "grad_norm": 0.17327627539634705, "learning_rate": 8.146292585170342e-06, "loss": 0.0079, "step": 4189 }, { "epoch": 16.76, "grad_norm": 0.20672562718391418, "learning_rate": 8.13627254509018e-06, "loss": 0.0095, "step": 4190 }, { "epoch": 16.764, "grad_norm": 0.3867163062095642, "learning_rate": 8.12625250501002e-06, "loss": 0.0091, "step": 4191 }, { "epoch": 16.768, "grad_norm": 0.22634045779705048, "learning_rate": 8.11623246492986e-06, "loss": 0.0076, "step": 4192 }, { "epoch": 16.772, "grad_norm": 0.27298519015312195, "learning_rate": 8.1062124248497e-06, "loss": 0.0097, "step": 4193 }, { "epoch": 16.776, "grad_norm": 0.14704295992851257, "learning_rate": 8.09619238476954e-06, "loss": 0.0082, "step": 4194 }, { "epoch": 16.78, "grad_norm": 0.12649138271808624, "learning_rate": 8.08617234468938e-06, "loss": 0.007, "step": 4195 }, { "epoch": 16.784, "grad_norm": 0.2581845223903656, "learning_rate": 8.076152304609219e-06, "loss": 0.0093, "step": 4196 }, { "epoch": 16.788, "grad_norm": 0.19466523826122284, "learning_rate": 8.066132264529058e-06, "loss": 0.008, "step": 4197 }, { "epoch": 16.792, "grad_norm": 0.2284821718931198, "learning_rate": 8.056112224448898e-06, "loss": 0.009, "step": 4198 }, { "epoch": 16.796, "grad_norm": 0.18178433179855347, "learning_rate": 8.046092184368738e-06, "loss": 0.0096, "step": 4199 }, { "epoch": 16.8, "grad_norm": 0.143863707780838, "learning_rate": 8.036072144288577e-06, "loss": 0.005, "step": 4200 }, { "epoch": 16.804, "grad_norm": 0.14635920524597168, "learning_rate": 8.026052104208417e-06, "loss": 0.0085, "step": 4201 }, { "epoch": 16.808, "grad_norm": 0.11327727884054184, "learning_rate": 8.016032064128256e-06, "loss": 0.0046, "step": 4202 }, { "epoch": 16.812, "grad_norm": 0.19041374325752258, "learning_rate": 8.006012024048098e-06, "loss": 0.0089, "step": 4203 }, { "epoch": 16.816, "grad_norm": 0.13083264231681824, "learning_rate": 
7.995991983967935e-06, "loss": 0.0072, "step": 4204 }, { "epoch": 16.82, "grad_norm": 0.21465055644512177, "learning_rate": 7.985971943887775e-06, "loss": 0.0083, "step": 4205 }, { "epoch": 16.824, "grad_norm": 0.20523211359977722, "learning_rate": 7.975951903807616e-06, "loss": 0.008, "step": 4206 }, { "epoch": 16.828, "grad_norm": 0.14747971296310425, "learning_rate": 7.965931863727454e-06, "loss": 0.0076, "step": 4207 }, { "epoch": 16.832, "grad_norm": 0.15081685781478882, "learning_rate": 7.955911823647296e-06, "loss": 0.0072, "step": 4208 }, { "epoch": 16.836, "grad_norm": 0.18735671043395996, "learning_rate": 7.945891783567135e-06, "loss": 0.0078, "step": 4209 }, { "epoch": 16.84, "grad_norm": 0.2500723898410797, "learning_rate": 7.935871743486973e-06, "loss": 0.0099, "step": 4210 }, { "epoch": 16.844, "grad_norm": 0.44487854838371277, "learning_rate": 7.925851703406814e-06, "loss": 0.0097, "step": 4211 }, { "epoch": 16.848, "grad_norm": 0.18985402584075928, "learning_rate": 7.915831663326654e-06, "loss": 0.0082, "step": 4212 }, { "epoch": 16.852, "grad_norm": 0.16304108500480652, "learning_rate": 7.905811623246493e-06, "loss": 0.0079, "step": 4213 }, { "epoch": 16.856, "grad_norm": 0.14347808063030243, "learning_rate": 7.895791583166333e-06, "loss": 0.008, "step": 4214 }, { "epoch": 16.86, "grad_norm": 0.18546195328235626, "learning_rate": 7.885771543086173e-06, "loss": 0.0085, "step": 4215 }, { "epoch": 16.864, "grad_norm": 0.16561359167099, "learning_rate": 7.875751503006012e-06, "loss": 0.0084, "step": 4216 }, { "epoch": 16.868, "grad_norm": 0.1679631471633911, "learning_rate": 7.865731462925852e-06, "loss": 0.0083, "step": 4217 }, { "epoch": 16.872, "grad_norm": 0.1606558859348297, "learning_rate": 7.855711422845691e-06, "loss": 0.0077, "step": 4218 }, { "epoch": 16.876, "grad_norm": 0.20817658305168152, "learning_rate": 7.845691382765531e-06, "loss": 0.0089, "step": 4219 }, { "epoch": 16.88, "grad_norm": 0.14732685685157776, "learning_rate": 
7.835671342685372e-06, "loss": 0.0077, "step": 4220 }, { "epoch": 16.884, "grad_norm": 0.20664165914058685, "learning_rate": 7.82565130260521e-06, "loss": 0.0081, "step": 4221 }, { "epoch": 16.888, "grad_norm": 0.20258696377277374, "learning_rate": 7.81563126252505e-06, "loss": 0.0093, "step": 4222 }, { "epoch": 16.892, "grad_norm": 0.19178465008735657, "learning_rate": 7.805611222444891e-06, "loss": 0.0082, "step": 4223 }, { "epoch": 16.896, "grad_norm": 0.30760034918785095, "learning_rate": 7.795591182364729e-06, "loss": 0.0096, "step": 4224 }, { "epoch": 16.9, "grad_norm": 0.17052769660949707, "learning_rate": 7.78557114228457e-06, "loss": 0.0086, "step": 4225 }, { "epoch": 16.904, "grad_norm": 0.1625903993844986, "learning_rate": 7.77555110220441e-06, "loss": 0.0082, "step": 4226 }, { "epoch": 16.908, "grad_norm": 0.17967870831489563, "learning_rate": 7.765531062124248e-06, "loss": 0.0087, "step": 4227 }, { "epoch": 16.912, "grad_norm": 0.2507075071334839, "learning_rate": 7.755511022044089e-06, "loss": 0.0091, "step": 4228 }, { "epoch": 16.916, "grad_norm": 0.33597537875175476, "learning_rate": 7.745490981963929e-06, "loss": 0.0086, "step": 4229 }, { "epoch": 16.92, "grad_norm": 0.15713933110237122, "learning_rate": 7.735470941883768e-06, "loss": 0.0075, "step": 4230 }, { "epoch": 16.924, "grad_norm": 0.23833885788917542, "learning_rate": 7.725450901803608e-06, "loss": 0.0087, "step": 4231 }, { "epoch": 16.928, "grad_norm": 0.18948806822299957, "learning_rate": 7.715430861723447e-06, "loss": 0.0083, "step": 4232 }, { "epoch": 16.932, "grad_norm": 0.23685553669929504, "learning_rate": 7.705410821643287e-06, "loss": 0.0086, "step": 4233 }, { "epoch": 16.936, "grad_norm": 0.22197188436985016, "learning_rate": 7.695390781563127e-06, "loss": 0.0092, "step": 4234 }, { "epoch": 16.94, "grad_norm": 0.14752846956253052, "learning_rate": 7.685370741482966e-06, "loss": 0.0075, "step": 4235 }, { "epoch": 16.944, "grad_norm": 0.16525785624980927, "learning_rate": 
7.675350701402806e-06, "loss": 0.0082, "step": 4236 }, { "epoch": 16.948, "grad_norm": 0.18567903339862823, "learning_rate": 7.665330661322645e-06, "loss": 0.0079, "step": 4237 }, { "epoch": 16.951999999999998, "grad_norm": 0.16079868376255035, "learning_rate": 7.655310621242485e-06, "loss": 0.0079, "step": 4238 }, { "epoch": 16.956, "grad_norm": 0.17427866160869598, "learning_rate": 7.645290581162324e-06, "loss": 0.0088, "step": 4239 }, { "epoch": 16.96, "grad_norm": 0.18866820633411407, "learning_rate": 7.635270541082166e-06, "loss": 0.0085, "step": 4240 }, { "epoch": 16.964, "grad_norm": 0.15932509303092957, "learning_rate": 7.6252505010020045e-06, "loss": 0.0074, "step": 4241 }, { "epoch": 16.968, "grad_norm": 0.2091926783323288, "learning_rate": 7.615230460921845e-06, "loss": 0.0093, "step": 4242 }, { "epoch": 16.972, "grad_norm": 0.2609868049621582, "learning_rate": 7.605210420841684e-06, "loss": 0.0103, "step": 4243 }, { "epoch": 16.976, "grad_norm": 0.1595424860715866, "learning_rate": 7.595190380761523e-06, "loss": 0.0079, "step": 4244 }, { "epoch": 16.98, "grad_norm": 0.1985984891653061, "learning_rate": 7.585170340681364e-06, "loss": 0.0089, "step": 4245 }, { "epoch": 16.984, "grad_norm": 0.17337600886821747, "learning_rate": 7.5751503006012024e-06, "loss": 0.0074, "step": 4246 }, { "epoch": 16.988, "grad_norm": 0.18920986354351044, "learning_rate": 7.565130260521043e-06, "loss": 0.0083, "step": 4247 }, { "epoch": 16.992, "grad_norm": 0.17387863993644714, "learning_rate": 7.5551102204408825e-06, "loss": 0.0085, "step": 4248 }, { "epoch": 16.996, "grad_norm": 0.20932914316654205, "learning_rate": 7.545090180360721e-06, "loss": 0.0081, "step": 4249 }, { "epoch": 17.0, "grad_norm": 0.18405738472938538, "learning_rate": 7.535070140280562e-06, "loss": 0.0078, "step": 4250 }, { "epoch": 17.004, "grad_norm": 0.12063789367675781, "learning_rate": 7.525050100200401e-06, "loss": 0.0067, "step": 4251 }, { "epoch": 17.008, "grad_norm": 0.13446062803268433, 
"learning_rate": 7.515030060120242e-06, "loss": 0.0067, "step": 4252 }, { "epoch": 17.012, "grad_norm": 0.13746428489685059, "learning_rate": 7.50501002004008e-06, "loss": 0.0073, "step": 4253 }, { "epoch": 17.016, "grad_norm": 0.17416371405124664, "learning_rate": 7.49498997995992e-06, "loss": 0.0079, "step": 4254 }, { "epoch": 17.02, "grad_norm": 0.19402289390563965, "learning_rate": 7.48496993987976e-06, "loss": 0.0073, "step": 4255 }, { "epoch": 17.024, "grad_norm": 0.14378122985363007, "learning_rate": 7.474949899799599e-06, "loss": 0.0065, "step": 4256 }, { "epoch": 17.028, "grad_norm": 0.14554685354232788, "learning_rate": 7.46492985971944e-06, "loss": 0.0069, "step": 4257 }, { "epoch": 17.032, "grad_norm": 0.14952172338962555, "learning_rate": 7.454909819639279e-06, "loss": 0.0075, "step": 4258 }, { "epoch": 17.036, "grad_norm": 0.08101173490285873, "learning_rate": 7.444889779559118e-06, "loss": 0.0038, "step": 4259 }, { "epoch": 17.04, "grad_norm": 0.13694711029529572, "learning_rate": 7.434869739478958e-06, "loss": 0.0065, "step": 4260 }, { "epoch": 17.044, "grad_norm": 0.1457206904888153, "learning_rate": 7.424849699398798e-06, "loss": 0.0075, "step": 4261 }, { "epoch": 17.048, "grad_norm": 0.1786690503358841, "learning_rate": 7.414829659318638e-06, "loss": 0.0067, "step": 4262 }, { "epoch": 17.052, "grad_norm": 0.16678613424301147, "learning_rate": 7.404809619238477e-06, "loss": 0.007, "step": 4263 }, { "epoch": 17.056, "grad_norm": 0.12600558996200562, "learning_rate": 7.394789579158317e-06, "loss": 0.0074, "step": 4264 }, { "epoch": 17.06, "grad_norm": 0.1651485711336136, "learning_rate": 7.384769539078157e-06, "loss": 0.0077, "step": 4265 }, { "epoch": 17.064, "grad_norm": 0.12365109473466873, "learning_rate": 7.374749498997996e-06, "loss": 0.0065, "step": 4266 }, { "epoch": 17.068, "grad_norm": 0.16177420318126678, "learning_rate": 7.364729458917836e-06, "loss": 0.007, "step": 4267 }, { "epoch": 17.072, "grad_norm": 0.1336231380701065, 
"learning_rate": 7.354709418837676e-06, "loss": 0.0067, "step": 4268 }, { "epoch": 17.076, "grad_norm": 0.12120088934898376, "learning_rate": 7.344689378757516e-06, "loss": 0.0044, "step": 4269 }, { "epoch": 17.08, "grad_norm": 0.1796593815088272, "learning_rate": 7.334669338677355e-06, "loss": 0.0076, "step": 4270 }, { "epoch": 17.084, "grad_norm": 0.158540278673172, "learning_rate": 7.324649298597195e-06, "loss": 0.0069, "step": 4271 }, { "epoch": 17.088, "grad_norm": 0.19492676854133606, "learning_rate": 7.314629258517035e-06, "loss": 0.0073, "step": 4272 }, { "epoch": 17.092, "grad_norm": 0.19501705467700958, "learning_rate": 7.304609218436874e-06, "loss": 0.008, "step": 4273 }, { "epoch": 17.096, "grad_norm": 0.16361992061138153, "learning_rate": 7.294589178356714e-06, "loss": 0.0068, "step": 4274 }, { "epoch": 17.1, "grad_norm": 0.14692838490009308, "learning_rate": 7.284569138276554e-06, "loss": 0.007, "step": 4275 }, { "epoch": 17.104, "grad_norm": 0.1860724836587906, "learning_rate": 7.274549098196393e-06, "loss": 0.0079, "step": 4276 }, { "epoch": 17.108, "grad_norm": 0.2329706847667694, "learning_rate": 7.264529058116233e-06, "loss": 0.0071, "step": 4277 }, { "epoch": 17.112, "grad_norm": 0.16117851436138153, "learning_rate": 7.254509018036073e-06, "loss": 0.0077, "step": 4278 }, { "epoch": 17.116, "grad_norm": 0.15216809511184692, "learning_rate": 7.244488977955913e-06, "loss": 0.0071, "step": 4279 }, { "epoch": 17.12, "grad_norm": 0.19427397847175598, "learning_rate": 7.234468937875752e-06, "loss": 0.0074, "step": 4280 }, { "epoch": 17.124, "grad_norm": 0.23492863774299622, "learning_rate": 7.2244488977955906e-06, "loss": 0.0078, "step": 4281 }, { "epoch": 17.128, "grad_norm": 0.1491718888282776, "learning_rate": 7.214428857715432e-06, "loss": 0.0064, "step": 4282 }, { "epoch": 17.132, "grad_norm": 0.14911611378192902, "learning_rate": 7.204408817635271e-06, "loss": 0.0069, "step": 4283 }, { "epoch": 17.136, "grad_norm": 0.17563241720199585, 
"learning_rate": 7.194388777555111e-06, "loss": 0.0076, "step": 4284 }, { "epoch": 17.14, "grad_norm": 0.1660158634185791, "learning_rate": 7.184368737474951e-06, "loss": 0.007, "step": 4285 }, { "epoch": 17.144, "grad_norm": 0.1971948891878128, "learning_rate": 7.174348697394789e-06, "loss": 0.0067, "step": 4286 }, { "epoch": 17.148, "grad_norm": 0.15479083359241486, "learning_rate": 7.16432865731463e-06, "loss": 0.0066, "step": 4287 }, { "epoch": 17.152, "grad_norm": 0.20346534252166748, "learning_rate": 7.1543086172344685e-06, "loss": 0.0068, "step": 4288 }, { "epoch": 17.156, "grad_norm": 0.22112035751342773, "learning_rate": 7.14428857715431e-06, "loss": 0.0071, "step": 4289 }, { "epoch": 17.16, "grad_norm": 0.17985723912715912, "learning_rate": 7.1342685370741486e-06, "loss": 0.0081, "step": 4290 }, { "epoch": 17.164, "grad_norm": 0.20944520831108093, "learning_rate": 7.124248496993987e-06, "loss": 0.007, "step": 4291 }, { "epoch": 17.168, "grad_norm": 0.1437358558177948, "learning_rate": 7.114228456913829e-06, "loss": 0.0084, "step": 4292 }, { "epoch": 17.172, "grad_norm": 0.14544442296028137, "learning_rate": 7.104208416833667e-06, "loss": 0.0047, "step": 4293 }, { "epoch": 17.176, "grad_norm": 0.19183100759983063, "learning_rate": 7.094188376753508e-06, "loss": 0.0074, "step": 4294 }, { "epoch": 17.18, "grad_norm": 0.16251499950885773, "learning_rate": 7.0841683366733465e-06, "loss": 0.0068, "step": 4295 }, { "epoch": 17.184, "grad_norm": 0.16117817163467407, "learning_rate": 7.074148296593188e-06, "loss": 0.0069, "step": 4296 }, { "epoch": 17.188, "grad_norm": 0.15662899613380432, "learning_rate": 7.0641282565130265e-06, "loss": 0.0076, "step": 4297 }, { "epoch": 17.192, "grad_norm": 0.21028119325637817, "learning_rate": 7.054108216432865e-06, "loss": 0.0078, "step": 4298 }, { "epoch": 17.196, "grad_norm": 0.19951249659061432, "learning_rate": 7.044088176352706e-06, "loss": 0.0075, "step": 4299 }, { "epoch": 17.2, "grad_norm": 0.17216528952121735, 
"learning_rate": 7.034068136272545e-06, "loss": 0.0069, "step": 4300 }, { "epoch": 17.204, "grad_norm": 0.14699934422969818, "learning_rate": 7.024048096192386e-06, "loss": 0.0045, "step": 4301 }, { "epoch": 17.208, "grad_norm": 0.15212714672088623, "learning_rate": 7.0140280561122245e-06, "loss": 0.0079, "step": 4302 }, { "epoch": 17.212, "grad_norm": 0.1797187179327011, "learning_rate": 7.004008016032064e-06, "loss": 0.0071, "step": 4303 }, { "epoch": 17.216, "grad_norm": 0.13188792765140533, "learning_rate": 6.9939879759519045e-06, "loss": 0.0068, "step": 4304 }, { "epoch": 17.22, "grad_norm": 0.1433761864900589, "learning_rate": 6.983967935871743e-06, "loss": 0.0067, "step": 4305 }, { "epoch": 17.224, "grad_norm": 0.22716593742370605, "learning_rate": 6.973947895791584e-06, "loss": 0.0087, "step": 4306 }, { "epoch": 17.228, "grad_norm": 0.09545940160751343, "learning_rate": 6.963927855711423e-06, "loss": 0.0042, "step": 4307 }, { "epoch": 17.232, "grad_norm": 0.15111321210861206, "learning_rate": 6.953907815631262e-06, "loss": 0.0072, "step": 4308 }, { "epoch": 17.236, "grad_norm": 0.14687678217887878, "learning_rate": 6.9438877755511024e-06, "loss": 0.0069, "step": 4309 }, { "epoch": 17.24, "grad_norm": 0.148050919175148, "learning_rate": 6.933867735470942e-06, "loss": 0.0068, "step": 4310 }, { "epoch": 17.244, "grad_norm": 0.12253053486347198, "learning_rate": 6.9238476953907825e-06, "loss": 0.0066, "step": 4311 }, { "epoch": 17.248, "grad_norm": 0.19216535985469818, "learning_rate": 6.913827655310621e-06, "loss": 0.0082, "step": 4312 }, { "epoch": 17.252, "grad_norm": 0.22438839077949524, "learning_rate": 6.903807615230461e-06, "loss": 0.0075, "step": 4313 }, { "epoch": 17.256, "grad_norm": 0.2512076497077942, "learning_rate": 6.893787575150301e-06, "loss": 0.0082, "step": 4314 }, { "epoch": 17.26, "grad_norm": 0.15971721708774567, "learning_rate": 6.88376753507014e-06, "loss": 0.0068, "step": 4315 }, { "epoch": 17.264, "grad_norm": 0.14771561324596405, 
"learning_rate": 6.87374749498998e-06, "loss": 0.0078, "step": 4316 }, { "epoch": 17.268, "grad_norm": 0.1611126810312271, "learning_rate": 6.86372745490982e-06, "loss": 0.0083, "step": 4317 }, { "epoch": 17.272, "grad_norm": 0.2246340811252594, "learning_rate": 6.853707414829659e-06, "loss": 0.0079, "step": 4318 }, { "epoch": 17.276, "grad_norm": 0.13150276243686676, "learning_rate": 6.843687374749499e-06, "loss": 0.0048, "step": 4319 }, { "epoch": 17.28, "grad_norm": 0.12884242832660675, "learning_rate": 6.833667334669339e-06, "loss": 0.0062, "step": 4320 }, { "epoch": 17.284, "grad_norm": 0.16102537512779236, "learning_rate": 6.823647294589179e-06, "loss": 0.0063, "step": 4321 }, { "epoch": 17.288, "grad_norm": 0.21450555324554443, "learning_rate": 6.813627254509018e-06, "loss": 0.0075, "step": 4322 }, { "epoch": 17.292, "grad_norm": 0.21209336817264557, "learning_rate": 6.803607214428858e-06, "loss": 0.0076, "step": 4323 }, { "epoch": 17.296, "grad_norm": 0.1917915791273117, "learning_rate": 6.793587174348698e-06, "loss": 0.0074, "step": 4324 }, { "epoch": 17.3, "grad_norm": 0.14592121541500092, "learning_rate": 6.783567134268537e-06, "loss": 0.0071, "step": 4325 }, { "epoch": 17.304, "grad_norm": 0.1582149863243103, "learning_rate": 6.773547094188377e-06, "loss": 0.007, "step": 4326 }, { "epoch": 17.308, "grad_norm": 0.21334248781204224, "learning_rate": 6.763527054108217e-06, "loss": 0.0081, "step": 4327 }, { "epoch": 17.312, "grad_norm": 0.15731792151927948, "learning_rate": 6.753507014028057e-06, "loss": 0.0068, "step": 4328 }, { "epoch": 17.316, "grad_norm": 0.15384608507156372, "learning_rate": 6.743486973947896e-06, "loss": 0.0079, "step": 4329 }, { "epoch": 17.32, "grad_norm": 0.23401755094528198, "learning_rate": 6.7334669338677355e-06, "loss": 0.008, "step": 4330 }, { "epoch": 17.324, "grad_norm": 0.16333438456058502, "learning_rate": 6.723446893787576e-06, "loss": 0.0076, "step": 4331 }, { "epoch": 17.328, "grad_norm": 0.25040316581726074, 
"learning_rate": 6.713426853707415e-06, "loss": 0.0085, "step": 4332 }, { "epoch": 17.332, "grad_norm": 0.22485579550266266, "learning_rate": 6.703406813627255e-06, "loss": 0.0087, "step": 4333 }, { "epoch": 17.336, "grad_norm": 0.16370800137519836, "learning_rate": 6.693386773547095e-06, "loss": 0.0076, "step": 4334 }, { "epoch": 17.34, "grad_norm": 0.18383802473545074, "learning_rate": 6.6833667334669334e-06, "loss": 0.0074, "step": 4335 }, { "epoch": 17.344, "grad_norm": 0.13991597294807434, "learning_rate": 6.673346693386774e-06, "loss": 0.0071, "step": 4336 }, { "epoch": 17.348, "grad_norm": 0.24963268637657166, "learning_rate": 6.6633266533066135e-06, "loss": 0.0084, "step": 4337 }, { "epoch": 17.352, "grad_norm": 0.22435139119625092, "learning_rate": 6.653306613226454e-06, "loss": 0.0079, "step": 4338 }, { "epoch": 17.356, "grad_norm": 0.2332351952791214, "learning_rate": 6.643286573146293e-06, "loss": 0.0082, "step": 4339 }, { "epoch": 17.36, "grad_norm": 0.1963132917881012, "learning_rate": 6.633266533066132e-06, "loss": 0.0069, "step": 4340 }, { "epoch": 17.364, "grad_norm": 0.21712207794189453, "learning_rate": 6.623246492985973e-06, "loss": 0.0086, "step": 4341 }, { "epoch": 17.368, "grad_norm": 0.12130995839834213, "learning_rate": 6.613226452905811e-06, "loss": 0.0038, "step": 4342 }, { "epoch": 17.372, "grad_norm": 0.2439410537481308, "learning_rate": 6.603206412825652e-06, "loss": 0.0081, "step": 4343 }, { "epoch": 17.376, "grad_norm": 0.17727676033973694, "learning_rate": 6.5931863727454914e-06, "loss": 0.0071, "step": 4344 }, { "epoch": 17.38, "grad_norm": 0.15301989018917084, "learning_rate": 6.58316633266533e-06, "loss": 0.0065, "step": 4345 }, { "epoch": 17.384, "grad_norm": 0.17886261641979218, "learning_rate": 6.573146292585171e-06, "loss": 0.0079, "step": 4346 }, { "epoch": 17.388, "grad_norm": 0.21048425137996674, "learning_rate": 6.56312625250501e-06, "loss": 0.0077, "step": 4347 }, { "epoch": 17.392, "grad_norm": 0.23739394545555115, 
"learning_rate": 6.553106212424851e-06, "loss": 0.0077, "step": 4348 }, { "epoch": 17.396, "grad_norm": 0.22822335362434387, "learning_rate": 6.543086172344689e-06, "loss": 0.0078, "step": 4349 }, { "epoch": 17.4, "grad_norm": 0.24672290682792664, "learning_rate": 6.533066132264529e-06, "loss": 0.0077, "step": 4350 }, { "epoch": 17.404, "grad_norm": 0.22589755058288574, "learning_rate": 6.523046092184369e-06, "loss": 0.008, "step": 4351 }, { "epoch": 17.408, "grad_norm": 0.1355731040239334, "learning_rate": 6.513026052104208e-06, "loss": 0.0076, "step": 4352 }, { "epoch": 17.412, "grad_norm": 0.15726511180400848, "learning_rate": 6.5030060120240486e-06, "loss": 0.007, "step": 4353 }, { "epoch": 17.416, "grad_norm": 0.14194290339946747, "learning_rate": 6.492985971943888e-06, "loss": 0.0069, "step": 4354 }, { "epoch": 17.42, "grad_norm": 0.15629169344902039, "learning_rate": 6.482965931863729e-06, "loss": 0.0077, "step": 4355 }, { "epoch": 17.424, "grad_norm": 0.20658808946609497, "learning_rate": 6.472945891783567e-06, "loss": 0.0071, "step": 4356 }, { "epoch": 17.428, "grad_norm": 0.13976025581359863, "learning_rate": 6.462925851703407e-06, "loss": 0.0074, "step": 4357 }, { "epoch": 17.432, "grad_norm": 0.14922749996185303, "learning_rate": 6.452905811623247e-06, "loss": 0.0075, "step": 4358 }, { "epoch": 17.436, "grad_norm": 0.31609266996383667, "learning_rate": 6.442885771543086e-06, "loss": 0.0083, "step": 4359 }, { "epoch": 17.44, "grad_norm": 0.17123810946941376, "learning_rate": 6.4328657314629265e-06, "loss": 0.0068, "step": 4360 }, { "epoch": 17.444, "grad_norm": 0.2661895155906677, "learning_rate": 6.422845691382766e-06, "loss": 0.0085, "step": 4361 }, { "epoch": 17.448, "grad_norm": 0.20732519030570984, "learning_rate": 6.412825651302605e-06, "loss": 0.0082, "step": 4362 }, { "epoch": 17.452, "grad_norm": 0.20721253752708435, "learning_rate": 6.402805611222445e-06, "loss": 0.0087, "step": 4363 }, { "epoch": 17.456, "grad_norm": 0.15347276628017426, 
"learning_rate": 6.392785571142285e-06, "loss": 0.0063, "step": 4364 }, { "epoch": 17.46, "grad_norm": 0.19956089556217194, "learning_rate": 6.382765531062125e-06, "loss": 0.0082, "step": 4365 }, { "epoch": 17.464, "grad_norm": 0.15364766120910645, "learning_rate": 6.372745490981964e-06, "loss": 0.0075, "step": 4366 }, { "epoch": 17.468, "grad_norm": 0.48688241839408875, "learning_rate": 6.362725450901804e-06, "loss": 0.0109, "step": 4367 }, { "epoch": 17.472, "grad_norm": 0.19626305997371674, "learning_rate": 6.352705410821644e-06, "loss": 0.0085, "step": 4368 }, { "epoch": 17.476, "grad_norm": 0.21328666806221008, "learning_rate": 6.342685370741483e-06, "loss": 0.0087, "step": 4369 }, { "epoch": 17.48, "grad_norm": 0.2773035764694214, "learning_rate": 6.332665330661323e-06, "loss": 0.0107, "step": 4370 }, { "epoch": 17.484, "grad_norm": 0.15668006241321564, "learning_rate": 6.322645290581163e-06, "loss": 0.0075, "step": 4371 }, { "epoch": 17.488, "grad_norm": 0.20465034246444702, "learning_rate": 6.312625250501002e-06, "loss": 0.0079, "step": 4372 }, { "epoch": 17.492, "grad_norm": 0.1748172491788864, "learning_rate": 6.302605210420842e-06, "loss": 0.0074, "step": 4373 }, { "epoch": 17.496, "grad_norm": 0.13092002272605896, "learning_rate": 6.292585170340682e-06, "loss": 0.0077, "step": 4374 }, { "epoch": 17.5, "grad_norm": 0.13474947214126587, "learning_rate": 6.282565130260522e-06, "loss": 0.0064, "step": 4375 }, { "epoch": 17.504, "grad_norm": 0.17254269123077393, "learning_rate": 6.272545090180361e-06, "loss": 0.0076, "step": 4376 }, { "epoch": 17.508, "grad_norm": 0.1626187562942505, "learning_rate": 6.2625250501002e-06, "loss": 0.008, "step": 4377 }, { "epoch": 17.512, "grad_norm": 0.12236899137496948, "learning_rate": 6.252505010020041e-06, "loss": 0.0073, "step": 4378 }, { "epoch": 17.516, "grad_norm": 0.1585134118795395, "learning_rate": 6.24248496993988e-06, "loss": 0.0067, "step": 4379 }, { "epoch": 17.52, "grad_norm": 0.2114783227443695, 
"learning_rate": 6.232464929859719e-06, "loss": 0.009, "step": 4380 }, { "epoch": 17.524, "grad_norm": 0.17043016850948334, "learning_rate": 6.22244488977956e-06, "loss": 0.0076, "step": 4381 }, { "epoch": 17.528, "grad_norm": 0.2268737554550171, "learning_rate": 6.212424849699399e-06, "loss": 0.0084, "step": 4382 }, { "epoch": 17.532, "grad_norm": 0.18764908611774445, "learning_rate": 6.202404809619239e-06, "loss": 0.0078, "step": 4383 }, { "epoch": 17.536, "grad_norm": 0.1297231912612915, "learning_rate": 6.192384769539078e-06, "loss": 0.0069, "step": 4384 }, { "epoch": 17.54, "grad_norm": 0.134876549243927, "learning_rate": 6.182364729458918e-06, "loss": 0.0074, "step": 4385 }, { "epoch": 17.544, "grad_norm": 0.1665986031293869, "learning_rate": 6.1723446893787575e-06, "loss": 0.008, "step": 4386 }, { "epoch": 17.548000000000002, "grad_norm": 0.17875370383262634, "learning_rate": 6.162324649298597e-06, "loss": 0.0077, "step": 4387 }, { "epoch": 17.552, "grad_norm": 0.11975561082363129, "learning_rate": 6.1523046092184376e-06, "loss": 0.0048, "step": 4388 }, { "epoch": 17.556, "grad_norm": 0.15035223960876465, "learning_rate": 6.142284569138277e-06, "loss": 0.0087, "step": 4389 }, { "epoch": 17.56, "grad_norm": 0.1503482609987259, "learning_rate": 6.132264529058117e-06, "loss": 0.0077, "step": 4390 }, { "epoch": 17.564, "grad_norm": 0.12905697524547577, "learning_rate": 6.122244488977956e-06, "loss": 0.0065, "step": 4391 }, { "epoch": 17.568, "grad_norm": 0.13570000231266022, "learning_rate": 6.112224448897796e-06, "loss": 0.0078, "step": 4392 }, { "epoch": 17.572, "grad_norm": 0.12492106854915619, "learning_rate": 6.1022044088176355e-06, "loss": 0.0068, "step": 4393 }, { "epoch": 17.576, "grad_norm": 0.23991592228412628, "learning_rate": 6.092184368737475e-06, "loss": 0.0086, "step": 4394 }, { "epoch": 17.58, "grad_norm": 0.15174011886119843, "learning_rate": 6.0821643286573155e-06, "loss": 0.0077, "step": 4395 }, { "epoch": 17.584, "grad_norm": 
0.16359587013721466, "learning_rate": 6.072144288577154e-06, "loss": 0.0077, "step": 4396 }, { "epoch": 17.588, "grad_norm": 0.2310338318347931, "learning_rate": 6.062124248496994e-06, "loss": 0.0084, "step": 4397 }, { "epoch": 17.592, "grad_norm": 0.1341230720281601, "learning_rate": 6.052104208416834e-06, "loss": 0.0083, "step": 4398 }, { "epoch": 17.596, "grad_norm": 0.21559903025627136, "learning_rate": 6.042084168336674e-06, "loss": 0.0089, "step": 4399 }, { "epoch": 17.6, "grad_norm": 0.15158264338970184, "learning_rate": 6.0320641282565135e-06, "loss": 0.0073, "step": 4400 }, { "epoch": 17.604, "grad_norm": 0.2421329915523529, "learning_rate": 6.022044088176353e-06, "loss": 0.0081, "step": 4401 }, { "epoch": 17.608, "grad_norm": 0.2801038920879364, "learning_rate": 6.012024048096193e-06, "loss": 0.0089, "step": 4402 }, { "epoch": 17.612, "grad_norm": 0.1329454481601715, "learning_rate": 6.002004008016032e-06, "loss": 0.0078, "step": 4403 }, { "epoch": 17.616, "grad_norm": 0.23923999071121216, "learning_rate": 5.991983967935872e-06, "loss": 0.0086, "step": 4404 }, { "epoch": 17.62, "grad_norm": 0.15022501349449158, "learning_rate": 5.981963927855712e-06, "loss": 0.0067, "step": 4405 }, { "epoch": 17.624, "grad_norm": 0.1838655024766922, "learning_rate": 5.971943887775552e-06, "loss": 0.0073, "step": 4406 }, { "epoch": 17.628, "grad_norm": 0.15019722282886505, "learning_rate": 5.961923847695391e-06, "loss": 0.0072, "step": 4407 }, { "epoch": 17.632, "grad_norm": 0.2248644381761551, "learning_rate": 5.95190380761523e-06, "loss": 0.0085, "step": 4408 }, { "epoch": 17.636, "grad_norm": 0.25571855902671814, "learning_rate": 5.941883767535071e-06, "loss": 0.0098, "step": 4409 }, { "epoch": 17.64, "grad_norm": 0.14798125624656677, "learning_rate": 5.93186372745491e-06, "loss": 0.0076, "step": 4410 }, { "epoch": 17.644, "grad_norm": 0.199660062789917, "learning_rate": 5.92184368737475e-06, "loss": 0.0085, "step": 4411 }, { "epoch": 17.648, "grad_norm": 
0.15380799770355225, "learning_rate": 5.911823647294589e-06, "loss": 0.008, "step": 4412 }, { "epoch": 17.652, "grad_norm": 0.20436356961727142, "learning_rate": 5.901803607214429e-06, "loss": 0.0054, "step": 4413 }, { "epoch": 17.656, "grad_norm": 0.22345702350139618, "learning_rate": 5.8917835671342686e-06, "loss": 0.0082, "step": 4414 }, { "epoch": 17.66, "grad_norm": 0.17874562740325928, "learning_rate": 5.881763527054108e-06, "loss": 0.0084, "step": 4415 }, { "epoch": 17.664, "grad_norm": 0.14801347255706787, "learning_rate": 5.871743486973949e-06, "loss": 0.0072, "step": 4416 }, { "epoch": 17.668, "grad_norm": 0.20537561178207397, "learning_rate": 5.861723446893788e-06, "loss": 0.0079, "step": 4417 }, { "epoch": 17.672, "grad_norm": 0.1717330515384674, "learning_rate": 5.851703406813627e-06, "loss": 0.0077, "step": 4418 }, { "epoch": 17.676, "grad_norm": 0.13762032985687256, "learning_rate": 5.841683366733467e-06, "loss": 0.0069, "step": 4419 }, { "epoch": 17.68, "grad_norm": 0.16380003094673157, "learning_rate": 5.831663326653307e-06, "loss": 0.0077, "step": 4420 }, { "epoch": 17.684, "grad_norm": 0.20809531211853027, "learning_rate": 5.8216432865731465e-06, "loss": 0.0084, "step": 4421 }, { "epoch": 17.688, "grad_norm": 0.24906416237354279, "learning_rate": 5.811623246492986e-06, "loss": 0.0077, "step": 4422 }, { "epoch": 17.692, "grad_norm": 0.15348020195960999, "learning_rate": 5.801603206412826e-06, "loss": 0.0081, "step": 4423 }, { "epoch": 17.696, "grad_norm": 0.15915721654891968, "learning_rate": 5.791583166332665e-06, "loss": 0.0071, "step": 4424 }, { "epoch": 17.7, "grad_norm": 0.1927608698606491, "learning_rate": 5.781563126252505e-06, "loss": 0.0071, "step": 4425 }, { "epoch": 17.704, "grad_norm": 0.23460881412029266, "learning_rate": 5.771543086172345e-06, "loss": 0.0077, "step": 4426 }, { "epoch": 17.708, "grad_norm": 0.1741648018360138, "learning_rate": 5.761523046092185e-06, "loss": 0.008, "step": 4427 }, { "epoch": 17.712, "grad_norm": 
0.13403233885765076, "learning_rate": 5.7515030060120245e-06, "loss": 0.005, "step": 4428 }, { "epoch": 17.716, "grad_norm": 0.0899837389588356, "learning_rate": 5.741482965931864e-06, "loss": 0.0045, "step": 4429 }, { "epoch": 17.72, "grad_norm": 0.15737628936767578, "learning_rate": 5.731462925851704e-06, "loss": 0.0073, "step": 4430 }, { "epoch": 17.724, "grad_norm": 0.14937400817871094, "learning_rate": 5.721442885771543e-06, "loss": 0.0081, "step": 4431 }, { "epoch": 17.728, "grad_norm": 0.12557938694953918, "learning_rate": 5.711422845691383e-06, "loss": 0.0067, "step": 4432 }, { "epoch": 17.732, "grad_norm": 0.20801903307437897, "learning_rate": 5.701402805611223e-06, "loss": 0.0083, "step": 4433 }, { "epoch": 17.736, "grad_norm": 0.17545443773269653, "learning_rate": 5.691382765531062e-06, "loss": 0.0076, "step": 4434 }, { "epoch": 17.74, "grad_norm": 0.16065756976604462, "learning_rate": 5.681362725450902e-06, "loss": 0.0078, "step": 4435 }, { "epoch": 17.744, "grad_norm": 0.15100617706775665, "learning_rate": 5.671342685370742e-06, "loss": 0.0067, "step": 4436 }, { "epoch": 17.748, "grad_norm": 0.17027676105499268, "learning_rate": 5.661322645290582e-06, "loss": 0.0078, "step": 4437 }, { "epoch": 17.752, "grad_norm": 0.19848494231700897, "learning_rate": 5.651302605210421e-06, "loss": 0.0075, "step": 4438 }, { "epoch": 17.756, "grad_norm": 0.15401864051818848, "learning_rate": 5.641282565130261e-06, "loss": 0.0073, "step": 4439 }, { "epoch": 17.76, "grad_norm": 0.21219518780708313, "learning_rate": 5.6312625250501e-06, "loss": 0.0081, "step": 4440 }, { "epoch": 17.764, "grad_norm": 0.09641958028078079, "learning_rate": 5.62124248496994e-06, "loss": 0.0047, "step": 4441 }, { "epoch": 17.768, "grad_norm": 0.15153850615024567, "learning_rate": 5.61122244488978e-06, "loss": 0.0073, "step": 4442 }, { "epoch": 17.772, "grad_norm": 0.20264235138893127, "learning_rate": 5.60120240480962e-06, "loss": 0.0078, "step": 4443 }, { "epoch": 17.776, "grad_norm": 
0.14838512241840363, "learning_rate": 5.59118236472946e-06, "loss": 0.0071, "step": 4444 }, { "epoch": 17.78, "grad_norm": 0.1825137883424759, "learning_rate": 5.581162324649298e-06, "loss": 0.0068, "step": 4445 }, { "epoch": 17.784, "grad_norm": 0.14169864356517792, "learning_rate": 5.571142284569139e-06, "loss": 0.0074, "step": 4446 }, { "epoch": 17.788, "grad_norm": 0.20076483488082886, "learning_rate": 5.561122244488978e-06, "loss": 0.0085, "step": 4447 }, { "epoch": 17.792, "grad_norm": 0.23179060220718384, "learning_rate": 5.551102204408818e-06, "loss": 0.0081, "step": 4448 }, { "epoch": 17.796, "grad_norm": 0.15632283687591553, "learning_rate": 5.5410821643286575e-06, "loss": 0.0075, "step": 4449 }, { "epoch": 17.8, "grad_norm": 0.16295547783374786, "learning_rate": 5.531062124248497e-06, "loss": 0.0078, "step": 4450 }, { "epoch": 17.804, "grad_norm": 0.15852996706962585, "learning_rate": 5.521042084168337e-06, "loss": 0.0076, "step": 4451 }, { "epoch": 17.808, "grad_norm": 0.24899086356163025, "learning_rate": 5.511022044088176e-06, "loss": 0.008, "step": 4452 }, { "epoch": 17.812, "grad_norm": 0.26219606399536133, "learning_rate": 5.501002004008017e-06, "loss": 0.0079, "step": 4453 }, { "epoch": 17.816, "grad_norm": 0.15716837346553802, "learning_rate": 5.490981963927856e-06, "loss": 0.0068, "step": 4454 }, { "epoch": 17.82, "grad_norm": 0.21653732657432556, "learning_rate": 5.480961923847696e-06, "loss": 0.008, "step": 4455 }, { "epoch": 17.824, "grad_norm": 0.1806323230266571, "learning_rate": 5.4709418837675355e-06, "loss": 0.0086, "step": 4456 }, { "epoch": 17.828, "grad_norm": 0.2788422703742981, "learning_rate": 5.460921843687375e-06, "loss": 0.009, "step": 4457 }, { "epoch": 17.832, "grad_norm": 0.16361011564731598, "learning_rate": 5.450901803607215e-06, "loss": 0.0073, "step": 4458 }, { "epoch": 17.836, "grad_norm": 0.24502210319042206, "learning_rate": 5.440881763527054e-06, "loss": 0.0088, "step": 4459 }, { "epoch": 17.84, "grad_norm": 
0.18323510885238647, "learning_rate": 5.430861723446895e-06, "loss": 0.0073, "step": 4460 }, { "epoch": 17.844, "grad_norm": 0.1967632919549942, "learning_rate": 5.4208416833667335e-06, "loss": 0.0085, "step": 4461 }, { "epoch": 17.848, "grad_norm": 0.19177305698394775, "learning_rate": 5.410821643286573e-06, "loss": 0.0079, "step": 4462 }, { "epoch": 17.852, "grad_norm": 0.1529979109764099, "learning_rate": 5.4008016032064135e-06, "loss": 0.0077, "step": 4463 }, { "epoch": 17.856, "grad_norm": 0.16155672073364258, "learning_rate": 5.390781563126253e-06, "loss": 0.0083, "step": 4464 }, { "epoch": 17.86, "grad_norm": 0.1222570464015007, "learning_rate": 5.380761523046093e-06, "loss": 0.0075, "step": 4465 }, { "epoch": 17.864, "grad_norm": 0.17447544634342194, "learning_rate": 5.370741482965931e-06, "loss": 0.0074, "step": 4466 }, { "epoch": 17.868, "grad_norm": 0.18004770576953888, "learning_rate": 5.360721442885772e-06, "loss": 0.008, "step": 4467 }, { "epoch": 17.872, "grad_norm": 0.12705212831497192, "learning_rate": 5.350701402805611e-06, "loss": 0.0065, "step": 4468 }, { "epoch": 17.876, "grad_norm": 0.13531920313835144, "learning_rate": 5.340681362725451e-06, "loss": 0.0077, "step": 4469 }, { "epoch": 17.88, "grad_norm": 0.11091826111078262, "learning_rate": 5.3306613226452914e-06, "loss": 0.0038, "step": 4470 }, { "epoch": 17.884, "grad_norm": 0.17355629801750183, "learning_rate": 5.320641282565131e-06, "loss": 0.0073, "step": 4471 }, { "epoch": 17.888, "grad_norm": 0.2770105302333832, "learning_rate": 5.31062124248497e-06, "loss": 0.0092, "step": 4472 }, { "epoch": 17.892, "grad_norm": 0.13555404543876648, "learning_rate": 5.300601202404809e-06, "loss": 0.007, "step": 4473 }, { "epoch": 17.896, "grad_norm": 0.13651050627231598, "learning_rate": 5.29058116232465e-06, "loss": 0.0071, "step": 4474 }, { "epoch": 17.9, "grad_norm": 0.22955292463302612, "learning_rate": 5.280561122244489e-06, "loss": 0.0086, "step": 4475 }, { "epoch": 17.904, "grad_norm": 
0.13148556649684906, "learning_rate": 5.270541082164329e-06, "loss": 0.007, "step": 4476 }, { "epoch": 17.908, "grad_norm": 0.1831294298171997, "learning_rate": 5.2605210420841686e-06, "loss": 0.0084, "step": 4477 }, { "epoch": 17.912, "grad_norm": 0.15592306852340698, "learning_rate": 5.250501002004008e-06, "loss": 0.0076, "step": 4478 }, { "epoch": 17.916, "grad_norm": 0.20427504181861877, "learning_rate": 5.240480961923848e-06, "loss": 0.0083, "step": 4479 }, { "epoch": 17.92, "grad_norm": 0.1998140513896942, "learning_rate": 5.230460921843687e-06, "loss": 0.0089, "step": 4480 }, { "epoch": 17.924, "grad_norm": 0.1798437237739563, "learning_rate": 5.220440881763528e-06, "loss": 0.0085, "step": 4481 }, { "epoch": 17.928, "grad_norm": 0.1546415388584137, "learning_rate": 5.2104208416833665e-06, "loss": 0.0076, "step": 4482 }, { "epoch": 17.932, "grad_norm": 0.17189155519008636, "learning_rate": 5.200400801603206e-06, "loss": 0.0074, "step": 4483 }, { "epoch": 17.936, "grad_norm": 0.26644057035446167, "learning_rate": 5.1903807615230465e-06, "loss": 0.0084, "step": 4484 }, { "epoch": 17.94, "grad_norm": 0.12923316657543182, "learning_rate": 5.180360721442886e-06, "loss": 0.0067, "step": 4485 }, { "epoch": 17.944, "grad_norm": 0.20447267591953278, "learning_rate": 5.170340681362726e-06, "loss": 0.0075, "step": 4486 }, { "epoch": 17.948, "grad_norm": 0.17053300142288208, "learning_rate": 5.160320641282565e-06, "loss": 0.0076, "step": 4487 }, { "epoch": 17.951999999999998, "grad_norm": 0.1922687590122223, "learning_rate": 5.150300601202405e-06, "loss": 0.0082, "step": 4488 }, { "epoch": 17.956, "grad_norm": 0.17536620795726776, "learning_rate": 5.1402805611222445e-06, "loss": 0.0081, "step": 4489 }, { "epoch": 17.96, "grad_norm": 0.1868142932653427, "learning_rate": 5.130260521042084e-06, "loss": 0.0079, "step": 4490 }, { "epoch": 17.964, "grad_norm": 0.21813417971134186, "learning_rate": 5.1202404809619245e-06, "loss": 0.0082, "step": 4491 }, { "epoch": 17.968, 
"grad_norm": 0.1625450849533081, "learning_rate": 5.110220440881764e-06, "loss": 0.0083, "step": 4492 }, { "epoch": 17.972, "grad_norm": 0.1685769259929657, "learning_rate": 5.100200400801603e-06, "loss": 0.0083, "step": 4493 }, { "epoch": 17.976, "grad_norm": 0.16878901422023773, "learning_rate": 5.090180360721443e-06, "loss": 0.008, "step": 4494 }, { "epoch": 17.98, "grad_norm": 0.26042139530181885, "learning_rate": 5.080160320641283e-06, "loss": 0.009, "step": 4495 }, { "epoch": 17.984, "grad_norm": 0.22069434821605682, "learning_rate": 5.0701402805611224e-06, "loss": 0.0072, "step": 4496 }, { "epoch": 17.988, "grad_norm": 0.1699599176645279, "learning_rate": 5.060120240480962e-06, "loss": 0.0089, "step": 4497 }, { "epoch": 17.992, "grad_norm": 0.1470976173877716, "learning_rate": 5.0501002004008025e-06, "loss": 0.0074, "step": 4498 }, { "epoch": 17.996, "grad_norm": 0.22268114984035492, "learning_rate": 5.040080160320641e-06, "loss": 0.0084, "step": 4499 }, { "epoch": 18.0, "grad_norm": 0.1696414351463318, "learning_rate": 5.030060120240481e-06, "loss": 0.0081, "step": 4500 }, { "epoch": 18.004, "grad_norm": 0.18585962057113647, "learning_rate": 5.020040080160321e-06, "loss": 0.0072, "step": 4501 }, { "epoch": 18.008, "grad_norm": 0.1416616141796112, "learning_rate": 5.010020040080161e-06, "loss": 0.0064, "step": 4502 }, { "epoch": 18.012, "grad_norm": 0.1328905075788498, "learning_rate": 5e-06, "loss": 0.0064, "step": 4503 }, { "epoch": 18.016, "grad_norm": 0.16968417167663574, "learning_rate": 4.98997995991984e-06, "loss": 0.0073, "step": 4504 }, { "epoch": 18.02, "grad_norm": 0.15814343094825745, "learning_rate": 4.97995991983968e-06, "loss": 0.0073, "step": 4505 }, { "epoch": 18.024, "grad_norm": 0.17646895349025726, "learning_rate": 4.969939879759519e-06, "loss": 0.0076, "step": 4506 }, { "epoch": 18.028, "grad_norm": 0.14789192378520966, "learning_rate": 4.959919839679359e-06, "loss": 0.0064, "step": 4507 }, { "epoch": 18.032, "grad_norm": 
0.15149499475955963, "learning_rate": 4.949899799599199e-06, "loss": 0.0075, "step": 4508 }, { "epoch": 18.036, "grad_norm": 0.12191706895828247, "learning_rate": 4.939879759519038e-06, "loss": 0.0065, "step": 4509 }, { "epoch": 18.04, "grad_norm": 0.1561209261417389, "learning_rate": 4.9298597194388775e-06, "loss": 0.0079, "step": 4510 }, { "epoch": 18.044, "grad_norm": 0.15725496411323547, "learning_rate": 4.919839679358718e-06, "loss": 0.0072, "step": 4511 }, { "epoch": 18.048, "grad_norm": 0.14337027072906494, "learning_rate": 4.9098196392785576e-06, "loss": 0.0065, "step": 4512 }, { "epoch": 18.052, "grad_norm": 0.16354084014892578, "learning_rate": 4.899799599198397e-06, "loss": 0.0068, "step": 4513 }, { "epoch": 18.056, "grad_norm": 0.16241081058979034, "learning_rate": 4.889779559118237e-06, "loss": 0.0066, "step": 4514 }, { "epoch": 18.06, "grad_norm": 0.1941567361354828, "learning_rate": 4.879759519038076e-06, "loss": 0.0062, "step": 4515 }, { "epoch": 18.064, "grad_norm": 0.1292446106672287, "learning_rate": 4.869739478957916e-06, "loss": 0.0066, "step": 4516 }, { "epoch": 18.068, "grad_norm": 0.09367750585079193, "learning_rate": 4.8597194388777555e-06, "loss": 0.0047, "step": 4517 }, { "epoch": 18.072, "grad_norm": 0.14819225668907166, "learning_rate": 4.849699398797596e-06, "loss": 0.0074, "step": 4518 }, { "epoch": 18.076, "grad_norm": 0.17116355895996094, "learning_rate": 4.8396793587174355e-06, "loss": 0.0064, "step": 4519 }, { "epoch": 18.08, "grad_norm": 0.19013841450214386, "learning_rate": 4.829659318637274e-06, "loss": 0.0075, "step": 4520 }, { "epoch": 18.084, "grad_norm": 0.17657625675201416, "learning_rate": 4.819639278557115e-06, "loss": 0.0077, "step": 4521 }, { "epoch": 18.088, "grad_norm": 0.14406649768352509, "learning_rate": 4.809619238476954e-06, "loss": 0.0074, "step": 4522 }, { "epoch": 18.092, "grad_norm": 0.17921116948127747, "learning_rate": 4.799599198396794e-06, "loss": 0.0071, "step": 4523 }, { "epoch": 18.096, "grad_norm": 
0.22926273941993713, "learning_rate": 4.7895791583166335e-06, "loss": 0.0083, "step": 4524 }, { "epoch": 18.1, "grad_norm": 0.1705189198255539, "learning_rate": 4.779559118236474e-06, "loss": 0.0074, "step": 4525 }, { "epoch": 18.104, "grad_norm": 0.15004006028175354, "learning_rate": 4.769539078156313e-06, "loss": 0.0063, "step": 4526 }, { "epoch": 18.108, "grad_norm": 0.1770288497209549, "learning_rate": 4.759519038076152e-06, "loss": 0.0077, "step": 4527 }, { "epoch": 18.112, "grad_norm": 0.14755572378635406, "learning_rate": 4.749498997995992e-06, "loss": 0.0065, "step": 4528 }, { "epoch": 18.116, "grad_norm": 0.12540753185749054, "learning_rate": 4.739478957915832e-06, "loss": 0.0067, "step": 4529 }, { "epoch": 18.12, "grad_norm": 0.1722092181444168, "learning_rate": 4.729458917835672e-06, "loss": 0.0076, "step": 4530 }, { "epoch": 18.124, "grad_norm": 0.15818531811237335, "learning_rate": 4.719438877755511e-06, "loss": 0.0065, "step": 4531 }, { "epoch": 18.128, "grad_norm": 0.17607039213180542, "learning_rate": 4.709418837675351e-06, "loss": 0.0058, "step": 4532 }, { "epoch": 18.132, "grad_norm": 0.15239058434963226, "learning_rate": 4.699398797595191e-06, "loss": 0.0072, "step": 4533 }, { "epoch": 18.136, "grad_norm": 0.1865680068731308, "learning_rate": 4.68937875751503e-06, "loss": 0.0074, "step": 4534 }, { "epoch": 18.14, "grad_norm": 0.13207893073558807, "learning_rate": 4.67935871743487e-06, "loss": 0.0069, "step": 4535 }, { "epoch": 18.144, "grad_norm": 0.1481158584356308, "learning_rate": 4.669338677354709e-06, "loss": 0.0068, "step": 4536 }, { "epoch": 18.148, "grad_norm": 0.1650175005197525, "learning_rate": 4.659318637274549e-06, "loss": 0.0084, "step": 4537 }, { "epoch": 18.152, "grad_norm": 0.2189430296421051, "learning_rate": 4.6492985971943886e-06, "loss": 0.0068, "step": 4538 }, { "epoch": 18.156, "grad_norm": 0.17136210203170776, "learning_rate": 4.639278557114229e-06, "loss": 0.0071, "step": 4539 }, { "epoch": 18.16, "grad_norm": 
0.1906738430261612, "learning_rate": 4.6292585170340686e-06, "loss": 0.007, "step": 4540 }, { "epoch": 18.164, "grad_norm": 0.1356390118598938, "learning_rate": 4.619238476953908e-06, "loss": 0.0063, "step": 4541 }, { "epoch": 18.168, "grad_norm": 0.1401359885931015, "learning_rate": 4.609218436873748e-06, "loss": 0.0073, "step": 4542 }, { "epoch": 18.172, "grad_norm": 0.22201108932495117, "learning_rate": 4.599198396793587e-06, "loss": 0.0082, "step": 4543 }, { "epoch": 18.176, "grad_norm": 0.17944709956645966, "learning_rate": 4.589178356713427e-06, "loss": 0.0065, "step": 4544 }, { "epoch": 18.18, "grad_norm": 0.19789981842041016, "learning_rate": 4.5791583166332665e-06, "loss": 0.0071, "step": 4545 }, { "epoch": 18.184, "grad_norm": 0.2103254646062851, "learning_rate": 4.569138276553107e-06, "loss": 0.0074, "step": 4546 }, { "epoch": 18.188, "grad_norm": 0.15887366235256195, "learning_rate": 4.559118236472946e-06, "loss": 0.0064, "step": 4547 }, { "epoch": 18.192, "grad_norm": 0.12371140718460083, "learning_rate": 4.549098196392785e-06, "loss": 0.0064, "step": 4548 }, { "epoch": 18.196, "grad_norm": 0.20762166380882263, "learning_rate": 4.539078156312626e-06, "loss": 0.0078, "step": 4549 }, { "epoch": 18.2, "grad_norm": 0.12717363238334656, "learning_rate": 4.529058116232465e-06, "loss": 0.0064, "step": 4550 }, { "epoch": 18.204, "grad_norm": 0.2185417264699936, "learning_rate": 4.519038076152305e-06, "loss": 0.0068, "step": 4551 }, { "epoch": 18.208, "grad_norm": 0.1922868937253952, "learning_rate": 4.5090180360721445e-06, "loss": 0.0049, "step": 4552 }, { "epoch": 18.212, "grad_norm": 0.20411092042922974, "learning_rate": 4.498997995991984e-06, "loss": 0.0076, "step": 4553 }, { "epoch": 18.216, "grad_norm": 0.18740029633045197, "learning_rate": 4.488977955911824e-06, "loss": 0.008, "step": 4554 }, { "epoch": 18.22, "grad_norm": 0.14314641058444977, "learning_rate": 4.478957915831663e-06, "loss": 0.0068, "step": 4555 }, { "epoch": 18.224, "grad_norm": 
0.15303204953670502, "learning_rate": 4.468937875751504e-06, "loss": 0.0069, "step": 4556 }, { "epoch": 18.228, "grad_norm": 0.21865060925483704, "learning_rate": 4.458917835671343e-06, "loss": 0.0068, "step": 4557 }, { "epoch": 18.232, "grad_norm": 0.14031033217906952, "learning_rate": 4.448897795591182e-06, "loss": 0.0071, "step": 4558 }, { "epoch": 18.236, "grad_norm": 0.18095235526561737, "learning_rate": 4.4388777555110225e-06, "loss": 0.0071, "step": 4559 }, { "epoch": 18.24, "grad_norm": 0.16418349742889404, "learning_rate": 4.428857715430862e-06, "loss": 0.0077, "step": 4560 }, { "epoch": 18.244, "grad_norm": 0.14331473410129547, "learning_rate": 4.418837675350702e-06, "loss": 0.0067, "step": 4561 }, { "epoch": 18.248, "grad_norm": 0.15830665826797485, "learning_rate": 4.408817635270541e-06, "loss": 0.0074, "step": 4562 }, { "epoch": 18.252, "grad_norm": 0.1355806142091751, "learning_rate": 4.398797595190381e-06, "loss": 0.0069, "step": 4563 }, { "epoch": 18.256, "grad_norm": 0.05929170548915863, "learning_rate": 4.38877755511022e-06, "loss": 0.0023, "step": 4564 }, { "epoch": 18.26, "grad_norm": 0.15484470129013062, "learning_rate": 4.37875751503006e-06, "loss": 0.0061, "step": 4565 }, { "epoch": 18.264, "grad_norm": 0.17139868438243866, "learning_rate": 4.3687374749499e-06, "loss": 0.006, "step": 4566 }, { "epoch": 18.268, "grad_norm": 0.23054589331150055, "learning_rate": 4.35871743486974e-06, "loss": 0.008, "step": 4567 }, { "epoch": 18.272, "grad_norm": 0.1985149085521698, "learning_rate": 4.34869739478958e-06, "loss": 0.0072, "step": 4568 }, { "epoch": 18.276, "grad_norm": 0.17223723232746124, "learning_rate": 4.338677354709419e-06, "loss": 0.0072, "step": 4569 }, { "epoch": 18.28, "grad_norm": 0.14117112755775452, "learning_rate": 4.328657314629259e-06, "loss": 0.0065, "step": 4570 }, { "epoch": 18.284, "grad_norm": 0.22822153568267822, "learning_rate": 4.318637274549098e-06, "loss": 0.0074, "step": 4571 }, { "epoch": 18.288, "grad_norm": 
0.21039901673793793, "learning_rate": 4.308617234468938e-06, "loss": 0.0079, "step": 4572 }, { "epoch": 18.292, "grad_norm": 0.14674365520477295, "learning_rate": 4.298597194388778e-06, "loss": 0.0065, "step": 4573 }, { "epoch": 18.296, "grad_norm": 0.1451415717601776, "learning_rate": 4.288577154308617e-06, "loss": 0.0065, "step": 4574 }, { "epoch": 18.3, "grad_norm": 0.15624050796031952, "learning_rate": 4.278557114228457e-06, "loss": 0.0068, "step": 4575 }, { "epoch": 18.304, "grad_norm": 0.1852012276649475, "learning_rate": 4.268537074148297e-06, "loss": 0.0077, "step": 4576 }, { "epoch": 18.308, "grad_norm": 0.22112013399600983, "learning_rate": 4.258517034068137e-06, "loss": 0.0074, "step": 4577 }, { "epoch": 18.312, "grad_norm": 0.1755540817975998, "learning_rate": 4.248496993987976e-06, "loss": 0.0078, "step": 4578 }, { "epoch": 18.316, "grad_norm": 0.16024050116539001, "learning_rate": 4.238476953907815e-06, "loss": 0.007, "step": 4579 }, { "epoch": 18.32, "grad_norm": 0.18745197355747223, "learning_rate": 4.2284569138276555e-06, "loss": 0.0077, "step": 4580 }, { "epoch": 18.324, "grad_norm": 0.18656589090824127, "learning_rate": 4.218436873747495e-06, "loss": 0.0081, "step": 4581 }, { "epoch": 18.328, "grad_norm": 0.16219571232795715, "learning_rate": 4.208416833667335e-06, "loss": 0.0072, "step": 4582 }, { "epoch": 18.332, "grad_norm": 0.23128695785999298, "learning_rate": 4.198396793587175e-06, "loss": 0.0079, "step": 4583 }, { "epoch": 18.336, "grad_norm": 0.21390704810619354, "learning_rate": 4.188376753507015e-06, "loss": 0.0088, "step": 4584 }, { "epoch": 18.34, "grad_norm": 0.23539131879806519, "learning_rate": 4.1783567134268534e-06, "loss": 0.0081, "step": 4585 }, { "epoch": 18.344, "grad_norm": 0.19893336296081543, "learning_rate": 4.168336673346693e-06, "loss": 0.0068, "step": 4586 }, { "epoch": 18.348, "grad_norm": 0.44295942783355713, "learning_rate": 4.1583166332665335e-06, "loss": 0.0096, "step": 4587 }, { "epoch": 18.352, "grad_norm": 
0.17842140793800354, "learning_rate": 4.148296593186373e-06, "loss": 0.0077, "step": 4588 }, { "epoch": 18.356, "grad_norm": 0.22559908032417297, "learning_rate": 4.138276553106213e-06, "loss": 0.0067, "step": 4589 }, { "epoch": 18.36, "grad_norm": 0.19061769545078278, "learning_rate": 4.128256513026052e-06, "loss": 0.0076, "step": 4590 }, { "epoch": 18.364, "grad_norm": 0.14413903653621674, "learning_rate": 4.118236472945892e-06, "loss": 0.0074, "step": 4591 }, { "epoch": 18.368, "grad_norm": 0.1512213498353958, "learning_rate": 4.108216432865731e-06, "loss": 0.0082, "step": 4592 }, { "epoch": 18.372, "grad_norm": 0.1925954520702362, "learning_rate": 4.098196392785571e-06, "loss": 0.0071, "step": 4593 }, { "epoch": 18.376, "grad_norm": 0.18109238147735596, "learning_rate": 4.0881763527054114e-06, "loss": 0.0075, "step": 4594 }, { "epoch": 18.38, "grad_norm": 0.1389477699995041, "learning_rate": 4.078156312625251e-06, "loss": 0.0066, "step": 4595 }, { "epoch": 18.384, "grad_norm": 0.24592560529708862, "learning_rate": 4.06813627254509e-06, "loss": 0.0075, "step": 4596 }, { "epoch": 18.388, "grad_norm": 0.18193602561950684, "learning_rate": 4.05811623246493e-06, "loss": 0.0081, "step": 4597 }, { "epoch": 18.392, "grad_norm": 0.26493698358535767, "learning_rate": 4.04809619238477e-06, "loss": 0.0081, "step": 4598 }, { "epoch": 18.396, "grad_norm": 0.16297456622123718, "learning_rate": 4.038076152304609e-06, "loss": 0.0072, "step": 4599 }, { "epoch": 18.4, "grad_norm": 0.27221083641052246, "learning_rate": 4.028056112224449e-06, "loss": 0.0088, "step": 4600 }, { "epoch": 18.404, "grad_norm": 0.19877958297729492, "learning_rate": 4.0180360721442886e-06, "loss": 0.0083, "step": 4601 }, { "epoch": 18.408, "grad_norm": 0.19414174556732178, "learning_rate": 4.008016032064128e-06, "loss": 0.0071, "step": 4602 }, { "epoch": 18.412, "grad_norm": 0.1475425511598587, "learning_rate": 3.997995991983968e-06, "loss": 0.0051, "step": 4603 }, { "epoch": 18.416, "grad_norm": 
0.13370120525360107, "learning_rate": 3.987975951903808e-06, "loss": 0.0073, "step": 4604 }, { "epoch": 18.42, "grad_norm": 0.17306354641914368, "learning_rate": 3.977955911823648e-06, "loss": 0.0077, "step": 4605 }, { "epoch": 18.424, "grad_norm": 0.21375983953475952, "learning_rate": 3.9679358717434865e-06, "loss": 0.0068, "step": 4606 }, { "epoch": 18.428, "grad_norm": 0.17107589542865753, "learning_rate": 3.957915831663327e-06, "loss": 0.0076, "step": 4607 }, { "epoch": 18.432, "grad_norm": 0.15007686614990234, "learning_rate": 3.9478957915831665e-06, "loss": 0.008, "step": 4608 }, { "epoch": 18.436, "grad_norm": 0.13160204887390137, "learning_rate": 3.937875751503006e-06, "loss": 0.0064, "step": 4609 }, { "epoch": 18.44, "grad_norm": 0.2253139317035675, "learning_rate": 3.927855711422846e-06, "loss": 0.0077, "step": 4610 }, { "epoch": 18.444, "grad_norm": 0.12682516872882843, "learning_rate": 3.917835671342686e-06, "loss": 0.0035, "step": 4611 }, { "epoch": 18.448, "grad_norm": 0.17346949875354767, "learning_rate": 3.907815631262525e-06, "loss": 0.007, "step": 4612 }, { "epoch": 18.452, "grad_norm": 0.24276018142700195, "learning_rate": 3.8977955911823645e-06, "loss": 0.0075, "step": 4613 }, { "epoch": 18.456, "grad_norm": 0.13902723789215088, "learning_rate": 3.887775551102205e-06, "loss": 0.0067, "step": 4614 }, { "epoch": 18.46, "grad_norm": 0.23678214848041534, "learning_rate": 3.8777555110220445e-06, "loss": 0.0079, "step": 4615 }, { "epoch": 18.464, "grad_norm": 0.11600477248430252, "learning_rate": 3.867735470941884e-06, "loss": 0.0069, "step": 4616 }, { "epoch": 18.468, "grad_norm": 0.17199131846427917, "learning_rate": 3.857715430861724e-06, "loss": 0.0079, "step": 4617 }, { "epoch": 18.472, "grad_norm": 0.13820935785770416, "learning_rate": 3.847695390781563e-06, "loss": 0.0065, "step": 4618 }, { "epoch": 18.476, "grad_norm": 0.2043878436088562, "learning_rate": 3.837675350701403e-06, "loss": 0.008, "step": 4619 }, { "epoch": 18.48, "grad_norm": 
0.1664237231016159, "learning_rate": 3.8276553106212424e-06, "loss": 0.0073, "step": 4620 }, { "epoch": 18.484, "grad_norm": 0.14281480014324188, "learning_rate": 3.817635270541083e-06, "loss": 0.0077, "step": 4621 }, { "epoch": 18.488, "grad_norm": 0.15422894060611725, "learning_rate": 3.8076152304609225e-06, "loss": 0.0071, "step": 4622 }, { "epoch": 18.492, "grad_norm": 0.18872609734535217, "learning_rate": 3.7975951903807616e-06, "loss": 0.0075, "step": 4623 }, { "epoch": 18.496, "grad_norm": 0.2351578027009964, "learning_rate": 3.7875751503006012e-06, "loss": 0.0079, "step": 4624 }, { "epoch": 18.5, "grad_norm": 0.15533076226711273, "learning_rate": 3.7775551102204412e-06, "loss": 0.007, "step": 4625 }, { "epoch": 18.504, "grad_norm": 0.19388796389102936, "learning_rate": 3.767535070140281e-06, "loss": 0.0069, "step": 4626 }, { "epoch": 18.508, "grad_norm": 0.17046746611595154, "learning_rate": 3.757515030060121e-06, "loss": 0.0067, "step": 4627 }, { "epoch": 18.512, "grad_norm": 0.15375731885433197, "learning_rate": 3.74749498997996e-06, "loss": 0.0074, "step": 4628 }, { "epoch": 18.516, "grad_norm": 0.2605665922164917, "learning_rate": 3.7374749498997996e-06, "loss": 0.0073, "step": 4629 }, { "epoch": 18.52, "grad_norm": 0.18455125391483307, "learning_rate": 3.7274549098196396e-06, "loss": 0.0081, "step": 4630 }, { "epoch": 18.524, "grad_norm": 0.21785873174667358, "learning_rate": 3.717434869739479e-06, "loss": 0.0082, "step": 4631 }, { "epoch": 18.528, "grad_norm": 0.14746534824371338, "learning_rate": 3.707414829659319e-06, "loss": 0.0074, "step": 4632 }, { "epoch": 18.532, "grad_norm": 0.1808655709028244, "learning_rate": 3.6973947895791584e-06, "loss": 0.0073, "step": 4633 }, { "epoch": 18.536, "grad_norm": 0.20626331865787506, "learning_rate": 3.687374749498998e-06, "loss": 0.0075, "step": 4634 }, { "epoch": 18.54, "grad_norm": 0.19164638221263885, "learning_rate": 3.677354709418838e-06, "loss": 0.0073, "step": 4635 }, { "epoch": 18.544, "grad_norm": 
0.12534405291080475, "learning_rate": 3.6673346693386775e-06, "loss": 0.0072, "step": 4636 }, { "epoch": 18.548000000000002, "grad_norm": 0.1504947692155838, "learning_rate": 3.6573146292585176e-06, "loss": 0.007, "step": 4637 }, { "epoch": 18.552, "grad_norm": 0.2330176681280136, "learning_rate": 3.647294589178357e-06, "loss": 0.0077, "step": 4638 }, { "epoch": 18.556, "grad_norm": 0.20846612751483917, "learning_rate": 3.6372745490981963e-06, "loss": 0.0087, "step": 4639 }, { "epoch": 18.56, "grad_norm": 0.15900227427482605, "learning_rate": 3.6272545090180363e-06, "loss": 0.0062, "step": 4640 }, { "epoch": 18.564, "grad_norm": 0.10888917744159698, "learning_rate": 3.617234468937876e-06, "loss": 0.0062, "step": 4641 }, { "epoch": 18.568, "grad_norm": 0.18537414073944092, "learning_rate": 3.607214428857716e-06, "loss": 0.0078, "step": 4642 }, { "epoch": 18.572, "grad_norm": 0.2435263842344284, "learning_rate": 3.5971943887775555e-06, "loss": 0.008, "step": 4643 }, { "epoch": 18.576, "grad_norm": 0.1557953655719757, "learning_rate": 3.5871743486973947e-06, "loss": 0.0066, "step": 4644 }, { "epoch": 18.58, "grad_norm": 0.22416846454143524, "learning_rate": 3.5771543086172343e-06, "loss": 0.0078, "step": 4645 }, { "epoch": 18.584, "grad_norm": 0.1947488784790039, "learning_rate": 3.5671342685370743e-06, "loss": 0.0077, "step": 4646 }, { "epoch": 18.588, "grad_norm": 0.1567157357931137, "learning_rate": 3.5571142284569143e-06, "loss": 0.0064, "step": 4647 }, { "epoch": 18.592, "grad_norm": 0.19222493469715118, "learning_rate": 3.547094188376754e-06, "loss": 0.0074, "step": 4648 }, { "epoch": 18.596, "grad_norm": 0.11136149615049362, "learning_rate": 3.537074148296594e-06, "loss": 0.0041, "step": 4649 }, { "epoch": 18.6, "grad_norm": 0.22324241697788239, "learning_rate": 3.5270541082164326e-06, "loss": 0.0079, "step": 4650 }, { "epoch": 18.604, "grad_norm": 0.19871820509433746, "learning_rate": 3.5170340681362726e-06, "loss": 0.008, "step": 4651 }, { "epoch": 18.608, 
"grad_norm": 0.14418363571166992, "learning_rate": 3.5070140280561122e-06, "loss": 0.0074, "step": 4652 }, { "epoch": 18.612, "grad_norm": 0.10854968428611755, "learning_rate": 3.4969939879759522e-06, "loss": 0.0045, "step": 4653 }, { "epoch": 18.616, "grad_norm": 0.1685395985841751, "learning_rate": 3.486973947895792e-06, "loss": 0.0074, "step": 4654 }, { "epoch": 18.62, "grad_norm": 0.1491825133562088, "learning_rate": 3.476953907815631e-06, "loss": 0.0081, "step": 4655 }, { "epoch": 18.624, "grad_norm": 0.10171408206224442, "learning_rate": 3.466933867735471e-06, "loss": 0.0042, "step": 4656 }, { "epoch": 18.628, "grad_norm": 0.18516850471496582, "learning_rate": 3.4569138276553106e-06, "loss": 0.0076, "step": 4657 }, { "epoch": 18.632, "grad_norm": 0.192645862698555, "learning_rate": 3.4468937875751506e-06, "loss": 0.0067, "step": 4658 }, { "epoch": 18.636, "grad_norm": 0.17298291623592377, "learning_rate": 3.43687374749499e-06, "loss": 0.0089, "step": 4659 }, { "epoch": 18.64, "grad_norm": 0.20988403260707855, "learning_rate": 3.4268537074148294e-06, "loss": 0.0089, "step": 4660 }, { "epoch": 18.644, "grad_norm": 0.13690654933452606, "learning_rate": 3.4168336673346694e-06, "loss": 0.0072, "step": 4661 }, { "epoch": 18.648, "grad_norm": 0.29314905405044556, "learning_rate": 3.406813627254509e-06, "loss": 0.0084, "step": 4662 }, { "epoch": 18.652, "grad_norm": 0.18612822890281677, "learning_rate": 3.396793587174349e-06, "loss": 0.0072, "step": 4663 }, { "epoch": 18.656, "grad_norm": 0.15141285955905914, "learning_rate": 3.3867735470941886e-06, "loss": 0.0066, "step": 4664 }, { "epoch": 18.66, "grad_norm": 0.13616497814655304, "learning_rate": 3.3767535070140286e-06, "loss": 0.0069, "step": 4665 }, { "epoch": 18.664, "grad_norm": 0.2509894073009491, "learning_rate": 3.3667334669338677e-06, "loss": 0.008, "step": 4666 }, { "epoch": 18.668, "grad_norm": 0.2138143926858902, "learning_rate": 3.3567134268537073e-06, "loss": 0.0073, "step": 4667 }, { "epoch": 18.672, 
"grad_norm": 0.16789166629314423, "learning_rate": 3.3466933867735473e-06, "loss": 0.0077, "step": 4668 }, { "epoch": 18.676, "grad_norm": 0.16509471833705902, "learning_rate": 3.336673346693387e-06, "loss": 0.0073, "step": 4669 }, { "epoch": 18.68, "grad_norm": 0.2040925770998001, "learning_rate": 3.326653306613227e-06, "loss": 0.0071, "step": 4670 }, { "epoch": 18.684, "grad_norm": 0.17239294946193695, "learning_rate": 3.316633266533066e-06, "loss": 0.0077, "step": 4671 }, { "epoch": 18.688, "grad_norm": 0.17981763184070587, "learning_rate": 3.3066132264529057e-06, "loss": 0.0073, "step": 4672 }, { "epoch": 18.692, "grad_norm": 0.15703114867210388, "learning_rate": 3.2965931863727457e-06, "loss": 0.008, "step": 4673 }, { "epoch": 18.696, "grad_norm": 0.17290978133678436, "learning_rate": 3.2865731462925853e-06, "loss": 0.0085, "step": 4674 }, { "epoch": 18.7, "grad_norm": 0.18817895650863647, "learning_rate": 3.2765531062124253e-06, "loss": 0.0083, "step": 4675 }, { "epoch": 18.704, "grad_norm": 0.1855512112379074, "learning_rate": 3.2665330661322645e-06, "loss": 0.0075, "step": 4676 }, { "epoch": 18.708, "grad_norm": 0.2292889952659607, "learning_rate": 3.256513026052104e-06, "loss": 0.0081, "step": 4677 }, { "epoch": 18.712, "grad_norm": 0.15478086471557617, "learning_rate": 3.246492985971944e-06, "loss": 0.0072, "step": 4678 }, { "epoch": 18.716, "grad_norm": 0.16357959806919098, "learning_rate": 3.2364729458917837e-06, "loss": 0.0077, "step": 4679 }, { "epoch": 18.72, "grad_norm": 0.17677243053913116, "learning_rate": 3.2264529058116237e-06, "loss": 0.0072, "step": 4680 }, { "epoch": 18.724, "grad_norm": 0.23427382111549377, "learning_rate": 3.2164328657314633e-06, "loss": 0.0089, "step": 4681 }, { "epoch": 18.728, "grad_norm": 0.1855786293745041, "learning_rate": 3.2064128256513024e-06, "loss": 0.0069, "step": 4682 }, { "epoch": 18.732, "grad_norm": 0.17159710824489594, "learning_rate": 3.1963927855711424e-06, "loss": 0.0071, "step": 4683 }, { "epoch": 
18.736, "grad_norm": 0.18755950033664703, "learning_rate": 3.186372745490982e-06, "loss": 0.0075, "step": 4684 }, { "epoch": 18.74, "grad_norm": 0.16439513862133026, "learning_rate": 3.176352705410822e-06, "loss": 0.0072, "step": 4685 }, { "epoch": 18.744, "grad_norm": 0.17373932898044586, "learning_rate": 3.1663326653306616e-06, "loss": 0.0072, "step": 4686 }, { "epoch": 18.748, "grad_norm": 0.19658909738063812, "learning_rate": 3.156312625250501e-06, "loss": 0.008, "step": 4687 }, { "epoch": 18.752, "grad_norm": 0.1813708245754242, "learning_rate": 3.146292585170341e-06, "loss": 0.0068, "step": 4688 }, { "epoch": 18.756, "grad_norm": 0.15275795757770538, "learning_rate": 3.1362725450901804e-06, "loss": 0.0066, "step": 4689 }, { "epoch": 18.76, "grad_norm": 0.20497362315654755, "learning_rate": 3.1262525050100204e-06, "loss": 0.0081, "step": 4690 }, { "epoch": 18.764, "grad_norm": 0.1795947104692459, "learning_rate": 3.1162324649298596e-06, "loss": 0.0077, "step": 4691 }, { "epoch": 18.768, "grad_norm": 0.14779050648212433, "learning_rate": 3.1062124248496996e-06, "loss": 0.0072, "step": 4692 }, { "epoch": 18.772, "grad_norm": 0.15518996119499207, "learning_rate": 3.096192384769539e-06, "loss": 0.0048, "step": 4693 }, { "epoch": 18.776, "grad_norm": 0.1515381932258606, "learning_rate": 3.0861723446893788e-06, "loss": 0.0068, "step": 4694 }, { "epoch": 18.78, "grad_norm": 0.2062983363866806, "learning_rate": 3.0761523046092188e-06, "loss": 0.0085, "step": 4695 }, { "epoch": 18.784, "grad_norm": 0.1976705640554428, "learning_rate": 3.0661322645290584e-06, "loss": 0.0072, "step": 4696 }, { "epoch": 18.788, "grad_norm": 0.13696104288101196, "learning_rate": 3.056112224448898e-06, "loss": 0.0065, "step": 4697 }, { "epoch": 18.792, "grad_norm": 0.19733761250972748, "learning_rate": 3.0460921843687375e-06, "loss": 0.0072, "step": 4698 }, { "epoch": 18.796, "grad_norm": 0.17642058432102203, "learning_rate": 3.036072144288577e-06, "loss": 0.0078, "step": 4699 }, { "epoch": 
18.8, "grad_norm": 0.17116330564022064, "learning_rate": 3.026052104208417e-06, "loss": 0.0069, "step": 4700 }, { "epoch": 18.804, "grad_norm": 0.2022354155778885, "learning_rate": 3.0160320641282567e-06, "loss": 0.0079, "step": 4701 }, { "epoch": 18.808, "grad_norm": 0.147251158952713, "learning_rate": 3.0060120240480963e-06, "loss": 0.0062, "step": 4702 }, { "epoch": 18.812, "grad_norm": 0.1599157601594925, "learning_rate": 2.995991983967936e-06, "loss": 0.008, "step": 4703 }, { "epoch": 18.816, "grad_norm": 0.12859754264354706, "learning_rate": 2.985971943887776e-06, "loss": 0.0073, "step": 4704 }, { "epoch": 18.82, "grad_norm": 0.2207632064819336, "learning_rate": 2.975951903807615e-06, "loss": 0.0078, "step": 4705 }, { "epoch": 18.824, "grad_norm": 0.16606071591377258, "learning_rate": 2.965931863727455e-06, "loss": 0.0072, "step": 4706 }, { "epoch": 18.828, "grad_norm": 0.1599171757698059, "learning_rate": 2.9559118236472947e-06, "loss": 0.008, "step": 4707 }, { "epoch": 18.832, "grad_norm": 0.14735926687717438, "learning_rate": 2.9458917835671343e-06, "loss": 0.0071, "step": 4708 }, { "epoch": 18.836, "grad_norm": 0.25691819190979004, "learning_rate": 2.9358717434869743e-06, "loss": 0.0079, "step": 4709 }, { "epoch": 18.84, "grad_norm": 0.19598759710788727, "learning_rate": 2.9258517034068135e-06, "loss": 0.0076, "step": 4710 }, { "epoch": 18.844, "grad_norm": 0.19333742558956146, "learning_rate": 2.9158316633266535e-06, "loss": 0.0076, "step": 4711 }, { "epoch": 18.848, "grad_norm": 0.1969856470823288, "learning_rate": 2.905811623246493e-06, "loss": 0.0075, "step": 4712 }, { "epoch": 18.852, "grad_norm": 0.1487724334001541, "learning_rate": 2.8957915831663326e-06, "loss": 0.0067, "step": 4713 }, { "epoch": 18.856, "grad_norm": 0.1891922503709793, "learning_rate": 2.8857715430861727e-06, "loss": 0.007, "step": 4714 }, { "epoch": 18.86, "grad_norm": 0.1953631490468979, "learning_rate": 2.8757515030060122e-06, "loss": 0.0077, "step": 4715 }, { "epoch": 18.864, 
"grad_norm": 0.17311617732048035, "learning_rate": 2.865731462925852e-06, "loss": 0.0072, "step": 4716 }, { "epoch": 18.868, "grad_norm": 0.10396530479192734, "learning_rate": 2.8557114228456914e-06, "loss": 0.0041, "step": 4717 }, { "epoch": 18.872, "grad_norm": 0.1536823958158493, "learning_rate": 2.845691382765531e-06, "loss": 0.008, "step": 4718 }, { "epoch": 18.876, "grad_norm": 0.1858183890581131, "learning_rate": 2.835671342685371e-06, "loss": 0.0076, "step": 4719 }, { "epoch": 18.88, "grad_norm": 0.13414987921714783, "learning_rate": 2.8256513026052106e-06, "loss": 0.0068, "step": 4720 }, { "epoch": 18.884, "grad_norm": 0.11735019087791443, "learning_rate": 2.81563126252505e-06, "loss": 0.0059, "step": 4721 }, { "epoch": 18.888, "grad_norm": 0.17751845717430115, "learning_rate": 2.80561122244489e-06, "loss": 0.0073, "step": 4722 }, { "epoch": 18.892, "grad_norm": 0.22168204188346863, "learning_rate": 2.79559118236473e-06, "loss": 0.0083, "step": 4723 }, { "epoch": 18.896, "grad_norm": 0.18267209827899933, "learning_rate": 2.7855711422845694e-06, "loss": 0.0077, "step": 4724 }, { "epoch": 18.9, "grad_norm": 0.1117076501250267, "learning_rate": 2.775551102204409e-06, "loss": 0.0069, "step": 4725 }, { "epoch": 18.904, "grad_norm": 0.12114907801151276, "learning_rate": 2.7655310621242486e-06, "loss": 0.007, "step": 4726 }, { "epoch": 18.908, "grad_norm": 0.2513674199581146, "learning_rate": 2.755511022044088e-06, "loss": 0.0084, "step": 4727 }, { "epoch": 18.912, "grad_norm": 0.1959051787853241, "learning_rate": 2.745490981963928e-06, "loss": 0.0073, "step": 4728 }, { "epoch": 18.916, "grad_norm": 0.2004287838935852, "learning_rate": 2.7354709418837678e-06, "loss": 0.0077, "step": 4729 }, { "epoch": 18.92, "grad_norm": 0.17070572078227997, "learning_rate": 2.7254509018036073e-06, "loss": 0.0068, "step": 4730 }, { "epoch": 18.924, "grad_norm": 0.14630602300167084, "learning_rate": 2.7154308617234474e-06, "loss": 0.0071, "step": 4731 }, { "epoch": 18.928, 
"grad_norm": 0.16959349811077118, "learning_rate": 2.7054108216432865e-06, "loss": 0.0083, "step": 4732 }, { "epoch": 18.932, "grad_norm": 0.15642155706882477, "learning_rate": 2.6953907815631265e-06, "loss": 0.0075, "step": 4733 }, { "epoch": 18.936, "grad_norm": 0.17925404012203217, "learning_rate": 2.6853707414829657e-06, "loss": 0.0065, "step": 4734 }, { "epoch": 18.94, "grad_norm": 0.19508227705955505, "learning_rate": 2.6753507014028057e-06, "loss": 0.008, "step": 4735 }, { "epoch": 18.944, "grad_norm": 0.16290459036827087, "learning_rate": 2.6653306613226457e-06, "loss": 0.005, "step": 4736 }, { "epoch": 18.948, "grad_norm": 0.15462210774421692, "learning_rate": 2.655310621242485e-06, "loss": 0.0078, "step": 4737 }, { "epoch": 18.951999999999998, "grad_norm": 0.11808783560991287, "learning_rate": 2.645290581162325e-06, "loss": 0.0066, "step": 4738 }, { "epoch": 18.956, "grad_norm": 0.20686569809913635, "learning_rate": 2.6352705410821645e-06, "loss": 0.0078, "step": 4739 }, { "epoch": 18.96, "grad_norm": 0.17440447211265564, "learning_rate": 2.625250501002004e-06, "loss": 0.0081, "step": 4740 }, { "epoch": 18.964, "grad_norm": 0.337189644575119, "learning_rate": 2.6152304609218437e-06, "loss": 0.0079, "step": 4741 }, { "epoch": 18.968, "grad_norm": 0.18942351639270782, "learning_rate": 2.6052104208416833e-06, "loss": 0.0091, "step": 4742 }, { "epoch": 18.972, "grad_norm": 0.10684970766305923, "learning_rate": 2.5951903807615233e-06, "loss": 0.0046, "step": 4743 }, { "epoch": 18.976, "grad_norm": 0.22341260313987732, "learning_rate": 2.585170340681363e-06, "loss": 0.0077, "step": 4744 }, { "epoch": 18.98, "grad_norm": 0.24727453291416168, "learning_rate": 2.5751503006012024e-06, "loss": 0.0088, "step": 4745 }, { "epoch": 18.984, "grad_norm": 0.19121108949184418, "learning_rate": 2.565130260521042e-06, "loss": 0.0073, "step": 4746 }, { "epoch": 18.988, "grad_norm": 0.2110673189163208, "learning_rate": 2.555110220440882e-06, "loss": 0.0077, "step": 4747 }, { 
"epoch": 18.992, "grad_norm": 0.13829149305820465, "learning_rate": 2.5450901803607216e-06, "loss": 0.0069, "step": 4748 }, { "epoch": 18.996, "grad_norm": 0.1673704981803894, "learning_rate": 2.5350701402805612e-06, "loss": 0.0073, "step": 4749 }, { "epoch": 19.0, "grad_norm": 0.18628820776939392, "learning_rate": 2.5250501002004012e-06, "loss": 0.0076, "step": 4750 }, { "epoch": 19.004, "grad_norm": 0.17194223403930664, "learning_rate": 2.5150300601202404e-06, "loss": 0.0065, "step": 4751 }, { "epoch": 19.008, "grad_norm": 0.14391052722930908, "learning_rate": 2.5050100200400804e-06, "loss": 0.0066, "step": 4752 }, { "epoch": 19.012, "grad_norm": 0.1288471221923828, "learning_rate": 2.49498997995992e-06, "loss": 0.0064, "step": 4753 }, { "epoch": 19.016, "grad_norm": 0.158019557595253, "learning_rate": 2.4849699398797596e-06, "loss": 0.007, "step": 4754 }, { "epoch": 19.02, "grad_norm": 0.19652923941612244, "learning_rate": 2.4749498997995996e-06, "loss": 0.0078, "step": 4755 }, { "epoch": 19.024, "grad_norm": 0.1863459348678589, "learning_rate": 2.4649298597194388e-06, "loss": 0.0073, "step": 4756 }, { "epoch": 19.028, "grad_norm": 0.16609062254428864, "learning_rate": 2.4549098196392788e-06, "loss": 0.0073, "step": 4757 }, { "epoch": 19.032, "grad_norm": 0.1344534158706665, "learning_rate": 2.4448897795591184e-06, "loss": 0.0062, "step": 4758 }, { "epoch": 19.036, "grad_norm": 0.11711183190345764, "learning_rate": 2.434869739478958e-06, "loss": 0.0058, "step": 4759 }, { "epoch": 19.04, "grad_norm": 0.12532389163970947, "learning_rate": 2.424849699398798e-06, "loss": 0.0068, "step": 4760 }, { "epoch": 19.044, "grad_norm": 0.14596407115459442, "learning_rate": 2.414829659318637e-06, "loss": 0.007, "step": 4761 }, { "epoch": 19.048, "grad_norm": 0.13948693871498108, "learning_rate": 2.404809619238477e-06, "loss": 0.0067, "step": 4762 }, { "epoch": 19.052, "grad_norm": 0.15014758706092834, "learning_rate": 2.3947895791583167e-06, "loss": 0.0067, "step": 4763 }, { 
"epoch": 19.056, "grad_norm": 0.11399639397859573, "learning_rate": 2.3847695390781563e-06, "loss": 0.0039, "step": 4764 }, { "epoch": 19.06, "grad_norm": 0.27429330348968506, "learning_rate": 2.374749498997996e-06, "loss": 0.0077, "step": 4765 }, { "epoch": 19.064, "grad_norm": 0.1491885483264923, "learning_rate": 2.364729458917836e-06, "loss": 0.0069, "step": 4766 }, { "epoch": 19.068, "grad_norm": 0.1384284943342209, "learning_rate": 2.3547094188376755e-06, "loss": 0.0063, "step": 4767 }, { "epoch": 19.072, "grad_norm": 0.15902647376060486, "learning_rate": 2.344689378757515e-06, "loss": 0.0076, "step": 4768 }, { "epoch": 19.076, "grad_norm": 0.17550063133239746, "learning_rate": 2.3346693386773547e-06, "loss": 0.0075, "step": 4769 }, { "epoch": 19.08, "grad_norm": 0.11770374327898026, "learning_rate": 2.3246492985971943e-06, "loss": 0.0065, "step": 4770 }, { "epoch": 19.084, "grad_norm": 0.13193221390247345, "learning_rate": 2.3146292585170343e-06, "loss": 0.0064, "step": 4771 }, { "epoch": 19.088, "grad_norm": 0.20296931266784668, "learning_rate": 2.304609218436874e-06, "loss": 0.0082, "step": 4772 }, { "epoch": 19.092, "grad_norm": 0.17178785800933838, "learning_rate": 2.2945891783567135e-06, "loss": 0.007, "step": 4773 }, { "epoch": 19.096, "grad_norm": 0.18985581398010254, "learning_rate": 2.2845691382765535e-06, "loss": 0.0064, "step": 4774 }, { "epoch": 19.1, "grad_norm": 0.180537611246109, "learning_rate": 2.2745490981963926e-06, "loss": 0.0069, "step": 4775 }, { "epoch": 19.104, "grad_norm": 0.13604559004306793, "learning_rate": 2.2645290581162327e-06, "loss": 0.0045, "step": 4776 }, { "epoch": 19.108, "grad_norm": 0.13354989886283875, "learning_rate": 2.2545090180360722e-06, "loss": 0.0061, "step": 4777 }, { "epoch": 19.112, "grad_norm": 0.1494477540254593, "learning_rate": 2.244488977955912e-06, "loss": 0.0069, "step": 4778 }, { "epoch": 19.116, "grad_norm": 0.22611655294895172, "learning_rate": 2.234468937875752e-06, "loss": 0.008, "step": 4779 }, { 
"epoch": 19.12, "grad_norm": 0.11629709601402283, "learning_rate": 2.224448897795591e-06, "loss": 0.0045, "step": 4780 }, { "epoch": 19.124, "grad_norm": 0.16238947212696075, "learning_rate": 2.214428857715431e-06, "loss": 0.007, "step": 4781 }, { "epoch": 19.128, "grad_norm": 0.15134558081626892, "learning_rate": 2.2044088176352706e-06, "loss": 0.0072, "step": 4782 }, { "epoch": 19.132, "grad_norm": 0.1289103478193283, "learning_rate": 2.19438877755511e-06, "loss": 0.0056, "step": 4783 }, { "epoch": 19.136, "grad_norm": 0.13187222182750702, "learning_rate": 2.18436873747495e-06, "loss": 0.0062, "step": 4784 }, { "epoch": 19.14, "grad_norm": 0.16803032159805298, "learning_rate": 2.17434869739479e-06, "loss": 0.0072, "step": 4785 }, { "epoch": 19.144, "grad_norm": 0.16196125745773315, "learning_rate": 2.1643286573146294e-06, "loss": 0.0071, "step": 4786 }, { "epoch": 19.148, "grad_norm": 0.14935888350009918, "learning_rate": 2.154308617234469e-06, "loss": 0.007, "step": 4787 }, { "epoch": 19.152, "grad_norm": 0.17535895109176636, "learning_rate": 2.1442885771543086e-06, "loss": 0.007, "step": 4788 }, { "epoch": 19.156, "grad_norm": 0.12928767502307892, "learning_rate": 2.1342685370741486e-06, "loss": 0.0065, "step": 4789 }, { "epoch": 19.16, "grad_norm": 0.1758950650691986, "learning_rate": 2.124248496993988e-06, "loss": 0.0064, "step": 4790 }, { "epoch": 19.164, "grad_norm": 0.1001003235578537, "learning_rate": 2.1142284569138278e-06, "loss": 0.0058, "step": 4791 }, { "epoch": 19.168, "grad_norm": 0.17448782920837402, "learning_rate": 2.1042084168336673e-06, "loss": 0.0064, "step": 4792 }, { "epoch": 19.172, "grad_norm": 0.16154441237449646, "learning_rate": 2.0941883767535074e-06, "loss": 0.0072, "step": 4793 }, { "epoch": 19.176, "grad_norm": 0.16718144714832306, "learning_rate": 2.0841683366733465e-06, "loss": 0.007, "step": 4794 }, { "epoch": 19.18, "grad_norm": 0.16893278062343597, "learning_rate": 2.0741482965931865e-06, "loss": 0.0077, "step": 4795 }, { 
"epoch": 19.184, "grad_norm": 0.14809642732143402, "learning_rate": 2.064128256513026e-06, "loss": 0.0062, "step": 4796 }, { "epoch": 19.188, "grad_norm": 0.16328303515911102, "learning_rate": 2.0541082164328657e-06, "loss": 0.0067, "step": 4797 }, { "epoch": 19.192, "grad_norm": 0.17468783259391785, "learning_rate": 2.0440881763527057e-06, "loss": 0.0071, "step": 4798 }, { "epoch": 19.196, "grad_norm": 0.1770590990781784, "learning_rate": 2.034068136272545e-06, "loss": 0.0073, "step": 4799 }, { "epoch": 19.2, "grad_norm": 0.20840556919574738, "learning_rate": 2.024048096192385e-06, "loss": 0.0077, "step": 4800 }, { "epoch": 19.204, "grad_norm": 0.16028790175914764, "learning_rate": 2.0140280561122245e-06, "loss": 0.0075, "step": 4801 }, { "epoch": 19.208, "grad_norm": 0.10381535440683365, "learning_rate": 2.004008016032064e-06, "loss": 0.0042, "step": 4802 }, { "epoch": 19.212, "grad_norm": 0.1724594235420227, "learning_rate": 1.993987975951904e-06, "loss": 0.0079, "step": 4803 }, { "epoch": 19.216, "grad_norm": 0.17494012415409088, "learning_rate": 1.9839679358717433e-06, "loss": 0.0069, "step": 4804 }, { "epoch": 19.22, "grad_norm": 0.16234153509140015, "learning_rate": 1.9739478957915833e-06, "loss": 0.0069, "step": 4805 }, { "epoch": 19.224, "grad_norm": 0.15438240766525269, "learning_rate": 1.963927855711423e-06, "loss": 0.0073, "step": 4806 }, { "epoch": 19.228, "grad_norm": 0.16121751070022583, "learning_rate": 1.9539078156312624e-06, "loss": 0.0074, "step": 4807 }, { "epoch": 19.232, "grad_norm": 0.15929311513900757, "learning_rate": 1.9438877755511025e-06, "loss": 0.0073, "step": 4808 }, { "epoch": 19.236, "grad_norm": 0.1370072364807129, "learning_rate": 1.933867735470942e-06, "loss": 0.007, "step": 4809 }, { "epoch": 19.24, "grad_norm": 0.15893737971782684, "learning_rate": 1.9238476953907816e-06, "loss": 0.0063, "step": 4810 }, { "epoch": 19.244, "grad_norm": 0.19493524730205536, "learning_rate": 1.9138276553106212e-06, "loss": 0.0072, "step": 4811 }, 
{ "epoch": 19.248, "grad_norm": 0.1659090220928192, "learning_rate": 1.9038076152304612e-06, "loss": 0.0069, "step": 4812 }, { "epoch": 19.252, "grad_norm": 0.14420440793037415, "learning_rate": 1.8937875751503006e-06, "loss": 0.0065, "step": 4813 }, { "epoch": 19.256, "grad_norm": 0.16298946738243103, "learning_rate": 1.8837675350701404e-06, "loss": 0.0065, "step": 4814 }, { "epoch": 19.26, "grad_norm": 0.13460707664489746, "learning_rate": 1.87374749498998e-06, "loss": 0.0069, "step": 4815 }, { "epoch": 19.264, "grad_norm": 0.17782555520534515, "learning_rate": 1.8637274549098198e-06, "loss": 0.0078, "step": 4816 }, { "epoch": 19.268, "grad_norm": 0.21626275777816772, "learning_rate": 1.8537074148296596e-06, "loss": 0.0083, "step": 4817 }, { "epoch": 19.272, "grad_norm": 0.07763174921274185, "learning_rate": 1.843687374749499e-06, "loss": 0.0036, "step": 4818 }, { "epoch": 19.276, "grad_norm": 0.19351311028003693, "learning_rate": 1.8336673346693388e-06, "loss": 0.0074, "step": 4819 }, { "epoch": 19.28, "grad_norm": 0.18536561727523804, "learning_rate": 1.8236472945891786e-06, "loss": 0.0074, "step": 4820 }, { "epoch": 19.284, "grad_norm": 0.15970304608345032, "learning_rate": 1.8136272545090182e-06, "loss": 0.0066, "step": 4821 }, { "epoch": 19.288, "grad_norm": 0.14813347160816193, "learning_rate": 1.803607214428858e-06, "loss": 0.0061, "step": 4822 }, { "epoch": 19.292, "grad_norm": 0.1950247585773468, "learning_rate": 1.7935871743486973e-06, "loss": 0.0076, "step": 4823 }, { "epoch": 19.296, "grad_norm": 0.19264726340770721, "learning_rate": 1.7835671342685371e-06, "loss": 0.0069, "step": 4824 }, { "epoch": 19.3, "grad_norm": 0.19691282510757446, "learning_rate": 1.773547094188377e-06, "loss": 0.0067, "step": 4825 }, { "epoch": 19.304, "grad_norm": 0.1652364283800125, "learning_rate": 1.7635270541082163e-06, "loss": 0.0065, "step": 4826 }, { "epoch": 19.308, "grad_norm": 0.18058018386363983, "learning_rate": 1.7535070140280561e-06, "loss": 0.008, "step": 4827 
}, { "epoch": 19.312, "grad_norm": 0.1553662121295929, "learning_rate": 1.743486973947896e-06, "loss": 0.0067, "step": 4828 }, { "epoch": 19.316, "grad_norm": 0.12730355560779572, "learning_rate": 1.7334669338677355e-06, "loss": 0.0068, "step": 4829 }, { "epoch": 19.32, "grad_norm": 0.1950482428073883, "learning_rate": 1.7234468937875753e-06, "loss": 0.008, "step": 4830 }, { "epoch": 19.324, "grad_norm": 0.17624934017658234, "learning_rate": 1.7134268537074147e-06, "loss": 0.0076, "step": 4831 }, { "epoch": 19.328, "grad_norm": 0.1288178265094757, "learning_rate": 1.7034068136272545e-06, "loss": 0.0068, "step": 4832 }, { "epoch": 19.332, "grad_norm": 0.1886533796787262, "learning_rate": 1.6933867735470943e-06, "loss": 0.0063, "step": 4833 }, { "epoch": 19.336, "grad_norm": 0.16468536853790283, "learning_rate": 1.6833667334669339e-06, "loss": 0.0069, "step": 4834 }, { "epoch": 19.34, "grad_norm": 0.17551806569099426, "learning_rate": 1.6733466933867737e-06, "loss": 0.0077, "step": 4835 }, { "epoch": 19.344, "grad_norm": 0.1710132509469986, "learning_rate": 1.6633266533066135e-06, "loss": 0.0076, "step": 4836 }, { "epoch": 19.348, "grad_norm": 0.17541347444057465, "learning_rate": 1.6533066132264529e-06, "loss": 0.0068, "step": 4837 }, { "epoch": 19.352, "grad_norm": 0.1492847353219986, "learning_rate": 1.6432865731462927e-06, "loss": 0.0067, "step": 4838 }, { "epoch": 19.356, "grad_norm": 0.17386400699615479, "learning_rate": 1.6332665330661322e-06, "loss": 0.0074, "step": 4839 }, { "epoch": 19.36, "grad_norm": 0.15245819091796875, "learning_rate": 1.623246492985972e-06, "loss": 0.0061, "step": 4840 }, { "epoch": 19.364, "grad_norm": 0.18433190882205963, "learning_rate": 1.6132264529058118e-06, "loss": 0.007, "step": 4841 }, { "epoch": 19.368, "grad_norm": 0.16951580345630646, "learning_rate": 1.6032064128256512e-06, "loss": 0.0069, "step": 4842 }, { "epoch": 19.372, "grad_norm": 0.1776171624660492, "learning_rate": 1.593186372745491e-06, "loss": 0.0068, "step": 
4843 }, { "epoch": 19.376, "grad_norm": 0.2017744779586792, "learning_rate": 1.5831663326653308e-06, "loss": 0.0069, "step": 4844 }, { "epoch": 19.38, "grad_norm": 0.22619450092315674, "learning_rate": 1.5731462925851704e-06, "loss": 0.0071, "step": 4845 }, { "epoch": 19.384, "grad_norm": 0.1772204488515854, "learning_rate": 1.5631262525050102e-06, "loss": 0.0069, "step": 4846 }, { "epoch": 19.388, "grad_norm": 0.2213824987411499, "learning_rate": 1.5531062124248498e-06, "loss": 0.0075, "step": 4847 }, { "epoch": 19.392, "grad_norm": 0.21730723977088928, "learning_rate": 1.5430861723446894e-06, "loss": 0.0078, "step": 4848 }, { "epoch": 19.396, "grad_norm": 0.13313718140125275, "learning_rate": 1.5330661322645292e-06, "loss": 0.0063, "step": 4849 }, { "epoch": 19.4, "grad_norm": 0.23904912173748016, "learning_rate": 1.5230460921843688e-06, "loss": 0.0074, "step": 4850 }, { "epoch": 19.404, "grad_norm": 0.1711878627538681, "learning_rate": 1.5130260521042086e-06, "loss": 0.0072, "step": 4851 }, { "epoch": 19.408, "grad_norm": 0.19314146041870117, "learning_rate": 1.5030060120240482e-06, "loss": 0.0066, "step": 4852 }, { "epoch": 19.412, "grad_norm": 0.16302086412906647, "learning_rate": 1.492985971943888e-06, "loss": 0.0078, "step": 4853 }, { "epoch": 19.416, "grad_norm": 0.15653206408023834, "learning_rate": 1.4829659318637276e-06, "loss": 0.0067, "step": 4854 }, { "epoch": 19.42, "grad_norm": 0.12465237826108932, "learning_rate": 1.4729458917835671e-06, "loss": 0.0064, "step": 4855 }, { "epoch": 19.424, "grad_norm": 0.19219818711280823, "learning_rate": 1.4629258517034067e-06, "loss": 0.0075, "step": 4856 }, { "epoch": 19.428, "grad_norm": 0.16947104036808014, "learning_rate": 1.4529058116232465e-06, "loss": 0.0056, "step": 4857 }, { "epoch": 19.432, "grad_norm": 0.16427640616893768, "learning_rate": 1.4428857715430863e-06, "loss": 0.0068, "step": 4858 }, { "epoch": 19.436, "grad_norm": 0.24073843657970428, "learning_rate": 1.432865731462926e-06, "loss": 0.0071, 
"step": 4859 }, { "epoch": 19.44, "grad_norm": 0.22962218523025513, "learning_rate": 1.4228456913827655e-06, "loss": 0.008, "step": 4860 }, { "epoch": 19.444, "grad_norm": 0.1950879544019699, "learning_rate": 1.4128256513026053e-06, "loss": 0.0067, "step": 4861 }, { "epoch": 19.448, "grad_norm": 0.16316533088684082, "learning_rate": 1.402805611222445e-06, "loss": 0.0074, "step": 4862 }, { "epoch": 19.452, "grad_norm": 0.18425200879573822, "learning_rate": 1.3927855711422847e-06, "loss": 0.0075, "step": 4863 }, { "epoch": 19.456, "grad_norm": 0.1857520490884781, "learning_rate": 1.3827655310621243e-06, "loss": 0.0071, "step": 4864 }, { "epoch": 19.46, "grad_norm": 0.20833329856395721, "learning_rate": 1.372745490981964e-06, "loss": 0.007, "step": 4865 }, { "epoch": 19.464, "grad_norm": 0.21447627246379852, "learning_rate": 1.3627254509018037e-06, "loss": 0.0074, "step": 4866 }, { "epoch": 19.468, "grad_norm": 0.14376424252986908, "learning_rate": 1.3527054108216433e-06, "loss": 0.0063, "step": 4867 }, { "epoch": 19.472, "grad_norm": 0.17494480311870575, "learning_rate": 1.3426853707414828e-06, "loss": 0.0075, "step": 4868 }, { "epoch": 19.476, "grad_norm": 0.19328361749649048, "learning_rate": 1.3326653306613229e-06, "loss": 0.008, "step": 4869 }, { "epoch": 19.48, "grad_norm": 0.15253637731075287, "learning_rate": 1.3226452905811624e-06, "loss": 0.0076, "step": 4870 }, { "epoch": 19.484, "grad_norm": 0.15423879027366638, "learning_rate": 1.312625250501002e-06, "loss": 0.0066, "step": 4871 }, { "epoch": 19.488, "grad_norm": 0.16223280131816864, "learning_rate": 1.3026052104208416e-06, "loss": 0.0071, "step": 4872 }, { "epoch": 19.492, "grad_norm": 0.14479829370975494, "learning_rate": 1.2925851703406814e-06, "loss": 0.0065, "step": 4873 }, { "epoch": 19.496, "grad_norm": 0.1785711795091629, "learning_rate": 1.282565130260521e-06, "loss": 0.0078, "step": 4874 }, { "epoch": 19.5, "grad_norm": 0.14600437879562378, "learning_rate": 1.2725450901803608e-06, "loss": 
0.0064, "step": 4875 }, { "epoch": 19.504, "grad_norm": 0.19797396659851074, "learning_rate": 1.2625250501002006e-06, "loss": 0.0069, "step": 4876 }, { "epoch": 19.508, "grad_norm": 0.19664421677589417, "learning_rate": 1.2525050100200402e-06, "loss": 0.0082, "step": 4877 }, { "epoch": 19.512, "grad_norm": 0.20880340039730072, "learning_rate": 1.2424849699398798e-06, "loss": 0.0082, "step": 4878 }, { "epoch": 19.516, "grad_norm": 0.14585073292255402, "learning_rate": 1.2324649298597194e-06, "loss": 0.0066, "step": 4879 }, { "epoch": 19.52, "grad_norm": 0.2517703175544739, "learning_rate": 1.2224448897795592e-06, "loss": 0.0073, "step": 4880 }, { "epoch": 19.524, "grad_norm": 0.18812379240989685, "learning_rate": 1.212424849699399e-06, "loss": 0.008, "step": 4881 }, { "epoch": 19.528, "grad_norm": 0.15317322313785553, "learning_rate": 1.2024048096192386e-06, "loss": 0.0064, "step": 4882 }, { "epoch": 19.532, "grad_norm": 0.22038224339485168, "learning_rate": 1.1923847695390782e-06, "loss": 0.0072, "step": 4883 }, { "epoch": 19.536, "grad_norm": 0.16358880698680878, "learning_rate": 1.182364729458918e-06, "loss": 0.0067, "step": 4884 }, { "epoch": 19.54, "grad_norm": 0.17958605289459229, "learning_rate": 1.1723446893787575e-06, "loss": 0.0065, "step": 4885 }, { "epoch": 19.544, "grad_norm": 0.1586514711380005, "learning_rate": 1.1623246492985971e-06, "loss": 0.0061, "step": 4886 }, { "epoch": 19.548000000000002, "grad_norm": 0.08892928808927536, "learning_rate": 1.152304609218437e-06, "loss": 0.0023, "step": 4887 }, { "epoch": 19.552, "grad_norm": 0.13370400667190552, "learning_rate": 1.1422845691382767e-06, "loss": 0.0057, "step": 4888 }, { "epoch": 19.556, "grad_norm": 0.17751452326774597, "learning_rate": 1.1322645290581163e-06, "loss": 0.0066, "step": 4889 }, { "epoch": 19.56, "grad_norm": 0.20876137912273407, "learning_rate": 1.122244488977956e-06, "loss": 0.0066, "step": 4890 }, { "epoch": 19.564, "grad_norm": 0.19675932824611664, "learning_rate": 
1.1122244488977955e-06, "loss": 0.0083, "step": 4891 }, { "epoch": 19.568, "grad_norm": 0.157494455575943, "learning_rate": 1.1022044088176353e-06, "loss": 0.0071, "step": 4892 }, { "epoch": 19.572, "grad_norm": 0.1072336882352829, "learning_rate": 1.092184368737475e-06, "loss": 0.0032, "step": 4893 }, { "epoch": 19.576, "grad_norm": 0.19063591957092285, "learning_rate": 1.0821643286573147e-06, "loss": 0.0073, "step": 4894 }, { "epoch": 19.58, "grad_norm": 0.1833014339208603, "learning_rate": 1.0721442885771543e-06, "loss": 0.0076, "step": 4895 }, { "epoch": 19.584, "grad_norm": 0.19544434547424316, "learning_rate": 1.062124248496994e-06, "loss": 0.0083, "step": 4896 }, { "epoch": 19.588, "grad_norm": 0.19219093024730682, "learning_rate": 1.0521042084168337e-06, "loss": 0.0072, "step": 4897 }, { "epoch": 19.592, "grad_norm": 0.18003998696804047, "learning_rate": 1.0420841683366733e-06, "loss": 0.0075, "step": 4898 }, { "epoch": 19.596, "grad_norm": 0.14643754065036774, "learning_rate": 1.032064128256513e-06, "loss": 0.0074, "step": 4899 }, { "epoch": 19.6, "grad_norm": 0.14837734401226044, "learning_rate": 1.0220440881763529e-06, "loss": 0.0069, "step": 4900 }, { "epoch": 19.604, "grad_norm": 0.21540893614292145, "learning_rate": 1.0120240480961924e-06, "loss": 0.0069, "step": 4901 }, { "epoch": 19.608, "grad_norm": 0.15125292539596558, "learning_rate": 1.002004008016032e-06, "loss": 0.0072, "step": 4902 }, { "epoch": 19.612, "grad_norm": 0.2173326164484024, "learning_rate": 9.919839679358716e-07, "loss": 0.0074, "step": 4903 }, { "epoch": 19.616, "grad_norm": 0.22422145307064056, "learning_rate": 9.819639278557114e-07, "loss": 0.0076, "step": 4904 }, { "epoch": 19.62, "grad_norm": 0.21010397374629974, "learning_rate": 9.719438877755512e-07, "loss": 0.0074, "step": 4905 }, { "epoch": 19.624, "grad_norm": 0.16973888874053955, "learning_rate": 9.619238476953908e-07, "loss": 0.0065, "step": 4906 }, { "epoch": 19.628, "grad_norm": 0.19270877540111542, "learning_rate": 
9.519038076152306e-07, "loss": 0.0068, "step": 4907 }, { "epoch": 19.632, "grad_norm": 0.14899659156799316, "learning_rate": 9.418837675350702e-07, "loss": 0.0074, "step": 4908 }, { "epoch": 19.636, "grad_norm": 0.18225519359111786, "learning_rate": 9.318637274549099e-07, "loss": 0.0065, "step": 4909 }, { "epoch": 19.64, "grad_norm": 0.175700843334198, "learning_rate": 9.218436873747495e-07, "loss": 0.0075, "step": 4910 }, { "epoch": 19.644, "grad_norm": 0.1648159772157669, "learning_rate": 9.118236472945893e-07, "loss": 0.0073, "step": 4911 }, { "epoch": 19.648, "grad_norm": 0.22431142628192902, "learning_rate": 9.01803607214429e-07, "loss": 0.0073, "step": 4912 }, { "epoch": 19.652, "grad_norm": 0.19765180349349976, "learning_rate": 8.917835671342686e-07, "loss": 0.0072, "step": 4913 }, { "epoch": 19.656, "grad_norm": 0.16144563257694244, "learning_rate": 8.817635270541082e-07, "loss": 0.007, "step": 4914 }, { "epoch": 19.66, "grad_norm": 0.16720423102378845, "learning_rate": 8.71743486973948e-07, "loss": 0.0068, "step": 4915 }, { "epoch": 19.664, "grad_norm": 0.201465904712677, "learning_rate": 8.617234468937877e-07, "loss": 0.0077, "step": 4916 }, { "epoch": 19.668, "grad_norm": 0.16798268258571625, "learning_rate": 8.517034068136272e-07, "loss": 0.0063, "step": 4917 }, { "epoch": 19.672, "grad_norm": 0.1629098355770111, "learning_rate": 8.416833667334669e-07, "loss": 0.0075, "step": 4918 }, { "epoch": 19.676, "grad_norm": 0.165368914604187, "learning_rate": 8.316633266533067e-07, "loss": 0.0068, "step": 4919 }, { "epoch": 19.68, "grad_norm": 0.21303412318229675, "learning_rate": 8.216432865731463e-07, "loss": 0.0078, "step": 4920 }, { "epoch": 19.684, "grad_norm": 0.16514278948307037, "learning_rate": 8.11623246492986e-07, "loss": 0.0066, "step": 4921 }, { "epoch": 19.688, "grad_norm": 0.14475776255130768, "learning_rate": 8.016032064128256e-07, "loss": 0.007, "step": 4922 }, { "epoch": 19.692, "grad_norm": 0.2859732508659363, "learning_rate": 
7.915831663326654e-07, "loss": 0.008, "step": 4923 }, { "epoch": 19.696, "grad_norm": 0.2173309475183487, "learning_rate": 7.815631262525051e-07, "loss": 0.0079, "step": 4924 }, { "epoch": 19.7, "grad_norm": 0.14563588798046112, "learning_rate": 7.715430861723447e-07, "loss": 0.0072, "step": 4925 }, { "epoch": 19.704, "grad_norm": 0.1795637160539627, "learning_rate": 7.615230460921844e-07, "loss": 0.007, "step": 4926 }, { "epoch": 19.708, "grad_norm": 0.1522742211818695, "learning_rate": 7.515030060120241e-07, "loss": 0.0062, "step": 4927 }, { "epoch": 19.712, "grad_norm": 0.23236194252967834, "learning_rate": 7.414829659318638e-07, "loss": 0.0078, "step": 4928 }, { "epoch": 19.716, "grad_norm": 0.15131531655788422, "learning_rate": 7.314629258517034e-07, "loss": 0.0063, "step": 4929 }, { "epoch": 19.72, "grad_norm": 0.15103590488433838, "learning_rate": 7.214428857715432e-07, "loss": 0.0069, "step": 4930 }, { "epoch": 19.724, "grad_norm": 0.15178599953651428, "learning_rate": 7.114228456913828e-07, "loss": 0.0066, "step": 4931 }, { "epoch": 19.728, "grad_norm": 0.22268012166023254, "learning_rate": 7.014028056112224e-07, "loss": 0.0079, "step": 4932 }, { "epoch": 19.732, "grad_norm": 0.1779235601425171, "learning_rate": 6.913827655310621e-07, "loss": 0.0064, "step": 4933 }, { "epoch": 19.736, "grad_norm": 0.17940625548362732, "learning_rate": 6.813627254509018e-07, "loss": 0.007, "step": 4934 }, { "epoch": 19.74, "grad_norm": 0.17642860114574432, "learning_rate": 6.713426853707414e-07, "loss": 0.0062, "step": 4935 }, { "epoch": 19.744, "grad_norm": 0.18925072252750397, "learning_rate": 6.613226452905812e-07, "loss": 0.0079, "step": 4936 }, { "epoch": 19.748, "grad_norm": 0.1245235726237297, "learning_rate": 6.513026052104208e-07, "loss": 0.0063, "step": 4937 }, { "epoch": 19.752, "grad_norm": 0.21628788113594055, "learning_rate": 6.412825651302605e-07, "loss": 0.0081, "step": 4938 }, { "epoch": 19.756, "grad_norm": 0.13739117980003357, "learning_rate": 
6.312625250501003e-07, "loss": 0.0041, "step": 4939 }, { "epoch": 19.76, "grad_norm": 0.18183039128780365, "learning_rate": 6.212424849699399e-07, "loss": 0.0064, "step": 4940 }, { "epoch": 19.764, "grad_norm": 0.18417534232139587, "learning_rate": 6.112224448897796e-07, "loss": 0.0076, "step": 4941 }, { "epoch": 19.768, "grad_norm": 0.18779101967811584, "learning_rate": 6.012024048096193e-07, "loss": 0.0076, "step": 4942 }, { "epoch": 19.772, "grad_norm": 0.1617640256881714, "learning_rate": 5.91182364729459e-07, "loss": 0.0065, "step": 4943 }, { "epoch": 19.776, "grad_norm": 0.15822680294513702, "learning_rate": 5.811623246492986e-07, "loss": 0.006, "step": 4944 }, { "epoch": 19.78, "grad_norm": 0.17007651925086975, "learning_rate": 5.711422845691384e-07, "loss": 0.0069, "step": 4945 }, { "epoch": 19.784, "grad_norm": 0.1721976101398468, "learning_rate": 5.61122244488978e-07, "loss": 0.0071, "step": 4946 }, { "epoch": 19.788, "grad_norm": 0.1397552639245987, "learning_rate": 5.511022044088177e-07, "loss": 0.0069, "step": 4947 }, { "epoch": 19.792, "grad_norm": 0.18421714007854462, "learning_rate": 5.410821643286573e-07, "loss": 0.0048, "step": 4948 }, { "epoch": 19.796, "grad_norm": 0.16504515707492828, "learning_rate": 5.31062124248497e-07, "loss": 0.0066, "step": 4949 }, { "epoch": 19.8, "grad_norm": 0.18747548758983612, "learning_rate": 5.210420841683366e-07, "loss": 0.0081, "step": 4950 }, { "epoch": 19.804, "grad_norm": 0.17722636461257935, "learning_rate": 5.110220440881764e-07, "loss": 0.0071, "step": 4951 }, { "epoch": 19.808, "grad_norm": 0.14402155578136444, "learning_rate": 5.01002004008016e-07, "loss": 0.0061, "step": 4952 }, { "epoch": 19.812, "grad_norm": 0.2421988993883133, "learning_rate": 4.909819639278557e-07, "loss": 0.0075, "step": 4953 }, { "epoch": 19.816, "grad_norm": 0.1611681580543518, "learning_rate": 4.809619238476954e-07, "loss": 0.0063, "step": 4954 }, { "epoch": 19.82, "grad_norm": 0.17111510038375854, "learning_rate": 
4.709418837675351e-07, "loss": 0.0064, "step": 4955 }, { "epoch": 19.824, "grad_norm": 0.2002701759338379, "learning_rate": 4.6092184368737474e-07, "loss": 0.0066, "step": 4956 }, { "epoch": 19.828, "grad_norm": 0.23351456224918365, "learning_rate": 4.509018036072145e-07, "loss": 0.0073, "step": 4957 }, { "epoch": 19.832, "grad_norm": 0.12465520948171616, "learning_rate": 4.408817635270541e-07, "loss": 0.0057, "step": 4958 }, { "epoch": 19.836, "grad_norm": 0.2019532173871994, "learning_rate": 4.3086172344689383e-07, "loss": 0.0068, "step": 4959 }, { "epoch": 19.84, "grad_norm": 0.1522103101015091, "learning_rate": 4.2084168336673347e-07, "loss": 0.0062, "step": 4960 }, { "epoch": 19.844, "grad_norm": 0.22643589973449707, "learning_rate": 4.1082164328657316e-07, "loss": 0.0079, "step": 4961 }, { "epoch": 19.848, "grad_norm": 0.1775081604719162, "learning_rate": 4.008016032064128e-07, "loss": 0.0065, "step": 4962 }, { "epoch": 19.852, "grad_norm": 0.17318783700466156, "learning_rate": 3.9078156312625255e-07, "loss": 0.007, "step": 4963 }, { "epoch": 19.856, "grad_norm": 0.20053575932979584, "learning_rate": 3.807615230460922e-07, "loss": 0.0074, "step": 4964 }, { "epoch": 19.86, "grad_norm": 0.2197975367307663, "learning_rate": 3.707414829659319e-07, "loss": 0.0077, "step": 4965 }, { "epoch": 19.864, "grad_norm": 0.11314887553453445, "learning_rate": 3.607214428857716e-07, "loss": 0.0039, "step": 4966 }, { "epoch": 19.868, "grad_norm": 0.23044568300247192, "learning_rate": 3.507014028056112e-07, "loss": 0.0081, "step": 4967 }, { "epoch": 19.872, "grad_norm": 0.15248127281665802, "learning_rate": 3.406813627254509e-07, "loss": 0.0068, "step": 4968 }, { "epoch": 19.876, "grad_norm": 0.16394023597240448, "learning_rate": 3.306613226452906e-07, "loss": 0.0066, "step": 4969 }, { "epoch": 19.88, "grad_norm": 0.1749032884836197, "learning_rate": 3.2064128256513025e-07, "loss": 0.0073, "step": 4970 }, { "epoch": 19.884, "grad_norm": 0.17276360094547272, "learning_rate": 
3.1062124248496995e-07, "loss": 0.007, "step": 4971 }, { "epoch": 19.888, "grad_norm": 0.13797195255756378, "learning_rate": 3.0060120240480964e-07, "loss": 0.007, "step": 4972 }, { "epoch": 19.892, "grad_norm": 0.25307130813598633, "learning_rate": 2.905811623246493e-07, "loss": 0.0085, "step": 4973 }, { "epoch": 19.896, "grad_norm": 0.17503662407398224, "learning_rate": 2.80561122244489e-07, "loss": 0.0076, "step": 4974 }, { "epoch": 19.9, "grad_norm": 0.14293161034584045, "learning_rate": 2.7054108216432867e-07, "loss": 0.007, "step": 4975 }, { "epoch": 19.904, "grad_norm": 0.1660107970237732, "learning_rate": 2.605210420841683e-07, "loss": 0.0065, "step": 4976 }, { "epoch": 19.908, "grad_norm": 0.245314821600914, "learning_rate": 2.50501002004008e-07, "loss": 0.0092, "step": 4977 }, { "epoch": 19.912, "grad_norm": 0.21174436807632446, "learning_rate": 2.404809619238477e-07, "loss": 0.007, "step": 4978 }, { "epoch": 19.916, "grad_norm": 0.170380100607872, "learning_rate": 2.3046092184368737e-07, "loss": 0.0061, "step": 4979 }, { "epoch": 19.92, "grad_norm": 0.2893669307231903, "learning_rate": 2.2044088176352704e-07, "loss": 0.0086, "step": 4980 }, { "epoch": 19.924, "grad_norm": 0.13519108295440674, "learning_rate": 2.1042084168336673e-07, "loss": 0.006, "step": 4981 }, { "epoch": 19.928, "grad_norm": 0.17455951869487762, "learning_rate": 2.004008016032064e-07, "loss": 0.0076, "step": 4982 }, { "epoch": 19.932, "grad_norm": 0.2173195630311966, "learning_rate": 1.903807615230461e-07, "loss": 0.0086, "step": 4983 }, { "epoch": 19.936, "grad_norm": 0.16455821692943573, "learning_rate": 1.803607214428858e-07, "loss": 0.0066, "step": 4984 }, { "epoch": 19.94, "grad_norm": 0.16139689087867737, "learning_rate": 1.7034068136272546e-07, "loss": 0.0075, "step": 4985 }, { "epoch": 19.944, "grad_norm": 0.18227294087409973, "learning_rate": 1.6032064128256513e-07, "loss": 0.007, "step": 4986 }, { "epoch": 19.948, "grad_norm": 0.13132084906101227, "learning_rate": 
1.5030060120240482e-07, "loss": 0.0043, "step": 4987 }, { "epoch": 19.951999999999998, "grad_norm": 0.1532142460346222, "learning_rate": 1.402805611222445e-07, "loss": 0.0063, "step": 4988 }, { "epoch": 19.956, "grad_norm": 0.18583548069000244, "learning_rate": 1.3026052104208416e-07, "loss": 0.0074, "step": 4989 }, { "epoch": 19.96, "grad_norm": 0.16315986216068268, "learning_rate": 1.2024048096192385e-07, "loss": 0.0066, "step": 4990 }, { "epoch": 19.964, "grad_norm": 0.19338533282279968, "learning_rate": 1.1022044088176352e-07, "loss": 0.0062, "step": 4991 }, { "epoch": 19.968, "grad_norm": 0.1874208003282547, "learning_rate": 1.002004008016032e-07, "loss": 0.0072, "step": 4992 }, { "epoch": 19.972, "grad_norm": 0.19708944857120514, "learning_rate": 9.01803607214429e-08, "loss": 0.0075, "step": 4993 }, { "epoch": 19.976, "grad_norm": 0.13537871837615967, "learning_rate": 8.016032064128256e-08, "loss": 0.0072, "step": 4994 }, { "epoch": 19.98, "grad_norm": 0.200593501329422, "learning_rate": 7.014028056112224e-08, "loss": 0.0077, "step": 4995 }, { "epoch": 19.984, "grad_norm": 0.146713986992836, "learning_rate": 6.012024048096193e-08, "loss": 0.0074, "step": 4996 }, { "epoch": 19.988, "grad_norm": 0.20530632138252258, "learning_rate": 5.01002004008016e-08, "loss": 0.0066, "step": 4997 }, { "epoch": 19.992, "grad_norm": 0.1384311020374298, "learning_rate": 4.008016032064128e-08, "loss": 0.0072, "step": 4998 }, { "epoch": 19.996, "grad_norm": 0.1999179571866989, "learning_rate": 3.006012024048096e-08, "loss": 0.0075, "step": 4999 }, { "epoch": 20.0, "grad_norm": 0.22525227069854736, "learning_rate": 2.004008016032064e-08, "loss": 0.0047, "step": 5000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, 
"total_flos": 2.07226802884608e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }