diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004, + "grad_norm": 4.403661251068115, + "learning_rate": 5e-06, + "loss": 3.5443, + "step": 1 + }, + { + "epoch": 0.008, + "grad_norm": 3.535775661468506, + "learning_rate": 1e-05, + "loss": 1.8447, + "step": 2 + }, + { + "epoch": 0.012, + "grad_norm": 2.976018190383911, + "learning_rate": 1.5e-05, + "loss": 1.7894, + "step": 3 + }, + { + "epoch": 0.016, + "grad_norm": 3.0810327529907227, + "learning_rate": 2e-05, + "loss": 2.9688, + "step": 4 + }, + { + "epoch": 0.02, + "grad_norm": 2.9574649333953857, + "learning_rate": 2.5e-05, + "loss": 1.6972, + "step": 5 + }, + { + "epoch": 0.024, + "grad_norm": 2.9108431339263916, + "learning_rate": 3e-05, + "loss": 1.6871, + "step": 6 + }, + { + "epoch": 0.028, + "grad_norm": 2.5679922103881836, + "learning_rate": 3.5e-05, + "loss": 1.6313, + "step": 7 + }, + { + "epoch": 0.032, + "grad_norm": 2.8970062732696533, + "learning_rate": 4e-05, + "loss": 1.5489, + "step": 8 + }, + { + "epoch": 0.036, + "grad_norm": 2.5108256340026855, + "learning_rate": 4.5e-05, + "loss": 1.3924, + "step": 9 + }, + { + "epoch": 0.04, + "grad_norm": 1.4062409400939941, + "learning_rate": 5e-05, + "loss": 1.4039, + "step": 10 + }, + { + "epoch": 0.044, + "grad_norm": 1.4537838697433472, + "learning_rate": 4.9989979959919844e-05, + "loss": 1.3405, + "step": 11 + }, + { + "epoch": 0.048, + "grad_norm": 1.1395865678787231, + "learning_rate": 4.997995991983968e-05, + "loss": 1.2754, + "step": 12 + }, + { + "epoch": 0.052, + "grad_norm": 1.3618974685668945, + "learning_rate": 4.996993987975952e-05, + "loss": 1.2058, + "step": 13 + }, + { + "epoch": 0.056, + 
"grad_norm": 1.0007574558258057, + "learning_rate": 4.995991983967936e-05, + "loss": 1.1149, + "step": 14 + }, + { + "epoch": 0.06, + "grad_norm": 1.0973924398422241, + "learning_rate": 4.99498997995992e-05, + "loss": 1.0393, + "step": 15 + }, + { + "epoch": 0.064, + "grad_norm": 0.9514708518981934, + "learning_rate": 4.993987975951904e-05, + "loss": 1.1058, + "step": 16 + }, + { + "epoch": 0.068, + "grad_norm": 0.8538547158241272, + "learning_rate": 4.9929859719438885e-05, + "loss": 1.0185, + "step": 17 + }, + { + "epoch": 0.072, + "grad_norm": 0.7519181966781616, + "learning_rate": 4.991983967935872e-05, + "loss": 0.9962, + "step": 18 + }, + { + "epoch": 0.076, + "grad_norm": 0.9052465558052063, + "learning_rate": 4.990981963927856e-05, + "loss": 1.034, + "step": 19 + }, + { + "epoch": 0.08, + "grad_norm": 0.9109005928039551, + "learning_rate": 4.98997995991984e-05, + "loss": 1.0563, + "step": 20 + }, + { + "epoch": 0.084, + "grad_norm": 1.0023589134216309, + "learning_rate": 4.9889779559118236e-05, + "loss": 0.8603, + "step": 21 + }, + { + "epoch": 0.088, + "grad_norm": 0.8098154664039612, + "learning_rate": 4.987975951903808e-05, + "loss": 0.9769, + "step": 22 + }, + { + "epoch": 0.092, + "grad_norm": 0.7375625967979431, + "learning_rate": 4.986973947895792e-05, + "loss": 0.8612, + "step": 23 + }, + { + "epoch": 0.096, + "grad_norm": 0.6169605851173401, + "learning_rate": 4.985971943887775e-05, + "loss": 0.8061, + "step": 24 + }, + { + "epoch": 0.1, + "grad_norm": 0.5663151144981384, + "learning_rate": 4.98496993987976e-05, + "loss": 0.7382, + "step": 25 + }, + { + "epoch": 0.104, + "grad_norm": 0.5792577266693115, + "learning_rate": 4.983967935871744e-05, + "loss": 0.8212, + "step": 26 + }, + { + "epoch": 0.108, + "grad_norm": 0.5520902276039124, + "learning_rate": 4.982965931863728e-05, + "loss": 0.7566, + "step": 27 + }, + { + "epoch": 0.112, + "grad_norm": 0.6001718044281006, + "learning_rate": 4.981963927855712e-05, + "loss": 0.7855, + "step": 28 + }, + { 
+ "epoch": 0.116, + "grad_norm": 0.9115943312644958, + "learning_rate": 4.980961923847696e-05, + "loss": 0.7621, + "step": 29 + }, + { + "epoch": 0.12, + "grad_norm": 0.47501370310783386, + "learning_rate": 4.9799599198396794e-05, + "loss": 0.6687, + "step": 30 + }, + { + "epoch": 0.124, + "grad_norm": 0.48663514852523804, + "learning_rate": 4.9789579158316635e-05, + "loss": 0.7291, + "step": 31 + }, + { + "epoch": 0.128, + "grad_norm": 0.49639806151390076, + "learning_rate": 4.977955911823648e-05, + "loss": 0.7049, + "step": 32 + }, + { + "epoch": 0.132, + "grad_norm": 0.49986907839775085, + "learning_rate": 4.976953907815631e-05, + "loss": 0.7347, + "step": 33 + }, + { + "epoch": 0.136, + "grad_norm": 0.5302708745002747, + "learning_rate": 4.975951903807615e-05, + "loss": 0.771, + "step": 34 + }, + { + "epoch": 0.14, + "grad_norm": 0.5710265040397644, + "learning_rate": 4.9749498997995994e-05, + "loss": 0.6717, + "step": 35 + }, + { + "epoch": 0.144, + "grad_norm": 0.4576522707939148, + "learning_rate": 4.9739478957915835e-05, + "loss": 0.6038, + "step": 36 + }, + { + "epoch": 0.148, + "grad_norm": 0.509702205657959, + "learning_rate": 4.9729458917835676e-05, + "loss": 0.681, + "step": 37 + }, + { + "epoch": 0.152, + "grad_norm": 0.5716866254806519, + "learning_rate": 4.971943887775551e-05, + "loss": 0.7039, + "step": 38 + }, + { + "epoch": 0.156, + "grad_norm": 0.4261268079280853, + "learning_rate": 4.970941883767535e-05, + "loss": 0.6841, + "step": 39 + }, + { + "epoch": 0.16, + "grad_norm": 0.5015498399734497, + "learning_rate": 4.9699398797595193e-05, + "loss": 0.6062, + "step": 40 + }, + { + "epoch": 0.164, + "grad_norm": 0.4798198640346527, + "learning_rate": 4.9689378757515035e-05, + "loss": 0.6174, + "step": 41 + }, + { + "epoch": 0.168, + "grad_norm": 0.47841379046440125, + "learning_rate": 4.967935871743487e-05, + "loss": 0.71, + "step": 42 + }, + { + "epoch": 0.172, + "grad_norm": 0.38819581270217896, + "learning_rate": 4.966933867735471e-05, + "loss": 
0.6236, + "step": 43 + }, + { + "epoch": 0.176, + "grad_norm": 0.3802018165588379, + "learning_rate": 4.965931863727455e-05, + "loss": 0.5662, + "step": 44 + }, + { + "epoch": 0.18, + "grad_norm": 0.4659491777420044, + "learning_rate": 4.964929859719439e-05, + "loss": 0.6553, + "step": 45 + }, + { + "epoch": 0.184, + "grad_norm": 0.4309292733669281, + "learning_rate": 4.9639278557114234e-05, + "loss": 0.6317, + "step": 46 + }, + { + "epoch": 0.188, + "grad_norm": 0.36458510160446167, + "learning_rate": 4.962925851703407e-05, + "loss": 0.5853, + "step": 47 + }, + { + "epoch": 0.192, + "grad_norm": 0.3789045214653015, + "learning_rate": 4.961923847695391e-05, + "loss": 0.6565, + "step": 48 + }, + { + "epoch": 0.196, + "grad_norm": 0.43124228715896606, + "learning_rate": 4.960921843687375e-05, + "loss": 0.6484, + "step": 49 + }, + { + "epoch": 0.2, + "grad_norm": Infinity, + "learning_rate": 4.960921843687375e-05, + "loss": 3.1127, + "step": 50 + }, + { + "epoch": 0.204, + "grad_norm": 0.40404313802719116, + "learning_rate": 4.9599198396793586e-05, + "loss": 0.6541, + "step": 51 + }, + { + "epoch": 0.208, + "grad_norm": 0.329449862241745, + "learning_rate": 4.958917835671343e-05, + "loss": 0.5814, + "step": 52 + }, + { + "epoch": 0.212, + "grad_norm": 0.3724288046360016, + "learning_rate": 4.957915831663327e-05, + "loss": 0.6095, + "step": 53 + }, + { + "epoch": 0.216, + "grad_norm": 0.3768303096294403, + "learning_rate": 4.956913827655311e-05, + "loss": 0.6467, + "step": 54 + }, + { + "epoch": 0.22, + "grad_norm": 0.3847925066947937, + "learning_rate": 4.9559118236472944e-05, + "loss": 0.6584, + "step": 55 + }, + { + "epoch": 0.224, + "grad_norm": 0.38740023970603943, + "learning_rate": 4.954909819639279e-05, + "loss": 0.6108, + "step": 56 + }, + { + "epoch": 0.228, + "grad_norm": 0.4133378863334656, + "learning_rate": 4.953907815631263e-05, + "loss": 0.569, + "step": 57 + }, + { + "epoch": 0.232, + "grad_norm": 0.3934808373451233, + "learning_rate": 
4.952905811623247e-05, + "loss": 0.5979, + "step": 58 + }, + { + "epoch": 0.236, + "grad_norm": 0.39415234327316284, + "learning_rate": 4.951903807615231e-05, + "loss": 0.5732, + "step": 59 + }, + { + "epoch": 0.24, + "grad_norm": 0.41425755620002747, + "learning_rate": 4.9509018036072144e-05, + "loss": 0.6131, + "step": 60 + }, + { + "epoch": 0.244, + "grad_norm": 0.3662201166152954, + "learning_rate": 4.9498997995991985e-05, + "loss": 0.582, + "step": 61 + }, + { + "epoch": 0.248, + "grad_norm": 0.6343560218811035, + "learning_rate": 4.9488977955911826e-05, + "loss": 0.7075, + "step": 62 + }, + { + "epoch": 0.252, + "grad_norm": 0.362642765045166, + "learning_rate": 4.947895791583166e-05, + "loss": 0.5534, + "step": 63 + }, + { + "epoch": 0.256, + "grad_norm": 0.3604782819747925, + "learning_rate": 4.94689378757515e-05, + "loss": 0.5873, + "step": 64 + }, + { + "epoch": 0.26, + "grad_norm": 0.3327144384384155, + "learning_rate": 4.9458917835671344e-05, + "loss": 0.5309, + "step": 65 + }, + { + "epoch": 0.264, + "grad_norm": 0.35557445883750916, + "learning_rate": 4.9448897795591185e-05, + "loss": 0.5307, + "step": 66 + }, + { + "epoch": 0.268, + "grad_norm": 0.5837879180908203, + "learning_rate": 4.9438877755511026e-05, + "loss": 0.6416, + "step": 67 + }, + { + "epoch": 0.272, + "grad_norm": 0.7987128496170044, + "learning_rate": 4.942885771543087e-05, + "loss": 0.5083, + "step": 68 + }, + { + "epoch": 0.276, + "grad_norm": 0.3976365923881531, + "learning_rate": 4.94188376753507e-05, + "loss": 0.6014, + "step": 69 + }, + { + "epoch": 0.28, + "grad_norm": 0.3430960774421692, + "learning_rate": 4.940881763527054e-05, + "loss": 0.5841, + "step": 70 + }, + { + "epoch": 0.284, + "grad_norm": 0.3691798746585846, + "learning_rate": 4.9398797595190384e-05, + "loss": 0.5346, + "step": 71 + }, + { + "epoch": 0.288, + "grad_norm": 0.3781915307044983, + "learning_rate": 4.938877755511022e-05, + "loss": 0.5438, + "step": 72 + }, + { + "epoch": 0.292, + "grad_norm": 
0.43587324023246765, + "learning_rate": 4.937875751503006e-05, + "loss": 0.5575, + "step": 73 + }, + { + "epoch": 0.296, + "grad_norm": 0.5425245761871338, + "learning_rate": 4.93687374749499e-05, + "loss": 0.5755, + "step": 74 + }, + { + "epoch": 0.3, + "grad_norm": 0.3869353234767914, + "learning_rate": 4.935871743486974e-05, + "loss": 0.5267, + "step": 75 + }, + { + "epoch": 0.304, + "grad_norm": 1.0698070526123047, + "learning_rate": 4.9348697394789584e-05, + "loss": 0.5967, + "step": 76 + }, + { + "epoch": 0.308, + "grad_norm": 0.3409326672554016, + "learning_rate": 4.9338677354709425e-05, + "loss": 0.5764, + "step": 77 + }, + { + "epoch": 0.312, + "grad_norm": 0.42487403750419617, + "learning_rate": 4.932865731462926e-05, + "loss": 0.5788, + "step": 78 + }, + { + "epoch": 0.316, + "grad_norm": 0.355347216129303, + "learning_rate": 4.93186372745491e-05, + "loss": 0.5232, + "step": 79 + }, + { + "epoch": 0.32, + "grad_norm": 0.3655643165111542, + "learning_rate": 4.930861723446894e-05, + "loss": 0.5693, + "step": 80 + }, + { + "epoch": 0.324, + "grad_norm": 0.40408679842948914, + "learning_rate": 4.929859719438878e-05, + "loss": 0.5206, + "step": 81 + }, + { + "epoch": 0.328, + "grad_norm": 0.358632355928421, + "learning_rate": 4.928857715430862e-05, + "loss": 0.5344, + "step": 82 + }, + { + "epoch": 0.332, + "grad_norm": 0.3510683476924896, + "learning_rate": 4.927855711422846e-05, + "loss": 0.5673, + "step": 83 + }, + { + "epoch": 0.336, + "grad_norm": 0.3240058422088623, + "learning_rate": 4.9268537074148294e-05, + "loss": 0.5064, + "step": 84 + }, + { + "epoch": 0.34, + "grad_norm": 0.39716836810112, + "learning_rate": 4.925851703406814e-05, + "loss": 0.4974, + "step": 85 + }, + { + "epoch": 0.344, + "grad_norm": 0.3803369998931885, + "learning_rate": 4.9248496993987983e-05, + "loss": 0.5433, + "step": 86 + }, + { + "epoch": 0.348, + "grad_norm": 0.345559298992157, + "learning_rate": 4.923847695390782e-05, + "loss": 0.5194, + "step": 87 + }, + { + "epoch": 
0.352, + "grad_norm": 0.3797976076602936, + "learning_rate": 4.922845691382766e-05, + "loss": 0.5668, + "step": 88 + }, + { + "epoch": 0.356, + "grad_norm": 0.4122374653816223, + "learning_rate": 4.92184368737475e-05, + "loss": 0.5709, + "step": 89 + }, + { + "epoch": 0.36, + "grad_norm": 0.3925560414791107, + "learning_rate": 4.9208416833667335e-05, + "loss": 0.5528, + "step": 90 + }, + { + "epoch": 0.364, + "grad_norm": 0.35586780309677124, + "learning_rate": 4.9198396793587176e-05, + "loss": 0.5123, + "step": 91 + }, + { + "epoch": 0.368, + "grad_norm": 0.5167198181152344, + "learning_rate": 4.918837675350702e-05, + "loss": 0.5667, + "step": 92 + }, + { + "epoch": 0.372, + "grad_norm": 0.6427658200263977, + "learning_rate": 4.917835671342685e-05, + "loss": 0.5046, + "step": 93 + }, + { + "epoch": 0.376, + "grad_norm": 0.3925032615661621, + "learning_rate": 4.916833667334669e-05, + "loss": 0.5729, + "step": 94 + }, + { + "epoch": 0.38, + "grad_norm": 0.4006875157356262, + "learning_rate": 4.9158316633266535e-05, + "loss": 0.5932, + "step": 95 + }, + { + "epoch": 0.384, + "grad_norm": 0.3980340361595154, + "learning_rate": 4.9148296593186376e-05, + "loss": 0.5277, + "step": 96 + }, + { + "epoch": 0.388, + "grad_norm": 0.35809895396232605, + "learning_rate": 4.913827655310622e-05, + "loss": 0.5806, + "step": 97 + }, + { + "epoch": 0.392, + "grad_norm": 0.3831680417060852, + "learning_rate": 4.912825651302606e-05, + "loss": 0.5285, + "step": 98 + }, + { + "epoch": 0.396, + "grad_norm": 8.12513256072998, + "learning_rate": 4.911823647294589e-05, + "loss": 1.9624, + "step": 99 + }, + { + "epoch": 0.4, + "grad_norm": 0.38112860918045044, + "learning_rate": 4.9108216432865734e-05, + "loss": 0.5226, + "step": 100 + }, + { + "epoch": 0.404, + "grad_norm": 0.37594351172447205, + "learning_rate": 4.9098196392785576e-05, + "loss": 0.521, + "step": 101 + }, + { + "epoch": 0.408, + "grad_norm": 0.35807371139526367, + "learning_rate": 4.908817635270541e-05, + "loss": 0.574, + 
"step": 102 + }, + { + "epoch": 0.412, + "grad_norm": 0.42019104957580566, + "learning_rate": 4.907815631262525e-05, + "loss": 0.5854, + "step": 103 + }, + { + "epoch": 0.416, + "grad_norm": 4.040385723114014, + "learning_rate": 4.906813627254509e-05, + "loss": 2.0486, + "step": 104 + }, + { + "epoch": 0.42, + "grad_norm": 0.40117359161376953, + "learning_rate": 4.9058116232464934e-05, + "loss": 0.4933, + "step": 105 + }, + { + "epoch": 0.424, + "grad_norm": 0.3785370886325836, + "learning_rate": 4.9048096192384775e-05, + "loss": 0.5409, + "step": 106 + }, + { + "epoch": 0.428, + "grad_norm": 0.5202479958534241, + "learning_rate": 4.903807615230461e-05, + "loss": 0.6319, + "step": 107 + }, + { + "epoch": 0.432, + "grad_norm": 0.3808040916919708, + "learning_rate": 4.902805611222445e-05, + "loss": 0.5222, + "step": 108 + }, + { + "epoch": 0.436, + "grad_norm": 0.33765068650245667, + "learning_rate": 4.901803607214429e-05, + "loss": 0.5191, + "step": 109 + }, + { + "epoch": 0.44, + "grad_norm": 0.37931010127067566, + "learning_rate": 4.9008016032064134e-05, + "loss": 0.509, + "step": 110 + }, + { + "epoch": 0.444, + "grad_norm": 0.46715351939201355, + "learning_rate": 4.899799599198397e-05, + "loss": 0.5343, + "step": 111 + }, + { + "epoch": 0.448, + "grad_norm": 0.372823029756546, + "learning_rate": 4.898797595190381e-05, + "loss": 0.542, + "step": 112 + }, + { + "epoch": 0.452, + "grad_norm": 0.3744858205318451, + "learning_rate": 4.897795591182365e-05, + "loss": 0.4599, + "step": 113 + }, + { + "epoch": 0.456, + "grad_norm": 0.4025891423225403, + "learning_rate": 4.8967935871743485e-05, + "loss": 0.5455, + "step": 114 + }, + { + "epoch": 0.46, + "grad_norm": 0.33246585726737976, + "learning_rate": 4.895791583166333e-05, + "loss": 0.5026, + "step": 115 + }, + { + "epoch": 0.464, + "grad_norm": 0.4333687126636505, + "learning_rate": 4.894789579158317e-05, + "loss": 0.5372, + "step": 116 + }, + { + "epoch": 0.468, + "grad_norm": 0.36540961265563965, + 
"learning_rate": 4.893787575150301e-05, + "loss": 0.4877, + "step": 117 + }, + { + "epoch": 0.472, + "grad_norm": 0.39382699131965637, + "learning_rate": 4.892785571142285e-05, + "loss": 0.4705, + "step": 118 + }, + { + "epoch": 0.476, + "grad_norm": 0.37407657504081726, + "learning_rate": 4.8917835671342685e-05, + "loss": 0.5003, + "step": 119 + }, + { + "epoch": 0.48, + "grad_norm": 0.36774665117263794, + "learning_rate": 4.8907815631262526e-05, + "loss": 0.5304, + "step": 120 + }, + { + "epoch": 0.484, + "grad_norm": 0.37522393465042114, + "learning_rate": 4.889779559118237e-05, + "loss": 0.5437, + "step": 121 + }, + { + "epoch": 0.488, + "grad_norm": 0.5063856244087219, + "learning_rate": 4.88877755511022e-05, + "loss": 0.558, + "step": 122 + }, + { + "epoch": 0.492, + "grad_norm": 5.0222487449646, + "learning_rate": 4.887775551102204e-05, + "loss": 1.7772, + "step": 123 + }, + { + "epoch": 0.496, + "grad_norm": 0.4300785958766937, + "learning_rate": 4.886773547094189e-05, + "loss": 0.5218, + "step": 124 + }, + { + "epoch": 0.5, + "grad_norm": 0.5471500754356384, + "learning_rate": 4.8857715430861726e-05, + "loss": 0.5194, + "step": 125 + }, + { + "epoch": 0.504, + "grad_norm": 0.40047940611839294, + "learning_rate": 4.884769539078157e-05, + "loss": 0.47, + "step": 126 + }, + { + "epoch": 0.508, + "grad_norm": 0.39165449142456055, + "learning_rate": 4.883767535070141e-05, + "loss": 0.5365, + "step": 127 + }, + { + "epoch": 0.512, + "grad_norm": 0.4630558490753174, + "learning_rate": 4.882765531062124e-05, + "loss": 0.5676, + "step": 128 + }, + { + "epoch": 0.516, + "grad_norm": 0.3909466862678528, + "learning_rate": 4.8817635270541084e-05, + "loss": 0.565, + "step": 129 + }, + { + "epoch": 0.52, + "grad_norm": 0.49433305859565735, + "learning_rate": 4.8807615230460925e-05, + "loss": 0.4734, + "step": 130 + }, + { + "epoch": 0.524, + "grad_norm": 0.40937337279319763, + "learning_rate": 4.879759519038076e-05, + "loss": 0.4959, + "step": 131 + }, + { + "epoch": 
0.528, + "grad_norm": 0.4436751902103424, + "learning_rate": 4.87875751503006e-05, + "loss": 0.5009, + "step": 132 + }, + { + "epoch": 0.532, + "grad_norm": 0.3952745199203491, + "learning_rate": 4.877755511022044e-05, + "loss": 0.5088, + "step": 133 + }, + { + "epoch": 0.536, + "grad_norm": 0.4065265357494354, + "learning_rate": 4.8767535070140284e-05, + "loss": 0.5368, + "step": 134 + }, + { + "epoch": 0.54, + "grad_norm": 0.351837694644928, + "learning_rate": 4.8757515030060125e-05, + "loss": 0.4719, + "step": 135 + }, + { + "epoch": 0.544, + "grad_norm": 0.38240760564804077, + "learning_rate": 4.8747494989979966e-05, + "loss": 0.5045, + "step": 136 + }, + { + "epoch": 0.548, + "grad_norm": 1.0215145349502563, + "learning_rate": 4.87374749498998e-05, + "loss": 0.4706, + "step": 137 + }, + { + "epoch": 0.552, + "grad_norm": 0.44947350025177, + "learning_rate": 4.872745490981964e-05, + "loss": 0.4955, + "step": 138 + }, + { + "epoch": 0.556, + "grad_norm": 0.4560631811618805, + "learning_rate": 4.871743486973948e-05, + "loss": 0.4635, + "step": 139 + }, + { + "epoch": 0.56, + "grad_norm": 0.44674062728881836, + "learning_rate": 4.870741482965932e-05, + "loss": 0.5357, + "step": 140 + }, + { + "epoch": 0.564, + "grad_norm": 0.3947732150554657, + "learning_rate": 4.869739478957916e-05, + "loss": 0.5075, + "step": 141 + }, + { + "epoch": 0.568, + "grad_norm": 0.42195793986320496, + "learning_rate": 4.8687374749499e-05, + "loss": 0.5268, + "step": 142 + }, + { + "epoch": 0.572, + "grad_norm": 0.350239634513855, + "learning_rate": 4.8677354709418835e-05, + "loss": 0.4759, + "step": 143 + }, + { + "epoch": 0.576, + "grad_norm": 0.4829655885696411, + "learning_rate": 4.866733466933868e-05, + "loss": 0.4905, + "step": 144 + }, + { + "epoch": 0.58, + "grad_norm": 0.37556272745132446, + "learning_rate": 4.8657314629258524e-05, + "loss": 0.4722, + "step": 145 + }, + { + "epoch": 0.584, + "grad_norm": 0.3916873335838318, + "learning_rate": 4.864729458917836e-05, + "loss": 
0.4706, + "step": 146 + }, + { + "epoch": 0.588, + "grad_norm": 0.4552304148674011, + "learning_rate": 4.86372745490982e-05, + "loss": 0.5148, + "step": 147 + }, + { + "epoch": 0.592, + "grad_norm": 0.43419304490089417, + "learning_rate": 4.862725450901804e-05, + "loss": 0.5318, + "step": 148 + }, + { + "epoch": 0.596, + "grad_norm": 0.4283067584037781, + "learning_rate": 4.8617234468937876e-05, + "loss": 0.5103, + "step": 149 + }, + { + "epoch": 0.6, + "grad_norm": 0.39657995104789734, + "learning_rate": 4.860721442885772e-05, + "loss": 0.4643, + "step": 150 + }, + { + "epoch": 0.604, + "grad_norm": 0.43963828682899475, + "learning_rate": 4.859719438877756e-05, + "loss": 0.5201, + "step": 151 + }, + { + "epoch": 0.608, + "grad_norm": 0.4126660227775574, + "learning_rate": 4.858717434869739e-05, + "loss": 0.5069, + "step": 152 + }, + { + "epoch": 0.612, + "grad_norm": 0.42871612310409546, + "learning_rate": 4.8577154308617234e-05, + "loss": 0.527, + "step": 153 + }, + { + "epoch": 0.616, + "grad_norm": 0.44571855664253235, + "learning_rate": 4.856713426853708e-05, + "loss": 0.5771, + "step": 154 + }, + { + "epoch": 0.62, + "grad_norm": 0.3940712809562683, + "learning_rate": 4.855711422845692e-05, + "loss": 0.5059, + "step": 155 + }, + { + "epoch": 0.624, + "grad_norm": 0.39730727672576904, + "learning_rate": 4.854709418837676e-05, + "loss": 0.4702, + "step": 156 + }, + { + "epoch": 0.628, + "grad_norm": 0.4266124367713928, + "learning_rate": 4.85370741482966e-05, + "loss": 0.4936, + "step": 157 + }, + { + "epoch": 0.632, + "grad_norm": 0.49682965874671936, + "learning_rate": 4.8527054108216434e-05, + "loss": 0.5417, + "step": 158 + }, + { + "epoch": 0.636, + "grad_norm": 0.43528032302856445, + "learning_rate": 4.8517034068136275e-05, + "loss": 0.5538, + "step": 159 + }, + { + "epoch": 0.64, + "grad_norm": 0.4789420962333679, + "learning_rate": 4.8507014028056116e-05, + "loss": 0.4866, + "step": 160 + }, + { + "epoch": 0.644, + "grad_norm": 0.4257548153400421, + 
"learning_rate": 4.849699398797595e-05, + "loss": 0.4858, + "step": 161 + }, + { + "epoch": 0.648, + "grad_norm": 0.38716921210289, + "learning_rate": 4.848697394789579e-05, + "loss": 0.4976, + "step": 162 + }, + { + "epoch": 0.652, + "grad_norm": 10.990571975708008, + "learning_rate": 4.8476953907815633e-05, + "loss": 2.1143, + "step": 163 + }, + { + "epoch": 0.656, + "grad_norm": 0.42327937483787537, + "learning_rate": 4.8466933867735475e-05, + "loss": 0.5064, + "step": 164 + }, + { + "epoch": 0.66, + "grad_norm": 0.35529690980911255, + "learning_rate": 4.8456913827655316e-05, + "loss": 0.4584, + "step": 165 + }, + { + "epoch": 0.664, + "grad_norm": 0.39093905687332153, + "learning_rate": 4.844689378757515e-05, + "loss": 0.4638, + "step": 166 + }, + { + "epoch": 0.668, + "grad_norm": 0.4272507429122925, + "learning_rate": 4.843687374749499e-05, + "loss": 0.5097, + "step": 167 + }, + { + "epoch": 0.672, + "grad_norm": 0.4245583117008209, + "learning_rate": 4.842685370741483e-05, + "loss": 0.5568, + "step": 168 + }, + { + "epoch": 0.676, + "grad_norm": 3.0908520221710205, + "learning_rate": 4.8416833667334674e-05, + "loss": 1.8422, + "step": 169 + }, + { + "epoch": 0.68, + "grad_norm": 0.4836314916610718, + "learning_rate": 4.840681362725451e-05, + "loss": 0.5187, + "step": 170 + }, + { + "epoch": 0.684, + "grad_norm": 0.4002329409122467, + "learning_rate": 4.839679358717435e-05, + "loss": 0.4804, + "step": 171 + }, + { + "epoch": 0.688, + "grad_norm": 2.849741220474243, + "learning_rate": 4.838677354709419e-05, + "loss": 1.5929, + "step": 172 + }, + { + "epoch": 0.692, + "grad_norm": 0.3997988700866699, + "learning_rate": 4.8376753507014026e-05, + "loss": 0.4551, + "step": 173 + }, + { + "epoch": 0.696, + "grad_norm": 0.4090086817741394, + "learning_rate": 4.8366733466933874e-05, + "loss": 0.5154, + "step": 174 + }, + { + "epoch": 0.7, + "grad_norm": 0.4361508786678314, + "learning_rate": 4.835671342685371e-05, + "loss": 0.4584, + "step": 175 + }, + { + "epoch": 
0.704, + "grad_norm": 0.41879937052726746, + "learning_rate": 4.834669338677355e-05, + "loss": 0.5122, + "step": 176 + }, + { + "epoch": 0.708, + "grad_norm": 0.43722084164619446, + "learning_rate": 4.833667334669339e-05, + "loss": 0.503, + "step": 177 + }, + { + "epoch": 0.712, + "grad_norm": 0.3997744917869568, + "learning_rate": 4.8326653306613226e-05, + "loss": 0.5121, + "step": 178 + }, + { + "epoch": 0.716, + "grad_norm": 0.4667453169822693, + "learning_rate": 4.831663326653307e-05, + "loss": 0.4888, + "step": 179 + }, + { + "epoch": 0.72, + "grad_norm": 0.47008612751960754, + "learning_rate": 4.830661322645291e-05, + "loss": 0.5683, + "step": 180 + }, + { + "epoch": 0.724, + "grad_norm": 0.4221080541610718, + "learning_rate": 4.829659318637275e-05, + "loss": 0.4872, + "step": 181 + }, + { + "epoch": 0.728, + "grad_norm": 0.4071381688117981, + "learning_rate": 4.8286573146292584e-05, + "loss": 0.4833, + "step": 182 + }, + { + "epoch": 0.732, + "grad_norm": 0.427738219499588, + "learning_rate": 4.827655310621243e-05, + "loss": 0.4781, + "step": 183 + }, + { + "epoch": 0.736, + "grad_norm": 0.3943522274494171, + "learning_rate": 4.8266533066132266e-05, + "loss": 0.5011, + "step": 184 + }, + { + "epoch": 0.74, + "grad_norm": 0.4425494968891144, + "learning_rate": 4.825651302605211e-05, + "loss": 0.461, + "step": 185 + }, + { + "epoch": 0.744, + "grad_norm": 0.4339994192123413, + "learning_rate": 4.824649298597195e-05, + "loss": 0.4756, + "step": 186 + }, + { + "epoch": 0.748, + "grad_norm": 0.38859328627586365, + "learning_rate": 4.8236472945891784e-05, + "loss": 0.451, + "step": 187 + }, + { + "epoch": 0.752, + "grad_norm": 0.4013572931289673, + "learning_rate": 4.8226452905811625e-05, + "loss": 0.5298, + "step": 188 + }, + { + "epoch": 0.756, + "grad_norm": 9.899824142456055, + "learning_rate": 4.8216432865731466e-05, + "loss": 1.542, + "step": 189 + }, + { + "epoch": 0.76, + "grad_norm": 0.41241151094436646, + "learning_rate": 4.82064128256513e-05, + "loss": 
0.5358, + "step": 190 + }, + { + "epoch": 0.764, + "grad_norm": 0.408150315284729, + "learning_rate": 4.819639278557114e-05, + "loss": 0.4647, + "step": 191 + }, + { + "epoch": 0.768, + "grad_norm": 0.40550801157951355, + "learning_rate": 4.818637274549098e-05, + "loss": 0.4881, + "step": 192 + }, + { + "epoch": 0.772, + "grad_norm": 0.35784292221069336, + "learning_rate": 4.8176352705410824e-05, + "loss": 0.4493, + "step": 193 + }, + { + "epoch": 0.776, + "grad_norm": 0.3900321424007416, + "learning_rate": 4.8166332665330666e-05, + "loss": 0.4015, + "step": 194 + }, + { + "epoch": 0.78, + "grad_norm": 0.38830649852752686, + "learning_rate": 4.815631262525051e-05, + "loss": 0.4702, + "step": 195 + }, + { + "epoch": 0.784, + "grad_norm": 0.4276293218135834, + "learning_rate": 4.814629258517034e-05, + "loss": 0.4875, + "step": 196 + }, + { + "epoch": 0.788, + "grad_norm": 0.3857548236846924, + "learning_rate": 4.813627254509018e-05, + "loss": 0.4491, + "step": 197 + }, + { + "epoch": 0.792, + "grad_norm": 0.4131658375263214, + "learning_rate": 4.8126252505010024e-05, + "loss": 0.495, + "step": 198 + }, + { + "epoch": 0.796, + "grad_norm": 0.41396304965019226, + "learning_rate": 4.811623246492986e-05, + "loss": 0.5112, + "step": 199 + }, + { + "epoch": 0.8, + "grad_norm": 0.4702424108982086, + "learning_rate": 4.81062124248497e-05, + "loss": 0.548, + "step": 200 + }, + { + "epoch": 0.804, + "grad_norm": 0.41151168942451477, + "learning_rate": 4.809619238476954e-05, + "loss": 0.4484, + "step": 201 + }, + { + "epoch": 0.808, + "grad_norm": 0.4377236068248749, + "learning_rate": 4.8086172344689376e-05, + "loss": 0.4626, + "step": 202 + }, + { + "epoch": 0.812, + "grad_norm": 0.47114676237106323, + "learning_rate": 4.8076152304609224e-05, + "loss": 0.505, + "step": 203 + }, + { + "epoch": 0.816, + "grad_norm": 0.44927552342414856, + "learning_rate": 4.8066132264529065e-05, + "loss": 0.4697, + "step": 204 + }, + { + "epoch": 0.82, + "grad_norm": 0.4451066255569458, + 
"learning_rate": 4.80561122244489e-05, + "loss": 0.5453, + "step": 205 + }, + { + "epoch": 0.824, + "grad_norm": 0.7564288973808289, + "learning_rate": 4.804609218436874e-05, + "loss": 0.521, + "step": 206 + }, + { + "epoch": 0.828, + "grad_norm": 0.4182622730731964, + "learning_rate": 4.803607214428858e-05, + "loss": 0.4806, + "step": 207 + }, + { + "epoch": 0.832, + "grad_norm": 0.3740067780017853, + "learning_rate": 4.8026052104208417e-05, + "loss": 0.4551, + "step": 208 + }, + { + "epoch": 0.836, + "grad_norm": 0.4166138768196106, + "learning_rate": 4.801603206412826e-05, + "loss": 0.5156, + "step": 209 + }, + { + "epoch": 0.84, + "grad_norm": 0.4035174548625946, + "learning_rate": 4.80060120240481e-05, + "loss": 0.4798, + "step": 210 + }, + { + "epoch": 0.844, + "grad_norm": 0.46257126331329346, + "learning_rate": 4.7995991983967934e-05, + "loss": 0.527, + "step": 211 + }, + { + "epoch": 0.848, + "grad_norm": 0.4034459590911865, + "learning_rate": 4.7985971943887775e-05, + "loss": 0.4595, + "step": 212 + }, + { + "epoch": 0.852, + "grad_norm": 0.37898552417755127, + "learning_rate": 4.797595190380762e-05, + "loss": 0.4495, + "step": 213 + }, + { + "epoch": 0.856, + "grad_norm": 0.4459609389305115, + "learning_rate": 4.796593186372746e-05, + "loss": 0.5279, + "step": 214 + }, + { + "epoch": 0.86, + "grad_norm": 0.436012864112854, + "learning_rate": 4.79559118236473e-05, + "loss": 0.5159, + "step": 215 + }, + { + "epoch": 0.864, + "grad_norm": 0.42498722672462463, + "learning_rate": 4.794589178356714e-05, + "loss": 0.4828, + "step": 216 + }, + { + "epoch": 0.868, + "grad_norm": 0.4183887541294098, + "learning_rate": 4.7935871743486975e-05, + "loss": 0.4886, + "step": 217 + }, + { + "epoch": 0.872, + "grad_norm": 0.4127841889858246, + "learning_rate": 4.7925851703406816e-05, + "loss": 0.4601, + "step": 218 + }, + { + "epoch": 0.876, + "grad_norm": 0.42419275641441345, + "learning_rate": 4.791583166332666e-05, + "loss": 0.5311, + "step": 219 + }, + { + "epoch": 
0.88, + "grad_norm": 3.3534486293792725, + "learning_rate": 4.790581162324649e-05, + "loss": 1.7676, + "step": 220 + }, + { + "epoch": 0.884, + "grad_norm": 0.44655194878578186, + "learning_rate": 4.789579158316633e-05, + "loss": 0.4378, + "step": 221 + }, + { + "epoch": 0.888, + "grad_norm": 0.3902299702167511, + "learning_rate": 4.7885771543086174e-05, + "loss": 0.4638, + "step": 222 + }, + { + "epoch": 0.892, + "grad_norm": 0.700932502746582, + "learning_rate": 4.7875751503006016e-05, + "loss": 0.5176, + "step": 223 + }, + { + "epoch": 0.896, + "grad_norm": 0.4358803927898407, + "learning_rate": 4.786573146292586e-05, + "loss": 0.4604, + "step": 224 + }, + { + "epoch": 0.9, + "grad_norm": 0.4845912754535675, + "learning_rate": 4.78557114228457e-05, + "loss": 0.5406, + "step": 225 + }, + { + "epoch": 0.904, + "grad_norm": 0.38343381881713867, + "learning_rate": 4.784569138276553e-05, + "loss": 0.4693, + "step": 226 + }, + { + "epoch": 0.908, + "grad_norm": 0.48064181208610535, + "learning_rate": 4.7835671342685374e-05, + "loss": 0.4616, + "step": 227 + }, + { + "epoch": 0.912, + "grad_norm": 0.5941019654273987, + "learning_rate": 4.7825651302605215e-05, + "loss": 0.6027, + "step": 228 + }, + { + "epoch": 0.916, + "grad_norm": 0.4344301223754883, + "learning_rate": 4.781563126252505e-05, + "loss": 0.4404, + "step": 229 + }, + { + "epoch": 0.92, + "grad_norm": 0.4292423129081726, + "learning_rate": 4.780561122244489e-05, + "loss": 0.5116, + "step": 230 + }, + { + "epoch": 0.924, + "grad_norm": 6.355248928070068, + "learning_rate": 4.779559118236473e-05, + "loss": 1.3833, + "step": 231 + }, + { + "epoch": 0.928, + "grad_norm": 6.541921138763428, + "learning_rate": 4.778557114228457e-05, + "loss": 1.459, + "step": 232 + }, + { + "epoch": 0.932, + "grad_norm": 0.42501968145370483, + "learning_rate": 4.7775551102204415e-05, + "loss": 0.5074, + "step": 233 + }, + { + "epoch": 0.936, + "grad_norm": 0.4112103581428528, + "learning_rate": 4.776553106212425e-05, + "loss": 
0.4512, + "step": 234 + }, + { + "epoch": 0.94, + "grad_norm": 0.40065306425094604, + "learning_rate": 4.775551102204409e-05, + "loss": 0.4646, + "step": 235 + }, + { + "epoch": 0.944, + "grad_norm": 0.43869850039482117, + "learning_rate": 4.774549098196393e-05, + "loss": 0.5059, + "step": 236 + }, + { + "epoch": 0.948, + "grad_norm": 0.4382988214492798, + "learning_rate": 4.773547094188377e-05, + "loss": 0.4654, + "step": 237 + }, + { + "epoch": 0.952, + "grad_norm": 0.437419056892395, + "learning_rate": 4.772545090180361e-05, + "loss": 0.4978, + "step": 238 + }, + { + "epoch": 0.956, + "grad_norm": 0.3500848412513733, + "learning_rate": 4.771543086172345e-05, + "loss": 0.4537, + "step": 239 + }, + { + "epoch": 0.96, + "grad_norm": 0.39310508966445923, + "learning_rate": 4.770541082164329e-05, + "loss": 0.4443, + "step": 240 + }, + { + "epoch": 0.964, + "grad_norm": 0.4204409420490265, + "learning_rate": 4.7695390781563125e-05, + "loss": 0.4745, + "step": 241 + }, + { + "epoch": 0.968, + "grad_norm": 0.42494168877601624, + "learning_rate": 4.768537074148297e-05, + "loss": 0.4443, + "step": 242 + }, + { + "epoch": 0.972, + "grad_norm": 0.406387060880661, + "learning_rate": 4.767535070140281e-05, + "loss": 0.491, + "step": 243 + }, + { + "epoch": 0.976, + "grad_norm": 0.4731022119522095, + "learning_rate": 4.766533066132265e-05, + "loss": 0.5007, + "step": 244 + }, + { + "epoch": 0.98, + "grad_norm": 0.4447345733642578, + "learning_rate": 4.765531062124249e-05, + "loss": 0.5206, + "step": 245 + }, + { + "epoch": 0.984, + "grad_norm": 0.47526809573173523, + "learning_rate": 4.7645290581162324e-05, + "loss": 0.4576, + "step": 246 + }, + { + "epoch": 0.988, + "grad_norm": 0.43649783730506897, + "learning_rate": 4.7635270541082166e-05, + "loss": 0.5029, + "step": 247 + }, + { + "epoch": 0.992, + "grad_norm": 0.4308774173259735, + "learning_rate": 4.762525050100201e-05, + "loss": 0.4237, + "step": 248 + }, + { + "epoch": 0.996, + "grad_norm": 0.3801085948944092, + 
"learning_rate": 4.761523046092184e-05, + "loss": 0.4387, + "step": 249 + }, + { + "epoch": 1.0, + "grad_norm": 0.4443725645542145, + "learning_rate": 4.760521042084168e-05, + "loss": 0.4304, + "step": 250 + }, + { + "epoch": 1.004, + "grad_norm": 0.3666495680809021, + "learning_rate": 4.7595190380761524e-05, + "loss": 0.4053, + "step": 251 + }, + { + "epoch": 1.008, + "grad_norm": 0.4456152617931366, + "learning_rate": 4.7585170340681365e-05, + "loss": 0.4301, + "step": 252 + }, + { + "epoch": 1.012, + "grad_norm": 0.4105600118637085, + "learning_rate": 4.7575150300601207e-05, + "loss": 0.4437, + "step": 253 + }, + { + "epoch": 1.016, + "grad_norm": 0.42938780784606934, + "learning_rate": 4.756513026052105e-05, + "loss": 0.4685, + "step": 254 + }, + { + "epoch": 1.02, + "grad_norm": 0.4617982506752014, + "learning_rate": 4.755511022044088e-05, + "loss": 0.5046, + "step": 255 + }, + { + "epoch": 1.024, + "grad_norm": 0.3736300468444824, + "learning_rate": 4.7545090180360724e-05, + "loss": 0.4222, + "step": 256 + }, + { + "epoch": 1.028, + "grad_norm": 0.40071138739585876, + "learning_rate": 4.7535070140280565e-05, + "loss": 0.4113, + "step": 257 + }, + { + "epoch": 1.032, + "grad_norm": 0.43367400765419006, + "learning_rate": 4.75250501002004e-05, + "loss": 0.4743, + "step": 258 + }, + { + "epoch": 1.036, + "grad_norm": 0.5123701691627502, + "learning_rate": 4.751503006012024e-05, + "loss": 0.4289, + "step": 259 + }, + { + "epoch": 1.04, + "grad_norm": 0.4922557771205902, + "learning_rate": 4.750501002004008e-05, + "loss": 0.4976, + "step": 260 + }, + { + "epoch": 1.044, + "grad_norm": 0.4395969808101654, + "learning_rate": 4.7494989979959916e-05, + "loss": 0.4344, + "step": 261 + }, + { + "epoch": 1.048, + "grad_norm": 0.4711666405200958, + "learning_rate": 4.7484969939879765e-05, + "loss": 0.4616, + "step": 262 + }, + { + "epoch": 1.052, + "grad_norm": 0.4415505528450012, + "learning_rate": 4.7474949899799606e-05, + "loss": 0.3927, + "step": 263 + }, + { + 
"epoch": 1.056, + "grad_norm": 0.47160524129867554, + "learning_rate": 4.746492985971944e-05, + "loss": 0.4112, + "step": 264 + }, + { + "epoch": 1.06, + "grad_norm": 0.4496021568775177, + "learning_rate": 4.745490981963928e-05, + "loss": 0.4275, + "step": 265 + }, + { + "epoch": 1.064, + "grad_norm": 0.42134368419647217, + "learning_rate": 4.744488977955912e-05, + "loss": 0.4191, + "step": 266 + }, + { + "epoch": 1.068, + "grad_norm": 0.4793007969856262, + "learning_rate": 4.743486973947896e-05, + "loss": 0.4858, + "step": 267 + }, + { + "epoch": 1.072, + "grad_norm": 0.4894791841506958, + "learning_rate": 4.74248496993988e-05, + "loss": 0.4462, + "step": 268 + }, + { + "epoch": 1.076, + "grad_norm": 0.45502549409866333, + "learning_rate": 4.741482965931864e-05, + "loss": 0.4883, + "step": 269 + }, + { + "epoch": 1.08, + "grad_norm": 0.4357253313064575, + "learning_rate": 4.7404809619238474e-05, + "loss": 0.4329, + "step": 270 + }, + { + "epoch": 1.084, + "grad_norm": 0.4542175233364105, + "learning_rate": 4.7394789579158316e-05, + "loss": 0.4501, + "step": 271 + }, + { + "epoch": 1.088, + "grad_norm": 0.4260950982570648, + "learning_rate": 4.7384769539078164e-05, + "loss": 0.4186, + "step": 272 + }, + { + "epoch": 1.092, + "grad_norm": 0.5126277804374695, + "learning_rate": 4.7374749498998e-05, + "loss": 0.4815, + "step": 273 + }, + { + "epoch": 1.096, + "grad_norm": 0.4585254490375519, + "learning_rate": 4.736472945891784e-05, + "loss": 0.4461, + "step": 274 + }, + { + "epoch": 1.1, + "grad_norm": 0.4200245440006256, + "learning_rate": 4.735470941883768e-05, + "loss": 0.4348, + "step": 275 + }, + { + "epoch": 1.104, + "grad_norm": 0.4564264118671417, + "learning_rate": 4.7344689378757515e-05, + "loss": 0.4483, + "step": 276 + }, + { + "epoch": 1.108, + "grad_norm": 0.38692179322242737, + "learning_rate": 4.733466933867736e-05, + "loss": 0.4139, + "step": 277 + }, + { + "epoch": 1.112, + "grad_norm": 0.5220334529876709, + "learning_rate": 4.73246492985972e-05, + 
"loss": 0.4688, + "step": 278 + }, + { + "epoch": 1.116, + "grad_norm": 0.47125041484832764, + "learning_rate": 4.731462925851703e-05, + "loss": 0.4965, + "step": 279 + }, + { + "epoch": 1.12, + "grad_norm": 0.41665714979171753, + "learning_rate": 4.7304609218436874e-05, + "loss": 0.4569, + "step": 280 + }, + { + "epoch": 1.124, + "grad_norm": 0.45704329013824463, + "learning_rate": 4.7294589178356715e-05, + "loss": 0.4441, + "step": 281 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.5076858401298523, + "learning_rate": 4.7284569138276556e-05, + "loss": 0.4441, + "step": 282 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 0.4565269649028778, + "learning_rate": 4.72745490981964e-05, + "loss": 0.4001, + "step": 283 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.4239833652973175, + "learning_rate": 4.726452905811624e-05, + "loss": 0.3982, + "step": 284 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 0.4458796977996826, + "learning_rate": 4.725450901803607e-05, + "loss": 0.4151, + "step": 285 + }, + { + "epoch": 1.144, + "grad_norm": 0.49094000458717346, + "learning_rate": 4.7244488977955915e-05, + "loss": 0.4491, + "step": 286 + }, + { + "epoch": 1.148, + "grad_norm": 11.27839183807373, + "learning_rate": 4.7234468937875756e-05, + "loss": 0.934, + "step": 287 + }, + { + "epoch": 1.152, + "grad_norm": 0.5622511506080627, + "learning_rate": 4.722444889779559e-05, + "loss": 0.4862, + "step": 288 + }, + { + "epoch": 1.156, + "grad_norm": 0.5482255220413208, + "learning_rate": 4.721442885771543e-05, + "loss": 0.4668, + "step": 289 + }, + { + "epoch": 1.16, + "grad_norm": 7.071150779724121, + "learning_rate": 4.720440881763527e-05, + "loss": 0.8774, + "step": 290 + }, + { + "epoch": 1.164, + "grad_norm": 1.3377376794815063, + "learning_rate": 4.7194388777555114e-05, + "loss": 0.4714, + "step": 291 + }, + { + "epoch": 1.168, + "grad_norm": 0.5460695028305054, + "learning_rate": 4.7184368737474956e-05, + "loss": 0.4769, + "step": 292 + }, + { 
+ "epoch": 1.172, + "grad_norm": 0.4908643364906311, + "learning_rate": 4.717434869739479e-05, + "loss": 0.4904, + "step": 293 + }, + { + "epoch": 1.176, + "grad_norm": 0.42432764172554016, + "learning_rate": 4.716432865731463e-05, + "loss": 0.4213, + "step": 294 + }, + { + "epoch": 1.18, + "grad_norm": 0.5326458811759949, + "learning_rate": 4.715430861723447e-05, + "loss": 0.4568, + "step": 295 + }, + { + "epoch": 1.184, + "grad_norm": 0.4412928521633148, + "learning_rate": 4.7144288577154314e-05, + "loss": 0.4776, + "step": 296 + }, + { + "epoch": 1.188, + "grad_norm": 0.4530280828475952, + "learning_rate": 4.713426853707415e-05, + "loss": 0.4139, + "step": 297 + }, + { + "epoch": 1.192, + "grad_norm": 0.4728831946849823, + "learning_rate": 4.712424849699399e-05, + "loss": 0.4586, + "step": 298 + }, + { + "epoch": 1.196, + "grad_norm": 0.4735589325428009, + "learning_rate": 4.711422845691383e-05, + "loss": 0.4707, + "step": 299 + }, + { + "epoch": 1.2, + "grad_norm": 0.5155274271965027, + "learning_rate": 4.7104208416833666e-05, + "loss": 0.425, + "step": 300 + }, + { + "epoch": 1.204, + "grad_norm": 0.5296701788902283, + "learning_rate": 4.7094188376753514e-05, + "loss": 0.4858, + "step": 301 + }, + { + "epoch": 1.208, + "grad_norm": 0.478127121925354, + "learning_rate": 4.708416833667335e-05, + "loss": 0.4522, + "step": 302 + }, + { + "epoch": 1.212, + "grad_norm": 0.4879436194896698, + "learning_rate": 4.707414829659319e-05, + "loss": 0.45, + "step": 303 + }, + { + "epoch": 1.216, + "grad_norm": 0.4468042850494385, + "learning_rate": 4.706412825651303e-05, + "loss": 0.4119, + "step": 304 + }, + { + "epoch": 1.22, + "grad_norm": 0.47193828225135803, + "learning_rate": 4.7054108216432865e-05, + "loss": 0.4911, + "step": 305 + }, + { + "epoch": 1.224, + "grad_norm": 0.46041783690452576, + "learning_rate": 4.7044088176352706e-05, + "loss": 0.4087, + "step": 306 + }, + { + "epoch": 1.228, + "grad_norm": 0.5108659267425537, + "learning_rate": 4.703406813627255e-05, 
+ "loss": 0.4601, + "step": 307 + }, + { + "epoch": 1.232, + "grad_norm": 0.4466314911842346, + "learning_rate": 4.702404809619239e-05, + "loss": 0.4084, + "step": 308 + }, + { + "epoch": 1.236, + "grad_norm": 0.5967437624931335, + "learning_rate": 4.7014028056112224e-05, + "loss": 0.4609, + "step": 309 + }, + { + "epoch": 1.24, + "grad_norm": 0.4608320891857147, + "learning_rate": 4.7004008016032065e-05, + "loss": 0.3945, + "step": 310 + }, + { + "epoch": 1.244, + "grad_norm": 0.47840774059295654, + "learning_rate": 4.6993987975951906e-05, + "loss": 0.4506, + "step": 311 + }, + { + "epoch": 1.248, + "grad_norm": 0.49409082531929016, + "learning_rate": 4.698396793587175e-05, + "loss": 0.4387, + "step": 312 + }, + { + "epoch": 1.252, + "grad_norm": 0.46387237310409546, + "learning_rate": 4.697394789579159e-05, + "loss": 0.436, + "step": 313 + }, + { + "epoch": 1.256, + "grad_norm": 1.0678813457489014, + "learning_rate": 4.696392785571142e-05, + "loss": 0.4139, + "step": 314 + }, + { + "epoch": 1.26, + "grad_norm": 0.5538118481636047, + "learning_rate": 4.6953907815631264e-05, + "loss": 0.4666, + "step": 315 + }, + { + "epoch": 1.264, + "grad_norm": 7.208223819732666, + "learning_rate": 4.6943887775551106e-05, + "loss": 0.6192, + "step": 316 + }, + { + "epoch": 1.268, + "grad_norm": 0.483923077583313, + "learning_rate": 4.693386773547094e-05, + "loss": 0.4103, + "step": 317 + }, + { + "epoch": 1.272, + "grad_norm": 0.4743936061859131, + "learning_rate": 4.692384769539078e-05, + "loss": 0.4514, + "step": 318 + }, + { + "epoch": 1.276, + "grad_norm": 0.5241031646728516, + "learning_rate": 4.691382765531062e-05, + "loss": 0.4661, + "step": 319 + }, + { + "epoch": 1.28, + "grad_norm": 0.44694361090660095, + "learning_rate": 4.690380761523046e-05, + "loss": 0.4173, + "step": 320 + }, + { + "epoch": 1.284, + "grad_norm": 0.4751509726047516, + "learning_rate": 4.6893787575150305e-05, + "loss": 0.3872, + "step": 321 + }, + { + "epoch": 1.288, + "grad_norm": 
0.548673152923584, + "learning_rate": 4.688376753507015e-05, + "loss": 0.474, + "step": 322 + }, + { + "epoch": 1.292, + "grad_norm": 0.46685653924942017, + "learning_rate": 4.687374749498998e-05, + "loss": 0.4433, + "step": 323 + }, + { + "epoch": 1.296, + "grad_norm": 3.28513240814209, + "learning_rate": 4.686372745490982e-05, + "loss": 0.4851, + "step": 324 + }, + { + "epoch": 1.3, + "grad_norm": 0.5524126887321472, + "learning_rate": 4.6853707414829664e-05, + "loss": 0.4318, + "step": 325 + }, + { + "epoch": 1.304, + "grad_norm": 0.48285192251205444, + "learning_rate": 4.68436873747495e-05, + "loss": 0.4203, + "step": 326 + }, + { + "epoch": 1.308, + "grad_norm": 2.8871777057647705, + "learning_rate": 4.683366733466934e-05, + "loss": 0.4773, + "step": 327 + }, + { + "epoch": 1.312, + "grad_norm": 0.5716524720191956, + "learning_rate": 4.682364729458918e-05, + "loss": 0.4876, + "step": 328 + }, + { + "epoch": 1.316, + "grad_norm": 0.5107681155204773, + "learning_rate": 4.6813627254509015e-05, + "loss": 0.4194, + "step": 329 + }, + { + "epoch": 1.32, + "grad_norm": 0.5048828125, + "learning_rate": 4.6803607214428857e-05, + "loss": 0.4635, + "step": 330 + }, + { + "epoch": 1.324, + "grad_norm": 0.4749213755130768, + "learning_rate": 4.6793587174348705e-05, + "loss": 0.4625, + "step": 331 + }, + { + "epoch": 1.328, + "grad_norm": 0.5069287419319153, + "learning_rate": 4.678356713426854e-05, + "loss": 0.3894, + "step": 332 + }, + { + "epoch": 1.332, + "grad_norm": 0.5243629217147827, + "learning_rate": 4.677354709418838e-05, + "loss": 0.4179, + "step": 333 + }, + { + "epoch": 1.336, + "grad_norm": 0.4602661728858948, + "learning_rate": 4.676352705410822e-05, + "loss": 0.4378, + "step": 334 + }, + { + "epoch": 1.34, + "grad_norm": 0.591300904750824, + "learning_rate": 4.6753507014028056e-05, + "loss": 0.4615, + "step": 335 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.5004926919937134, + "learning_rate": 4.67434869739479e-05, + "loss": 0.4199, + "step": 
336 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 0.5834985971450806, + "learning_rate": 4.673346693386774e-05, + "loss": 0.499, + "step": 337 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.4432782530784607, + "learning_rate": 4.672344689378757e-05, + "loss": 0.3882, + "step": 338 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 0.5421860814094543, + "learning_rate": 4.6713426853707415e-05, + "loss": 0.479, + "step": 339 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.5701932907104492, + "learning_rate": 4.6703406813627256e-05, + "loss": 0.4337, + "step": 340 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 0.5272790193557739, + "learning_rate": 4.66933867735471e-05, + "loss": 0.4419, + "step": 341 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.5008136034011841, + "learning_rate": 4.668336673346694e-05, + "loss": 0.4162, + "step": 342 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 0.47261759638786316, + "learning_rate": 4.667334669338678e-05, + "loss": 0.4037, + "step": 343 + }, + { + "epoch": 1.376, + "grad_norm": 0.49282076954841614, + "learning_rate": 4.6663326653306614e-05, + "loss": 0.4203, + "step": 344 + }, + { + "epoch": 1.38, + "grad_norm": 0.5760653614997864, + "learning_rate": 4.6653306613226455e-05, + "loss": 0.4935, + "step": 345 + }, + { + "epoch": 1.384, + "grad_norm": 0.4914257824420929, + "learning_rate": 4.66432865731463e-05, + "loss": 0.3675, + "step": 346 + }, + { + "epoch": 1.388, + "grad_norm": 0.6651628017425537, + "learning_rate": 4.663326653306613e-05, + "loss": 0.4484, + "step": 347 + }, + { + "epoch": 1.392, + "grad_norm": 1.1650779247283936, + "learning_rate": 4.662324649298597e-05, + "loss": 0.496, + "step": 348 + }, + { + "epoch": 1.396, + "grad_norm": 0.4959772825241089, + "learning_rate": 4.6613226452905814e-05, + "loss": 0.3795, + "step": 349 + }, + { + "epoch": 1.4, + "grad_norm": 11.361075401306152, + "learning_rate": 4.6603206412825655e-05, + "loss": 0.4368, + "step": 350 
+ }, + { + "epoch": 1.404, + "grad_norm": 0.5089369416236877, + "learning_rate": 4.6593186372745496e-05, + "loss": 0.3721, + "step": 351 + }, + { + "epoch": 1.408, + "grad_norm": 0.4523348808288574, + "learning_rate": 4.658316633266534e-05, + "loss": 0.4223, + "step": 352 + }, + { + "epoch": 1.412, + "grad_norm": 0.5115591287612915, + "learning_rate": 4.657314629258517e-05, + "loss": 0.4383, + "step": 353 + }, + { + "epoch": 1.416, + "grad_norm": 0.4922471344470978, + "learning_rate": 4.6563126252505013e-05, + "loss": 0.4566, + "step": 354 + }, + { + "epoch": 1.42, + "grad_norm": 0.49494296312332153, + "learning_rate": 4.6553106212424855e-05, + "loss": 0.4328, + "step": 355 + }, + { + "epoch": 1.424, + "grad_norm": 0.6130332946777344, + "learning_rate": 4.654308617234469e-05, + "loss": 0.4885, + "step": 356 + }, + { + "epoch": 1.428, + "grad_norm": 0.5084357857704163, + "learning_rate": 4.653306613226453e-05, + "loss": 0.4918, + "step": 357 + }, + { + "epoch": 1.432, + "grad_norm": 0.5193467140197754, + "learning_rate": 4.652304609218437e-05, + "loss": 0.4379, + "step": 358 + }, + { + "epoch": 1.436, + "grad_norm": 0.5419803261756897, + "learning_rate": 4.6513026052104206e-05, + "loss": 0.444, + "step": 359 + }, + { + "epoch": 1.44, + "grad_norm": 0.5299074649810791, + "learning_rate": 4.6503006012024054e-05, + "loss": 0.4716, + "step": 360 + }, + { + "epoch": 1.444, + "grad_norm": 1.1689682006835938, + "learning_rate": 4.649298597194389e-05, + "loss": 0.2592, + "step": 361 + }, + { + "epoch": 1.448, + "grad_norm": 0.532464861869812, + "learning_rate": 4.648296593186373e-05, + "loss": 0.4572, + "step": 362 + }, + { + "epoch": 1.452, + "grad_norm": 0.4852958023548126, + "learning_rate": 4.647294589178357e-05, + "loss": 0.4132, + "step": 363 + }, + { + "epoch": 1.456, + "grad_norm": 0.4912414848804474, + "learning_rate": 4.646292585170341e-05, + "loss": 0.5046, + "step": 364 + }, + { + "epoch": 1.46, + "grad_norm": 0.5567641854286194, + "learning_rate": 
4.645290581162325e-05, + "loss": 0.4648, + "step": 365 + }, + { + "epoch": 1.464, + "grad_norm": 0.5151598453521729, + "learning_rate": 4.644288577154309e-05, + "loss": 0.4677, + "step": 366 + }, + { + "epoch": 1.468, + "grad_norm": 0.5214430093765259, + "learning_rate": 4.643286573146293e-05, + "loss": 0.4396, + "step": 367 + }, + { + "epoch": 1.472, + "grad_norm": 0.5087822079658508, + "learning_rate": 4.6422845691382764e-05, + "loss": 0.4185, + "step": 368 + }, + { + "epoch": 1.476, + "grad_norm": 0.49884939193725586, + "learning_rate": 4.6412825651302606e-05, + "loss": 0.427, + "step": 369 + }, + { + "epoch": 1.48, + "grad_norm": 0.5047423243522644, + "learning_rate": 4.640280561122245e-05, + "loss": 0.4682, + "step": 370 + }, + { + "epoch": 1.484, + "grad_norm": 0.5239180326461792, + "learning_rate": 4.639278557114229e-05, + "loss": 0.4701, + "step": 371 + }, + { + "epoch": 1.488, + "grad_norm": 0.48768556118011475, + "learning_rate": 4.638276553106213e-05, + "loss": 0.4158, + "step": 372 + }, + { + "epoch": 1.492, + "grad_norm": 0.5886668562889099, + "learning_rate": 4.6372745490981964e-05, + "loss": 0.4679, + "step": 373 + }, + { + "epoch": 1.496, + "grad_norm": 0.5226956009864807, + "learning_rate": 4.6362725450901805e-05, + "loss": 0.4523, + "step": 374 + }, + { + "epoch": 1.5, + "grad_norm": 0.5257872939109802, + "learning_rate": 4.6352705410821647e-05, + "loss": 0.4399, + "step": 375 + }, + { + "epoch": 1.504, + "grad_norm": 0.6213669180870056, + "learning_rate": 4.634268537074148e-05, + "loss": 0.4535, + "step": 376 + }, + { + "epoch": 1.508, + "grad_norm": 0.49080657958984375, + "learning_rate": 4.633266533066132e-05, + "loss": 0.4379, + "step": 377 + }, + { + "epoch": 1.512, + "grad_norm": 0.6423617005348206, + "learning_rate": 4.6322645290581164e-05, + "loss": 0.5559, + "step": 378 + }, + { + "epoch": 1.516, + "grad_norm": 0.6420733332633972, + "learning_rate": 4.6312625250501005e-05, + "loss": 0.4543, + "step": 379 + }, + { + "epoch": 1.52, + 
"grad_norm": 0.5223842859268188, + "learning_rate": 4.6302605210420846e-05, + "loss": 0.466, + "step": 380 + }, + { + "epoch": 1.524, + "grad_norm": 0.4663429856300354, + "learning_rate": 4.629258517034069e-05, + "loss": 0.4153, + "step": 381 + }, + { + "epoch": 1.528, + "grad_norm": 0.5308319926261902, + "learning_rate": 4.628256513026052e-05, + "loss": 0.4136, + "step": 382 + }, + { + "epoch": 1.532, + "grad_norm": 0.48709505796432495, + "learning_rate": 4.627254509018036e-05, + "loss": 0.4478, + "step": 383 + }, + { + "epoch": 1.536, + "grad_norm": 0.4852888584136963, + "learning_rate": 4.6262525050100205e-05, + "loss": 0.3736, + "step": 384 + }, + { + "epoch": 1.54, + "grad_norm": 0.510391116142273, + "learning_rate": 4.625250501002004e-05, + "loss": 0.4505, + "step": 385 + }, + { + "epoch": 1.544, + "grad_norm": 0.48822277784347534, + "learning_rate": 4.624248496993988e-05, + "loss": 0.384, + "step": 386 + }, + { + "epoch": 1.548, + "grad_norm": 0.5103605389595032, + "learning_rate": 4.623246492985972e-05, + "loss": 0.3747, + "step": 387 + }, + { + "epoch": 1.552, + "grad_norm": 0.5321424603462219, + "learning_rate": 4.6222444889779556e-05, + "loss": 0.4732, + "step": 388 + }, + { + "epoch": 1.556, + "grad_norm": 0.5756722092628479, + "learning_rate": 4.62124248496994e-05, + "loss": 0.4735, + "step": 389 + }, + { + "epoch": 1.56, + "grad_norm": 0.5002274513244629, + "learning_rate": 4.6202404809619245e-05, + "loss": 0.4149, + "step": 390 + }, + { + "epoch": 1.564, + "grad_norm": 0.5540143847465515, + "learning_rate": 4.619238476953908e-05, + "loss": 0.4388, + "step": 391 + }, + { + "epoch": 1.568, + "grad_norm": 0.5765349864959717, + "learning_rate": 4.618236472945892e-05, + "loss": 0.5177, + "step": 392 + }, + { + "epoch": 1.572, + "grad_norm": 0.5519753694534302, + "learning_rate": 4.617234468937876e-05, + "loss": 0.4497, + "step": 393 + }, + { + "epoch": 1.576, + "grad_norm": 0.5800144672393799, + "learning_rate": 4.61623246492986e-05, + "loss": 0.448, + 
"step": 394 + }, + { + "epoch": 1.58, + "grad_norm": 0.5450941920280457, + "learning_rate": 4.615230460921844e-05, + "loss": 0.4368, + "step": 395 + }, + { + "epoch": 1.584, + "grad_norm": 0.50678551197052, + "learning_rate": 4.614228456913828e-05, + "loss": 0.4496, + "step": 396 + }, + { + "epoch": 1.588, + "grad_norm": 0.48308175802230835, + "learning_rate": 4.6132264529058114e-05, + "loss": 0.3646, + "step": 397 + }, + { + "epoch": 1.592, + "grad_norm": 0.5025294423103333, + "learning_rate": 4.6122244488977955e-05, + "loss": 0.4331, + "step": 398 + }, + { + "epoch": 1.596, + "grad_norm": 0.4999145567417145, + "learning_rate": 4.61122244488978e-05, + "loss": 0.3968, + "step": 399 + }, + { + "epoch": 1.6, + "grad_norm": 0.5278040170669556, + "learning_rate": 4.610220440881764e-05, + "loss": 0.3938, + "step": 400 + }, + { + "epoch": 1.604, + "grad_norm": 0.5218545198440552, + "learning_rate": 4.609218436873748e-05, + "loss": 0.4465, + "step": 401 + }, + { + "epoch": 1.608, + "grad_norm": 0.5310875177383423, + "learning_rate": 4.608216432865732e-05, + "loss": 0.4094, + "step": 402 + }, + { + "epoch": 1.612, + "grad_norm": 0.47267839312553406, + "learning_rate": 4.6072144288577155e-05, + "loss": 0.4233, + "step": 403 + }, + { + "epoch": 1.616, + "grad_norm": 0.9498124122619629, + "learning_rate": 4.6062124248496996e-05, + "loss": 0.4357, + "step": 404 + }, + { + "epoch": 1.62, + "grad_norm": 0.46646052598953247, + "learning_rate": 4.605210420841684e-05, + "loss": 0.4146, + "step": 405 + }, + { + "epoch": 1.624, + "grad_norm": 1.2247719764709473, + "learning_rate": 4.604208416833667e-05, + "loss": 0.3475, + "step": 406 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 0.5586639642715454, + "learning_rate": 4.603206412825651e-05, + "loss": 0.444, + "step": 407 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.552677571773529, + "learning_rate": 4.6022044088176355e-05, + "loss": 0.4234, + "step": 408 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 
0.5379041433334351, + "learning_rate": 4.6012024048096196e-05, + "loss": 0.4755, + "step": 409 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.5361511707305908, + "learning_rate": 4.600200400801604e-05, + "loss": 0.3844, + "step": 410 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 0.5484291315078735, + "learning_rate": 4.599198396793588e-05, + "loss": 0.4237, + "step": 411 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.47265616059303284, + "learning_rate": 4.598196392785571e-05, + "loss": 0.4055, + "step": 412 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 0.586786150932312, + "learning_rate": 4.5971943887775554e-05, + "loss": 0.5109, + "step": 413 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.4930340051651001, + "learning_rate": 4.5961923847695396e-05, + "loss": 0.3967, + "step": 414 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 0.5312355160713196, + "learning_rate": 4.595190380761523e-05, + "loss": 0.4881, + "step": 415 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.5358534455299377, + "learning_rate": 4.594188376753507e-05, + "loss": 0.4785, + "step": 416 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 0.5466772317886353, + "learning_rate": 4.593186372745491e-05, + "loss": 0.4695, + "step": 417 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.5252442359924316, + "learning_rate": 4.592184368737475e-05, + "loss": 0.4647, + "step": 418 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 0.6336548924446106, + "learning_rate": 4.5911823647294595e-05, + "loss": 0.4535, + "step": 419 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.5073897838592529, + "learning_rate": 4.590180360721443e-05, + "loss": 0.4622, + "step": 420 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 0.5029900074005127, + "learning_rate": 4.589178356713427e-05, + "loss": 0.4221, + "step": 421 + }, + { + "epoch": 1.688, + "grad_norm": 0.590262234210968, + "learning_rate": 4.588176352705411e-05, + "loss": 
0.4097, + "step": 422 + }, + { + "epoch": 1.692, + "grad_norm": 0.5236296653747559, + "learning_rate": 4.5871743486973954e-05, + "loss": 0.4445, + "step": 423 + }, + { + "epoch": 1.696, + "grad_norm": 0.5591893792152405, + "learning_rate": 4.586172344689379e-05, + "loss": 0.4016, + "step": 424 + }, + { + "epoch": 1.7, + "grad_norm": 0.48660945892333984, + "learning_rate": 4.585170340681363e-05, + "loss": 0.3968, + "step": 425 + }, + { + "epoch": 1.704, + "grad_norm": 1.2732222080230713, + "learning_rate": 4.584168336673347e-05, + "loss": 0.2928, + "step": 426 + }, + { + "epoch": 1.708, + "grad_norm": 0.5137063264846802, + "learning_rate": 4.5831663326653305e-05, + "loss": 0.4274, + "step": 427 + }, + { + "epoch": 1.712, + "grad_norm": 0.532532811164856, + "learning_rate": 4.5821643286573146e-05, + "loss": 0.443, + "step": 428 + }, + { + "epoch": 1.716, + "grad_norm": 0.7271252870559692, + "learning_rate": 4.581162324649299e-05, + "loss": 0.5061, + "step": 429 + }, + { + "epoch": 1.72, + "grad_norm": 0.49782320857048035, + "learning_rate": 4.580160320641283e-05, + "loss": 0.4603, + "step": 430 + }, + { + "epoch": 1.724, + "grad_norm": 0.49443551898002625, + "learning_rate": 4.579158316633267e-05, + "loss": 0.3841, + "step": 431 + }, + { + "epoch": 1.728, + "grad_norm": 0.5045324563980103, + "learning_rate": 4.5781563126252505e-05, + "loss": 0.3932, + "step": 432 + }, + { + "epoch": 1.732, + "grad_norm": 0.5934986472129822, + "learning_rate": 4.5771543086172346e-05, + "loss": 0.5067, + "step": 433 + }, + { + "epoch": 1.736, + "grad_norm": 0.5429759621620178, + "learning_rate": 4.576152304609219e-05, + "loss": 0.3985, + "step": 434 + }, + { + "epoch": 1.74, + "grad_norm": 0.6646963357925415, + "learning_rate": 4.575150300601203e-05, + "loss": 0.4736, + "step": 435 + }, + { + "epoch": 1.744, + "grad_norm": 0.5723690986633301, + "learning_rate": 4.574148296593186e-05, + "loss": 0.4137, + "step": 436 + }, + { + "epoch": 1.748, + "grad_norm": 0.5194367170333862, + 
"learning_rate": 4.5731462925851704e-05, + "loss": 0.3921, + "step": 437 + }, + { + "epoch": 1.752, + "grad_norm": 0.510465145111084, + "learning_rate": 4.5721442885771546e-05, + "loss": 0.4055, + "step": 438 + }, + { + "epoch": 1.756, + "grad_norm": 0.6061939597129822, + "learning_rate": 4.571142284569139e-05, + "loss": 0.4404, + "step": 439 + }, + { + "epoch": 1.76, + "grad_norm": 0.5550720691680908, + "learning_rate": 4.570140280561123e-05, + "loss": 0.4366, + "step": 440 + }, + { + "epoch": 1.764, + "grad_norm": 0.5832468867301941, + "learning_rate": 4.569138276553106e-05, + "loss": 0.4176, + "step": 441 + }, + { + "epoch": 1.768, + "grad_norm": 0.5731306672096252, + "learning_rate": 4.5681362725450904e-05, + "loss": 0.3685, + "step": 442 + }, + { + "epoch": 1.772, + "grad_norm": 0.6895073056221008, + "learning_rate": 4.5671342685370745e-05, + "loss": 0.4906, + "step": 443 + }, + { + "epoch": 1.776, + "grad_norm": 0.6335456967353821, + "learning_rate": 4.566132264529058e-05, + "loss": 0.4004, + "step": 444 + }, + { + "epoch": 1.78, + "grad_norm": 0.5286179780960083, + "learning_rate": 4.565130260521042e-05, + "loss": 0.409, + "step": 445 + }, + { + "epoch": 1.784, + "grad_norm": 0.626899003982544, + "learning_rate": 4.564128256513026e-05, + "loss": 0.4522, + "step": 446 + }, + { + "epoch": 1.788, + "grad_norm": 0.5576806664466858, + "learning_rate": 4.56312625250501e-05, + "loss": 0.4494, + "step": 447 + }, + { + "epoch": 1.792, + "grad_norm": 0.5164839625358582, + "learning_rate": 4.562124248496994e-05, + "loss": 0.4463, + "step": 448 + }, + { + "epoch": 1.796, + "grad_norm": 0.5332900881767273, + "learning_rate": 4.5611222444889786e-05, + "loss": 0.4454, + "step": 449 + }, + { + "epoch": 1.8, + "grad_norm": 0.6575366258621216, + "learning_rate": 4.560120240480962e-05, + "loss": 0.4384, + "step": 450 + }, + { + "epoch": 1.804, + "grad_norm": 0.571654200553894, + "learning_rate": 4.559118236472946e-05, + "loss": 0.4229, + "step": 451 + }, + { + "epoch": 1.808, 
+ "grad_norm": 0.5586163997650146, + "learning_rate": 4.55811623246493e-05, + "loss": 0.4484, + "step": 452 + }, + { + "epoch": 1.812, + "grad_norm": 0.5708151459693909, + "learning_rate": 4.557114228456914e-05, + "loss": 0.4247, + "step": 453 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.5310369729995728, + "learning_rate": 4.556112224448898e-05, + "loss": 0.4246, + "step": 454 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 0.5038295984268188, + "learning_rate": 4.555110220440882e-05, + "loss": 0.3868, + "step": 455 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.6038540005683899, + "learning_rate": 4.5541082164328655e-05, + "loss": 0.5129, + "step": 456 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 0.6133531928062439, + "learning_rate": 4.5531062124248496e-05, + "loss": 0.4609, + "step": 457 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6467635035514832, + "learning_rate": 4.5521042084168344e-05, + "loss": 0.4598, + "step": 458 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 0.5676400661468506, + "learning_rate": 4.551102204408818e-05, + "loss": 0.3724, + "step": 459 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 3.835160732269287, + "learning_rate": 4.550100200400802e-05, + "loss": 0.3293, + "step": 460 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 0.5771289467811584, + "learning_rate": 4.549098196392786e-05, + "loss": 0.4299, + "step": 461 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.5256178379058838, + "learning_rate": 4.5480961923847696e-05, + "loss": 0.3774, + "step": 462 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 0.5819401741027832, + "learning_rate": 4.547094188376754e-05, + "loss": 0.4542, + "step": 463 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.5631145238876343, + "learning_rate": 4.546092184368738e-05, + "loss": 0.4326, + "step": 464 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 0.5169239044189453, + "learning_rate": 
4.545090180360721e-05, + "loss": 0.3941, + "step": 465 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.5691187381744385, + "learning_rate": 4.5440881763527054e-05, + "loss": 0.483, + "step": 466 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 0.5725812911987305, + "learning_rate": 4.5430861723446895e-05, + "loss": 0.448, + "step": 467 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.5266627669334412, + "learning_rate": 4.542084168336674e-05, + "loss": 0.4355, + "step": 468 + }, + { + "epoch": 1.876, + "grad_norm": 0.4844062924385071, + "learning_rate": 4.541082164328658e-05, + "loss": 0.4057, + "step": 469 + }, + { + "epoch": 1.88, + "grad_norm": 0.5010110139846802, + "learning_rate": 4.540080160320642e-05, + "loss": 0.4314, + "step": 470 + }, + { + "epoch": 1.884, + "grad_norm": 0.5683146715164185, + "learning_rate": 4.5390781563126254e-05, + "loss": 0.4376, + "step": 471 + }, + { + "epoch": 1.888, + "grad_norm": 0.535389244556427, + "learning_rate": 4.5380761523046095e-05, + "loss": 0.4047, + "step": 472 + }, + { + "epoch": 1.892, + "grad_norm": 0.49679070711135864, + "learning_rate": 4.5370741482965936e-05, + "loss": 0.4082, + "step": 473 + }, + { + "epoch": 1.896, + "grad_norm": 0.527479887008667, + "learning_rate": 4.536072144288577e-05, + "loss": 0.4829, + "step": 474 + }, + { + "epoch": 1.9, + "grad_norm": 0.5545944571495056, + "learning_rate": 4.535070140280561e-05, + "loss": 0.4503, + "step": 475 + }, + { + "epoch": 1.904, + "grad_norm": 0.5213431119918823, + "learning_rate": 4.5340681362725453e-05, + "loss": 0.3898, + "step": 476 + }, + { + "epoch": 1.908, + "grad_norm": 0.5331659317016602, + "learning_rate": 4.533066132264529e-05, + "loss": 0.4274, + "step": 477 + }, + { + "epoch": 1.912, + "grad_norm": 0.4619419574737549, + "learning_rate": 4.5320641282565136e-05, + "loss": 0.3848, + "step": 478 + }, + { + "epoch": 1.916, + "grad_norm": 0.5063828825950623, + "learning_rate": 4.531062124248498e-05, + "loss": 0.4625, + "step": 479 
+ }, + { + "epoch": 1.92, + "grad_norm": 0.4973587095737457, + "learning_rate": 4.530060120240481e-05, + "loss": 0.4274, + "step": 480 + }, + { + "epoch": 1.924, + "grad_norm": 0.49490657448768616, + "learning_rate": 4.529058116232465e-05, + "loss": 0.4322, + "step": 481 + }, + { + "epoch": 1.928, + "grad_norm": 4.3649067878723145, + "learning_rate": 4.5280561122244494e-05, + "loss": 0.4085, + "step": 482 + }, + { + "epoch": 1.932, + "grad_norm": 0.5393351316452026, + "learning_rate": 4.527054108216433e-05, + "loss": 0.4309, + "step": 483 + }, + { + "epoch": 1.936, + "grad_norm": 0.6917522549629211, + "learning_rate": 4.526052104208417e-05, + "loss": 0.268, + "step": 484 + }, + { + "epoch": 1.94, + "grad_norm": 0.5249350666999817, + "learning_rate": 4.525050100200401e-05, + "loss": 0.4548, + "step": 485 + }, + { + "epoch": 1.944, + "grad_norm": 0.5986626148223877, + "learning_rate": 4.5240480961923846e-05, + "loss": 0.4347, + "step": 486 + }, + { + "epoch": 1.948, + "grad_norm": 0.6276787519454956, + "learning_rate": 4.523046092184369e-05, + "loss": 0.5127, + "step": 487 + }, + { + "epoch": 1.952, + "grad_norm": 0.5993108749389648, + "learning_rate": 4.522044088176353e-05, + "loss": 0.4559, + "step": 488 + }, + { + "epoch": 1.956, + "grad_norm": 0.4931926131248474, + "learning_rate": 4.521042084168337e-05, + "loss": 0.4144, + "step": 489 + }, + { + "epoch": 1.96, + "grad_norm": 0.482890784740448, + "learning_rate": 4.520040080160321e-05, + "loss": 0.2603, + "step": 490 + }, + { + "epoch": 1.964, + "grad_norm": 0.6805306077003479, + "learning_rate": 4.519038076152305e-05, + "loss": 0.5272, + "step": 491 + }, + { + "epoch": 1.968, + "grad_norm": 0.6288460493087769, + "learning_rate": 4.518036072144289e-05, + "loss": 0.4455, + "step": 492 + }, + { + "epoch": 1.972, + "grad_norm": 0.5547501444816589, + "learning_rate": 4.517034068136273e-05, + "loss": 0.4432, + "step": 493 + }, + { + "epoch": 1.976, + "grad_norm": 0.545608639717102, + "learning_rate": 
4.516032064128257e-05, + "loss": 0.4219, + "step": 494 + }, + { + "epoch": 1.98, + "grad_norm": 0.5442771911621094, + "learning_rate": 4.5150300601202404e-05, + "loss": 0.4113, + "step": 495 + }, + { + "epoch": 1.984, + "grad_norm": 0.5810039043426514, + "learning_rate": 4.5140280561122245e-05, + "loss": 0.4304, + "step": 496 + }, + { + "epoch": 1.988, + "grad_norm": 0.5679133534431458, + "learning_rate": 4.5130260521042086e-05, + "loss": 0.3819, + "step": 497 + }, + { + "epoch": 1.992, + "grad_norm": 0.5476593375205994, + "learning_rate": 4.512024048096193e-05, + "loss": 0.4566, + "step": 498 + }, + { + "epoch": 1.996, + "grad_norm": 0.516356348991394, + "learning_rate": 4.511022044088177e-05, + "loss": 0.427, + "step": 499 + }, + { + "epoch": 2.0, + "grad_norm": 0.5612138509750366, + "learning_rate": 4.5100200400801604e-05, + "loss": 0.4089, + "step": 500 + }, + { + "epoch": 2.004, + "grad_norm": 0.5504969358444214, + "learning_rate": 4.5090180360721445e-05, + "loss": 0.3998, + "step": 501 + }, + { + "epoch": 2.008, + "grad_norm": 0.5878968238830566, + "learning_rate": 4.5080160320641286e-05, + "loss": 0.3539, + "step": 502 + }, + { + "epoch": 2.012, + "grad_norm": 0.5411523580551147, + "learning_rate": 4.507014028056112e-05, + "loss": 0.3457, + "step": 503 + }, + { + "epoch": 2.016, + "grad_norm": 0.4899858236312866, + "learning_rate": 4.506012024048096e-05, + "loss": 0.3602, + "step": 504 + }, + { + "epoch": 2.02, + "grad_norm": 0.5289257764816284, + "learning_rate": 4.50501002004008e-05, + "loss": 0.4093, + "step": 505 + }, + { + "epoch": 2.024, + "grad_norm": 0.5221056938171387, + "learning_rate": 4.5040080160320644e-05, + "loss": 0.3818, + "step": 506 + }, + { + "epoch": 2.028, + "grad_norm": 0.5114055871963501, + "learning_rate": 4.503006012024048e-05, + "loss": 0.3801, + "step": 507 + }, + { + "epoch": 2.032, + "grad_norm": 0.5133730173110962, + "learning_rate": 4.502004008016033e-05, + "loss": 0.3916, + "step": 508 + }, + { + "epoch": 2.036, + 
"grad_norm": 0.6223206520080566, + "learning_rate": 4.501002004008016e-05, + "loss": 0.3904, + "step": 509 + }, + { + "epoch": 2.04, + "grad_norm": 0.597932755947113, + "learning_rate": 4.5e-05, + "loss": 0.3352, + "step": 510 + }, + { + "epoch": 2.044, + "grad_norm": 0.6192795038223267, + "learning_rate": 4.4989979959919844e-05, + "loss": 0.3788, + "step": 511 + }, + { + "epoch": 2.048, + "grad_norm": 0.5938431024551392, + "learning_rate": 4.497995991983968e-05, + "loss": 0.3946, + "step": 512 + }, + { + "epoch": 2.052, + "grad_norm": 0.5885916352272034, + "learning_rate": 4.496993987975952e-05, + "loss": 0.3706, + "step": 513 + }, + { + "epoch": 2.056, + "grad_norm": 0.6078701615333557, + "learning_rate": 4.495991983967936e-05, + "loss": 0.3561, + "step": 514 + }, + { + "epoch": 2.06, + "grad_norm": 0.6931913495063782, + "learning_rate": 4.4949899799599196e-05, + "loss": 0.3697, + "step": 515 + }, + { + "epoch": 2.064, + "grad_norm": 0.6855788230895996, + "learning_rate": 4.493987975951904e-05, + "loss": 0.4235, + "step": 516 + }, + { + "epoch": 2.068, + "grad_norm": 0.5608630180358887, + "learning_rate": 4.4929859719438885e-05, + "loss": 0.3561, + "step": 517 + }, + { + "epoch": 2.072, + "grad_norm": 0.6403824687004089, + "learning_rate": 4.491983967935872e-05, + "loss": 0.3701, + "step": 518 + }, + { + "epoch": 2.076, + "grad_norm": 0.5921077728271484, + "learning_rate": 4.490981963927856e-05, + "loss": 0.36, + "step": 519 + }, + { + "epoch": 2.08, + "grad_norm": 0.6234927773475647, + "learning_rate": 4.48997995991984e-05, + "loss": 0.4041, + "step": 520 + }, + { + "epoch": 2.084, + "grad_norm": 0.6670336723327637, + "learning_rate": 4.488977955911824e-05, + "loss": 0.3733, + "step": 521 + }, + { + "epoch": 2.088, + "grad_norm": 0.6524705290794373, + "learning_rate": 4.487975951903808e-05, + "loss": 0.344, + "step": 522 + }, + { + "epoch": 2.092, + "grad_norm": 0.6538939476013184, + "learning_rate": 4.486973947895792e-05, + "loss": 0.4209, + "step": 523 + }, + 
{ + "epoch": 2.096, + "grad_norm": 0.6096150875091553, + "learning_rate": 4.4859719438877754e-05, + "loss": 0.3739, + "step": 524 + }, + { + "epoch": 2.1, + "grad_norm": 0.5730370879173279, + "learning_rate": 4.4849699398797595e-05, + "loss": 0.3912, + "step": 525 + }, + { + "epoch": 2.104, + "grad_norm": 0.6088821887969971, + "learning_rate": 4.4839679358717436e-05, + "loss": 0.3633, + "step": 526 + }, + { + "epoch": 2.108, + "grad_norm": 0.9573196768760681, + "learning_rate": 4.482965931863728e-05, + "loss": 0.2372, + "step": 527 + }, + { + "epoch": 2.112, + "grad_norm": 0.6470226049423218, + "learning_rate": 4.481963927855712e-05, + "loss": 0.369, + "step": 528 + }, + { + "epoch": 2.116, + "grad_norm": 0.6785104274749756, + "learning_rate": 4.480961923847696e-05, + "loss": 0.4002, + "step": 529 + }, + { + "epoch": 2.12, + "grad_norm": 0.46951213479042053, + "learning_rate": 4.4799599198396795e-05, + "loss": 0.2299, + "step": 530 + }, + { + "epoch": 2.124, + "grad_norm": 0.6221892237663269, + "learning_rate": 4.4789579158316636e-05, + "loss": 0.3324, + "step": 531 + }, + { + "epoch": 2.128, + "grad_norm": 0.6120625138282776, + "learning_rate": 4.477955911823648e-05, + "loss": 0.3473, + "step": 532 + }, + { + "epoch": 2.132, + "grad_norm": 0.6631773710250854, + "learning_rate": 4.476953907815631e-05, + "loss": 0.3919, + "step": 533 + }, + { + "epoch": 2.136, + "grad_norm": 0.6568106412887573, + "learning_rate": 4.475951903807615e-05, + "loss": 0.223, + "step": 534 + }, + { + "epoch": 2.14, + "grad_norm": 0.6405401825904846, + "learning_rate": 4.4749498997995994e-05, + "loss": 0.3438, + "step": 535 + }, + { + "epoch": 2.144, + "grad_norm": 0.7879334092140198, + "learning_rate": 4.473947895791583e-05, + "loss": 0.3801, + "step": 536 + }, + { + "epoch": 2.148, + "grad_norm": 0.6276340484619141, + "learning_rate": 4.472945891783568e-05, + "loss": 0.365, + "step": 537 + }, + { + "epoch": 2.152, + "grad_norm": 0.7515552043914795, + "learning_rate": 
4.471943887775552e-05, + "loss": 0.3993, + "step": 538 + }, + { + "epoch": 2.156, + "grad_norm": 0.6806215643882751, + "learning_rate": 4.470941883767535e-05, + "loss": 0.3654, + "step": 539 + }, + { + "epoch": 2.16, + "grad_norm": 0.6749470829963684, + "learning_rate": 4.4699398797595194e-05, + "loss": 0.3565, + "step": 540 + }, + { + "epoch": 2.164, + "grad_norm": 0.8084760308265686, + "learning_rate": 4.4689378757515035e-05, + "loss": 0.4149, + "step": 541 + }, + { + "epoch": 2.168, + "grad_norm": 0.6896063089370728, + "learning_rate": 4.467935871743487e-05, + "loss": 0.3676, + "step": 542 + }, + { + "epoch": 2.172, + "grad_norm": 0.665913999080658, + "learning_rate": 4.466933867735471e-05, + "loss": 0.3514, + "step": 543 + }, + { + "epoch": 2.176, + "grad_norm": 0.6971928477287292, + "learning_rate": 4.465931863727455e-05, + "loss": 0.4026, + "step": 544 + }, + { + "epoch": 2.18, + "grad_norm": 0.752924919128418, + "learning_rate": 4.464929859719439e-05, + "loss": 0.3677, + "step": 545 + }, + { + "epoch": 2.184, + "grad_norm": 0.6077944040298462, + "learning_rate": 4.463927855711423e-05, + "loss": 0.3712, + "step": 546 + }, + { + "epoch": 2.188, + "grad_norm": 0.6650569438934326, + "learning_rate": 4.462925851703407e-05, + "loss": 0.3999, + "step": 547 + }, + { + "epoch": 2.192, + "grad_norm": 0.6830900311470032, + "learning_rate": 4.461923847695391e-05, + "loss": 0.3617, + "step": 548 + }, + { + "epoch": 2.196, + "grad_norm": 0.6619176268577576, + "learning_rate": 4.460921843687375e-05, + "loss": 0.3707, + "step": 549 + }, + { + "epoch": 2.2, + "grad_norm": 0.6596035361289978, + "learning_rate": 4.459919839679359e-05, + "loss": 0.4169, + "step": 550 + }, + { + "epoch": 2.204, + "grad_norm": 0.636611819267273, + "learning_rate": 4.458917835671343e-05, + "loss": 0.3806, + "step": 551 + }, + { + "epoch": 2.208, + "grad_norm": 0.6402906179428101, + "learning_rate": 4.457915831663327e-05, + "loss": 0.3406, + "step": 552 + }, + { + "epoch": 2.212, + "grad_norm": 
0.6241163015365601, + "learning_rate": 4.456913827655311e-05, + "loss": 0.3814, + "step": 553 + }, + { + "epoch": 2.216, + "grad_norm": 0.73136305809021, + "learning_rate": 4.4559118236472945e-05, + "loss": 0.3599, + "step": 554 + }, + { + "epoch": 2.22, + "grad_norm": 0.9286080002784729, + "learning_rate": 4.4549098196392786e-05, + "loss": 0.3629, + "step": 555 + }, + { + "epoch": 2.224, + "grad_norm": 0.6749750375747681, + "learning_rate": 4.453907815631263e-05, + "loss": 0.3829, + "step": 556 + }, + { + "epoch": 2.228, + "grad_norm": 0.6232475638389587, + "learning_rate": 4.452905811623247e-05, + "loss": 0.3816, + "step": 557 + }, + { + "epoch": 2.232, + "grad_norm": 0.6914759874343872, + "learning_rate": 4.451903807615231e-05, + "loss": 0.383, + "step": 558 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 0.786461591720581, + "learning_rate": 4.4509018036072144e-05, + "loss": 0.3644, + "step": 559 + }, + { + "epoch": 2.24, + "grad_norm": 0.7495032548904419, + "learning_rate": 4.4498997995991986e-05, + "loss": 0.3818, + "step": 560 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 0.7083197236061096, + "learning_rate": 4.448897795591183e-05, + "loss": 0.3574, + "step": 561 + }, + { + "epoch": 2.248, + "grad_norm": 0.6997913718223572, + "learning_rate": 4.447895791583167e-05, + "loss": 0.3473, + "step": 562 + }, + { + "epoch": 2.252, + "grad_norm": 0.7174849510192871, + "learning_rate": 4.44689378757515e-05, + "loss": 0.331, + "step": 563 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 0.6463573575019836, + "learning_rate": 4.4458917835671344e-05, + "loss": 0.3907, + "step": 564 + }, + { + "epoch": 2.26, + "grad_norm": 0.6492186784744263, + "learning_rate": 4.4448897795591185e-05, + "loss": 0.373, + "step": 565 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 0.7330689430236816, + "learning_rate": 4.4438877755511027e-05, + "loss": 0.3704, + "step": 566 + }, + { + "epoch": 2.268, + "grad_norm": 0.7165907621383667, + "learning_rate": 
4.442885771543087e-05, + "loss": 0.4114, + "step": 567 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.7889212369918823, + "learning_rate": 4.44188376753507e-05, + "loss": 0.3837, + "step": 568 + }, + { + "epoch": 2.276, + "grad_norm": 0.7672508358955383, + "learning_rate": 4.4408817635270544e-05, + "loss": 0.3647, + "step": 569 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 0.7716801762580872, + "learning_rate": 4.4398797595190385e-05, + "loss": 0.4272, + "step": 570 + }, + { + "epoch": 2.284, + "grad_norm": 0.6814711689949036, + "learning_rate": 4.438877755511022e-05, + "loss": 0.4071, + "step": 571 + }, + { + "epoch": 2.288, + "grad_norm": 0.7873872518539429, + "learning_rate": 4.437875751503006e-05, + "loss": 0.3599, + "step": 572 + }, + { + "epoch": 2.292, + "grad_norm": 0.6841912865638733, + "learning_rate": 4.43687374749499e-05, + "loss": 0.3494, + "step": 573 + }, + { + "epoch": 2.296, + "grad_norm": 1.1426913738250732, + "learning_rate": 4.4358717434869737e-05, + "loss": 0.2038, + "step": 574 + }, + { + "epoch": 2.3, + "grad_norm": 0.7427853345870972, + "learning_rate": 4.434869739478958e-05, + "loss": 0.4356, + "step": 575 + }, + { + "epoch": 2.304, + "grad_norm": 0.6541914939880371, + "learning_rate": 4.4338677354709426e-05, + "loss": 0.3992, + "step": 576 + }, + { + "epoch": 2.308, + "grad_norm": 0.6165962219238281, + "learning_rate": 4.432865731462926e-05, + "loss": 0.3442, + "step": 577 + }, + { + "epoch": 2.312, + "grad_norm": 0.6729843616485596, + "learning_rate": 4.43186372745491e-05, + "loss": 0.3985, + "step": 578 + }, + { + "epoch": 2.316, + "grad_norm": 0.735693097114563, + "learning_rate": 4.430861723446894e-05, + "loss": 0.3458, + "step": 579 + }, + { + "epoch": 2.32, + "grad_norm": 0.6572169065475464, + "learning_rate": 4.429859719438878e-05, + "loss": 0.3533, + "step": 580 + }, + { + "epoch": 2.324, + "grad_norm": 0.7412426471710205, + "learning_rate": 4.428857715430862e-05, + "loss": 0.3494, + "step": 581 + }, + { + 
"epoch": 2.328, + "grad_norm": 0.7071847319602966, + "learning_rate": 4.427855711422846e-05, + "loss": 0.3647, + "step": 582 + }, + { + "epoch": 2.332, + "grad_norm": 0.8219819068908691, + "learning_rate": 4.4268537074148294e-05, + "loss": 0.4073, + "step": 583 + }, + { + "epoch": 2.336, + "grad_norm": 0.8309367895126343, + "learning_rate": 4.4258517034068136e-05, + "loss": 0.3826, + "step": 584 + }, + { + "epoch": 2.34, + "grad_norm": 0.7580353617668152, + "learning_rate": 4.424849699398798e-05, + "loss": 0.3815, + "step": 585 + }, + { + "epoch": 2.344, + "grad_norm": 0.6513029336929321, + "learning_rate": 4.423847695390782e-05, + "loss": 0.3712, + "step": 586 + }, + { + "epoch": 2.348, + "grad_norm": 0.7023140788078308, + "learning_rate": 4.422845691382766e-05, + "loss": 0.3733, + "step": 587 + }, + { + "epoch": 2.352, + "grad_norm": 0.7600807547569275, + "learning_rate": 4.42184368737475e-05, + "loss": 0.3862, + "step": 588 + }, + { + "epoch": 2.356, + "grad_norm": 0.7764167189598083, + "learning_rate": 4.4208416833667335e-05, + "loss": 0.415, + "step": 589 + }, + { + "epoch": 2.36, + "grad_norm": 0.8243778347969055, + "learning_rate": 4.419839679358718e-05, + "loss": 0.3776, + "step": 590 + }, + { + "epoch": 2.364, + "grad_norm": 0.7651914954185486, + "learning_rate": 4.418837675350702e-05, + "loss": 0.3871, + "step": 591 + }, + { + "epoch": 2.368, + "grad_norm": 0.7026492357254028, + "learning_rate": 4.417835671342685e-05, + "loss": 0.3352, + "step": 592 + }, + { + "epoch": 2.372, + "grad_norm": 0.6480612754821777, + "learning_rate": 4.4168336673346694e-05, + "loss": 0.3574, + "step": 593 + }, + { + "epoch": 2.376, + "grad_norm": 0.6376757621765137, + "learning_rate": 4.4158316633266535e-05, + "loss": 0.3865, + "step": 594 + }, + { + "epoch": 2.38, + "grad_norm": 0.7384195327758789, + "learning_rate": 4.414829659318637e-05, + "loss": 0.346, + "step": 595 + }, + { + "epoch": 2.384, + "grad_norm": 0.7906630635261536, + "learning_rate": 4.413827655310622e-05, + 
"loss": 0.409, + "step": 596 + }, + { + "epoch": 2.388, + "grad_norm": 0.7145755290985107, + "learning_rate": 4.412825651302606e-05, + "loss": 0.3461, + "step": 597 + }, + { + "epoch": 2.392, + "grad_norm": 0.8087967038154602, + "learning_rate": 4.4118236472945893e-05, + "loss": 0.215, + "step": 598 + }, + { + "epoch": 2.396, + "grad_norm": 0.7960618138313293, + "learning_rate": 4.4108216432865735e-05, + "loss": 0.4122, + "step": 599 + }, + { + "epoch": 2.4, + "grad_norm": 0.6827014684677124, + "learning_rate": 4.4098196392785576e-05, + "loss": 0.3964, + "step": 600 + }, + { + "epoch": 2.404, + "grad_norm": 0.7268480062484741, + "learning_rate": 4.408817635270541e-05, + "loss": 0.3557, + "step": 601 + }, + { + "epoch": 2.408, + "grad_norm": 0.7457350492477417, + "learning_rate": 4.407815631262525e-05, + "loss": 0.4288, + "step": 602 + }, + { + "epoch": 2.412, + "grad_norm": 0.7236207127571106, + "learning_rate": 4.406813627254509e-05, + "loss": 0.4079, + "step": 603 + }, + { + "epoch": 2.416, + "grad_norm": 0.6497883200645447, + "learning_rate": 4.405811623246493e-05, + "loss": 0.4348, + "step": 604 + }, + { + "epoch": 2.42, + "grad_norm": 0.7279455065727234, + "learning_rate": 4.404809619238477e-05, + "loss": 0.3534, + "step": 605 + }, + { + "epoch": 2.424, + "grad_norm": 0.7461408972740173, + "learning_rate": 4.403807615230462e-05, + "loss": 0.3619, + "step": 606 + }, + { + "epoch": 2.428, + "grad_norm": 0.7696019411087036, + "learning_rate": 4.402805611222445e-05, + "loss": 0.3546, + "step": 607 + }, + { + "epoch": 2.432, + "grad_norm": 0.7570071816444397, + "learning_rate": 4.401803607214429e-05, + "loss": 0.3496, + "step": 608 + }, + { + "epoch": 2.436, + "grad_norm": 0.6898276805877686, + "learning_rate": 4.4008016032064134e-05, + "loss": 0.3789, + "step": 609 + }, + { + "epoch": 2.44, + "grad_norm": 0.7434217929840088, + "learning_rate": 4.399799599198397e-05, + "loss": 0.3736, + "step": 610 + }, + { + "epoch": 2.444, + "grad_norm": 0.8145261406898499, + 
"learning_rate": 4.398797595190381e-05, + "loss": 0.3516, + "step": 611 + }, + { + "epoch": 2.448, + "grad_norm": 0.7646612524986267, + "learning_rate": 4.397795591182365e-05, + "loss": 0.3867, + "step": 612 + }, + { + "epoch": 2.452, + "grad_norm": 0.9545761942863464, + "learning_rate": 4.3967935871743486e-05, + "loss": 0.3563, + "step": 613 + }, + { + "epoch": 2.456, + "grad_norm": 0.8131189346313477, + "learning_rate": 4.395791583166333e-05, + "loss": 0.389, + "step": 614 + }, + { + "epoch": 2.46, + "grad_norm": 0.7146084308624268, + "learning_rate": 4.394789579158317e-05, + "loss": 0.3727, + "step": 615 + }, + { + "epoch": 2.464, + "grad_norm": 0.7887970209121704, + "learning_rate": 4.393787575150301e-05, + "loss": 0.3596, + "step": 616 + }, + { + "epoch": 2.468, + "grad_norm": 0.7648036479949951, + "learning_rate": 4.392785571142285e-05, + "loss": 0.3388, + "step": 617 + }, + { + "epoch": 2.472, + "grad_norm": 0.80208420753479, + "learning_rate": 4.391783567134269e-05, + "loss": 0.3498, + "step": 618 + }, + { + "epoch": 2.476, + "grad_norm": 0.7335792183876038, + "learning_rate": 4.3907815631262526e-05, + "loss": 0.3678, + "step": 619 + }, + { + "epoch": 2.48, + "grad_norm": 0.7437960505485535, + "learning_rate": 4.389779559118237e-05, + "loss": 0.3697, + "step": 620 + }, + { + "epoch": 2.484, + "grad_norm": 0.687762975692749, + "learning_rate": 4.388777555110221e-05, + "loss": 0.3292, + "step": 621 + }, + { + "epoch": 2.488, + "grad_norm": 0.7212290167808533, + "learning_rate": 4.3877755511022044e-05, + "loss": 0.3707, + "step": 622 + }, + { + "epoch": 2.492, + "grad_norm": 0.6740947365760803, + "learning_rate": 4.3867735470941885e-05, + "loss": 0.33, + "step": 623 + }, + { + "epoch": 2.496, + "grad_norm": 0.7523700594902039, + "learning_rate": 4.3857715430861726e-05, + "loss": 0.348, + "step": 624 + }, + { + "epoch": 2.5, + "grad_norm": 0.8317469954490662, + "learning_rate": 4.384769539078157e-05, + "loss": 0.4042, + "step": 625 + }, + { + "epoch": 2.504, + 
"grad_norm": 0.8676090836524963, + "learning_rate": 4.383767535070141e-05, + "loss": 0.3682, + "step": 626 + }, + { + "epoch": 2.508, + "grad_norm": 0.7623687386512756, + "learning_rate": 4.382765531062124e-05, + "loss": 0.4103, + "step": 627 + }, + { + "epoch": 2.512, + "grad_norm": 0.756071925163269, + "learning_rate": 4.3817635270541084e-05, + "loss": 0.3673, + "step": 628 + }, + { + "epoch": 2.516, + "grad_norm": 0.7560201287269592, + "learning_rate": 4.3807615230460926e-05, + "loss": 0.3961, + "step": 629 + }, + { + "epoch": 2.52, + "grad_norm": 0.8405265212059021, + "learning_rate": 4.379759519038076e-05, + "loss": 0.3891, + "step": 630 + }, + { + "epoch": 2.524, + "grad_norm": 0.7683942317962646, + "learning_rate": 4.37875751503006e-05, + "loss": 0.4056, + "step": 631 + }, + { + "epoch": 2.528, + "grad_norm": 0.7256227135658264, + "learning_rate": 4.377755511022044e-05, + "loss": 0.3555, + "step": 632 + }, + { + "epoch": 2.532, + "grad_norm": 0.8724226355552673, + "learning_rate": 4.3767535070140284e-05, + "loss": 0.4297, + "step": 633 + }, + { + "epoch": 2.536, + "grad_norm": 0.8041679263114929, + "learning_rate": 4.375751503006012e-05, + "loss": 0.3846, + "step": 634 + }, + { + "epoch": 2.54, + "grad_norm": 0.7637845873832703, + "learning_rate": 4.374749498997997e-05, + "loss": 0.3578, + "step": 635 + }, + { + "epoch": 2.544, + "grad_norm": 0.7199541330337524, + "learning_rate": 4.37374749498998e-05, + "loss": 0.3484, + "step": 636 + }, + { + "epoch": 2.548, + "grad_norm": 0.7177748680114746, + "learning_rate": 4.372745490981964e-05, + "loss": 0.341, + "step": 637 + }, + { + "epoch": 2.552, + "grad_norm": 0.683948814868927, + "learning_rate": 4.3717434869739484e-05, + "loss": 0.3593, + "step": 638 + }, + { + "epoch": 2.556, + "grad_norm": 0.7331079840660095, + "learning_rate": 4.370741482965932e-05, + "loss": 0.3521, + "step": 639 + }, + { + "epoch": 2.56, + "grad_norm": 0.7923868298530579, + "learning_rate": 4.369739478957916e-05, + "loss": 0.3775, + 
"step": 640 + }, + { + "epoch": 2.564, + "grad_norm": 0.8066992163658142, + "learning_rate": 4.3687374749499e-05, + "loss": 0.362, + "step": 641 + }, + { + "epoch": 2.568, + "grad_norm": 0.8344401121139526, + "learning_rate": 4.3677354709418835e-05, + "loss": 0.3877, + "step": 642 + }, + { + "epoch": 2.572, + "grad_norm": 0.7773652076721191, + "learning_rate": 4.3667334669338677e-05, + "loss": 0.3878, + "step": 643 + }, + { + "epoch": 2.576, + "grad_norm": 0.9228508472442627, + "learning_rate": 4.365731462925852e-05, + "loss": 0.3737, + "step": 644 + }, + { + "epoch": 2.58, + "grad_norm": 0.7461867928504944, + "learning_rate": 4.364729458917836e-05, + "loss": 0.3422, + "step": 645 + }, + { + "epoch": 2.584, + "grad_norm": 0.7550526261329651, + "learning_rate": 4.36372745490982e-05, + "loss": 0.3748, + "step": 646 + }, + { + "epoch": 2.588, + "grad_norm": 0.7776896953582764, + "learning_rate": 4.362725450901804e-05, + "loss": 0.3629, + "step": 647 + }, + { + "epoch": 2.592, + "grad_norm": 0.8079218864440918, + "learning_rate": 4.3617234468937876e-05, + "loss": 0.352, + "step": 648 + }, + { + "epoch": 2.596, + "grad_norm": 0.7805841565132141, + "learning_rate": 4.360721442885772e-05, + "loss": 0.4112, + "step": 649 + }, + { + "epoch": 2.6, + "grad_norm": 0.6444045305252075, + "learning_rate": 4.359719438877756e-05, + "loss": 0.2286, + "step": 650 + }, + { + "epoch": 2.604, + "grad_norm": 0.7929733395576477, + "learning_rate": 4.358717434869739e-05, + "loss": 0.368, + "step": 651 + }, + { + "epoch": 2.608, + "grad_norm": 0.7788984775543213, + "learning_rate": 4.3577154308617235e-05, + "loss": 0.3658, + "step": 652 + }, + { + "epoch": 2.612, + "grad_norm": 0.7166882753372192, + "learning_rate": 4.3567134268537076e-05, + "loss": 0.3472, + "step": 653 + }, + { + "epoch": 2.616, + "grad_norm": 0.7755756974220276, + "learning_rate": 4.355711422845691e-05, + "loss": 0.3748, + "step": 654 + }, + { + "epoch": 2.62, + "grad_norm": 0.8172996640205383, + "learning_rate": 
4.354709418837676e-05, + "loss": 0.3924, + "step": 655 + }, + { + "epoch": 2.624, + "grad_norm": 0.739886462688446, + "learning_rate": 4.35370741482966e-05, + "loss": 0.4263, + "step": 656 + }, + { + "epoch": 2.628, + "grad_norm": 0.8171461224555969, + "learning_rate": 4.3527054108216434e-05, + "loss": 0.3717, + "step": 657 + }, + { + "epoch": 2.632, + "grad_norm": 0.7296169400215149, + "learning_rate": 4.3517034068136275e-05, + "loss": 0.3753, + "step": 658 + }, + { + "epoch": 2.636, + "grad_norm": 0.7279816269874573, + "learning_rate": 4.350701402805612e-05, + "loss": 0.4018, + "step": 659 + }, + { + "epoch": 2.64, + "grad_norm": 0.6948480606079102, + "learning_rate": 4.349699398797595e-05, + "loss": 0.3866, + "step": 660 + }, + { + "epoch": 2.644, + "grad_norm": 0.7570900321006775, + "learning_rate": 4.348697394789579e-05, + "loss": 0.3787, + "step": 661 + }, + { + "epoch": 2.648, + "grad_norm": 0.7140201330184937, + "learning_rate": 4.3476953907815634e-05, + "loss": 0.3161, + "step": 662 + }, + { + "epoch": 2.652, + "grad_norm": 0.7074480056762695, + "learning_rate": 4.346693386773547e-05, + "loss": 0.4135, + "step": 663 + }, + { + "epoch": 2.656, + "grad_norm": 0.7118886113166809, + "learning_rate": 4.345691382765531e-05, + "loss": 0.3456, + "step": 664 + }, + { + "epoch": 2.66, + "grad_norm": 0.7889545559883118, + "learning_rate": 4.344689378757516e-05, + "loss": 0.3727, + "step": 665 + }, + { + "epoch": 2.664, + "grad_norm": 0.7217925786972046, + "learning_rate": 4.343687374749499e-05, + "loss": 0.3396, + "step": 666 + }, + { + "epoch": 2.668, + "grad_norm": 0.5060538053512573, + "learning_rate": 4.3426853707414833e-05, + "loss": 0.2113, + "step": 667 + }, + { + "epoch": 2.672, + "grad_norm": 0.8558411002159119, + "learning_rate": 4.3416833667334675e-05, + "loss": 0.4217, + "step": 668 + }, + { + "epoch": 2.676, + "grad_norm": 0.7091007828712463, + "learning_rate": 4.340681362725451e-05, + "loss": 0.3889, + "step": 669 + }, + { + "epoch": 2.68, + 
"grad_norm": 0.8151986598968506, + "learning_rate": 4.339679358717435e-05, + "loss": 0.3808, + "step": 670 + }, + { + "epoch": 2.684, + "grad_norm": 0.8028424978256226, + "learning_rate": 4.338677354709419e-05, + "loss": 0.376, + "step": 671 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 0.7256678938865662, + "learning_rate": 4.3376753507014026e-05, + "loss": 0.3924, + "step": 672 + }, + { + "epoch": 2.692, + "grad_norm": 0.7175251245498657, + "learning_rate": 4.336673346693387e-05, + "loss": 0.3795, + "step": 673 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 0.8073795437812805, + "learning_rate": 4.335671342685371e-05, + "loss": 0.3812, + "step": 674 + }, + { + "epoch": 2.7, + "grad_norm": 0.7120643258094788, + "learning_rate": 4.334669338677355e-05, + "loss": 0.3856, + "step": 675 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 0.6971157193183899, + "learning_rate": 4.333667334669339e-05, + "loss": 0.3223, + "step": 676 + }, + { + "epoch": 2.708, + "grad_norm": 0.7540895938873291, + "learning_rate": 4.332665330661323e-05, + "loss": 0.3582, + "step": 677 + }, + { + "epoch": 2.7119999999999997, + "grad_norm": 0.7137064337730408, + "learning_rate": 4.331663326653307e-05, + "loss": 0.3178, + "step": 678 + }, + { + "epoch": 2.716, + "grad_norm": 0.7786818742752075, + "learning_rate": 4.330661322645291e-05, + "loss": 0.38, + "step": 679 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.7963460087776184, + "learning_rate": 4.329659318637275e-05, + "loss": 0.3811, + "step": 680 + }, + { + "epoch": 2.724, + "grad_norm": 0.9932142496109009, + "learning_rate": 4.3286573146292584e-05, + "loss": 0.4158, + "step": 681 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 0.7535082101821899, + "learning_rate": 4.3276553106212426e-05, + "loss": 0.4385, + "step": 682 + }, + { + "epoch": 2.732, + "grad_norm": 0.6805514097213745, + "learning_rate": 4.326653306613227e-05, + "loss": 0.3881, + "step": 683 + }, + { + "epoch": 2.7359999999999998, + 
"grad_norm": 0.569618284702301, + "learning_rate": 4.325651302605211e-05, + "loss": 0.2244, + "step": 684 + }, + { + "epoch": 2.74, + "grad_norm": 0.6559450030326843, + "learning_rate": 4.324649298597195e-05, + "loss": 0.3176, + "step": 685 + }, + { + "epoch": 2.7439999999999998, + "grad_norm": 0.7690601944923401, + "learning_rate": 4.3236472945891784e-05, + "loss": 0.3672, + "step": 686 + }, + { + "epoch": 2.748, + "grad_norm": 0.662957489490509, + "learning_rate": 4.3226452905811625e-05, + "loss": 0.3441, + "step": 687 + }, + { + "epoch": 2.752, + "grad_norm": 0.5383203029632568, + "learning_rate": 4.3216432865731467e-05, + "loss": 0.2406, + "step": 688 + }, + { + "epoch": 2.7560000000000002, + "grad_norm": 0.5156013369560242, + "learning_rate": 4.320641282565131e-05, + "loss": 0.1923, + "step": 689 + }, + { + "epoch": 2.76, + "grad_norm": 0.798517644405365, + "learning_rate": 4.319639278557114e-05, + "loss": 0.3142, + "step": 690 + }, + { + "epoch": 2.7640000000000002, + "grad_norm": 0.7717877626419067, + "learning_rate": 4.3186372745490984e-05, + "loss": 0.3537, + "step": 691 + }, + { + "epoch": 2.768, + "grad_norm": 0.8068984150886536, + "learning_rate": 4.3176352705410825e-05, + "loss": 0.3579, + "step": 692 + }, + { + "epoch": 2.7720000000000002, + "grad_norm": 0.7683663368225098, + "learning_rate": 4.316633266533066e-05, + "loss": 0.3463, + "step": 693 + }, + { + "epoch": 2.776, + "grad_norm": 0.7648158669471741, + "learning_rate": 4.315631262525051e-05, + "loss": 0.3613, + "step": 694 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 0.6192529201507568, + "learning_rate": 4.314629258517034e-05, + "loss": 0.2317, + "step": 695 + }, + { + "epoch": 2.784, + "grad_norm": 0.7966995239257812, + "learning_rate": 4.313627254509018e-05, + "loss": 0.3571, + "step": 696 + }, + { + "epoch": 2.7880000000000003, + "grad_norm": 0.7972453236579895, + "learning_rate": 4.3126252505010025e-05, + "loss": 0.3334, + "step": 697 + }, + { + "epoch": 2.792, + "grad_norm": 
0.7164406180381775, + "learning_rate": 4.311623246492986e-05, + "loss": 0.3394, + "step": 698 + }, + { + "epoch": 2.7960000000000003, + "grad_norm": 0.8824707865715027, + "learning_rate": 4.31062124248497e-05, + "loss": 0.4308, + "step": 699 + }, + { + "epoch": 2.8, + "grad_norm": 0.7947747111320496, + "learning_rate": 4.309619238476954e-05, + "loss": 0.3586, + "step": 700 + }, + { + "epoch": 2.8040000000000003, + "grad_norm": 0.8112667202949524, + "learning_rate": 4.3086172344689376e-05, + "loss": 0.424, + "step": 701 + }, + { + "epoch": 2.808, + "grad_norm": 0.7361273765563965, + "learning_rate": 4.307615230460922e-05, + "loss": 0.357, + "step": 702 + }, + { + "epoch": 2.8120000000000003, + "grad_norm": 0.7715202569961548, + "learning_rate": 4.306613226452906e-05, + "loss": 0.4038, + "step": 703 + }, + { + "epoch": 2.816, + "grad_norm": 0.7197621464729309, + "learning_rate": 4.30561122244489e-05, + "loss": 0.3378, + "step": 704 + }, + { + "epoch": 2.82, + "grad_norm": 0.7480655312538147, + "learning_rate": 4.304609218436874e-05, + "loss": 0.4022, + "step": 705 + }, + { + "epoch": 2.824, + "grad_norm": 0.6959013938903809, + "learning_rate": 4.303607214428858e-05, + "loss": 0.3704, + "step": 706 + }, + { + "epoch": 2.828, + "grad_norm": 0.7385444641113281, + "learning_rate": 4.302605210420842e-05, + "loss": 0.3868, + "step": 707 + }, + { + "epoch": 2.832, + "grad_norm": 0.7400097846984863, + "learning_rate": 4.301603206412826e-05, + "loss": 0.3851, + "step": 708 + }, + { + "epoch": 2.836, + "grad_norm": 0.7167990207672119, + "learning_rate": 4.30060120240481e-05, + "loss": 0.3516, + "step": 709 + }, + { + "epoch": 2.84, + "grad_norm": 0.7471911907196045, + "learning_rate": 4.2995991983967934e-05, + "loss": 0.3891, + "step": 710 + }, + { + "epoch": 2.844, + "grad_norm": 0.670911431312561, + "learning_rate": 4.2985971943887775e-05, + "loss": 0.1749, + "step": 711 + }, + { + "epoch": 2.848, + "grad_norm": 0.7623646259307861, + "learning_rate": 4.297595190380762e-05, + 
"loss": 0.3528, + "step": 712 + }, + { + "epoch": 2.852, + "grad_norm": 0.7303867936134338, + "learning_rate": 4.296593186372745e-05, + "loss": 0.3443, + "step": 713 + }, + { + "epoch": 2.856, + "grad_norm": 0.8334202766418457, + "learning_rate": 4.29559118236473e-05, + "loss": 0.3797, + "step": 714 + }, + { + "epoch": 2.86, + "grad_norm": 0.8006396889686584, + "learning_rate": 4.294589178356714e-05, + "loss": 0.3702, + "step": 715 + }, + { + "epoch": 2.864, + "grad_norm": 5.877392768859863, + "learning_rate": 4.2935871743486975e-05, + "loss": 0.3651, + "step": 716 + }, + { + "epoch": 2.868, + "grad_norm": 0.7039988040924072, + "learning_rate": 4.2925851703406816e-05, + "loss": 0.3728, + "step": 717 + }, + { + "epoch": 2.872, + "grad_norm": 0.777847409248352, + "learning_rate": 4.291583166332666e-05, + "loss": 0.3986, + "step": 718 + }, + { + "epoch": 2.876, + "grad_norm": 0.7246628999710083, + "learning_rate": 4.290581162324649e-05, + "loss": 0.3821, + "step": 719 + }, + { + "epoch": 2.88, + "grad_norm": 0.773358941078186, + "learning_rate": 4.289579158316633e-05, + "loss": 0.3758, + "step": 720 + }, + { + "epoch": 2.884, + "grad_norm": 0.7800974249839783, + "learning_rate": 4.2885771543086175e-05, + "loss": 0.364, + "step": 721 + }, + { + "epoch": 2.888, + "grad_norm": 0.8116373419761658, + "learning_rate": 4.287575150300601e-05, + "loss": 0.457, + "step": 722 + }, + { + "epoch": 2.892, + "grad_norm": 0.6917723417282104, + "learning_rate": 4.286573146292585e-05, + "loss": 0.3549, + "step": 723 + }, + { + "epoch": 2.896, + "grad_norm": 0.8000931739807129, + "learning_rate": 4.28557114228457e-05, + "loss": 0.3616, + "step": 724 + }, + { + "epoch": 2.9, + "grad_norm": 0.7841587066650391, + "learning_rate": 4.284569138276553e-05, + "loss": 0.4008, + "step": 725 + }, + { + "epoch": 2.904, + "grad_norm": 0.723857581615448, + "learning_rate": 4.2835671342685374e-05, + "loss": 0.3367, + "step": 726 + }, + { + "epoch": 2.908, + "grad_norm": 0.7573955059051514, + 
"learning_rate": 4.2825651302605216e-05, + "loss": 0.3473, + "step": 727 + }, + { + "epoch": 2.912, + "grad_norm": 0.6830270290374756, + "learning_rate": 4.281563126252505e-05, + "loss": 0.4192, + "step": 728 + }, + { + "epoch": 2.916, + "grad_norm": 0.7317824363708496, + "learning_rate": 4.280561122244489e-05, + "loss": 0.3519, + "step": 729 + }, + { + "epoch": 2.92, + "grad_norm": 0.8307793736457825, + "learning_rate": 4.279559118236473e-05, + "loss": 0.385, + "step": 730 + }, + { + "epoch": 2.924, + "grad_norm": 0.5436768531799316, + "learning_rate": 4.278557114228457e-05, + "loss": 0.225, + "step": 731 + }, + { + "epoch": 2.928, + "grad_norm": 0.798338770866394, + "learning_rate": 4.277555110220441e-05, + "loss": 0.4136, + "step": 732 + }, + { + "epoch": 2.932, + "grad_norm": 0.7588498592376709, + "learning_rate": 4.2765531062124256e-05, + "loss": 0.3705, + "step": 733 + }, + { + "epoch": 2.936, + "grad_norm": 0.8589876890182495, + "learning_rate": 4.275551102204409e-05, + "loss": 0.3989, + "step": 734 + }, + { + "epoch": 2.94, + "grad_norm": 0.6937852501869202, + "learning_rate": 4.274549098196393e-05, + "loss": 0.3804, + "step": 735 + }, + { + "epoch": 2.944, + "grad_norm": 0.8076114058494568, + "learning_rate": 4.2735470941883774e-05, + "loss": 0.3755, + "step": 736 + }, + { + "epoch": 2.948, + "grad_norm": 0.7671177983283997, + "learning_rate": 4.272545090180361e-05, + "loss": 0.3414, + "step": 737 + }, + { + "epoch": 2.952, + "grad_norm": 0.7929058074951172, + "learning_rate": 4.271543086172345e-05, + "loss": 0.4099, + "step": 738 + }, + { + "epoch": 2.956, + "grad_norm": 0.8027397394180298, + "learning_rate": 4.270541082164329e-05, + "loss": 0.3635, + "step": 739 + }, + { + "epoch": 2.96, + "grad_norm": 0.7325429916381836, + "learning_rate": 4.2695390781563125e-05, + "loss": 0.3457, + "step": 740 + }, + { + "epoch": 2.964, + "grad_norm": 0.7555444240570068, + "learning_rate": 4.2685370741482966e-05, + "loss": 0.4271, + "step": 741 + }, + { + "epoch": 
2.968, + "grad_norm": 0.6851345300674438, + "learning_rate": 4.267535070140281e-05, + "loss": 0.374, + "step": 742 + }, + { + "epoch": 2.972, + "grad_norm": 0.7643485069274902, + "learning_rate": 4.266533066132265e-05, + "loss": 0.3338, + "step": 743 + }, + { + "epoch": 2.976, + "grad_norm": 0.8755035996437073, + "learning_rate": 4.265531062124249e-05, + "loss": 0.3444, + "step": 744 + }, + { + "epoch": 2.98, + "grad_norm": 0.6367295980453491, + "learning_rate": 4.264529058116233e-05, + "loss": 0.3667, + "step": 745 + }, + { + "epoch": 2.984, + "grad_norm": 0.8797886967658997, + "learning_rate": 4.2635270541082166e-05, + "loss": 0.4266, + "step": 746 + }, + { + "epoch": 2.988, + "grad_norm": 0.6971140503883362, + "learning_rate": 4.262525050100201e-05, + "loss": 0.337, + "step": 747 + }, + { + "epoch": 2.992, + "grad_norm": 0.751207709312439, + "learning_rate": 4.261523046092185e-05, + "loss": 0.3907, + "step": 748 + }, + { + "epoch": 2.996, + "grad_norm": 0.7211257219314575, + "learning_rate": 4.260521042084168e-05, + "loss": 0.396, + "step": 749 + }, + { + "epoch": 3.0, + "grad_norm": 0.9239338636398315, + "learning_rate": 4.2595190380761524e-05, + "loss": 0.4502, + "step": 750 + }, + { + "epoch": 3.004, + "grad_norm": 0.779204249382019, + "learning_rate": 4.2585170340681366e-05, + "loss": 0.2879, + "step": 751 + }, + { + "epoch": 3.008, + "grad_norm": 0.6356750726699829, + "learning_rate": 4.25751503006012e-05, + "loss": 0.2986, + "step": 752 + }, + { + "epoch": 3.012, + "grad_norm": 0.8113148212432861, + "learning_rate": 4.256513026052105e-05, + "loss": 0.3308, + "step": 753 + }, + { + "epoch": 3.016, + "grad_norm": 0.8562130331993103, + "learning_rate": 4.255511022044088e-05, + "loss": 0.3309, + "step": 754 + }, + { + "epoch": 3.02, + "grad_norm": 0.7852639555931091, + "learning_rate": 4.2545090180360724e-05, + "loss": 0.253, + "step": 755 + }, + { + "epoch": 3.024, + "grad_norm": 0.83427894115448, + "learning_rate": 4.2535070140280565e-05, + "loss": 0.2877, + 
"step": 756 + }, + { + "epoch": 3.028, + "grad_norm": 0.8282837867736816, + "learning_rate": 4.25250501002004e-05, + "loss": 0.304, + "step": 757 + }, + { + "epoch": 3.032, + "grad_norm": 0.895320475101471, + "learning_rate": 4.251503006012024e-05, + "loss": 0.3018, + "step": 758 + }, + { + "epoch": 3.036, + "grad_norm": 0.9096167683601379, + "learning_rate": 4.250501002004008e-05, + "loss": 0.2782, + "step": 759 + }, + { + "epoch": 3.04, + "grad_norm": 0.9886460304260254, + "learning_rate": 4.2494989979959924e-05, + "loss": 0.2964, + "step": 760 + }, + { + "epoch": 3.044, + "grad_norm": 1.022255778312683, + "learning_rate": 4.248496993987976e-05, + "loss": 0.3164, + "step": 761 + }, + { + "epoch": 3.048, + "grad_norm": 1.0864485502243042, + "learning_rate": 4.24749498997996e-05, + "loss": 0.2808, + "step": 762 + }, + { + "epoch": 3.052, + "grad_norm": 1.1385424137115479, + "learning_rate": 4.246492985971944e-05, + "loss": 0.3147, + "step": 763 + }, + { + "epoch": 3.056, + "grad_norm": 1.4085267782211304, + "learning_rate": 4.245490981963928e-05, + "loss": 0.3169, + "step": 764 + }, + { + "epoch": 3.06, + "grad_norm": 0.9816059470176697, + "learning_rate": 4.244488977955912e-05, + "loss": 0.2724, + "step": 765 + }, + { + "epoch": 3.064, + "grad_norm": 0.9588344693183899, + "learning_rate": 4.243486973947896e-05, + "loss": 0.2695, + "step": 766 + }, + { + "epoch": 3.068, + "grad_norm": 0.9727541208267212, + "learning_rate": 4.24248496993988e-05, + "loss": 0.307, + "step": 767 + }, + { + "epoch": 3.072, + "grad_norm": 0.9045794010162354, + "learning_rate": 4.241482965931864e-05, + "loss": 0.281, + "step": 768 + }, + { + "epoch": 3.076, + "grad_norm": 0.8429794907569885, + "learning_rate": 4.2404809619238475e-05, + "loss": 0.2693, + "step": 769 + }, + { + "epoch": 3.08, + "grad_norm": 0.9454277157783508, + "learning_rate": 4.2394789579158316e-05, + "loss": 0.2927, + "step": 770 + }, + { + "epoch": 3.084, + "grad_norm": 0.8746525645256042, + "learning_rate": 
4.238476953907816e-05, + "loss": 0.3275, + "step": 771 + }, + { + "epoch": 3.088, + "grad_norm": 1.0423359870910645, + "learning_rate": 4.2374749498998e-05, + "loss": 0.2617, + "step": 772 + }, + { + "epoch": 3.092, + "grad_norm": 0.9890905022621155, + "learning_rate": 4.236472945891784e-05, + "loss": 0.268, + "step": 773 + }, + { + "epoch": 3.096, + "grad_norm": 0.9057809114456177, + "learning_rate": 4.235470941883768e-05, + "loss": 0.2951, + "step": 774 + }, + { + "epoch": 3.1, + "grad_norm": 0.841378927230835, + "learning_rate": 4.2344689378757516e-05, + "loss": 0.251, + "step": 775 + }, + { + "epoch": 3.104, + "grad_norm": 0.926152229309082, + "learning_rate": 4.233466933867736e-05, + "loss": 0.2939, + "step": 776 + }, + { + "epoch": 3.108, + "grad_norm": 1.010184407234192, + "learning_rate": 4.23246492985972e-05, + "loss": 0.2795, + "step": 777 + }, + { + "epoch": 3.112, + "grad_norm": 1.0386322736740112, + "learning_rate": 4.231462925851703e-05, + "loss": 0.3257, + "step": 778 + }, + { + "epoch": 3.116, + "grad_norm": 0.9416765570640564, + "learning_rate": 4.2304609218436874e-05, + "loss": 0.2857, + "step": 779 + }, + { + "epoch": 3.12, + "grad_norm": 0.6299401521682739, + "learning_rate": 4.2294589178356715e-05, + "loss": 0.1485, + "step": 780 + }, + { + "epoch": 3.124, + "grad_norm": 0.8813037872314453, + "learning_rate": 4.228456913827655e-05, + "loss": 0.2762, + "step": 781 + }, + { + "epoch": 3.128, + "grad_norm": 0.9580820798873901, + "learning_rate": 4.227454909819639e-05, + "loss": 0.3148, + "step": 782 + }, + { + "epoch": 3.132, + "grad_norm": 0.9623346924781799, + "learning_rate": 4.226452905811624e-05, + "loss": 0.3172, + "step": 783 + }, + { + "epoch": 3.136, + "grad_norm": 0.9738514423370361, + "learning_rate": 4.2254509018036074e-05, + "loss": 0.2936, + "step": 784 + }, + { + "epoch": 3.14, + "grad_norm": 0.9638810753822327, + "learning_rate": 4.2244488977955915e-05, + "loss": 0.2686, + "step": 785 + }, + { + "epoch": 3.144, + "grad_norm": 
1.004857063293457, + "learning_rate": 4.2234468937875756e-05, + "loss": 0.3125, + "step": 786 + }, + { + "epoch": 3.148, + "grad_norm": 0.9582427740097046, + "learning_rate": 4.222444889779559e-05, + "loss": 0.2873, + "step": 787 + }, + { + "epoch": 3.152, + "grad_norm": 0.9714578986167908, + "learning_rate": 4.221442885771543e-05, + "loss": 0.3185, + "step": 788 + }, + { + "epoch": 3.156, + "grad_norm": 0.9590511322021484, + "learning_rate": 4.2204408817635273e-05, + "loss": 0.2962, + "step": 789 + }, + { + "epoch": 3.16, + "grad_norm": 0.8241173624992371, + "learning_rate": 4.219438877755511e-05, + "loss": 0.273, + "step": 790 + }, + { + "epoch": 3.164, + "grad_norm": 0.9807001352310181, + "learning_rate": 4.218436873747495e-05, + "loss": 0.2778, + "step": 791 + }, + { + "epoch": 3.168, + "grad_norm": 1.012277364730835, + "learning_rate": 4.21743486973948e-05, + "loss": 0.2907, + "step": 792 + }, + { + "epoch": 3.172, + "grad_norm": 0.7485466003417969, + "learning_rate": 4.216432865731463e-05, + "loss": 0.1743, + "step": 793 + }, + { + "epoch": 3.176, + "grad_norm": 1.0279030799865723, + "learning_rate": 4.215430861723447e-05, + "loss": 0.2856, + "step": 794 + }, + { + "epoch": 3.18, + "grad_norm": 1.0178649425506592, + "learning_rate": 4.2144288577154314e-05, + "loss": 0.2993, + "step": 795 + }, + { + "epoch": 3.184, + "grad_norm": 0.8605381846427917, + "learning_rate": 4.213426853707415e-05, + "loss": 0.2982, + "step": 796 + }, + { + "epoch": 3.188, + "grad_norm": 1.0019651651382446, + "learning_rate": 4.212424849699399e-05, + "loss": 0.2708, + "step": 797 + }, + { + "epoch": 3.192, + "grad_norm": 0.8550509810447693, + "learning_rate": 4.211422845691383e-05, + "loss": 0.2795, + "step": 798 + }, + { + "epoch": 3.196, + "grad_norm": 0.9066852927207947, + "learning_rate": 4.2104208416833666e-05, + "loss": 0.294, + "step": 799 + }, + { + "epoch": 3.2, + "grad_norm": 1.0385713577270508, + "learning_rate": 4.209418837675351e-05, + "loss": 0.3046, + "step": 800 + }, + 
{ + "epoch": 3.204, + "grad_norm": 0.958522379398346, + "learning_rate": 4.208416833667335e-05, + "loss": 0.3016, + "step": 801 + }, + { + "epoch": 3.208, + "grad_norm": 1.0640708208084106, + "learning_rate": 4.207414829659319e-05, + "loss": 0.2636, + "step": 802 + }, + { + "epoch": 3.212, + "grad_norm": 0.9335046410560608, + "learning_rate": 4.206412825651303e-05, + "loss": 0.2668, + "step": 803 + }, + { + "epoch": 3.216, + "grad_norm": 1.0041309595108032, + "learning_rate": 4.205410821643287e-05, + "loss": 0.2774, + "step": 804 + }, + { + "epoch": 3.22, + "grad_norm": 0.8523897528648376, + "learning_rate": 4.204408817635271e-05, + "loss": 0.2638, + "step": 805 + }, + { + "epoch": 3.224, + "grad_norm": 0.9941080212593079, + "learning_rate": 4.203406813627255e-05, + "loss": 0.2988, + "step": 806 + }, + { + "epoch": 3.228, + "grad_norm": 1.0812398195266724, + "learning_rate": 4.202404809619239e-05, + "loss": 0.277, + "step": 807 + }, + { + "epoch": 3.232, + "grad_norm": 0.9684290289878845, + "learning_rate": 4.2014028056112224e-05, + "loss": 0.2944, + "step": 808 + }, + { + "epoch": 3.2359999999999998, + "grad_norm": 1.021715521812439, + "learning_rate": 4.2004008016032065e-05, + "loss": 0.2548, + "step": 809 + }, + { + "epoch": 3.24, + "grad_norm": 1.052801489830017, + "learning_rate": 4.1993987975951907e-05, + "loss": 0.2885, + "step": 810 + }, + { + "epoch": 3.2439999999999998, + "grad_norm": 1.0573465824127197, + "learning_rate": 4.198396793587174e-05, + "loss": 0.3444, + "step": 811 + }, + { + "epoch": 3.248, + "grad_norm": 0.7046388387680054, + "learning_rate": 4.197394789579159e-05, + "loss": 0.134, + "step": 812 + }, + { + "epoch": 3.252, + "grad_norm": 1.030924677848816, + "learning_rate": 4.1963927855711424e-05, + "loss": 0.3304, + "step": 813 + }, + { + "epoch": 3.2560000000000002, + "grad_norm": 0.9746088981628418, + "learning_rate": 4.1953907815631265e-05, + "loss": 0.286, + "step": 814 + }, + { + "epoch": 3.26, + "grad_norm": 0.9044510722160339, + 
"learning_rate": 4.1943887775551106e-05, + "loss": 0.2653, + "step": 815 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 1.0177682638168335, + "learning_rate": 4.193386773547095e-05, + "loss": 0.3103, + "step": 816 + }, + { + "epoch": 3.268, + "grad_norm": 1.0040210485458374, + "learning_rate": 4.192384769539078e-05, + "loss": 0.2804, + "step": 817 + }, + { + "epoch": 3.2720000000000002, + "grad_norm": 0.9195308685302734, + "learning_rate": 4.191382765531062e-05, + "loss": 0.3041, + "step": 818 + }, + { + "epoch": 3.276, + "grad_norm": 1.0614639520645142, + "learning_rate": 4.1903807615230465e-05, + "loss": 0.2694, + "step": 819 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 0.9703024625778198, + "learning_rate": 4.18937875751503e-05, + "loss": 0.3042, + "step": 820 + }, + { + "epoch": 3.284, + "grad_norm": 0.7372400760650635, + "learning_rate": 4.188376753507014e-05, + "loss": 0.1776, + "step": 821 + }, + { + "epoch": 3.288, + "grad_norm": 0.9801708459854126, + "learning_rate": 4.187374749498998e-05, + "loss": 0.3445, + "step": 822 + }, + { + "epoch": 3.292, + "grad_norm": 1.0228462219238281, + "learning_rate": 4.186372745490982e-05, + "loss": 0.2965, + "step": 823 + }, + { + "epoch": 3.296, + "grad_norm": 0.9982706904411316, + "learning_rate": 4.1853707414829664e-05, + "loss": 0.2613, + "step": 824 + }, + { + "epoch": 3.3, + "grad_norm": 1.1089848279953003, + "learning_rate": 4.18436873747495e-05, + "loss": 0.3156, + "step": 825 + }, + { + "epoch": 3.304, + "grad_norm": 0.9240087866783142, + "learning_rate": 4.183366733466934e-05, + "loss": 0.2621, + "step": 826 + }, + { + "epoch": 3.308, + "grad_norm": 0.9882985353469849, + "learning_rate": 4.182364729458918e-05, + "loss": 0.2766, + "step": 827 + }, + { + "epoch": 3.312, + "grad_norm": 1.092639446258545, + "learning_rate": 4.1813627254509016e-05, + "loss": 0.3167, + "step": 828 + }, + { + "epoch": 3.316, + "grad_norm": 1.0920966863632202, + "learning_rate": 4.180360721442886e-05, + "loss": 0.3217, 
+ "step": 829 + }, + { + "epoch": 3.32, + "grad_norm": 1.0740662813186646, + "learning_rate": 4.17935871743487e-05, + "loss": 0.2861, + "step": 830 + }, + { + "epoch": 3.324, + "grad_norm": 0.969103991985321, + "learning_rate": 4.178356713426854e-05, + "loss": 0.3083, + "step": 831 + }, + { + "epoch": 3.328, + "grad_norm": 1.1013503074645996, + "learning_rate": 4.177354709418838e-05, + "loss": 0.3094, + "step": 832 + }, + { + "epoch": 3.332, + "grad_norm": 0.9775644540786743, + "learning_rate": 4.176352705410822e-05, + "loss": 0.2919, + "step": 833 + }, + { + "epoch": 3.336, + "grad_norm": 0.9920254349708557, + "learning_rate": 4.175350701402806e-05, + "loss": 0.2904, + "step": 834 + }, + { + "epoch": 3.34, + "grad_norm": 1.0171326398849487, + "learning_rate": 4.17434869739479e-05, + "loss": 0.2888, + "step": 835 + }, + { + "epoch": 3.344, + "grad_norm": 0.8502336144447327, + "learning_rate": 4.173346693386774e-05, + "loss": 0.3193, + "step": 836 + }, + { + "epoch": 3.348, + "grad_norm": 0.9221425652503967, + "learning_rate": 4.1723446893787574e-05, + "loss": 0.3163, + "step": 837 + }, + { + "epoch": 3.352, + "grad_norm": 0.7386908531188965, + "learning_rate": 4.1713426853707415e-05, + "loss": 0.1987, + "step": 838 + }, + { + "epoch": 3.356, + "grad_norm": 0.9897122383117676, + "learning_rate": 4.1703406813627256e-05, + "loss": 0.3081, + "step": 839 + }, + { + "epoch": 3.36, + "grad_norm": 1.0372047424316406, + "learning_rate": 4.169338677354709e-05, + "loss": 0.3217, + "step": 840 + }, + { + "epoch": 3.364, + "grad_norm": 1.108982801437378, + "learning_rate": 4.168336673346693e-05, + "loss": 0.3282, + "step": 841 + }, + { + "epoch": 3.368, + "grad_norm": 0.9839752316474915, + "learning_rate": 4.167334669338678e-05, + "loss": 0.2829, + "step": 842 + }, + { + "epoch": 3.372, + "grad_norm": 0.9582000970840454, + "learning_rate": 4.1663326653306615e-05, + "loss": 0.3538, + "step": 843 + }, + { + "epoch": 3.376, + "grad_norm": 1.0338131189346313, + "learning_rate": 
4.1653306613226456e-05, + "loss": 0.3183, + "step": 844 + }, + { + "epoch": 3.38, + "grad_norm": 0.969628632068634, + "learning_rate": 4.16432865731463e-05, + "loss": 0.2753, + "step": 845 + }, + { + "epoch": 3.384, + "grad_norm": 0.9594615697860718, + "learning_rate": 4.163326653306613e-05, + "loss": 0.3348, + "step": 846 + }, + { + "epoch": 3.388, + "grad_norm": 1.0317904949188232, + "learning_rate": 4.162324649298597e-05, + "loss": 0.3075, + "step": 847 + }, + { + "epoch": 3.392, + "grad_norm": 1.00275719165802, + "learning_rate": 4.1613226452905814e-05, + "loss": 0.2883, + "step": 848 + }, + { + "epoch": 3.396, + "grad_norm": 0.902736246585846, + "learning_rate": 4.160320641282565e-05, + "loss": 0.293, + "step": 849 + }, + { + "epoch": 3.4, + "grad_norm": 0.9426193833351135, + "learning_rate": 4.159318637274549e-05, + "loss": 0.3869, + "step": 850 + }, + { + "epoch": 3.404, + "grad_norm": 0.8548994660377502, + "learning_rate": 4.158316633266534e-05, + "loss": 0.2635, + "step": 851 + }, + { + "epoch": 3.408, + "grad_norm": 0.9464280605316162, + "learning_rate": 4.157314629258517e-05, + "loss": 0.3005, + "step": 852 + }, + { + "epoch": 3.412, + "grad_norm": 0.5138210654258728, + "learning_rate": 4.1563126252505014e-05, + "loss": 0.101, + "step": 853 + }, + { + "epoch": 3.416, + "grad_norm": 0.8847005367279053, + "learning_rate": 4.1553106212424855e-05, + "loss": 0.2996, + "step": 854 + }, + { + "epoch": 3.42, + "grad_norm": 1.042490005493164, + "learning_rate": 4.154308617234469e-05, + "loss": 0.2794, + "step": 855 + }, + { + "epoch": 3.424, + "grad_norm": 1.0797277688980103, + "learning_rate": 4.153306613226453e-05, + "loss": 0.2884, + "step": 856 + }, + { + "epoch": 3.428, + "grad_norm": 1.097922921180725, + "learning_rate": 4.152304609218437e-05, + "loss": 0.2846, + "step": 857 + }, + { + "epoch": 3.432, + "grad_norm": 1.0516830682754517, + "learning_rate": 4.151302605210421e-05, + "loss": 0.3018, + "step": 858 + }, + { + "epoch": 3.436, + "grad_norm": 
1.0133203268051147, + "learning_rate": 4.150300601202405e-05, + "loss": 0.2871, + "step": 859 + }, + { + "epoch": 3.44, + "grad_norm": 1.0574767589569092, + "learning_rate": 4.149298597194389e-05, + "loss": 0.3106, + "step": 860 + }, + { + "epoch": 3.444, + "grad_norm": 0.9849914908409119, + "learning_rate": 4.148296593186373e-05, + "loss": 0.3729, + "step": 861 + }, + { + "epoch": 3.448, + "grad_norm": 0.8353570103645325, + "learning_rate": 4.147294589178357e-05, + "loss": 0.2873, + "step": 862 + }, + { + "epoch": 3.452, + "grad_norm": 1.0005288124084473, + "learning_rate": 4.146292585170341e-05, + "loss": 0.309, + "step": 863 + }, + { + "epoch": 3.456, + "grad_norm": 1.139251708984375, + "learning_rate": 4.145290581162325e-05, + "loss": 0.3101, + "step": 864 + }, + { + "epoch": 3.46, + "grad_norm": 0.9400432109832764, + "learning_rate": 4.144288577154309e-05, + "loss": 0.3201, + "step": 865 + }, + { + "epoch": 3.464, + "grad_norm": 1.0056957006454468, + "learning_rate": 4.143286573146293e-05, + "loss": 0.2908, + "step": 866 + }, + { + "epoch": 3.468, + "grad_norm": 1.0107009410858154, + "learning_rate": 4.1422845691382765e-05, + "loss": 0.2781, + "step": 867 + }, + { + "epoch": 3.472, + "grad_norm": 0.8538976311683655, + "learning_rate": 4.1412825651302606e-05, + "loss": 0.271, + "step": 868 + }, + { + "epoch": 3.476, + "grad_norm": 0.8999854922294617, + "learning_rate": 4.140280561122245e-05, + "loss": 0.3038, + "step": 869 + }, + { + "epoch": 3.48, + "grad_norm": 0.8919850587844849, + "learning_rate": 4.139278557114228e-05, + "loss": 0.2953, + "step": 870 + }, + { + "epoch": 3.484, + "grad_norm": 0.9376264810562134, + "learning_rate": 4.138276553106213e-05, + "loss": 0.309, + "step": 871 + }, + { + "epoch": 3.488, + "grad_norm": 1.0939548015594482, + "learning_rate": 4.137274549098197e-05, + "loss": 0.3156, + "step": 872 + }, + { + "epoch": 3.492, + "grad_norm": 0.9430792927742004, + "learning_rate": 4.1362725450901806e-05, + "loss": 0.3003, + "step": 873 + }, 
+ { + "epoch": 3.496, + "grad_norm": 0.9949555397033691, + "learning_rate": 4.135270541082165e-05, + "loss": 0.2998, + "step": 874 + }, + { + "epoch": 3.5, + "grad_norm": 0.9819671511650085, + "learning_rate": 4.134268537074149e-05, + "loss": 0.318, + "step": 875 + }, + { + "epoch": 3.504, + "grad_norm": 0.9571810960769653, + "learning_rate": 4.133266533066132e-05, + "loss": 0.3308, + "step": 876 + }, + { + "epoch": 3.508, + "grad_norm": 0.9577171206474304, + "learning_rate": 4.1322645290581164e-05, + "loss": 0.3261, + "step": 877 + }, + { + "epoch": 3.512, + "grad_norm": 0.9510439038276672, + "learning_rate": 4.1312625250501005e-05, + "loss": 0.2901, + "step": 878 + }, + { + "epoch": 3.516, + "grad_norm": 0.9860422015190125, + "learning_rate": 4.130260521042084e-05, + "loss": 0.2799, + "step": 879 + }, + { + "epoch": 3.52, + "grad_norm": 1.0223884582519531, + "learning_rate": 4.129258517034068e-05, + "loss": 0.3264, + "step": 880 + }, + { + "epoch": 3.524, + "grad_norm": 1.0917876958847046, + "learning_rate": 4.128256513026052e-05, + "loss": 0.2869, + "step": 881 + }, + { + "epoch": 3.528, + "grad_norm": 1.0941041707992554, + "learning_rate": 4.1272545090180364e-05, + "loss": 0.3296, + "step": 882 + }, + { + "epoch": 3.532, + "grad_norm": 0.9982560276985168, + "learning_rate": 4.1262525050100205e-05, + "loss": 0.3123, + "step": 883 + }, + { + "epoch": 3.536, + "grad_norm": 1.1089354753494263, + "learning_rate": 4.125250501002004e-05, + "loss": 0.3245, + "step": 884 + }, + { + "epoch": 3.54, + "grad_norm": 0.9130812883377075, + "learning_rate": 4.124248496993988e-05, + "loss": 0.265, + "step": 885 + }, + { + "epoch": 3.544, + "grad_norm": 0.9193147420883179, + "learning_rate": 4.123246492985972e-05, + "loss": 0.2947, + "step": 886 + }, + { + "epoch": 3.548, + "grad_norm": 0.908898651599884, + "learning_rate": 4.122244488977956e-05, + "loss": 0.3173, + "step": 887 + }, + { + "epoch": 3.552, + "grad_norm": 1.038451910018921, + "learning_rate": 4.12124248496994e-05, + 
"loss": 0.3317, + "step": 888 + }, + { + "epoch": 3.556, + "grad_norm": 1.0614579916000366, + "learning_rate": 4.120240480961924e-05, + "loss": 0.3206, + "step": 889 + }, + { + "epoch": 3.56, + "grad_norm": 1.0630053281784058, + "learning_rate": 4.119238476953908e-05, + "loss": 0.3096, + "step": 890 + }, + { + "epoch": 3.564, + "grad_norm": 0.9061043858528137, + "learning_rate": 4.118236472945892e-05, + "loss": 0.3105, + "step": 891 + }, + { + "epoch": 3.568, + "grad_norm": 0.9253777265548706, + "learning_rate": 4.117234468937876e-05, + "loss": 0.2853, + "step": 892 + }, + { + "epoch": 3.572, + "grad_norm": 1.0427800416946411, + "learning_rate": 4.11623246492986e-05, + "loss": 0.3032, + "step": 893 + }, + { + "epoch": 3.576, + "grad_norm": 0.9730731844902039, + "learning_rate": 4.115230460921844e-05, + "loss": 0.315, + "step": 894 + }, + { + "epoch": 3.58, + "grad_norm": 0.9691907167434692, + "learning_rate": 4.114228456913828e-05, + "loss": 0.3382, + "step": 895 + }, + { + "epoch": 3.584, + "grad_norm": 0.905629575252533, + "learning_rate": 4.1132264529058115e-05, + "loss": 0.2965, + "step": 896 + }, + { + "epoch": 3.588, + "grad_norm": 1.1402881145477295, + "learning_rate": 4.1122244488977956e-05, + "loss": 0.3044, + "step": 897 + }, + { + "epoch": 3.592, + "grad_norm": 1.0247617959976196, + "learning_rate": 4.11122244488978e-05, + "loss": 0.3063, + "step": 898 + }, + { + "epoch": 3.596, + "grad_norm": 1.0826162099838257, + "learning_rate": 4.110220440881764e-05, + "loss": 0.2926, + "step": 899 + }, + { + "epoch": 3.6, + "grad_norm": 1.0896751880645752, + "learning_rate": 4.109218436873748e-05, + "loss": 0.3232, + "step": 900 + }, + { + "epoch": 3.604, + "grad_norm": 1.0806505680084229, + "learning_rate": 4.108216432865732e-05, + "loss": 0.2913, + "step": 901 + }, + { + "epoch": 3.608, + "grad_norm": 1.0015305280685425, + "learning_rate": 4.1072144288577155e-05, + "loss": 0.2998, + "step": 902 + }, + { + "epoch": 3.612, + "grad_norm": 1.0075455904006958, + 
"learning_rate": 4.1062124248497e-05, + "loss": 0.2716, + "step": 903 + }, + { + "epoch": 3.616, + "grad_norm": 1.06728196144104, + "learning_rate": 4.105210420841684e-05, + "loss": 0.3269, + "step": 904 + }, + { + "epoch": 3.62, + "grad_norm": 0.8644483089447021, + "learning_rate": 4.104208416833667e-05, + "loss": 0.2804, + "step": 905 + }, + { + "epoch": 3.624, + "grad_norm": 0.9951996803283691, + "learning_rate": 4.1032064128256514e-05, + "loss": 0.3223, + "step": 906 + }, + { + "epoch": 3.628, + "grad_norm": 0.9931237697601318, + "learning_rate": 4.1022044088176355e-05, + "loss": 0.2715, + "step": 907 + }, + { + "epoch": 3.632, + "grad_norm": 1.0275764465332031, + "learning_rate": 4.101202404809619e-05, + "loss": 0.3444, + "step": 908 + }, + { + "epoch": 3.636, + "grad_norm": 0.9816417098045349, + "learning_rate": 4.100200400801603e-05, + "loss": 0.2724, + "step": 909 + }, + { + "epoch": 3.64, + "grad_norm": 1.1031150817871094, + "learning_rate": 4.099198396793588e-05, + "loss": 0.3406, + "step": 910 + }, + { + "epoch": 3.644, + "grad_norm": 0.7077162265777588, + "learning_rate": 4.0981963927855713e-05, + "loss": 0.1978, + "step": 911 + }, + { + "epoch": 3.648, + "grad_norm": 1.0858787298202515, + "learning_rate": 4.0971943887775555e-05, + "loss": 0.2975, + "step": 912 + }, + { + "epoch": 3.652, + "grad_norm": 0.9886104464530945, + "learning_rate": 4.0961923847695396e-05, + "loss": 0.3237, + "step": 913 + }, + { + "epoch": 3.656, + "grad_norm": 1.1059352159500122, + "learning_rate": 4.095190380761523e-05, + "loss": 0.3181, + "step": 914 + }, + { + "epoch": 3.66, + "grad_norm": 0.9329758882522583, + "learning_rate": 4.094188376753507e-05, + "loss": 0.2751, + "step": 915 + }, + { + "epoch": 3.664, + "grad_norm": 1.0664880275726318, + "learning_rate": 4.093186372745491e-05, + "loss": 0.3348, + "step": 916 + }, + { + "epoch": 3.668, + "grad_norm": 0.9057939052581787, + "learning_rate": 4.092184368737475e-05, + "loss": 0.3088, + "step": 917 + }, + { + "epoch": 
3.672, + "grad_norm": 0.9240928888320923, + "learning_rate": 4.091182364729459e-05, + "loss": 0.1825, + "step": 918 + }, + { + "epoch": 3.676, + "grad_norm": 1.0612246990203857, + "learning_rate": 4.090180360721443e-05, + "loss": 0.2613, + "step": 919 + }, + { + "epoch": 3.68, + "grad_norm": 0.8932524919509888, + "learning_rate": 4.089178356713427e-05, + "loss": 0.3157, + "step": 920 + }, + { + "epoch": 3.684, + "grad_norm": 0.9782359600067139, + "learning_rate": 4.088176352705411e-05, + "loss": 0.2984, + "step": 921 + }, + { + "epoch": 3.6879999999999997, + "grad_norm": 1.0117908716201782, + "learning_rate": 4.0871743486973954e-05, + "loss": 0.3072, + "step": 922 + }, + { + "epoch": 3.692, + "grad_norm": 1.1856647729873657, + "learning_rate": 4.086172344689379e-05, + "loss": 0.3281, + "step": 923 + }, + { + "epoch": 3.6959999999999997, + "grad_norm": 0.767086386680603, + "learning_rate": 4.085170340681363e-05, + "loss": 0.1969, + "step": 924 + }, + { + "epoch": 3.7, + "grad_norm": 1.2814222574234009, + "learning_rate": 4.084168336673347e-05, + "loss": 0.3409, + "step": 925 + }, + { + "epoch": 3.7039999999999997, + "grad_norm": 1.0232386589050293, + "learning_rate": 4.0831663326653306e-05, + "loss": 0.2866, + "step": 926 + }, + { + "epoch": 3.708, + "grad_norm": 1.0636742115020752, + "learning_rate": 4.082164328657315e-05, + "loss": 0.3536, + "step": 927 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 0.9983400106430054, + "learning_rate": 4.081162324649299e-05, + "loss": 0.3254, + "step": 928 + }, + { + "epoch": 3.716, + "grad_norm": 0.9435644745826721, + "learning_rate": 4.080160320641282e-05, + "loss": 0.3213, + "step": 929 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 0.9191159009933472, + "learning_rate": 4.079158316633267e-05, + "loss": 0.2703, + "step": 930 + }, + { + "epoch": 3.724, + "grad_norm": 0.6620362401008606, + "learning_rate": 4.078156312625251e-05, + "loss": 0.1913, + "step": 931 + }, + { + "epoch": 3.7279999999999998, + 
"grad_norm": 0.7755175828933716, + "learning_rate": 4.0771543086172346e-05, + "loss": 0.2269, + "step": 932 + }, + { + "epoch": 3.732, + "grad_norm": 1.0141972303390503, + "learning_rate": 4.076152304609219e-05, + "loss": 0.3042, + "step": 933 + }, + { + "epoch": 3.7359999999999998, + "grad_norm": 1.0013058185577393, + "learning_rate": 4.075150300601203e-05, + "loss": 0.3252, + "step": 934 + }, + { + "epoch": 3.74, + "grad_norm": 1.1181739568710327, + "learning_rate": 4.0741482965931864e-05, + "loss": 0.3186, + "step": 935 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 1.0314396619796753, + "learning_rate": 4.0731462925851705e-05, + "loss": 0.2691, + "step": 936 + }, + { + "epoch": 3.748, + "grad_norm": 0.9695926308631897, + "learning_rate": 4.0721442885771546e-05, + "loss": 0.3565, + "step": 937 + }, + { + "epoch": 3.752, + "grad_norm": 0.9737102389335632, + "learning_rate": 4.071142284569138e-05, + "loss": 0.2772, + "step": 938 + }, + { + "epoch": 3.7560000000000002, + "grad_norm": 1.114227056503296, + "learning_rate": 4.070140280561122e-05, + "loss": 0.316, + "step": 939 + }, + { + "epoch": 3.76, + "grad_norm": 1.2355971336364746, + "learning_rate": 4.069138276553106e-05, + "loss": 0.331, + "step": 940 + }, + { + "epoch": 3.7640000000000002, + "grad_norm": 0.8689810037612915, + "learning_rate": 4.0681362725450904e-05, + "loss": 0.2914, + "step": 941 + }, + { + "epoch": 3.768, + "grad_norm": 1.04837167263031, + "learning_rate": 4.0671342685370746e-05, + "loss": 0.3081, + "step": 942 + }, + { + "epoch": 3.7720000000000002, + "grad_norm": 1.047351598739624, + "learning_rate": 4.066132264529059e-05, + "loss": 0.3239, + "step": 943 + }, + { + "epoch": 3.776, + "grad_norm": 0.9564910531044006, + "learning_rate": 4.065130260521042e-05, + "loss": 0.3193, + "step": 944 + }, + { + "epoch": 3.7800000000000002, + "grad_norm": 1.0410816669464111, + "learning_rate": 4.064128256513026e-05, + "loss": 0.3311, + "step": 945 + }, + { + "epoch": 3.784, + "grad_norm": 
0.9697766304016113, + "learning_rate": 4.0631262525050104e-05, + "loss": 0.3043, + "step": 946 + }, + { + "epoch": 3.7880000000000003, + "grad_norm": 1.0359306335449219, + "learning_rate": 4.062124248496994e-05, + "loss": 0.3646, + "step": 947 + }, + { + "epoch": 3.792, + "grad_norm": 0.8712097406387329, + "learning_rate": 4.061122244488978e-05, + "loss": 0.2917, + "step": 948 + }, + { + "epoch": 3.7960000000000003, + "grad_norm": 0.9690885543823242, + "learning_rate": 4.060120240480962e-05, + "loss": 0.3265, + "step": 949 + }, + { + "epoch": 3.8, + "grad_norm": 0.9399465322494507, + "learning_rate": 4.059118236472946e-05, + "loss": 0.3144, + "step": 950 + }, + { + "epoch": 3.8040000000000003, + "grad_norm": 1.0360187292099, + "learning_rate": 4.0581162324649304e-05, + "loss": 0.3777, + "step": 951 + }, + { + "epoch": 3.808, + "grad_norm": 1.233900785446167, + "learning_rate": 4.057114228456914e-05, + "loss": 0.3032, + "step": 952 + }, + { + "epoch": 3.8120000000000003, + "grad_norm": 1.0116194486618042, + "learning_rate": 4.056112224448898e-05, + "loss": 0.2787, + "step": 953 + }, + { + "epoch": 3.816, + "grad_norm": 0.9884048700332642, + "learning_rate": 4.055110220440882e-05, + "loss": 0.3147, + "step": 954 + }, + { + "epoch": 3.82, + "grad_norm": 0.9382745623588562, + "learning_rate": 4.0541082164328655e-05, + "loss": 0.3232, + "step": 955 + }, + { + "epoch": 3.824, + "grad_norm": 0.9225025177001953, + "learning_rate": 4.0531062124248497e-05, + "loss": 0.3334, + "step": 956 + }, + { + "epoch": 3.828, + "grad_norm": 0.8968762159347534, + "learning_rate": 4.052104208416834e-05, + "loss": 0.3146, + "step": 957 + }, + { + "epoch": 3.832, + "grad_norm": 0.9080137610435486, + "learning_rate": 4.051102204408818e-05, + "loss": 0.2877, + "step": 958 + }, + { + "epoch": 3.836, + "grad_norm": 0.9056142568588257, + "learning_rate": 4.050100200400802e-05, + "loss": 0.2728, + "step": 959 + }, + { + "epoch": 3.84, + "grad_norm": 0.976733922958374, + "learning_rate": 
4.049098196392786e-05, + "loss": 0.3077, + "step": 960 + }, + { + "epoch": 3.844, + "grad_norm": 1.0319982767105103, + "learning_rate": 4.0480961923847696e-05, + "loss": 0.3124, + "step": 961 + }, + { + "epoch": 3.848, + "grad_norm": 0.8904579281806946, + "learning_rate": 4.047094188376754e-05, + "loss": 0.3197, + "step": 962 + }, + { + "epoch": 3.852, + "grad_norm": 0.9294513463973999, + "learning_rate": 4.046092184368738e-05, + "loss": 0.318, + "step": 963 + }, + { + "epoch": 3.856, + "grad_norm": 0.9072182178497314, + "learning_rate": 4.045090180360721e-05, + "loss": 0.2971, + "step": 964 + }, + { + "epoch": 3.86, + "grad_norm": 1.0785537958145142, + "learning_rate": 4.0440881763527055e-05, + "loss": 0.3362, + "step": 965 + }, + { + "epoch": 3.864, + "grad_norm": 1.1097102165222168, + "learning_rate": 4.0430861723446896e-05, + "loss": 0.3528, + "step": 966 + }, + { + "epoch": 3.868, + "grad_norm": 1.0673046112060547, + "learning_rate": 4.042084168336673e-05, + "loss": 0.3085, + "step": 967 + }, + { + "epoch": 3.872, + "grad_norm": 0.8683971762657166, + "learning_rate": 4.041082164328657e-05, + "loss": 0.2852, + "step": 968 + }, + { + "epoch": 3.876, + "grad_norm": 0.9497571587562561, + "learning_rate": 4.040080160320642e-05, + "loss": 0.2755, + "step": 969 + }, + { + "epoch": 3.88, + "grad_norm": 1.0285412073135376, + "learning_rate": 4.0390781563126254e-05, + "loss": 0.3037, + "step": 970 + }, + { + "epoch": 3.884, + "grad_norm": 1.1578757762908936, + "learning_rate": 4.0380761523046096e-05, + "loss": 0.3113, + "step": 971 + }, + { + "epoch": 3.888, + "grad_norm": 1.017016887664795, + "learning_rate": 4.037074148296594e-05, + "loss": 0.3655, + "step": 972 + }, + { + "epoch": 3.892, + "grad_norm": 1.044767141342163, + "learning_rate": 4.036072144288577e-05, + "loss": 0.3088, + "step": 973 + }, + { + "epoch": 3.896, + "grad_norm": 1.098922848701477, + "learning_rate": 4.035070140280561e-05, + "loss": 0.3294, + "step": 974 + }, + { + "epoch": 3.9, + "grad_norm": 
0.9794385433197021, + "learning_rate": 4.0340681362725454e-05, + "loss": 0.302, + "step": 975 + }, + { + "epoch": 3.904, + "grad_norm": 0.9461687207221985, + "learning_rate": 4.033066132264529e-05, + "loss": 0.3199, + "step": 976 + }, + { + "epoch": 3.908, + "grad_norm": 0.971648633480072, + "learning_rate": 4.032064128256513e-05, + "loss": 0.3065, + "step": 977 + }, + { + "epoch": 3.912, + "grad_norm": 0.8333442807197571, + "learning_rate": 4.031062124248497e-05, + "loss": 0.2888, + "step": 978 + }, + { + "epoch": 3.916, + "grad_norm": 0.8855047225952148, + "learning_rate": 4.030060120240481e-05, + "loss": 0.3273, + "step": 979 + }, + { + "epoch": 3.92, + "grad_norm": 0.7457961440086365, + "learning_rate": 4.0290581162324654e-05, + "loss": 0.2061, + "step": 980 + }, + { + "epoch": 3.924, + "grad_norm": 0.9684552550315857, + "learning_rate": 4.0280561122244495e-05, + "loss": 0.3369, + "step": 981 + }, + { + "epoch": 3.928, + "grad_norm": 1.0121392011642456, + "learning_rate": 4.027054108216433e-05, + "loss": 0.3324, + "step": 982 + }, + { + "epoch": 3.932, + "grad_norm": 0.9265434145927429, + "learning_rate": 4.026052104208417e-05, + "loss": 0.3017, + "step": 983 + }, + { + "epoch": 3.936, + "grad_norm": 1.0135233402252197, + "learning_rate": 4.025050100200401e-05, + "loss": 0.2969, + "step": 984 + }, + { + "epoch": 3.94, + "grad_norm": 1.0143775939941406, + "learning_rate": 4.0240480961923846e-05, + "loss": 0.2992, + "step": 985 + }, + { + "epoch": 3.944, + "grad_norm": 0.9987703561782837, + "learning_rate": 4.023046092184369e-05, + "loss": 0.316, + "step": 986 + }, + { + "epoch": 3.948, + "grad_norm": 0.9264063239097595, + "learning_rate": 4.022044088176353e-05, + "loss": 0.3123, + "step": 987 + }, + { + "epoch": 3.952, + "grad_norm": 1.1002973318099976, + "learning_rate": 4.0210420841683363e-05, + "loss": 0.3324, + "step": 988 + }, + { + "epoch": 3.956, + "grad_norm": 1.0875300168991089, + "learning_rate": 4.020040080160321e-05, + "loss": 0.3193, + "step": 989 + 
}, + { + "epoch": 3.96, + "grad_norm": 1.137242078781128, + "learning_rate": 4.019038076152305e-05, + "loss": 0.2993, + "step": 990 + }, + { + "epoch": 3.964, + "grad_norm": 1.010221242904663, + "learning_rate": 4.018036072144289e-05, + "loss": 0.2902, + "step": 991 + }, + { + "epoch": 3.968, + "grad_norm": 1.0185948610305786, + "learning_rate": 4.017034068136273e-05, + "loss": 0.2858, + "step": 992 + }, + { + "epoch": 3.972, + "grad_norm": 1.1539437770843506, + "learning_rate": 4.016032064128257e-05, + "loss": 0.3179, + "step": 993 + }, + { + "epoch": 3.976, + "grad_norm": 1.0386922359466553, + "learning_rate": 4.0150300601202404e-05, + "loss": 0.3271, + "step": 994 + }, + { + "epoch": 3.98, + "grad_norm": 1.041878581047058, + "learning_rate": 4.0140280561122246e-05, + "loss": 0.3355, + "step": 995 + }, + { + "epoch": 3.984, + "grad_norm": 1.1333723068237305, + "learning_rate": 4.013026052104209e-05, + "loss": 0.3385, + "step": 996 + }, + { + "epoch": 3.988, + "grad_norm": 1.0429155826568604, + "learning_rate": 4.012024048096192e-05, + "loss": 0.2962, + "step": 997 + }, + { + "epoch": 3.992, + "grad_norm": 1.0766241550445557, + "learning_rate": 4.011022044088176e-05, + "loss": 0.3384, + "step": 998 + }, + { + "epoch": 3.996, + "grad_norm": 0.979697048664093, + "learning_rate": 4.010020040080161e-05, + "loss": 0.2805, + "step": 999 + }, + { + "epoch": 4.0, + "grad_norm": 1.0180047750473022, + "learning_rate": 4.0090180360721445e-05, + "loss": 0.3099, + "step": 1000 + }, + { + "epoch": 4.004, + "grad_norm": 1.014128565788269, + "learning_rate": 4.0080160320641287e-05, + "loss": 0.2158, + "step": 1001 + }, + { + "epoch": 4.008, + "grad_norm": 0.9923162460327148, + "learning_rate": 4.007014028056113e-05, + "loss": 0.2208, + "step": 1002 + }, + { + "epoch": 4.012, + "grad_norm": 1.0028576850891113, + "learning_rate": 4.006012024048096e-05, + "loss": 0.2285, + "step": 1003 + }, + { + "epoch": 4.016, + "grad_norm": 0.962719738483429, + "learning_rate": 
4.0050100200400804e-05, + "loss": 0.2034, + "step": 1004 + }, + { + "epoch": 4.02, + "grad_norm": 1.013881802558899, + "learning_rate": 4.0040080160320645e-05, + "loss": 0.1846, + "step": 1005 + }, + { + "epoch": 4.024, + "grad_norm": 0.8782246708869934, + "learning_rate": 4.003006012024048e-05, + "loss": 0.2139, + "step": 1006 + }, + { + "epoch": 4.028, + "grad_norm": 1.26191246509552, + "learning_rate": 4.002004008016032e-05, + "loss": 0.2172, + "step": 1007 + }, + { + "epoch": 4.032, + "grad_norm": 1.2841931581497192, + "learning_rate": 4.001002004008016e-05, + "loss": 0.2612, + "step": 1008 + }, + { + "epoch": 4.036, + "grad_norm": 1.2775436639785767, + "learning_rate": 4e-05, + "loss": 0.1986, + "step": 1009 + }, + { + "epoch": 4.04, + "grad_norm": 1.3614190816879272, + "learning_rate": 3.9989979959919845e-05, + "loss": 0.2709, + "step": 1010 + }, + { + "epoch": 4.044, + "grad_norm": 1.4308106899261475, + "learning_rate": 3.997995991983968e-05, + "loss": 0.2196, + "step": 1011 + }, + { + "epoch": 4.048, + "grad_norm": 1.4295969009399414, + "learning_rate": 3.996993987975952e-05, + "loss": 0.2131, + "step": 1012 + }, + { + "epoch": 4.052, + "grad_norm": 1.4375779628753662, + "learning_rate": 3.995991983967936e-05, + "loss": 0.203, + "step": 1013 + }, + { + "epoch": 4.056, + "grad_norm": 1.3883522748947144, + "learning_rate": 3.99498997995992e-05, + "loss": 0.2139, + "step": 1014 + }, + { + "epoch": 4.06, + "grad_norm": 1.211121678352356, + "learning_rate": 3.993987975951904e-05, + "loss": 0.2068, + "step": 1015 + }, + { + "epoch": 4.064, + "grad_norm": 1.3009765148162842, + "learning_rate": 3.992985971943888e-05, + "loss": 0.2104, + "step": 1016 + }, + { + "epoch": 4.068, + "grad_norm": 1.137965202331543, + "learning_rate": 3.991983967935872e-05, + "loss": 0.1963, + "step": 1017 + }, + { + "epoch": 4.072, + "grad_norm": 1.2894974946975708, + "learning_rate": 3.990981963927856e-05, + "loss": 0.2235, + "step": 1018 + }, + { + "epoch": 4.076, + "grad_norm": 
0.706529974937439, + "learning_rate": 3.98997995991984e-05, + "loss": 0.098, + "step": 1019 + }, + { + "epoch": 4.08, + "grad_norm": 1.130771279335022, + "learning_rate": 3.988977955911824e-05, + "loss": 0.2235, + "step": 1020 + }, + { + "epoch": 4.084, + "grad_norm": 1.171426773071289, + "learning_rate": 3.987975951903808e-05, + "loss": 0.1852, + "step": 1021 + }, + { + "epoch": 4.088, + "grad_norm": 1.0851662158966064, + "learning_rate": 3.986973947895792e-05, + "loss": 0.1879, + "step": 1022 + }, + { + "epoch": 4.092, + "grad_norm": 1.3458068370819092, + "learning_rate": 3.9859719438877754e-05, + "loss": 0.2321, + "step": 1023 + }, + { + "epoch": 4.096, + "grad_norm": 1.1501377820968628, + "learning_rate": 3.9849699398797595e-05, + "loss": 0.2157, + "step": 1024 + }, + { + "epoch": 4.1, + "grad_norm": 1.2266032695770264, + "learning_rate": 3.983967935871744e-05, + "loss": 0.22, + "step": 1025 + }, + { + "epoch": 4.104, + "grad_norm": 1.1825183629989624, + "learning_rate": 3.982965931863728e-05, + "loss": 0.1887, + "step": 1026 + }, + { + "epoch": 4.108, + "grad_norm": 1.1460607051849365, + "learning_rate": 3.981963927855711e-05, + "loss": 0.1834, + "step": 1027 + }, + { + "epoch": 4.112, + "grad_norm": 1.2845118045806885, + "learning_rate": 3.980961923847696e-05, + "loss": 0.223, + "step": 1028 + }, + { + "epoch": 4.116, + "grad_norm": 1.3526803255081177, + "learning_rate": 3.9799599198396795e-05, + "loss": 0.1907, + "step": 1029 + }, + { + "epoch": 4.12, + "grad_norm": 1.2375893592834473, + "learning_rate": 3.9789579158316636e-05, + "loss": 0.188, + "step": 1030 + }, + { + "epoch": 4.124, + "grad_norm": 1.2433942556381226, + "learning_rate": 3.977955911823648e-05, + "loss": 0.2173, + "step": 1031 + }, + { + "epoch": 4.128, + "grad_norm": 1.2910653352737427, + "learning_rate": 3.976953907815631e-05, + "loss": 0.192, + "step": 1032 + }, + { + "epoch": 4.132, + "grad_norm": 1.0230827331542969, + "learning_rate": 3.9759519038076153e-05, + "loss": 0.1544, + "step": 
1033 + }, + { + "epoch": 4.136, + "grad_norm": 0.9551684856414795, + "learning_rate": 3.9749498997995995e-05, + "loss": 0.1518, + "step": 1034 + }, + { + "epoch": 4.14, + "grad_norm": 1.2070391178131104, + "learning_rate": 3.973947895791583e-05, + "loss": 0.1783, + "step": 1035 + }, + { + "epoch": 4.144, + "grad_norm": 1.4213783740997314, + "learning_rate": 3.972945891783567e-05, + "loss": 0.2237, + "step": 1036 + }, + { + "epoch": 4.148, + "grad_norm": 1.2974165678024292, + "learning_rate": 3.971943887775551e-05, + "loss": 0.1768, + "step": 1037 + }, + { + "epoch": 4.152, + "grad_norm": 0.8103154301643372, + "learning_rate": 3.970941883767535e-05, + "loss": 0.0919, + "step": 1038 + }, + { + "epoch": 4.156, + "grad_norm": 1.279154896736145, + "learning_rate": 3.9699398797595194e-05, + "loss": 0.2302, + "step": 1039 + }, + { + "epoch": 4.16, + "grad_norm": 1.3680810928344727, + "learning_rate": 3.9689378757515036e-05, + "loss": 0.2242, + "step": 1040 + }, + { + "epoch": 4.164, + "grad_norm": 1.184191107749939, + "learning_rate": 3.967935871743487e-05, + "loss": 0.2504, + "step": 1041 + }, + { + "epoch": 4.168, + "grad_norm": 1.2547346353530884, + "learning_rate": 3.966933867735471e-05, + "loss": 0.2051, + "step": 1042 + }, + { + "epoch": 4.172, + "grad_norm": 1.1803265810012817, + "learning_rate": 3.965931863727455e-05, + "loss": 0.2276, + "step": 1043 + }, + { + "epoch": 4.176, + "grad_norm": 1.4127157926559448, + "learning_rate": 3.964929859719439e-05, + "loss": 0.2378, + "step": 1044 + }, + { + "epoch": 4.18, + "grad_norm": 1.2966582775115967, + "learning_rate": 3.963927855711423e-05, + "loss": 0.1672, + "step": 1045 + }, + { + "epoch": 4.184, + "grad_norm": 1.1143783330917358, + "learning_rate": 3.962925851703407e-05, + "loss": 0.2011, + "step": 1046 + }, + { + "epoch": 4.188, + "grad_norm": 1.2296042442321777, + "learning_rate": 3.9619238476953904e-05, + "loss": 0.1928, + "step": 1047 + }, + { + "epoch": 4.192, + "grad_norm": 1.2343521118164062, + 
"learning_rate": 3.960921843687375e-05, + "loss": 0.2026, + "step": 1048 + }, + { + "epoch": 4.196, + "grad_norm": 1.2687957286834717, + "learning_rate": 3.9599198396793594e-05, + "loss": 0.2408, + "step": 1049 + }, + { + "epoch": 4.2, + "grad_norm": 1.172644019126892, + "learning_rate": 3.958917835671343e-05, + "loss": 0.2206, + "step": 1050 + }, + { + "epoch": 4.204, + "grad_norm": 1.220900297164917, + "learning_rate": 3.957915831663327e-05, + "loss": 0.2039, + "step": 1051 + }, + { + "epoch": 4.208, + "grad_norm": 1.3167065382003784, + "learning_rate": 3.956913827655311e-05, + "loss": 0.1992, + "step": 1052 + }, + { + "epoch": 4.212, + "grad_norm": 1.2286665439605713, + "learning_rate": 3.9559118236472945e-05, + "loss": 0.1897, + "step": 1053 + }, + { + "epoch": 4.216, + "grad_norm": 1.2382471561431885, + "learning_rate": 3.9549098196392786e-05, + "loss": 0.2299, + "step": 1054 + }, + { + "epoch": 4.22, + "grad_norm": 1.2562413215637207, + "learning_rate": 3.953907815631263e-05, + "loss": 0.252, + "step": 1055 + }, + { + "epoch": 4.224, + "grad_norm": 1.2567752599716187, + "learning_rate": 3.952905811623246e-05, + "loss": 0.2026, + "step": 1056 + }, + { + "epoch": 4.228, + "grad_norm": 1.1952612400054932, + "learning_rate": 3.9519038076152304e-05, + "loss": 0.2166, + "step": 1057 + }, + { + "epoch": 4.232, + "grad_norm": 1.23219633102417, + "learning_rate": 3.950901803607215e-05, + "loss": 0.2183, + "step": 1058 + }, + { + "epoch": 4.236, + "grad_norm": 1.3385809659957886, + "learning_rate": 3.9498997995991986e-05, + "loss": 0.2031, + "step": 1059 + }, + { + "epoch": 4.24, + "grad_norm": 1.0677396059036255, + "learning_rate": 3.948897795591183e-05, + "loss": 0.2011, + "step": 1060 + }, + { + "epoch": 4.244, + "grad_norm": 1.4090005159378052, + "learning_rate": 3.947895791583167e-05, + "loss": 0.2227, + "step": 1061 + }, + { + "epoch": 4.248, + "grad_norm": 1.1532093286514282, + "learning_rate": 3.94689378757515e-05, + "loss": 0.1974, + "step": 1062 + }, + { + 
"epoch": 4.252, + "grad_norm": 1.1573532819747925, + "learning_rate": 3.9458917835671344e-05, + "loss": 0.2248, + "step": 1063 + }, + { + "epoch": 4.256, + "grad_norm": 1.2687122821807861, + "learning_rate": 3.9448897795591186e-05, + "loss": 0.2144, + "step": 1064 + }, + { + "epoch": 4.26, + "grad_norm": 1.2371104955673218, + "learning_rate": 3.943887775551102e-05, + "loss": 0.1946, + "step": 1065 + }, + { + "epoch": 4.264, + "grad_norm": 0.7894278764724731, + "learning_rate": 3.942885771543086e-05, + "loss": 0.1066, + "step": 1066 + }, + { + "epoch": 4.268, + "grad_norm": 1.357580304145813, + "learning_rate": 3.94188376753507e-05, + "loss": 0.2478, + "step": 1067 + }, + { + "epoch": 4.272, + "grad_norm": 1.174869418144226, + "learning_rate": 3.9408817635270544e-05, + "loss": 0.1946, + "step": 1068 + }, + { + "epoch": 4.276, + "grad_norm": 1.242233157157898, + "learning_rate": 3.9398797595190385e-05, + "loss": 0.2277, + "step": 1069 + }, + { + "epoch": 4.28, + "grad_norm": 1.31795334815979, + "learning_rate": 3.938877755511023e-05, + "loss": 0.2155, + "step": 1070 + }, + { + "epoch": 4.284, + "grad_norm": 0.8449123501777649, + "learning_rate": 3.937875751503006e-05, + "loss": 0.1393, + "step": 1071 + }, + { + "epoch": 4.288, + "grad_norm": 1.4304672479629517, + "learning_rate": 3.93687374749499e-05, + "loss": 0.2227, + "step": 1072 + }, + { + "epoch": 4.292, + "grad_norm": 1.1797378063201904, + "learning_rate": 3.9358717434869744e-05, + "loss": 0.2414, + "step": 1073 + }, + { + "epoch": 4.296, + "grad_norm": 1.1290992498397827, + "learning_rate": 3.934869739478958e-05, + "loss": 0.2528, + "step": 1074 + }, + { + "epoch": 4.3, + "grad_norm": 1.2640211582183838, + "learning_rate": 3.933867735470942e-05, + "loss": 0.2229, + "step": 1075 + }, + { + "epoch": 4.304, + "grad_norm": 1.117639422416687, + "learning_rate": 3.932865731462926e-05, + "loss": 0.2176, + "step": 1076 + }, + { + "epoch": 4.308, + "grad_norm": 1.35614013671875, + "learning_rate": 
3.93186372745491e-05, + "loss": 0.1896, + "step": 1077 + }, + { + "epoch": 4.312, + "grad_norm": 1.2274762392044067, + "learning_rate": 3.930861723446894e-05, + "loss": 0.2213, + "step": 1078 + }, + { + "epoch": 4.316, + "grad_norm": 1.1732356548309326, + "learning_rate": 3.929859719438878e-05, + "loss": 0.2233, + "step": 1079 + }, + { + "epoch": 4.32, + "grad_norm": 1.1939340829849243, + "learning_rate": 3.928857715430862e-05, + "loss": 0.1964, + "step": 1080 + }, + { + "epoch": 4.324, + "grad_norm": 1.3930495977401733, + "learning_rate": 3.927855711422846e-05, + "loss": 0.1965, + "step": 1081 + }, + { + "epoch": 4.328, + "grad_norm": 1.371573567390442, + "learning_rate": 3.9268537074148295e-05, + "loss": 0.2191, + "step": 1082 + }, + { + "epoch": 4.332, + "grad_norm": 1.4507895708084106, + "learning_rate": 3.9258517034068136e-05, + "loss": 0.2927, + "step": 1083 + }, + { + "epoch": 4.336, + "grad_norm": 1.3624083995819092, + "learning_rate": 3.924849699398798e-05, + "loss": 0.2211, + "step": 1084 + }, + { + "epoch": 4.34, + "grad_norm": 1.1890075206756592, + "learning_rate": 3.923847695390782e-05, + "loss": 0.1766, + "step": 1085 + }, + { + "epoch": 4.344, + "grad_norm": 1.4425567388534546, + "learning_rate": 3.922845691382765e-05, + "loss": 0.205, + "step": 1086 + }, + { + "epoch": 4.348, + "grad_norm": 1.2859835624694824, + "learning_rate": 3.92184368737475e-05, + "loss": 0.2194, + "step": 1087 + }, + { + "epoch": 4.352, + "grad_norm": 1.2856872081756592, + "learning_rate": 3.9208416833667336e-05, + "loss": 0.197, + "step": 1088 + }, + { + "epoch": 4.356, + "grad_norm": 1.2462953329086304, + "learning_rate": 3.919839679358718e-05, + "loss": 0.2121, + "step": 1089 + }, + { + "epoch": 4.36, + "grad_norm": 1.364990472793579, + "learning_rate": 3.918837675350702e-05, + "loss": 0.2367, + "step": 1090 + }, + { + "epoch": 4.364, + "grad_norm": 1.3470253944396973, + "learning_rate": 3.917835671342685e-05, + "loss": 0.2263, + "step": 1091 + }, + { + "epoch": 4.368, + 
"grad_norm": 1.2777583599090576, + "learning_rate": 3.9168336673346694e-05, + "loss": 0.2483, + "step": 1092 + }, + { + "epoch": 4.372, + "grad_norm": 1.1814221143722534, + "learning_rate": 3.9158316633266535e-05, + "loss": 0.2304, + "step": 1093 + }, + { + "epoch": 4.376, + "grad_norm": 1.329244613647461, + "learning_rate": 3.914829659318637e-05, + "loss": 0.258, + "step": 1094 + }, + { + "epoch": 4.38, + "grad_norm": 1.2682360410690308, + "learning_rate": 3.913827655310621e-05, + "loss": 0.2278, + "step": 1095 + }, + { + "epoch": 4.384, + "grad_norm": 1.2741080522537231, + "learning_rate": 3.912825651302605e-05, + "loss": 0.2437, + "step": 1096 + }, + { + "epoch": 4.388, + "grad_norm": 1.1235476732254028, + "learning_rate": 3.9118236472945894e-05, + "loss": 0.2209, + "step": 1097 + }, + { + "epoch": 4.392, + "grad_norm": 1.1556679010391235, + "learning_rate": 3.9108216432865735e-05, + "loss": 0.2668, + "step": 1098 + }, + { + "epoch": 4.396, + "grad_norm": 1.2366710901260376, + "learning_rate": 3.9098196392785576e-05, + "loss": 0.2822, + "step": 1099 + }, + { + "epoch": 4.4, + "grad_norm": 1.325208067893982, + "learning_rate": 3.908817635270541e-05, + "loss": 0.2203, + "step": 1100 + }, + { + "epoch": 4.404, + "grad_norm": 1.166352391242981, + "learning_rate": 3.907815631262525e-05, + "loss": 0.2441, + "step": 1101 + }, + { + "epoch": 4.408, + "grad_norm": 1.2230168581008911, + "learning_rate": 3.9068136272545093e-05, + "loss": 0.2217, + "step": 1102 + }, + { + "epoch": 4.412, + "grad_norm": 1.2550815343856812, + "learning_rate": 3.905811623246493e-05, + "loss": 0.2299, + "step": 1103 + }, + { + "epoch": 4.416, + "grad_norm": 1.3714336156845093, + "learning_rate": 3.904809619238477e-05, + "loss": 0.2247, + "step": 1104 + }, + { + "epoch": 4.42, + "grad_norm": 1.2269859313964844, + "learning_rate": 3.903807615230461e-05, + "loss": 0.21, + "step": 1105 + }, + { + "epoch": 4.424, + "grad_norm": 1.420852780342102, + "learning_rate": 3.9028056112224445e-05, + "loss": 
0.2248, + "step": 1106 + }, + { + "epoch": 4.428, + "grad_norm": 1.2329579591751099, + "learning_rate": 3.901803607214429e-05, + "loss": 0.2506, + "step": 1107 + }, + { + "epoch": 4.432, + "grad_norm": 1.2145705223083496, + "learning_rate": 3.9008016032064134e-05, + "loss": 0.2244, + "step": 1108 + }, + { + "epoch": 4.436, + "grad_norm": 1.3034549951553345, + "learning_rate": 3.899799599198397e-05, + "loss": 0.1866, + "step": 1109 + }, + { + "epoch": 4.44, + "grad_norm": 1.1892132759094238, + "learning_rate": 3.898797595190381e-05, + "loss": 0.2283, + "step": 1110 + }, + { + "epoch": 4.444, + "grad_norm": 1.279307246208191, + "learning_rate": 3.897795591182365e-05, + "loss": 0.2039, + "step": 1111 + }, + { + "epoch": 4.448, + "grad_norm": 0.917590320110321, + "learning_rate": 3.8967935871743486e-05, + "loss": 0.1489, + "step": 1112 + }, + { + "epoch": 4.452, + "grad_norm": 1.2451698780059814, + "learning_rate": 3.895791583166333e-05, + "loss": 0.1824, + "step": 1113 + }, + { + "epoch": 4.456, + "grad_norm": 1.3629359006881714, + "learning_rate": 3.894789579158317e-05, + "loss": 0.2174, + "step": 1114 + }, + { + "epoch": 4.46, + "grad_norm": 1.4630622863769531, + "learning_rate": 3.8937875751503e-05, + "loss": 0.2627, + "step": 1115 + }, + { + "epoch": 4.464, + "grad_norm": 1.2522321939468384, + "learning_rate": 3.8927855711422844e-05, + "loss": 0.2285, + "step": 1116 + }, + { + "epoch": 4.468, + "grad_norm": 1.2080490589141846, + "learning_rate": 3.891783567134269e-05, + "loss": 0.2458, + "step": 1117 + }, + { + "epoch": 4.4719999999999995, + "grad_norm": 1.2194185256958008, + "learning_rate": 3.890781563126253e-05, + "loss": 0.2394, + "step": 1118 + }, + { + "epoch": 4.476, + "grad_norm": 1.1908085346221924, + "learning_rate": 3.889779559118237e-05, + "loss": 0.2071, + "step": 1119 + }, + { + "epoch": 4.48, + "grad_norm": 1.3185300827026367, + "learning_rate": 3.888777555110221e-05, + "loss": 0.1841, + "step": 1120 + }, + { + "epoch": 4.484, + "grad_norm": 
1.2330673933029175, + "learning_rate": 3.8877755511022044e-05, + "loss": 0.2216, + "step": 1121 + }, + { + "epoch": 4.4879999999999995, + "grad_norm": 1.1211673021316528, + "learning_rate": 3.8867735470941885e-05, + "loss": 0.2207, + "step": 1122 + }, + { + "epoch": 4.492, + "grad_norm": 1.3571367263793945, + "learning_rate": 3.8857715430861727e-05, + "loss": 0.3118, + "step": 1123 + }, + { + "epoch": 4.496, + "grad_norm": 1.3315199613571167, + "learning_rate": 3.884769539078156e-05, + "loss": 0.2559, + "step": 1124 + }, + { + "epoch": 4.5, + "grad_norm": 1.2943772077560425, + "learning_rate": 3.88376753507014e-05, + "loss": 0.2258, + "step": 1125 + }, + { + "epoch": 4.504, + "grad_norm": 1.3622092008590698, + "learning_rate": 3.882765531062125e-05, + "loss": 0.2172, + "step": 1126 + }, + { + "epoch": 4.508, + "grad_norm": 1.2951812744140625, + "learning_rate": 3.8817635270541085e-05, + "loss": 0.2419, + "step": 1127 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 1.2247873544692993, + "learning_rate": 3.8807615230460926e-05, + "loss": 0.1889, + "step": 1128 + }, + { + "epoch": 4.516, + "grad_norm": 1.29974365234375, + "learning_rate": 3.879759519038077e-05, + "loss": 0.2432, + "step": 1129 + }, + { + "epoch": 4.52, + "grad_norm": 1.2597442865371704, + "learning_rate": 3.87875751503006e-05, + "loss": 0.25, + "step": 1130 + }, + { + "epoch": 4.524, + "grad_norm": 1.388176679611206, + "learning_rate": 3.877755511022044e-05, + "loss": 0.2447, + "step": 1131 + }, + { + "epoch": 4.5280000000000005, + "grad_norm": 1.1391148567199707, + "learning_rate": 3.8767535070140285e-05, + "loss": 0.273, + "step": 1132 + }, + { + "epoch": 4.532, + "grad_norm": 1.2334431409835815, + "learning_rate": 3.875751503006012e-05, + "loss": 0.2268, + "step": 1133 + }, + { + "epoch": 4.536, + "grad_norm": 1.2976422309875488, + "learning_rate": 3.874749498997996e-05, + "loss": 0.2243, + "step": 1134 + }, + { + "epoch": 4.54, + "grad_norm": 1.2898563146591187, + "learning_rate": 
3.87374749498998e-05, + "loss": 0.226, + "step": 1135 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 1.309234380722046, + "learning_rate": 3.872745490981964e-05, + "loss": 0.193, + "step": 1136 + }, + { + "epoch": 4.548, + "grad_norm": 0.9056435227394104, + "learning_rate": 3.8717434869739484e-05, + "loss": 0.1361, + "step": 1137 + }, + { + "epoch": 4.552, + "grad_norm": 1.2060729265213013, + "learning_rate": 3.870741482965932e-05, + "loss": 0.2172, + "step": 1138 + }, + { + "epoch": 4.556, + "grad_norm": 1.299154281616211, + "learning_rate": 3.869739478957916e-05, + "loss": 0.2291, + "step": 1139 + }, + { + "epoch": 4.5600000000000005, + "grad_norm": 1.1153815984725952, + "learning_rate": 3.8687374749499e-05, + "loss": 0.1993, + "step": 1140 + }, + { + "epoch": 4.564, + "grad_norm": 1.402708649635315, + "learning_rate": 3.867735470941884e-05, + "loss": 0.2377, + "step": 1141 + }, + { + "epoch": 4.568, + "grad_norm": 1.3014065027236938, + "learning_rate": 3.866733466933868e-05, + "loss": 0.2178, + "step": 1142 + }, + { + "epoch": 4.572, + "grad_norm": 1.4406603574752808, + "learning_rate": 3.865731462925852e-05, + "loss": 0.2547, + "step": 1143 + }, + { + "epoch": 4.576, + "grad_norm": 1.3197580575942993, + "learning_rate": 3.864729458917836e-05, + "loss": 0.234, + "step": 1144 + }, + { + "epoch": 4.58, + "grad_norm": 1.2924617528915405, + "learning_rate": 3.8637274549098194e-05, + "loss": 0.2327, + "step": 1145 + }, + { + "epoch": 4.584, + "grad_norm": 1.121708631515503, + "learning_rate": 3.862725450901804e-05, + "loss": 0.2183, + "step": 1146 + }, + { + "epoch": 4.588, + "grad_norm": 1.1940151453018188, + "learning_rate": 3.861723446893788e-05, + "loss": 0.2617, + "step": 1147 + }, + { + "epoch": 4.592, + "grad_norm": 1.2300734519958496, + "learning_rate": 3.860721442885772e-05, + "loss": 0.1954, + "step": 1148 + }, + { + "epoch": 4.596, + "grad_norm": 1.2809008359909058, + "learning_rate": 3.859719438877756e-05, + "loss": 0.2435, + "step": 1149 + }, + { 
+ "epoch": 4.6, + "grad_norm": 1.3110355138778687, + "learning_rate": 3.8587174348697394e-05, + "loss": 0.2235, + "step": 1150 + }, + { + "epoch": 4.604, + "grad_norm": 1.3215099573135376, + "learning_rate": 3.8577154308617235e-05, + "loss": 0.2544, + "step": 1151 + }, + { + "epoch": 4.608, + "grad_norm": 1.1234500408172607, + "learning_rate": 3.8567134268537076e-05, + "loss": 0.275, + "step": 1152 + }, + { + "epoch": 4.612, + "grad_norm": 1.1621915102005005, + "learning_rate": 3.855711422845692e-05, + "loss": 0.2289, + "step": 1153 + }, + { + "epoch": 4.616, + "grad_norm": 1.163620114326477, + "learning_rate": 3.854709418837675e-05, + "loss": 0.25, + "step": 1154 + }, + { + "epoch": 4.62, + "grad_norm": 1.219595193862915, + "learning_rate": 3.853707414829659e-05, + "loss": 0.2388, + "step": 1155 + }, + { + "epoch": 4.624, + "grad_norm": 1.2607448101043701, + "learning_rate": 3.8527054108216435e-05, + "loss": 0.1885, + "step": 1156 + }, + { + "epoch": 4.628, + "grad_norm": 0.5879749655723572, + "learning_rate": 3.8517034068136276e-05, + "loss": 0.068, + "step": 1157 + }, + { + "epoch": 4.632, + "grad_norm": 1.3314592838287354, + "learning_rate": 3.850701402805612e-05, + "loss": 0.2302, + "step": 1158 + }, + { + "epoch": 4.636, + "grad_norm": 1.4725823402404785, + "learning_rate": 3.849699398797595e-05, + "loss": 0.2359, + "step": 1159 + }, + { + "epoch": 4.64, + "grad_norm": 1.294355034828186, + "learning_rate": 3.848697394789579e-05, + "loss": 0.2359, + "step": 1160 + }, + { + "epoch": 4.644, + "grad_norm": 1.3472228050231934, + "learning_rate": 3.8476953907815634e-05, + "loss": 0.2347, + "step": 1161 + }, + { + "epoch": 4.648, + "grad_norm": 1.2284510135650635, + "learning_rate": 3.846693386773547e-05, + "loss": 0.2115, + "step": 1162 + }, + { + "epoch": 4.652, + "grad_norm": 1.3090240955352783, + "learning_rate": 3.845691382765531e-05, + "loss": 0.2571, + "step": 1163 + }, + { + "epoch": 4.656, + "grad_norm": 1.103739857673645, + "learning_rate": 
3.844689378757515e-05, + "loss": 0.2399, + "step": 1164 + }, + { + "epoch": 4.66, + "grad_norm": 1.189477801322937, + "learning_rate": 3.8436873747494986e-05, + "loss": 0.2247, + "step": 1165 + }, + { + "epoch": 4.664, + "grad_norm": 1.3449199199676514, + "learning_rate": 3.8426853707414834e-05, + "loss": 0.2112, + "step": 1166 + }, + { + "epoch": 4.668, + "grad_norm": 1.3756353855133057, + "learning_rate": 3.8416833667334675e-05, + "loss": 0.2268, + "step": 1167 + }, + { + "epoch": 4.672, + "grad_norm": 1.3317773342132568, + "learning_rate": 3.840681362725451e-05, + "loss": 0.2692, + "step": 1168 + }, + { + "epoch": 4.676, + "grad_norm": 1.2678979635238647, + "learning_rate": 3.839679358717435e-05, + "loss": 0.2411, + "step": 1169 + }, + { + "epoch": 4.68, + "grad_norm": 0.8288043141365051, + "learning_rate": 3.838677354709419e-05, + "loss": 0.1286, + "step": 1170 + }, + { + "epoch": 4.684, + "grad_norm": 1.362588882446289, + "learning_rate": 3.837675350701403e-05, + "loss": 0.2238, + "step": 1171 + }, + { + "epoch": 4.688, + "grad_norm": 1.2444343566894531, + "learning_rate": 3.836673346693387e-05, + "loss": 0.2223, + "step": 1172 + }, + { + "epoch": 4.692, + "grad_norm": 1.3691511154174805, + "learning_rate": 3.835671342685371e-05, + "loss": 0.2527, + "step": 1173 + }, + { + "epoch": 4.696, + "grad_norm": 1.233352780342102, + "learning_rate": 3.8346693386773544e-05, + "loss": 0.2361, + "step": 1174 + }, + { + "epoch": 4.7, + "grad_norm": 1.1726874113082886, + "learning_rate": 3.8336673346693385e-05, + "loss": 0.2026, + "step": 1175 + }, + { + "epoch": 4.704, + "grad_norm": 1.3368510007858276, + "learning_rate": 3.832665330661323e-05, + "loss": 0.2683, + "step": 1176 + }, + { + "epoch": 4.708, + "grad_norm": 1.355893611907959, + "learning_rate": 3.831663326653307e-05, + "loss": 0.2503, + "step": 1177 + }, + { + "epoch": 4.712, + "grad_norm": 1.2774181365966797, + "learning_rate": 3.830661322645291e-05, + "loss": 0.2655, + "step": 1178 + }, + { + "epoch": 4.716, + 
"grad_norm": 1.2439703941345215, + "learning_rate": 3.829659318637275e-05, + "loss": 0.2311, + "step": 1179 + }, + { + "epoch": 4.72, + "grad_norm": 1.2443056106567383, + "learning_rate": 3.8286573146292585e-05, + "loss": 0.2011, + "step": 1180 + }, + { + "epoch": 4.724, + "grad_norm": 1.3085763454437256, + "learning_rate": 3.8276553106212426e-05, + "loss": 0.2363, + "step": 1181 + }, + { + "epoch": 4.728, + "grad_norm": 1.3546162843704224, + "learning_rate": 3.826653306613227e-05, + "loss": 0.2611, + "step": 1182 + }, + { + "epoch": 4.732, + "grad_norm": 1.2781968116760254, + "learning_rate": 3.82565130260521e-05, + "loss": 0.207, + "step": 1183 + }, + { + "epoch": 4.736, + "grad_norm": 3.8161380290985107, + "learning_rate": 3.824649298597194e-05, + "loss": 0.2138, + "step": 1184 + }, + { + "epoch": 4.74, + "grad_norm": 1.2593066692352295, + "learning_rate": 3.823647294589179e-05, + "loss": 0.2364, + "step": 1185 + }, + { + "epoch": 4.744, + "grad_norm": 1.3970239162445068, + "learning_rate": 3.8226452905811626e-05, + "loss": 0.209, + "step": 1186 + }, + { + "epoch": 4.748, + "grad_norm": 1.306069016456604, + "learning_rate": 3.821643286573147e-05, + "loss": 0.2443, + "step": 1187 + }, + { + "epoch": 4.752, + "grad_norm": 1.1042245626449585, + "learning_rate": 3.820641282565131e-05, + "loss": 0.2157, + "step": 1188 + }, + { + "epoch": 4.756, + "grad_norm": 1.157153606414795, + "learning_rate": 3.819639278557114e-05, + "loss": 0.2211, + "step": 1189 + }, + { + "epoch": 4.76, + "grad_norm": 1.1273726224899292, + "learning_rate": 3.8186372745490984e-05, + "loss": 0.2221, + "step": 1190 + }, + { + "epoch": 4.764, + "grad_norm": 1.3015018701553345, + "learning_rate": 3.8176352705410825e-05, + "loss": 0.2381, + "step": 1191 + }, + { + "epoch": 4.768, + "grad_norm": 1.152994990348816, + "learning_rate": 3.816633266533066e-05, + "loss": 0.2523, + "step": 1192 + }, + { + "epoch": 4.772, + "grad_norm": 1.5288457870483398, + "learning_rate": 3.81563126252505e-05, + "loss": 
0.2474, + "step": 1193 + }, + { + "epoch": 4.776, + "grad_norm": 1.488034963607788, + "learning_rate": 3.814629258517034e-05, + "loss": 0.2602, + "step": 1194 + }, + { + "epoch": 4.78, + "grad_norm": 1.257806420326233, + "learning_rate": 3.8136272545090184e-05, + "loss": 0.2165, + "step": 1195 + }, + { + "epoch": 4.784, + "grad_norm": 1.1476024389266968, + "learning_rate": 3.8126252505010025e-05, + "loss": 0.2374, + "step": 1196 + }, + { + "epoch": 4.788, + "grad_norm": 1.3641656637191772, + "learning_rate": 3.8116232464929866e-05, + "loss": 0.2371, + "step": 1197 + }, + { + "epoch": 4.792, + "grad_norm": 1.5159633159637451, + "learning_rate": 3.81062124248497e-05, + "loss": 0.2511, + "step": 1198 + }, + { + "epoch": 4.796, + "grad_norm": 1.186079740524292, + "learning_rate": 3.809619238476954e-05, + "loss": 0.2566, + "step": 1199 + }, + { + "epoch": 4.8, + "grad_norm": 1.5716686248779297, + "learning_rate": 3.808617234468938e-05, + "loss": 0.2318, + "step": 1200 + }, + { + "epoch": 4.804, + "grad_norm": 1.3118908405303955, + "learning_rate": 3.807615230460922e-05, + "loss": 0.2452, + "step": 1201 + }, + { + "epoch": 4.808, + "grad_norm": 1.3244889974594116, + "learning_rate": 3.806613226452906e-05, + "loss": 0.2362, + "step": 1202 + }, + { + "epoch": 4.812, + "grad_norm": 1.1287165880203247, + "learning_rate": 3.80561122244489e-05, + "loss": 0.1915, + "step": 1203 + }, + { + "epoch": 4.816, + "grad_norm": 1.2781370878219604, + "learning_rate": 3.8046092184368735e-05, + "loss": 0.2024, + "step": 1204 + }, + { + "epoch": 4.82, + "grad_norm": 1.2718026638031006, + "learning_rate": 3.803607214428858e-05, + "loss": 0.2099, + "step": 1205 + }, + { + "epoch": 4.824, + "grad_norm": 1.2175809144973755, + "learning_rate": 3.802605210420842e-05, + "loss": 0.2387, + "step": 1206 + }, + { + "epoch": 4.828, + "grad_norm": 1.1810656785964966, + "learning_rate": 3.801603206412826e-05, + "loss": 0.2773, + "step": 1207 + }, + { + "epoch": 4.832, + "grad_norm": 1.0714348554611206, + 
"learning_rate": 3.80060120240481e-05, + "loss": 0.1103, + "step": 1208 + }, + { + "epoch": 4.836, + "grad_norm": 1.2614023685455322, + "learning_rate": 3.7995991983967935e-05, + "loss": 0.2011, + "step": 1209 + }, + { + "epoch": 4.84, + "grad_norm": 1.2104132175445557, + "learning_rate": 3.7985971943887776e-05, + "loss": 0.2036, + "step": 1210 + }, + { + "epoch": 4.844, + "grad_norm": 1.322792887687683, + "learning_rate": 3.797595190380762e-05, + "loss": 0.246, + "step": 1211 + }, + { + "epoch": 4.848, + "grad_norm": 1.1660313606262207, + "learning_rate": 3.796593186372746e-05, + "loss": 0.2658, + "step": 1212 + }, + { + "epoch": 4.852, + "grad_norm": 1.3373457193374634, + "learning_rate": 3.795591182364729e-05, + "loss": 0.2282, + "step": 1213 + }, + { + "epoch": 4.856, + "grad_norm": 1.284891963005066, + "learning_rate": 3.7945891783567134e-05, + "loss": 0.2356, + "step": 1214 + }, + { + "epoch": 4.86, + "grad_norm": 1.2078474760055542, + "learning_rate": 3.7935871743486975e-05, + "loss": 0.2295, + "step": 1215 + }, + { + "epoch": 4.864, + "grad_norm": 1.158579707145691, + "learning_rate": 3.792585170340682e-05, + "loss": 0.2112, + "step": 1216 + }, + { + "epoch": 4.868, + "grad_norm": 1.3074941635131836, + "learning_rate": 3.791583166332666e-05, + "loss": 0.2361, + "step": 1217 + }, + { + "epoch": 4.872, + "grad_norm": 0.9688441157341003, + "learning_rate": 3.790581162324649e-05, + "loss": 0.1319, + "step": 1218 + }, + { + "epoch": 4.876, + "grad_norm": 1.5012363195419312, + "learning_rate": 3.7895791583166334e-05, + "loss": 0.2426, + "step": 1219 + }, + { + "epoch": 4.88, + "grad_norm": 1.236053228378296, + "learning_rate": 3.7885771543086175e-05, + "loss": 0.2136, + "step": 1220 + }, + { + "epoch": 4.884, + "grad_norm": 1.2395013570785522, + "learning_rate": 3.787575150300601e-05, + "loss": 0.2054, + "step": 1221 + }, + { + "epoch": 4.888, + "grad_norm": 1.3079355955123901, + "learning_rate": 3.786573146292585e-05, + "loss": 0.2243, + "step": 1222 + }, + { + 
"epoch": 4.892, + "grad_norm": 1.2592490911483765, + "learning_rate": 3.785571142284569e-05, + "loss": 0.2462, + "step": 1223 + }, + { + "epoch": 4.896, + "grad_norm": 1.615578055381775, + "learning_rate": 3.7845691382765533e-05, + "loss": 0.3059, + "step": 1224 + }, + { + "epoch": 4.9, + "grad_norm": 1.2457841634750366, + "learning_rate": 3.7835671342685375e-05, + "loss": 0.1957, + "step": 1225 + }, + { + "epoch": 4.904, + "grad_norm": 1.286840558052063, + "learning_rate": 3.7825651302605216e-05, + "loss": 0.2372, + "step": 1226 + }, + { + "epoch": 4.908, + "grad_norm": 1.2358272075653076, + "learning_rate": 3.781563126252505e-05, + "loss": 0.2342, + "step": 1227 + }, + { + "epoch": 4.912, + "grad_norm": 1.3874037265777588, + "learning_rate": 3.780561122244489e-05, + "loss": 0.2249, + "step": 1228 + }, + { + "epoch": 4.916, + "grad_norm": 1.2254422903060913, + "learning_rate": 3.779559118236473e-05, + "loss": 0.2138, + "step": 1229 + }, + { + "epoch": 4.92, + "grad_norm": 1.2836542129516602, + "learning_rate": 3.778557114228457e-05, + "loss": 0.2212, + "step": 1230 + }, + { + "epoch": 4.924, + "grad_norm": 1.3436830043792725, + "learning_rate": 3.777555110220441e-05, + "loss": 0.1943, + "step": 1231 + }, + { + "epoch": 4.928, + "grad_norm": 1.2510602474212646, + "learning_rate": 3.776553106212425e-05, + "loss": 0.2079, + "step": 1232 + }, + { + "epoch": 4.932, + "grad_norm": 1.2235398292541504, + "learning_rate": 3.7755511022044085e-05, + "loss": 0.245, + "step": 1233 + }, + { + "epoch": 4.936, + "grad_norm": 1.2405413389205933, + "learning_rate": 3.774549098196393e-05, + "loss": 0.2368, + "step": 1234 + }, + { + "epoch": 4.9399999999999995, + "grad_norm": 1.2697187662124634, + "learning_rate": 3.7735470941883774e-05, + "loss": 0.2472, + "step": 1235 + }, + { + "epoch": 4.944, + "grad_norm": 1.250165343284607, + "learning_rate": 3.772545090180361e-05, + "loss": 0.245, + "step": 1236 + }, + { + "epoch": 4.948, + "grad_norm": 1.3872276544570923, + "learning_rate": 
3.771543086172345e-05, + "loss": 0.2394, + "step": 1237 + }, + { + "epoch": 4.952, + "grad_norm": 1.4038046598434448, + "learning_rate": 3.770541082164329e-05, + "loss": 0.2684, + "step": 1238 + }, + { + "epoch": 4.9559999999999995, + "grad_norm": 1.1734498739242554, + "learning_rate": 3.7695390781563126e-05, + "loss": 0.2401, + "step": 1239 + }, + { + "epoch": 4.96, + "grad_norm": 1.181815266609192, + "learning_rate": 3.768537074148297e-05, + "loss": 0.2575, + "step": 1240 + }, + { + "epoch": 4.964, + "grad_norm": 1.4323796033859253, + "learning_rate": 3.767535070140281e-05, + "loss": 0.2368, + "step": 1241 + }, + { + "epoch": 4.968, + "grad_norm": 1.2690467834472656, + "learning_rate": 3.766533066132264e-05, + "loss": 0.2726, + "step": 1242 + }, + { + "epoch": 4.9719999999999995, + "grad_norm": 1.060066819190979, + "learning_rate": 3.7655310621242484e-05, + "loss": 0.2176, + "step": 1243 + }, + { + "epoch": 4.976, + "grad_norm": 1.293988823890686, + "learning_rate": 3.764529058116233e-05, + "loss": 0.2382, + "step": 1244 + }, + { + "epoch": 4.98, + "grad_norm": 1.1977914571762085, + "learning_rate": 3.7635270541082167e-05, + "loss": 0.2414, + "step": 1245 + }, + { + "epoch": 4.984, + "grad_norm": 1.373465895652771, + "learning_rate": 3.762525050100201e-05, + "loss": 0.2524, + "step": 1246 + }, + { + "epoch": 4.9879999999999995, + "grad_norm": 1.2298009395599365, + "learning_rate": 3.761523046092185e-05, + "loss": 0.2156, + "step": 1247 + }, + { + "epoch": 4.992, + "grad_norm": 1.460239052772522, + "learning_rate": 3.7605210420841684e-05, + "loss": 0.2943, + "step": 1248 + }, + { + "epoch": 4.996, + "grad_norm": 1.2077012062072754, + "learning_rate": 3.7595190380761525e-05, + "loss": 0.2488, + "step": 1249 + }, + { + "epoch": 5.0, + "grad_norm": 1.2159690856933594, + "learning_rate": 3.7585170340681366e-05, + "loss": 0.237, + "step": 1250 + }, + { + "epoch": 5.004, + "grad_norm": 1.1079336404800415, + "learning_rate": 3.75751503006012e-05, + "loss": 0.1713, + 
"step": 1251 + }, + { + "epoch": 5.008, + "grad_norm": 1.1447618007659912, + "learning_rate": 3.756513026052104e-05, + "loss": 0.1614, + "step": 1252 + }, + { + "epoch": 5.012, + "grad_norm": 1.0622740983963013, + "learning_rate": 3.755511022044088e-05, + "loss": 0.1568, + "step": 1253 + }, + { + "epoch": 5.016, + "grad_norm": 1.1202747821807861, + "learning_rate": 3.7545090180360724e-05, + "loss": 0.1313, + "step": 1254 + }, + { + "epoch": 5.02, + "grad_norm": 1.1181975603103638, + "learning_rate": 3.7535070140280566e-05, + "loss": 0.166, + "step": 1255 + }, + { + "epoch": 5.024, + "grad_norm": 10.162293434143066, + "learning_rate": 3.752505010020041e-05, + "loss": 0.0938, + "step": 1256 + }, + { + "epoch": 5.028, + "grad_norm": 1.2769560813903809, + "learning_rate": 3.751503006012024e-05, + "loss": 0.1378, + "step": 1257 + }, + { + "epoch": 5.032, + "grad_norm": 1.2073560953140259, + "learning_rate": 3.750501002004008e-05, + "loss": 0.1715, + "step": 1258 + }, + { + "epoch": 5.036, + "grad_norm": 1.229649543762207, + "learning_rate": 3.7494989979959924e-05, + "loss": 0.14, + "step": 1259 + }, + { + "epoch": 5.04, + "grad_norm": 1.3796989917755127, + "learning_rate": 3.748496993987976e-05, + "loss": 0.189, + "step": 1260 + }, + { + "epoch": 5.044, + "grad_norm": 1.5101726055145264, + "learning_rate": 3.74749498997996e-05, + "loss": 0.1487, + "step": 1261 + }, + { + "epoch": 5.048, + "grad_norm": 1.451194167137146, + "learning_rate": 3.746492985971944e-05, + "loss": 0.1871, + "step": 1262 + }, + { + "epoch": 5.052, + "grad_norm": 1.608769416809082, + "learning_rate": 3.7454909819639276e-05, + "loss": 0.1676, + "step": 1263 + }, + { + "epoch": 5.056, + "grad_norm": 1.3911021947860718, + "learning_rate": 3.7444889779559124e-05, + "loss": 0.1217, + "step": 1264 + }, + { + "epoch": 5.06, + "grad_norm": 1.4321279525756836, + "learning_rate": 3.743486973947896e-05, + "loss": 0.1725, + "step": 1265 + }, + { + "epoch": 5.064, + "grad_norm": 1.49961519241333, + 
"learning_rate": 3.74248496993988e-05, + "loss": 0.1564, + "step": 1266 + }, + { + "epoch": 5.068, + "grad_norm": 1.374585509300232, + "learning_rate": 3.741482965931864e-05, + "loss": 0.1183, + "step": 1267 + }, + { + "epoch": 5.072, + "grad_norm": 1.5012727975845337, + "learning_rate": 3.740480961923848e-05, + "loss": 0.1466, + "step": 1268 + }, + { + "epoch": 5.076, + "grad_norm": null, + "learning_rate": 3.740480961923848e-05, + "loss": 0.3217, + "step": 1269 + }, + { + "epoch": 5.08, + "grad_norm": 1.511431336402893, + "learning_rate": 3.739478957915832e-05, + "loss": 0.1452, + "step": 1270 + }, + { + "epoch": 5.084, + "grad_norm": 1.2745331525802612, + "learning_rate": 3.738476953907816e-05, + "loss": 0.1128, + "step": 1271 + }, + { + "epoch": 5.088, + "grad_norm": 1.2193913459777832, + "learning_rate": 3.7374749498998e-05, + "loss": 0.1502, + "step": 1272 + }, + { + "epoch": 5.092, + "grad_norm": 1.190537929534912, + "learning_rate": 3.7364729458917834e-05, + "loss": 0.117, + "step": 1273 + }, + { + "epoch": 5.096, + "grad_norm": 1.1268037557601929, + "learning_rate": 3.7354709418837675e-05, + "loss": 0.1432, + "step": 1274 + }, + { + "epoch": 5.1, + "grad_norm": 1.1972516775131226, + "learning_rate": 3.7344689378757516e-05, + "loss": 0.1263, + "step": 1275 + }, + { + "epoch": 5.104, + "grad_norm": 1.2640272378921509, + "learning_rate": 3.733466933867736e-05, + "loss": 0.1393, + "step": 1276 + }, + { + "epoch": 5.108, + "grad_norm": 1.2229938507080078, + "learning_rate": 3.73246492985972e-05, + "loss": 0.1254, + "step": 1277 + }, + { + "epoch": 5.112, + "grad_norm": 1.4548741579055786, + "learning_rate": 3.731462925851703e-05, + "loss": 0.129, + "step": 1278 + }, + { + "epoch": 5.116, + "grad_norm": 1.5529509782791138, + "learning_rate": 3.7304609218436875e-05, + "loss": 0.1918, + "step": 1279 + }, + { + "epoch": 5.12, + "grad_norm": 1.473236322402954, + "learning_rate": 3.7294589178356716e-05, + "loss": 0.129, + "step": 1280 + }, + { + "epoch": 5.124, +
"grad_norm": 1.4448833465576172, + "learning_rate": 3.728456913827656e-05, + "loss": 0.1673, + "step": 1281 + }, + { + "epoch": 5.128, + "grad_norm": 1.455904245376587, + "learning_rate": 3.727454909819639e-05, + "loss": 0.1614, + "step": 1282 + }, + { + "epoch": 5.132, + "grad_norm": 47.209869384765625, + "learning_rate": 3.726452905811623e-05, + "loss": 0.6446, + "step": 1283 + }, + { + "epoch": 5.136, + "grad_norm": 1.2592720985412598, + "learning_rate": 3.7254509018036074e-05, + "loss": 0.1518, + "step": 1284 + }, + { + "epoch": 5.14, + "grad_norm": 1.3965915441513062, + "learning_rate": 3.7244488977955916e-05, + "loss": 0.1286, + "step": 1285 + }, + { + "epoch": 5.144, + "grad_norm": 1.0066782236099243, + "learning_rate": 3.723446893787576e-05, + "loss": 0.0742, + "step": 1286 + }, + { + "epoch": 5.148, + "grad_norm": 1.2181978225708008, + "learning_rate": 3.722444889779559e-05, + "loss": 0.1358, + "step": 1287 + }, + { + "epoch": 5.152, + "grad_norm": 1.3564952611923218, + "learning_rate": 3.721442885771543e-05, + "loss": 0.1212, + "step": 1288 + }, + { + "epoch": 5.156, + "grad_norm": 1.4054635763168335, + "learning_rate": 3.7204408817635274e-05, + "loss": 0.1534, + "step": 1289 + }, + { + "epoch": 5.16, + "grad_norm": 1.1692203283309937, + "learning_rate": 3.719438877755511e-05, + "loss": 0.1135, + "step": 1290 + }, + { + "epoch": 5.164, + "grad_norm": 1.528304934501648, + "learning_rate": 3.718436873747495e-05, + "loss": 0.1787, + "step": 1291 + }, + { + "epoch": 5.168, + "grad_norm": 1.4810398817062378, + "learning_rate": 3.717434869739479e-05, + "loss": 0.1711, + "step": 1292 + }, + { + "epoch": 5.172, + "grad_norm": 1.4008511304855347, + "learning_rate": 3.7164328657314625e-05, + "loss": 0.107, + "step": 1293 + }, + { + "epoch": 5.176, + "grad_norm": 1.5215219259262085, + "learning_rate": 3.7154308617234474e-05, + "loss": 0.1714, + "step": 1294 + }, + { + "epoch": 5.18, + "grad_norm": 1.3255380392074585, + "learning_rate": 3.7144288577154315e-05, + 
"loss": 0.1437, + "step": 1295 + }, + { + "epoch": 5.184, + "grad_norm": 1.540545105934143, + "learning_rate": 3.713426853707415e-05, + "loss": 0.1374, + "step": 1296 + }, + { + "epoch": 5.188, + "grad_norm": 1.2621541023254395, + "learning_rate": 3.712424849699399e-05, + "loss": 0.1114, + "step": 1297 + }, + { + "epoch": 5.192, + "grad_norm": 1.355939507484436, + "learning_rate": 3.711422845691383e-05, + "loss": 0.135, + "step": 1298 + }, + { + "epoch": 5.196, + "grad_norm": 1.5436731576919556, + "learning_rate": 3.7104208416833666e-05, + "loss": 0.1471, + "step": 1299 + }, + { + "epoch": 5.2, + "grad_norm": 1.4287269115447998, + "learning_rate": 3.709418837675351e-05, + "loss": 0.147, + "step": 1300 + }, + { + "epoch": 5.204, + "grad_norm": 1.37587308883667, + "learning_rate": 3.708416833667335e-05, + "loss": 0.1434, + "step": 1301 + }, + { + "epoch": 5.208, + "grad_norm": 1.3519721031188965, + "learning_rate": 3.7074148296593183e-05, + "loss": 0.1418, + "step": 1302 + }, + { + "epoch": 5.212, + "grad_norm": 1.3758410215377808, + "learning_rate": 3.7064128256513025e-05, + "loss": 0.1535, + "step": 1303 + }, + { + "epoch": 5.216, + "grad_norm": 1.3432583808898926, + "learning_rate": 3.705410821643287e-05, + "loss": 0.1477, + "step": 1304 + }, + { + "epoch": 5.22, + "grad_norm": 1.2263681888580322, + "learning_rate": 3.704408817635271e-05, + "loss": 0.1374, + "step": 1305 + }, + { + "epoch": 5.224, + "grad_norm": 1.2747381925582886, + "learning_rate": 3.703406813627255e-05, + "loss": 0.1498, + "step": 1306 + }, + { + "epoch": 5.228, + "grad_norm": 1.4037673473358154, + "learning_rate": 3.702404809619239e-05, + "loss": 0.1539, + "step": 1307 + }, + { + "epoch": 5.232, + "grad_norm": 1.4913922548294067, + "learning_rate": 3.7014028056112224e-05, + "loss": 0.1381, + "step": 1308 + }, + { + "epoch": 5.236, + "grad_norm": 1.3626859188079834, + "learning_rate": 3.7004008016032066e-05, + "loss": 0.1257, + "step": 1309 + }, + { + "epoch": 5.24, + "grad_norm": 
1.508866548538208, + "learning_rate": 3.699398797595191e-05, + "loss": 0.1854, + "step": 1310 + }, + { + "epoch": 5.244, + "grad_norm": 1.3557127714157104, + "learning_rate": 3.698396793587174e-05, + "loss": 0.1417, + "step": 1311 + }, + { + "epoch": 5.248, + "grad_norm": 1.4805433750152588, + "learning_rate": 3.697394789579158e-05, + "loss": 0.1596, + "step": 1312 + }, + { + "epoch": 5.252, + "grad_norm": 1.4268604516983032, + "learning_rate": 3.6963927855711424e-05, + "loss": 0.1612, + "step": 1313 + }, + { + "epoch": 5.256, + "grad_norm": 1.4338222742080688, + "learning_rate": 3.6953907815631265e-05, + "loss": 0.1668, + "step": 1314 + }, + { + "epoch": 5.26, + "grad_norm": 1.3227003812789917, + "learning_rate": 3.6943887775551107e-05, + "loss": 0.1342, + "step": 1315 + }, + { + "epoch": 5.264, + "grad_norm": 1.3900845050811768, + "learning_rate": 3.693386773547095e-05, + "loss": 0.1342, + "step": 1316 + }, + { + "epoch": 5.268, + "grad_norm": 1.3115166425704956, + "learning_rate": 3.692384769539078e-05, + "loss": 0.1544, + "step": 1317 + }, + { + "epoch": 5.272, + "grad_norm": 1.6373382806777954, + "learning_rate": 3.6913827655310624e-05, + "loss": 0.1649, + "step": 1318 + }, + { + "epoch": 5.276, + "grad_norm": 1.2519011497497559, + "learning_rate": 3.6903807615230465e-05, + "loss": 0.1448, + "step": 1319 + }, + { + "epoch": 5.28, + "grad_norm": 1.4340311288833618, + "learning_rate": 3.68937875751503e-05, + "loss": 0.1744, + "step": 1320 + }, + { + "epoch": 5.284, + "grad_norm": 1.7507641315460205, + "learning_rate": 3.688376753507014e-05, + "loss": 0.1714, + "step": 1321 + }, + { + "epoch": 5.288, + "grad_norm": 1.4718471765518188, + "learning_rate": 3.687374749498998e-05, + "loss": 0.1769, + "step": 1322 + }, + { + "epoch": 5.292, + "grad_norm": 1.3668906688690186, + "learning_rate": 3.6863727454909817e-05, + "loss": 0.1593, + "step": 1323 + }, + { + "epoch": 5.296, + "grad_norm": 1.5843260288238525, + "learning_rate": 3.6853707414829665e-05, + "loss": 
0.1379, + "step": 1324 + }, + { + "epoch": 5.3, + "grad_norm": 1.3435697555541992, + "learning_rate": 3.6843687374749506e-05, + "loss": 0.1084, + "step": 1325 + }, + { + "epoch": 5.304, + "grad_norm": 1.4321434497833252, + "learning_rate": 3.683366733466934e-05, + "loss": 0.1472, + "step": 1326 + }, + { + "epoch": 5.308, + "grad_norm": 1.3599714040756226, + "learning_rate": 3.682364729458918e-05, + "loss": 0.1392, + "step": 1327 + }, + { + "epoch": 5.312, + "grad_norm": 1.3491992950439453, + "learning_rate": 3.681362725450902e-05, + "loss": 0.1574, + "step": 1328 + }, + { + "epoch": 5.316, + "grad_norm": 1.4255551099777222, + "learning_rate": 3.680360721442886e-05, + "loss": 0.1349, + "step": 1329 + }, + { + "epoch": 5.32, + "grad_norm": 1.34247624874115, + "learning_rate": 3.67935871743487e-05, + "loss": 0.1745, + "step": 1330 + }, + { + "epoch": 5.324, + "grad_norm": 1.4770880937576294, + "learning_rate": 3.678356713426854e-05, + "loss": 0.1558, + "step": 1331 + }, + { + "epoch": 5.328, + "grad_norm": 1.468400001525879, + "learning_rate": 3.6773547094188375e-05, + "loss": 0.1388, + "step": 1332 + }, + { + "epoch": 5.332, + "grad_norm": 1.588028907775879, + "learning_rate": 3.6763527054108216e-05, + "loss": 0.1722, + "step": 1333 + }, + { + "epoch": 5.336, + "grad_norm": 1.3578060865402222, + "learning_rate": 3.675350701402806e-05, + "loss": 0.1314, + "step": 1334 + }, + { + "epoch": 5.34, + "grad_norm": 1.7125457525253296, + "learning_rate": 3.67434869739479e-05, + "loss": 0.1409, + "step": 1335 + }, + { + "epoch": 5.344, + "grad_norm": 1.482620358467102, + "learning_rate": 3.673346693386774e-05, + "loss": 0.1242, + "step": 1336 + }, + { + "epoch": 5.348, + "grad_norm": 1.468563437461853, + "learning_rate": 3.6723446893787574e-05, + "loss": 0.1509, + "step": 1337 + }, + { + "epoch": 5.352, + "grad_norm": 1.4916741847991943, + "learning_rate": 3.6713426853707415e-05, + "loss": 0.1477, + "step": 1338 + }, + { + "epoch": 5.356, + "grad_norm": 1.3862581253051758, + 
"learning_rate": 3.670340681362726e-05, + "loss": 0.1625, + "step": 1339 + }, + { + "epoch": 5.36, + "grad_norm": 1.3407968282699585, + "learning_rate": 3.66933867735471e-05, + "loss": 0.1499, + "step": 1340 + }, + { + "epoch": 5.364, + "grad_norm": 1.010241985321045, + "learning_rate": 3.668336673346693e-05, + "loss": 0.0938, + "step": 1341 + }, + { + "epoch": 5.368, + "grad_norm": 1.3833948373794556, + "learning_rate": 3.6673346693386774e-05, + "loss": 0.1439, + "step": 1342 + }, + { + "epoch": 5.372, + "grad_norm": 1.3611929416656494, + "learning_rate": 3.6663326653306615e-05, + "loss": 0.1722, + "step": 1343 + }, + { + "epoch": 5.376, + "grad_norm": 1.1714794635772705, + "learning_rate": 3.6653306613226456e-05, + "loss": 0.093, + "step": 1344 + }, + { + "epoch": 5.38, + "grad_norm": 1.0513854026794434, + "learning_rate": 3.66432865731463e-05, + "loss": 0.1028, + "step": 1345 + }, + { + "epoch": 5.384, + "grad_norm": 1.432370662689209, + "learning_rate": 3.663326653306613e-05, + "loss": 0.1621, + "step": 1346 + }, + { + "epoch": 5.388, + "grad_norm": 1.3324103355407715, + "learning_rate": 3.6623246492985973e-05, + "loss": 0.1261, + "step": 1347 + }, + { + "epoch": 5.392, + "grad_norm": 1.4653232097625732, + "learning_rate": 3.6613226452905815e-05, + "loss": 0.1342, + "step": 1348 + }, + { + "epoch": 5.396, + "grad_norm": 1.4007515907287598, + "learning_rate": 3.660320641282565e-05, + "loss": 0.1296, + "step": 1349 + }, + { + "epoch": 5.4, + "grad_norm": 1.5396267175674438, + "learning_rate": 3.659318637274549e-05, + "loss": 0.1519, + "step": 1350 + }, + { + "epoch": 5.404, + "grad_norm": 1.4199961423873901, + "learning_rate": 3.658316633266533e-05, + "loss": 0.1516, + "step": 1351 + }, + { + "epoch": 5.408, + "grad_norm": 1.455402135848999, + "learning_rate": 3.657314629258517e-05, + "loss": 0.19, + "step": 1352 + }, + { + "epoch": 5.412, + "grad_norm": 1.4118759632110596, + "learning_rate": 3.6563126252505014e-05, + "loss": 0.1687, + "step": 1353 + }, + { + 
"epoch": 5.416, + "grad_norm": 1.4946292638778687, + "learning_rate": 3.6553106212424856e-05, + "loss": 0.1587, + "step": 1354 + }, + { + "epoch": 5.42, + "grad_norm": 1.4371452331542969, + "learning_rate": 3.654308617234469e-05, + "loss": 0.1722, + "step": 1355 + }, + { + "epoch": 5.424, + "grad_norm": 1.4597325325012207, + "learning_rate": 3.653306613226453e-05, + "loss": 0.1954, + "step": 1356 + }, + { + "epoch": 5.428, + "grad_norm": 1.463151216506958, + "learning_rate": 3.652304609218437e-05, + "loss": 0.2056, + "step": 1357 + }, + { + "epoch": 5.432, + "grad_norm": 1.568338394165039, + "learning_rate": 3.651302605210421e-05, + "loss": 0.1665, + "step": 1358 + }, + { + "epoch": 5.436, + "grad_norm": 1.4030911922454834, + "learning_rate": 3.650300601202405e-05, + "loss": 0.1463, + "step": 1359 + }, + { + "epoch": 5.44, + "grad_norm": 1.3122279644012451, + "learning_rate": 3.649298597194389e-05, + "loss": 0.1324, + "step": 1360 + }, + { + "epoch": 5.444, + "grad_norm": 1.398829698562622, + "learning_rate": 3.6482965931863724e-05, + "loss": 0.182, + "step": 1361 + }, + { + "epoch": 5.448, + "grad_norm": 1.5118191242218018, + "learning_rate": 3.6472945891783566e-05, + "loss": 0.1485, + "step": 1362 + }, + { + "epoch": 5.452, + "grad_norm": 1.4859402179718018, + "learning_rate": 3.6462925851703414e-05, + "loss": 0.1429, + "step": 1363 + }, + { + "epoch": 5.456, + "grad_norm": 1.5563009977340698, + "learning_rate": 3.645290581162325e-05, + "loss": 0.1834, + "step": 1364 + }, + { + "epoch": 5.46, + "grad_norm": 1.403515338897705, + "learning_rate": 3.644288577154309e-05, + "loss": 0.1533, + "step": 1365 + }, + { + "epoch": 5.464, + "grad_norm": 1.3895883560180664, + "learning_rate": 3.643286573146293e-05, + "loss": 0.1668, + "step": 1366 + }, + { + "epoch": 5.468, + "grad_norm": 1.454662561416626, + "learning_rate": 3.6422845691382765e-05, + "loss": 0.1376, + "step": 1367 + }, + { + "epoch": 5.4719999999999995, + "grad_norm": 1.4135189056396484, + "learning_rate": 
3.6412825651302606e-05, + "loss": 0.132, + "step": 1368 + }, + { + "epoch": 5.476, + "grad_norm": 1.3551510572433472, + "learning_rate": 3.640280561122245e-05, + "loss": 0.2015, + "step": 1369 + }, + { + "epoch": 5.48, + "grad_norm": 1.4458680152893066, + "learning_rate": 3.639278557114228e-05, + "loss": 0.1309, + "step": 1370 + }, + { + "epoch": 5.484, + "grad_norm": 1.508864402770996, + "learning_rate": 3.6382765531062124e-05, + "loss": 0.1945, + "step": 1371 + }, + { + "epoch": 5.4879999999999995, + "grad_norm": 1.439699411392212, + "learning_rate": 3.6372745490981965e-05, + "loss": 0.1537, + "step": 1372 + }, + { + "epoch": 5.492, + "grad_norm": 1.516624093055725, + "learning_rate": 3.6362725450901806e-05, + "loss": 0.1503, + "step": 1373 + }, + { + "epoch": 5.496, + "grad_norm": 1.4190101623535156, + "learning_rate": 3.635270541082165e-05, + "loss": 0.1483, + "step": 1374 + }, + { + "epoch": 5.5, + "grad_norm": 1.4877848625183105, + "learning_rate": 3.634268537074149e-05, + "loss": 0.1409, + "step": 1375 + }, + { + "epoch": 5.504, + "grad_norm": 1.448904037475586, + "learning_rate": 3.633266533066132e-05, + "loss": 0.1681, + "step": 1376 + }, + { + "epoch": 5.508, + "grad_norm": 1.4470781087875366, + "learning_rate": 3.6322645290581164e-05, + "loss": 0.1599, + "step": 1377 + }, + { + "epoch": 5.5120000000000005, + "grad_norm": 1.418031930923462, + "learning_rate": 3.6312625250501006e-05, + "loss": 0.1441, + "step": 1378 + }, + { + "epoch": 5.516, + "grad_norm": 1.4554909467697144, + "learning_rate": 3.630260521042084e-05, + "loss": 0.1463, + "step": 1379 + }, + { + "epoch": 5.52, + "grad_norm": 1.379330039024353, + "learning_rate": 3.629258517034068e-05, + "loss": 0.1285, + "step": 1380 + }, + { + "epoch": 5.524, + "grad_norm": 1.4990249872207642, + "learning_rate": 3.628256513026052e-05, + "loss": 0.2017, + "step": 1381 + }, + { + "epoch": 5.5280000000000005, + "grad_norm": 1.2808997631072998, + "learning_rate": 3.627254509018036e-05, + "loss": 0.1664, + 
"step": 1382 + }, + { + "epoch": 5.532, + "grad_norm": 1.6173251867294312, + "learning_rate": 3.6262525050100205e-05, + "loss": 0.174, + "step": 1383 + }, + { + "epoch": 5.536, + "grad_norm": 1.396598219871521, + "learning_rate": 3.625250501002005e-05, + "loss": 0.1454, + "step": 1384 + }, + { + "epoch": 5.54, + "grad_norm": 1.0782043933868408, + "learning_rate": 3.624248496993988e-05, + "loss": 0.119, + "step": 1385 + }, + { + "epoch": 5.5440000000000005, + "grad_norm": 1.3274977207183838, + "learning_rate": 3.623246492985972e-05, + "loss": 0.1369, + "step": 1386 + }, + { + "epoch": 5.548, + "grad_norm": 1.3669708967208862, + "learning_rate": 3.6222444889779564e-05, + "loss": 0.1481, + "step": 1387 + }, + { + "epoch": 5.552, + "grad_norm": 1.3589403629302979, + "learning_rate": 3.62124248496994e-05, + "loss": 0.2093, + "step": 1388 + }, + { + "epoch": 5.556, + "grad_norm": 1.4127261638641357, + "learning_rate": 3.620240480961924e-05, + "loss": 0.1664, + "step": 1389 + }, + { + "epoch": 5.5600000000000005, + "grad_norm": 1.4620351791381836, + "learning_rate": 3.619238476953908e-05, + "loss": 0.1434, + "step": 1390 + }, + { + "epoch": 5.564, + "grad_norm": 1.4212642908096313, + "learning_rate": 3.6182364729458915e-05, + "loss": 0.161, + "step": 1391 + }, + { + "epoch": 5.568, + "grad_norm": 1.4831411838531494, + "learning_rate": 3.6172344689378757e-05, + "loss": 0.17, + "step": 1392 + }, + { + "epoch": 5.572, + "grad_norm": 1.560911774635315, + "learning_rate": 3.61623246492986e-05, + "loss": 0.1737, + "step": 1393 + }, + { + "epoch": 5.576, + "grad_norm": 1.5256128311157227, + "learning_rate": 3.615230460921844e-05, + "loss": 0.1609, + "step": 1394 + }, + { + "epoch": 5.58, + "grad_norm": 1.407828688621521, + "learning_rate": 3.614228456913828e-05, + "loss": 0.1925, + "step": 1395 + }, + { + "epoch": 5.584, + "grad_norm": 1.3980634212493896, + "learning_rate": 3.613226452905812e-05, + "loss": 0.1535, + "step": 1396 + }, + { + "epoch": 5.588, + "grad_norm": 
1.4387320280075073, + "learning_rate": 3.6122244488977956e-05, + "loss": 0.1489, + "step": 1397 + }, + { + "epoch": 5.592, + "grad_norm": 1.5303181409835815, + "learning_rate": 3.61122244488978e-05, + "loss": 0.1925, + "step": 1398 + }, + { + "epoch": 5.596, + "grad_norm": 1.5387216806411743, + "learning_rate": 3.610220440881764e-05, + "loss": 0.1709, + "step": 1399 + }, + { + "epoch": 5.6, + "grad_norm": 0.9596248865127563, + "learning_rate": 3.609218436873747e-05, + "loss": 0.0974, + "step": 1400 + }, + { + "epoch": 5.604, + "grad_norm": 1.4933253526687622, + "learning_rate": 3.6082164328657315e-05, + "loss": 0.2206, + "step": 1401 + }, + { + "epoch": 5.608, + "grad_norm": 1.3961461782455444, + "learning_rate": 3.6072144288577156e-05, + "loss": 0.1859, + "step": 1402 + }, + { + "epoch": 5.612, + "grad_norm": 1.2029403448104858, + "learning_rate": 3.6062124248497e-05, + "loss": 0.1289, + "step": 1403 + }, + { + "epoch": 5.616, + "grad_norm": 1.345209002494812, + "learning_rate": 3.605210420841684e-05, + "loss": 0.1448, + "step": 1404 + }, + { + "epoch": 5.62, + "grad_norm": 1.1782201528549194, + "learning_rate": 3.604208416833667e-05, + "loss": 0.1294, + "step": 1405 + }, + { + "epoch": 5.624, + "grad_norm": 1.461083173751831, + "learning_rate": 3.6032064128256514e-05, + "loss": 0.1534, + "step": 1406 + }, + { + "epoch": 5.628, + "grad_norm": 1.2814429998397827, + "learning_rate": 3.6022044088176356e-05, + "loss": 0.1259, + "step": 1407 + }, + { + "epoch": 5.632, + "grad_norm": 1.565420150756836, + "learning_rate": 3.60120240480962e-05, + "loss": 0.1861, + "step": 1408 + }, + { + "epoch": 5.636, + "grad_norm": 1.0360498428344727, + "learning_rate": 3.600200400801603e-05, + "loss": 0.0836, + "step": 1409 + }, + { + "epoch": 5.64, + "grad_norm": 1.4248356819152832, + "learning_rate": 3.599198396793587e-05, + "loss": 0.1988, + "step": 1410 + }, + { + "epoch": 5.644, + "grad_norm": 1.5098896026611328, + "learning_rate": 3.5981963927855714e-05, + "loss": 0.1708, + 
"step": 1411 + }, + { + "epoch": 5.648, + "grad_norm": 1.4749072790145874, + "learning_rate": 3.5971943887775555e-05, + "loss": 0.1791, + "step": 1412 + }, + { + "epoch": 5.652, + "grad_norm": 1.316433072090149, + "learning_rate": 3.5961923847695396e-05, + "loss": 0.1425, + "step": 1413 + }, + { + "epoch": 5.656, + "grad_norm": 1.5112534761428833, + "learning_rate": 3.595190380761523e-05, + "loss": 0.1417, + "step": 1414 + }, + { + "epoch": 5.66, + "grad_norm": 1.4549270868301392, + "learning_rate": 3.594188376753507e-05, + "loss": 0.1756, + "step": 1415 + }, + { + "epoch": 5.664, + "grad_norm": 1.5447784662246704, + "learning_rate": 3.5931863727454914e-05, + "loss": 0.1416, + "step": 1416 + }, + { + "epoch": 5.668, + "grad_norm": 1.4175710678100586, + "learning_rate": 3.592184368737475e-05, + "loss": 0.1437, + "step": 1417 + }, + { + "epoch": 5.672, + "grad_norm": 1.4404096603393555, + "learning_rate": 3.591182364729459e-05, + "loss": 0.1456, + "step": 1418 + }, + { + "epoch": 5.676, + "grad_norm": 1.4998631477355957, + "learning_rate": 3.590180360721443e-05, + "loss": 0.1666, + "step": 1419 + }, + { + "epoch": 5.68, + "grad_norm": 1.4109253883361816, + "learning_rate": 3.5891783567134265e-05, + "loss": 0.1846, + "step": 1420 + }, + { + "epoch": 5.684, + "grad_norm": 1.2808420658111572, + "learning_rate": 3.5881763527054106e-05, + "loss": 0.1424, + "step": 1421 + }, + { + "epoch": 5.688, + "grad_norm": 1.439148187637329, + "learning_rate": 3.5871743486973954e-05, + "loss": 0.153, + "step": 1422 + }, + { + "epoch": 5.692, + "grad_norm": 1.5616912841796875, + "learning_rate": 3.586172344689379e-05, + "loss": 0.1863, + "step": 1423 + }, + { + "epoch": 5.696, + "grad_norm": 1.353434681892395, + "learning_rate": 3.585170340681363e-05, + "loss": 0.158, + "step": 1424 + }, + { + "epoch": 5.7, + "grad_norm": 1.3511037826538086, + "learning_rate": 3.584168336673347e-05, + "loss": 0.162, + "step": 1425 + }, + { + "epoch": 5.704, + "grad_norm": 1.4532557725906372, + 
"learning_rate": 3.5831663326653306e-05, + "loss": 0.1671, + "step": 1426 + }, + { + "epoch": 5.708, + "grad_norm": 1.4470570087432861, + "learning_rate": 3.582164328657315e-05, + "loss": 0.1564, + "step": 1427 + }, + { + "epoch": 5.712, + "grad_norm": 1.4751670360565186, + "learning_rate": 3.581162324649299e-05, + "loss": 0.1445, + "step": 1428 + }, + { + "epoch": 5.716, + "grad_norm": 1.3708182573318481, + "learning_rate": 3.580160320641282e-05, + "loss": 0.1708, + "step": 1429 + }, + { + "epoch": 5.72, + "grad_norm": 1.4434928894042969, + "learning_rate": 3.5791583166332664e-05, + "loss": 0.1691, + "step": 1430 + }, + { + "epoch": 5.724, + "grad_norm": 1.0687934160232544, + "learning_rate": 3.5781563126252506e-05, + "loss": 0.1061, + "step": 1431 + }, + { + "epoch": 5.728, + "grad_norm": 1.4179203510284424, + "learning_rate": 3.577154308617235e-05, + "loss": 0.174, + "step": 1432 + }, + { + "epoch": 5.732, + "grad_norm": 1.318701148033142, + "learning_rate": 3.576152304609219e-05, + "loss": 0.1496, + "step": 1433 + }, + { + "epoch": 5.736, + "grad_norm": 1.4118852615356445, + "learning_rate": 3.575150300601203e-05, + "loss": 0.1978, + "step": 1434 + }, + { + "epoch": 5.74, + "grad_norm": 1.2677303552627563, + "learning_rate": 3.5741482965931864e-05, + "loss": 0.1672, + "step": 1435 + }, + { + "epoch": 5.744, + "grad_norm": 1.8141050338745117, + "learning_rate": 3.5731462925851705e-05, + "loss": 0.1801, + "step": 1436 + }, + { + "epoch": 5.748, + "grad_norm": 1.5993258953094482, + "learning_rate": 3.5721442885771547e-05, + "loss": 0.1583, + "step": 1437 + }, + { + "epoch": 5.752, + "grad_norm": 1.4515212774276733, + "learning_rate": 3.571142284569138e-05, + "loss": 0.2014, + "step": 1438 + }, + { + "epoch": 5.756, + "grad_norm": 1.3378204107284546, + "learning_rate": 3.570140280561122e-05, + "loss": 0.1366, + "step": 1439 + }, + { + "epoch": 5.76, + "grad_norm": 1.3408321142196655, + "learning_rate": 3.5691382765531064e-05, + "loss": 0.1301, + "step": 1440 + }, + 
{ + "epoch": 5.764, + "grad_norm": 1.4962408542633057, + "learning_rate": 3.56813627254509e-05, + "loss": 0.1385, + "step": 1441 + }, + { + "epoch": 5.768, + "grad_norm": 1.589575171470642, + "learning_rate": 3.5671342685370746e-05, + "loss": 0.1777, + "step": 1442 + }, + { + "epoch": 5.772, + "grad_norm": 1.5674686431884766, + "learning_rate": 3.566132264529059e-05, + "loss": 0.166, + "step": 1443 + }, + { + "epoch": 5.776, + "grad_norm": 1.5545917749404907, + "learning_rate": 3.565130260521042e-05, + "loss": 0.1775, + "step": 1444 + }, + { + "epoch": 5.78, + "grad_norm": 1.4211684465408325, + "learning_rate": 3.564128256513026e-05, + "loss": 0.149, + "step": 1445 + }, + { + "epoch": 5.784, + "grad_norm": 1.6268999576568604, + "learning_rate": 3.5631262525050105e-05, + "loss": 0.1587, + "step": 1446 + }, + { + "epoch": 5.788, + "grad_norm": 1.6301075220108032, + "learning_rate": 3.562124248496994e-05, + "loss": 0.1824, + "step": 1447 + }, + { + "epoch": 5.792, + "grad_norm": 1.3886116743087769, + "learning_rate": 3.561122244488978e-05, + "loss": 0.1717, + "step": 1448 + }, + { + "epoch": 5.796, + "grad_norm": 1.5212382078170776, + "learning_rate": 3.560120240480962e-05, + "loss": 0.1772, + "step": 1449 + }, + { + "epoch": 5.8, + "grad_norm": 1.4963605403900146, + "learning_rate": 3.5591182364729456e-05, + "loss": 0.1917, + "step": 1450 + }, + { + "epoch": 5.804, + "grad_norm": 1.4401335716247559, + "learning_rate": 3.55811623246493e-05, + "loss": 0.1619, + "step": 1451 + }, + { + "epoch": 5.808, + "grad_norm": 1.2470486164093018, + "learning_rate": 3.5571142284569145e-05, + "loss": 0.1606, + "step": 1452 + }, + { + "epoch": 5.812, + "grad_norm": 1.4720518589019775, + "learning_rate": 3.556112224448898e-05, + "loss": 0.1735, + "step": 1453 + }, + { + "epoch": 5.816, + "grad_norm": 1.3243999481201172, + "learning_rate": 3.555110220440882e-05, + "loss": 0.172, + "step": 1454 + }, + { + "epoch": 5.82, + "grad_norm": 1.3949337005615234, + "learning_rate": 
3.554108216432866e-05, + "loss": 0.1698, + "step": 1455 + }, + { + "epoch": 5.824, + "grad_norm": 1.554047703742981, + "learning_rate": 3.55310621242485e-05, + "loss": 0.1883, + "step": 1456 + }, + { + "epoch": 5.828, + "grad_norm": 1.4399428367614746, + "learning_rate": 3.552104208416834e-05, + "loss": 0.1341, + "step": 1457 + }, + { + "epoch": 5.832, + "grad_norm": 1.5571656227111816, + "learning_rate": 3.551102204408818e-05, + "loss": 0.1765, + "step": 1458 + }, + { + "epoch": 5.836, + "grad_norm": 1.5305334329605103, + "learning_rate": 3.5501002004008014e-05, + "loss": 0.1873, + "step": 1459 + }, + { + "epoch": 5.84, + "grad_norm": 1.5004547834396362, + "learning_rate": 3.5490981963927855e-05, + "loss": 0.1567, + "step": 1460 + }, + { + "epoch": 5.844, + "grad_norm": 1.336523413658142, + "learning_rate": 3.54809619238477e-05, + "loss": 0.1511, + "step": 1461 + }, + { + "epoch": 5.848, + "grad_norm": 1.7299772500991821, + "learning_rate": 3.547094188376754e-05, + "loss": 0.229, + "step": 1462 + }, + { + "epoch": 5.852, + "grad_norm": 1.3854844570159912, + "learning_rate": 3.546092184368738e-05, + "loss": 0.1588, + "step": 1463 + }, + { + "epoch": 5.856, + "grad_norm": 1.587310552597046, + "learning_rate": 3.5450901803607214e-05, + "loss": 0.1463, + "step": 1464 + }, + { + "epoch": 5.86, + "grad_norm": 1.3696925640106201, + "learning_rate": 3.5440881763527055e-05, + "loss": 0.1679, + "step": 1465 + }, + { + "epoch": 5.864, + "grad_norm": 0.932451605796814, + "learning_rate": 3.5430861723446896e-05, + "loss": 0.1001, + "step": 1466 + }, + { + "epoch": 5.868, + "grad_norm": 1.4562475681304932, + "learning_rate": 3.542084168336674e-05, + "loss": 0.1691, + "step": 1467 + }, + { + "epoch": 5.872, + "grad_norm": 1.3682665824890137, + "learning_rate": 3.541082164328657e-05, + "loss": 0.1666, + "step": 1468 + }, + { + "epoch": 5.876, + "grad_norm": 1.3185116052627563, + "learning_rate": 3.5400801603206413e-05, + "loss": 0.1698, + "step": 1469 + }, + { + "epoch": 5.88, + 
"grad_norm": 1.5660794973373413, + "learning_rate": 3.5390781563126255e-05, + "loss": 0.1482, + "step": 1470 + }, + { + "epoch": 5.884, + "grad_norm": 1.449788212776184, + "learning_rate": 3.5380761523046096e-05, + "loss": 0.1522, + "step": 1471 + }, + { + "epoch": 5.888, + "grad_norm": 1.434978723526001, + "learning_rate": 3.537074148296594e-05, + "loss": 0.1738, + "step": 1472 + }, + { + "epoch": 5.892, + "grad_norm": 1.4906777143478394, + "learning_rate": 3.536072144288577e-05, + "loss": 0.1801, + "step": 1473 + }, + { + "epoch": 5.896, + "grad_norm": 1.4387478828430176, + "learning_rate": 3.535070140280561e-05, + "loss": 0.1582, + "step": 1474 + }, + { + "epoch": 5.9, + "grad_norm": 1.5130301713943481, + "learning_rate": 3.5340681362725454e-05, + "loss": 0.1876, + "step": 1475 + }, + { + "epoch": 5.904, + "grad_norm": 1.44967782497406, + "learning_rate": 3.533066132264529e-05, + "loss": 0.1782, + "step": 1476 + }, + { + "epoch": 5.908, + "grad_norm": 1.5330886840820312, + "learning_rate": 3.532064128256513e-05, + "loss": 0.1584, + "step": 1477 + }, + { + "epoch": 5.912, + "grad_norm": 1.5080928802490234, + "learning_rate": 3.531062124248497e-05, + "loss": 0.1575, + "step": 1478 + }, + { + "epoch": 5.916, + "grad_norm": 1.455888271331787, + "learning_rate": 3.530060120240481e-05, + "loss": 0.1837, + "step": 1479 + }, + { + "epoch": 5.92, + "grad_norm": 1.5638760328292847, + "learning_rate": 3.529058116232465e-05, + "loss": 0.1553, + "step": 1480 + }, + { + "epoch": 5.924, + "grad_norm": 1.487268328666687, + "learning_rate": 3.5280561122244495e-05, + "loss": 0.1817, + "step": 1481 + }, + { + "epoch": 5.928, + "grad_norm": 1.4699187278747559, + "learning_rate": 3.527054108216433e-05, + "loss": 0.1571, + "step": 1482 + }, + { + "epoch": 5.932, + "grad_norm": 1.4532089233398438, + "learning_rate": 3.526052104208417e-05, + "loss": 0.1434, + "step": 1483 + }, + { + "epoch": 5.936, + "grad_norm": 1.591044306755066, + "learning_rate": 3.525050100200401e-05, + "loss": 
0.1538, + "step": 1484 + }, + { + "epoch": 5.9399999999999995, + "grad_norm": 1.4258755445480347, + "learning_rate": 3.524048096192385e-05, + "loss": 0.1606, + "step": 1485 + }, + { + "epoch": 5.944, + "grad_norm": 1.4429696798324585, + "learning_rate": 3.523046092184369e-05, + "loss": 0.2038, + "step": 1486 + }, + { + "epoch": 5.948, + "grad_norm": 1.4724392890930176, + "learning_rate": 3.522044088176353e-05, + "loss": 0.1644, + "step": 1487 + }, + { + "epoch": 5.952, + "grad_norm": 1.488022804260254, + "learning_rate": 3.5210420841683364e-05, + "loss": 0.1528, + "step": 1488 + }, + { + "epoch": 5.9559999999999995, + "grad_norm": 1.5905804634094238, + "learning_rate": 3.5200400801603205e-05, + "loss": 0.2294, + "step": 1489 + }, + { + "epoch": 5.96, + "grad_norm": 1.4152902364730835, + "learning_rate": 3.5190380761523046e-05, + "loss": 0.2096, + "step": 1490 + }, + { + "epoch": 5.964, + "grad_norm": 1.3682752847671509, + "learning_rate": 3.518036072144289e-05, + "loss": 0.1684, + "step": 1491 + }, + { + "epoch": 5.968, + "grad_norm": 1.4440537691116333, + "learning_rate": 3.517034068136273e-05, + "loss": 0.1714, + "step": 1492 + }, + { + "epoch": 5.9719999999999995, + "grad_norm": 1.4367694854736328, + "learning_rate": 3.516032064128257e-05, + "loss": 0.1558, + "step": 1493 + }, + { + "epoch": 5.976, + "grad_norm": 1.351462483406067, + "learning_rate": 3.5150300601202405e-05, + "loss": 0.167, + "step": 1494 + }, + { + "epoch": 5.98, + "grad_norm": 1.5862767696380615, + "learning_rate": 3.5140280561122246e-05, + "loss": 0.2089, + "step": 1495 + }, + { + "epoch": 5.984, + "grad_norm": 1.0578093528747559, + "learning_rate": 3.513026052104209e-05, + "loss": 0.1061, + "step": 1496 + }, + { + "epoch": 5.9879999999999995, + "grad_norm": 1.7430568933486938, + "learning_rate": 3.512024048096192e-05, + "loss": 0.1775, + "step": 1497 + }, + { + "epoch": 5.992, + "grad_norm": 1.3868354558944702, + "learning_rate": 3.511022044088176e-05, + "loss": 0.1627, + "step": 1498 + }, + 
{ + "epoch": 5.996, + "grad_norm": 1.4972561597824097, + "learning_rate": 3.5100200400801604e-05, + "loss": 0.1617, + "step": 1499 + }, + { + "epoch": 6.0, + "grad_norm": 1.5004100799560547, + "learning_rate": 3.509018036072144e-05, + "loss": 0.1657, + "step": 1500 + }, + { + "epoch": 6.004, + "grad_norm": 1.2470020055770874, + "learning_rate": 3.508016032064129e-05, + "loss": 0.1026, + "step": 1501 + }, + { + "epoch": 6.008, + "grad_norm": 1.0558257102966309, + "learning_rate": 3.507014028056113e-05, + "loss": 0.0933, + "step": 1502 + }, + { + "epoch": 6.012, + "grad_norm": 1.027756690979004, + "learning_rate": 3.506012024048096e-05, + "loss": 0.0819, + "step": 1503 + }, + { + "epoch": 6.016, + "grad_norm": 1.1106997728347778, + "learning_rate": 3.5050100200400804e-05, + "loss": 0.0955, + "step": 1504 + }, + { + "epoch": 6.02, + "grad_norm": 1.2579761743545532, + "learning_rate": 3.5040080160320645e-05, + "loss": 0.1063, + "step": 1505 + }, + { + "epoch": 6.024, + "grad_norm": 1.3087098598480225, + "learning_rate": 3.503006012024048e-05, + "loss": 0.0956, + "step": 1506 + }, + { + "epoch": 6.028, + "grad_norm": 1.2753757238388062, + "learning_rate": 3.502004008016032e-05, + "loss": 0.0847, + "step": 1507 + }, + { + "epoch": 6.032, + "grad_norm": 0.8829823732376099, + "learning_rate": 3.501002004008016e-05, + "loss": 0.0567, + "step": 1508 + }, + { + "epoch": 6.036, + "grad_norm": 1.3138492107391357, + "learning_rate": 3.5e-05, + "loss": 0.1009, + "step": 1509 + }, + { + "epoch": 6.04, + "grad_norm": 1.252717137336731, + "learning_rate": 3.4989979959919845e-05, + "loss": 0.0815, + "step": 1510 + }, + { + "epoch": 6.044, + "grad_norm": 1.5361855030059814, + "learning_rate": 3.4979959919839686e-05, + "loss": 0.0982, + "step": 1511 + }, + { + "epoch": 6.048, + "grad_norm": 1.8114335536956787, + "learning_rate": 3.496993987975952e-05, + "loss": 0.107, + "step": 1512 + }, + { + "epoch": 6.052, + "grad_norm": 1.430020809173584, + "learning_rate": 3.495991983967936e-05, + 
"loss": 0.0926, + "step": 1513 + }, + { + "epoch": 6.056, + "grad_norm": 1.3625584840774536, + "learning_rate": 3.49498997995992e-05, + "loss": 0.0849, + "step": 1514 + }, + { + "epoch": 6.06, + "grad_norm": 1.577380895614624, + "learning_rate": 3.493987975951904e-05, + "loss": 0.1411, + "step": 1515 + }, + { + "epoch": 6.064, + "grad_norm": 1.2233928442001343, + "learning_rate": 3.492985971943888e-05, + "loss": 0.0837, + "step": 1516 + }, + { + "epoch": 6.068, + "grad_norm": 1.4746198654174805, + "learning_rate": 3.491983967935872e-05, + "loss": 0.091, + "step": 1517 + }, + { + "epoch": 6.072, + "grad_norm": 1.5226060152053833, + "learning_rate": 3.4909819639278555e-05, + "loss": 0.1285, + "step": 1518 + }, + { + "epoch": 6.076, + "grad_norm": 1.3142298460006714, + "learning_rate": 3.4899799599198396e-05, + "loss": 0.105, + "step": 1519 + }, + { + "epoch": 6.08, + "grad_norm": 1.3603146076202393, + "learning_rate": 3.488977955911824e-05, + "loss": 0.0849, + "step": 1520 + }, + { + "epoch": 6.084, + "grad_norm": 1.2942161560058594, + "learning_rate": 3.487975951903808e-05, + "loss": 0.0896, + "step": 1521 + }, + { + "epoch": 6.088, + "grad_norm": 1.6908429861068726, + "learning_rate": 3.486973947895792e-05, + "loss": 0.1167, + "step": 1522 + }, + { + "epoch": 6.092, + "grad_norm": 1.1075533628463745, + "learning_rate": 3.485971943887776e-05, + "loss": 0.0823, + "step": 1523 + }, + { + "epoch": 6.096, + "grad_norm": 1.4377602338790894, + "learning_rate": 3.4849699398797596e-05, + "loss": 0.0825, + "step": 1524 + }, + { + "epoch": 6.1, + "grad_norm": 1.474347472190857, + "learning_rate": 3.483967935871744e-05, + "loss": 0.1007, + "step": 1525 + }, + { + "epoch": 6.104, + "grad_norm": 1.4804532527923584, + "learning_rate": 3.482965931863728e-05, + "loss": 0.0888, + "step": 1526 + }, + { + "epoch": 6.108, + "grad_norm": 1.413860559463501, + "learning_rate": 3.481963927855711e-05, + "loss": 0.0858, + "step": 1527 + }, + { + "epoch": 6.112, + "grad_norm": 
1.3580888509750366, + "learning_rate": 3.4809619238476954e-05, + "loss": 0.0751, + "step": 1528 + }, + { + "epoch": 6.116, + "grad_norm": 1.015170693397522, + "learning_rate": 3.4799599198396795e-05, + "loss": 0.0622, + "step": 1529 + }, + { + "epoch": 6.12, + "grad_norm": 1.4240541458129883, + "learning_rate": 3.478957915831664e-05, + "loss": 0.0968, + "step": 1530 + }, + { + "epoch": 6.124, + "grad_norm": 1.4235905408859253, + "learning_rate": 3.477955911823648e-05, + "loss": 0.0997, + "step": 1531 + }, + { + "epoch": 6.128, + "grad_norm": 1.6935840845108032, + "learning_rate": 3.476953907815631e-05, + "loss": 0.1425, + "step": 1532 + }, + { + "epoch": 6.132, + "grad_norm": 1.4013699293136597, + "learning_rate": 3.4759519038076154e-05, + "loss": 0.0898, + "step": 1533 + }, + { + "epoch": 6.136, + "grad_norm": 1.4652191400527954, + "learning_rate": 3.4749498997995995e-05, + "loss": 0.0968, + "step": 1534 + }, + { + "epoch": 6.14, + "grad_norm": 1.4197686910629272, + "learning_rate": 3.4739478957915836e-05, + "loss": 0.1328, + "step": 1535 + }, + { + "epoch": 6.144, + "grad_norm": 1.2468719482421875, + "learning_rate": 3.472945891783567e-05, + "loss": 0.0874, + "step": 1536 + }, + { + "epoch": 6.148, + "grad_norm": 1.5248289108276367, + "learning_rate": 3.471943887775551e-05, + "loss": 0.0975, + "step": 1537 + }, + { + "epoch": 6.152, + "grad_norm": 1.3224518299102783, + "learning_rate": 3.4709418837675353e-05, + "loss": 0.0761, + "step": 1538 + }, + { + "epoch": 6.156, + "grad_norm": 1.4312621355056763, + "learning_rate": 3.469939879759519e-05, + "loss": 0.0833, + "step": 1539 + }, + { + "epoch": 6.16, + "grad_norm": 1.3523186445236206, + "learning_rate": 3.4689378757515036e-05, + "loss": 0.0912, + "step": 1540 + }, + { + "epoch": 6.164, + "grad_norm": 1.4507619142532349, + "learning_rate": 3.467935871743487e-05, + "loss": 0.0977, + "step": 1541 + }, + { + "epoch": 6.168, + "grad_norm": 1.0727993249893188, + "learning_rate": 3.466933867735471e-05, + "loss": 
0.0608, + "step": 1542 + }, + { + "epoch": 6.172, + "grad_norm": 1.7440407276153564, + "learning_rate": 3.465931863727455e-05, + "loss": 0.1061, + "step": 1543 + }, + { + "epoch": 6.176, + "grad_norm": 1.238020420074463, + "learning_rate": 3.464929859719439e-05, + "loss": 0.0703, + "step": 1544 + }, + { + "epoch": 6.18, + "grad_norm": 1.3099385499954224, + "learning_rate": 3.463927855711423e-05, + "loss": 0.1048, + "step": 1545 + }, + { + "epoch": 6.184, + "grad_norm": 1.4899845123291016, + "learning_rate": 3.462925851703407e-05, + "loss": 0.1037, + "step": 1546 + }, + { + "epoch": 6.188, + "grad_norm": 1.4137481451034546, + "learning_rate": 3.4619238476953905e-05, + "loss": 0.1138, + "step": 1547 + }, + { + "epoch": 6.192, + "grad_norm": 1.402271032333374, + "learning_rate": 3.4609218436873746e-05, + "loss": 0.1069, + "step": 1548 + }, + { + "epoch": 6.196, + "grad_norm": 1.4417635202407837, + "learning_rate": 3.459919839679359e-05, + "loss": 0.1024, + "step": 1549 + }, + { + "epoch": 6.2, + "grad_norm": 1.2592614889144897, + "learning_rate": 3.458917835671343e-05, + "loss": 0.1002, + "step": 1550 + }, + { + "epoch": 6.204, + "grad_norm": 1.6195659637451172, + "learning_rate": 3.457915831663327e-05, + "loss": 0.1135, + "step": 1551 + }, + { + "epoch": 6.208, + "grad_norm": 1.3179293870925903, + "learning_rate": 3.456913827655311e-05, + "loss": 0.1026, + "step": 1552 + }, + { + "epoch": 6.212, + "grad_norm": 1.3877010345458984, + "learning_rate": 3.4559118236472946e-05, + "loss": 0.0814, + "step": 1553 + }, + { + "epoch": 6.216, + "grad_norm": 1.2459914684295654, + "learning_rate": 3.454909819639279e-05, + "loss": 0.0978, + "step": 1554 + }, + { + "epoch": 6.22, + "grad_norm": 1.368811011314392, + "learning_rate": 3.453907815631263e-05, + "loss": 0.0927, + "step": 1555 + }, + { + "epoch": 6.224, + "grad_norm": 1.629564642906189, + "learning_rate": 3.452905811623246e-05, + "loss": 0.1302, + "step": 1556 + }, + { + "epoch": 6.228, + "grad_norm": 1.299174189567566, + 
"learning_rate": 3.4519038076152304e-05, + "loss": 0.0974, + "step": 1557 + }, + { + "epoch": 6.232, + "grad_norm": 1.39913809299469, + "learning_rate": 3.4509018036072145e-05, + "loss": 0.1, + "step": 1558 + }, + { + "epoch": 6.236, + "grad_norm": 1.5463366508483887, + "learning_rate": 3.449899799599198e-05, + "loss": 0.1092, + "step": 1559 + }, + { + "epoch": 6.24, + "grad_norm": 1.416552186012268, + "learning_rate": 3.448897795591183e-05, + "loss": 0.1247, + "step": 1560 + }, + { + "epoch": 6.244, + "grad_norm": 1.3216407299041748, + "learning_rate": 3.447895791583167e-05, + "loss": 0.1219, + "step": 1561 + }, + { + "epoch": 6.248, + "grad_norm": 1.2443658113479614, + "learning_rate": 3.4468937875751504e-05, + "loss": 0.085, + "step": 1562 + }, + { + "epoch": 6.252, + "grad_norm": 1.3868099451065063, + "learning_rate": 3.4458917835671345e-05, + "loss": 0.0941, + "step": 1563 + }, + { + "epoch": 6.256, + "grad_norm": 1.5421968698501587, + "learning_rate": 3.4448897795591186e-05, + "loss": 0.1034, + "step": 1564 + }, + { + "epoch": 6.26, + "grad_norm": 1.3406178951263428, + "learning_rate": 3.443887775551102e-05, + "loss": 0.0902, + "step": 1565 + }, + { + "epoch": 6.264, + "grad_norm": 1.5938392877578735, + "learning_rate": 3.442885771543086e-05, + "loss": 0.1159, + "step": 1566 + }, + { + "epoch": 6.268, + "grad_norm": 1.5965449810028076, + "learning_rate": 3.44188376753507e-05, + "loss": 0.1174, + "step": 1567 + }, + { + "epoch": 6.272, + "grad_norm": 1.4595623016357422, + "learning_rate": 3.440881763527054e-05, + "loss": 0.1078, + "step": 1568 + }, + { + "epoch": 6.276, + "grad_norm": 1.6079479455947876, + "learning_rate": 3.4398797595190386e-05, + "loss": 0.1158, + "step": 1569 + }, + { + "epoch": 6.28, + "grad_norm": 1.078923225402832, + "learning_rate": 3.438877755511023e-05, + "loss": 0.0722, + "step": 1570 + }, + { + "epoch": 6.284, + "grad_norm": 1.4052515029907227, + "learning_rate": 3.437875751503006e-05, + "loss": 0.0988, + "step": 1571 + }, + { + 
"epoch": 6.288, + "grad_norm": 1.262578010559082, + "learning_rate": 3.43687374749499e-05, + "loss": 0.0981, + "step": 1572 + }, + { + "epoch": 6.292, + "grad_norm": 1.333280324935913, + "learning_rate": 3.4358717434869744e-05, + "loss": 0.11, + "step": 1573 + }, + { + "epoch": 6.296, + "grad_norm": 1.4792897701263428, + "learning_rate": 3.434869739478958e-05, + "loss": 0.0854, + "step": 1574 + }, + { + "epoch": 6.3, + "grad_norm": 1.2670187950134277, + "learning_rate": 3.433867735470942e-05, + "loss": 0.0929, + "step": 1575 + }, + { + "epoch": 6.304, + "grad_norm": 1.4768058061599731, + "learning_rate": 3.432865731462926e-05, + "loss": 0.0961, + "step": 1576 + }, + { + "epoch": 6.308, + "grad_norm": 1.6261510848999023, + "learning_rate": 3.4318637274549096e-05, + "loss": 0.1069, + "step": 1577 + }, + { + "epoch": 6.312, + "grad_norm": 1.4705723524093628, + "learning_rate": 3.430861723446894e-05, + "loss": 0.0841, + "step": 1578 + }, + { + "epoch": 6.316, + "grad_norm": 1.6650081872940063, + "learning_rate": 3.4298597194388785e-05, + "loss": 0.1196, + "step": 1579 + }, + { + "epoch": 6.32, + "grad_norm": 1.4160841703414917, + "learning_rate": 3.428857715430862e-05, + "loss": 0.1067, + "step": 1580 + }, + { + "epoch": 6.324, + "grad_norm": 0.9616687297821045, + "learning_rate": 3.427855711422846e-05, + "loss": 0.055, + "step": 1581 + }, + { + "epoch": 6.328, + "grad_norm": 1.5261203050613403, + "learning_rate": 3.42685370741483e-05, + "loss": 0.115, + "step": 1582 + }, + { + "epoch": 6.332, + "grad_norm": 1.568793773651123, + "learning_rate": 3.425851703406814e-05, + "loss": 0.1151, + "step": 1583 + }, + { + "epoch": 6.336, + "grad_norm": 1.351989507675171, + "learning_rate": 3.424849699398798e-05, + "loss": 0.1155, + "step": 1584 + }, + { + "epoch": 6.34, + "grad_norm": 1.3044042587280273, + "learning_rate": 3.423847695390782e-05, + "loss": 0.0911, + "step": 1585 + }, + { + "epoch": 6.344, + "grad_norm": 1.4144433736801147, + "learning_rate": 
3.4228456913827654e-05, + "loss": 0.1016, + "step": 1586 + }, + { + "epoch": 6.348, + "grad_norm": 2.237967014312744, + "learning_rate": 3.4218436873747495e-05, + "loss": 0.0831, + "step": 1587 + }, + { + "epoch": 6.352, + "grad_norm": 1.522283911705017, + "learning_rate": 3.4208416833667336e-05, + "loss": 0.1045, + "step": 1588 + }, + { + "epoch": 6.356, + "grad_norm": 1.3800389766693115, + "learning_rate": 3.419839679358718e-05, + "loss": 0.1121, + "step": 1589 + }, + { + "epoch": 6.36, + "grad_norm": 0.9566818475723267, + "learning_rate": 3.418837675350702e-05, + "loss": 0.0592, + "step": 1590 + }, + { + "epoch": 6.364, + "grad_norm": 1.6034566164016724, + "learning_rate": 3.417835671342685e-05, + "loss": 0.097, + "step": 1591 + }, + { + "epoch": 6.368, + "grad_norm": 1.4005167484283447, + "learning_rate": 3.4168336673346695e-05, + "loss": 0.0934, + "step": 1592 + }, + { + "epoch": 6.372, + "grad_norm": 1.4644639492034912, + "learning_rate": 3.4158316633266536e-05, + "loss": 0.0891, + "step": 1593 + }, + { + "epoch": 6.376, + "grad_norm": 1.265230417251587, + "learning_rate": 3.414829659318638e-05, + "loss": 0.1015, + "step": 1594 + }, + { + "epoch": 6.38, + "grad_norm": 1.375074863433838, + "learning_rate": 3.413827655310621e-05, + "loss": 0.1105, + "step": 1595 + }, + { + "epoch": 6.384, + "grad_norm": 1.3185374736785889, + "learning_rate": 3.412825651302605e-05, + "loss": 0.0965, + "step": 1596 + }, + { + "epoch": 6.388, + "grad_norm": 1.3281744718551636, + "learning_rate": 3.4118236472945894e-05, + "loss": 0.0881, + "step": 1597 + }, + { + "epoch": 6.392, + "grad_norm": 1.5773831605911255, + "learning_rate": 3.410821643286573e-05, + "loss": 0.1217, + "step": 1598 + }, + { + "epoch": 6.396, + "grad_norm": 1.4592585563659668, + "learning_rate": 3.409819639278558e-05, + "loss": 0.0941, + "step": 1599 + }, + { + "epoch": 6.4, + "grad_norm": 1.4319229125976562, + "learning_rate": 3.408817635270541e-05, + "loss": 0.1064, + "step": 1600 + }, + { + "epoch": 6.404, + 
"grad_norm": 1.5549246072769165, + "learning_rate": 3.407815631262525e-05, + "loss": 0.1062, + "step": 1601 + }, + { + "epoch": 6.408, + "grad_norm": 1.5339570045471191, + "learning_rate": 3.4068136272545094e-05, + "loss": 0.1068, + "step": 1602 + }, + { + "epoch": 6.412, + "grad_norm": 1.4197008609771729, + "learning_rate": 3.405811623246493e-05, + "loss": 0.093, + "step": 1603 + }, + { + "epoch": 6.416, + "grad_norm": 1.3577507734298706, + "learning_rate": 3.404809619238477e-05, + "loss": 0.1006, + "step": 1604 + }, + { + "epoch": 6.42, + "grad_norm": 0.8569574356079102, + "learning_rate": 3.403807615230461e-05, + "loss": 0.0439, + "step": 1605 + }, + { + "epoch": 6.424, + "grad_norm": 1.6528804302215576, + "learning_rate": 3.402805611222445e-05, + "loss": 0.1212, + "step": 1606 + }, + { + "epoch": 6.428, + "grad_norm": 1.4956883192062378, + "learning_rate": 3.401803607214429e-05, + "loss": 0.0922, + "step": 1607 + }, + { + "epoch": 6.432, + "grad_norm": 1.6123335361480713, + "learning_rate": 3.400801603206413e-05, + "loss": 0.1055, + "step": 1608 + }, + { + "epoch": 6.436, + "grad_norm": 1.539689302444458, + "learning_rate": 3.399799599198397e-05, + "loss": 0.0866, + "step": 1609 + }, + { + "epoch": 6.44, + "grad_norm": 1.570948600769043, + "learning_rate": 3.398797595190381e-05, + "loss": 0.1343, + "step": 1610 + }, + { + "epoch": 6.444, + "grad_norm": 1.6961643695831299, + "learning_rate": 3.397795591182365e-05, + "loss": 0.1115, + "step": 1611 + }, + { + "epoch": 6.448, + "grad_norm": 1.4908407926559448, + "learning_rate": 3.3967935871743486e-05, + "loss": 0.0983, + "step": 1612 + }, + { + "epoch": 6.452, + "grad_norm": 1.48050057888031, + "learning_rate": 3.395791583166333e-05, + "loss": 0.0982, + "step": 1613 + }, + { + "epoch": 6.456, + "grad_norm": 1.5200427770614624, + "learning_rate": 3.394789579158317e-05, + "loss": 0.095, + "step": 1614 + }, + { + "epoch": 6.46, + "grad_norm": 1.004806399345398, + "learning_rate": 3.3937875751503003e-05, + "loss": 
0.0564, + "step": 1615 + }, + { + "epoch": 6.464, + "grad_norm": 1.4128756523132324, + "learning_rate": 3.3927855711422845e-05, + "loss": 0.1113, + "step": 1616 + }, + { + "epoch": 6.468, + "grad_norm": 1.1182926893234253, + "learning_rate": 3.3917835671342686e-05, + "loss": 0.0857, + "step": 1617 + }, + { + "epoch": 6.4719999999999995, + "grad_norm": 1.3846279382705688, + "learning_rate": 3.390781563126252e-05, + "loss": 0.0842, + "step": 1618 + }, + { + "epoch": 6.476, + "grad_norm": 1.4602874517440796, + "learning_rate": 3.389779559118237e-05, + "loss": 0.0941, + "step": 1619 + }, + { + "epoch": 6.48, + "grad_norm": 1.6256506443023682, + "learning_rate": 3.388777555110221e-05, + "loss": 0.1055, + "step": 1620 + }, + { + "epoch": 6.484, + "grad_norm": 0.9726806282997131, + "learning_rate": 3.3877755511022044e-05, + "loss": 0.0599, + "step": 1621 + }, + { + "epoch": 6.4879999999999995, + "grad_norm": 2.609361410140991, + "learning_rate": 3.3867735470941886e-05, + "loss": 0.1239, + "step": 1622 + }, + { + "epoch": 6.492, + "grad_norm": 1.5318516492843628, + "learning_rate": 3.385771543086173e-05, + "loss": 0.1655, + "step": 1623 + }, + { + "epoch": 6.496, + "grad_norm": 1.4403146505355835, + "learning_rate": 3.384769539078156e-05, + "loss": 0.1011, + "step": 1624 + }, + { + "epoch": 6.5, + "grad_norm": 1.5402027368545532, + "learning_rate": 3.38376753507014e-05, + "loss": 0.1085, + "step": 1625 + }, + { + "epoch": 6.504, + "grad_norm": 1.6094437837600708, + "learning_rate": 3.3827655310621244e-05, + "loss": 0.1067, + "step": 1626 + }, + { + "epoch": 6.508, + "grad_norm": 1.4926447868347168, + "learning_rate": 3.381763527054108e-05, + "loss": 0.0976, + "step": 1627 + }, + { + "epoch": 6.5120000000000005, + "grad_norm": 1.4129664897918701, + "learning_rate": 3.3807615230460927e-05, + "loss": 0.0947, + "step": 1628 + }, + { + "epoch": 6.516, + "grad_norm": 1.3668807744979858, + "learning_rate": 3.379759519038077e-05, + "loss": 0.1382, + "step": 1629 + }, + { + 
"epoch": 6.52, + "grad_norm": 1.3848040103912354, + "learning_rate": 3.37875751503006e-05, + "loss": 0.1103, + "step": 1630 + }, + { + "epoch": 6.524, + "grad_norm": 1.2794206142425537, + "learning_rate": 3.3777555110220444e-05, + "loss": 0.0964, + "step": 1631 + }, + { + "epoch": 6.5280000000000005, + "grad_norm": 1.6137124300003052, + "learning_rate": 3.3767535070140285e-05, + "loss": 0.1081, + "step": 1632 + }, + { + "epoch": 6.532, + "grad_norm": 1.6709508895874023, + "learning_rate": 3.375751503006012e-05, + "loss": 0.1414, + "step": 1633 + }, + { + "epoch": 6.536, + "grad_norm": 1.5419001579284668, + "learning_rate": 3.374749498997996e-05, + "loss": 0.1198, + "step": 1634 + }, + { + "epoch": 6.54, + "grad_norm": 1.83645761013031, + "learning_rate": 3.37374749498998e-05, + "loss": 0.1293, + "step": 1635 + }, + { + "epoch": 6.5440000000000005, + "grad_norm": 1.4904134273529053, + "learning_rate": 3.3727454909819637e-05, + "loss": 0.1015, + "step": 1636 + }, + { + "epoch": 6.548, + "grad_norm": 1.4517120122909546, + "learning_rate": 3.371743486973948e-05, + "loss": 0.0854, + "step": 1637 + }, + { + "epoch": 6.552, + "grad_norm": 1.6505271196365356, + "learning_rate": 3.3707414829659326e-05, + "loss": 0.1111, + "step": 1638 + }, + { + "epoch": 6.556, + "grad_norm": 2.092888116836548, + "learning_rate": 3.369739478957916e-05, + "loss": 0.0974, + "step": 1639 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 1.2148265838623047, + "learning_rate": 3.3687374749499e-05, + "loss": 0.1113, + "step": 1640 + }, + { + "epoch": 6.564, + "grad_norm": 1.4497480392456055, + "learning_rate": 3.367735470941884e-05, + "loss": 0.1082, + "step": 1641 + }, + { + "epoch": 6.568, + "grad_norm": 1.4580659866333008, + "learning_rate": 3.366733466933868e-05, + "loss": 0.1145, + "step": 1642 + }, + { + "epoch": 6.572, + "grad_norm": 1.3077856302261353, + "learning_rate": 3.365731462925852e-05, + "loss": 0.1039, + "step": 1643 + }, + { + "epoch": 6.576, + "grad_norm": 
1.3982523679733276, + "learning_rate": 3.364729458917836e-05, + "loss": 0.1041, + "step": 1644 + }, + { + "epoch": 6.58, + "grad_norm": 1.5847138166427612, + "learning_rate": 3.3637274549098195e-05, + "loss": 0.1368, + "step": 1645 + }, + { + "epoch": 6.584, + "grad_norm": 1.542724847793579, + "learning_rate": 3.3627254509018036e-05, + "loss": 0.1215, + "step": 1646 + }, + { + "epoch": 6.588, + "grad_norm": 1.2111552953720093, + "learning_rate": 3.361723446893788e-05, + "loss": 0.0983, + "step": 1647 + }, + { + "epoch": 6.592, + "grad_norm": 1.5661391019821167, + "learning_rate": 3.360721442885772e-05, + "loss": 0.1315, + "step": 1648 + }, + { + "epoch": 6.596, + "grad_norm": 1.6418293714523315, + "learning_rate": 3.359719438877756e-05, + "loss": 0.1096, + "step": 1649 + }, + { + "epoch": 6.6, + "grad_norm": 1.4757168292999268, + "learning_rate": 3.35871743486974e-05, + "loss": 0.1102, + "step": 1650 + }, + { + "epoch": 6.604, + "grad_norm": 1.40822434425354, + "learning_rate": 3.3577154308617235e-05, + "loss": 0.0912, + "step": 1651 + }, + { + "epoch": 6.608, + "grad_norm": 1.426364541053772, + "learning_rate": 3.356713426853708e-05, + "loss": 0.0913, + "step": 1652 + }, + { + "epoch": 6.612, + "grad_norm": 1.2223780155181885, + "learning_rate": 3.355711422845692e-05, + "loss": 0.0945, + "step": 1653 + }, + { + "epoch": 6.616, + "grad_norm": 1.4718832969665527, + "learning_rate": 3.354709418837675e-05, + "loss": 0.1092, + "step": 1654 + }, + { + "epoch": 6.62, + "grad_norm": 1.6908650398254395, + "learning_rate": 3.3537074148296594e-05, + "loss": 0.1143, + "step": 1655 + }, + { + "epoch": 6.624, + "grad_norm": 1.5453354120254517, + "learning_rate": 3.3527054108216435e-05, + "loss": 0.101, + "step": 1656 + }, + { + "epoch": 6.628, + "grad_norm": 1.1836323738098145, + "learning_rate": 3.351703406813627e-05, + "loss": 0.0705, + "step": 1657 + }, + { + "epoch": 6.632, + "grad_norm": 1.5613477230072021, + "learning_rate": 3.350701402805612e-05, + "loss": 0.109, + 
"step": 1658 + }, + { + "epoch": 6.636, + "grad_norm": 1.5052834749221802, + "learning_rate": 3.349699398797595e-05, + "loss": 0.1057, + "step": 1659 + }, + { + "epoch": 6.64, + "grad_norm": 1.1341853141784668, + "learning_rate": 3.3486973947895793e-05, + "loss": 0.0539, + "step": 1660 + }, + { + "epoch": 6.644, + "grad_norm": 1.6815075874328613, + "learning_rate": 3.3476953907815635e-05, + "loss": 0.1237, + "step": 1661 + }, + { + "epoch": 6.648, + "grad_norm": 1.5323514938354492, + "learning_rate": 3.3466933867735476e-05, + "loss": 0.0888, + "step": 1662 + }, + { + "epoch": 6.652, + "grad_norm": 1.7266521453857422, + "learning_rate": 3.345691382765531e-05, + "loss": 0.1308, + "step": 1663 + }, + { + "epoch": 6.656, + "grad_norm": 1.6895675659179688, + "learning_rate": 3.344689378757515e-05, + "loss": 0.1054, + "step": 1664 + }, + { + "epoch": 6.66, + "grad_norm": 1.456880807876587, + "learning_rate": 3.343687374749499e-05, + "loss": 0.1082, + "step": 1665 + }, + { + "epoch": 6.664, + "grad_norm": 1.5108088254928589, + "learning_rate": 3.342685370741483e-05, + "loss": 0.1111, + "step": 1666 + }, + { + "epoch": 6.668, + "grad_norm": 1.3145025968551636, + "learning_rate": 3.341683366733467e-05, + "loss": 0.096, + "step": 1667 + }, + { + "epoch": 6.672, + "grad_norm": 1.4500359296798706, + "learning_rate": 3.340681362725451e-05, + "loss": 0.0917, + "step": 1668 + }, + { + "epoch": 6.676, + "grad_norm": 1.5362236499786377, + "learning_rate": 3.339679358717435e-05, + "loss": 0.1375, + "step": 1669 + }, + { + "epoch": 6.68, + "grad_norm": 1.490013599395752, + "learning_rate": 3.338677354709419e-05, + "loss": 0.1124, + "step": 1670 + }, + { + "epoch": 6.684, + "grad_norm": 1.6905912160873413, + "learning_rate": 3.337675350701403e-05, + "loss": 0.1238, + "step": 1671 + }, + { + "epoch": 6.688, + "grad_norm": 1.4641087055206299, + "learning_rate": 3.336673346693387e-05, + "loss": 0.1398, + "step": 1672 + }, + { + "epoch": 6.692, + "grad_norm": 1.4538987874984741, + 
"learning_rate": 3.335671342685371e-05, + "loss": 0.1147, + "step": 1673 + }, + { + "epoch": 6.696, + "grad_norm": 1.6322211027145386, + "learning_rate": 3.3346693386773544e-05, + "loss": 0.1176, + "step": 1674 + }, + { + "epoch": 6.7, + "grad_norm": 1.3581397533416748, + "learning_rate": 3.3336673346693386e-05, + "loss": 0.0936, + "step": 1675 + }, + { + "epoch": 6.704, + "grad_norm": 1.356339693069458, + "learning_rate": 3.332665330661323e-05, + "loss": 0.0987, + "step": 1676 + }, + { + "epoch": 6.708, + "grad_norm": 1.5190908908843994, + "learning_rate": 3.331663326653307e-05, + "loss": 0.1148, + "step": 1677 + }, + { + "epoch": 6.712, + "grad_norm": 1.6022852659225464, + "learning_rate": 3.330661322645291e-05, + "loss": 0.1146, + "step": 1678 + }, + { + "epoch": 6.716, + "grad_norm": 1.7773820161819458, + "learning_rate": 3.329659318637275e-05, + "loss": 0.1239, + "step": 1679 + }, + { + "epoch": 6.72, + "grad_norm": 1.3916137218475342, + "learning_rate": 3.3286573146292585e-05, + "loss": 0.0931, + "step": 1680 + }, + { + "epoch": 6.724, + "grad_norm": 1.4522221088409424, + "learning_rate": 3.3276553106212426e-05, + "loss": 0.1173, + "step": 1681 + }, + { + "epoch": 6.728, + "grad_norm": 1.3560315370559692, + "learning_rate": 3.326653306613227e-05, + "loss": 0.1113, + "step": 1682 + }, + { + "epoch": 6.732, + "grad_norm": 1.4816689491271973, + "learning_rate": 3.32565130260521e-05, + "loss": 0.1027, + "step": 1683 + }, + { + "epoch": 6.736, + "grad_norm": 1.453627586364746, + "learning_rate": 3.3246492985971944e-05, + "loss": 0.1071, + "step": 1684 + }, + { + "epoch": 6.74, + "grad_norm": 1.4733647108078003, + "learning_rate": 3.3236472945891785e-05, + "loss": 0.1208, + "step": 1685 + }, + { + "epoch": 6.744, + "grad_norm": 1.3408794403076172, + "learning_rate": 3.322645290581162e-05, + "loss": 0.1031, + "step": 1686 + }, + { + "epoch": 6.748, + "grad_norm": 1.5712541341781616, + "learning_rate": 3.321643286573147e-05, + "loss": 0.1169, + "step": 1687 + }, + { 
+ "epoch": 6.752, + "grad_norm": 1.2861677408218384, + "learning_rate": 3.320641282565131e-05, + "loss": 0.0946, + "step": 1688 + }, + { + "epoch": 6.756, + "grad_norm": 1.5849751234054565, + "learning_rate": 3.319639278557114e-05, + "loss": 0.0961, + "step": 1689 + }, + { + "epoch": 6.76, + "grad_norm": 1.468711018562317, + "learning_rate": 3.3186372745490984e-05, + "loss": 0.1496, + "step": 1690 + }, + { + "epoch": 6.764, + "grad_norm": 1.6362906694412231, + "learning_rate": 3.3176352705410826e-05, + "loss": 0.1169, + "step": 1691 + }, + { + "epoch": 6.768, + "grad_norm": 1.5838176012039185, + "learning_rate": 3.316633266533066e-05, + "loss": 0.1395, + "step": 1692 + }, + { + "epoch": 6.772, + "grad_norm": 1.5610008239746094, + "learning_rate": 3.31563126252505e-05, + "loss": 0.0939, + "step": 1693 + }, + { + "epoch": 6.776, + "grad_norm": 1.5154725313186646, + "learning_rate": 3.314629258517034e-05, + "loss": 0.1003, + "step": 1694 + }, + { + "epoch": 6.78, + "grad_norm": 1.5459731817245483, + "learning_rate": 3.313627254509018e-05, + "loss": 0.1314, + "step": 1695 + }, + { + "epoch": 6.784, + "grad_norm": 1.492156744003296, + "learning_rate": 3.312625250501002e-05, + "loss": 0.1215, + "step": 1696 + }, + { + "epoch": 6.788, + "grad_norm": 1.5393439531326294, + "learning_rate": 3.311623246492987e-05, + "loss": 0.1102, + "step": 1697 + }, + { + "epoch": 6.792, + "grad_norm": 1.3706822395324707, + "learning_rate": 3.31062124248497e-05, + "loss": 0.1246, + "step": 1698 + }, + { + "epoch": 6.796, + "grad_norm": 1.3498224020004272, + "learning_rate": 3.309619238476954e-05, + "loss": 0.1176, + "step": 1699 + }, + { + "epoch": 6.8, + "grad_norm": 1.5214121341705322, + "learning_rate": 3.3086172344689384e-05, + "loss": 0.1092, + "step": 1700 + }, + { + "epoch": 6.804, + "grad_norm": 1.5812162160873413, + "learning_rate": 3.307615230460922e-05, + "loss": 0.1313, + "step": 1701 + }, + { + "epoch": 6.808, + "grad_norm": 1.4328696727752686, + "learning_rate": 
3.306613226452906e-05, + "loss": 0.1188, + "step": 1702 + }, + { + "epoch": 6.812, + "grad_norm": 1.542195200920105, + "learning_rate": 3.30561122244489e-05, + "loss": 0.1328, + "step": 1703 + }, + { + "epoch": 6.816, + "grad_norm": 1.2997112274169922, + "learning_rate": 3.3046092184368735e-05, + "loss": 0.1103, + "step": 1704 + }, + { + "epoch": 6.82, + "grad_norm": 1.746984839439392, + "learning_rate": 3.303607214428858e-05, + "loss": 0.1134, + "step": 1705 + }, + { + "epoch": 6.824, + "grad_norm": 1.5972540378570557, + "learning_rate": 3.302605210420842e-05, + "loss": 0.1525, + "step": 1706 + }, + { + "epoch": 6.828, + "grad_norm": 1.4174765348434448, + "learning_rate": 3.301603206412826e-05, + "loss": 0.0846, + "step": 1707 + }, + { + "epoch": 6.832, + "grad_norm": 1.3693246841430664, + "learning_rate": 3.30060120240481e-05, + "loss": 0.1022, + "step": 1708 + }, + { + "epoch": 6.836, + "grad_norm": 1.5433459281921387, + "learning_rate": 3.299599198396794e-05, + "loss": 0.1151, + "step": 1709 + }, + { + "epoch": 6.84, + "grad_norm": 1.515580415725708, + "learning_rate": 3.2985971943887776e-05, + "loss": 0.1157, + "step": 1710 + }, + { + "epoch": 6.844, + "grad_norm": 1.4604809284210205, + "learning_rate": 3.297595190380762e-05, + "loss": 0.1203, + "step": 1711 + }, + { + "epoch": 6.848, + "grad_norm": 1.8675997257232666, + "learning_rate": 3.296593186372746e-05, + "loss": 0.1357, + "step": 1712 + }, + { + "epoch": 6.852, + "grad_norm": 1.4611036777496338, + "learning_rate": 3.295591182364729e-05, + "loss": 0.1203, + "step": 1713 + }, + { + "epoch": 6.856, + "grad_norm": 1.5014901161193848, + "learning_rate": 3.2945891783567135e-05, + "loss": 0.1151, + "step": 1714 + }, + { + "epoch": 6.86, + "grad_norm": 1.5301997661590576, + "learning_rate": 3.2935871743486976e-05, + "loss": 0.1121, + "step": 1715 + }, + { + "epoch": 6.864, + "grad_norm": 1.5152416229248047, + "learning_rate": 3.292585170340681e-05, + "loss": 0.0901, + "step": 1716 + }, + { + "epoch": 6.868, + 
"grad_norm": 1.5249059200286865, + "learning_rate": 3.291583166332666e-05, + "loss": 0.1061, + "step": 1717 + }, + { + "epoch": 6.872, + "grad_norm": 1.729848027229309, + "learning_rate": 3.290581162324649e-05, + "loss": 0.1259, + "step": 1718 + }, + { + "epoch": 6.876, + "grad_norm": 1.510529637336731, + "learning_rate": 3.2895791583166334e-05, + "loss": 0.11, + "step": 1719 + }, + { + "epoch": 6.88, + "grad_norm": 1.3543599843978882, + "learning_rate": 3.2885771543086176e-05, + "loss": 0.1085, + "step": 1720 + }, + { + "epoch": 6.884, + "grad_norm": 1.5983351469039917, + "learning_rate": 3.287575150300602e-05, + "loss": 0.144, + "step": 1721 + }, + { + "epoch": 6.888, + "grad_norm": 1.4449596405029297, + "learning_rate": 3.286573146292585e-05, + "loss": 0.1132, + "step": 1722 + }, + { + "epoch": 6.892, + "grad_norm": 1.5098458528518677, + "learning_rate": 3.285571142284569e-05, + "loss": 0.122, + "step": 1723 + }, + { + "epoch": 6.896, + "grad_norm": 1.5662916898727417, + "learning_rate": 3.2845691382765534e-05, + "loss": 0.1112, + "step": 1724 + }, + { + "epoch": 6.9, + "grad_norm": 1.4259101152420044, + "learning_rate": 3.283567134268537e-05, + "loss": 0.0982, + "step": 1725 + }, + { + "epoch": 6.904, + "grad_norm": 1.3410980701446533, + "learning_rate": 3.282565130260521e-05, + "loss": 0.0893, + "step": 1726 + }, + { + "epoch": 6.908, + "grad_norm": 1.5725864171981812, + "learning_rate": 3.281563126252505e-05, + "loss": 0.126, + "step": 1727 + }, + { + "epoch": 6.912, + "grad_norm": 1.5315643548965454, + "learning_rate": 3.280561122244489e-05, + "loss": 0.1013, + "step": 1728 + }, + { + "epoch": 6.916, + "grad_norm": 1.2779252529144287, + "learning_rate": 3.2795591182364734e-05, + "loss": 0.0934, + "step": 1729 + }, + { + "epoch": 6.92, + "grad_norm": 1.4510380029678345, + "learning_rate": 3.278557114228457e-05, + "loss": 0.0971, + "step": 1730 + }, + { + "epoch": 6.924, + "grad_norm": 1.527717113494873, + "learning_rate": 3.277555110220441e-05, + "loss": 
0.1138, + "step": 1731 + }, + { + "epoch": 6.928, + "grad_norm": 1.5672757625579834, + "learning_rate": 3.276553106212425e-05, + "loss": 0.1001, + "step": 1732 + }, + { + "epoch": 6.932, + "grad_norm": 1.557324767112732, + "learning_rate": 3.275551102204409e-05, + "loss": 0.1195, + "step": 1733 + }, + { + "epoch": 6.936, + "grad_norm": 1.4140926599502563, + "learning_rate": 3.2745490981963926e-05, + "loss": 0.1113, + "step": 1734 + }, + { + "epoch": 6.9399999999999995, + "grad_norm": 1.4385108947753906, + "learning_rate": 3.273547094188377e-05, + "loss": 0.1139, + "step": 1735 + }, + { + "epoch": 6.944, + "grad_norm": 1.5246384143829346, + "learning_rate": 3.272545090180361e-05, + "loss": 0.1125, + "step": 1736 + }, + { + "epoch": 6.948, + "grad_norm": 1.0488942861557007, + "learning_rate": 3.271543086172345e-05, + "loss": 0.0691, + "step": 1737 + }, + { + "epoch": 6.952, + "grad_norm": 1.495911717414856, + "learning_rate": 3.270541082164329e-05, + "loss": 0.0927, + "step": 1738 + }, + { + "epoch": 6.9559999999999995, + "grad_norm": 1.6264322996139526, + "learning_rate": 3.2695390781563126e-05, + "loss": 0.1127, + "step": 1739 + }, + { + "epoch": 6.96, + "grad_norm": 1.3778818845748901, + "learning_rate": 3.268537074148297e-05, + "loss": 0.1266, + "step": 1740 + }, + { + "epoch": 6.964, + "grad_norm": 1.5300289392471313, + "learning_rate": 3.267535070140281e-05, + "loss": 0.1133, + "step": 1741 + }, + { + "epoch": 6.968, + "grad_norm": 1.5947483777999878, + "learning_rate": 3.266533066132264e-05, + "loss": 0.1196, + "step": 1742 + }, + { + "epoch": 6.9719999999999995, + "grad_norm": 1.4825366735458374, + "learning_rate": 3.2655310621242484e-05, + "loss": 0.1068, + "step": 1743 + }, + { + "epoch": 6.976, + "grad_norm": 1.7026481628417969, + "learning_rate": 3.2645290581162326e-05, + "loss": 0.0977, + "step": 1744 + }, + { + "epoch": 6.98, + "grad_norm": 1.6448338031768799, + "learning_rate": 3.263527054108216e-05, + "loss": 0.1112, + "step": 1745 + }, + { + "epoch": 
6.984, + "grad_norm": 1.6659256219863892, + "learning_rate": 3.262525050100201e-05, + "loss": 0.1179, + "step": 1746 + }, + { + "epoch": 6.9879999999999995, + "grad_norm": 1.652055025100708, + "learning_rate": 3.261523046092185e-05, + "loss": 0.1535, + "step": 1747 + }, + { + "epoch": 6.992, + "grad_norm": 1.4880528450012207, + "learning_rate": 3.2605210420841684e-05, + "loss": 0.067, + "step": 1748 + }, + { + "epoch": 6.996, + "grad_norm": 1.5849748849868774, + "learning_rate": 3.2595190380761525e-05, + "loss": 0.1162, + "step": 1749 + }, + { + "epoch": 7.0, + "grad_norm": 1.6205350160598755, + "learning_rate": 3.2585170340681367e-05, + "loss": 0.127, + "step": 1750 + }, + { + "epoch": 7.004, + "grad_norm": 1.1027536392211914, + "learning_rate": 3.25751503006012e-05, + "loss": 0.0632, + "step": 1751 + }, + { + "epoch": 7.008, + "grad_norm": 1.131488561630249, + "learning_rate": 3.256513026052104e-05, + "loss": 0.0832, + "step": 1752 + }, + { + "epoch": 7.012, + "grad_norm": 1.1157985925674438, + "learning_rate": 3.2555110220440884e-05, + "loss": 0.0597, + "step": 1753 + }, + { + "epoch": 7.016, + "grad_norm": 1.2950737476348877, + "learning_rate": 3.254509018036072e-05, + "loss": 0.07, + "step": 1754 + }, + { + "epoch": 7.02, + "grad_norm": 1.0984954833984375, + "learning_rate": 3.253507014028056e-05, + "loss": 0.0584, + "step": 1755 + }, + { + "epoch": 7.024, + "grad_norm": 1.1092177629470825, + "learning_rate": 3.252505010020041e-05, + "loss": 0.0605, + "step": 1756 + }, + { + "epoch": 7.028, + "grad_norm": 0.8248789310455322, + "learning_rate": 3.251503006012024e-05, + "loss": 0.029, + "step": 1757 + }, + { + "epoch": 7.032, + "grad_norm": 1.0940172672271729, + "learning_rate": 3.250501002004008e-05, + "loss": 0.05, + "step": 1758 + }, + { + "epoch": 7.036, + "grad_norm": 1.4330635070800781, + "learning_rate": 3.2494989979959925e-05, + "loss": 0.0788, + "step": 1759 + }, + { + "epoch": 7.04, + "grad_norm": 1.5703409910202026, + "learning_rate": 
3.248496993987976e-05, + "loss": 0.0811, + "step": 1760 + }, + { + "epoch": 7.044, + "grad_norm": 1.1306759119033813, + "learning_rate": 3.24749498997996e-05, + "loss": 0.0555, + "step": 1761 + }, + { + "epoch": 7.048, + "grad_norm": 1.2541637420654297, + "learning_rate": 3.246492985971944e-05, + "loss": 0.0572, + "step": 1762 + }, + { + "epoch": 7.052, + "grad_norm": 1.333361268043518, + "learning_rate": 3.2454909819639276e-05, + "loss": 0.0642, + "step": 1763 + }, + { + "epoch": 7.056, + "grad_norm": 1.1626932621002197, + "learning_rate": 3.244488977955912e-05, + "loss": 0.0489, + "step": 1764 + }, + { + "epoch": 7.06, + "grad_norm": 1.2295629978179932, + "learning_rate": 3.243486973947896e-05, + "loss": 0.0544, + "step": 1765 + }, + { + "epoch": 7.064, + "grad_norm": 1.3224353790283203, + "learning_rate": 3.24248496993988e-05, + "loss": 0.0667, + "step": 1766 + }, + { + "epoch": 7.068, + "grad_norm": 1.2137049436569214, + "learning_rate": 3.241482965931864e-05, + "loss": 0.0581, + "step": 1767 + }, + { + "epoch": 7.072, + "grad_norm": 1.1840201616287231, + "learning_rate": 3.240480961923848e-05, + "loss": 0.0649, + "step": 1768 + }, + { + "epoch": 7.076, + "grad_norm": 1.3883408308029175, + "learning_rate": 3.239478957915832e-05, + "loss": 0.0767, + "step": 1769 + }, + { + "epoch": 7.08, + "grad_norm": 0.8142508268356323, + "learning_rate": 3.238476953907816e-05, + "loss": 0.0378, + "step": 1770 + }, + { + "epoch": 7.084, + "grad_norm": 1.2708982229232788, + "learning_rate": 3.2374749498998e-05, + "loss": 0.0578, + "step": 1771 + }, + { + "epoch": 7.088, + "grad_norm": 1.1672301292419434, + "learning_rate": 3.2364729458917834e-05, + "loss": 0.0538, + "step": 1772 + }, + { + "epoch": 7.092, + "grad_norm": 1.0839089155197144, + "learning_rate": 3.2354709418837675e-05, + "loss": 0.0452, + "step": 1773 + }, + { + "epoch": 7.096, + "grad_norm": 1.2967957258224487, + "learning_rate": 3.234468937875752e-05, + "loss": 0.0676, + "step": 1774 + }, + { + "epoch": 7.1, + 
"grad_norm": 1.3709466457366943, + "learning_rate": 3.233466933867735e-05, + "loss": 0.0607, + "step": 1775 + }, + { + "epoch": 7.104, + "grad_norm": 1.3387433290481567, + "learning_rate": 3.23246492985972e-05, + "loss": 0.0577, + "step": 1776 + }, + { + "epoch": 7.108, + "grad_norm": 1.308927297592163, + "learning_rate": 3.231462925851704e-05, + "loss": 0.0827, + "step": 1777 + }, + { + "epoch": 7.112, + "grad_norm": 1.114564061164856, + "learning_rate": 3.2304609218436875e-05, + "loss": 0.0473, + "step": 1778 + }, + { + "epoch": 7.116, + "grad_norm": 1.2246047258377075, + "learning_rate": 3.2294589178356716e-05, + "loss": 0.0585, + "step": 1779 + }, + { + "epoch": 7.12, + "grad_norm": 1.3446717262268066, + "learning_rate": 3.228456913827656e-05, + "loss": 0.0696, + "step": 1780 + }, + { + "epoch": 7.124, + "grad_norm": 1.2419359683990479, + "learning_rate": 3.227454909819639e-05, + "loss": 0.0715, + "step": 1781 + }, + { + "epoch": 7.128, + "grad_norm": 1.3638228178024292, + "learning_rate": 3.2264529058116233e-05, + "loss": 0.0566, + "step": 1782 + }, + { + "epoch": 7.132, + "grad_norm": 1.1495225429534912, + "learning_rate": 3.2254509018036075e-05, + "loss": 0.057, + "step": 1783 + }, + { + "epoch": 7.136, + "grad_norm": 1.3355066776275635, + "learning_rate": 3.224448897795591e-05, + "loss": 0.067, + "step": 1784 + }, + { + "epoch": 7.14, + "grad_norm": 1.2129656076431274, + "learning_rate": 3.223446893787575e-05, + "loss": 0.052, + "step": 1785 + }, + { + "epoch": 7.144, + "grad_norm": 1.1906895637512207, + "learning_rate": 3.222444889779559e-05, + "loss": 0.0627, + "step": 1786 + }, + { + "epoch": 7.148, + "grad_norm": 1.0992094278335571, + "learning_rate": 3.221442885771543e-05, + "loss": 0.0473, + "step": 1787 + }, + { + "epoch": 7.152, + "grad_norm": 1.235259771347046, + "learning_rate": 3.2204408817635274e-05, + "loss": 0.0644, + "step": 1788 + }, + { + "epoch": 7.156, + "grad_norm": 1.3004990816116333, + "learning_rate": 3.2194388777555116e-05, + "loss": 
0.0675, + "step": 1789 + }, + { + "epoch": 7.16, + "grad_norm": 1.2756885290145874, + "learning_rate": 3.218436873747495e-05, + "loss": 0.067, + "step": 1790 + }, + { + "epoch": 7.164, + "grad_norm": 1.1897791624069214, + "learning_rate": 3.217434869739479e-05, + "loss": 0.0483, + "step": 1791 + }, + { + "epoch": 7.168, + "grad_norm": 1.2008836269378662, + "learning_rate": 3.216432865731463e-05, + "loss": 0.0651, + "step": 1792 + }, + { + "epoch": 7.172, + "grad_norm": 1.4409757852554321, + "learning_rate": 3.215430861723447e-05, + "loss": 0.0662, + "step": 1793 + }, + { + "epoch": 7.176, + "grad_norm": 1.4043469429016113, + "learning_rate": 3.214428857715431e-05, + "loss": 0.0937, + "step": 1794 + }, + { + "epoch": 7.18, + "grad_norm": 1.6467530727386475, + "learning_rate": 3.213426853707415e-05, + "loss": 0.0913, + "step": 1795 + }, + { + "epoch": 7.184, + "grad_norm": 1.4349619150161743, + "learning_rate": 3.212424849699399e-05, + "loss": 0.0623, + "step": 1796 + }, + { + "epoch": 7.188, + "grad_norm": 1.2570191621780396, + "learning_rate": 3.211422845691383e-05, + "loss": 0.0597, + "step": 1797 + }, + { + "epoch": 7.192, + "grad_norm": 1.2177025079727173, + "learning_rate": 3.210420841683367e-05, + "loss": 0.0568, + "step": 1798 + }, + { + "epoch": 7.196, + "grad_norm": 1.1672996282577515, + "learning_rate": 3.209418837675351e-05, + "loss": 0.0621, + "step": 1799 + }, + { + "epoch": 7.2, + "grad_norm": 1.3348926305770874, + "learning_rate": 3.208416833667335e-05, + "loss": 0.0656, + "step": 1800 + }, + { + "epoch": 7.204, + "grad_norm": 1.202222466468811, + "learning_rate": 3.2074148296593184e-05, + "loss": 0.0549, + "step": 1801 + }, + { + "epoch": 7.208, + "grad_norm": 1.333136796951294, + "learning_rate": 3.2064128256513025e-05, + "loss": 0.0645, + "step": 1802 + }, + { + "epoch": 7.212, + "grad_norm": 1.3489561080932617, + "learning_rate": 3.2054108216432866e-05, + "loss": 0.0583, + "step": 1803 + }, + { + "epoch": 7.216, + "grad_norm": 1.4054896831512451, 
+ "learning_rate": 3.204408817635271e-05, + "loss": 0.0581, + "step": 1804 + }, + { + "epoch": 7.22, + "grad_norm": 1.1050463914871216, + "learning_rate": 3.203406813627255e-05, + "loss": 0.0552, + "step": 1805 + }, + { + "epoch": 7.224, + "grad_norm": 0.9208634495735168, + "learning_rate": 3.202404809619239e-05, + "loss": 0.0349, + "step": 1806 + }, + { + "epoch": 7.228, + "grad_norm": 1.2044702768325806, + "learning_rate": 3.2014028056112225e-05, + "loss": 0.0567, + "step": 1807 + }, + { + "epoch": 7.232, + "grad_norm": 1.2432702779769897, + "learning_rate": 3.2004008016032066e-05, + "loss": 0.0576, + "step": 1808 + }, + { + "epoch": 7.236, + "grad_norm": 1.2744909524917603, + "learning_rate": 3.199398797595191e-05, + "loss": 0.0682, + "step": 1809 + }, + { + "epoch": 7.24, + "grad_norm": 1.3941128253936768, + "learning_rate": 3.198396793587174e-05, + "loss": 0.0596, + "step": 1810 + }, + { + "epoch": 7.244, + "grad_norm": 1.7390588521957397, + "learning_rate": 3.197394789579158e-05, + "loss": 0.0646, + "step": 1811 + }, + { + "epoch": 7.248, + "grad_norm": 1.6309089660644531, + "learning_rate": 3.1963927855711424e-05, + "loss": 0.0645, + "step": 1812 + }, + { + "epoch": 7.252, + "grad_norm": 1.3789124488830566, + "learning_rate": 3.195390781563126e-05, + "loss": 0.0684, + "step": 1813 + }, + { + "epoch": 7.256, + "grad_norm": 1.2757648229599, + "learning_rate": 3.19438877755511e-05, + "loss": 0.0706, + "step": 1814 + }, + { + "epoch": 7.26, + "grad_norm": 1.0513287782669067, + "learning_rate": 3.193386773547095e-05, + "loss": 0.0525, + "step": 1815 + }, + { + "epoch": 7.264, + "grad_norm": 1.4275181293487549, + "learning_rate": 3.192384769539078e-05, + "loss": 0.0563, + "step": 1816 + }, + { + "epoch": 7.268, + "grad_norm": 1.3130202293395996, + "learning_rate": 3.1913827655310624e-05, + "loss": 0.0626, + "step": 1817 + }, + { + "epoch": 7.272, + "grad_norm": 1.3455928564071655, + "learning_rate": 3.1903807615230465e-05, + "loss": 0.0688, + "step": 1818 + }, + { 
+ "epoch": 7.276, + "grad_norm": 1.3660778999328613, + "learning_rate": 3.18937875751503e-05, + "loss": 0.0666, + "step": 1819 + }, + { + "epoch": 7.28, + "grad_norm": 1.1925787925720215, + "learning_rate": 3.188376753507014e-05, + "loss": 0.0685, + "step": 1820 + }, + { + "epoch": 7.284, + "grad_norm": 1.4987269639968872, + "learning_rate": 3.187374749498998e-05, + "loss": 0.0747, + "step": 1821 + }, + { + "epoch": 7.288, + "grad_norm": 1.1474624872207642, + "learning_rate": 3.186372745490982e-05, + "loss": 0.0558, + "step": 1822 + }, + { + "epoch": 7.292, + "grad_norm": 1.4372210502624512, + "learning_rate": 3.185370741482966e-05, + "loss": 0.068, + "step": 1823 + }, + { + "epoch": 7.296, + "grad_norm": 1.5136364698410034, + "learning_rate": 3.18436873747495e-05, + "loss": 0.0852, + "step": 1824 + }, + { + "epoch": 7.3, + "grad_norm": 0.8756834268569946, + "learning_rate": 3.183366733466934e-05, + "loss": 0.0306, + "step": 1825 + }, + { + "epoch": 7.304, + "grad_norm": 1.6666139364242554, + "learning_rate": 3.182364729458918e-05, + "loss": 0.0818, + "step": 1826 + }, + { + "epoch": 7.308, + "grad_norm": 1.2882213592529297, + "learning_rate": 3.181362725450902e-05, + "loss": 0.0663, + "step": 1827 + }, + { + "epoch": 7.312, + "grad_norm": 1.2303308248519897, + "learning_rate": 3.180360721442886e-05, + "loss": 0.0696, + "step": 1828 + }, + { + "epoch": 7.316, + "grad_norm": 1.3530824184417725, + "learning_rate": 3.17935871743487e-05, + "loss": 0.0665, + "step": 1829 + }, + { + "epoch": 7.32, + "grad_norm": 1.375704288482666, + "learning_rate": 3.178356713426854e-05, + "loss": 0.0606, + "step": 1830 + }, + { + "epoch": 7.324, + "grad_norm": 1.5587910413742065, + "learning_rate": 3.1773547094188375e-05, + "loss": 0.0797, + "step": 1831 + }, + { + "epoch": 7.328, + "grad_norm": 1.2583500146865845, + "learning_rate": 3.1763527054108216e-05, + "loss": 0.0551, + "step": 1832 + }, + { + "epoch": 7.332, + "grad_norm": 1.2027583122253418, + "learning_rate": 
3.175350701402806e-05, + "loss": 0.0578, + "step": 1833 + }, + { + "epoch": 7.336, + "grad_norm": 0.9741259813308716, + "learning_rate": 3.174348697394789e-05, + "loss": 0.0362, + "step": 1834 + }, + { + "epoch": 7.34, + "grad_norm": 1.3720448017120361, + "learning_rate": 3.173346693386774e-05, + "loss": 0.0693, + "step": 1835 + }, + { + "epoch": 7.344, + "grad_norm": 1.2734616994857788, + "learning_rate": 3.172344689378758e-05, + "loss": 0.0569, + "step": 1836 + }, + { + "epoch": 7.348, + "grad_norm": 1.4324922561645508, + "learning_rate": 3.1713426853707416e-05, + "loss": 0.0765, + "step": 1837 + }, + { + "epoch": 7.352, + "grad_norm": 1.2667415142059326, + "learning_rate": 3.170340681362726e-05, + "loss": 0.0605, + "step": 1838 + }, + { + "epoch": 7.356, + "grad_norm": 1.3741072416305542, + "learning_rate": 3.16933867735471e-05, + "loss": 0.0634, + "step": 1839 + }, + { + "epoch": 7.36, + "grad_norm": 1.2768627405166626, + "learning_rate": 3.168336673346693e-05, + "loss": 0.0575, + "step": 1840 + }, + { + "epoch": 7.364, + "grad_norm": 1.393515944480896, + "learning_rate": 3.1673346693386774e-05, + "loss": 0.0642, + "step": 1841 + }, + { + "epoch": 7.368, + "grad_norm": 1.6649253368377686, + "learning_rate": 3.1663326653306616e-05, + "loss": 0.0837, + "step": 1842 + }, + { + "epoch": 7.372, + "grad_norm": 1.5157742500305176, + "learning_rate": 3.165330661322645e-05, + "loss": 0.07, + "step": 1843 + }, + { + "epoch": 7.376, + "grad_norm": 1.4178087711334229, + "learning_rate": 3.16432865731463e-05, + "loss": 0.0717, + "step": 1844 + }, + { + "epoch": 7.38, + "grad_norm": 1.1105268001556396, + "learning_rate": 3.163326653306614e-05, + "loss": 0.0495, + "step": 1845 + }, + { + "epoch": 7.384, + "grad_norm": 1.3235563039779663, + "learning_rate": 3.1623246492985974e-05, + "loss": 0.0687, + "step": 1846 + }, + { + "epoch": 7.388, + "grad_norm": 1.3781366348266602, + "learning_rate": 3.1613226452905815e-05, + "loss": 0.0741, + "step": 1847 + }, + { + "epoch": 7.392, + 
"grad_norm": 1.330330491065979, + "learning_rate": 3.1603206412825656e-05, + "loss": 0.061, + "step": 1848 + }, + { + "epoch": 7.396, + "grad_norm": 1.3802903890609741, + "learning_rate": 3.159318637274549e-05, + "loss": 0.068, + "step": 1849 + }, + { + "epoch": 7.4, + "grad_norm": 0.8295315504074097, + "learning_rate": 3.158316633266533e-05, + "loss": 0.0321, + "step": 1850 + }, + { + "epoch": 7.404, + "grad_norm": 1.4891018867492676, + "learning_rate": 3.1573146292585173e-05, + "loss": 0.0585, + "step": 1851 + }, + { + "epoch": 7.408, + "grad_norm": 1.496936321258545, + "learning_rate": 3.156312625250501e-05, + "loss": 0.0689, + "step": 1852 + }, + { + "epoch": 7.412, + "grad_norm": 1.4590027332305908, + "learning_rate": 3.155310621242485e-05, + "loss": 0.0722, + "step": 1853 + }, + { + "epoch": 7.416, + "grad_norm": 1.5951083898544312, + "learning_rate": 3.154308617234469e-05, + "loss": 0.0666, + "step": 1854 + }, + { + "epoch": 7.42, + "grad_norm": 1.2343343496322632, + "learning_rate": 3.153306613226453e-05, + "loss": 0.07, + "step": 1855 + }, + { + "epoch": 7.424, + "grad_norm": 1.3191311359405518, + "learning_rate": 3.152304609218437e-05, + "loss": 0.0684, + "step": 1856 + }, + { + "epoch": 7.428, + "grad_norm": 1.3911070823669434, + "learning_rate": 3.151302605210421e-05, + "loss": 0.0812, + "step": 1857 + }, + { + "epoch": 7.432, + "grad_norm": 1.2717417478561401, + "learning_rate": 3.150300601202405e-05, + "loss": 0.0693, + "step": 1858 + }, + { + "epoch": 7.436, + "grad_norm": 1.1313704252243042, + "learning_rate": 3.149298597194389e-05, + "loss": 0.0705, + "step": 1859 + }, + { + "epoch": 7.44, + "grad_norm": 1.0588295459747314, + "learning_rate": 3.148296593186373e-05, + "loss": 0.0578, + "step": 1860 + }, + { + "epoch": 7.444, + "grad_norm": 1.2012568712234497, + "learning_rate": 3.1472945891783566e-05, + "loss": 0.0607, + "step": 1861 + }, + { + "epoch": 7.448, + "grad_norm": 2.303619861602783, + "learning_rate": 3.146292585170341e-05, + "loss": 
0.0796, + "step": 1862 + }, + { + "epoch": 7.452, + "grad_norm": 1.1877150535583496, + "learning_rate": 3.145290581162325e-05, + "loss": 0.0563, + "step": 1863 + }, + { + "epoch": 7.456, + "grad_norm": 1.2776094675064087, + "learning_rate": 3.144288577154309e-05, + "loss": 0.0591, + "step": 1864 + }, + { + "epoch": 7.46, + "grad_norm": 2.1104819774627686, + "learning_rate": 3.143286573146293e-05, + "loss": 0.0901, + "step": 1865 + }, + { + "epoch": 7.464, + "grad_norm": 1.0128964185714722, + "learning_rate": 3.1422845691382766e-05, + "loss": 0.0475, + "step": 1866 + }, + { + "epoch": 7.468, + "grad_norm": 1.3107222318649292, + "learning_rate": 3.141282565130261e-05, + "loss": 0.0709, + "step": 1867 + }, + { + "epoch": 7.4719999999999995, + "grad_norm": 1.4184036254882812, + "learning_rate": 3.140280561122245e-05, + "loss": 0.0801, + "step": 1868 + }, + { + "epoch": 7.476, + "grad_norm": 1.6373494863510132, + "learning_rate": 3.139278557114228e-05, + "loss": 0.0716, + "step": 1869 + }, + { + "epoch": 7.48, + "grad_norm": 1.555827260017395, + "learning_rate": 3.1382765531062124e-05, + "loss": 0.0653, + "step": 1870 + }, + { + "epoch": 7.484, + "grad_norm": 1.7851310968399048, + "learning_rate": 3.1372745490981965e-05, + "loss": 0.0892, + "step": 1871 + }, + { + "epoch": 7.4879999999999995, + "grad_norm": 1.5268523693084717, + "learning_rate": 3.13627254509018e-05, + "loss": 0.0688, + "step": 1872 + }, + { + "epoch": 7.492, + "grad_norm": 1.477304220199585, + "learning_rate": 3.135270541082164e-05, + "loss": 0.0733, + "step": 1873 + }, + { + "epoch": 7.496, + "grad_norm": 1.2932136058807373, + "learning_rate": 3.134268537074149e-05, + "loss": 0.0636, + "step": 1874 + }, + { + "epoch": 7.5, + "grad_norm": 1.4812061786651611, + "learning_rate": 3.1332665330661324e-05, + "loss": 0.0747, + "step": 1875 + }, + { + "epoch": 7.504, + "grad_norm": 1.303823471069336, + "learning_rate": 3.1322645290581165e-05, + "loss": 0.0616, + "step": 1876 + }, + { + "epoch": 7.508, + 
"grad_norm": 1.4868488311767578, + "learning_rate": 3.1312625250501006e-05, + "loss": 0.0898, + "step": 1877 + }, + { + "epoch": 7.5120000000000005, + "grad_norm": 1.6528791189193726, + "learning_rate": 3.130260521042084e-05, + "loss": 0.0661, + "step": 1878 + }, + { + "epoch": 7.516, + "grad_norm": 1.552954077720642, + "learning_rate": 3.129258517034068e-05, + "loss": 0.0716, + "step": 1879 + }, + { + "epoch": 7.52, + "grad_norm": 1.2300714254379272, + "learning_rate": 3.128256513026052e-05, + "loss": 0.0625, + "step": 1880 + }, + { + "epoch": 7.524, + "grad_norm": 1.54698646068573, + "learning_rate": 3.127254509018036e-05, + "loss": 0.0765, + "step": 1881 + }, + { + "epoch": 7.5280000000000005, + "grad_norm": 1.5242317914962769, + "learning_rate": 3.12625250501002e-05, + "loss": 0.0922, + "step": 1882 + }, + { + "epoch": 7.532, + "grad_norm": 1.4363588094711304, + "learning_rate": 3.125250501002004e-05, + "loss": 0.0784, + "step": 1883 + }, + { + "epoch": 7.536, + "grad_norm": 1.3856273889541626, + "learning_rate": 3.124248496993988e-05, + "loss": 0.0669, + "step": 1884 + }, + { + "epoch": 7.54, + "grad_norm": 1.296464443206787, + "learning_rate": 3.123246492985972e-05, + "loss": 0.0713, + "step": 1885 + }, + { + "epoch": 7.5440000000000005, + "grad_norm": 1.3681565523147583, + "learning_rate": 3.1222444889779564e-05, + "loss": 0.0678, + "step": 1886 + }, + { + "epoch": 7.548, + "grad_norm": 1.34380304813385, + "learning_rate": 3.12124248496994e-05, + "loss": 0.0552, + "step": 1887 + }, + { + "epoch": 7.552, + "grad_norm": 1.2859309911727905, + "learning_rate": 3.120240480961924e-05, + "loss": 0.0602, + "step": 1888 + }, + { + "epoch": 7.556, + "grad_norm": 1.4414995908737183, + "learning_rate": 3.119238476953908e-05, + "loss": 0.0646, + "step": 1889 + }, + { + "epoch": 7.5600000000000005, + "grad_norm": 1.549656629562378, + "learning_rate": 3.1182364729458916e-05, + "loss": 0.0848, + "step": 1890 + }, + { + "epoch": 7.564, + "grad_norm": 1.274708867073059, + 
"learning_rate": 3.117234468937876e-05, + "loss": 0.0634, + "step": 1891 + }, + { + "epoch": 7.568, + "grad_norm": 2.1878645420074463, + "learning_rate": 3.11623246492986e-05, + "loss": 0.0593, + "step": 1892 + }, + { + "epoch": 7.572, + "grad_norm": 1.367893099784851, + "learning_rate": 3.115230460921843e-05, + "loss": 0.0637, + "step": 1893 + }, + { + "epoch": 7.576, + "grad_norm": 1.5797450542449951, + "learning_rate": 3.114228456913828e-05, + "loss": 0.0853, + "step": 1894 + }, + { + "epoch": 7.58, + "grad_norm": 1.5319100618362427, + "learning_rate": 3.113226452905812e-05, + "loss": 0.0795, + "step": 1895 + }, + { + "epoch": 7.584, + "grad_norm": 1.3238308429718018, + "learning_rate": 3.112224448897796e-05, + "loss": 0.0622, + "step": 1896 + }, + { + "epoch": 7.588, + "grad_norm": 1.4970932006835938, + "learning_rate": 3.11122244488978e-05, + "loss": 0.0776, + "step": 1897 + }, + { + "epoch": 7.592, + "grad_norm": 0.7695464491844177, + "learning_rate": 3.110220440881764e-05, + "loss": 0.0338, + "step": 1898 + }, + { + "epoch": 7.596, + "grad_norm": 1.3819239139556885, + "learning_rate": 3.1092184368737474e-05, + "loss": 0.0804, + "step": 1899 + }, + { + "epoch": 7.6, + "grad_norm": 1.5732004642486572, + "learning_rate": 3.1082164328657315e-05, + "loss": 0.0869, + "step": 1900 + }, + { + "epoch": 7.604, + "grad_norm": 1.3113768100738525, + "learning_rate": 3.1072144288577156e-05, + "loss": 0.0659, + "step": 1901 + }, + { + "epoch": 7.608, + "grad_norm": 1.4455841779708862, + "learning_rate": 3.106212424849699e-05, + "loss": 0.0847, + "step": 1902 + }, + { + "epoch": 7.612, + "grad_norm": 0.8127650618553162, + "learning_rate": 3.105210420841684e-05, + "loss": 0.044, + "step": 1903 + }, + { + "epoch": 7.616, + "grad_norm": 1.613440990447998, + "learning_rate": 3.104208416833668e-05, + "loss": 0.0779, + "step": 1904 + }, + { + "epoch": 7.62, + "grad_norm": 1.3210656642913818, + "learning_rate": 3.1032064128256515e-05, + "loss": 0.058, + "step": 1905 + }, + { + 
"epoch": 7.624, + "grad_norm": 1.4061453342437744, + "learning_rate": 3.1022044088176356e-05, + "loss": 0.0759, + "step": 1906 + }, + { + "epoch": 7.628, + "grad_norm": 1.5052586793899536, + "learning_rate": 3.10120240480962e-05, + "loss": 0.0785, + "step": 1907 + }, + { + "epoch": 7.632, + "grad_norm": 1.3261641263961792, + "learning_rate": 3.100200400801603e-05, + "loss": 0.0922, + "step": 1908 + }, + { + "epoch": 7.636, + "grad_norm": 1.5588467121124268, + "learning_rate": 3.099198396793587e-05, + "loss": 0.0703, + "step": 1909 + }, + { + "epoch": 7.64, + "grad_norm": 1.2506775856018066, + "learning_rate": 3.0981963927855714e-05, + "loss": 0.0537, + "step": 1910 + }, + { + "epoch": 7.644, + "grad_norm": 1.6546599864959717, + "learning_rate": 3.097194388777555e-05, + "loss": 0.0913, + "step": 1911 + }, + { + "epoch": 7.648, + "grad_norm": 1.600527286529541, + "learning_rate": 3.096192384769539e-05, + "loss": 0.0788, + "step": 1912 + }, + { + "epoch": 7.652, + "grad_norm": 1.5613360404968262, + "learning_rate": 3.095190380761523e-05, + "loss": 0.068, + "step": 1913 + }, + { + "epoch": 7.656, + "grad_norm": 1.5077072381973267, + "learning_rate": 3.094188376753507e-05, + "loss": 0.0729, + "step": 1914 + }, + { + "epoch": 7.66, + "grad_norm": 1.6048588752746582, + "learning_rate": 3.0931863727454914e-05, + "loss": 0.0876, + "step": 1915 + }, + { + "epoch": 7.664, + "grad_norm": 1.4152404069900513, + "learning_rate": 3.0921843687374755e-05, + "loss": 0.0876, + "step": 1916 + }, + { + "epoch": 7.668, + "grad_norm": 1.5331817865371704, + "learning_rate": 3.091182364729459e-05, + "loss": 0.0765, + "step": 1917 + }, + { + "epoch": 7.672, + "grad_norm": 1.5034297704696655, + "learning_rate": 3.090180360721443e-05, + "loss": 0.0618, + "step": 1918 + }, + { + "epoch": 7.676, + "grad_norm": 1.4812408685684204, + "learning_rate": 3.089178356713427e-05, + "loss": 0.0791, + "step": 1919 + }, + { + "epoch": 7.68, + "grad_norm": 1.4071189165115356, + "learning_rate": 
3.088176352705411e-05, + "loss": 0.0773, + "step": 1920 + }, + { + "epoch": 7.684, + "grad_norm": 1.3819324970245361, + "learning_rate": 3.087174348697395e-05, + "loss": 0.0734, + "step": 1921 + }, + { + "epoch": 7.688, + "grad_norm": 1.3420048952102661, + "learning_rate": 3.086172344689379e-05, + "loss": 0.0659, + "step": 1922 + }, + { + "epoch": 7.692, + "grad_norm": 1.4877381324768066, + "learning_rate": 3.085170340681363e-05, + "loss": 0.0729, + "step": 1923 + }, + { + "epoch": 7.696, + "grad_norm": 1.2913259267807007, + "learning_rate": 3.084168336673347e-05, + "loss": 0.0593, + "step": 1924 + }, + { + "epoch": 7.7, + "grad_norm": 1.3754169940948486, + "learning_rate": 3.0831663326653306e-05, + "loss": 0.0668, + "step": 1925 + }, + { + "epoch": 7.704, + "grad_norm": 1.3683477640151978, + "learning_rate": 3.082164328657315e-05, + "loss": 0.0736, + "step": 1926 + }, + { + "epoch": 7.708, + "grad_norm": 1.151803970336914, + "learning_rate": 3.081162324649299e-05, + "loss": 0.0581, + "step": 1927 + }, + { + "epoch": 7.712, + "grad_norm": 1.4740242958068848, + "learning_rate": 3.0801603206412824e-05, + "loss": 0.0903, + "step": 1928 + }, + { + "epoch": 7.716, + "grad_norm": 1.588667869567871, + "learning_rate": 3.0791583166332665e-05, + "loss": 0.083, + "step": 1929 + }, + { + "epoch": 7.72, + "grad_norm": 1.4795126914978027, + "learning_rate": 3.0781563126252506e-05, + "loss": 0.0743, + "step": 1930 + }, + { + "epoch": 7.724, + "grad_norm": 1.3423376083374023, + "learning_rate": 3.077154308617235e-05, + "loss": 0.0701, + "step": 1931 + }, + { + "epoch": 7.728, + "grad_norm": 1.1936395168304443, + "learning_rate": 3.076152304609218e-05, + "loss": 0.0645, + "step": 1932 + }, + { + "epoch": 7.732, + "grad_norm": 1.1713727712631226, + "learning_rate": 3.075150300601203e-05, + "loss": 0.0536, + "step": 1933 + }, + { + "epoch": 7.736, + "grad_norm": 1.3141846656799316, + "learning_rate": 3.0741482965931864e-05, + "loss": 0.0723, + "step": 1934 + }, + { + "epoch": 7.74, 
+ "grad_norm": 1.4886362552642822, + "learning_rate": 3.0731462925851706e-05, + "loss": 0.0902, + "step": 1935 + }, + { + "epoch": 7.744, + "grad_norm": 1.4828537702560425, + "learning_rate": 3.072144288577155e-05, + "loss": 0.0807, + "step": 1936 + }, + { + "epoch": 7.748, + "grad_norm": 1.341299295425415, + "learning_rate": 3.071142284569138e-05, + "loss": 0.0692, + "step": 1937 + }, + { + "epoch": 7.752, + "grad_norm": 1.3996098041534424, + "learning_rate": 3.070140280561122e-05, + "loss": 0.0658, + "step": 1938 + }, + { + "epoch": 7.756, + "grad_norm": 1.3277019262313843, + "learning_rate": 3.0691382765531064e-05, + "loss": 0.0825, + "step": 1939 + }, + { + "epoch": 7.76, + "grad_norm": 1.4799069166183472, + "learning_rate": 3.06813627254509e-05, + "loss": 0.0631, + "step": 1940 + }, + { + "epoch": 7.764, + "grad_norm": 1.4274985790252686, + "learning_rate": 3.067134268537074e-05, + "loss": 0.0756, + "step": 1941 + }, + { + "epoch": 7.768, + "grad_norm": 1.5825673341751099, + "learning_rate": 3.066132264529058e-05, + "loss": 0.0992, + "step": 1942 + }, + { + "epoch": 7.772, + "grad_norm": 1.4629828929901123, + "learning_rate": 3.065130260521042e-05, + "loss": 0.0893, + "step": 1943 + }, + { + "epoch": 7.776, + "grad_norm": 1.2774440050125122, + "learning_rate": 3.0641282565130264e-05, + "loss": 0.059, + "step": 1944 + }, + { + "epoch": 7.78, + "grad_norm": 1.3957229852676392, + "learning_rate": 3.0631262525050105e-05, + "loss": 0.0699, + "step": 1945 + }, + { + "epoch": 7.784, + "grad_norm": 1.5899467468261719, + "learning_rate": 3.062124248496994e-05, + "loss": 0.0865, + "step": 1946 + }, + { + "epoch": 7.788, + "grad_norm": 1.4467995166778564, + "learning_rate": 3.061122244488978e-05, + "loss": 0.0765, + "step": 1947 + }, + { + "epoch": 7.792, + "grad_norm": 1.4088798761367798, + "learning_rate": 3.060120240480962e-05, + "loss": 0.07, + "step": 1948 + }, + { + "epoch": 7.796, + "grad_norm": 1.3148294687271118, + "learning_rate": 3.0591182364729457e-05, + 
"loss": 0.0845, + "step": 1949 + }, + { + "epoch": 7.8, + "grad_norm": 1.445083737373352, + "learning_rate": 3.05811623246493e-05, + "loss": 0.0739, + "step": 1950 + }, + { + "epoch": 7.804, + "grad_norm": 1.4596608877182007, + "learning_rate": 3.057114228456914e-05, + "loss": 0.0684, + "step": 1951 + }, + { + "epoch": 7.808, + "grad_norm": 1.3999098539352417, + "learning_rate": 3.0561122244488974e-05, + "loss": 0.0638, + "step": 1952 + }, + { + "epoch": 7.812, + "grad_norm": 1.6097480058670044, + "learning_rate": 3.055110220440882e-05, + "loss": 0.0812, + "step": 1953 + }, + { + "epoch": 7.816, + "grad_norm": 1.6041979789733887, + "learning_rate": 3.054108216432866e-05, + "loss": 0.0989, + "step": 1954 + }, + { + "epoch": 7.82, + "grad_norm": 1.4978128671646118, + "learning_rate": 3.05310621242485e-05, + "loss": 0.0799, + "step": 1955 + }, + { + "epoch": 7.824, + "grad_norm": 1.465773582458496, + "learning_rate": 3.052104208416834e-05, + "loss": 0.0827, + "step": 1956 + }, + { + "epoch": 7.828, + "grad_norm": 1.273937463760376, + "learning_rate": 3.0511022044088177e-05, + "loss": 0.0664, + "step": 1957 + }, + { + "epoch": 7.832, + "grad_norm": 1.212803602218628, + "learning_rate": 3.0501002004008018e-05, + "loss": 0.0612, + "step": 1958 + }, + { + "epoch": 7.836, + "grad_norm": 1.3836429119110107, + "learning_rate": 3.0490981963927856e-05, + "loss": 0.0711, + "step": 1959 + }, + { + "epoch": 7.84, + "grad_norm": 1.5588130950927734, + "learning_rate": 3.0480961923847694e-05, + "loss": 0.0792, + "step": 1960 + }, + { + "epoch": 7.844, + "grad_norm": 1.5219345092773438, + "learning_rate": 3.0470941883767535e-05, + "loss": 0.0826, + "step": 1961 + }, + { + "epoch": 7.848, + "grad_norm": 1.6055107116699219, + "learning_rate": 3.046092184368738e-05, + "loss": 0.0766, + "step": 1962 + }, + { + "epoch": 7.852, + "grad_norm": 1.2326933145523071, + "learning_rate": 3.0450901803607218e-05, + "loss": 0.0622, + "step": 1963 + }, + { + "epoch": 7.856, + "grad_norm": 
1.6281651258468628, + "learning_rate": 3.0440881763527055e-05, + "loss": 0.0789, + "step": 1964 + }, + { + "epoch": 7.86, + "grad_norm": 1.5395461320877075, + "learning_rate": 3.0430861723446897e-05, + "loss": 0.0793, + "step": 1965 + }, + { + "epoch": 7.864, + "grad_norm": 1.4532355070114136, + "learning_rate": 3.0420841683366735e-05, + "loss": 0.0667, + "step": 1966 + }, + { + "epoch": 7.868, + "grad_norm": 1.4249470233917236, + "learning_rate": 3.0410821643286576e-05, + "loss": 0.0675, + "step": 1967 + }, + { + "epoch": 7.872, + "grad_norm": 1.4712321758270264, + "learning_rate": 3.0400801603206414e-05, + "loss": 0.0563, + "step": 1968 + }, + { + "epoch": 7.876, + "grad_norm": 1.6727486848831177, + "learning_rate": 3.0390781563126252e-05, + "loss": 0.0915, + "step": 1969 + }, + { + "epoch": 7.88, + "grad_norm": 1.4988911151885986, + "learning_rate": 3.0380761523046093e-05, + "loss": 0.0754, + "step": 1970 + }, + { + "epoch": 7.884, + "grad_norm": 1.4844058752059937, + "learning_rate": 3.037074148296593e-05, + "loss": 0.0777, + "step": 1971 + }, + { + "epoch": 7.888, + "grad_norm": 1.4595845937728882, + "learning_rate": 3.0360721442885776e-05, + "loss": 0.0714, + "step": 1972 + }, + { + "epoch": 7.892, + "grad_norm": 1.6251410245895386, + "learning_rate": 3.0350701402805613e-05, + "loss": 0.0891, + "step": 1973 + }, + { + "epoch": 7.896, + "grad_norm": 1.4736934900283813, + "learning_rate": 3.0340681362725455e-05, + "loss": 0.0776, + "step": 1974 + }, + { + "epoch": 7.9, + "grad_norm": 1.6098906993865967, + "learning_rate": 3.0330661322645293e-05, + "loss": 0.0738, + "step": 1975 + }, + { + "epoch": 7.904, + "grad_norm": 1.4233413934707642, + "learning_rate": 3.032064128256513e-05, + "loss": 0.0681, + "step": 1976 + }, + { + "epoch": 7.908, + "grad_norm": 1.4190319776535034, + "learning_rate": 3.0310621242484972e-05, + "loss": 0.0941, + "step": 1977 + }, + { + "epoch": 7.912, + "grad_norm": 1.4623279571533203, + "learning_rate": 3.030060120240481e-05, + "loss": 
0.0736, + "step": 1978 + }, + { + "epoch": 7.916, + "grad_norm": 1.6991535425186157, + "learning_rate": 3.029058116232465e-05, + "loss": 0.0896, + "step": 1979 + }, + { + "epoch": 7.92, + "grad_norm": 1.6135594844818115, + "learning_rate": 3.028056112224449e-05, + "loss": 0.0841, + "step": 1980 + }, + { + "epoch": 7.924, + "grad_norm": 1.5118712186813354, + "learning_rate": 3.0270541082164327e-05, + "loss": 0.0966, + "step": 1981 + }, + { + "epoch": 7.928, + "grad_norm": 1.3589533567428589, + "learning_rate": 3.026052104208417e-05, + "loss": 0.0725, + "step": 1982 + }, + { + "epoch": 7.932, + "grad_norm": 1.5694752931594849, + "learning_rate": 3.0250501002004013e-05, + "loss": 0.0785, + "step": 1983 + }, + { + "epoch": 7.936, + "grad_norm": 1.42135751247406, + "learning_rate": 3.024048096192385e-05, + "loss": 0.0802, + "step": 1984 + }, + { + "epoch": 7.9399999999999995, + "grad_norm": 1.4186774492263794, + "learning_rate": 3.023046092184369e-05, + "loss": 0.071, + "step": 1985 + }, + { + "epoch": 7.944, + "grad_norm": 0.7611780166625977, + "learning_rate": 3.022044088176353e-05, + "loss": 0.0184, + "step": 1986 + }, + { + "epoch": 7.948, + "grad_norm": 1.5581532716751099, + "learning_rate": 3.0210420841683368e-05, + "loss": 0.0853, + "step": 1987 + }, + { + "epoch": 7.952, + "grad_norm": 1.394500494003296, + "learning_rate": 3.0200400801603206e-05, + "loss": 0.0915, + "step": 1988 + }, + { + "epoch": 7.9559999999999995, + "grad_norm": 1.33413565158844, + "learning_rate": 3.0190380761523047e-05, + "loss": 0.0689, + "step": 1989 + }, + { + "epoch": 7.96, + "grad_norm": 1.417820692062378, + "learning_rate": 3.0180360721442885e-05, + "loss": 0.0861, + "step": 1990 + }, + { + "epoch": 7.964, + "grad_norm": 1.309669852256775, + "learning_rate": 3.0170340681362723e-05, + "loss": 0.0703, + "step": 1991 + }, + { + "epoch": 7.968, + "grad_norm": 1.4722129106521606, + "learning_rate": 3.0160320641282567e-05, + "loss": 0.0636, + "step": 1992 + }, + { + "epoch": 
7.9719999999999995, + "grad_norm": 1.4907193183898926, + "learning_rate": 3.015030060120241e-05, + "loss": 0.0699, + "step": 1993 + }, + { + "epoch": 7.976, + "grad_norm": 1.5335925817489624, + "learning_rate": 3.0140280561122247e-05, + "loss": 0.0712, + "step": 1994 + }, + { + "epoch": 7.98, + "grad_norm": 1.2255603075027466, + "learning_rate": 3.0130260521042088e-05, + "loss": 0.0602, + "step": 1995 + }, + { + "epoch": 7.984, + "grad_norm": 1.4105703830718994, + "learning_rate": 3.0120240480961926e-05, + "loss": 0.0731, + "step": 1996 + }, + { + "epoch": 7.9879999999999995, + "grad_norm": 1.5889180898666382, + "learning_rate": 3.0110220440881764e-05, + "loss": 0.0752, + "step": 1997 + }, + { + "epoch": 7.992, + "grad_norm": 1.6140540838241577, + "learning_rate": 3.0100200400801605e-05, + "loss": 0.0753, + "step": 1998 + }, + { + "epoch": 7.996, + "grad_norm": 1.826938509941101, + "learning_rate": 3.0090180360721443e-05, + "loss": 0.0852, + "step": 1999 + }, + { + "epoch": 8.0, + "grad_norm": 1.0301682949066162, + "learning_rate": 3.008016032064128e-05, + "loss": 0.045, + "step": 2000 + }, + { + "epoch": 8.004, + "grad_norm": 0.9636511206626892, + "learning_rate": 3.0070140280561122e-05, + "loss": 0.0422, + "step": 2001 + }, + { + "epoch": 8.008, + "grad_norm": 0.9918342232704163, + "learning_rate": 3.0060120240480967e-05, + "loss": 0.0413, + "step": 2002 + }, + { + "epoch": 8.012, + "grad_norm": 0.8967450857162476, + "learning_rate": 3.0050100200400805e-05, + "loss": 0.0343, + "step": 2003 + }, + { + "epoch": 8.016, + "grad_norm": 1.0246479511260986, + "learning_rate": 3.0040080160320642e-05, + "loss": 0.0399, + "step": 2004 + }, + { + "epoch": 8.02, + "grad_norm": 1.0252851247787476, + "learning_rate": 3.0030060120240484e-05, + "loss": 0.0458, + "step": 2005 + }, + { + "epoch": 8.024, + "grad_norm": 0.974763810634613, + "learning_rate": 3.002004008016032e-05, + "loss": 0.0437, + "step": 2006 + }, + { + "epoch": 8.028, + "grad_norm": 0.9569265842437744, + 
"learning_rate": 3.0010020040080163e-05, + "loss": 0.0364, + "step": 2007 + }, + { + "epoch": 8.032, + "grad_norm": 1.104304313659668, + "learning_rate": 3e-05, + "loss": 0.0397, + "step": 2008 + }, + { + "epoch": 8.036, + "grad_norm": 1.042075753211975, + "learning_rate": 2.998997995991984e-05, + "loss": 0.0319, + "step": 2009 + }, + { + "epoch": 8.04, + "grad_norm": 1.41454017162323, + "learning_rate": 2.997995991983968e-05, + "loss": 0.0519, + "step": 2010 + }, + { + "epoch": 8.044, + "grad_norm": 1.2228070497512817, + "learning_rate": 2.9969939879759525e-05, + "loss": 0.0471, + "step": 2011 + }, + { + "epoch": 8.048, + "grad_norm": 1.3753093481063843, + "learning_rate": 2.9959919839679363e-05, + "loss": 0.0426, + "step": 2012 + }, + { + "epoch": 8.052, + "grad_norm": 1.2249881029129028, + "learning_rate": 2.99498997995992e-05, + "loss": 0.0375, + "step": 2013 + }, + { + "epoch": 8.056, + "grad_norm": 1.1894644498825073, + "learning_rate": 2.993987975951904e-05, + "loss": 0.0388, + "step": 2014 + }, + { + "epoch": 8.06, + "grad_norm": 1.2309092283248901, + "learning_rate": 2.992985971943888e-05, + "loss": 0.0403, + "step": 2015 + }, + { + "epoch": 8.064, + "grad_norm": 1.237890362739563, + "learning_rate": 2.9919839679358717e-05, + "loss": 0.0414, + "step": 2016 + }, + { + "epoch": 8.068, + "grad_norm": 1.3843622207641602, + "learning_rate": 2.990981963927856e-05, + "loss": 0.0462, + "step": 2017 + }, + { + "epoch": 8.072, + "grad_norm": 1.204715609550476, + "learning_rate": 2.9899799599198397e-05, + "loss": 0.0455, + "step": 2018 + }, + { + "epoch": 8.076, + "grad_norm": 1.1877174377441406, + "learning_rate": 2.9889779559118235e-05, + "loss": 0.0429, + "step": 2019 + }, + { + "epoch": 8.08, + "grad_norm": 0.9619832634925842, + "learning_rate": 2.9879759519038076e-05, + "loss": 0.0344, + "step": 2020 + }, + { + "epoch": 8.084, + "grad_norm": 1.2770909070968628, + "learning_rate": 2.986973947895792e-05, + "loss": 0.0434, + "step": 2021 + }, + { + "epoch": 8.088, 
+ "grad_norm": 2.0295021533966064, + "learning_rate": 2.985971943887776e-05, + "loss": 0.0478, + "step": 2022 + }, + { + "epoch": 8.092, + "grad_norm": 1.0458402633666992, + "learning_rate": 2.98496993987976e-05, + "loss": 0.0407, + "step": 2023 + }, + { + "epoch": 8.096, + "grad_norm": 1.3169829845428467, + "learning_rate": 2.9839679358717438e-05, + "loss": 0.0383, + "step": 2024 + }, + { + "epoch": 8.1, + "grad_norm": 1.2271572351455688, + "learning_rate": 2.9829659318637275e-05, + "loss": 0.0426, + "step": 2025 + }, + { + "epoch": 8.104, + "grad_norm": 0.6691854596138, + "learning_rate": 2.9819639278557117e-05, + "loss": 0.0178, + "step": 2026 + }, + { + "epoch": 8.108, + "grad_norm": 1.2203044891357422, + "learning_rate": 2.9809619238476955e-05, + "loss": 0.049, + "step": 2027 + }, + { + "epoch": 8.112, + "grad_norm": 1.1004842519760132, + "learning_rate": 2.9799599198396793e-05, + "loss": 0.0413, + "step": 2028 + }, + { + "epoch": 8.116, + "grad_norm": 1.1500310897827148, + "learning_rate": 2.9789579158316634e-05, + "loss": 0.052, + "step": 2029 + }, + { + "epoch": 8.12, + "grad_norm": 1.0582787990570068, + "learning_rate": 2.9779559118236472e-05, + "loss": 0.0387, + "step": 2030 + }, + { + "epoch": 8.124, + "grad_norm": 1.1282780170440674, + "learning_rate": 2.9769539078156316e-05, + "loss": 0.0463, + "step": 2031 + }, + { + "epoch": 8.128, + "grad_norm": 1.2757964134216309, + "learning_rate": 2.9759519038076154e-05, + "loss": 0.0458, + "step": 2032 + }, + { + "epoch": 8.132, + "grad_norm": 1.3332687616348267, + "learning_rate": 2.9749498997995996e-05, + "loss": 0.0527, + "step": 2033 + }, + { + "epoch": 8.136, + "grad_norm": 1.269371747970581, + "learning_rate": 2.9739478957915833e-05, + "loss": 0.0555, + "step": 2034 + }, + { + "epoch": 8.14, + "grad_norm": 1.179306149482727, + "learning_rate": 2.9729458917835675e-05, + "loss": 0.0379, + "step": 2035 + }, + { + "epoch": 8.144, + "grad_norm": 1.3321940898895264, + "learning_rate": 2.9719438877755513e-05, + 
"loss": 0.0554, + "step": 2036 + }, + { + "epoch": 8.148, + "grad_norm": 0.9157149791717529, + "learning_rate": 2.970941883767535e-05, + "loss": 0.033, + "step": 2037 + }, + { + "epoch": 8.152, + "grad_norm": 1.1154097318649292, + "learning_rate": 2.9699398797595192e-05, + "loss": 0.0384, + "step": 2038 + }, + { + "epoch": 8.156, + "grad_norm": 0.9774281978607178, + "learning_rate": 2.968937875751503e-05, + "loss": 0.0333, + "step": 2039 + }, + { + "epoch": 8.16, + "grad_norm": 1.5343128442764282, + "learning_rate": 2.9679358717434868e-05, + "loss": 0.0353, + "step": 2040 + }, + { + "epoch": 8.164, + "grad_norm": 1.0981296300888062, + "learning_rate": 2.9669338677354712e-05, + "loss": 0.0372, + "step": 2041 + }, + { + "epoch": 8.168, + "grad_norm": 1.17277193069458, + "learning_rate": 2.9659318637274554e-05, + "loss": 0.0456, + "step": 2042 + }, + { + "epoch": 8.172, + "grad_norm": 1.0405935049057007, + "learning_rate": 2.964929859719439e-05, + "loss": 0.0431, + "step": 2043 + }, + { + "epoch": 8.176, + "grad_norm": 1.213351845741272, + "learning_rate": 2.963927855711423e-05, + "loss": 0.0469, + "step": 2044 + }, + { + "epoch": 8.18, + "grad_norm": 1.091701865196228, + "learning_rate": 2.962925851703407e-05, + "loss": 0.0409, + "step": 2045 + }, + { + "epoch": 8.184, + "grad_norm": 1.140825629234314, + "learning_rate": 2.961923847695391e-05, + "loss": 0.0399, + "step": 2046 + }, + { + "epoch": 8.188, + "grad_norm": 1.3966275453567505, + "learning_rate": 2.9609218436873746e-05, + "loss": 0.047, + "step": 2047 + }, + { + "epoch": 8.192, + "grad_norm": 1.0607054233551025, + "learning_rate": 2.9599198396793588e-05, + "loss": 0.0403, + "step": 2048 + }, + { + "epoch": 8.196, + "grad_norm": 1.3309967517852783, + "learning_rate": 2.9589178356713426e-05, + "loss": 0.0424, + "step": 2049 + }, + { + "epoch": 8.2, + "grad_norm": 1.007638931274414, + "learning_rate": 2.9579158316633267e-05, + "loss": 0.0319, + "step": 2050 + }, + { + "epoch": 8.204, + "grad_norm": 
1.2654892206192017, + "learning_rate": 2.956913827655311e-05, + "loss": 0.0453, + "step": 2051 + }, + { + "epoch": 8.208, + "grad_norm": 1.064241647720337, + "learning_rate": 2.955911823647295e-05, + "loss": 0.0436, + "step": 2052 + }, + { + "epoch": 8.212, + "grad_norm": 1.1412560939788818, + "learning_rate": 2.9549098196392787e-05, + "loss": 0.0434, + "step": 2053 + }, + { + "epoch": 8.216, + "grad_norm": 1.2644426822662354, + "learning_rate": 2.953907815631263e-05, + "loss": 0.0443, + "step": 2054 + }, + { + "epoch": 8.22, + "grad_norm": 1.153268814086914, + "learning_rate": 2.9529058116232467e-05, + "loss": 0.0446, + "step": 2055 + }, + { + "epoch": 8.224, + "grad_norm": 1.1821683645248413, + "learning_rate": 2.9519038076152304e-05, + "loss": 0.0403, + "step": 2056 + }, + { + "epoch": 8.228, + "grad_norm": 1.1083087921142578, + "learning_rate": 2.9509018036072146e-05, + "loss": 0.0376, + "step": 2057 + }, + { + "epoch": 8.232, + "grad_norm": 1.251597285270691, + "learning_rate": 2.9498997995991984e-05, + "loss": 0.0458, + "step": 2058 + }, + { + "epoch": 8.236, + "grad_norm": 1.3240916728973389, + "learning_rate": 2.948897795591182e-05, + "loss": 0.0505, + "step": 2059 + }, + { + "epoch": 8.24, + "grad_norm": 1.3797112703323364, + "learning_rate": 2.9478957915831663e-05, + "loss": 0.0468, + "step": 2060 + }, + { + "epoch": 8.244, + "grad_norm": 1.4539932012557983, + "learning_rate": 2.9468937875751507e-05, + "loss": 0.0459, + "step": 2061 + }, + { + "epoch": 8.248, + "grad_norm": 1.0563139915466309, + "learning_rate": 2.9458917835671345e-05, + "loss": 0.0375, + "step": 2062 + }, + { + "epoch": 8.252, + "grad_norm": 1.1842182874679565, + "learning_rate": 2.9448897795591183e-05, + "loss": 0.0441, + "step": 2063 + }, + { + "epoch": 8.256, + "grad_norm": 1.2775763273239136, + "learning_rate": 2.9438877755511024e-05, + "loss": 0.0487, + "step": 2064 + }, + { + "epoch": 8.26, + "grad_norm": 1.2534736394882202, + "learning_rate": 2.9428857715430862e-05, + "loss": 
0.0389, + "step": 2065 + }, + { + "epoch": 8.264, + "grad_norm": 1.3463037014007568, + "learning_rate": 2.9418837675350704e-05, + "loss": 0.053, + "step": 2066 + }, + { + "epoch": 8.268, + "grad_norm": 0.9820384383201599, + "learning_rate": 2.940881763527054e-05, + "loss": 0.0467, + "step": 2067 + }, + { + "epoch": 8.272, + "grad_norm": 1.1850944757461548, + "learning_rate": 2.939879759519038e-05, + "loss": 0.0398, + "step": 2068 + }, + { + "epoch": 8.276, + "grad_norm": 0.9997429847717285, + "learning_rate": 2.938877755511022e-05, + "loss": 0.0343, + "step": 2069 + }, + { + "epoch": 8.28, + "grad_norm": 1.2655606269836426, + "learning_rate": 2.9378757515030065e-05, + "loss": 0.0437, + "step": 2070 + }, + { + "epoch": 8.284, + "grad_norm": 1.0781662464141846, + "learning_rate": 2.9368737474949903e-05, + "loss": 0.0349, + "step": 2071 + }, + { + "epoch": 8.288, + "grad_norm": 1.158593773841858, + "learning_rate": 2.935871743486974e-05, + "loss": 0.046, + "step": 2072 + }, + { + "epoch": 8.292, + "grad_norm": 1.2248380184173584, + "learning_rate": 2.9348697394789582e-05, + "loss": 0.0406, + "step": 2073 + }, + { + "epoch": 8.296, + "grad_norm": 1.362268090248108, + "learning_rate": 2.933867735470942e-05, + "loss": 0.0516, + "step": 2074 + }, + { + "epoch": 8.3, + "grad_norm": 1.1527179479599, + "learning_rate": 2.9328657314629258e-05, + "loss": 0.0396, + "step": 2075 + }, + { + "epoch": 8.304, + "grad_norm": 1.1694166660308838, + "learning_rate": 2.93186372745491e-05, + "loss": 0.0374, + "step": 2076 + }, + { + "epoch": 8.308, + "grad_norm": 1.28037428855896, + "learning_rate": 2.9308617234468937e-05, + "loss": 0.0416, + "step": 2077 + }, + { + "epoch": 8.312, + "grad_norm": 1.2145733833312988, + "learning_rate": 2.929859719438878e-05, + "loss": 0.0433, + "step": 2078 + }, + { + "epoch": 8.316, + "grad_norm": 1.0807759761810303, + "learning_rate": 2.9288577154308617e-05, + "loss": 0.0364, + "step": 2079 + }, + { + "epoch": 8.32, + "grad_norm": 1.364870548248291, + 
"learning_rate": 2.927855711422846e-05, + "loss": 0.0582, + "step": 2080 + }, + { + "epoch": 8.324, + "grad_norm": 1.2568074464797974, + "learning_rate": 2.92685370741483e-05, + "loss": 0.042, + "step": 2081 + }, + { + "epoch": 8.328, + "grad_norm": 1.4024349451065063, + "learning_rate": 2.925851703406814e-05, + "loss": 0.0465, + "step": 2082 + }, + { + "epoch": 8.332, + "grad_norm": 1.2387995719909668, + "learning_rate": 2.924849699398798e-05, + "loss": 0.0375, + "step": 2083 + }, + { + "epoch": 8.336, + "grad_norm": 1.2565348148345947, + "learning_rate": 2.9238476953907816e-05, + "loss": 0.04, + "step": 2084 + }, + { + "epoch": 8.34, + "grad_norm": 1.384647011756897, + "learning_rate": 2.9228456913827658e-05, + "loss": 0.0525, + "step": 2085 + }, + { + "epoch": 8.344, + "grad_norm": 0.9669448137283325, + "learning_rate": 2.9218436873747495e-05, + "loss": 0.0398, + "step": 2086 + }, + { + "epoch": 8.348, + "grad_norm": 1.022433876991272, + "learning_rate": 2.9208416833667333e-05, + "loss": 0.037, + "step": 2087 + }, + { + "epoch": 8.352, + "grad_norm": 1.2381759881973267, + "learning_rate": 2.9198396793587175e-05, + "loss": 0.0512, + "step": 2088 + }, + { + "epoch": 8.356, + "grad_norm": 0.967339813709259, + "learning_rate": 2.9188376753507013e-05, + "loss": 0.0392, + "step": 2089 + }, + { + "epoch": 8.36, + "grad_norm": 1.4872891902923584, + "learning_rate": 2.9178356713426857e-05, + "loss": 0.0491, + "step": 2090 + }, + { + "epoch": 8.364, + "grad_norm": 1.1258385181427002, + "learning_rate": 2.9168336673346695e-05, + "loss": 0.0395, + "step": 2091 + }, + { + "epoch": 8.368, + "grad_norm": 1.4340835809707642, + "learning_rate": 2.9158316633266536e-05, + "loss": 0.0561, + "step": 2092 + }, + { + "epoch": 8.372, + "grad_norm": 1.2354909181594849, + "learning_rate": 2.9148296593186374e-05, + "loss": 0.0415, + "step": 2093 + }, + { + "epoch": 8.376, + "grad_norm": 1.414834976196289, + "learning_rate": 2.9138276553106216e-05, + "loss": 0.0563, + "step": 2094 + }, + { 
+ "epoch": 8.38, + "grad_norm": 1.3660783767700195, + "learning_rate": 2.9128256513026053e-05, + "loss": 0.049, + "step": 2095 + }, + { + "epoch": 8.384, + "grad_norm": 1.2326569557189941, + "learning_rate": 2.911823647294589e-05, + "loss": 0.0458, + "step": 2096 + }, + { + "epoch": 8.388, + "grad_norm": 1.343539834022522, + "learning_rate": 2.9108216432865733e-05, + "loss": 0.0498, + "step": 2097 + }, + { + "epoch": 8.392, + "grad_norm": 1.4126864671707153, + "learning_rate": 2.909819639278557e-05, + "loss": 0.0485, + "step": 2098 + }, + { + "epoch": 8.396, + "grad_norm": 1.0817058086395264, + "learning_rate": 2.908817635270541e-05, + "loss": 0.0342, + "step": 2099 + }, + { + "epoch": 8.4, + "grad_norm": 1.1074159145355225, + "learning_rate": 2.9078156312625253e-05, + "loss": 0.0371, + "step": 2100 + }, + { + "epoch": 8.404, + "grad_norm": 1.0859711170196533, + "learning_rate": 2.9068136272545094e-05, + "loss": 0.0408, + "step": 2101 + }, + { + "epoch": 8.408, + "grad_norm": 1.2138441801071167, + "learning_rate": 2.9058116232464932e-05, + "loss": 0.0497, + "step": 2102 + }, + { + "epoch": 8.412, + "grad_norm": 0.9061650037765503, + "learning_rate": 2.904809619238477e-05, + "loss": 0.0325, + "step": 2103 + }, + { + "epoch": 8.416, + "grad_norm": 1.2485976219177246, + "learning_rate": 2.903807615230461e-05, + "loss": 0.0525, + "step": 2104 + }, + { + "epoch": 8.42, + "grad_norm": 1.1576125621795654, + "learning_rate": 2.902805611222445e-05, + "loss": 0.036, + "step": 2105 + }, + { + "epoch": 8.424, + "grad_norm": 1.201346755027771, + "learning_rate": 2.901803607214429e-05, + "loss": 0.0455, + "step": 2106 + }, + { + "epoch": 8.428, + "grad_norm": 1.1513805389404297, + "learning_rate": 2.900801603206413e-05, + "loss": 0.0441, + "step": 2107 + }, + { + "epoch": 8.432, + "grad_norm": 1.2415015697479248, + "learning_rate": 2.8997995991983966e-05, + "loss": 0.0485, + "step": 2108 + }, + { + "epoch": 8.436, + "grad_norm": 1.4972500801086426, + "learning_rate": 
2.8987975951903808e-05, + "loss": 0.0563, + "step": 2109 + }, + { + "epoch": 8.44, + "grad_norm": 1.346211314201355, + "learning_rate": 2.8977955911823652e-05, + "loss": 0.0537, + "step": 2110 + }, + { + "epoch": 8.444, + "grad_norm": 1.4355465173721313, + "learning_rate": 2.896793587174349e-05, + "loss": 0.0446, + "step": 2111 + }, + { + "epoch": 8.448, + "grad_norm": 1.4319212436676025, + "learning_rate": 2.8957915831663328e-05, + "loss": 0.0439, + "step": 2112 + }, + { + "epoch": 8.452, + "grad_norm": 1.0715012550354004, + "learning_rate": 2.894789579158317e-05, + "loss": 0.0377, + "step": 2113 + }, + { + "epoch": 8.456, + "grad_norm": 1.2712820768356323, + "learning_rate": 2.8937875751503007e-05, + "loss": 0.0433, + "step": 2114 + }, + { + "epoch": 8.46, + "grad_norm": 1.385491132736206, + "learning_rate": 2.8927855711422845e-05, + "loss": 0.0472, + "step": 2115 + }, + { + "epoch": 8.464, + "grad_norm": 1.0765126943588257, + "learning_rate": 2.8917835671342686e-05, + "loss": 0.0392, + "step": 2116 + }, + { + "epoch": 8.468, + "grad_norm": 0.964982807636261, + "learning_rate": 2.8907815631262524e-05, + "loss": 0.0364, + "step": 2117 + }, + { + "epoch": 8.472, + "grad_norm": 1.0207576751708984, + "learning_rate": 2.8897795591182362e-05, + "loss": 0.0353, + "step": 2118 + }, + { + "epoch": 8.475999999999999, + "grad_norm": 1.363625407218933, + "learning_rate": 2.8887775551102207e-05, + "loss": 0.0479, + "step": 2119 + }, + { + "epoch": 8.48, + "grad_norm": 1.3118845224380493, + "learning_rate": 2.8877755511022048e-05, + "loss": 0.05, + "step": 2120 + }, + { + "epoch": 8.484, + "grad_norm": 1.3515814542770386, + "learning_rate": 2.8867735470941886e-05, + "loss": 0.0449, + "step": 2121 + }, + { + "epoch": 8.488, + "grad_norm": 1.2568910121917725, + "learning_rate": 2.8857715430861727e-05, + "loss": 0.0421, + "step": 2122 + }, + { + "epoch": 8.492, + "grad_norm": 1.1728308200836182, + "learning_rate": 2.8847695390781565e-05, + "loss": 0.0402, + "step": 2123 + }, + { 
+ "epoch": 8.496, + "grad_norm": 1.3032832145690918, + "learning_rate": 2.8837675350701403e-05, + "loss": 0.0431, + "step": 2124 + }, + { + "epoch": 8.5, + "grad_norm": 1.277818202972412, + "learning_rate": 2.8827655310621244e-05, + "loss": 0.0703, + "step": 2125 + }, + { + "epoch": 8.504, + "grad_norm": 1.165521264076233, + "learning_rate": 2.8817635270541082e-05, + "loss": 0.0411, + "step": 2126 + }, + { + "epoch": 8.508, + "grad_norm": 1.190546989440918, + "learning_rate": 2.880761523046092e-05, + "loss": 0.0426, + "step": 2127 + }, + { + "epoch": 8.512, + "grad_norm": 1.323736310005188, + "learning_rate": 2.879759519038076e-05, + "loss": 0.0498, + "step": 2128 + }, + { + "epoch": 8.516, + "grad_norm": 1.3947193622589111, + "learning_rate": 2.8787575150300606e-05, + "loss": 0.0562, + "step": 2129 + }, + { + "epoch": 8.52, + "grad_norm": 1.3394712209701538, + "learning_rate": 2.8777555110220444e-05, + "loss": 0.0568, + "step": 2130 + }, + { + "epoch": 8.524000000000001, + "grad_norm": 1.2725778818130493, + "learning_rate": 2.8767535070140282e-05, + "loss": 0.0453, + "step": 2131 + }, + { + "epoch": 8.528, + "grad_norm": 0.6503176093101501, + "learning_rate": 2.8757515030060123e-05, + "loss": 0.0155, + "step": 2132 + }, + { + "epoch": 8.532, + "grad_norm": 1.112267017364502, + "learning_rate": 2.874749498997996e-05, + "loss": 0.0351, + "step": 2133 + }, + { + "epoch": 8.536, + "grad_norm": 1.2222670316696167, + "learning_rate": 2.8737474949899802e-05, + "loss": 0.04, + "step": 2134 + }, + { + "epoch": 8.54, + "grad_norm": 1.2434343099594116, + "learning_rate": 2.872745490981964e-05, + "loss": 0.0432, + "step": 2135 + }, + { + "epoch": 8.544, + "grad_norm": 1.369353175163269, + "learning_rate": 2.8717434869739478e-05, + "loss": 0.0541, + "step": 2136 + }, + { + "epoch": 8.548, + "grad_norm": 1.3330413103103638, + "learning_rate": 2.870741482965932e-05, + "loss": 0.043, + "step": 2137 + }, + { + "epoch": 8.552, + "grad_norm": 1.0383684635162354, + "learning_rate": 
2.8697394789579157e-05, + "loss": 0.0414, + "step": 2138 + }, + { + "epoch": 8.556000000000001, + "grad_norm": 1.3430683612823486, + "learning_rate": 2.8687374749499002e-05, + "loss": 0.0558, + "step": 2139 + }, + { + "epoch": 8.56, + "grad_norm": 1.6391087770462036, + "learning_rate": 2.867735470941884e-05, + "loss": 0.0523, + "step": 2140 + }, + { + "epoch": 8.564, + "grad_norm": 1.2342334985733032, + "learning_rate": 2.866733466933868e-05, + "loss": 0.0495, + "step": 2141 + }, + { + "epoch": 8.568, + "grad_norm": 1.2186678647994995, + "learning_rate": 2.865731462925852e-05, + "loss": 0.0468, + "step": 2142 + }, + { + "epoch": 8.572, + "grad_norm": 1.2047197818756104, + "learning_rate": 2.8647294589178357e-05, + "loss": 0.0485, + "step": 2143 + }, + { + "epoch": 8.576, + "grad_norm": 1.139602541923523, + "learning_rate": 2.86372745490982e-05, + "loss": 0.0398, + "step": 2144 + }, + { + "epoch": 8.58, + "grad_norm": 1.393538475036621, + "learning_rate": 2.8627254509018036e-05, + "loss": 0.0499, + "step": 2145 + }, + { + "epoch": 8.584, + "grad_norm": 0.7064828276634216, + "learning_rate": 2.8617234468937874e-05, + "loss": 0.0208, + "step": 2146 + }, + { + "epoch": 8.588, + "grad_norm": 1.0699363946914673, + "learning_rate": 2.8607214428857715e-05, + "loss": 0.0452, + "step": 2147 + }, + { + "epoch": 8.592, + "grad_norm": 0.9702959060668945, + "learning_rate": 2.8597194388777553e-05, + "loss": 0.0258, + "step": 2148 + }, + { + "epoch": 8.596, + "grad_norm": 1.0714532136917114, + "learning_rate": 2.8587174348697398e-05, + "loss": 0.0399, + "step": 2149 + }, + { + "epoch": 8.6, + "grad_norm": 0.7246587872505188, + "learning_rate": 2.857715430861724e-05, + "loss": 0.0249, + "step": 2150 + }, + { + "epoch": 8.604, + "grad_norm": 1.354813814163208, + "learning_rate": 2.8567134268537077e-05, + "loss": 0.0458, + "step": 2151 + }, + { + "epoch": 8.608, + "grad_norm": 1.3162986040115356, + "learning_rate": 2.8557114228456915e-05, + "loss": 0.0508, + "step": 2152 + }, + { + 
"epoch": 8.612, + "grad_norm": 1.2697739601135254, + "learning_rate": 2.8547094188376756e-05, + "loss": 0.0472, + "step": 2153 + }, + { + "epoch": 8.616, + "grad_norm": 1.3663972616195679, + "learning_rate": 2.8537074148296594e-05, + "loss": 0.0535, + "step": 2154 + }, + { + "epoch": 8.62, + "grad_norm": 1.2056630849838257, + "learning_rate": 2.8527054108216432e-05, + "loss": 0.0419, + "step": 2155 + }, + { + "epoch": 8.624, + "grad_norm": 1.1879265308380127, + "learning_rate": 2.8517034068136273e-05, + "loss": 0.042, + "step": 2156 + }, + { + "epoch": 8.628, + "grad_norm": 1.250275731086731, + "learning_rate": 2.850701402805611e-05, + "loss": 0.0428, + "step": 2157 + }, + { + "epoch": 8.632, + "grad_norm": 1.2584730386734009, + "learning_rate": 2.849699398797595e-05, + "loss": 0.0453, + "step": 2158 + }, + { + "epoch": 8.636, + "grad_norm": 1.3111329078674316, + "learning_rate": 2.8486973947895794e-05, + "loss": 0.0501, + "step": 2159 + }, + { + "epoch": 8.64, + "grad_norm": 1.344029426574707, + "learning_rate": 2.8476953907815635e-05, + "loss": 0.0479, + "step": 2160 + }, + { + "epoch": 8.644, + "grad_norm": 1.4813225269317627, + "learning_rate": 2.8466933867735473e-05, + "loss": 0.0489, + "step": 2161 + }, + { + "epoch": 8.648, + "grad_norm": 1.3338834047317505, + "learning_rate": 2.8456913827655314e-05, + "loss": 0.0481, + "step": 2162 + }, + { + "epoch": 8.652, + "grad_norm": 1.295943260192871, + "learning_rate": 2.8446893787575152e-05, + "loss": 0.0403, + "step": 2163 + }, + { + "epoch": 8.656, + "grad_norm": 1.1170644760131836, + "learning_rate": 2.843687374749499e-05, + "loss": 0.0453, + "step": 2164 + }, + { + "epoch": 8.66, + "grad_norm": 1.6115245819091797, + "learning_rate": 2.842685370741483e-05, + "loss": 0.0496, + "step": 2165 + }, + { + "epoch": 8.664, + "grad_norm": 1.2752050161361694, + "learning_rate": 2.841683366733467e-05, + "loss": 0.0441, + "step": 2166 + }, + { + "epoch": 8.668, + "grad_norm": 1.237499475479126, + "learning_rate": 
2.8406813627254507e-05, + "loss": 0.0474, + "step": 2167 + }, + { + "epoch": 8.672, + "grad_norm": 1.2884820699691772, + "learning_rate": 2.839679358717435e-05, + "loss": 0.0489, + "step": 2168 + }, + { + "epoch": 8.676, + "grad_norm": 1.2853950262069702, + "learning_rate": 2.8386773547094193e-05, + "loss": 0.0514, + "step": 2169 + }, + { + "epoch": 8.68, + "grad_norm": 1.3021364212036133, + "learning_rate": 2.837675350701403e-05, + "loss": 0.0451, + "step": 2170 + }, + { + "epoch": 8.684, + "grad_norm": 1.1973280906677246, + "learning_rate": 2.836673346693387e-05, + "loss": 0.046, + "step": 2171 + }, + { + "epoch": 8.688, + "grad_norm": 0.8755884170532227, + "learning_rate": 2.835671342685371e-05, + "loss": 0.0345, + "step": 2172 + }, + { + "epoch": 8.692, + "grad_norm": 1.2425113916397095, + "learning_rate": 2.8346693386773548e-05, + "loss": 0.0407, + "step": 2173 + }, + { + "epoch": 8.696, + "grad_norm": 1.3100374937057495, + "learning_rate": 2.8336673346693386e-05, + "loss": 0.0459, + "step": 2174 + }, + { + "epoch": 8.7, + "grad_norm": 1.5196589231491089, + "learning_rate": 2.8326653306613227e-05, + "loss": 0.0492, + "step": 2175 + }, + { + "epoch": 8.704, + "grad_norm": 1.3412437438964844, + "learning_rate": 2.8316633266533065e-05, + "loss": 0.0522, + "step": 2176 + }, + { + "epoch": 8.708, + "grad_norm": 1.1661128997802734, + "learning_rate": 2.8306613226452906e-05, + "loss": 0.0533, + "step": 2177 + }, + { + "epoch": 8.712, + "grad_norm": 1.1783884763717651, + "learning_rate": 2.829659318637275e-05, + "loss": 0.0511, + "step": 2178 + }, + { + "epoch": 8.716, + "grad_norm": 1.162697434425354, + "learning_rate": 2.828657314629259e-05, + "loss": 0.0452, + "step": 2179 + }, + { + "epoch": 8.72, + "grad_norm": 1.5059949159622192, + "learning_rate": 2.8276553106212427e-05, + "loss": 0.0503, + "step": 2180 + }, + { + "epoch": 8.724, + "grad_norm": 1.33864426612854, + "learning_rate": 2.8266533066132268e-05, + "loss": 0.0519, + "step": 2181 + }, + { + "epoch": 
8.728, + "grad_norm": 1.316199779510498, + "learning_rate": 2.8256513026052106e-05, + "loss": 0.0515, + "step": 2182 + }, + { + "epoch": 8.732, + "grad_norm": 1.2155145406723022, + "learning_rate": 2.8246492985971944e-05, + "loss": 0.0425, + "step": 2183 + }, + { + "epoch": 8.736, + "grad_norm": 1.4967896938323975, + "learning_rate": 2.8236472945891785e-05, + "loss": 0.0589, + "step": 2184 + }, + { + "epoch": 8.74, + "grad_norm": 1.188382863998413, + "learning_rate": 2.8226452905811623e-05, + "loss": 0.0446, + "step": 2185 + }, + { + "epoch": 8.744, + "grad_norm": 1.3887944221496582, + "learning_rate": 2.821643286573146e-05, + "loss": 0.0513, + "step": 2186 + }, + { + "epoch": 8.748, + "grad_norm": 1.4028888940811157, + "learning_rate": 2.8206412825651302e-05, + "loss": 0.0536, + "step": 2187 + }, + { + "epoch": 8.752, + "grad_norm": 1.4226263761520386, + "learning_rate": 2.8196392785571147e-05, + "loss": 0.0561, + "step": 2188 + }, + { + "epoch": 8.756, + "grad_norm": 1.1816654205322266, + "learning_rate": 2.8186372745490985e-05, + "loss": 0.039, + "step": 2189 + }, + { + "epoch": 8.76, + "grad_norm": 1.3053758144378662, + "learning_rate": 2.8176352705410823e-05, + "loss": 0.043, + "step": 2190 + }, + { + "epoch": 8.764, + "grad_norm": 1.021385669708252, + "learning_rate": 2.8166332665330664e-05, + "loss": 0.0425, + "step": 2191 + }, + { + "epoch": 8.768, + "grad_norm": 1.6020197868347168, + "learning_rate": 2.8156312625250502e-05, + "loss": 0.0488, + "step": 2192 + }, + { + "epoch": 8.772, + "grad_norm": 1.498152256011963, + "learning_rate": 2.8146292585170343e-05, + "loss": 0.0512, + "step": 2193 + }, + { + "epoch": 8.776, + "grad_norm": 1.3913947343826294, + "learning_rate": 2.813627254509018e-05, + "loss": 0.0509, + "step": 2194 + }, + { + "epoch": 8.78, + "grad_norm": 1.3088997602462769, + "learning_rate": 2.812625250501002e-05, + "loss": 0.0483, + "step": 2195 + }, + { + "epoch": 8.784, + "grad_norm": 0.9024881720542908, + "learning_rate": 
2.811623246492986e-05, + "loss": 0.0297, + "step": 2196 + }, + { + "epoch": 8.788, + "grad_norm": 1.3125743865966797, + "learning_rate": 2.8106212424849698e-05, + "loss": 0.0552, + "step": 2197 + }, + { + "epoch": 8.792, + "grad_norm": 1.094606876373291, + "learning_rate": 2.8096192384769543e-05, + "loss": 0.0398, + "step": 2198 + }, + { + "epoch": 8.796, + "grad_norm": 1.1351479291915894, + "learning_rate": 2.808617234468938e-05, + "loss": 0.0518, + "step": 2199 + }, + { + "epoch": 8.8, + "grad_norm": 1.218656063079834, + "learning_rate": 2.8076152304609222e-05, + "loss": 0.0547, + "step": 2200 + }, + { + "epoch": 8.804, + "grad_norm": 1.1994880437850952, + "learning_rate": 2.806613226452906e-05, + "loss": 0.0405, + "step": 2201 + }, + { + "epoch": 8.808, + "grad_norm": 1.25562584400177, + "learning_rate": 2.8056112224448898e-05, + "loss": 0.0537, + "step": 2202 + }, + { + "epoch": 8.812, + "grad_norm": 1.5509350299835205, + "learning_rate": 2.804609218436874e-05, + "loss": 0.0624, + "step": 2203 + }, + { + "epoch": 8.816, + "grad_norm": 1.4089373350143433, + "learning_rate": 2.8036072144288577e-05, + "loss": 0.0627, + "step": 2204 + }, + { + "epoch": 8.82, + "grad_norm": 1.2322133779525757, + "learning_rate": 2.802605210420842e-05, + "loss": 0.0432, + "step": 2205 + }, + { + "epoch": 8.824, + "grad_norm": 1.5502153635025024, + "learning_rate": 2.8016032064128256e-05, + "loss": 0.0682, + "step": 2206 + }, + { + "epoch": 8.828, + "grad_norm": 1.2696598768234253, + "learning_rate": 2.8006012024048094e-05, + "loss": 0.046, + "step": 2207 + }, + { + "epoch": 8.832, + "grad_norm": 1.2160747051239014, + "learning_rate": 2.799599198396794e-05, + "loss": 0.0401, + "step": 2208 + }, + { + "epoch": 8.836, + "grad_norm": 1.38071608543396, + "learning_rate": 2.798597194388778e-05, + "loss": 0.0495, + "step": 2209 + }, + { + "epoch": 8.84, + "grad_norm": 1.6408637762069702, + "learning_rate": 2.7975951903807618e-05, + "loss": 0.0646, + "step": 2210 + }, + { + "epoch": 8.844, + 
"grad_norm": 1.340449333190918, + "learning_rate": 2.7965931863727456e-05, + "loss": 0.0475, + "step": 2211 + }, + { + "epoch": 8.848, + "grad_norm": 1.3540539741516113, + "learning_rate": 2.7955911823647297e-05, + "loss": 0.0526, + "step": 2212 + }, + { + "epoch": 8.852, + "grad_norm": 1.4355782270431519, + "learning_rate": 2.7945891783567135e-05, + "loss": 0.0478, + "step": 2213 + }, + { + "epoch": 8.856, + "grad_norm": 1.5981539487838745, + "learning_rate": 2.7935871743486973e-05, + "loss": 0.0546, + "step": 2214 + }, + { + "epoch": 8.86, + "grad_norm": 1.3910454511642456, + "learning_rate": 2.7925851703406814e-05, + "loss": 0.0513, + "step": 2215 + }, + { + "epoch": 8.864, + "grad_norm": 1.2546508312225342, + "learning_rate": 2.7915831663326652e-05, + "loss": 0.0506, + "step": 2216 + }, + { + "epoch": 8.868, + "grad_norm": 1.4212666749954224, + "learning_rate": 2.790581162324649e-05, + "loss": 0.05, + "step": 2217 + }, + { + "epoch": 8.872, + "grad_norm": 1.2646417617797852, + "learning_rate": 2.7895791583166335e-05, + "loss": 0.046, + "step": 2218 + }, + { + "epoch": 8.876, + "grad_norm": 1.1945741176605225, + "learning_rate": 2.7885771543086176e-05, + "loss": 0.0445, + "step": 2219 + }, + { + "epoch": 8.88, + "grad_norm": 1.351228952407837, + "learning_rate": 2.7875751503006014e-05, + "loss": 0.0574, + "step": 2220 + }, + { + "epoch": 8.884, + "grad_norm": 1.2459324598312378, + "learning_rate": 2.7865731462925855e-05, + "loss": 0.0479, + "step": 2221 + }, + { + "epoch": 8.888, + "grad_norm": 1.5551952123641968, + "learning_rate": 2.7855711422845693e-05, + "loss": 0.0703, + "step": 2222 + }, + { + "epoch": 8.892, + "grad_norm": 1.3713449239730835, + "learning_rate": 2.784569138276553e-05, + "loss": 0.049, + "step": 2223 + }, + { + "epoch": 8.896, + "grad_norm": 1.3457205295562744, + "learning_rate": 2.7835671342685372e-05, + "loss": 0.0417, + "step": 2224 + }, + { + "epoch": 8.9, + "grad_norm": 1.2651445865631104, + "learning_rate": 2.782565130260521e-05, + 
"loss": 0.0554, + "step": 2225 + }, + { + "epoch": 8.904, + "grad_norm": 1.3092762231826782, + "learning_rate": 2.7815631262525048e-05, + "loss": 0.05, + "step": 2226 + }, + { + "epoch": 8.908, + "grad_norm": 1.3287698030471802, + "learning_rate": 2.780561122244489e-05, + "loss": 0.0495, + "step": 2227 + }, + { + "epoch": 8.912, + "grad_norm": 1.1512635946273804, + "learning_rate": 2.7795591182364734e-05, + "loss": 0.0453, + "step": 2228 + }, + { + "epoch": 8.916, + "grad_norm": 1.3240880966186523, + "learning_rate": 2.7785571142284572e-05, + "loss": 0.0417, + "step": 2229 + }, + { + "epoch": 8.92, + "grad_norm": 1.21474289894104, + "learning_rate": 2.777555110220441e-05, + "loss": 0.047, + "step": 2230 + }, + { + "epoch": 8.924, + "grad_norm": 1.4137842655181885, + "learning_rate": 2.776553106212425e-05, + "loss": 0.0747, + "step": 2231 + }, + { + "epoch": 8.928, + "grad_norm": 1.2826186418533325, + "learning_rate": 2.775551102204409e-05, + "loss": 0.0541, + "step": 2232 + }, + { + "epoch": 8.932, + "grad_norm": 1.2921860218048096, + "learning_rate": 2.774549098196393e-05, + "loss": 0.0671, + "step": 2233 + }, + { + "epoch": 8.936, + "grad_norm": 1.2751491069793701, + "learning_rate": 2.7735470941883768e-05, + "loss": 0.0357, + "step": 2234 + }, + { + "epoch": 8.94, + "grad_norm": 1.2974969148635864, + "learning_rate": 2.7725450901803606e-05, + "loss": 0.05, + "step": 2235 + }, + { + "epoch": 8.943999999999999, + "grad_norm": 1.132696509361267, + "learning_rate": 2.7715430861723447e-05, + "loss": 0.0425, + "step": 2236 + }, + { + "epoch": 8.948, + "grad_norm": 0.6913595199584961, + "learning_rate": 2.7705410821643292e-05, + "loss": 0.0187, + "step": 2237 + }, + { + "epoch": 8.952, + "grad_norm": 1.1450154781341553, + "learning_rate": 2.769539078156313e-05, + "loss": 0.0421, + "step": 2238 + }, + { + "epoch": 8.956, + "grad_norm": 1.154909372329712, + "learning_rate": 2.7685370741482968e-05, + "loss": 0.0384, + "step": 2239 + }, + { + "epoch": 8.96, + "grad_norm": 
1.3707146644592285, + "learning_rate": 2.767535070140281e-05, + "loss": 0.0517, + "step": 2240 + }, + { + "epoch": 8.964, + "grad_norm": 1.5236717462539673, + "learning_rate": 2.7665330661322647e-05, + "loss": 0.0684, + "step": 2241 + }, + { + "epoch": 8.968, + "grad_norm": 1.2843303680419922, + "learning_rate": 2.7655310621242485e-05, + "loss": 0.0427, + "step": 2242 + }, + { + "epoch": 8.972, + "grad_norm": 1.1353422403335571, + "learning_rate": 2.7645290581162326e-05, + "loss": 0.043, + "step": 2243 + }, + { + "epoch": 8.975999999999999, + "grad_norm": 1.2605583667755127, + "learning_rate": 2.7635270541082164e-05, + "loss": 0.0486, + "step": 2244 + }, + { + "epoch": 8.98, + "grad_norm": 1.3495458364486694, + "learning_rate": 2.7625250501002002e-05, + "loss": 0.0474, + "step": 2245 + }, + { + "epoch": 8.984, + "grad_norm": 1.3079636096954346, + "learning_rate": 2.7615230460921843e-05, + "loss": 0.0639, + "step": 2246 + }, + { + "epoch": 8.988, + "grad_norm": 1.171911358833313, + "learning_rate": 2.7605210420841688e-05, + "loss": 0.0417, + "step": 2247 + }, + { + "epoch": 8.992, + "grad_norm": 1.34932541847229, + "learning_rate": 2.7595190380761526e-05, + "loss": 0.0482, + "step": 2248 + }, + { + "epoch": 8.996, + "grad_norm": 1.653420329093933, + "learning_rate": 2.7585170340681367e-05, + "loss": 0.0602, + "step": 2249 + }, + { + "epoch": 9.0, + "grad_norm": 1.4358410835266113, + "learning_rate": 2.7575150300601205e-05, + "loss": 0.0582, + "step": 2250 + }, + { + "epoch": 9.004, + "grad_norm": 0.6799370050430298, + "learning_rate": 2.7565130260521043e-05, + "loss": 0.0219, + "step": 2251 + }, + { + "epoch": 9.008, + "grad_norm": 0.6610779762268066, + "learning_rate": 2.7555110220440884e-05, + "loss": 0.022, + "step": 2252 + }, + { + "epoch": 9.012, + "grad_norm": 0.9135136604309082, + "learning_rate": 2.7545090180360722e-05, + "loss": 0.0262, + "step": 2253 + }, + { + "epoch": 9.016, + "grad_norm": 0.8344765305519104, + "learning_rate": 2.753507014028056e-05, + 
"loss": 0.0254, + "step": 2254 + }, + { + "epoch": 9.02, + "grad_norm": 0.8494986891746521, + "learning_rate": 2.75250501002004e-05, + "loss": 0.0273, + "step": 2255 + }, + { + "epoch": 9.024, + "grad_norm": 0.9093011021614075, + "learning_rate": 2.751503006012024e-05, + "loss": 0.0281, + "step": 2256 + }, + { + "epoch": 9.028, + "grad_norm": 0.9227543473243713, + "learning_rate": 2.7505010020040084e-05, + "loss": 0.0294, + "step": 2257 + }, + { + "epoch": 9.032, + "grad_norm": 0.6513843536376953, + "learning_rate": 2.749498997995992e-05, + "loss": 0.0201, + "step": 2258 + }, + { + "epoch": 9.036, + "grad_norm": 0.8121753334999084, + "learning_rate": 2.7484969939879763e-05, + "loss": 0.0262, + "step": 2259 + }, + { + "epoch": 9.04, + "grad_norm": 1.054721713066101, + "learning_rate": 2.74749498997996e-05, + "loss": 0.0277, + "step": 2260 + }, + { + "epoch": 9.044, + "grad_norm": 0.8000840544700623, + "learning_rate": 2.7464929859719442e-05, + "loss": 0.0227, + "step": 2261 + }, + { + "epoch": 9.048, + "grad_norm": 0.8721778988838196, + "learning_rate": 2.745490981963928e-05, + "loss": 0.0236, + "step": 2262 + }, + { + "epoch": 9.052, + "grad_norm": 0.9745427370071411, + "learning_rate": 2.7444889779559118e-05, + "loss": 0.0236, + "step": 2263 + }, + { + "epoch": 9.056, + "grad_norm": 0.972322404384613, + "learning_rate": 2.743486973947896e-05, + "loss": 0.0278, + "step": 2264 + }, + { + "epoch": 9.06, + "grad_norm": 1.111842155456543, + "learning_rate": 2.7424849699398797e-05, + "loss": 0.0299, + "step": 2265 + }, + { + "epoch": 9.064, + "grad_norm": 0.7864890694618225, + "learning_rate": 2.7414829659318635e-05, + "loss": 0.0243, + "step": 2266 + }, + { + "epoch": 9.068, + "grad_norm": 1.2372549772262573, + "learning_rate": 2.740480961923848e-05, + "loss": 0.043, + "step": 2267 + }, + { + "epoch": 9.072, + "grad_norm": 0.7905459403991699, + "learning_rate": 2.739478957915832e-05, + "loss": 0.021, + "step": 2268 + }, + { + "epoch": 9.076, + "grad_norm": 
1.100546956062317, + "learning_rate": 2.738476953907816e-05, + "loss": 0.0277, + "step": 2269 + }, + { + "epoch": 9.08, + "grad_norm": 0.8412137031555176, + "learning_rate": 2.7374749498997997e-05, + "loss": 0.025, + "step": 2270 + }, + { + "epoch": 9.084, + "grad_norm": 0.9313427805900574, + "learning_rate": 2.7364729458917838e-05, + "loss": 0.0231, + "step": 2271 + }, + { + "epoch": 9.088, + "grad_norm": 1.1528956890106201, + "learning_rate": 2.7354709418837676e-05, + "loss": 0.0338, + "step": 2272 + }, + { + "epoch": 9.092, + "grad_norm": 0.7774552702903748, + "learning_rate": 2.7344689378757514e-05, + "loss": 0.0247, + "step": 2273 + }, + { + "epoch": 9.096, + "grad_norm": 1.030387282371521, + "learning_rate": 2.7334669338677355e-05, + "loss": 0.0266, + "step": 2274 + }, + { + "epoch": 9.1, + "grad_norm": 0.8086778521537781, + "learning_rate": 2.7324649298597193e-05, + "loss": 0.0246, + "step": 2275 + }, + { + "epoch": 9.104, + "grad_norm": 0.8489255905151367, + "learning_rate": 2.7314629258517034e-05, + "loss": 0.0249, + "step": 2276 + }, + { + "epoch": 9.108, + "grad_norm": 0.7739505171775818, + "learning_rate": 2.730460921843688e-05, + "loss": 0.0257, + "step": 2277 + }, + { + "epoch": 9.112, + "grad_norm": 0.8986899852752686, + "learning_rate": 2.7294589178356717e-05, + "loss": 0.0258, + "step": 2278 + }, + { + "epoch": 9.116, + "grad_norm": 1.1886520385742188, + "learning_rate": 2.7284569138276555e-05, + "loss": 0.0349, + "step": 2279 + }, + { + "epoch": 9.12, + "grad_norm": 0.9789061546325684, + "learning_rate": 2.7274549098196396e-05, + "loss": 0.0277, + "step": 2280 + }, + { + "epoch": 9.124, + "grad_norm": 1.192771315574646, + "learning_rate": 2.7264529058116234e-05, + "loss": 0.0336, + "step": 2281 + }, + { + "epoch": 9.128, + "grad_norm": 1.048073649406433, + "learning_rate": 2.7254509018036072e-05, + "loss": 0.031, + "step": 2282 + }, + { + "epoch": 9.132, + "grad_norm": 1.163774013519287, + "learning_rate": 2.7244488977955913e-05, + "loss": 0.0309, 
+ "step": 2283 + }, + { + "epoch": 9.136, + "grad_norm": 0.8994669914245605, + "learning_rate": 2.723446893787575e-05, + "loss": 0.0204, + "step": 2284 + }, + { + "epoch": 9.14, + "grad_norm": 1.123653769493103, + "learning_rate": 2.722444889779559e-05, + "loss": 0.0325, + "step": 2285 + }, + { + "epoch": 9.144, + "grad_norm": 0.9699426889419556, + "learning_rate": 2.7214428857715433e-05, + "loss": 0.0253, + "step": 2286 + }, + { + "epoch": 9.148, + "grad_norm": 0.8264506459236145, + "learning_rate": 2.7204408817635275e-05, + "loss": 0.0234, + "step": 2287 + }, + { + "epoch": 9.152, + "grad_norm": 1.0332759618759155, + "learning_rate": 2.7194388777555113e-05, + "loss": 0.0315, + "step": 2288 + }, + { + "epoch": 9.156, + "grad_norm": 1.0331981182098389, + "learning_rate": 2.7184368737474954e-05, + "loss": 0.0336, + "step": 2289 + }, + { + "epoch": 9.16, + "grad_norm": 2.753606081008911, + "learning_rate": 2.7174348697394792e-05, + "loss": 0.0322, + "step": 2290 + }, + { + "epoch": 9.164, + "grad_norm": 0.8838220834732056, + "learning_rate": 2.716432865731463e-05, + "loss": 0.0244, + "step": 2291 + }, + { + "epoch": 9.168, + "grad_norm": 0.9516180753707886, + "learning_rate": 2.715430861723447e-05, + "loss": 0.0325, + "step": 2292 + }, + { + "epoch": 9.172, + "grad_norm": 0.9610980153083801, + "learning_rate": 2.714428857715431e-05, + "loss": 0.0257, + "step": 2293 + }, + { + "epoch": 9.176, + "grad_norm": 1.0998841524124146, + "learning_rate": 2.7134268537074147e-05, + "loss": 0.0291, + "step": 2294 + }, + { + "epoch": 9.18, + "grad_norm": 1.0278939008712769, + "learning_rate": 2.7124248496993988e-05, + "loss": 0.0308, + "step": 2295 + }, + { + "epoch": 9.184, + "grad_norm": 1.050432562828064, + "learning_rate": 2.7114228456913833e-05, + "loss": 0.0362, + "step": 2296 + }, + { + "epoch": 9.188, + "grad_norm": 1.3083264827728271, + "learning_rate": 2.710420841683367e-05, + "loss": 0.0347, + "step": 2297 + }, + { + "epoch": 9.192, + "grad_norm": 1.0661804676055908, + 
"learning_rate": 2.709418837675351e-05, + "loss": 0.0317, + "step": 2298 + }, + { + "epoch": 9.196, + "grad_norm": 0.8684881925582886, + "learning_rate": 2.708416833667335e-05, + "loss": 0.028, + "step": 2299 + }, + { + "epoch": 9.2, + "grad_norm": 0.4513463079929352, + "learning_rate": 2.7074148296593188e-05, + "loss": 0.0072, + "step": 2300 + }, + { + "epoch": 9.204, + "grad_norm": 0.9160906076431274, + "learning_rate": 2.7064128256513026e-05, + "loss": 0.0268, + "step": 2301 + }, + { + "epoch": 9.208, + "grad_norm": 1.1022686958312988, + "learning_rate": 2.7054108216432867e-05, + "loss": 0.0261, + "step": 2302 + }, + { + "epoch": 9.212, + "grad_norm": 1.2009657621383667, + "learning_rate": 2.7044088176352705e-05, + "loss": 0.0388, + "step": 2303 + }, + { + "epoch": 9.216, + "grad_norm": 1.1833915710449219, + "learning_rate": 2.7034068136272546e-05, + "loss": 0.031, + "step": 2304 + }, + { + "epoch": 9.22, + "grad_norm": 1.0764391422271729, + "learning_rate": 2.7024048096192384e-05, + "loss": 0.0327, + "step": 2305 + }, + { + "epoch": 9.224, + "grad_norm": 0.9721907377243042, + "learning_rate": 2.701402805611223e-05, + "loss": 0.0326, + "step": 2306 + }, + { + "epoch": 9.228, + "grad_norm": 1.1473323106765747, + "learning_rate": 2.7004008016032067e-05, + "loss": 0.0345, + "step": 2307 + }, + { + "epoch": 9.232, + "grad_norm": 1.1332943439483643, + "learning_rate": 2.6993987975951908e-05, + "loss": 0.0313, + "step": 2308 + }, + { + "epoch": 9.236, + "grad_norm": 1.263730525970459, + "learning_rate": 2.6983967935871746e-05, + "loss": 0.0302, + "step": 2309 + }, + { + "epoch": 9.24, + "grad_norm": 1.0454695224761963, + "learning_rate": 2.6973947895791584e-05, + "loss": 0.028, + "step": 2310 + }, + { + "epoch": 9.244, + "grad_norm": 1.1046359539031982, + "learning_rate": 2.6963927855711425e-05, + "loss": 0.0343, + "step": 2311 + }, + { + "epoch": 9.248, + "grad_norm": 1.1158086061477661, + "learning_rate": 2.6953907815631263e-05, + "loss": 0.0298, + "step": 2312 + }, 
+ { + "epoch": 9.252, + "grad_norm": 1.1569856405258179, + "learning_rate": 2.69438877755511e-05, + "loss": 0.0281, + "step": 2313 + }, + { + "epoch": 9.256, + "grad_norm": 0.6049909591674805, + "learning_rate": 2.6933867735470942e-05, + "loss": 0.0132, + "step": 2314 + }, + { + "epoch": 9.26, + "grad_norm": 1.2866177558898926, + "learning_rate": 2.692384769539078e-05, + "loss": 0.0361, + "step": 2315 + }, + { + "epoch": 9.264, + "grad_norm": 0.9961463809013367, + "learning_rate": 2.6913827655310625e-05, + "loss": 0.0267, + "step": 2316 + }, + { + "epoch": 9.268, + "grad_norm": 1.083455204963684, + "learning_rate": 2.6903807615230462e-05, + "loss": 0.0271, + "step": 2317 + }, + { + "epoch": 9.272, + "grad_norm": 0.8019450306892395, + "learning_rate": 2.6893787575150304e-05, + "loss": 0.0313, + "step": 2318 + }, + { + "epoch": 9.276, + "grad_norm": 1.0289325714111328, + "learning_rate": 2.688376753507014e-05, + "loss": 0.0338, + "step": 2319 + }, + { + "epoch": 9.28, + "grad_norm": 1.015210509300232, + "learning_rate": 2.6873747494989983e-05, + "loss": 0.028, + "step": 2320 + }, + { + "epoch": 9.284, + "grad_norm": 0.8635653257369995, + "learning_rate": 2.686372745490982e-05, + "loss": 0.0256, + "step": 2321 + }, + { + "epoch": 9.288, + "grad_norm": 0.9069684743881226, + "learning_rate": 2.685370741482966e-05, + "loss": 0.0324, + "step": 2322 + }, + { + "epoch": 9.292, + "grad_norm": 1.1683540344238281, + "learning_rate": 2.68436873747495e-05, + "loss": 0.0279, + "step": 2323 + }, + { + "epoch": 9.296, + "grad_norm": 1.084283471107483, + "learning_rate": 2.6833667334669338e-05, + "loss": 0.033, + "step": 2324 + }, + { + "epoch": 9.3, + "grad_norm": 1.0750457048416138, + "learning_rate": 2.6823647294589176e-05, + "loss": 0.026, + "step": 2325 + }, + { + "epoch": 9.304, + "grad_norm": 1.1331920623779297, + "learning_rate": 2.681362725450902e-05, + "loss": 0.0282, + "step": 2326 + }, + { + "epoch": 9.308, + "grad_norm": 0.7947516441345215, + "learning_rate": 
2.6803607214428862e-05, + "loss": 0.0229, + "step": 2327 + }, + { + "epoch": 9.312, + "grad_norm": 1.2119550704956055, + "learning_rate": 2.67935871743487e-05, + "loss": 0.0352, + "step": 2328 + }, + { + "epoch": 9.316, + "grad_norm": 0.9553737640380859, + "learning_rate": 2.6783567134268537e-05, + "loss": 0.0291, + "step": 2329 + }, + { + "epoch": 9.32, + "grad_norm": 0.9401413202285767, + "learning_rate": 2.677354709418838e-05, + "loss": 0.0278, + "step": 2330 + }, + { + "epoch": 9.324, + "grad_norm": 0.9966760873794556, + "learning_rate": 2.6763527054108217e-05, + "loss": 0.0247, + "step": 2331 + }, + { + "epoch": 9.328, + "grad_norm": 1.0120066404342651, + "learning_rate": 2.6753507014028058e-05, + "loss": 0.0313, + "step": 2332 + }, + { + "epoch": 9.332, + "grad_norm": 1.1858829259872437, + "learning_rate": 2.6743486973947896e-05, + "loss": 0.0358, + "step": 2333 + }, + { + "epoch": 9.336, + "grad_norm": 1.082135796546936, + "learning_rate": 2.6733466933867734e-05, + "loss": 0.0311, + "step": 2334 + }, + { + "epoch": 9.34, + "grad_norm": 0.9590717554092407, + "learning_rate": 2.6723446893787575e-05, + "loss": 0.0316, + "step": 2335 + }, + { + "epoch": 9.344, + "grad_norm": 1.2413767576217651, + "learning_rate": 2.671342685370742e-05, + "loss": 0.0373, + "step": 2336 + }, + { + "epoch": 9.348, + "grad_norm": 1.1042641401290894, + "learning_rate": 2.6703406813627258e-05, + "loss": 0.0325, + "step": 2337 + }, + { + "epoch": 9.352, + "grad_norm": 0.944624125957489, + "learning_rate": 2.6693386773547095e-05, + "loss": 0.0273, + "step": 2338 + }, + { + "epoch": 9.356, + "grad_norm": 1.1185686588287354, + "learning_rate": 2.6683366733466937e-05, + "loss": 0.0377, + "step": 2339 + }, + { + "epoch": 9.36, + "grad_norm": 1.205579400062561, + "learning_rate": 2.6673346693386775e-05, + "loss": 0.0345, + "step": 2340 + }, + { + "epoch": 9.364, + "grad_norm": 1.2945799827575684, + "learning_rate": 2.6663326653306613e-05, + "loss": 0.0324, + "step": 2341 + }, + { + "epoch": 
9.368, + "grad_norm": 0.9837092161178589, + "learning_rate": 2.6653306613226454e-05, + "loss": 0.0276, + "step": 2342 + }, + { + "epoch": 9.372, + "grad_norm": 1.1606407165527344, + "learning_rate": 2.6643286573146292e-05, + "loss": 0.0331, + "step": 2343 + }, + { + "epoch": 9.376, + "grad_norm": 1.0431427955627441, + "learning_rate": 2.663326653306613e-05, + "loss": 0.0295, + "step": 2344 + }, + { + "epoch": 9.38, + "grad_norm": 1.0889356136322021, + "learning_rate": 2.6623246492985974e-05, + "loss": 0.0291, + "step": 2345 + }, + { + "epoch": 9.384, + "grad_norm": 0.9746689200401306, + "learning_rate": 2.6613226452905816e-05, + "loss": 0.0281, + "step": 2346 + }, + { + "epoch": 9.388, + "grad_norm": 1.1333224773406982, + "learning_rate": 2.6603206412825653e-05, + "loss": 0.0306, + "step": 2347 + }, + { + "epoch": 9.392, + "grad_norm": 1.082342505455017, + "learning_rate": 2.6593186372745495e-05, + "loss": 0.032, + "step": 2348 + }, + { + "epoch": 9.396, + "grad_norm": 1.0573281049728394, + "learning_rate": 2.6583166332665333e-05, + "loss": 0.0261, + "step": 2349 + }, + { + "epoch": 9.4, + "grad_norm": 1.1344096660614014, + "learning_rate": 2.657314629258517e-05, + "loss": 0.03, + "step": 2350 + }, + { + "epoch": 9.404, + "grad_norm": 1.1539257764816284, + "learning_rate": 2.6563126252505012e-05, + "loss": 0.0325, + "step": 2351 + }, + { + "epoch": 9.408, + "grad_norm": 1.318395733833313, + "learning_rate": 2.655310621242485e-05, + "loss": 0.0383, + "step": 2352 + }, + { + "epoch": 9.412, + "grad_norm": 0.6710442900657654, + "learning_rate": 2.6543086172344688e-05, + "loss": 0.0202, + "step": 2353 + }, + { + "epoch": 9.416, + "grad_norm": 1.0864232778549194, + "learning_rate": 2.653306613226453e-05, + "loss": 0.0286, + "step": 2354 + }, + { + "epoch": 9.42, + "grad_norm": 1.046763300895691, + "learning_rate": 2.6523046092184374e-05, + "loss": 0.0292, + "step": 2355 + }, + { + "epoch": 9.424, + "grad_norm": 1.1092429161071777, + "learning_rate": 
2.651302605210421e-05, + "loss": 0.0308, + "step": 2356 + }, + { + "epoch": 9.428, + "grad_norm": 1.0427887439727783, + "learning_rate": 2.650300601202405e-05, + "loss": 0.0282, + "step": 2357 + }, + { + "epoch": 9.432, + "grad_norm": 1.171705722808838, + "learning_rate": 2.649298597194389e-05, + "loss": 0.0326, + "step": 2358 + }, + { + "epoch": 9.436, + "grad_norm": 0.982445240020752, + "learning_rate": 2.648296593186373e-05, + "loss": 0.0275, + "step": 2359 + }, + { + "epoch": 9.44, + "grad_norm": 0.9876815676689148, + "learning_rate": 2.647294589178357e-05, + "loss": 0.0366, + "step": 2360 + }, + { + "epoch": 9.444, + "grad_norm": 0.9229361414909363, + "learning_rate": 2.6462925851703408e-05, + "loss": 0.0284, + "step": 2361 + }, + { + "epoch": 9.448, + "grad_norm": 0.9455786943435669, + "learning_rate": 2.6452905811623246e-05, + "loss": 0.0279, + "step": 2362 + }, + { + "epoch": 9.452, + "grad_norm": 0.5432707071304321, + "learning_rate": 2.6442885771543087e-05, + "loss": 0.0153, + "step": 2363 + }, + { + "epoch": 9.456, + "grad_norm": 1.1233261823654175, + "learning_rate": 2.6432865731462925e-05, + "loss": 0.0322, + "step": 2364 + }, + { + "epoch": 9.46, + "grad_norm": 1.0408973693847656, + "learning_rate": 2.642284569138277e-05, + "loss": 0.0334, + "step": 2365 + }, + { + "epoch": 9.464, + "grad_norm": 1.3444018363952637, + "learning_rate": 2.6412825651302607e-05, + "loss": 0.0354, + "step": 2366 + }, + { + "epoch": 9.468, + "grad_norm": 1.0790801048278809, + "learning_rate": 2.640280561122245e-05, + "loss": 0.0291, + "step": 2367 + }, + { + "epoch": 9.472, + "grad_norm": 1.0651098489761353, + "learning_rate": 2.6392785571142287e-05, + "loss": 0.0412, + "step": 2368 + }, + { + "epoch": 9.475999999999999, + "grad_norm": 1.0434693098068237, + "learning_rate": 2.6382765531062124e-05, + "loss": 0.0287, + "step": 2369 + }, + { + "epoch": 9.48, + "grad_norm": 1.1507903337478638, + "learning_rate": 2.6372745490981966e-05, + "loss": 0.032, + "step": 2370 + }, + { + 
"epoch": 9.484, + "grad_norm": 1.0209254026412964, + "learning_rate": 2.6362725450901804e-05, + "loss": 0.0312, + "step": 2371 + }, + { + "epoch": 9.488, + "grad_norm": 1.2612141370773315, + "learning_rate": 2.635270541082164e-05, + "loss": 0.038, + "step": 2372 + }, + { + "epoch": 9.492, + "grad_norm": 0.9201658368110657, + "learning_rate": 2.6342685370741483e-05, + "loss": 0.0302, + "step": 2373 + }, + { + "epoch": 9.496, + "grad_norm": 1.2548552751541138, + "learning_rate": 2.633266533066132e-05, + "loss": 0.0356, + "step": 2374 + }, + { + "epoch": 9.5, + "grad_norm": 0.9841686487197876, + "learning_rate": 2.6322645290581165e-05, + "loss": 0.0262, + "step": 2375 + }, + { + "epoch": 9.504, + "grad_norm": 1.228925108909607, + "learning_rate": 2.6312625250501007e-05, + "loss": 0.0354, + "step": 2376 + }, + { + "epoch": 9.508, + "grad_norm": 1.1918587684631348, + "learning_rate": 2.6302605210420845e-05, + "loss": 0.0334, + "step": 2377 + }, + { + "epoch": 9.512, + "grad_norm": 1.1744786500930786, + "learning_rate": 2.6292585170340682e-05, + "loss": 0.0391, + "step": 2378 + }, + { + "epoch": 9.516, + "grad_norm": 1.0026384592056274, + "learning_rate": 2.6282565130260524e-05, + "loss": 0.0361, + "step": 2379 + }, + { + "epoch": 9.52, + "grad_norm": 1.3792084455490112, + "learning_rate": 2.627254509018036e-05, + "loss": 0.0323, + "step": 2380 + }, + { + "epoch": 9.524000000000001, + "grad_norm": 0.7252622842788696, + "learning_rate": 2.62625250501002e-05, + "loss": 0.0176, + "step": 2381 + }, + { + "epoch": 9.528, + "grad_norm": 1.0033866167068481, + "learning_rate": 2.625250501002004e-05, + "loss": 0.0307, + "step": 2382 + }, + { + "epoch": 9.532, + "grad_norm": 1.1288398504257202, + "learning_rate": 2.624248496993988e-05, + "loss": 0.0363, + "step": 2383 + }, + { + "epoch": 9.536, + "grad_norm": 1.11295485496521, + "learning_rate": 2.6232464929859717e-05, + "loss": 0.0359, + "step": 2384 + }, + { + "epoch": 9.54, + "grad_norm": 0.9084358215332031, + "learning_rate": 
2.622244488977956e-05, + "loss": 0.026, + "step": 2385 + }, + { + "epoch": 9.544, + "grad_norm": 1.091408371925354, + "learning_rate": 2.6212424849699403e-05, + "loss": 0.0328, + "step": 2386 + }, + { + "epoch": 9.548, + "grad_norm": 1.1012197732925415, + "learning_rate": 2.620240480961924e-05, + "loss": 0.0337, + "step": 2387 + }, + { + "epoch": 9.552, + "grad_norm": 1.2079898118972778, + "learning_rate": 2.619238476953908e-05, + "loss": 0.0318, + "step": 2388 + }, + { + "epoch": 9.556000000000001, + "grad_norm": 0.6078687310218811, + "learning_rate": 2.618236472945892e-05, + "loss": 0.0192, + "step": 2389 + }, + { + "epoch": 9.56, + "grad_norm": 0.9216514825820923, + "learning_rate": 2.6172344689378757e-05, + "loss": 0.0291, + "step": 2390 + }, + { + "epoch": 9.564, + "grad_norm": 0.9543266892433167, + "learning_rate": 2.61623246492986e-05, + "loss": 0.0283, + "step": 2391 + }, + { + "epoch": 9.568, + "grad_norm": 1.2358766794204712, + "learning_rate": 2.6152304609218437e-05, + "loss": 0.0338, + "step": 2392 + }, + { + "epoch": 9.572, + "grad_norm": 1.0046981573104858, + "learning_rate": 2.6142284569138275e-05, + "loss": 0.0234, + "step": 2393 + }, + { + "epoch": 9.576, + "grad_norm": 1.3283228874206543, + "learning_rate": 2.6132264529058116e-05, + "loss": 0.0488, + "step": 2394 + }, + { + "epoch": 9.58, + "grad_norm": 0.8663226962089539, + "learning_rate": 2.612224448897796e-05, + "loss": 0.0219, + "step": 2395 + }, + { + "epoch": 9.584, + "grad_norm": 1.2725272178649902, + "learning_rate": 2.61122244488978e-05, + "loss": 0.037, + "step": 2396 + }, + { + "epoch": 9.588, + "grad_norm": 1.0414756536483765, + "learning_rate": 2.6102204408817636e-05, + "loss": 0.0337, + "step": 2397 + }, + { + "epoch": 9.592, + "grad_norm": 1.0537278652191162, + "learning_rate": 2.6092184368737478e-05, + "loss": 0.0296, + "step": 2398 + }, + { + "epoch": 9.596, + "grad_norm": 1.2478158473968506, + "learning_rate": 2.6082164328657315e-05, + "loss": 0.0344, + "step": 2399 + }, + { + 
"epoch": 9.6, + "grad_norm": 1.1711362600326538, + "learning_rate": 2.6072144288577153e-05, + "loss": 0.036, + "step": 2400 + }, + { + "epoch": 9.604, + "grad_norm": 1.2118823528289795, + "learning_rate": 2.6062124248496995e-05, + "loss": 0.0284, + "step": 2401 + }, + { + "epoch": 9.608, + "grad_norm": 1.1261894702911377, + "learning_rate": 2.6052104208416833e-05, + "loss": 0.0339, + "step": 2402 + }, + { + "epoch": 9.612, + "grad_norm": 1.8169751167297363, + "learning_rate": 2.6042084168336674e-05, + "loss": 0.0326, + "step": 2403 + }, + { + "epoch": 9.616, + "grad_norm": 1.1546857357025146, + "learning_rate": 2.603206412825652e-05, + "loss": 0.0358, + "step": 2404 + }, + { + "epoch": 9.62, + "grad_norm": 1.1949758529663086, + "learning_rate": 2.6022044088176356e-05, + "loss": 0.0337, + "step": 2405 + }, + { + "epoch": 9.624, + "grad_norm": 0.9737666249275208, + "learning_rate": 2.6012024048096194e-05, + "loss": 0.0294, + "step": 2406 + }, + { + "epoch": 9.628, + "grad_norm": 1.1526328325271606, + "learning_rate": 2.6002004008016036e-05, + "loss": 0.0343, + "step": 2407 + }, + { + "epoch": 9.632, + "grad_norm": 1.266064167022705, + "learning_rate": 2.5991983967935873e-05, + "loss": 0.0309, + "step": 2408 + }, + { + "epoch": 9.636, + "grad_norm": 1.0405514240264893, + "learning_rate": 2.598196392785571e-05, + "loss": 0.0325, + "step": 2409 + }, + { + "epoch": 9.64, + "grad_norm": 1.1470423936843872, + "learning_rate": 2.5971943887775553e-05, + "loss": 0.0315, + "step": 2410 + }, + { + "epoch": 9.644, + "grad_norm": 1.1034044027328491, + "learning_rate": 2.596192384769539e-05, + "loss": 0.0325, + "step": 2411 + }, + { + "epoch": 9.648, + "grad_norm": 1.101983666419983, + "learning_rate": 2.595190380761523e-05, + "loss": 0.0281, + "step": 2412 + }, + { + "epoch": 9.652, + "grad_norm": 1.0074468851089478, + "learning_rate": 2.594188376753507e-05, + "loss": 0.0311, + "step": 2413 + }, + { + "epoch": 9.656, + "grad_norm": 1.2595995664596558, + "learning_rate": 
2.5931863727454914e-05, + "loss": 0.0396, + "step": 2414 + }, + { + "epoch": 9.66, + "grad_norm": 1.195958137512207, + "learning_rate": 2.5921843687374752e-05, + "loss": 0.0329, + "step": 2415 + }, + { + "epoch": 9.664, + "grad_norm": 1.3391655683517456, + "learning_rate": 2.5911823647294594e-05, + "loss": 0.0402, + "step": 2416 + }, + { + "epoch": 9.668, + "grad_norm": 1.2114450931549072, + "learning_rate": 2.590180360721443e-05, + "loss": 0.0401, + "step": 2417 + }, + { + "epoch": 9.672, + "grad_norm": 1.2215373516082764, + "learning_rate": 2.589178356713427e-05, + "loss": 0.0325, + "step": 2418 + }, + { + "epoch": 9.676, + "grad_norm": 1.1048833131790161, + "learning_rate": 2.588176352705411e-05, + "loss": 0.0337, + "step": 2419 + }, + { + "epoch": 9.68, + "grad_norm": 0.823464035987854, + "learning_rate": 2.587174348697395e-05, + "loss": 0.0204, + "step": 2420 + }, + { + "epoch": 9.684, + "grad_norm": 1.2144451141357422, + "learning_rate": 2.5861723446893786e-05, + "loss": 0.0337, + "step": 2421 + }, + { + "epoch": 9.688, + "grad_norm": 0.928974986076355, + "learning_rate": 2.5851703406813628e-05, + "loss": 0.0276, + "step": 2422 + }, + { + "epoch": 9.692, + "grad_norm": 1.3975306749343872, + "learning_rate": 2.5841683366733466e-05, + "loss": 0.041, + "step": 2423 + }, + { + "epoch": 9.696, + "grad_norm": 1.0173219442367554, + "learning_rate": 2.583166332665331e-05, + "loss": 0.0308, + "step": 2424 + }, + { + "epoch": 9.7, + "grad_norm": 0.8563686013221741, + "learning_rate": 2.5821643286573148e-05, + "loss": 0.0308, + "step": 2425 + }, + { + "epoch": 9.704, + "grad_norm": 0.930245041847229, + "learning_rate": 2.581162324649299e-05, + "loss": 0.0298, + "step": 2426 + }, + { + "epoch": 9.708, + "grad_norm": 1.0278750658035278, + "learning_rate": 2.5801603206412827e-05, + "loss": 0.0298, + "step": 2427 + }, + { + "epoch": 9.712, + "grad_norm": 0.9554607272148132, + "learning_rate": 2.5791583166332665e-05, + "loss": 0.0321, + "step": 2428 + }, + { + "epoch": 
9.716, + "grad_norm": 0.9845343828201294, + "learning_rate": 2.5781563126252507e-05, + "loss": 0.0365, + "step": 2429 + }, + { + "epoch": 9.72, + "grad_norm": 0.959926187992096, + "learning_rate": 2.5771543086172344e-05, + "loss": 0.0323, + "step": 2430 + }, + { + "epoch": 9.724, + "grad_norm": 1.3230634927749634, + "learning_rate": 2.5761523046092186e-05, + "loss": 0.0397, + "step": 2431 + }, + { + "epoch": 9.728, + "grad_norm": 1.1665371656417847, + "learning_rate": 2.5751503006012024e-05, + "loss": 0.0299, + "step": 2432 + }, + { + "epoch": 9.732, + "grad_norm": 1.1947264671325684, + "learning_rate": 2.574148296593186e-05, + "loss": 0.0344, + "step": 2433 + }, + { + "epoch": 9.736, + "grad_norm": 1.088255763053894, + "learning_rate": 2.5731462925851706e-05, + "loss": 0.0308, + "step": 2434 + }, + { + "epoch": 9.74, + "grad_norm": 1.1948275566101074, + "learning_rate": 2.5721442885771547e-05, + "loss": 0.0433, + "step": 2435 + }, + { + "epoch": 9.744, + "grad_norm": 0.96909099817276, + "learning_rate": 2.5711422845691385e-05, + "loss": 0.0311, + "step": 2436 + }, + { + "epoch": 9.748, + "grad_norm": 1.1078877449035645, + "learning_rate": 2.5701402805611223e-05, + "loss": 0.033, + "step": 2437 + }, + { + "epoch": 9.752, + "grad_norm": 1.0920991897583008, + "learning_rate": 2.5691382765531065e-05, + "loss": 0.0345, + "step": 2438 + }, + { + "epoch": 9.756, + "grad_norm": 1.28229558467865, + "learning_rate": 2.5681362725450902e-05, + "loss": 0.0386, + "step": 2439 + }, + { + "epoch": 9.76, + "grad_norm": 1.235954761505127, + "learning_rate": 2.567134268537074e-05, + "loss": 0.037, + "step": 2440 + }, + { + "epoch": 9.764, + "grad_norm": 1.089188575744629, + "learning_rate": 2.566132264529058e-05, + "loss": 0.0308, + "step": 2441 + }, + { + "epoch": 9.768, + "grad_norm": 1.021936058998108, + "learning_rate": 2.565130260521042e-05, + "loss": 0.0275, + "step": 2442 + }, + { + "epoch": 9.772, + "grad_norm": 1.178109049797058, + "learning_rate": 2.564128256513026e-05, + 
"loss": 0.0328, + "step": 2443 + }, + { + "epoch": 9.776, + "grad_norm": 1.2006083726882935, + "learning_rate": 2.5631262525050102e-05, + "loss": 0.0372, + "step": 2444 + }, + { + "epoch": 9.78, + "grad_norm": 1.1977381706237793, + "learning_rate": 2.5621242484969943e-05, + "loss": 0.0331, + "step": 2445 + }, + { + "epoch": 9.784, + "grad_norm": 1.240316390991211, + "learning_rate": 2.561122244488978e-05, + "loss": 0.0335, + "step": 2446 + }, + { + "epoch": 9.788, + "grad_norm": 1.1987797021865845, + "learning_rate": 2.5601202404809622e-05, + "loss": 0.0341, + "step": 2447 + }, + { + "epoch": 9.792, + "grad_norm": 1.0871502161026, + "learning_rate": 2.559118236472946e-05, + "loss": 0.0343, + "step": 2448 + }, + { + "epoch": 9.796, + "grad_norm": 0.9493705034255981, + "learning_rate": 2.5581162324649298e-05, + "loss": 0.0262, + "step": 2449 + }, + { + "epoch": 9.8, + "grad_norm": 1.1285532712936401, + "learning_rate": 2.557114228456914e-05, + "loss": 0.0334, + "step": 2450 + }, + { + "epoch": 9.804, + "grad_norm": 0.9567793011665344, + "learning_rate": 2.5561122244488977e-05, + "loss": 0.0291, + "step": 2451 + }, + { + "epoch": 9.808, + "grad_norm": 1.1705495119094849, + "learning_rate": 2.5551102204408815e-05, + "loss": 0.0334, + "step": 2452 + }, + { + "epoch": 9.812, + "grad_norm": 1.1043469905853271, + "learning_rate": 2.554108216432866e-05, + "loss": 0.0329, + "step": 2453 + }, + { + "epoch": 9.816, + "grad_norm": 1.5261973142623901, + "learning_rate": 2.55310621242485e-05, + "loss": 0.04, + "step": 2454 + }, + { + "epoch": 9.82, + "grad_norm": 1.164994716644287, + "learning_rate": 2.552104208416834e-05, + "loss": 0.0349, + "step": 2455 + }, + { + "epoch": 9.824, + "grad_norm": 0.960659384727478, + "learning_rate": 2.5511022044088177e-05, + "loss": 0.0251, + "step": 2456 + }, + { + "epoch": 9.828, + "grad_norm": 1.2397003173828125, + "learning_rate": 2.550100200400802e-05, + "loss": 0.0379, + "step": 2457 + }, + { + "epoch": 9.832, + "grad_norm": 
1.0921655893325806, + "learning_rate": 2.5490981963927856e-05, + "loss": 0.0377, + "step": 2458 + }, + { + "epoch": 9.836, + "grad_norm": 1.165229320526123, + "learning_rate": 2.5480961923847698e-05, + "loss": 0.032, + "step": 2459 + }, + { + "epoch": 9.84, + "grad_norm": 1.2369738817214966, + "learning_rate": 2.5470941883767535e-05, + "loss": 0.0372, + "step": 2460 + }, + { + "epoch": 9.844, + "grad_norm": 1.1303399801254272, + "learning_rate": 2.5460921843687373e-05, + "loss": 0.031, + "step": 2461 + }, + { + "epoch": 9.848, + "grad_norm": 1.3156812191009521, + "learning_rate": 2.5450901803607215e-05, + "loss": 0.0334, + "step": 2462 + }, + { + "epoch": 9.852, + "grad_norm": 1.2975099086761475, + "learning_rate": 2.544088176352706e-05, + "loss": 0.0342, + "step": 2463 + }, + { + "epoch": 9.856, + "grad_norm": 1.2751260995864868, + "learning_rate": 2.5430861723446897e-05, + "loss": 0.0391, + "step": 2464 + }, + { + "epoch": 9.86, + "grad_norm": 1.1819888353347778, + "learning_rate": 2.5420841683366735e-05, + "loss": 0.0326, + "step": 2465 + }, + { + "epoch": 9.864, + "grad_norm": 1.6855754852294922, + "learning_rate": 2.5410821643286576e-05, + "loss": 0.0476, + "step": 2466 + }, + { + "epoch": 9.868, + "grad_norm": 1.1449778079986572, + "learning_rate": 2.5400801603206414e-05, + "loss": 0.0332, + "step": 2467 + }, + { + "epoch": 9.872, + "grad_norm": 1.15505850315094, + "learning_rate": 2.5390781563126252e-05, + "loss": 0.0334, + "step": 2468 + }, + { + "epoch": 9.876, + "grad_norm": 1.2710686922073364, + "learning_rate": 2.5380761523046093e-05, + "loss": 0.0317, + "step": 2469 + }, + { + "epoch": 9.88, + "grad_norm": 1.2653512954711914, + "learning_rate": 2.537074148296593e-05, + "loss": 0.0387, + "step": 2470 + }, + { + "epoch": 9.884, + "grad_norm": 1.1764731407165527, + "learning_rate": 2.536072144288577e-05, + "loss": 0.0375, + "step": 2471 + }, + { + "epoch": 9.888, + "grad_norm": 0.7444102764129639, + "learning_rate": 2.535070140280561e-05, + "loss": 
0.0182, + "step": 2472 + }, + { + "epoch": 9.892, + "grad_norm": 1.2663006782531738, + "learning_rate": 2.5340681362725455e-05, + "loss": 0.034, + "step": 2473 + }, + { + "epoch": 9.896, + "grad_norm": 1.14084792137146, + "learning_rate": 2.5330661322645293e-05, + "loss": 0.0365, + "step": 2474 + }, + { + "epoch": 9.9, + "grad_norm": 1.0226420164108276, + "learning_rate": 2.5320641282565134e-05, + "loss": 0.0301, + "step": 2475 + }, + { + "epoch": 9.904, + "grad_norm": 1.0205984115600586, + "learning_rate": 2.5310621242484972e-05, + "loss": 0.0305, + "step": 2476 + }, + { + "epoch": 9.908, + "grad_norm": 1.1715893745422363, + "learning_rate": 2.530060120240481e-05, + "loss": 0.0322, + "step": 2477 + }, + { + "epoch": 9.912, + "grad_norm": 1.4050195217132568, + "learning_rate": 2.529058116232465e-05, + "loss": 0.0434, + "step": 2478 + }, + { + "epoch": 9.916, + "grad_norm": 1.1078695058822632, + "learning_rate": 2.528056112224449e-05, + "loss": 0.0311, + "step": 2479 + }, + { + "epoch": 9.92, + "grad_norm": 1.0815905332565308, + "learning_rate": 2.5270541082164327e-05, + "loss": 0.0372, + "step": 2480 + }, + { + "epoch": 9.924, + "grad_norm": 1.1205084323883057, + "learning_rate": 2.526052104208417e-05, + "loss": 0.0309, + "step": 2481 + }, + { + "epoch": 9.928, + "grad_norm": 1.123581051826477, + "learning_rate": 2.5250501002004006e-05, + "loss": 0.0332, + "step": 2482 + }, + { + "epoch": 9.932, + "grad_norm": 1.1913464069366455, + "learning_rate": 2.524048096192385e-05, + "loss": 0.0344, + "step": 2483 + }, + { + "epoch": 9.936, + "grad_norm": 1.033087134361267, + "learning_rate": 2.523046092184369e-05, + "loss": 0.0326, + "step": 2484 + }, + { + "epoch": 9.94, + "grad_norm": 1.0734940767288208, + "learning_rate": 2.522044088176353e-05, + "loss": 0.0364, + "step": 2485 + }, + { + "epoch": 9.943999999999999, + "grad_norm": 1.0637986660003662, + "learning_rate": 2.5210420841683368e-05, + "loss": 0.038, + "step": 2486 + }, + { + "epoch": 9.948, + "grad_norm": 
1.1711645126342773, + "learning_rate": 2.520040080160321e-05, + "loss": 0.0343, + "step": 2487 + }, + { + "epoch": 9.952, + "grad_norm": 1.1081736087799072, + "learning_rate": 2.5190380761523047e-05, + "loss": 0.032, + "step": 2488 + }, + { + "epoch": 9.956, + "grad_norm": 1.0930702686309814, + "learning_rate": 2.5180360721442885e-05, + "loss": 0.0389, + "step": 2489 + }, + { + "epoch": 9.96, + "grad_norm": 1.1130561828613281, + "learning_rate": 2.5170340681362726e-05, + "loss": 0.0332, + "step": 2490 + }, + { + "epoch": 9.964, + "grad_norm": 1.0953967571258545, + "learning_rate": 2.5160320641282564e-05, + "loss": 0.0427, + "step": 2491 + }, + { + "epoch": 9.968, + "grad_norm": 1.0438133478164673, + "learning_rate": 2.5150300601202402e-05, + "loss": 0.0337, + "step": 2492 + }, + { + "epoch": 9.972, + "grad_norm": 1.1139825582504272, + "learning_rate": 2.5140280561122247e-05, + "loss": 0.0326, + "step": 2493 + }, + { + "epoch": 9.975999999999999, + "grad_norm": 1.1483579874038696, + "learning_rate": 2.5130260521042088e-05, + "loss": 0.0321, + "step": 2494 + }, + { + "epoch": 9.98, + "grad_norm": 0.9585171937942505, + "learning_rate": 2.5120240480961926e-05, + "loss": 0.0264, + "step": 2495 + }, + { + "epoch": 9.984, + "grad_norm": 1.2960076332092285, + "learning_rate": 2.5110220440881764e-05, + "loss": 0.0475, + "step": 2496 + }, + { + "epoch": 9.988, + "grad_norm": 1.278764009475708, + "learning_rate": 2.5100200400801605e-05, + "loss": 0.0376, + "step": 2497 + }, + { + "epoch": 9.992, + "grad_norm": 1.109755039215088, + "learning_rate": 2.5090180360721443e-05, + "loss": 0.0304, + "step": 2498 + }, + { + "epoch": 9.996, + "grad_norm": 1.2089651823043823, + "learning_rate": 2.508016032064128e-05, + "loss": 0.0345, + "step": 2499 + }, + { + "epoch": 10.0, + "grad_norm": 1.3059208393096924, + "learning_rate": 2.5070140280561122e-05, + "loss": 0.044, + "step": 2500 + }, + { + "epoch": 10.004, + "grad_norm": 0.838786780834198, + "learning_rate": 2.506012024048096e-05, + 
"loss": 0.0223, + "step": 2501 + }, + { + "epoch": 10.008, + "grad_norm": 0.9346208572387695, + "learning_rate": 2.50501002004008e-05, + "loss": 0.0308, + "step": 2502 + }, + { + "epoch": 10.012, + "grad_norm": 0.627692699432373, + "learning_rate": 2.5040080160320646e-05, + "loss": 0.0171, + "step": 2503 + }, + { + "epoch": 10.016, + "grad_norm": 0.789028525352478, + "learning_rate": 2.5030060120240484e-05, + "loss": 0.0211, + "step": 2504 + }, + { + "epoch": 10.02, + "grad_norm": 0.8333077430725098, + "learning_rate": 2.5020040080160322e-05, + "loss": 0.0253, + "step": 2505 + }, + { + "epoch": 10.024, + "grad_norm": 0.7909857630729675, + "learning_rate": 2.5010020040080163e-05, + "loss": 0.0231, + "step": 2506 + }, + { + "epoch": 10.028, + "grad_norm": 0.9159106016159058, + "learning_rate": 2.5e-05, + "loss": 0.0219, + "step": 2507 + }, + { + "epoch": 10.032, + "grad_norm": 0.5722668766975403, + "learning_rate": 2.498997995991984e-05, + "loss": 0.0185, + "step": 2508 + }, + { + "epoch": 10.036, + "grad_norm": 0.8425304889678955, + "learning_rate": 2.497995991983968e-05, + "loss": 0.0223, + "step": 2509 + }, + { + "epoch": 10.04, + "grad_norm": 0.9075009226799011, + "learning_rate": 2.496993987975952e-05, + "loss": 0.0228, + "step": 2510 + }, + { + "epoch": 10.044, + "grad_norm": 0.7858025431632996, + "learning_rate": 2.495991983967936e-05, + "loss": 0.0196, + "step": 2511 + }, + { + "epoch": 10.048, + "grad_norm": 0.6345151662826538, + "learning_rate": 2.49498997995992e-05, + "loss": 0.0161, + "step": 2512 + }, + { + "epoch": 10.052, + "grad_norm": 0.6853556036949158, + "learning_rate": 2.493987975951904e-05, + "loss": 0.0174, + "step": 2513 + }, + { + "epoch": 10.056, + "grad_norm": 0.7381585240364075, + "learning_rate": 2.4929859719438877e-05, + "loss": 0.0182, + "step": 2514 + }, + { + "epoch": 10.06, + "grad_norm": 1.0127655267715454, + "learning_rate": 2.491983967935872e-05, + "loss": 0.0226, + "step": 2515 + }, + { + "epoch": 10.064, + "grad_norm": 
1.03529691696167, + "learning_rate": 2.490981963927856e-05, + "loss": 0.0265, + "step": 2516 + }, + { + "epoch": 10.068, + "grad_norm": 1.1951006650924683, + "learning_rate": 2.4899799599198397e-05, + "loss": 0.0231, + "step": 2517 + }, + { + "epoch": 10.072, + "grad_norm": 0.7003726959228516, + "learning_rate": 2.488977955911824e-05, + "loss": 0.0186, + "step": 2518 + }, + { + "epoch": 10.076, + "grad_norm": 0.7388056516647339, + "learning_rate": 2.4879759519038076e-05, + "loss": 0.0185, + "step": 2519 + }, + { + "epoch": 10.08, + "grad_norm": 0.766671895980835, + "learning_rate": 2.4869739478957918e-05, + "loss": 0.0196, + "step": 2520 + }, + { + "epoch": 10.084, + "grad_norm": 0.6915393471717834, + "learning_rate": 2.4859719438877755e-05, + "loss": 0.0161, + "step": 2521 + }, + { + "epoch": 10.088, + "grad_norm": 0.831326961517334, + "learning_rate": 2.4849699398797597e-05, + "loss": 0.0222, + "step": 2522 + }, + { + "epoch": 10.092, + "grad_norm": 0.8071035146713257, + "learning_rate": 2.4839679358717435e-05, + "loss": 0.0198, + "step": 2523 + }, + { + "epoch": 10.096, + "grad_norm": 0.8890426754951477, + "learning_rate": 2.4829659318637276e-05, + "loss": 0.0222, + "step": 2524 + }, + { + "epoch": 10.1, + "grad_norm": 0.8120467066764832, + "learning_rate": 2.4819639278557117e-05, + "loss": 0.0193, + "step": 2525 + }, + { + "epoch": 10.104, + "grad_norm": 0.7911978960037231, + "learning_rate": 2.4809619238476955e-05, + "loss": 0.0228, + "step": 2526 + }, + { + "epoch": 10.108, + "grad_norm": 0.82595294713974, + "learning_rate": 2.4799599198396793e-05, + "loss": 0.0213, + "step": 2527 + }, + { + "epoch": 10.112, + "grad_norm": 0.40247252583503723, + "learning_rate": 2.4789579158316634e-05, + "loss": 0.0098, + "step": 2528 + }, + { + "epoch": 10.116, + "grad_norm": 1.0697221755981445, + "learning_rate": 2.4779559118236472e-05, + "loss": 0.0252, + "step": 2529 + }, + { + "epoch": 10.12, + "grad_norm": 0.8177196979522705, + "learning_rate": 2.4769539078156313e-05, + 
"loss": 0.0214, + "step": 2530 + }, + { + "epoch": 10.124, + "grad_norm": 0.6448612213134766, + "learning_rate": 2.4759519038076155e-05, + "loss": 0.0163, + "step": 2531 + }, + { + "epoch": 10.128, + "grad_norm": 0.7582941055297852, + "learning_rate": 2.4749498997995993e-05, + "loss": 0.0191, + "step": 2532 + }, + { + "epoch": 10.132, + "grad_norm": 0.6828077435493469, + "learning_rate": 2.473947895791583e-05, + "loss": 0.0174, + "step": 2533 + }, + { + "epoch": 10.136, + "grad_norm": 0.7914385199546814, + "learning_rate": 2.4729458917835672e-05, + "loss": 0.0198, + "step": 2534 + }, + { + "epoch": 10.14, + "grad_norm": 0.6168143153190613, + "learning_rate": 2.4719438877755513e-05, + "loss": 0.0171, + "step": 2535 + }, + { + "epoch": 10.144, + "grad_norm": 0.7246816158294678, + "learning_rate": 2.470941883767535e-05, + "loss": 0.0207, + "step": 2536 + }, + { + "epoch": 10.148, + "grad_norm": 1.0454907417297363, + "learning_rate": 2.4699398797595192e-05, + "loss": 0.025, + "step": 2537 + }, + { + "epoch": 10.152, + "grad_norm": 0.9220485091209412, + "learning_rate": 2.468937875751503e-05, + "loss": 0.0234, + "step": 2538 + }, + { + "epoch": 10.156, + "grad_norm": 0.8142296671867371, + "learning_rate": 2.467935871743487e-05, + "loss": 0.0197, + "step": 2539 + }, + { + "epoch": 10.16, + "grad_norm": 0.7292171716690063, + "learning_rate": 2.4669338677354713e-05, + "loss": 0.0197, + "step": 2540 + }, + { + "epoch": 10.164, + "grad_norm": 0.8474435210227966, + "learning_rate": 2.465931863727455e-05, + "loss": 0.0199, + "step": 2541 + }, + { + "epoch": 10.168, + "grad_norm": 0.677284836769104, + "learning_rate": 2.464929859719439e-05, + "loss": 0.0161, + "step": 2542 + }, + { + "epoch": 10.172, + "grad_norm": 0.8956500887870789, + "learning_rate": 2.463927855711423e-05, + "loss": 0.0233, + "step": 2543 + }, + { + "epoch": 10.176, + "grad_norm": 1.009035348892212, + "learning_rate": 2.462925851703407e-05, + "loss": 0.0243, + "step": 2544 + }, + { + "epoch": 10.18, + 
"grad_norm": 0.867087721824646, + "learning_rate": 2.461923847695391e-05, + "loss": 0.0201, + "step": 2545 + }, + { + "epoch": 10.184, + "grad_norm": 1.1653364896774292, + "learning_rate": 2.460921843687375e-05, + "loss": 0.0296, + "step": 2546 + }, + { + "epoch": 10.188, + "grad_norm": 0.7345436215400696, + "learning_rate": 2.4599198396793588e-05, + "loss": 0.0194, + "step": 2547 + }, + { + "epoch": 10.192, + "grad_norm": 0.9385821223258972, + "learning_rate": 2.4589178356713426e-05, + "loss": 0.0223, + "step": 2548 + }, + { + "epoch": 10.196, + "grad_norm": 0.6956146359443665, + "learning_rate": 2.4579158316633267e-05, + "loss": 0.0169, + "step": 2549 + }, + { + "epoch": 10.2, + "grad_norm": 0.7195176482200623, + "learning_rate": 2.456913827655311e-05, + "loss": 0.0189, + "step": 2550 + }, + { + "epoch": 10.204, + "grad_norm": 1.0395700931549072, + "learning_rate": 2.4559118236472946e-05, + "loss": 0.0214, + "step": 2551 + }, + { + "epoch": 10.208, + "grad_norm": 1.0134451389312744, + "learning_rate": 2.4549098196392788e-05, + "loss": 0.0218, + "step": 2552 + }, + { + "epoch": 10.212, + "grad_norm": 1.1786450147628784, + "learning_rate": 2.4539078156312626e-05, + "loss": 0.0177, + "step": 2553 + }, + { + "epoch": 10.216, + "grad_norm": 0.8216089606285095, + "learning_rate": 2.4529058116232467e-05, + "loss": 0.0197, + "step": 2554 + }, + { + "epoch": 10.22, + "grad_norm": 0.5676589608192444, + "learning_rate": 2.4519038076152305e-05, + "loss": 0.0171, + "step": 2555 + }, + { + "epoch": 10.224, + "grad_norm": 0.8638578653335571, + "learning_rate": 2.4509018036072146e-05, + "loss": 0.0225, + "step": 2556 + }, + { + "epoch": 10.228, + "grad_norm": 0.9760416746139526, + "learning_rate": 2.4498997995991984e-05, + "loss": 0.0222, + "step": 2557 + }, + { + "epoch": 10.232, + "grad_norm": 0.8012329339981079, + "learning_rate": 2.4488977955911825e-05, + "loss": 0.0164, + "step": 2558 + }, + { + "epoch": 10.236, + "grad_norm": 0.6655237078666687, + "learning_rate": 
2.4478957915831667e-05, + "loss": 0.0189, + "step": 2559 + }, + { + "epoch": 10.24, + "grad_norm": 1.0602320432662964, + "learning_rate": 2.4468937875751504e-05, + "loss": 0.0271, + "step": 2560 + }, + { + "epoch": 10.244, + "grad_norm": 0.8225724697113037, + "learning_rate": 2.4458917835671342e-05, + "loss": 0.0207, + "step": 2561 + }, + { + "epoch": 10.248, + "grad_norm": 0.783803403377533, + "learning_rate": 2.4448897795591184e-05, + "loss": 0.0186, + "step": 2562 + }, + { + "epoch": 10.252, + "grad_norm": 0.704971969127655, + "learning_rate": 2.443887775551102e-05, + "loss": 0.0179, + "step": 2563 + }, + { + "epoch": 10.256, + "grad_norm": 0.8094855546951294, + "learning_rate": 2.4428857715430863e-05, + "loss": 0.0209, + "step": 2564 + }, + { + "epoch": 10.26, + "grad_norm": 0.8836872577667236, + "learning_rate": 2.4418837675350704e-05, + "loss": 0.0192, + "step": 2565 + }, + { + "epoch": 10.264, + "grad_norm": 0.901753842830658, + "learning_rate": 2.4408817635270542e-05, + "loss": 0.0241, + "step": 2566 + }, + { + "epoch": 10.268, + "grad_norm": 0.7555407881736755, + "learning_rate": 2.439879759519038e-05, + "loss": 0.0191, + "step": 2567 + }, + { + "epoch": 10.272, + "grad_norm": 0.8224442005157471, + "learning_rate": 2.438877755511022e-05, + "loss": 0.0209, + "step": 2568 + }, + { + "epoch": 10.276, + "grad_norm": 0.9603815078735352, + "learning_rate": 2.4378757515030062e-05, + "loss": 0.0205, + "step": 2569 + }, + { + "epoch": 10.28, + "grad_norm": 0.7587599158287048, + "learning_rate": 2.43687374749499e-05, + "loss": 0.0165, + "step": 2570 + }, + { + "epoch": 10.284, + "grad_norm": 0.8181880712509155, + "learning_rate": 2.435871743486974e-05, + "loss": 0.0217, + "step": 2571 + }, + { + "epoch": 10.288, + "grad_norm": 0.8437207341194153, + "learning_rate": 2.434869739478958e-05, + "loss": 0.0209, + "step": 2572 + }, + { + "epoch": 10.292, + "grad_norm": 0.9303336143493652, + "learning_rate": 2.4338677354709417e-05, + "loss": 0.0271, + "step": 2573 + }, + { 
+ "epoch": 10.296, + "grad_norm": 1.0167421102523804, + "learning_rate": 2.4328657314629262e-05, + "loss": 0.0206, + "step": 2574 + }, + { + "epoch": 10.3, + "grad_norm": 0.9930081367492676, + "learning_rate": 2.43186372745491e-05, + "loss": 0.0221, + "step": 2575 + }, + { + "epoch": 10.304, + "grad_norm": 0.6778939962387085, + "learning_rate": 2.4308617234468938e-05, + "loss": 0.0205, + "step": 2576 + }, + { + "epoch": 10.308, + "grad_norm": 0.8133848309516907, + "learning_rate": 2.429859719438878e-05, + "loss": 0.0196, + "step": 2577 + }, + { + "epoch": 10.312, + "grad_norm": 0.7504951357841492, + "learning_rate": 2.4288577154308617e-05, + "loss": 0.0173, + "step": 2578 + }, + { + "epoch": 10.316, + "grad_norm": 1.1020333766937256, + "learning_rate": 2.427855711422846e-05, + "loss": 0.0258, + "step": 2579 + }, + { + "epoch": 10.32, + "grad_norm": 0.807967483997345, + "learning_rate": 2.42685370741483e-05, + "loss": 0.0199, + "step": 2580 + }, + { + "epoch": 10.324, + "grad_norm": 0.9082990884780884, + "learning_rate": 2.4258517034068138e-05, + "loss": 0.0199, + "step": 2581 + }, + { + "epoch": 10.328, + "grad_norm": 0.8345447182655334, + "learning_rate": 2.4248496993987975e-05, + "loss": 0.021, + "step": 2582 + }, + { + "epoch": 10.332, + "grad_norm": 0.9574979543685913, + "learning_rate": 2.4238476953907817e-05, + "loss": 0.0243, + "step": 2583 + }, + { + "epoch": 10.336, + "grad_norm": 0.8273745775222778, + "learning_rate": 2.4228456913827658e-05, + "loss": 0.02, + "step": 2584 + }, + { + "epoch": 10.34, + "grad_norm": 0.695869505405426, + "learning_rate": 2.4218436873747496e-05, + "loss": 0.0154, + "step": 2585 + }, + { + "epoch": 10.344, + "grad_norm": 0.8283644914627075, + "learning_rate": 2.4208416833667337e-05, + "loss": 0.0204, + "step": 2586 + }, + { + "epoch": 10.348, + "grad_norm": 1.0260083675384521, + "learning_rate": 2.4198396793587175e-05, + "loss": 0.0259, + "step": 2587 + }, + { + "epoch": 10.352, + "grad_norm": 0.9744479656219482, + 
"learning_rate": 2.4188376753507013e-05, + "loss": 0.0243, + "step": 2588 + }, + { + "epoch": 10.356, + "grad_norm": 0.937955379486084, + "learning_rate": 2.4178356713426854e-05, + "loss": 0.0223, + "step": 2589 + }, + { + "epoch": 10.36, + "grad_norm": 1.1364651918411255, + "learning_rate": 2.4168336673346696e-05, + "loss": 0.0244, + "step": 2590 + }, + { + "epoch": 10.364, + "grad_norm": 0.9114662408828735, + "learning_rate": 2.4158316633266533e-05, + "loss": 0.027, + "step": 2591 + }, + { + "epoch": 10.368, + "grad_norm": 0.7826032638549805, + "learning_rate": 2.4148296593186375e-05, + "loss": 0.0192, + "step": 2592 + }, + { + "epoch": 10.372, + "grad_norm": 0.45732957124710083, + "learning_rate": 2.4138276553106216e-05, + "loss": 0.0133, + "step": 2593 + }, + { + "epoch": 10.376, + "grad_norm": 0.6898213624954224, + "learning_rate": 2.4128256513026054e-05, + "loss": 0.0214, + "step": 2594 + }, + { + "epoch": 10.38, + "grad_norm": 0.7986359596252441, + "learning_rate": 2.4118236472945892e-05, + "loss": 0.0184, + "step": 2595 + }, + { + "epoch": 10.384, + "grad_norm": 0.9170551896095276, + "learning_rate": 2.4108216432865733e-05, + "loss": 0.0215, + "step": 2596 + }, + { + "epoch": 10.388, + "grad_norm": 0.8053649067878723, + "learning_rate": 2.409819639278557e-05, + "loss": 0.0213, + "step": 2597 + }, + { + "epoch": 10.392, + "grad_norm": 0.8867815136909485, + "learning_rate": 2.4088176352705412e-05, + "loss": 0.0226, + "step": 2598 + }, + { + "epoch": 10.396, + "grad_norm": 0.8279624581336975, + "learning_rate": 2.4078156312625254e-05, + "loss": 0.0209, + "step": 2599 + }, + { + "epoch": 10.4, + "grad_norm": 0.6529654264450073, + "learning_rate": 2.406813627254509e-05, + "loss": 0.0134, + "step": 2600 + }, + { + "epoch": 10.404, + "grad_norm": 0.8741607666015625, + "learning_rate": 2.405811623246493e-05, + "loss": 0.0199, + "step": 2601 + }, + { + "epoch": 10.408, + "grad_norm": 0.9696851968765259, + "learning_rate": 2.404809619238477e-05, + "loss": 0.0212, + 
"step": 2602 + }, + { + "epoch": 10.412, + "grad_norm": 0.7984656095504761, + "learning_rate": 2.4038076152304612e-05, + "loss": 0.0197, + "step": 2603 + }, + { + "epoch": 10.416, + "grad_norm": 0.7449184656143188, + "learning_rate": 2.402805611222445e-05, + "loss": 0.0203, + "step": 2604 + }, + { + "epoch": 10.42, + "grad_norm": 0.7866941094398499, + "learning_rate": 2.401803607214429e-05, + "loss": 0.0214, + "step": 2605 + }, + { + "epoch": 10.424, + "grad_norm": 0.802543580532074, + "learning_rate": 2.400801603206413e-05, + "loss": 0.0216, + "step": 2606 + }, + { + "epoch": 10.428, + "grad_norm": 0.7483913898468018, + "learning_rate": 2.3997995991983967e-05, + "loss": 0.0205, + "step": 2607 + }, + { + "epoch": 10.432, + "grad_norm": 0.8060435652732849, + "learning_rate": 2.398797595190381e-05, + "loss": 0.0199, + "step": 2608 + }, + { + "epoch": 10.436, + "grad_norm": 0.7773402333259583, + "learning_rate": 2.397795591182365e-05, + "loss": 0.0184, + "step": 2609 + }, + { + "epoch": 10.44, + "grad_norm": 0.7629860639572144, + "learning_rate": 2.3967935871743487e-05, + "loss": 0.0191, + "step": 2610 + }, + { + "epoch": 10.444, + "grad_norm": 1.0891491174697876, + "learning_rate": 2.395791583166333e-05, + "loss": 0.0267, + "step": 2611 + }, + { + "epoch": 10.448, + "grad_norm": 0.8873718976974487, + "learning_rate": 2.3947895791583166e-05, + "loss": 0.023, + "step": 2612 + }, + { + "epoch": 10.452, + "grad_norm": 0.9421393871307373, + "learning_rate": 2.3937875751503008e-05, + "loss": 0.0231, + "step": 2613 + }, + { + "epoch": 10.456, + "grad_norm": 0.7414140105247498, + "learning_rate": 2.392785571142285e-05, + "loss": 0.0202, + "step": 2614 + }, + { + "epoch": 10.46, + "grad_norm": 0.9896325469017029, + "learning_rate": 2.3917835671342687e-05, + "loss": 0.0234, + "step": 2615 + }, + { + "epoch": 10.464, + "grad_norm": 0.7930034399032593, + "learning_rate": 2.3907815631262525e-05, + "loss": 0.021, + "step": 2616 + }, + { + "epoch": 10.468, + "grad_norm": 
0.9249135851860046, + "learning_rate": 2.3897795591182366e-05, + "loss": 0.0225, + "step": 2617 + }, + { + "epoch": 10.472, + "grad_norm": 0.837981641292572, + "learning_rate": 2.3887775551102207e-05, + "loss": 0.0232, + "step": 2618 + }, + { + "epoch": 10.475999999999999, + "grad_norm": 1.0759623050689697, + "learning_rate": 2.3877755511022045e-05, + "loss": 0.0241, + "step": 2619 + }, + { + "epoch": 10.48, + "grad_norm": 0.8368483781814575, + "learning_rate": 2.3867735470941887e-05, + "loss": 0.0222, + "step": 2620 + }, + { + "epoch": 10.484, + "grad_norm": 0.9433078169822693, + "learning_rate": 2.3857715430861724e-05, + "loss": 0.0259, + "step": 2621 + }, + { + "epoch": 10.488, + "grad_norm": 0.9696186780929565, + "learning_rate": 2.3847695390781562e-05, + "loss": 0.0223, + "step": 2622 + }, + { + "epoch": 10.492, + "grad_norm": 1.07587468624115, + "learning_rate": 2.3837675350701404e-05, + "loss": 0.0252, + "step": 2623 + }, + { + "epoch": 10.496, + "grad_norm": 0.7479248642921448, + "learning_rate": 2.3827655310621245e-05, + "loss": 0.0196, + "step": 2624 + }, + { + "epoch": 10.5, + "grad_norm": 0.678727924823761, + "learning_rate": 2.3817635270541083e-05, + "loss": 0.0215, + "step": 2625 + }, + { + "epoch": 10.504, + "grad_norm": 0.7351118326187134, + "learning_rate": 2.380761523046092e-05, + "loss": 0.0214, + "step": 2626 + }, + { + "epoch": 10.508, + "grad_norm": 1.21925687789917, + "learning_rate": 2.3797595190380762e-05, + "loss": 0.0252, + "step": 2627 + }, + { + "epoch": 10.512, + "grad_norm": 0.9814172983169556, + "learning_rate": 2.3787575150300603e-05, + "loss": 0.0235, + "step": 2628 + }, + { + "epoch": 10.516, + "grad_norm": 0.880654513835907, + "learning_rate": 2.377755511022044e-05, + "loss": 0.0215, + "step": 2629 + }, + { + "epoch": 10.52, + "grad_norm": 0.9024983644485474, + "learning_rate": 2.3767535070140282e-05, + "loss": 0.0229, + "step": 2630 + }, + { + "epoch": 10.524000000000001, + "grad_norm": 1.2271602153778076, + "learning_rate": 
2.375751503006012e-05, + "loss": 0.0266, + "step": 2631 + }, + { + "epoch": 10.528, + "grad_norm": 1.0690782070159912, + "learning_rate": 2.3747494989979958e-05, + "loss": 0.0242, + "step": 2632 + }, + { + "epoch": 10.532, + "grad_norm": 0.753580629825592, + "learning_rate": 2.3737474949899803e-05, + "loss": 0.0183, + "step": 2633 + }, + { + "epoch": 10.536, + "grad_norm": 0.7326211333274841, + "learning_rate": 2.372745490981964e-05, + "loss": 0.0209, + "step": 2634 + }, + { + "epoch": 10.54, + "grad_norm": 0.9598518013954163, + "learning_rate": 2.371743486973948e-05, + "loss": 0.0246, + "step": 2635 + }, + { + "epoch": 10.544, + "grad_norm": 0.7582380771636963, + "learning_rate": 2.370741482965932e-05, + "loss": 0.0197, + "step": 2636 + }, + { + "epoch": 10.548, + "grad_norm": 0.7412502765655518, + "learning_rate": 2.3697394789579158e-05, + "loss": 0.0207, + "step": 2637 + }, + { + "epoch": 10.552, + "grad_norm": 0.8043134808540344, + "learning_rate": 2.3687374749499e-05, + "loss": 0.0207, + "step": 2638 + }, + { + "epoch": 10.556000000000001, + "grad_norm": 1.084338665008545, + "learning_rate": 2.367735470941884e-05, + "loss": 0.0292, + "step": 2639 + }, + { + "epoch": 10.56, + "grad_norm": 0.871437668800354, + "learning_rate": 2.366733466933868e-05, + "loss": 0.0261, + "step": 2640 + }, + { + "epoch": 10.564, + "grad_norm": 0.8200016617774963, + "learning_rate": 2.3657314629258516e-05, + "loss": 0.0198, + "step": 2641 + }, + { + "epoch": 10.568, + "grad_norm": 0.9645569920539856, + "learning_rate": 2.3647294589178358e-05, + "loss": 0.0223, + "step": 2642 + }, + { + "epoch": 10.572, + "grad_norm": 1.119446873664856, + "learning_rate": 2.36372745490982e-05, + "loss": 0.0252, + "step": 2643 + }, + { + "epoch": 10.576, + "grad_norm": 1.2717159986495972, + "learning_rate": 2.3627254509018037e-05, + "loss": 0.0264, + "step": 2644 + }, + { + "epoch": 10.58, + "grad_norm": 0.7650407552719116, + "learning_rate": 2.3617234468937878e-05, + "loss": 0.0204, + "step": 2645 + 
}, + { + "epoch": 10.584, + "grad_norm": 0.48650267720222473, + "learning_rate": 2.3607214428857716e-05, + "loss": 0.011, + "step": 2646 + }, + { + "epoch": 10.588, + "grad_norm": 1.0167272090911865, + "learning_rate": 2.3597194388777557e-05, + "loss": 0.0281, + "step": 2647 + }, + { + "epoch": 10.592, + "grad_norm": 1.0820997953414917, + "learning_rate": 2.3587174348697395e-05, + "loss": 0.029, + "step": 2648 + }, + { + "epoch": 10.596, + "grad_norm": 0.4999312460422516, + "learning_rate": 2.3577154308617236e-05, + "loss": 0.0094, + "step": 2649 + }, + { + "epoch": 10.6, + "grad_norm": 0.9790526628494263, + "learning_rate": 2.3567134268537074e-05, + "loss": 0.0226, + "step": 2650 + }, + { + "epoch": 10.604, + "grad_norm": 0.9920722842216492, + "learning_rate": 2.3557114228456916e-05, + "loss": 0.0232, + "step": 2651 + }, + { + "epoch": 10.608, + "grad_norm": 0.6009891033172607, + "learning_rate": 2.3547094188376757e-05, + "loss": 0.0203, + "step": 2652 + }, + { + "epoch": 10.612, + "grad_norm": 0.7871741056442261, + "learning_rate": 2.3537074148296595e-05, + "loss": 0.0215, + "step": 2653 + }, + { + "epoch": 10.616, + "grad_norm": 1.217743992805481, + "learning_rate": 2.3527054108216433e-05, + "loss": 0.0251, + "step": 2654 + }, + { + "epoch": 10.62, + "grad_norm": 1.0484158992767334, + "learning_rate": 2.3517034068136274e-05, + "loss": 0.0261, + "step": 2655 + }, + { + "epoch": 10.624, + "grad_norm": 1.014896273612976, + "learning_rate": 2.3507014028056112e-05, + "loss": 0.0233, + "step": 2656 + }, + { + "epoch": 10.628, + "grad_norm": 1.0810414552688599, + "learning_rate": 2.3496993987975953e-05, + "loss": 0.0216, + "step": 2657 + }, + { + "epoch": 10.632, + "grad_norm": 0.7048183083534241, + "learning_rate": 2.3486973947895794e-05, + "loss": 0.0145, + "step": 2658 + }, + { + "epoch": 10.636, + "grad_norm": 1.054413914680481, + "learning_rate": 2.3476953907815632e-05, + "loss": 0.0252, + "step": 2659 + }, + { + "epoch": 10.64, + "grad_norm": 0.8256332874298096, 
+ "learning_rate": 2.346693386773547e-05, + "loss": 0.021, + "step": 2660 + }, + { + "epoch": 10.644, + "grad_norm": 0.9366419911384583, + "learning_rate": 2.345691382765531e-05, + "loss": 0.0245, + "step": 2661 + }, + { + "epoch": 10.648, + "grad_norm": 0.9146870970726013, + "learning_rate": 2.3446893787575153e-05, + "loss": 0.0218, + "step": 2662 + }, + { + "epoch": 10.652, + "grad_norm": 0.9589238166809082, + "learning_rate": 2.343687374749499e-05, + "loss": 0.0272, + "step": 2663 + }, + { + "epoch": 10.656, + "grad_norm": 0.7012674808502197, + "learning_rate": 2.3426853707414832e-05, + "loss": 0.0205, + "step": 2664 + }, + { + "epoch": 10.66, + "grad_norm": 0.872169554233551, + "learning_rate": 2.341683366733467e-05, + "loss": 0.0212, + "step": 2665 + }, + { + "epoch": 10.664, + "grad_norm": 1.1105194091796875, + "learning_rate": 2.3406813627254508e-05, + "loss": 0.0236, + "step": 2666 + }, + { + "epoch": 10.668, + "grad_norm": 0.9917757511138916, + "learning_rate": 2.3396793587174352e-05, + "loss": 0.0274, + "step": 2667 + }, + { + "epoch": 10.672, + "grad_norm": 0.7267541289329529, + "learning_rate": 2.338677354709419e-05, + "loss": 0.0186, + "step": 2668 + }, + { + "epoch": 10.676, + "grad_norm": 0.8156752586364746, + "learning_rate": 2.3376753507014028e-05, + "loss": 0.0192, + "step": 2669 + }, + { + "epoch": 10.68, + "grad_norm": 0.8577165603637695, + "learning_rate": 2.336673346693387e-05, + "loss": 0.0235, + "step": 2670 + }, + { + "epoch": 10.684, + "grad_norm": 0.5355784893035889, + "learning_rate": 2.3356713426853707e-05, + "loss": 0.0113, + "step": 2671 + }, + { + "epoch": 10.688, + "grad_norm": 1.0173070430755615, + "learning_rate": 2.334669338677355e-05, + "loss": 0.0263, + "step": 2672 + }, + { + "epoch": 10.692, + "grad_norm": 0.8700453639030457, + "learning_rate": 2.333667334669339e-05, + "loss": 0.0253, + "step": 2673 + }, + { + "epoch": 10.696, + "grad_norm": 0.8330980539321899, + "learning_rate": 2.3326653306613228e-05, + "loss": 0.0198, + 
"step": 2674 + }, + { + "epoch": 10.7, + "grad_norm": 0.7934946417808533, + "learning_rate": 2.3316633266533066e-05, + "loss": 0.0204, + "step": 2675 + }, + { + "epoch": 10.704, + "grad_norm": 0.7809090614318848, + "learning_rate": 2.3306613226452907e-05, + "loss": 0.026, + "step": 2676 + }, + { + "epoch": 10.708, + "grad_norm": 0.8526442646980286, + "learning_rate": 2.3296593186372748e-05, + "loss": 0.0208, + "step": 2677 + }, + { + "epoch": 10.712, + "grad_norm": 0.9328563213348389, + "learning_rate": 2.3286573146292586e-05, + "loss": 0.0229, + "step": 2678 + }, + { + "epoch": 10.716, + "grad_norm": 0.767686665058136, + "learning_rate": 2.3276553106212427e-05, + "loss": 0.021, + "step": 2679 + }, + { + "epoch": 10.72, + "grad_norm": 1.204643726348877, + "learning_rate": 2.3266533066132265e-05, + "loss": 0.0283, + "step": 2680 + }, + { + "epoch": 10.724, + "grad_norm": 0.8405128121376038, + "learning_rate": 2.3256513026052103e-05, + "loss": 0.0216, + "step": 2681 + }, + { + "epoch": 10.728, + "grad_norm": 0.9929158091545105, + "learning_rate": 2.3246492985971944e-05, + "loss": 0.0236, + "step": 2682 + }, + { + "epoch": 10.732, + "grad_norm": 1.1628848314285278, + "learning_rate": 2.3236472945891786e-05, + "loss": 0.0234, + "step": 2683 + }, + { + "epoch": 10.736, + "grad_norm": 1.0076568126678467, + "learning_rate": 2.3226452905811624e-05, + "loss": 0.0232, + "step": 2684 + }, + { + "epoch": 10.74, + "grad_norm": 0.8345778584480286, + "learning_rate": 2.3216432865731465e-05, + "loss": 0.0221, + "step": 2685 + }, + { + "epoch": 10.744, + "grad_norm": 0.8815706968307495, + "learning_rate": 2.3206412825651303e-05, + "loss": 0.0255, + "step": 2686 + }, + { + "epoch": 10.748, + "grad_norm": 1.1422778367996216, + "learning_rate": 2.3196392785571144e-05, + "loss": 0.0309, + "step": 2687 + }, + { + "epoch": 10.752, + "grad_norm": 0.9792990684509277, + "learning_rate": 2.3186372745490982e-05, + "loss": 0.0271, + "step": 2688 + }, + { + "epoch": 10.756, + "grad_norm": 
0.972955048084259, + "learning_rate": 2.3176352705410823e-05, + "loss": 0.0263, + "step": 2689 + }, + { + "epoch": 10.76, + "grad_norm": 0.9153329133987427, + "learning_rate": 2.316633266533066e-05, + "loss": 0.0213, + "step": 2690 + }, + { + "epoch": 10.764, + "grad_norm": 1.1069750785827637, + "learning_rate": 2.3156312625250502e-05, + "loss": 0.028, + "step": 2691 + }, + { + "epoch": 10.768, + "grad_norm": 1.0217959880828857, + "learning_rate": 2.3146292585170344e-05, + "loss": 0.0308, + "step": 2692 + }, + { + "epoch": 10.772, + "grad_norm": 0.8818560838699341, + "learning_rate": 2.313627254509018e-05, + "loss": 0.0241, + "step": 2693 + }, + { + "epoch": 10.776, + "grad_norm": 1.0862679481506348, + "learning_rate": 2.312625250501002e-05, + "loss": 0.0253, + "step": 2694 + }, + { + "epoch": 10.78, + "grad_norm": 0.8923512697219849, + "learning_rate": 2.311623246492986e-05, + "loss": 0.0232, + "step": 2695 + }, + { + "epoch": 10.784, + "grad_norm": 0.8306083679199219, + "learning_rate": 2.31062124248497e-05, + "loss": 0.0169, + "step": 2696 + }, + { + "epoch": 10.788, + "grad_norm": 0.904339075088501, + "learning_rate": 2.309619238476954e-05, + "loss": 0.0221, + "step": 2697 + }, + { + "epoch": 10.792, + "grad_norm": 1.143362283706665, + "learning_rate": 2.308617234468938e-05, + "loss": 0.0272, + "step": 2698 + }, + { + "epoch": 10.796, + "grad_norm": 0.9651113152503967, + "learning_rate": 2.307615230460922e-05, + "loss": 0.0271, + "step": 2699 + }, + { + "epoch": 10.8, + "grad_norm": 1.1126807928085327, + "learning_rate": 2.3066132264529057e-05, + "loss": 0.0264, + "step": 2700 + }, + { + "epoch": 10.804, + "grad_norm": 1.1432498693466187, + "learning_rate": 2.30561122244489e-05, + "loss": 0.0294, + "step": 2701 + }, + { + "epoch": 10.808, + "grad_norm": 0.907027542591095, + "learning_rate": 2.304609218436874e-05, + "loss": 0.0257, + "step": 2702 + }, + { + "epoch": 10.812, + "grad_norm": 1.038845181465149, + "learning_rate": 2.3036072144288577e-05, + "loss": 
0.0269, + "step": 2703 + }, + { + "epoch": 10.816, + "grad_norm": 0.6149145364761353, + "learning_rate": 2.302605210420842e-05, + "loss": 0.012, + "step": 2704 + }, + { + "epoch": 10.82, + "grad_norm": 0.7798912525177002, + "learning_rate": 2.3016032064128257e-05, + "loss": 0.0213, + "step": 2705 + }, + { + "epoch": 10.824, + "grad_norm": 0.8147614002227783, + "learning_rate": 2.3006012024048098e-05, + "loss": 0.0209, + "step": 2706 + }, + { + "epoch": 10.828, + "grad_norm": 1.100545048713684, + "learning_rate": 2.299599198396794e-05, + "loss": 0.0239, + "step": 2707 + }, + { + "epoch": 10.832, + "grad_norm": 0.9930369853973389, + "learning_rate": 2.2985971943887777e-05, + "loss": 0.0247, + "step": 2708 + }, + { + "epoch": 10.836, + "grad_norm": 0.8570621609687805, + "learning_rate": 2.2975951903807615e-05, + "loss": 0.0212, + "step": 2709 + }, + { + "epoch": 10.84, + "grad_norm": 0.8312192559242249, + "learning_rate": 2.2965931863727456e-05, + "loss": 0.0168, + "step": 2710 + }, + { + "epoch": 10.844, + "grad_norm": 1.1362755298614502, + "learning_rate": 2.2955911823647298e-05, + "loss": 0.0303, + "step": 2711 + }, + { + "epoch": 10.848, + "grad_norm": 1.1022303104400635, + "learning_rate": 2.2945891783567135e-05, + "loss": 0.0293, + "step": 2712 + }, + { + "epoch": 10.852, + "grad_norm": 1.1668660640716553, + "learning_rate": 2.2935871743486977e-05, + "loss": 0.0259, + "step": 2713 + }, + { + "epoch": 10.856, + "grad_norm": 1.326250672340393, + "learning_rate": 2.2925851703406815e-05, + "loss": 0.0259, + "step": 2714 + }, + { + "epoch": 10.86, + "grad_norm": 0.9431809782981873, + "learning_rate": 2.2915831663326653e-05, + "loss": 0.0223, + "step": 2715 + }, + { + "epoch": 10.864, + "grad_norm": 1.0126006603240967, + "learning_rate": 2.2905811623246494e-05, + "loss": 0.0231, + "step": 2716 + }, + { + "epoch": 10.868, + "grad_norm": 0.9708862900733948, + "learning_rate": 2.2895791583166335e-05, + "loss": 0.0237, + "step": 2717 + }, + { + "epoch": 10.872, + 
"grad_norm": 0.9651861786842346, + "learning_rate": 2.2885771543086173e-05, + "loss": 0.0229, + "step": 2718 + }, + { + "epoch": 10.876, + "grad_norm": 0.8365417718887329, + "learning_rate": 2.2875751503006014e-05, + "loss": 0.0233, + "step": 2719 + }, + { + "epoch": 10.88, + "grad_norm": 0.8717615008354187, + "learning_rate": 2.2865731462925852e-05, + "loss": 0.022, + "step": 2720 + }, + { + "epoch": 10.884, + "grad_norm": 1.1816632747650146, + "learning_rate": 2.2855711422845693e-05, + "loss": 0.0273, + "step": 2721 + }, + { + "epoch": 10.888, + "grad_norm": 0.9528033137321472, + "learning_rate": 2.284569138276553e-05, + "loss": 0.0229, + "step": 2722 + }, + { + "epoch": 10.892, + "grad_norm": 1.0653725862503052, + "learning_rate": 2.2835671342685373e-05, + "loss": 0.027, + "step": 2723 + }, + { + "epoch": 10.896, + "grad_norm": 0.5483238101005554, + "learning_rate": 2.282565130260521e-05, + "loss": 0.013, + "step": 2724 + }, + { + "epoch": 10.9, + "grad_norm": 1.0084025859832764, + "learning_rate": 2.281563126252505e-05, + "loss": 0.0233, + "step": 2725 + }, + { + "epoch": 10.904, + "grad_norm": 1.0322855710983276, + "learning_rate": 2.2805611222444893e-05, + "loss": 0.0251, + "step": 2726 + }, + { + "epoch": 10.908, + "grad_norm": 0.9313704967498779, + "learning_rate": 2.279559118236473e-05, + "loss": 0.0234, + "step": 2727 + }, + { + "epoch": 10.912, + "grad_norm": 1.1278380155563354, + "learning_rate": 2.278557114228457e-05, + "loss": 0.0299, + "step": 2728 + }, + { + "epoch": 10.916, + "grad_norm": 1.118483304977417, + "learning_rate": 2.277555110220441e-05, + "loss": 0.0252, + "step": 2729 + }, + { + "epoch": 10.92, + "grad_norm": 0.8251449465751648, + "learning_rate": 2.2765531062124248e-05, + "loss": 0.0198, + "step": 2730 + }, + { + "epoch": 10.924, + "grad_norm": 1.0784047842025757, + "learning_rate": 2.275551102204409e-05, + "loss": 0.0305, + "step": 2731 + }, + { + "epoch": 10.928, + "grad_norm": 1.0076247453689575, + "learning_rate": 
2.274549098196393e-05, + "loss": 0.028, + "step": 2732 + }, + { + "epoch": 10.932, + "grad_norm": 1.17436683177948, + "learning_rate": 2.273547094188377e-05, + "loss": 0.0264, + "step": 2733 + }, + { + "epoch": 10.936, + "grad_norm": 1.0806264877319336, + "learning_rate": 2.2725450901803606e-05, + "loss": 0.025, + "step": 2734 + }, + { + "epoch": 10.94, + "grad_norm": 0.8272656202316284, + "learning_rate": 2.2715430861723448e-05, + "loss": 0.0243, + "step": 2735 + }, + { + "epoch": 10.943999999999999, + "grad_norm": 1.1387969255447388, + "learning_rate": 2.270541082164329e-05, + "loss": 0.0316, + "step": 2736 + }, + { + "epoch": 10.948, + "grad_norm": 0.8464391231536865, + "learning_rate": 2.2695390781563127e-05, + "loss": 0.0245, + "step": 2737 + }, + { + "epoch": 10.952, + "grad_norm": 0.8681789636611938, + "learning_rate": 2.2685370741482968e-05, + "loss": 0.0247, + "step": 2738 + }, + { + "epoch": 10.956, + "grad_norm": 0.7716442942619324, + "learning_rate": 2.2675350701402806e-05, + "loss": 0.0196, + "step": 2739 + }, + { + "epoch": 10.96, + "grad_norm": 0.8722916841506958, + "learning_rate": 2.2665330661322644e-05, + "loss": 0.0243, + "step": 2740 + }, + { + "epoch": 10.964, + "grad_norm": 0.9783482551574707, + "learning_rate": 2.265531062124249e-05, + "loss": 0.025, + "step": 2741 + }, + { + "epoch": 10.968, + "grad_norm": 0.7285762429237366, + "learning_rate": 2.2645290581162327e-05, + "loss": 0.023, + "step": 2742 + }, + { + "epoch": 10.972, + "grad_norm": 1.012169599533081, + "learning_rate": 2.2635270541082164e-05, + "loss": 0.0263, + "step": 2743 + }, + { + "epoch": 10.975999999999999, + "grad_norm": 1.017822265625, + "learning_rate": 2.2625250501002006e-05, + "loss": 0.0249, + "step": 2744 + }, + { + "epoch": 10.98, + "grad_norm": 0.8947110772132874, + "learning_rate": 2.2615230460921844e-05, + "loss": 0.0242, + "step": 2745 + }, + { + "epoch": 10.984, + "grad_norm": 0.8670669198036194, + "learning_rate": 2.2605210420841685e-05, + "loss": 0.0209, + 
"step": 2746 + }, + { + "epoch": 10.988, + "grad_norm": 1.1200064420700073, + "learning_rate": 2.2595190380761526e-05, + "loss": 0.0294, + "step": 2747 + }, + { + "epoch": 10.992, + "grad_norm": 0.7422370910644531, + "learning_rate": 2.2585170340681364e-05, + "loss": 0.0218, + "step": 2748 + }, + { + "epoch": 10.996, + "grad_norm": 1.087212324142456, + "learning_rate": 2.2575150300601202e-05, + "loss": 0.0252, + "step": 2749 + }, + { + "epoch": 11.0, + "grad_norm": 0.9888896942138672, + "learning_rate": 2.2565130260521043e-05, + "loss": 0.0232, + "step": 2750 + }, + { + "epoch": 11.004, + "grad_norm": 0.7371053695678711, + "learning_rate": 2.2555110220440885e-05, + "loss": 0.0197, + "step": 2751 + }, + { + "epoch": 11.008, + "grad_norm": 0.7480749487876892, + "learning_rate": 2.2545090180360722e-05, + "loss": 0.0151, + "step": 2752 + }, + { + "epoch": 11.012, + "grad_norm": 0.47101399302482605, + "learning_rate": 2.253507014028056e-05, + "loss": 0.0126, + "step": 2753 + }, + { + "epoch": 11.016, + "grad_norm": 0.5534325242042542, + "learning_rate": 2.25250501002004e-05, + "loss": 0.0152, + "step": 2754 + }, + { + "epoch": 11.02, + "grad_norm": 0.6473478078842163, + "learning_rate": 2.251503006012024e-05, + "loss": 0.0152, + "step": 2755 + }, + { + "epoch": 11.024, + "grad_norm": 0.6150280833244324, + "learning_rate": 2.250501002004008e-05, + "loss": 0.0164, + "step": 2756 + }, + { + "epoch": 11.028, + "grad_norm": 0.6889513731002808, + "learning_rate": 2.2494989979959922e-05, + "loss": 0.0138, + "step": 2757 + }, + { + "epoch": 11.032, + "grad_norm": 0.7769390344619751, + "learning_rate": 2.248496993987976e-05, + "loss": 0.0163, + "step": 2758 + }, + { + "epoch": 11.036, + "grad_norm": 0.7019205093383789, + "learning_rate": 2.2474949899799598e-05, + "loss": 0.0157, + "step": 2759 + }, + { + "epoch": 11.04, + "grad_norm": 0.966162383556366, + "learning_rate": 2.2464929859719443e-05, + "loss": 0.0205, + "step": 2760 + }, + { + "epoch": 11.044, + "grad_norm": 
0.7418727278709412, + "learning_rate": 2.245490981963928e-05, + "loss": 0.0172, + "step": 2761 + }, + { + "epoch": 11.048, + "grad_norm": 0.5667153596878052, + "learning_rate": 2.244488977955912e-05, + "loss": 0.0099, + "step": 2762 + }, + { + "epoch": 11.052, + "grad_norm": 0.9118229150772095, + "learning_rate": 2.243486973947896e-05, + "loss": 0.0182, + "step": 2763 + }, + { + "epoch": 11.056, + "grad_norm": 0.8428515195846558, + "learning_rate": 2.2424849699398797e-05, + "loss": 0.018, + "step": 2764 + }, + { + "epoch": 11.06, + "grad_norm": 0.5862798094749451, + "learning_rate": 2.241482965931864e-05, + "loss": 0.015, + "step": 2765 + }, + { + "epoch": 11.064, + "grad_norm": 0.6175693273544312, + "learning_rate": 2.240480961923848e-05, + "loss": 0.0171, + "step": 2766 + }, + { + "epoch": 11.068, + "grad_norm": 0.7874165177345276, + "learning_rate": 2.2394789579158318e-05, + "loss": 0.0183, + "step": 2767 + }, + { + "epoch": 11.072, + "grad_norm": 0.9282909631729126, + "learning_rate": 2.2384769539078156e-05, + "loss": 0.0184, + "step": 2768 + }, + { + "epoch": 11.076, + "grad_norm": 0.7215555906295776, + "learning_rate": 2.2374749498997997e-05, + "loss": 0.0185, + "step": 2769 + }, + { + "epoch": 11.08, + "grad_norm": 0.8601921200752258, + "learning_rate": 2.236472945891784e-05, + "loss": 0.0185, + "step": 2770 + }, + { + "epoch": 11.084, + "grad_norm": 0.9709322452545166, + "learning_rate": 2.2354709418837676e-05, + "loss": 0.0185, + "step": 2771 + }, + { + "epoch": 11.088, + "grad_norm": 0.799983561038971, + "learning_rate": 2.2344689378757518e-05, + "loss": 0.0164, + "step": 2772 + }, + { + "epoch": 11.092, + "grad_norm": 0.47060427069664, + "learning_rate": 2.2334669338677355e-05, + "loss": 0.0136, + "step": 2773 + }, + { + "epoch": 11.096, + "grad_norm": 0.8248326182365417, + "learning_rate": 2.2324649298597193e-05, + "loss": 0.018, + "step": 2774 + }, + { + "epoch": 11.1, + "grad_norm": 0.7863957285881042, + "learning_rate": 2.2314629258517035e-05, + 
"loss": 0.0199, + "step": 2775 + }, + { + "epoch": 11.104, + "grad_norm": 0.584855318069458, + "learning_rate": 2.2304609218436876e-05, + "loss": 0.0125, + "step": 2776 + }, + { + "epoch": 11.108, + "grad_norm": 0.7124798893928528, + "learning_rate": 2.2294589178356714e-05, + "loss": 0.0128, + "step": 2777 + }, + { + "epoch": 11.112, + "grad_norm": 0.6670029759407043, + "learning_rate": 2.2284569138276555e-05, + "loss": 0.0167, + "step": 2778 + }, + { + "epoch": 11.116, + "grad_norm": 0.7364521026611328, + "learning_rate": 2.2274549098196393e-05, + "loss": 0.0155, + "step": 2779 + }, + { + "epoch": 11.12, + "grad_norm": 0.7299067974090576, + "learning_rate": 2.2264529058116234e-05, + "loss": 0.0162, + "step": 2780 + }, + { + "epoch": 11.124, + "grad_norm": 0.5108133554458618, + "learning_rate": 2.2254509018036072e-05, + "loss": 0.0162, + "step": 2781 + }, + { + "epoch": 11.128, + "grad_norm": 0.8171691298484802, + "learning_rate": 2.2244488977955913e-05, + "loss": 0.0185, + "step": 2782 + }, + { + "epoch": 11.132, + "grad_norm": 0.7556437253952026, + "learning_rate": 2.223446893787575e-05, + "loss": 0.0181, + "step": 2783 + }, + { + "epoch": 11.136, + "grad_norm": 0.8455727100372314, + "learning_rate": 2.2224448897795593e-05, + "loss": 0.0158, + "step": 2784 + }, + { + "epoch": 11.14, + "grad_norm": 0.7372032999992371, + "learning_rate": 2.2214428857715434e-05, + "loss": 0.0168, + "step": 2785 + }, + { + "epoch": 11.144, + "grad_norm": 0.9267023801803589, + "learning_rate": 2.2204408817635272e-05, + "loss": 0.0162, + "step": 2786 + }, + { + "epoch": 11.148, + "grad_norm": 0.7094591856002808, + "learning_rate": 2.219438877755511e-05, + "loss": 0.0167, + "step": 2787 + }, + { + "epoch": 11.152, + "grad_norm": 0.8599808812141418, + "learning_rate": 2.218436873747495e-05, + "loss": 0.0223, + "step": 2788 + }, + { + "epoch": 11.156, + "grad_norm": 0.6255855560302734, + "learning_rate": 2.217434869739479e-05, + "loss": 0.014, + "step": 2789 + }, + { + "epoch": 11.16, + 
"grad_norm": 0.5172903537750244, + "learning_rate": 2.216432865731463e-05, + "loss": 0.0137, + "step": 2790 + }, + { + "epoch": 11.164, + "grad_norm": 0.971419095993042, + "learning_rate": 2.215430861723447e-05, + "loss": 0.0165, + "step": 2791 + }, + { + "epoch": 11.168, + "grad_norm": 0.8272615671157837, + "learning_rate": 2.214428857715431e-05, + "loss": 0.0165, + "step": 2792 + }, + { + "epoch": 11.172, + "grad_norm": 0.8575184345245361, + "learning_rate": 2.2134268537074147e-05, + "loss": 0.0194, + "step": 2793 + }, + { + "epoch": 11.176, + "grad_norm": 0.54851895570755, + "learning_rate": 2.212424849699399e-05, + "loss": 0.0147, + "step": 2794 + }, + { + "epoch": 11.18, + "grad_norm": 0.6194243431091309, + "learning_rate": 2.211422845691383e-05, + "loss": 0.0155, + "step": 2795 + }, + { + "epoch": 11.184, + "grad_norm": 0.6905847191810608, + "learning_rate": 2.2104208416833668e-05, + "loss": 0.0166, + "step": 2796 + }, + { + "epoch": 11.188, + "grad_norm": 0.7952155470848083, + "learning_rate": 2.209418837675351e-05, + "loss": 0.0172, + "step": 2797 + }, + { + "epoch": 11.192, + "grad_norm": 0.5479825735092163, + "learning_rate": 2.2084168336673347e-05, + "loss": 0.0138, + "step": 2798 + }, + { + "epoch": 11.196, + "grad_norm": 0.7587847709655762, + "learning_rate": 2.2074148296593185e-05, + "loss": 0.0174, + "step": 2799 + }, + { + "epoch": 11.2, + "grad_norm": 0.6983550786972046, + "learning_rate": 2.206412825651303e-05, + "loss": 0.0158, + "step": 2800 + }, + { + "epoch": 11.204, + "grad_norm": 0.7654238939285278, + "learning_rate": 2.2054108216432867e-05, + "loss": 0.0162, + "step": 2801 + }, + { + "epoch": 11.208, + "grad_norm": 0.7754288911819458, + "learning_rate": 2.2044088176352705e-05, + "loss": 0.016, + "step": 2802 + }, + { + "epoch": 11.212, + "grad_norm": 0.8081237077713013, + "learning_rate": 2.2034068136272547e-05, + "loss": 0.0176, + "step": 2803 + }, + { + "epoch": 11.216, + "grad_norm": 0.6752600073814392, + "learning_rate": 
2.2024048096192384e-05, + "loss": 0.0147, + "step": 2804 + }, + { + "epoch": 11.22, + "grad_norm": 0.6993342041969299, + "learning_rate": 2.2014028056112226e-05, + "loss": 0.0159, + "step": 2805 + }, + { + "epoch": 11.224, + "grad_norm": 0.6669374108314514, + "learning_rate": 2.2004008016032067e-05, + "loss": 0.0163, + "step": 2806 + }, + { + "epoch": 11.228, + "grad_norm": 0.6162493228912354, + "learning_rate": 2.1993987975951905e-05, + "loss": 0.0168, + "step": 2807 + }, + { + "epoch": 11.232, + "grad_norm": 1.0216128826141357, + "learning_rate": 2.1983967935871743e-05, + "loss": 0.0179, + "step": 2808 + }, + { + "epoch": 11.236, + "grad_norm": 0.8339212536811829, + "learning_rate": 2.1973947895791584e-05, + "loss": 0.0161, + "step": 2809 + }, + { + "epoch": 11.24, + "grad_norm": 0.6215983033180237, + "learning_rate": 2.1963927855711425e-05, + "loss": 0.0166, + "step": 2810 + }, + { + "epoch": 11.244, + "grad_norm": 0.5474804639816284, + "learning_rate": 2.1953907815631263e-05, + "loss": 0.0149, + "step": 2811 + }, + { + "epoch": 11.248, + "grad_norm": 0.5640460252761841, + "learning_rate": 2.1943887775551105e-05, + "loss": 0.0132, + "step": 2812 + }, + { + "epoch": 11.252, + "grad_norm": 0.8308565616607666, + "learning_rate": 2.1933867735470942e-05, + "loss": 0.0185, + "step": 2813 + }, + { + "epoch": 11.256, + "grad_norm": 1.0242302417755127, + "learning_rate": 2.1923847695390784e-05, + "loss": 0.0197, + "step": 2814 + }, + { + "epoch": 11.26, + "grad_norm": 0.8074948787689209, + "learning_rate": 2.191382765531062e-05, + "loss": 0.0146, + "step": 2815 + }, + { + "epoch": 11.264, + "grad_norm": 0.5657005906105042, + "learning_rate": 2.1903807615230463e-05, + "loss": 0.0135, + "step": 2816 + }, + { + "epoch": 11.268, + "grad_norm": 0.8721190690994263, + "learning_rate": 2.18937875751503e-05, + "loss": 0.0193, + "step": 2817 + }, + { + "epoch": 11.272, + "grad_norm": 0.5107552409172058, + "learning_rate": 2.1883767535070142e-05, + "loss": 0.0158, + "step": 2818 + 
}, + { + "epoch": 11.276, + "grad_norm": 0.747622549533844, + "learning_rate": 2.1873747494989983e-05, + "loss": 0.0152, + "step": 2819 + }, + { + "epoch": 11.28, + "grad_norm": 0.7449367046356201, + "learning_rate": 2.186372745490982e-05, + "loss": 0.0163, + "step": 2820 + }, + { + "epoch": 11.284, + "grad_norm": 0.47581028938293457, + "learning_rate": 2.185370741482966e-05, + "loss": 0.0142, + "step": 2821 + }, + { + "epoch": 11.288, + "grad_norm": 0.5532326698303223, + "learning_rate": 2.18436873747495e-05, + "loss": 0.014, + "step": 2822 + }, + { + "epoch": 11.292, + "grad_norm": 0.7616000175476074, + "learning_rate": 2.1833667334669338e-05, + "loss": 0.0169, + "step": 2823 + }, + { + "epoch": 11.296, + "grad_norm": 0.4382837414741516, + "learning_rate": 2.182364729458918e-05, + "loss": 0.0098, + "step": 2824 + }, + { + "epoch": 11.3, + "grad_norm": 0.6851035952568054, + "learning_rate": 2.181362725450902e-05, + "loss": 0.0172, + "step": 2825 + }, + { + "epoch": 11.304, + "grad_norm": 0.7380544543266296, + "learning_rate": 2.180360721442886e-05, + "loss": 0.0176, + "step": 2826 + }, + { + "epoch": 11.308, + "grad_norm": 0.4597725570201874, + "learning_rate": 2.1793587174348697e-05, + "loss": 0.0134, + "step": 2827 + }, + { + "epoch": 11.312, + "grad_norm": 0.40633803606033325, + "learning_rate": 2.1783567134268538e-05, + "loss": 0.0129, + "step": 2828 + }, + { + "epoch": 11.316, + "grad_norm": 0.9875346422195435, + "learning_rate": 2.177354709418838e-05, + "loss": 0.0103, + "step": 2829 + }, + { + "epoch": 11.32, + "grad_norm": 0.9050310254096985, + "learning_rate": 2.1763527054108217e-05, + "loss": 0.0154, + "step": 2830 + }, + { + "epoch": 11.324, + "grad_norm": 0.7824398279190063, + "learning_rate": 2.175350701402806e-05, + "loss": 0.0186, + "step": 2831 + }, + { + "epoch": 11.328, + "grad_norm": 0.9175853729248047, + "learning_rate": 2.1743486973947896e-05, + "loss": 0.018, + "step": 2832 + }, + { + "epoch": 11.332, + "grad_norm": 0.75944584608078, + 
"learning_rate": 2.1733466933867734e-05, + "loss": 0.0188, + "step": 2833 + }, + { + "epoch": 11.336, + "grad_norm": 0.7953121662139893, + "learning_rate": 2.172344689378758e-05, + "loss": 0.0164, + "step": 2834 + }, + { + "epoch": 11.34, + "grad_norm": 0.662932276725769, + "learning_rate": 2.1713426853707417e-05, + "loss": 0.016, + "step": 2835 + }, + { + "epoch": 11.344, + "grad_norm": 0.6539945602416992, + "learning_rate": 2.1703406813627255e-05, + "loss": 0.0145, + "step": 2836 + }, + { + "epoch": 11.348, + "grad_norm": 0.8975854516029358, + "learning_rate": 2.1693386773547096e-05, + "loss": 0.0186, + "step": 2837 + }, + { + "epoch": 11.352, + "grad_norm": 0.8039414882659912, + "learning_rate": 2.1683366733466934e-05, + "loss": 0.0185, + "step": 2838 + }, + { + "epoch": 11.356, + "grad_norm": 0.891394853591919, + "learning_rate": 2.1673346693386775e-05, + "loss": 0.018, + "step": 2839 + }, + { + "epoch": 11.36, + "grad_norm": 0.6643425226211548, + "learning_rate": 2.1663326653306616e-05, + "loss": 0.0151, + "step": 2840 + }, + { + "epoch": 11.364, + "grad_norm": 0.6176607012748718, + "learning_rate": 2.1653306613226454e-05, + "loss": 0.0145, + "step": 2841 + }, + { + "epoch": 11.368, + "grad_norm": 0.4472271203994751, + "learning_rate": 2.1643286573146292e-05, + "loss": 0.0092, + "step": 2842 + }, + { + "epoch": 11.372, + "grad_norm": 0.7300881743431091, + "learning_rate": 2.1633266533066133e-05, + "loss": 0.0182, + "step": 2843 + }, + { + "epoch": 11.376, + "grad_norm": 0.5038206577301025, + "learning_rate": 2.1623246492985975e-05, + "loss": 0.0171, + "step": 2844 + }, + { + "epoch": 11.38, + "grad_norm": 0.8443142175674438, + "learning_rate": 2.1613226452905813e-05, + "loss": 0.0155, + "step": 2845 + }, + { + "epoch": 11.384, + "grad_norm": 0.7423885464668274, + "learning_rate": 2.1603206412825654e-05, + "loss": 0.0155, + "step": 2846 + }, + { + "epoch": 11.388, + "grad_norm": 0.6411489248275757, + "learning_rate": 2.1593186372745492e-05, + "loss": 0.0157, + 
"step": 2847 + }, + { + "epoch": 11.392, + "grad_norm": 0.7509790062904358, + "learning_rate": 2.158316633266533e-05, + "loss": 0.0163, + "step": 2848 + }, + { + "epoch": 11.396, + "grad_norm": 0.7018547058105469, + "learning_rate": 2.157314629258517e-05, + "loss": 0.019, + "step": 2849 + }, + { + "epoch": 11.4, + "grad_norm": 0.8679090738296509, + "learning_rate": 2.1563126252505012e-05, + "loss": 0.0145, + "step": 2850 + }, + { + "epoch": 11.404, + "grad_norm": 0.8156054019927979, + "learning_rate": 2.155310621242485e-05, + "loss": 0.0157, + "step": 2851 + }, + { + "epoch": 11.408, + "grad_norm": 0.7699236869812012, + "learning_rate": 2.1543086172344688e-05, + "loss": 0.0169, + "step": 2852 + }, + { + "epoch": 11.412, + "grad_norm": 0.9097781777381897, + "learning_rate": 2.153306613226453e-05, + "loss": 0.014, + "step": 2853 + }, + { + "epoch": 11.416, + "grad_norm": 0.3984467387199402, + "learning_rate": 2.152304609218437e-05, + "loss": 0.0126, + "step": 2854 + }, + { + "epoch": 11.42, + "grad_norm": 0.8444249033927917, + "learning_rate": 2.151302605210421e-05, + "loss": 0.0182, + "step": 2855 + }, + { + "epoch": 11.424, + "grad_norm": 0.9840474724769592, + "learning_rate": 2.150300601202405e-05, + "loss": 0.0199, + "step": 2856 + }, + { + "epoch": 11.428, + "grad_norm": 0.778589129447937, + "learning_rate": 2.1492985971943888e-05, + "loss": 0.015, + "step": 2857 + }, + { + "epoch": 11.432, + "grad_norm": 0.5163719654083252, + "learning_rate": 2.1482965931863726e-05, + "loss": 0.0148, + "step": 2858 + }, + { + "epoch": 11.436, + "grad_norm": 0.6548240780830383, + "learning_rate": 2.147294589178357e-05, + "loss": 0.0159, + "step": 2859 + }, + { + "epoch": 11.44, + "grad_norm": 0.6455692648887634, + "learning_rate": 2.1462925851703408e-05, + "loss": 0.0169, + "step": 2860 + }, + { + "epoch": 11.444, + "grad_norm": 0.7446917295455933, + "learning_rate": 2.1452905811623246e-05, + "loss": 0.0189, + "step": 2861 + }, + { + "epoch": 11.448, + "grad_norm": 
0.6257652640342712, + "learning_rate": 2.1442885771543087e-05, + "loss": 0.0153, + "step": 2862 + }, + { + "epoch": 11.452, + "grad_norm": 0.9588707685470581, + "learning_rate": 2.1432865731462925e-05, + "loss": 0.0209, + "step": 2863 + }, + { + "epoch": 11.456, + "grad_norm": 0.825225830078125, + "learning_rate": 2.1422845691382767e-05, + "loss": 0.0163, + "step": 2864 + }, + { + "epoch": 11.46, + "grad_norm": 0.8426507711410522, + "learning_rate": 2.1412825651302608e-05, + "loss": 0.0208, + "step": 2865 + }, + { + "epoch": 11.464, + "grad_norm": 0.6643405556678772, + "learning_rate": 2.1402805611222446e-05, + "loss": 0.0169, + "step": 2866 + }, + { + "epoch": 11.468, + "grad_norm": 0.604695737361908, + "learning_rate": 2.1392785571142284e-05, + "loss": 0.0145, + "step": 2867 + }, + { + "epoch": 11.472, + "grad_norm": 0.6502740383148193, + "learning_rate": 2.1382765531062128e-05, + "loss": 0.0152, + "step": 2868 + }, + { + "epoch": 11.475999999999999, + "grad_norm": 0.47215506434440613, + "learning_rate": 2.1372745490981966e-05, + "loss": 0.0089, + "step": 2869 + }, + { + "epoch": 11.48, + "grad_norm": 0.597215473651886, + "learning_rate": 2.1362725450901804e-05, + "loss": 0.0157, + "step": 2870 + }, + { + "epoch": 11.484, + "grad_norm": 0.5259137153625488, + "learning_rate": 2.1352705410821645e-05, + "loss": 0.0144, + "step": 2871 + }, + { + "epoch": 11.488, + "grad_norm": 0.680182695388794, + "learning_rate": 2.1342685370741483e-05, + "loss": 0.0183, + "step": 2872 + }, + { + "epoch": 11.492, + "grad_norm": 0.8447750210762024, + "learning_rate": 2.1332665330661324e-05, + "loss": 0.0199, + "step": 2873 + }, + { + "epoch": 11.496, + "grad_norm": 0.8049308657646179, + "learning_rate": 2.1322645290581166e-05, + "loss": 0.0175, + "step": 2874 + }, + { + "epoch": 11.5, + "grad_norm": 0.916851818561554, + "learning_rate": 2.1312625250501004e-05, + "loss": 0.0191, + "step": 2875 + }, + { + "epoch": 11.504, + "grad_norm": 0.7663381695747375, + "learning_rate": 
2.130260521042084e-05, + "loss": 0.0162, + "step": 2876 + }, + { + "epoch": 11.508, + "grad_norm": 0.639718770980835, + "learning_rate": 2.1292585170340683e-05, + "loss": 0.0128, + "step": 2877 + }, + { + "epoch": 11.512, + "grad_norm": 0.7572237849235535, + "learning_rate": 2.1282565130260524e-05, + "loss": 0.0167, + "step": 2878 + }, + { + "epoch": 11.516, + "grad_norm": 0.8541759252548218, + "learning_rate": 2.1272545090180362e-05, + "loss": 0.0182, + "step": 2879 + }, + { + "epoch": 11.52, + "grad_norm": 0.8712204098701477, + "learning_rate": 2.12625250501002e-05, + "loss": 0.02, + "step": 2880 + }, + { + "epoch": 11.524000000000001, + "grad_norm": 0.7454221248626709, + "learning_rate": 2.125250501002004e-05, + "loss": 0.0155, + "step": 2881 + }, + { + "epoch": 11.528, + "grad_norm": 0.47309982776641846, + "learning_rate": 2.124248496993988e-05, + "loss": 0.0129, + "step": 2882 + }, + { + "epoch": 11.532, + "grad_norm": 1.0623902082443237, + "learning_rate": 2.123246492985972e-05, + "loss": 0.0201, + "step": 2883 + }, + { + "epoch": 11.536, + "grad_norm": 0.600472092628479, + "learning_rate": 2.122244488977956e-05, + "loss": 0.0149, + "step": 2884 + }, + { + "epoch": 11.54, + "grad_norm": 0.9830151200294495, + "learning_rate": 2.12124248496994e-05, + "loss": 0.0171, + "step": 2885 + }, + { + "epoch": 11.544, + "grad_norm": 0.49249526858329773, + "learning_rate": 2.1202404809619237e-05, + "loss": 0.0116, + "step": 2886 + }, + { + "epoch": 11.548, + "grad_norm": 0.7066675424575806, + "learning_rate": 2.119238476953908e-05, + "loss": 0.0167, + "step": 2887 + }, + { + "epoch": 11.552, + "grad_norm": 0.9759204983711243, + "learning_rate": 2.118236472945892e-05, + "loss": 0.0221, + "step": 2888 + }, + { + "epoch": 11.556000000000001, + "grad_norm": 0.6902356743812561, + "learning_rate": 2.1172344689378758e-05, + "loss": 0.0154, + "step": 2889 + }, + { + "epoch": 11.56, + "grad_norm": 0.7708073854446411, + "learning_rate": 2.11623246492986e-05, + "loss": 0.0142, + 
"step": 2890 + }, + { + "epoch": 11.564, + "grad_norm": 0.6646739840507507, + "learning_rate": 2.1152304609218437e-05, + "loss": 0.0132, + "step": 2891 + }, + { + "epoch": 11.568, + "grad_norm": 0.5692991614341736, + "learning_rate": 2.1142284569138275e-05, + "loss": 0.0158, + "step": 2892 + }, + { + "epoch": 11.572, + "grad_norm": 0.683018147945404, + "learning_rate": 2.113226452905812e-05, + "loss": 0.0158, + "step": 2893 + }, + { + "epoch": 11.576, + "grad_norm": 0.6154299974441528, + "learning_rate": 2.1122244488977958e-05, + "loss": 0.0185, + "step": 2894 + }, + { + "epoch": 11.58, + "grad_norm": 0.9322575926780701, + "learning_rate": 2.1112224448897795e-05, + "loss": 0.0197, + "step": 2895 + }, + { + "epoch": 11.584, + "grad_norm": 0.6337608098983765, + "learning_rate": 2.1102204408817637e-05, + "loss": 0.0163, + "step": 2896 + }, + { + "epoch": 11.588, + "grad_norm": 0.8087349534034729, + "learning_rate": 2.1092184368737475e-05, + "loss": 0.017, + "step": 2897 + }, + { + "epoch": 11.592, + "grad_norm": 0.5522025227546692, + "learning_rate": 2.1082164328657316e-05, + "loss": 0.0155, + "step": 2898 + }, + { + "epoch": 11.596, + "grad_norm": 0.5902281403541565, + "learning_rate": 2.1072144288577157e-05, + "loss": 0.017, + "step": 2899 + }, + { + "epoch": 11.6, + "grad_norm": 0.5093696713447571, + "learning_rate": 2.1062124248496995e-05, + "loss": 0.0154, + "step": 2900 + }, + { + "epoch": 11.604, + "grad_norm": 0.602093517780304, + "learning_rate": 2.1052104208416833e-05, + "loss": 0.0163, + "step": 2901 + }, + { + "epoch": 11.608, + "grad_norm": 0.4907231628894806, + "learning_rate": 2.1042084168336674e-05, + "loss": 0.0105, + "step": 2902 + }, + { + "epoch": 11.612, + "grad_norm": 0.6661096811294556, + "learning_rate": 2.1032064128256516e-05, + "loss": 0.0167, + "step": 2903 + }, + { + "epoch": 11.616, + "grad_norm": 1.0688780546188354, + "learning_rate": 2.1022044088176353e-05, + "loss": 0.021, + "step": 2904 + }, + { + "epoch": 11.62, + "grad_norm": 
0.7456181645393372, + "learning_rate": 2.1012024048096195e-05, + "loss": 0.0168, + "step": 2905 + }, + { + "epoch": 11.624, + "grad_norm": 0.7473933696746826, + "learning_rate": 2.1002004008016033e-05, + "loss": 0.0162, + "step": 2906 + }, + { + "epoch": 11.628, + "grad_norm": 0.539610743522644, + "learning_rate": 2.099198396793587e-05, + "loss": 0.0148, + "step": 2907 + }, + { + "epoch": 11.632, + "grad_norm": 1.1146992444992065, + "learning_rate": 2.0981963927855712e-05, + "loss": 0.0239, + "step": 2908 + }, + { + "epoch": 11.636, + "grad_norm": 0.7290841341018677, + "learning_rate": 2.0971943887775553e-05, + "loss": 0.0145, + "step": 2909 + }, + { + "epoch": 11.64, + "grad_norm": 0.6614787578582764, + "learning_rate": 2.096192384769539e-05, + "loss": 0.0169, + "step": 2910 + }, + { + "epoch": 11.644, + "grad_norm": 0.8798813819885254, + "learning_rate": 2.0951903807615232e-05, + "loss": 0.0196, + "step": 2911 + }, + { + "epoch": 11.648, + "grad_norm": 0.5717126727104187, + "learning_rate": 2.094188376753507e-05, + "loss": 0.0155, + "step": 2912 + }, + { + "epoch": 11.652, + "grad_norm": 1.0631353855133057, + "learning_rate": 2.093186372745491e-05, + "loss": 0.0191, + "step": 2913 + }, + { + "epoch": 11.656, + "grad_norm": 0.7153030633926392, + "learning_rate": 2.092184368737475e-05, + "loss": 0.0173, + "step": 2914 + }, + { + "epoch": 11.66, + "grad_norm": 0.8468221426010132, + "learning_rate": 2.091182364729459e-05, + "loss": 0.0203, + "step": 2915 + }, + { + "epoch": 11.664, + "grad_norm": 0.6937319040298462, + "learning_rate": 2.090180360721443e-05, + "loss": 0.016, + "step": 2916 + }, + { + "epoch": 11.668, + "grad_norm": 0.7014555931091309, + "learning_rate": 2.089178356713427e-05, + "loss": 0.0207, + "step": 2917 + }, + { + "epoch": 11.672, + "grad_norm": 0.7516029477119446, + "learning_rate": 2.088176352705411e-05, + "loss": 0.016, + "step": 2918 + }, + { + "epoch": 11.676, + "grad_norm": 0.8219577074050903, + "learning_rate": 2.087174348697395e-05, + 
"loss": 0.0167, + "step": 2919 + }, + { + "epoch": 11.68, + "grad_norm": 0.5712000131607056, + "learning_rate": 2.0861723446893787e-05, + "loss": 0.0175, + "step": 2920 + }, + { + "epoch": 11.684, + "grad_norm": 0.9513866305351257, + "learning_rate": 2.0851703406813628e-05, + "loss": 0.0197, + "step": 2921 + }, + { + "epoch": 11.688, + "grad_norm": 0.7864239811897278, + "learning_rate": 2.0841683366733466e-05, + "loss": 0.0175, + "step": 2922 + }, + { + "epoch": 11.692, + "grad_norm": 0.7664267420768738, + "learning_rate": 2.0831663326653307e-05, + "loss": 0.0189, + "step": 2923 + }, + { + "epoch": 11.696, + "grad_norm": 0.6849053502082825, + "learning_rate": 2.082164328657315e-05, + "loss": 0.0175, + "step": 2924 + }, + { + "epoch": 11.7, + "grad_norm": 0.7375827431678772, + "learning_rate": 2.0811623246492986e-05, + "loss": 0.0177, + "step": 2925 + }, + { + "epoch": 11.704, + "grad_norm": 0.661365807056427, + "learning_rate": 2.0801603206412824e-05, + "loss": 0.0154, + "step": 2926 + }, + { + "epoch": 11.708, + "grad_norm": 0.5127745866775513, + "learning_rate": 2.079158316633267e-05, + "loss": 0.014, + "step": 2927 + }, + { + "epoch": 11.712, + "grad_norm": 0.6667503714561462, + "learning_rate": 2.0781563126252507e-05, + "loss": 0.0158, + "step": 2928 + }, + { + "epoch": 11.716, + "grad_norm": 0.6455088257789612, + "learning_rate": 2.0771543086172345e-05, + "loss": 0.0156, + "step": 2929 + }, + { + "epoch": 11.72, + "grad_norm": 0.683720588684082, + "learning_rate": 2.0761523046092186e-05, + "loss": 0.015, + "step": 2930 + }, + { + "epoch": 11.724, + "grad_norm": 0.6581544280052185, + "learning_rate": 2.0751503006012024e-05, + "loss": 0.0158, + "step": 2931 + }, + { + "epoch": 11.728, + "grad_norm": 0.5899677276611328, + "learning_rate": 2.0741482965931865e-05, + "loss": 0.0159, + "step": 2932 + }, + { + "epoch": 11.732, + "grad_norm": 0.6744598746299744, + "learning_rate": 2.0731462925851707e-05, + "loss": 0.0157, + "step": 2933 + }, + { + "epoch": 11.736, + 
"grad_norm": 0.8028687834739685, + "learning_rate": 2.0721442885771544e-05, + "loss": 0.0172, + "step": 2934 + }, + { + "epoch": 11.74, + "grad_norm": 0.8576624393463135, + "learning_rate": 2.0711422845691382e-05, + "loss": 0.0201, + "step": 2935 + }, + { + "epoch": 11.744, + "grad_norm": 0.8122572302818298, + "learning_rate": 2.0701402805611224e-05, + "loss": 0.0187, + "step": 2936 + }, + { + "epoch": 11.748, + "grad_norm": 0.5539607405662537, + "learning_rate": 2.0691382765531065e-05, + "loss": 0.0155, + "step": 2937 + }, + { + "epoch": 11.752, + "grad_norm": 0.6936144232749939, + "learning_rate": 2.0681362725450903e-05, + "loss": 0.017, + "step": 2938 + }, + { + "epoch": 11.756, + "grad_norm": 0.8226979970932007, + "learning_rate": 2.0671342685370744e-05, + "loss": 0.0196, + "step": 2939 + }, + { + "epoch": 11.76, + "grad_norm": 0.49609795212745667, + "learning_rate": 2.0661322645290582e-05, + "loss": 0.014, + "step": 2940 + }, + { + "epoch": 11.764, + "grad_norm": 0.8211017847061157, + "learning_rate": 2.065130260521042e-05, + "loss": 0.0176, + "step": 2941 + }, + { + "epoch": 11.768, + "grad_norm": 0.6408782601356506, + "learning_rate": 2.064128256513026e-05, + "loss": 0.0166, + "step": 2942 + }, + { + "epoch": 11.772, + "grad_norm": 0.721869945526123, + "learning_rate": 2.0631262525050102e-05, + "loss": 0.0168, + "step": 2943 + }, + { + "epoch": 11.776, + "grad_norm": 0.7406426668167114, + "learning_rate": 2.062124248496994e-05, + "loss": 0.0172, + "step": 2944 + }, + { + "epoch": 11.78, + "grad_norm": 0.9095814228057861, + "learning_rate": 2.061122244488978e-05, + "loss": 0.0199, + "step": 2945 + }, + { + "epoch": 11.784, + "grad_norm": 0.6619588136672974, + "learning_rate": 2.060120240480962e-05, + "loss": 0.0193, + "step": 2946 + }, + { + "epoch": 11.788, + "grad_norm": 0.8828310966491699, + "learning_rate": 2.059118236472946e-05, + "loss": 0.0172, + "step": 2947 + }, + { + "epoch": 11.792, + "grad_norm": 0.6653797626495361, + "learning_rate": 
2.05811623246493e-05, + "loss": 0.0158, + "step": 2948 + }, + { + "epoch": 11.796, + "grad_norm": 0.78447425365448, + "learning_rate": 2.057114228456914e-05, + "loss": 0.016, + "step": 2949 + }, + { + "epoch": 11.8, + "grad_norm": 0.7116568088531494, + "learning_rate": 2.0561122244488978e-05, + "loss": 0.0183, + "step": 2950 + }, + { + "epoch": 11.804, + "grad_norm": 0.5957525372505188, + "learning_rate": 2.055110220440882e-05, + "loss": 0.0165, + "step": 2951 + }, + { + "epoch": 11.808, + "grad_norm": 0.6143118143081665, + "learning_rate": 2.054108216432866e-05, + "loss": 0.0168, + "step": 2952 + }, + { + "epoch": 11.812, + "grad_norm": 0.8501677513122559, + "learning_rate": 2.05310621242485e-05, + "loss": 0.0147, + "step": 2953 + }, + { + "epoch": 11.816, + "grad_norm": 0.8333978056907654, + "learning_rate": 2.0521042084168336e-05, + "loss": 0.0178, + "step": 2954 + }, + { + "epoch": 11.82, + "grad_norm": 0.6845681667327881, + "learning_rate": 2.0511022044088178e-05, + "loss": 0.0186, + "step": 2955 + }, + { + "epoch": 11.824, + "grad_norm": 0.6612383127212524, + "learning_rate": 2.0501002004008015e-05, + "loss": 0.0161, + "step": 2956 + }, + { + "epoch": 11.828, + "grad_norm": 0.3920148015022278, + "learning_rate": 2.0490981963927857e-05, + "loss": 0.0082, + "step": 2957 + }, + { + "epoch": 11.832, + "grad_norm": 0.9177662134170532, + "learning_rate": 2.0480961923847698e-05, + "loss": 0.0208, + "step": 2958 + }, + { + "epoch": 11.836, + "grad_norm": 0.635342538356781, + "learning_rate": 2.0470941883767536e-05, + "loss": 0.0139, + "step": 2959 + }, + { + "epoch": 11.84, + "grad_norm": 0.9092418551445007, + "learning_rate": 2.0460921843687374e-05, + "loss": 0.0228, + "step": 2960 + }, + { + "epoch": 11.844, + "grad_norm": 0.7522537112236023, + "learning_rate": 2.0450901803607215e-05, + "loss": 0.0165, + "step": 2961 + }, + { + "epoch": 11.848, + "grad_norm": 0.8673317432403564, + "learning_rate": 2.0440881763527056e-05, + "loss": 0.0196, + "step": 2962 + }, + { + 
"epoch": 11.852, + "grad_norm": 0.5442277193069458, + "learning_rate": 2.0430861723446894e-05, + "loss": 0.015, + "step": 2963 + }, + { + "epoch": 11.856, + "grad_norm": 0.8119293451309204, + "learning_rate": 2.0420841683366736e-05, + "loss": 0.0189, + "step": 2964 + }, + { + "epoch": 11.86, + "grad_norm": 0.8945901989936829, + "learning_rate": 2.0410821643286573e-05, + "loss": 0.0165, + "step": 2965 + }, + { + "epoch": 11.864, + "grad_norm": 0.6969497203826904, + "learning_rate": 2.040080160320641e-05, + "loss": 0.0157, + "step": 2966 + }, + { + "epoch": 11.868, + "grad_norm": 0.700520396232605, + "learning_rate": 2.0390781563126256e-05, + "loss": 0.0168, + "step": 2967 + }, + { + "epoch": 11.872, + "grad_norm": 0.679799497127533, + "learning_rate": 2.0380761523046094e-05, + "loss": 0.016, + "step": 2968 + }, + { + "epoch": 11.876, + "grad_norm": 0.6355128884315491, + "learning_rate": 2.0370741482965932e-05, + "loss": 0.0156, + "step": 2969 + }, + { + "epoch": 11.88, + "grad_norm": 0.7955642342567444, + "learning_rate": 2.0360721442885773e-05, + "loss": 0.0174, + "step": 2970 + }, + { + "epoch": 11.884, + "grad_norm": 0.8534349203109741, + "learning_rate": 2.035070140280561e-05, + "loss": 0.0172, + "step": 2971 + }, + { + "epoch": 11.888, + "grad_norm": 0.9360595941543579, + "learning_rate": 2.0340681362725452e-05, + "loss": 0.0177, + "step": 2972 + }, + { + "epoch": 11.892, + "grad_norm": 0.7169991731643677, + "learning_rate": 2.0330661322645294e-05, + "loss": 0.0197, + "step": 2973 + }, + { + "epoch": 11.896, + "grad_norm": 0.853161096572876, + "learning_rate": 2.032064128256513e-05, + "loss": 0.0197, + "step": 2974 + }, + { + "epoch": 11.9, + "grad_norm": 0.714497447013855, + "learning_rate": 2.031062124248497e-05, + "loss": 0.0158, + "step": 2975 + }, + { + "epoch": 11.904, + "grad_norm": 0.8602288365364075, + "learning_rate": 2.030060120240481e-05, + "loss": 0.0176, + "step": 2976 + }, + { + "epoch": 11.908, + "grad_norm": 0.7017549276351929, + 
"learning_rate": 2.0290581162324652e-05, + "loss": 0.0171, + "step": 2977 + }, + { + "epoch": 11.912, + "grad_norm": 0.7314296960830688, + "learning_rate": 2.028056112224449e-05, + "loss": 0.0177, + "step": 2978 + }, + { + "epoch": 11.916, + "grad_norm": 0.5251822471618652, + "learning_rate": 2.0270541082164328e-05, + "loss": 0.0145, + "step": 2979 + }, + { + "epoch": 11.92, + "grad_norm": 0.5692286491394043, + "learning_rate": 2.026052104208417e-05, + "loss": 0.0152, + "step": 2980 + }, + { + "epoch": 11.924, + "grad_norm": 0.9542722105979919, + "learning_rate": 2.025050100200401e-05, + "loss": 0.019, + "step": 2981 + }, + { + "epoch": 11.928, + "grad_norm": 0.4988907277584076, + "learning_rate": 2.0240480961923848e-05, + "loss": 0.0148, + "step": 2982 + }, + { + "epoch": 11.932, + "grad_norm": 0.3393643796443939, + "learning_rate": 2.023046092184369e-05, + "loss": 0.0062, + "step": 2983 + }, + { + "epoch": 11.936, + "grad_norm": 0.6072190999984741, + "learning_rate": 2.0220440881763527e-05, + "loss": 0.0157, + "step": 2984 + }, + { + "epoch": 11.94, + "grad_norm": 0.7978100776672363, + "learning_rate": 2.0210420841683365e-05, + "loss": 0.0168, + "step": 2985 + }, + { + "epoch": 11.943999999999999, + "grad_norm": 0.6525113582611084, + "learning_rate": 2.020040080160321e-05, + "loss": 0.0162, + "step": 2986 + }, + { + "epoch": 11.948, + "grad_norm": 0.7721410989761353, + "learning_rate": 2.0190380761523048e-05, + "loss": 0.0168, + "step": 2987 + }, + { + "epoch": 11.952, + "grad_norm": 0.9068525433540344, + "learning_rate": 2.0180360721442886e-05, + "loss": 0.019, + "step": 2988 + }, + { + "epoch": 11.956, + "grad_norm": 0.6718014478683472, + "learning_rate": 2.0170340681362727e-05, + "loss": 0.0163, + "step": 2989 + }, + { + "epoch": 11.96, + "grad_norm": 0.8195679187774658, + "learning_rate": 2.0160320641282565e-05, + "loss": 0.0176, + "step": 2990 + }, + { + "epoch": 11.964, + "grad_norm": 0.6604552865028381, + "learning_rate": 2.0150300601202406e-05, + "loss": 
0.0153, + "step": 2991 + }, + { + "epoch": 11.968, + "grad_norm": 0.7295999526977539, + "learning_rate": 2.0140280561122247e-05, + "loss": 0.0154, + "step": 2992 + }, + { + "epoch": 11.972, + "grad_norm": 0.9136268496513367, + "learning_rate": 2.0130260521042085e-05, + "loss": 0.0192, + "step": 2993 + }, + { + "epoch": 11.975999999999999, + "grad_norm": 0.8510957956314087, + "learning_rate": 2.0120240480961923e-05, + "loss": 0.0182, + "step": 2994 + }, + { + "epoch": 11.98, + "grad_norm": 0.9759835600852966, + "learning_rate": 2.0110220440881764e-05, + "loss": 0.0199, + "step": 2995 + }, + { + "epoch": 11.984, + "grad_norm": 0.5948774814605713, + "learning_rate": 2.0100200400801606e-05, + "loss": 0.0156, + "step": 2996 + }, + { + "epoch": 11.988, + "grad_norm": 0.6279386281967163, + "learning_rate": 2.0090180360721444e-05, + "loss": 0.0171, + "step": 2997 + }, + { + "epoch": 11.992, + "grad_norm": 0.4541473686695099, + "learning_rate": 2.0080160320641285e-05, + "loss": 0.0098, + "step": 2998 + }, + { + "epoch": 11.996, + "grad_norm": 0.7771435379981995, + "learning_rate": 2.0070140280561123e-05, + "loss": 0.0174, + "step": 2999 + }, + { + "epoch": 12.0, + "grad_norm": 0.8217807412147522, + "learning_rate": 2.006012024048096e-05, + "loss": 0.021, + "step": 3000 + }, + { + "epoch": 12.004, + "grad_norm": 0.6948625445365906, + "learning_rate": 2.0050100200400805e-05, + "loss": 0.0124, + "step": 3001 + }, + { + "epoch": 12.008, + "grad_norm": 0.4800888001918793, + "learning_rate": 2.0040080160320643e-05, + "loss": 0.0112, + "step": 3002 + }, + { + "epoch": 12.012, + "grad_norm": 0.5231291055679321, + "learning_rate": 2.003006012024048e-05, + "loss": 0.0113, + "step": 3003 + }, + { + "epoch": 12.016, + "grad_norm": 0.35470184683799744, + "learning_rate": 2.0020040080160322e-05, + "loss": 0.0113, + "step": 3004 + }, + { + "epoch": 12.02, + "grad_norm": 0.5849162936210632, + "learning_rate": 2.001002004008016e-05, + "loss": 0.0122, + "step": 3005 + }, + { + "epoch": 
12.024, + "grad_norm": 0.5772752165794373, + "learning_rate": 2e-05, + "loss": 0.0126, + "step": 3006 + }, + { + "epoch": 12.028, + "grad_norm": 0.39507535099983215, + "learning_rate": 1.998997995991984e-05, + "loss": 0.0104, + "step": 3007 + }, + { + "epoch": 12.032, + "grad_norm": 0.5409855842590332, + "learning_rate": 1.997995991983968e-05, + "loss": 0.0125, + "step": 3008 + }, + { + "epoch": 12.036, + "grad_norm": 0.5285557508468628, + "learning_rate": 1.996993987975952e-05, + "loss": 0.014, + "step": 3009 + }, + { + "epoch": 12.04, + "grad_norm": 0.520379364490509, + "learning_rate": 1.995991983967936e-05, + "loss": 0.0131, + "step": 3010 + }, + { + "epoch": 12.044, + "grad_norm": 0.5537810325622559, + "learning_rate": 1.99498997995992e-05, + "loss": 0.0127, + "step": 3011 + }, + { + "epoch": 12.048, + "grad_norm": 0.4161956310272217, + "learning_rate": 1.993987975951904e-05, + "loss": 0.0123, + "step": 3012 + }, + { + "epoch": 12.052, + "grad_norm": 0.2786479890346527, + "learning_rate": 1.9929859719438877e-05, + "loss": 0.0063, + "step": 3013 + }, + { + "epoch": 12.056, + "grad_norm": 0.6354433298110962, + "learning_rate": 1.991983967935872e-05, + "loss": 0.0134, + "step": 3014 + }, + { + "epoch": 12.06, + "grad_norm": 0.4777648150920868, + "learning_rate": 1.9909819639278556e-05, + "loss": 0.0113, + "step": 3015 + }, + { + "epoch": 12.064, + "grad_norm": 0.622520923614502, + "learning_rate": 1.9899799599198398e-05, + "loss": 0.0121, + "step": 3016 + }, + { + "epoch": 12.068, + "grad_norm": 0.4313918650150299, + "learning_rate": 1.988977955911824e-05, + "loss": 0.0111, + "step": 3017 + }, + { + "epoch": 12.072, + "grad_norm": 0.35469359159469604, + "learning_rate": 1.9879759519038077e-05, + "loss": 0.0076, + "step": 3018 + }, + { + "epoch": 12.076, + "grad_norm": 0.6942519545555115, + "learning_rate": 1.9869739478957915e-05, + "loss": 0.0121, + "step": 3019 + }, + { + "epoch": 12.08, + "grad_norm": 0.5710741281509399, + "learning_rate": 
1.9859719438877756e-05, + "loss": 0.0127, + "step": 3020 + }, + { + "epoch": 12.084, + "grad_norm": 0.3416895270347595, + "learning_rate": 1.9849699398797597e-05, + "loss": 0.0116, + "step": 3021 + }, + { + "epoch": 12.088, + "grad_norm": 0.5317337512969971, + "learning_rate": 1.9839679358717435e-05, + "loss": 0.0127, + "step": 3022 + }, + { + "epoch": 12.092, + "grad_norm": 0.45818451046943665, + "learning_rate": 1.9829659318637276e-05, + "loss": 0.012, + "step": 3023 + }, + { + "epoch": 12.096, + "grad_norm": 0.564237654209137, + "learning_rate": 1.9819639278557114e-05, + "loss": 0.0112, + "step": 3024 + }, + { + "epoch": 12.1, + "grad_norm": 0.5866971611976624, + "learning_rate": 1.9809619238476952e-05, + "loss": 0.0153, + "step": 3025 + }, + { + "epoch": 12.104, + "grad_norm": 0.44592806696891785, + "learning_rate": 1.9799599198396797e-05, + "loss": 0.0105, + "step": 3026 + }, + { + "epoch": 12.108, + "grad_norm": 0.3792635500431061, + "learning_rate": 1.9789579158316635e-05, + "loss": 0.0102, + "step": 3027 + }, + { + "epoch": 12.112, + "grad_norm": 0.5635868906974792, + "learning_rate": 1.9779559118236473e-05, + "loss": 0.0113, + "step": 3028 + }, + { + "epoch": 12.116, + "grad_norm": 0.6451796293258667, + "learning_rate": 1.9769539078156314e-05, + "loss": 0.012, + "step": 3029 + }, + { + "epoch": 12.12, + "grad_norm": 0.35301733016967773, + "learning_rate": 1.9759519038076152e-05, + "loss": 0.0103, + "step": 3030 + }, + { + "epoch": 12.124, + "grad_norm": 0.4701129198074341, + "learning_rate": 1.9749498997995993e-05, + "loss": 0.0113, + "step": 3031 + }, + { + "epoch": 12.128, + "grad_norm": 0.39844104647636414, + "learning_rate": 1.9739478957915834e-05, + "loss": 0.011, + "step": 3032 + }, + { + "epoch": 12.132, + "grad_norm": 0.5066357254981995, + "learning_rate": 1.9729458917835672e-05, + "loss": 0.0114, + "step": 3033 + }, + { + "epoch": 12.136, + "grad_norm": 0.3592749536037445, + "learning_rate": 1.971943887775551e-05, + "loss": 0.0109, + "step": 3034 
+ }, + { + "epoch": 12.14, + "grad_norm": 0.5839434266090393, + "learning_rate": 1.970941883767535e-05, + "loss": 0.0137, + "step": 3035 + }, + { + "epoch": 12.144, + "grad_norm": 0.35629791021347046, + "learning_rate": 1.9699398797595193e-05, + "loss": 0.0108, + "step": 3036 + }, + { + "epoch": 12.148, + "grad_norm": 0.474680095911026, + "learning_rate": 1.968937875751503e-05, + "loss": 0.0099, + "step": 3037 + }, + { + "epoch": 12.152, + "grad_norm": 0.4389362335205078, + "learning_rate": 1.9679358717434872e-05, + "loss": 0.0121, + "step": 3038 + }, + { + "epoch": 12.156, + "grad_norm": 0.5454397201538086, + "learning_rate": 1.966933867735471e-05, + "loss": 0.012, + "step": 3039 + }, + { + "epoch": 12.16, + "grad_norm": 0.4310266375541687, + "learning_rate": 1.965931863727455e-05, + "loss": 0.0124, + "step": 3040 + }, + { + "epoch": 12.164, + "grad_norm": 0.38319042325019836, + "learning_rate": 1.964929859719439e-05, + "loss": 0.0115, + "step": 3041 + }, + { + "epoch": 12.168, + "grad_norm": 0.5554765462875366, + "learning_rate": 1.963927855711423e-05, + "loss": 0.0123, + "step": 3042 + }, + { + "epoch": 12.172, + "grad_norm": 0.43762677907943726, + "learning_rate": 1.9629258517034068e-05, + "loss": 0.0126, + "step": 3043 + }, + { + "epoch": 12.176, + "grad_norm": 0.6405286192893982, + "learning_rate": 1.961923847695391e-05, + "loss": 0.0109, + "step": 3044 + }, + { + "epoch": 12.18, + "grad_norm": 0.4233160614967346, + "learning_rate": 1.960921843687375e-05, + "loss": 0.0131, + "step": 3045 + }, + { + "epoch": 12.184, + "grad_norm": 0.4447900652885437, + "learning_rate": 1.959919839679359e-05, + "loss": 0.012, + "step": 3046 + }, + { + "epoch": 12.188, + "grad_norm": 0.44814789295196533, + "learning_rate": 1.9589178356713426e-05, + "loss": 0.0134, + "step": 3047 + }, + { + "epoch": 12.192, + "grad_norm": 0.3883339762687683, + "learning_rate": 1.9579158316633268e-05, + "loss": 0.0127, + "step": 3048 + }, + { + "epoch": 12.196, + "grad_norm": 0.476978063583374, + 
"learning_rate": 1.9569138276553106e-05, + "loss": 0.0114, + "step": 3049 + }, + { + "epoch": 12.2, + "grad_norm": 0.41597089171409607, + "learning_rate": 1.9559118236472947e-05, + "loss": 0.0108, + "step": 3050 + }, + { + "epoch": 12.204, + "grad_norm": 0.3592725992202759, + "learning_rate": 1.9549098196392788e-05, + "loss": 0.0114, + "step": 3051 + }, + { + "epoch": 12.208, + "grad_norm": 0.5016531348228455, + "learning_rate": 1.9539078156312626e-05, + "loss": 0.0128, + "step": 3052 + }, + { + "epoch": 12.212, + "grad_norm": 0.46499642729759216, + "learning_rate": 1.9529058116232464e-05, + "loss": 0.01, + "step": 3053 + }, + { + "epoch": 12.216, + "grad_norm": 0.6043004989624023, + "learning_rate": 1.9519038076152305e-05, + "loss": 0.0102, + "step": 3054 + }, + { + "epoch": 12.22, + "grad_norm": 0.48706644773483276, + "learning_rate": 1.9509018036072147e-05, + "loss": 0.0126, + "step": 3055 + }, + { + "epoch": 12.224, + "grad_norm": 0.7386260032653809, + "learning_rate": 1.9498997995991984e-05, + "loss": 0.0117, + "step": 3056 + }, + { + "epoch": 12.228, + "grad_norm": 0.6791260838508606, + "learning_rate": 1.9488977955911826e-05, + "loss": 0.0133, + "step": 3057 + }, + { + "epoch": 12.232, + "grad_norm": 0.38665708899497986, + "learning_rate": 1.9478957915831664e-05, + "loss": 0.012, + "step": 3058 + }, + { + "epoch": 12.236, + "grad_norm": 0.6571320295333862, + "learning_rate": 1.94689378757515e-05, + "loss": 0.0113, + "step": 3059 + }, + { + "epoch": 12.24, + "grad_norm": 0.5152968168258667, + "learning_rate": 1.9458917835671346e-05, + "loss": 0.0121, + "step": 3060 + }, + { + "epoch": 12.244, + "grad_norm": 1.3934143781661987, + "learning_rate": 1.9448897795591184e-05, + "loss": 0.0139, + "step": 3061 + }, + { + "epoch": 12.248, + "grad_norm": 0.47328341007232666, + "learning_rate": 1.9438877755511022e-05, + "loss": 0.0149, + "step": 3062 + }, + { + "epoch": 12.252, + "grad_norm": 0.24610672891139984, + "learning_rate": 1.9428857715430863e-05, + "loss": 
0.0089, + "step": 3063 + }, + { + "epoch": 12.256, + "grad_norm": 0.6528359055519104, + "learning_rate": 1.94188376753507e-05, + "loss": 0.0132, + "step": 3064 + }, + { + "epoch": 12.26, + "grad_norm": 0.5463185906410217, + "learning_rate": 1.9408817635270542e-05, + "loss": 0.0133, + "step": 3065 + }, + { + "epoch": 12.264, + "grad_norm": 0.40765008330345154, + "learning_rate": 1.9398797595190384e-05, + "loss": 0.0109, + "step": 3066 + }, + { + "epoch": 12.268, + "grad_norm": 0.6481624841690063, + "learning_rate": 1.938877755511022e-05, + "loss": 0.0168, + "step": 3067 + }, + { + "epoch": 12.272, + "grad_norm": 0.4352094829082489, + "learning_rate": 1.937875751503006e-05, + "loss": 0.0118, + "step": 3068 + }, + { + "epoch": 12.276, + "grad_norm": 0.3659978210926056, + "learning_rate": 1.93687374749499e-05, + "loss": 0.0123, + "step": 3069 + }, + { + "epoch": 12.28, + "grad_norm": 0.5505935549736023, + "learning_rate": 1.9358717434869742e-05, + "loss": 0.0125, + "step": 3070 + }, + { + "epoch": 12.284, + "grad_norm": 0.5281729102134705, + "learning_rate": 1.934869739478958e-05, + "loss": 0.0158, + "step": 3071 + }, + { + "epoch": 12.288, + "grad_norm": 0.42845743894577026, + "learning_rate": 1.933867735470942e-05, + "loss": 0.0114, + "step": 3072 + }, + { + "epoch": 12.292, + "grad_norm": 0.3950668275356293, + "learning_rate": 1.932865731462926e-05, + "loss": 0.0115, + "step": 3073 + }, + { + "epoch": 12.296, + "grad_norm": 0.6833834052085876, + "learning_rate": 1.9318637274549097e-05, + "loss": 0.0145, + "step": 3074 + }, + { + "epoch": 12.3, + "grad_norm": 0.3856029808521271, + "learning_rate": 1.930861723446894e-05, + "loss": 0.0132, + "step": 3075 + }, + { + "epoch": 12.304, + "grad_norm": 0.4963999092578888, + "learning_rate": 1.929859719438878e-05, + "loss": 0.0123, + "step": 3076 + }, + { + "epoch": 12.308, + "grad_norm": 0.46567070484161377, + "learning_rate": 1.9288577154308618e-05, + "loss": 0.0106, + "step": 3077 + }, + { + "epoch": 12.312, + "grad_norm": 
0.5482111573219299, + "learning_rate": 1.927855711422846e-05, + "loss": 0.0145, + "step": 3078 + }, + { + "epoch": 12.316, + "grad_norm": 0.5668717622756958, + "learning_rate": 1.9268537074148297e-05, + "loss": 0.0119, + "step": 3079 + }, + { + "epoch": 12.32, + "grad_norm": 0.4780943691730499, + "learning_rate": 1.9258517034068138e-05, + "loss": 0.0131, + "step": 3080 + }, + { + "epoch": 12.324, + "grad_norm": 0.5395445227622986, + "learning_rate": 1.9248496993987976e-05, + "loss": 0.0119, + "step": 3081 + }, + { + "epoch": 12.328, + "grad_norm": 0.346057265996933, + "learning_rate": 1.9238476953907817e-05, + "loss": 0.011, + "step": 3082 + }, + { + "epoch": 12.332, + "grad_norm": 0.37034234404563904, + "learning_rate": 1.9228456913827655e-05, + "loss": 0.0119, + "step": 3083 + }, + { + "epoch": 12.336, + "grad_norm": 0.3500770032405853, + "learning_rate": 1.9218436873747493e-05, + "loss": 0.0125, + "step": 3084 + }, + { + "epoch": 12.34, + "grad_norm": 0.6100295782089233, + "learning_rate": 1.9208416833667338e-05, + "loss": 0.0154, + "step": 3085 + }, + { + "epoch": 12.344, + "grad_norm": 0.5396653413772583, + "learning_rate": 1.9198396793587175e-05, + "loss": 0.0139, + "step": 3086 + }, + { + "epoch": 12.348, + "grad_norm": 0.5847299695014954, + "learning_rate": 1.9188376753507013e-05, + "loss": 0.0119, + "step": 3087 + }, + { + "epoch": 12.352, + "grad_norm": 0.38033461570739746, + "learning_rate": 1.9178356713426855e-05, + "loss": 0.012, + "step": 3088 + }, + { + "epoch": 12.356, + "grad_norm": 0.4676550030708313, + "learning_rate": 1.9168336673346693e-05, + "loss": 0.0131, + "step": 3089 + }, + { + "epoch": 12.36, + "grad_norm": 0.40676653385162354, + "learning_rate": 1.9158316633266534e-05, + "loss": 0.0128, + "step": 3090 + }, + { + "epoch": 12.364, + "grad_norm": 0.4821133017539978, + "learning_rate": 1.9148296593186375e-05, + "loss": 0.0127, + "step": 3091 + }, + { + "epoch": 12.368, + "grad_norm": 0.3716961145401001, + "learning_rate": 
1.9138276553106213e-05, + "loss": 0.0123, + "step": 3092 + }, + { + "epoch": 12.372, + "grad_norm": 0.4660569131374359, + "learning_rate": 1.912825651302605e-05, + "loss": 0.0112, + "step": 3093 + }, + { + "epoch": 12.376, + "grad_norm": 0.579636812210083, + "learning_rate": 1.9118236472945896e-05, + "loss": 0.0129, + "step": 3094 + }, + { + "epoch": 12.38, + "grad_norm": 0.373367577791214, + "learning_rate": 1.9108216432865733e-05, + "loss": 0.0112, + "step": 3095 + }, + { + "epoch": 12.384, + "grad_norm": 0.41835731267929077, + "learning_rate": 1.909819639278557e-05, + "loss": 0.0079, + "step": 3096 + }, + { + "epoch": 12.388, + "grad_norm": 0.43698638677597046, + "learning_rate": 1.9088176352705413e-05, + "loss": 0.0112, + "step": 3097 + }, + { + "epoch": 12.392, + "grad_norm": 0.7125802636146545, + "learning_rate": 1.907815631262525e-05, + "loss": 0.0122, + "step": 3098 + }, + { + "epoch": 12.396, + "grad_norm": 0.6462778449058533, + "learning_rate": 1.9068136272545092e-05, + "loss": 0.0124, + "step": 3099 + }, + { + "epoch": 12.4, + "grad_norm": 0.3642807900905609, + "learning_rate": 1.9058116232464933e-05, + "loss": 0.0109, + "step": 3100 + }, + { + "epoch": 12.404, + "grad_norm": 0.44956448674201965, + "learning_rate": 1.904809619238477e-05, + "loss": 0.0127, + "step": 3101 + }, + { + "epoch": 12.408, + "grad_norm": 0.398598849773407, + "learning_rate": 1.903807615230461e-05, + "loss": 0.0076, + "step": 3102 + }, + { + "epoch": 12.412, + "grad_norm": 0.8257546424865723, + "learning_rate": 1.902805611222445e-05, + "loss": 0.0131, + "step": 3103 + }, + { + "epoch": 12.416, + "grad_norm": 0.5510148406028748, + "learning_rate": 1.901803607214429e-05, + "loss": 0.0134, + "step": 3104 + }, + { + "epoch": 12.42, + "grad_norm": 0.415303111076355, + "learning_rate": 1.900801603206413e-05, + "loss": 0.0125, + "step": 3105 + }, + { + "epoch": 12.424, + "grad_norm": 0.36105599999427795, + "learning_rate": 1.8997995991983967e-05, + "loss": 0.0126, + "step": 3106 + }, + { 
+ "epoch": 12.428, + "grad_norm": 0.45794200897216797, + "learning_rate": 1.898797595190381e-05, + "loss": 0.0139, + "step": 3107 + }, + { + "epoch": 12.432, + "grad_norm": 0.5848199129104614, + "learning_rate": 1.8977955911823646e-05, + "loss": 0.0119, + "step": 3108 + }, + { + "epoch": 12.436, + "grad_norm": 0.528476893901825, + "learning_rate": 1.8967935871743488e-05, + "loss": 0.013, + "step": 3109 + }, + { + "epoch": 12.44, + "grad_norm": 0.43130603432655334, + "learning_rate": 1.895791583166333e-05, + "loss": 0.0125, + "step": 3110 + }, + { + "epoch": 12.444, + "grad_norm": 0.5508957505226135, + "learning_rate": 1.8947895791583167e-05, + "loss": 0.014, + "step": 3111 + }, + { + "epoch": 12.448, + "grad_norm": 0.5716979503631592, + "learning_rate": 1.8937875751503005e-05, + "loss": 0.0123, + "step": 3112 + }, + { + "epoch": 12.452, + "grad_norm": 0.5015970468521118, + "learning_rate": 1.8927855711422846e-05, + "loss": 0.0122, + "step": 3113 + }, + { + "epoch": 12.456, + "grad_norm": 0.379427045583725, + "learning_rate": 1.8917835671342687e-05, + "loss": 0.0117, + "step": 3114 + }, + { + "epoch": 12.46, + "grad_norm": 0.3971169888973236, + "learning_rate": 1.8907815631262525e-05, + "loss": 0.0131, + "step": 3115 + }, + { + "epoch": 12.464, + "grad_norm": 0.540484607219696, + "learning_rate": 1.8897795591182367e-05, + "loss": 0.012, + "step": 3116 + }, + { + "epoch": 12.468, + "grad_norm": 0.6403137445449829, + "learning_rate": 1.8887775551102204e-05, + "loss": 0.0119, + "step": 3117 + }, + { + "epoch": 12.472, + "grad_norm": 0.4189258813858032, + "learning_rate": 1.8877755511022042e-05, + "loss": 0.0121, + "step": 3118 + }, + { + "epoch": 12.475999999999999, + "grad_norm": 0.731610119342804, + "learning_rate": 1.8867735470941887e-05, + "loss": 0.0126, + "step": 3119 + }, + { + "epoch": 12.48, + "grad_norm": 0.5180526375770569, + "learning_rate": 1.8857715430861725e-05, + "loss": 0.0113, + "step": 3120 + }, + { + "epoch": 12.484, + "grad_norm": 
0.45408037304878235, + "learning_rate": 1.8847695390781563e-05, + "loss": 0.0114, + "step": 3121 + }, + { + "epoch": 12.488, + "grad_norm": 0.5250622034072876, + "learning_rate": 1.8837675350701404e-05, + "loss": 0.0125, + "step": 3122 + }, + { + "epoch": 12.492, + "grad_norm": 0.41708803176879883, + "learning_rate": 1.8827655310621242e-05, + "loss": 0.0108, + "step": 3123 + }, + { + "epoch": 12.496, + "grad_norm": 0.4630829989910126, + "learning_rate": 1.8817635270541083e-05, + "loss": 0.014, + "step": 3124 + }, + { + "epoch": 12.5, + "grad_norm": 0.5149716138839722, + "learning_rate": 1.8807615230460925e-05, + "loss": 0.0145, + "step": 3125 + }, + { + "epoch": 12.504, + "grad_norm": 0.4337029457092285, + "learning_rate": 1.8797595190380762e-05, + "loss": 0.0129, + "step": 3126 + }, + { + "epoch": 12.508, + "grad_norm": 0.5135932564735413, + "learning_rate": 1.87875751503006e-05, + "loss": 0.0123, + "step": 3127 + }, + { + "epoch": 12.512, + "grad_norm": 0.5922970771789551, + "learning_rate": 1.877755511022044e-05, + "loss": 0.0142, + "step": 3128 + }, + { + "epoch": 12.516, + "grad_norm": 0.6139187216758728, + "learning_rate": 1.8767535070140283e-05, + "loss": 0.0155, + "step": 3129 + }, + { + "epoch": 12.52, + "grad_norm": 0.34099677205085754, + "learning_rate": 1.875751503006012e-05, + "loss": 0.0114, + "step": 3130 + }, + { + "epoch": 12.524000000000001, + "grad_norm": 0.35690101981163025, + "learning_rate": 1.8747494989979962e-05, + "loss": 0.0115, + "step": 3131 + }, + { + "epoch": 12.528, + "grad_norm": 0.4974209666252136, + "learning_rate": 1.87374749498998e-05, + "loss": 0.0166, + "step": 3132 + }, + { + "epoch": 12.532, + "grad_norm": 0.47665518522262573, + "learning_rate": 1.8727454909819638e-05, + "loss": 0.0135, + "step": 3133 + }, + { + "epoch": 12.536, + "grad_norm": 0.45617246627807617, + "learning_rate": 1.871743486973948e-05, + "loss": 0.0135, + "step": 3134 + }, + { + "epoch": 12.54, + "grad_norm": 0.4416828453540802, + "learning_rate": 
1.870741482965932e-05, + "loss": 0.0118, + "step": 3135 + }, + { + "epoch": 12.544, + "grad_norm": 0.4680028259754181, + "learning_rate": 1.869739478957916e-05, + "loss": 0.0128, + "step": 3136 + }, + { + "epoch": 12.548, + "grad_norm": 0.4106837809085846, + "learning_rate": 1.8687374749499e-05, + "loss": 0.0121, + "step": 3137 + }, + { + "epoch": 12.552, + "grad_norm": 0.53117835521698, + "learning_rate": 1.8677354709418837e-05, + "loss": 0.0121, + "step": 3138 + }, + { + "epoch": 12.556000000000001, + "grad_norm": 0.43182119727134705, + "learning_rate": 1.866733466933868e-05, + "loss": 0.0123, + "step": 3139 + }, + { + "epoch": 12.56, + "grad_norm": 0.7537881731987, + "learning_rate": 1.8657314629258517e-05, + "loss": 0.0145, + "step": 3140 + }, + { + "epoch": 12.564, + "grad_norm": 0.3404240906238556, + "learning_rate": 1.8647294589178358e-05, + "loss": 0.0115, + "step": 3141 + }, + { + "epoch": 12.568, + "grad_norm": 0.6614292860031128, + "learning_rate": 1.8637274549098196e-05, + "loss": 0.0117, + "step": 3142 + }, + { + "epoch": 12.572, + "grad_norm": 0.46803048253059387, + "learning_rate": 1.8627254509018037e-05, + "loss": 0.0117, + "step": 3143 + }, + { + "epoch": 12.576, + "grad_norm": 0.3678813874721527, + "learning_rate": 1.861723446893788e-05, + "loss": 0.0115, + "step": 3144 + }, + { + "epoch": 12.58, + "grad_norm": 0.30099377036094666, + "learning_rate": 1.8607214428857716e-05, + "loss": 0.0103, + "step": 3145 + }, + { + "epoch": 12.584, + "grad_norm": 0.47530147433280945, + "learning_rate": 1.8597194388777554e-05, + "loss": 0.013, + "step": 3146 + }, + { + "epoch": 12.588, + "grad_norm": 1.0010900497436523, + "learning_rate": 1.8587174348697395e-05, + "loss": 0.0121, + "step": 3147 + }, + { + "epoch": 12.592, + "grad_norm": 0.3169981837272644, + "learning_rate": 1.8577154308617237e-05, + "loss": 0.0113, + "step": 3148 + }, + { + "epoch": 12.596, + "grad_norm": 0.4307844936847687, + "learning_rate": 1.8567134268537075e-05, + "loss": 0.0123, + "step": 
3149 + }, + { + "epoch": 12.6, + "grad_norm": 0.5654677748680115, + "learning_rate": 1.8557114228456916e-05, + "loss": 0.0135, + "step": 3150 + }, + { + "epoch": 12.604, + "grad_norm": 0.432781457901001, + "learning_rate": 1.8547094188376754e-05, + "loss": 0.0112, + "step": 3151 + }, + { + "epoch": 12.608, + "grad_norm": 0.6548603177070618, + "learning_rate": 1.8537074148296592e-05, + "loss": 0.013, + "step": 3152 + }, + { + "epoch": 12.612, + "grad_norm": 0.5078880786895752, + "learning_rate": 1.8527054108216436e-05, + "loss": 0.0114, + "step": 3153 + }, + { + "epoch": 12.616, + "grad_norm": 0.6445592641830444, + "learning_rate": 1.8517034068136274e-05, + "loss": 0.0132, + "step": 3154 + }, + { + "epoch": 12.62, + "grad_norm": 0.29496875405311584, + "learning_rate": 1.8507014028056112e-05, + "loss": 0.0109, + "step": 3155 + }, + { + "epoch": 12.624, + "grad_norm": 1.0399456024169922, + "learning_rate": 1.8496993987975953e-05, + "loss": 0.0156, + "step": 3156 + }, + { + "epoch": 12.628, + "grad_norm": 0.5184885263442993, + "learning_rate": 1.848697394789579e-05, + "loss": 0.011, + "step": 3157 + }, + { + "epoch": 12.632, + "grad_norm": 0.251708984375, + "learning_rate": 1.8476953907815633e-05, + "loss": 0.0064, + "step": 3158 + }, + { + "epoch": 12.636, + "grad_norm": 0.5945011377334595, + "learning_rate": 1.8466933867735474e-05, + "loss": 0.0151, + "step": 3159 + }, + { + "epoch": 12.64, + "grad_norm": 0.3488098382949829, + "learning_rate": 1.8456913827655312e-05, + "loss": 0.0112, + "step": 3160 + }, + { + "epoch": 12.644, + "grad_norm": 0.46110039949417114, + "learning_rate": 1.844689378757515e-05, + "loss": 0.0123, + "step": 3161 + }, + { + "epoch": 12.648, + "grad_norm": 0.5689302682876587, + "learning_rate": 1.843687374749499e-05, + "loss": 0.012, + "step": 3162 + }, + { + "epoch": 12.652, + "grad_norm": 0.4283682703971863, + "learning_rate": 1.8426853707414832e-05, + "loss": 0.0115, + "step": 3163 + }, + { + "epoch": 12.656, + "grad_norm": 
0.5536277890205383, + "learning_rate": 1.841683366733467e-05, + "loss": 0.0108, + "step": 3164 + }, + { + "epoch": 12.66, + "grad_norm": 0.5680063366889954, + "learning_rate": 1.840681362725451e-05, + "loss": 0.014, + "step": 3165 + }, + { + "epoch": 12.664, + "grad_norm": 0.42657792568206787, + "learning_rate": 1.839679358717435e-05, + "loss": 0.013, + "step": 3166 + }, + { + "epoch": 12.668, + "grad_norm": 0.33542749285697937, + "learning_rate": 1.8386773547094187e-05, + "loss": 0.0123, + "step": 3167 + }, + { + "epoch": 12.672, + "grad_norm": 0.7784429788589478, + "learning_rate": 1.837675350701403e-05, + "loss": 0.0186, + "step": 3168 + }, + { + "epoch": 12.676, + "grad_norm": 0.35441887378692627, + "learning_rate": 1.836673346693387e-05, + "loss": 0.0113, + "step": 3169 + }, + { + "epoch": 12.68, + "grad_norm": 0.6247470378875732, + "learning_rate": 1.8356713426853708e-05, + "loss": 0.0117, + "step": 3170 + }, + { + "epoch": 12.684, + "grad_norm": 0.7267860770225525, + "learning_rate": 1.834669338677355e-05, + "loss": 0.0126, + "step": 3171 + }, + { + "epoch": 12.688, + "grad_norm": 0.40373706817626953, + "learning_rate": 1.8336673346693387e-05, + "loss": 0.0124, + "step": 3172 + }, + { + "epoch": 12.692, + "grad_norm": 0.38257259130477905, + "learning_rate": 1.8326653306613228e-05, + "loss": 0.0122, + "step": 3173 + }, + { + "epoch": 12.696, + "grad_norm": 0.3000936210155487, + "learning_rate": 1.8316633266533066e-05, + "loss": 0.0107, + "step": 3174 + }, + { + "epoch": 12.7, + "grad_norm": 0.5243247747421265, + "learning_rate": 1.8306613226452907e-05, + "loss": 0.012, + "step": 3175 + }, + { + "epoch": 12.704, + "grad_norm": 0.6114549040794373, + "learning_rate": 1.8296593186372745e-05, + "loss": 0.0106, + "step": 3176 + }, + { + "epoch": 12.708, + "grad_norm": 0.9472494721412659, + "learning_rate": 1.8286573146292587e-05, + "loss": 0.0164, + "step": 3177 + }, + { + "epoch": 12.712, + "grad_norm": 0.5240547060966492, + "learning_rate": 
1.8276553106212428e-05, + "loss": 0.012, + "step": 3178 + }, + { + "epoch": 12.716, + "grad_norm": 0.6829943656921387, + "learning_rate": 1.8266533066132266e-05, + "loss": 0.0123, + "step": 3179 + }, + { + "epoch": 12.72, + "grad_norm": 0.4161919951438904, + "learning_rate": 1.8256513026052104e-05, + "loss": 0.0115, + "step": 3180 + }, + { + "epoch": 12.724, + "grad_norm": 0.2771739959716797, + "learning_rate": 1.8246492985971945e-05, + "loss": 0.0073, + "step": 3181 + }, + { + "epoch": 12.728, + "grad_norm": 0.40933287143707275, + "learning_rate": 1.8236472945891783e-05, + "loss": 0.0119, + "step": 3182 + }, + { + "epoch": 12.732, + "grad_norm": 0.44479766488075256, + "learning_rate": 1.8226452905811624e-05, + "loss": 0.0143, + "step": 3183 + }, + { + "epoch": 12.736, + "grad_norm": 0.32802703976631165, + "learning_rate": 1.8216432865731465e-05, + "loss": 0.0102, + "step": 3184 + }, + { + "epoch": 12.74, + "grad_norm": 0.43562448024749756, + "learning_rate": 1.8206412825651303e-05, + "loss": 0.0125, + "step": 3185 + }, + { + "epoch": 12.744, + "grad_norm": 0.2993567883968353, + "learning_rate": 1.819639278557114e-05, + "loss": 0.0064, + "step": 3186 + }, + { + "epoch": 12.748, + "grad_norm": 0.45686689019203186, + "learning_rate": 1.8186372745490982e-05, + "loss": 0.0137, + "step": 3187 + }, + { + "epoch": 12.752, + "grad_norm": 0.6637890934944153, + "learning_rate": 1.8176352705410824e-05, + "loss": 0.0117, + "step": 3188 + }, + { + "epoch": 12.756, + "grad_norm": 0.4745820462703705, + "learning_rate": 1.816633266533066e-05, + "loss": 0.0127, + "step": 3189 + }, + { + "epoch": 12.76, + "grad_norm": 0.5020652413368225, + "learning_rate": 1.8156312625250503e-05, + "loss": 0.0141, + "step": 3190 + }, + { + "epoch": 12.764, + "grad_norm": 0.49237433075904846, + "learning_rate": 1.814629258517034e-05, + "loss": 0.0151, + "step": 3191 + }, + { + "epoch": 12.768, + "grad_norm": 0.6075673699378967, + "learning_rate": 1.813627254509018e-05, + "loss": 0.0133, + "step": 
3192 + }, + { + "epoch": 12.772, + "grad_norm": 0.5600119233131409, + "learning_rate": 1.8126252505010023e-05, + "loss": 0.0158, + "step": 3193 + }, + { + "epoch": 12.776, + "grad_norm": 0.4790187478065491, + "learning_rate": 1.811623246492986e-05, + "loss": 0.0146, + "step": 3194 + }, + { + "epoch": 12.78, + "grad_norm": 0.43495261669158936, + "learning_rate": 1.81062124248497e-05, + "loss": 0.0118, + "step": 3195 + }, + { + "epoch": 12.784, + "grad_norm": 0.36963024735450745, + "learning_rate": 1.809619238476954e-05, + "loss": 0.0116, + "step": 3196 + }, + { + "epoch": 12.788, + "grad_norm": 0.49386468529701233, + "learning_rate": 1.8086172344689378e-05, + "loss": 0.0144, + "step": 3197 + }, + { + "epoch": 12.792, + "grad_norm": 0.6671590805053711, + "learning_rate": 1.807615230460922e-05, + "loss": 0.013, + "step": 3198 + }, + { + "epoch": 12.796, + "grad_norm": 0.39627164602279663, + "learning_rate": 1.806613226452906e-05, + "loss": 0.0122, + "step": 3199 + }, + { + "epoch": 12.8, + "grad_norm": 0.5293060541152954, + "learning_rate": 1.80561122244489e-05, + "loss": 0.0139, + "step": 3200 + }, + { + "epoch": 12.804, + "grad_norm": 0.4009303152561188, + "learning_rate": 1.8046092184368737e-05, + "loss": 0.0131, + "step": 3201 + }, + { + "epoch": 12.808, + "grad_norm": 0.4425906240940094, + "learning_rate": 1.8036072144288578e-05, + "loss": 0.0124, + "step": 3202 + }, + { + "epoch": 12.812, + "grad_norm": 0.38386768102645874, + "learning_rate": 1.802605210420842e-05, + "loss": 0.0126, + "step": 3203 + }, + { + "epoch": 12.816, + "grad_norm": 0.48892340064048767, + "learning_rate": 1.8016032064128257e-05, + "loss": 0.0118, + "step": 3204 + }, + { + "epoch": 12.82, + "grad_norm": 0.44380101561546326, + "learning_rate": 1.80060120240481e-05, + "loss": 0.0129, + "step": 3205 + }, + { + "epoch": 12.824, + "grad_norm": 0.3459862172603607, + "learning_rate": 1.7995991983967936e-05, + "loss": 0.0119, + "step": 3206 + }, + { + "epoch": 12.828, + "grad_norm": 
0.41011375188827515, + "learning_rate": 1.7985971943887778e-05, + "loss": 0.0119, + "step": 3207 + }, + { + "epoch": 12.832, + "grad_norm": 0.5159206986427307, + "learning_rate": 1.7975951903807615e-05, + "loss": 0.0167, + "step": 3208 + }, + { + "epoch": 12.836, + "grad_norm": 0.22304032742977142, + "learning_rate": 1.7965931863727457e-05, + "loss": 0.0066, + "step": 3209 + }, + { + "epoch": 12.84, + "grad_norm": 0.3644813597202301, + "learning_rate": 1.7955911823647295e-05, + "loss": 0.0084, + "step": 3210 + }, + { + "epoch": 12.844, + "grad_norm": 0.5323387384414673, + "learning_rate": 1.7945891783567133e-05, + "loss": 0.0146, + "step": 3211 + }, + { + "epoch": 12.848, + "grad_norm": 0.647765576839447, + "learning_rate": 1.7935871743486977e-05, + "loss": 0.0127, + "step": 3212 + }, + { + "epoch": 12.852, + "grad_norm": 0.4638051688671112, + "learning_rate": 1.7925851703406815e-05, + "loss": 0.0083, + "step": 3213 + }, + { + "epoch": 12.856, + "grad_norm": 0.7238399982452393, + "learning_rate": 1.7915831663326653e-05, + "loss": 0.0136, + "step": 3214 + }, + { + "epoch": 12.86, + "grad_norm": 0.7715800404548645, + "learning_rate": 1.7905811623246494e-05, + "loss": 0.0149, + "step": 3215 + }, + { + "epoch": 12.864, + "grad_norm": 0.7044159173965454, + "learning_rate": 1.7895791583166332e-05, + "loss": 0.0154, + "step": 3216 + }, + { + "epoch": 12.868, + "grad_norm": 0.45285147428512573, + "learning_rate": 1.7885771543086173e-05, + "loss": 0.0113, + "step": 3217 + }, + { + "epoch": 12.872, + "grad_norm": 0.42862364649772644, + "learning_rate": 1.7875751503006015e-05, + "loss": 0.0122, + "step": 3218 + }, + { + "epoch": 12.876, + "grad_norm": 0.4750185012817383, + "learning_rate": 1.7865731462925853e-05, + "loss": 0.0129, + "step": 3219 + }, + { + "epoch": 12.88, + "grad_norm": 0.4365493059158325, + "learning_rate": 1.785571142284569e-05, + "loss": 0.0143, + "step": 3220 + }, + { + "epoch": 12.884, + "grad_norm": 0.7560842633247375, + "learning_rate": 
1.7845691382765532e-05, + "loss": 0.0162, + "step": 3221 + }, + { + "epoch": 12.888, + "grad_norm": 0.21909433603286743, + "learning_rate": 1.7835671342685373e-05, + "loss": 0.0064, + "step": 3222 + }, + { + "epoch": 12.892, + "grad_norm": 0.6783998608589172, + "learning_rate": 1.782565130260521e-05, + "loss": 0.0135, + "step": 3223 + }, + { + "epoch": 12.896, + "grad_norm": 0.3435244560241699, + "learning_rate": 1.7815631262525052e-05, + "loss": 0.0119, + "step": 3224 + }, + { + "epoch": 12.9, + "grad_norm": 0.20075704157352448, + "learning_rate": 1.780561122244489e-05, + "loss": 0.0071, + "step": 3225 + }, + { + "epoch": 12.904, + "grad_norm": 0.44066622853279114, + "learning_rate": 1.7795591182364728e-05, + "loss": 0.0124, + "step": 3226 + }, + { + "epoch": 12.908, + "grad_norm": 0.3443538248538971, + "learning_rate": 1.7785571142284573e-05, + "loss": 0.0116, + "step": 3227 + }, + { + "epoch": 12.912, + "grad_norm": 0.3867914378643036, + "learning_rate": 1.777555110220441e-05, + "loss": 0.0131, + "step": 3228 + }, + { + "epoch": 12.916, + "grad_norm": 0.44781914353370667, + "learning_rate": 1.776553106212425e-05, + "loss": 0.0138, + "step": 3229 + }, + { + "epoch": 12.92, + "grad_norm": 0.4673486649990082, + "learning_rate": 1.775551102204409e-05, + "loss": 0.0128, + "step": 3230 + }, + { + "epoch": 12.924, + "grad_norm": 0.8357963562011719, + "learning_rate": 1.7745490981963928e-05, + "loss": 0.0163, + "step": 3231 + }, + { + "epoch": 12.928, + "grad_norm": 0.52073073387146, + "learning_rate": 1.773547094188377e-05, + "loss": 0.0121, + "step": 3232 + }, + { + "epoch": 12.932, + "grad_norm": 0.620764970779419, + "learning_rate": 1.7725450901803607e-05, + "loss": 0.0132, + "step": 3233 + }, + { + "epoch": 12.936, + "grad_norm": 0.5402158498764038, + "learning_rate": 1.7715430861723448e-05, + "loss": 0.0147, + "step": 3234 + }, + { + "epoch": 12.94, + "grad_norm": 0.6234936118125916, + "learning_rate": 1.7705410821643286e-05, + "loss": 0.0142, + "step": 3235 + }, 
+ { + "epoch": 12.943999999999999, + "grad_norm": 0.7595784664154053, + "learning_rate": 1.7695390781563127e-05, + "loss": 0.017, + "step": 3236 + }, + { + "epoch": 12.948, + "grad_norm": 0.7245892882347107, + "learning_rate": 1.768537074148297e-05, + "loss": 0.014, + "step": 3237 + }, + { + "epoch": 12.952, + "grad_norm": 0.6920550465583801, + "learning_rate": 1.7675350701402807e-05, + "loss": 0.0126, + "step": 3238 + }, + { + "epoch": 12.956, + "grad_norm": 0.6069607734680176, + "learning_rate": 1.7665330661322644e-05, + "loss": 0.014, + "step": 3239 + }, + { + "epoch": 12.96, + "grad_norm": 0.40507733821868896, + "learning_rate": 1.7655310621242486e-05, + "loss": 0.0138, + "step": 3240 + }, + { + "epoch": 12.964, + "grad_norm": 0.40632104873657227, + "learning_rate": 1.7645290581162324e-05, + "loss": 0.0138, + "step": 3241 + }, + { + "epoch": 12.968, + "grad_norm": 0.6928650140762329, + "learning_rate": 1.7635270541082165e-05, + "loss": 0.016, + "step": 3242 + }, + { + "epoch": 12.972, + "grad_norm": 0.5469399094581604, + "learning_rate": 1.7625250501002006e-05, + "loss": 0.0149, + "step": 3243 + }, + { + "epoch": 12.975999999999999, + "grad_norm": 0.44268378615379333, + "learning_rate": 1.7615230460921844e-05, + "loss": 0.013, + "step": 3244 + }, + { + "epoch": 12.98, + "grad_norm": 0.40536338090896606, + "learning_rate": 1.7605210420841682e-05, + "loss": 0.012, + "step": 3245 + }, + { + "epoch": 12.984, + "grad_norm": 0.4465670585632324, + "learning_rate": 1.7595190380761523e-05, + "loss": 0.0125, + "step": 3246 + }, + { + "epoch": 12.988, + "grad_norm": 0.9920943975448608, + "learning_rate": 1.7585170340681365e-05, + "loss": 0.014, + "step": 3247 + }, + { + "epoch": 12.992, + "grad_norm": 0.26608768105506897, + "learning_rate": 1.7575150300601202e-05, + "loss": 0.0072, + "step": 3248 + }, + { + "epoch": 12.996, + "grad_norm": 0.48054268956184387, + "learning_rate": 1.7565130260521044e-05, + "loss": 0.0135, + "step": 3249 + }, + { + "epoch": 13.0, + 
"grad_norm": 0.7327283024787903, + "learning_rate": 1.755511022044088e-05, + "loss": 0.0247, + "step": 3250 + }, + { + "epoch": 13.004, + "grad_norm": 0.27949610352516174, + "learning_rate": 1.754509018036072e-05, + "loss": 0.0092, + "step": 3251 + }, + { + "epoch": 13.008, + "grad_norm": 0.5924502015113831, + "learning_rate": 1.7535070140280564e-05, + "loss": 0.0124, + "step": 3252 + }, + { + "epoch": 13.012, + "grad_norm": 0.2856701612472534, + "learning_rate": 1.7525050100200402e-05, + "loss": 0.0088, + "step": 3253 + }, + { + "epoch": 13.016, + "grad_norm": 0.6892845034599304, + "learning_rate": 1.751503006012024e-05, + "loss": 0.0101, + "step": 3254 + }, + { + "epoch": 13.02, + "grad_norm": 0.28304752707481384, + "learning_rate": 1.750501002004008e-05, + "loss": 0.0093, + "step": 3255 + }, + { + "epoch": 13.024, + "grad_norm": 0.44429293274879456, + "learning_rate": 1.7494989979959922e-05, + "loss": 0.0106, + "step": 3256 + }, + { + "epoch": 13.028, + "grad_norm": 0.4208962917327881, + "learning_rate": 1.748496993987976e-05, + "loss": 0.011, + "step": 3257 + }, + { + "epoch": 13.032, + "grad_norm": 0.24816329777240753, + "learning_rate": 1.74749498997996e-05, + "loss": 0.0091, + "step": 3258 + }, + { + "epoch": 13.036, + "grad_norm": 0.3193182647228241, + "learning_rate": 1.746492985971944e-05, + "loss": 0.0109, + "step": 3259 + }, + { + "epoch": 13.04, + "grad_norm": 0.8636166453361511, + "learning_rate": 1.7454909819639277e-05, + "loss": 0.0097, + "step": 3260 + }, + { + "epoch": 13.044, + "grad_norm": 0.26683780550956726, + "learning_rate": 1.744488977955912e-05, + "loss": 0.0095, + "step": 3261 + }, + { + "epoch": 13.048, + "grad_norm": 0.5071774125099182, + "learning_rate": 1.743486973947896e-05, + "loss": 0.0126, + "step": 3262 + }, + { + "epoch": 13.052, + "grad_norm": 0.21876634657382965, + "learning_rate": 1.7424849699398798e-05, + "loss": 0.0094, + "step": 3263 + }, + { + "epoch": 13.056, + "grad_norm": 0.18555937707424164, + "learning_rate": 
1.741482965931864e-05, + "loss": 0.0059, + "step": 3264 + }, + { + "epoch": 13.06, + "grad_norm": 0.2228597104549408, + "learning_rate": 1.7404809619238477e-05, + "loss": 0.0094, + "step": 3265 + }, + { + "epoch": 13.064, + "grad_norm": 0.21233904361724854, + "learning_rate": 1.739478957915832e-05, + "loss": 0.0082, + "step": 3266 + }, + { + "epoch": 13.068, + "grad_norm": 0.2386617809534073, + "learning_rate": 1.7384769539078156e-05, + "loss": 0.0094, + "step": 3267 + }, + { + "epoch": 13.072, + "grad_norm": 0.37460488080978394, + "learning_rate": 1.7374749498997998e-05, + "loss": 0.0108, + "step": 3268 + }, + { + "epoch": 13.076, + "grad_norm": 0.2160538136959076, + "learning_rate": 1.7364729458917835e-05, + "loss": 0.0094, + "step": 3269 + }, + { + "epoch": 13.08, + "grad_norm": 0.2701886296272278, + "learning_rate": 1.7354709418837677e-05, + "loss": 0.0065, + "step": 3270 + }, + { + "epoch": 13.084, + "grad_norm": 0.3855418264865875, + "learning_rate": 1.7344689378757518e-05, + "loss": 0.0096, + "step": 3271 + }, + { + "epoch": 13.088, + "grad_norm": 0.31014111638069153, + "learning_rate": 1.7334669338677356e-05, + "loss": 0.0095, + "step": 3272 + }, + { + "epoch": 13.092, + "grad_norm": 0.24090217053890228, + "learning_rate": 1.7324649298597194e-05, + "loss": 0.0095, + "step": 3273 + }, + { + "epoch": 13.096, + "grad_norm": 0.2191203087568283, + "learning_rate": 1.7314629258517035e-05, + "loss": 0.0093, + "step": 3274 + }, + { + "epoch": 13.1, + "grad_norm": 0.4989674389362335, + "learning_rate": 1.7304609218436873e-05, + "loss": 0.0121, + "step": 3275 + }, + { + "epoch": 13.104, + "grad_norm": 0.3217163383960724, + "learning_rate": 1.7294589178356714e-05, + "loss": 0.011, + "step": 3276 + }, + { + "epoch": 13.108, + "grad_norm": 0.3589065968990326, + "learning_rate": 1.7284569138276556e-05, + "loss": 0.0101, + "step": 3277 + }, + { + "epoch": 13.112, + "grad_norm": 0.3767361640930176, + "learning_rate": 1.7274549098196393e-05, + "loss": 0.0098, + "step": 3278 
+ }, + { + "epoch": 13.116, + "grad_norm": 0.3067415952682495, + "learning_rate": 1.726452905811623e-05, + "loss": 0.0095, + "step": 3279 + }, + { + "epoch": 13.12, + "grad_norm": 0.32393768429756165, + "learning_rate": 1.7254509018036073e-05, + "loss": 0.0106, + "step": 3280 + }, + { + "epoch": 13.124, + "grad_norm": 0.579896867275238, + "learning_rate": 1.7244488977955914e-05, + "loss": 0.0111, + "step": 3281 + }, + { + "epoch": 13.128, + "grad_norm": 0.16686947643756866, + "learning_rate": 1.7234468937875752e-05, + "loss": 0.006, + "step": 3282 + }, + { + "epoch": 13.132, + "grad_norm": 0.44065243005752563, + "learning_rate": 1.7224448897795593e-05, + "loss": 0.0106, + "step": 3283 + }, + { + "epoch": 13.136, + "grad_norm": 0.3439742922782898, + "learning_rate": 1.721442885771543e-05, + "loss": 0.0118, + "step": 3284 + }, + { + "epoch": 13.14, + "grad_norm": 0.35956379771232605, + "learning_rate": 1.720440881763527e-05, + "loss": 0.0101, + "step": 3285 + }, + { + "epoch": 13.144, + "grad_norm": 0.33086642622947693, + "learning_rate": 1.7194388777555114e-05, + "loss": 0.0103, + "step": 3286 + }, + { + "epoch": 13.148, + "grad_norm": 0.3616533577442169, + "learning_rate": 1.718436873747495e-05, + "loss": 0.0094, + "step": 3287 + }, + { + "epoch": 13.152, + "grad_norm": 0.3333515524864197, + "learning_rate": 1.717434869739479e-05, + "loss": 0.0093, + "step": 3288 + }, + { + "epoch": 13.156, + "grad_norm": 0.3492039144039154, + "learning_rate": 1.716432865731463e-05, + "loss": 0.011, + "step": 3289 + }, + { + "epoch": 13.16, + "grad_norm": 0.29600805044174194, + "learning_rate": 1.715430861723447e-05, + "loss": 0.0117, + "step": 3290 + }, + { + "epoch": 13.164, + "grad_norm": 0.3409759998321533, + "learning_rate": 1.714428857715431e-05, + "loss": 0.01, + "step": 3291 + }, + { + "epoch": 13.168, + "grad_norm": 0.237446591258049, + "learning_rate": 1.713426853707415e-05, + "loss": 0.0088, + "step": 3292 + }, + { + "epoch": 13.172, + "grad_norm": 0.3831733763217926, + 
"learning_rate": 1.712424849699399e-05, + "loss": 0.0099, + "step": 3293 + }, + { + "epoch": 13.176, + "grad_norm": 0.3086282014846802, + "learning_rate": 1.7114228456913827e-05, + "loss": 0.0101, + "step": 3294 + }, + { + "epoch": 13.18, + "grad_norm": 0.2323417216539383, + "learning_rate": 1.7104208416833668e-05, + "loss": 0.01, + "step": 3295 + }, + { + "epoch": 13.184, + "grad_norm": 0.2264842838048935, + "learning_rate": 1.709418837675351e-05, + "loss": 0.0089, + "step": 3296 + }, + { + "epoch": 13.188, + "grad_norm": 0.18145760893821716, + "learning_rate": 1.7084168336673347e-05, + "loss": 0.0087, + "step": 3297 + }, + { + "epoch": 13.192, + "grad_norm": 0.21976445615291595, + "learning_rate": 1.707414829659319e-05, + "loss": 0.0085, + "step": 3298 + }, + { + "epoch": 13.196, + "grad_norm": 0.2721193730831146, + "learning_rate": 1.7064128256513026e-05, + "loss": 0.0092, + "step": 3299 + }, + { + "epoch": 13.2, + "grad_norm": 0.2802695035934448, + "learning_rate": 1.7054108216432864e-05, + "loss": 0.0104, + "step": 3300 + }, + { + "epoch": 13.204, + "grad_norm": 0.48233532905578613, + "learning_rate": 1.7044088176352706e-05, + "loss": 0.0097, + "step": 3301 + }, + { + "epoch": 13.208, + "grad_norm": 0.2295309603214264, + "learning_rate": 1.7034068136272547e-05, + "loss": 0.0094, + "step": 3302 + }, + { + "epoch": 13.212, + "grad_norm": 0.32852253317832947, + "learning_rate": 1.7024048096192385e-05, + "loss": 0.0102, + "step": 3303 + }, + { + "epoch": 13.216, + "grad_norm": 0.30892303586006165, + "learning_rate": 1.7014028056112226e-05, + "loss": 0.01, + "step": 3304 + }, + { + "epoch": 13.22, + "grad_norm": 0.46433284878730774, + "learning_rate": 1.7004008016032064e-05, + "loss": 0.0097, + "step": 3305 + }, + { + "epoch": 13.224, + "grad_norm": 0.2719483971595764, + "learning_rate": 1.6993987975951905e-05, + "loss": 0.0094, + "step": 3306 + }, + { + "epoch": 13.228, + "grad_norm": 0.1855124831199646, + "learning_rate": 1.6983967935871743e-05, + "loss": 0.0085, 
+ "step": 3307 + }, + { + "epoch": 13.232, + "grad_norm": 0.3346885144710541, + "learning_rate": 1.6973947895791584e-05, + "loss": 0.0109, + "step": 3308 + }, + { + "epoch": 13.236, + "grad_norm": 0.4800672233104706, + "learning_rate": 1.6963927855711422e-05, + "loss": 0.0123, + "step": 3309 + }, + { + "epoch": 13.24, + "grad_norm": 0.3444778025150299, + "learning_rate": 1.695390781563126e-05, + "loss": 0.0103, + "step": 3310 + }, + { + "epoch": 13.244, + "grad_norm": 0.2996631860733032, + "learning_rate": 1.6943887775551105e-05, + "loss": 0.0092, + "step": 3311 + }, + { + "epoch": 13.248, + "grad_norm": 0.2334967851638794, + "learning_rate": 1.6933867735470943e-05, + "loss": 0.0093, + "step": 3312 + }, + { + "epoch": 13.252, + "grad_norm": 0.20295573770999908, + "learning_rate": 1.692384769539078e-05, + "loss": 0.0086, + "step": 3313 + }, + { + "epoch": 13.256, + "grad_norm": 0.242875874042511, + "learning_rate": 1.6913827655310622e-05, + "loss": 0.0092, + "step": 3314 + }, + { + "epoch": 13.26, + "grad_norm": 0.41469478607177734, + "learning_rate": 1.6903807615230463e-05, + "loss": 0.0112, + "step": 3315 + }, + { + "epoch": 13.264, + "grad_norm": 0.2873092591762543, + "learning_rate": 1.68937875751503e-05, + "loss": 0.01, + "step": 3316 + }, + { + "epoch": 13.268, + "grad_norm": 0.16174815595149994, + "learning_rate": 1.6883767535070142e-05, + "loss": 0.005, + "step": 3317 + }, + { + "epoch": 13.272, + "grad_norm": 0.5048874020576477, + "learning_rate": 1.687374749498998e-05, + "loss": 0.012, + "step": 3318 + }, + { + "epoch": 13.276, + "grad_norm": 0.3545554578304291, + "learning_rate": 1.6863727454909818e-05, + "loss": 0.0095, + "step": 3319 + }, + { + "epoch": 13.28, + "grad_norm": 0.40919992327690125, + "learning_rate": 1.6853707414829663e-05, + "loss": 0.0111, + "step": 3320 + }, + { + "epoch": 13.284, + "grad_norm": 0.2165553867816925, + "learning_rate": 1.68436873747495e-05, + "loss": 0.0084, + "step": 3321 + }, + { + "epoch": 13.288, + "grad_norm": 
0.369000107049942, + "learning_rate": 1.683366733466934e-05, + "loss": 0.0117, + "step": 3322 + }, + { + "epoch": 13.292, + "grad_norm": 0.3298990726470947, + "learning_rate": 1.682364729458918e-05, + "loss": 0.0087, + "step": 3323 + }, + { + "epoch": 13.296, + "grad_norm": 0.37005317211151123, + "learning_rate": 1.6813627254509018e-05, + "loss": 0.0103, + "step": 3324 + }, + { + "epoch": 13.3, + "grad_norm": 0.5654655694961548, + "learning_rate": 1.680360721442886e-05, + "loss": 0.0102, + "step": 3325 + }, + { + "epoch": 13.304, + "grad_norm": 0.7227338552474976, + "learning_rate": 1.67935871743487e-05, + "loss": 0.0166, + "step": 3326 + }, + { + "epoch": 13.308, + "grad_norm": 0.443639874458313, + "learning_rate": 1.678356713426854e-05, + "loss": 0.0111, + "step": 3327 + }, + { + "epoch": 13.312, + "grad_norm": 0.25988972187042236, + "learning_rate": 1.6773547094188376e-05, + "loss": 0.0093, + "step": 3328 + }, + { + "epoch": 13.316, + "grad_norm": 0.3140908181667328, + "learning_rate": 1.6763527054108218e-05, + "loss": 0.0094, + "step": 3329 + }, + { + "epoch": 13.32, + "grad_norm": 0.23378266394138336, + "learning_rate": 1.675350701402806e-05, + "loss": 0.0096, + "step": 3330 + }, + { + "epoch": 13.324, + "grad_norm": 0.31349173188209534, + "learning_rate": 1.6743486973947897e-05, + "loss": 0.0071, + "step": 3331 + }, + { + "epoch": 13.328, + "grad_norm": 0.3660728633403778, + "learning_rate": 1.6733466933867738e-05, + "loss": 0.0111, + "step": 3332 + }, + { + "epoch": 13.332, + "grad_norm": 0.4420227110385895, + "learning_rate": 1.6723446893787576e-05, + "loss": 0.0137, + "step": 3333 + }, + { + "epoch": 13.336, + "grad_norm": 0.2265954315662384, + "learning_rate": 1.6713426853707414e-05, + "loss": 0.0093, + "step": 3334 + }, + { + "epoch": 13.34, + "grad_norm": 0.31898754835128784, + "learning_rate": 1.6703406813627255e-05, + "loss": 0.0103, + "step": 3335 + }, + { + "epoch": 13.344, + "grad_norm": 0.34343066811561584, + "learning_rate": 
1.6693386773547096e-05, + "loss": 0.0112, + "step": 3336 + }, + { + "epoch": 13.348, + "grad_norm": 0.27078312635421753, + "learning_rate": 1.6683366733466934e-05, + "loss": 0.01, + "step": 3337 + }, + { + "epoch": 13.352, + "grad_norm": 0.2199242115020752, + "learning_rate": 1.6673346693386772e-05, + "loss": 0.011, + "step": 3338 + }, + { + "epoch": 13.356, + "grad_norm": 0.25304946303367615, + "learning_rate": 1.6663326653306613e-05, + "loss": 0.0101, + "step": 3339 + }, + { + "epoch": 13.36, + "grad_norm": 0.3057776689529419, + "learning_rate": 1.6653306613226455e-05, + "loss": 0.0099, + "step": 3340 + }, + { + "epoch": 13.364, + "grad_norm": 0.17325368523597717, + "learning_rate": 1.6643286573146293e-05, + "loss": 0.0087, + "step": 3341 + }, + { + "epoch": 13.368, + "grad_norm": 0.2971237599849701, + "learning_rate": 1.6633266533066134e-05, + "loss": 0.0109, + "step": 3342 + }, + { + "epoch": 13.372, + "grad_norm": 0.2874193787574768, + "learning_rate": 1.6623246492985972e-05, + "loss": 0.0099, + "step": 3343 + }, + { + "epoch": 13.376, + "grad_norm": 0.3510386347770691, + "learning_rate": 1.661322645290581e-05, + "loss": 0.0119, + "step": 3344 + }, + { + "epoch": 13.38, + "grad_norm": 0.4247153401374817, + "learning_rate": 1.6603206412825654e-05, + "loss": 0.0138, + "step": 3345 + }, + { + "epoch": 13.384, + "grad_norm": 0.2908627986907959, + "learning_rate": 1.6593186372745492e-05, + "loss": 0.0103, + "step": 3346 + }, + { + "epoch": 13.388, + "grad_norm": 0.2727614939212799, + "learning_rate": 1.658316633266533e-05, + "loss": 0.0101, + "step": 3347 + }, + { + "epoch": 13.392, + "grad_norm": 0.3280937969684601, + "learning_rate": 1.657314629258517e-05, + "loss": 0.0103, + "step": 3348 + }, + { + "epoch": 13.396, + "grad_norm": 0.2449714094400406, + "learning_rate": 1.656312625250501e-05, + "loss": 0.0104, + "step": 3349 + }, + { + "epoch": 13.4, + "grad_norm": 0.24496877193450928, + "learning_rate": 1.655310621242485e-05, + "loss": 0.0099, + "step": 3350 + }, 
+ { + "epoch": 13.404, + "grad_norm": 0.4488162398338318, + "learning_rate": 1.6543086172344692e-05, + "loss": 0.0127, + "step": 3351 + }, + { + "epoch": 13.408, + "grad_norm": 0.3072626292705536, + "learning_rate": 1.653306613226453e-05, + "loss": 0.0105, + "step": 3352 + }, + { + "epoch": 13.412, + "grad_norm": 0.27603739500045776, + "learning_rate": 1.6523046092184368e-05, + "loss": 0.0103, + "step": 3353 + }, + { + "epoch": 13.416, + "grad_norm": 0.3227679431438446, + "learning_rate": 1.651302605210421e-05, + "loss": 0.0104, + "step": 3354 + }, + { + "epoch": 13.42, + "grad_norm": 0.23910482227802277, + "learning_rate": 1.650300601202405e-05, + "loss": 0.0101, + "step": 3355 + }, + { + "epoch": 13.424, + "grad_norm": 0.4852541387081146, + "learning_rate": 1.6492985971943888e-05, + "loss": 0.0118, + "step": 3356 + }, + { + "epoch": 13.428, + "grad_norm": 0.3126225471496582, + "learning_rate": 1.648296593186373e-05, + "loss": 0.0103, + "step": 3357 + }, + { + "epoch": 13.432, + "grad_norm": 0.2821721136569977, + "learning_rate": 1.6472945891783567e-05, + "loss": 0.0098, + "step": 3358 + }, + { + "epoch": 13.436, + "grad_norm": 0.3140818774700165, + "learning_rate": 1.6462925851703405e-05, + "loss": 0.0103, + "step": 3359 + }, + { + "epoch": 13.44, + "grad_norm": 0.3383825719356537, + "learning_rate": 1.6452905811623246e-05, + "loss": 0.0112, + "step": 3360 + }, + { + "epoch": 13.444, + "grad_norm": 0.3161516487598419, + "learning_rate": 1.6442885771543088e-05, + "loss": 0.0109, + "step": 3361 + }, + { + "epoch": 13.448, + "grad_norm": 0.24821779131889343, + "learning_rate": 1.6432865731462926e-05, + "loss": 0.0093, + "step": 3362 + }, + { + "epoch": 13.452, + "grad_norm": 0.3279327154159546, + "learning_rate": 1.6422845691382767e-05, + "loss": 0.0098, + "step": 3363 + }, + { + "epoch": 13.456, + "grad_norm": 0.3004770874977112, + "learning_rate": 1.6412825651302605e-05, + "loss": 0.0101, + "step": 3364 + }, + { + "epoch": 13.46, + "grad_norm": 
0.26334452629089355, + "learning_rate": 1.6402805611222446e-05, + "loss": 0.0094, + "step": 3365 + }, + { + "epoch": 13.464, + "grad_norm": 0.2865053713321686, + "learning_rate": 1.6392785571142284e-05, + "loss": 0.0107, + "step": 3366 + }, + { + "epoch": 13.468, + "grad_norm": 0.3839758038520813, + "learning_rate": 1.6382765531062125e-05, + "loss": 0.0103, + "step": 3367 + }, + { + "epoch": 13.472, + "grad_norm": 0.48596179485321045, + "learning_rate": 1.6372745490981963e-05, + "loss": 0.0097, + "step": 3368 + }, + { + "epoch": 13.475999999999999, + "grad_norm": 0.22549477219581604, + "learning_rate": 1.6362725450901804e-05, + "loss": 0.0085, + "step": 3369 + }, + { + "epoch": 13.48, + "grad_norm": 0.3293830156326294, + "learning_rate": 1.6352705410821646e-05, + "loss": 0.0096, + "step": 3370 + }, + { + "epoch": 13.484, + "grad_norm": 0.2493169605731964, + "learning_rate": 1.6342685370741484e-05, + "loss": 0.0093, + "step": 3371 + }, + { + "epoch": 13.488, + "grad_norm": 0.4540705978870392, + "learning_rate": 1.633266533066132e-05, + "loss": 0.0115, + "step": 3372 + }, + { + "epoch": 13.492, + "grad_norm": 0.2564983665943146, + "learning_rate": 1.6322645290581163e-05, + "loss": 0.0096, + "step": 3373 + }, + { + "epoch": 13.496, + "grad_norm": 0.26169297099113464, + "learning_rate": 1.6312625250501004e-05, + "loss": 0.0099, + "step": 3374 + }, + { + "epoch": 13.5, + "grad_norm": 0.31739065051078796, + "learning_rate": 1.6302605210420842e-05, + "loss": 0.0097, + "step": 3375 + }, + { + "epoch": 13.504, + "grad_norm": 0.2799374759197235, + "learning_rate": 1.6292585170340683e-05, + "loss": 0.0099, + "step": 3376 + }, + { + "epoch": 13.508, + "grad_norm": 0.22275328636169434, + "learning_rate": 1.628256513026052e-05, + "loss": 0.0095, + "step": 3377 + }, + { + "epoch": 13.512, + "grad_norm": 0.37261059880256653, + "learning_rate": 1.627254509018036e-05, + "loss": 0.0096, + "step": 3378 + }, + { + "epoch": 13.516, + "grad_norm": 0.23295381665229797, + "learning_rate": 
1.6262525050100204e-05, + "loss": 0.0089, + "step": 3379 + }, + { + "epoch": 13.52, + "grad_norm": 0.23815542459487915, + "learning_rate": 1.625250501002004e-05, + "loss": 0.0101, + "step": 3380 + }, + { + "epoch": 13.524000000000001, + "grad_norm": 0.27734702825546265, + "learning_rate": 1.624248496993988e-05, + "loss": 0.01, + "step": 3381 + }, + { + "epoch": 13.528, + "grad_norm": 0.2713322937488556, + "learning_rate": 1.623246492985972e-05, + "loss": 0.0092, + "step": 3382 + }, + { + "epoch": 13.532, + "grad_norm": 0.4412878751754761, + "learning_rate": 1.622244488977956e-05, + "loss": 0.0113, + "step": 3383 + }, + { + "epoch": 13.536, + "grad_norm": 0.3151780068874359, + "learning_rate": 1.62124248496994e-05, + "loss": 0.0106, + "step": 3384 + }, + { + "epoch": 13.54, + "grad_norm": 0.3254270851612091, + "learning_rate": 1.620240480961924e-05, + "loss": 0.0102, + "step": 3385 + }, + { + "epoch": 13.544, + "grad_norm": 0.3681427240371704, + "learning_rate": 1.619238476953908e-05, + "loss": 0.0116, + "step": 3386 + }, + { + "epoch": 13.548, + "grad_norm": 0.3805301785469055, + "learning_rate": 1.6182364729458917e-05, + "loss": 0.009, + "step": 3387 + }, + { + "epoch": 13.552, + "grad_norm": 0.2876235246658325, + "learning_rate": 1.617234468937876e-05, + "loss": 0.0103, + "step": 3388 + }, + { + "epoch": 13.556000000000001, + "grad_norm": 0.3005918264389038, + "learning_rate": 1.61623246492986e-05, + "loss": 0.0099, + "step": 3389 + }, + { + "epoch": 13.56, + "grad_norm": 0.382587730884552, + "learning_rate": 1.6152304609218438e-05, + "loss": 0.0108, + "step": 3390 + }, + { + "epoch": 13.564, + "grad_norm": 0.4565657377243042, + "learning_rate": 1.614228456913828e-05, + "loss": 0.01, + "step": 3391 + }, + { + "epoch": 13.568, + "grad_norm": 0.14987795054912567, + "learning_rate": 1.6132264529058117e-05, + "loss": 0.0059, + "step": 3392 + }, + { + "epoch": 13.572, + "grad_norm": 0.21870999038219452, + "learning_rate": 1.6122244488977955e-05, + "loss": 0.009, + 
"step": 3393 + }, + { + "epoch": 13.576, + "grad_norm": 0.5497450232505798, + "learning_rate": 1.6112224448897796e-05, + "loss": 0.0117, + "step": 3394 + }, + { + "epoch": 13.58, + "grad_norm": 0.35143905878067017, + "learning_rate": 1.6102204408817637e-05, + "loss": 0.0111, + "step": 3395 + }, + { + "epoch": 13.584, + "grad_norm": 0.27477556467056274, + "learning_rate": 1.6092184368737475e-05, + "loss": 0.0105, + "step": 3396 + }, + { + "epoch": 13.588, + "grad_norm": 0.29407718777656555, + "learning_rate": 1.6082164328657316e-05, + "loss": 0.0095, + "step": 3397 + }, + { + "epoch": 13.592, + "grad_norm": 0.1857583224773407, + "learning_rate": 1.6072144288577154e-05, + "loss": 0.0092, + "step": 3398 + }, + { + "epoch": 13.596, + "grad_norm": 0.4153088927268982, + "learning_rate": 1.6062124248496996e-05, + "loss": 0.0096, + "step": 3399 + }, + { + "epoch": 13.6, + "grad_norm": 0.28967979550361633, + "learning_rate": 1.6052104208416833e-05, + "loss": 0.0107, + "step": 3400 + }, + { + "epoch": 13.604, + "grad_norm": 0.24863949418067932, + "learning_rate": 1.6042084168336675e-05, + "loss": 0.0103, + "step": 3401 + }, + { + "epoch": 13.608, + "grad_norm": 0.3102586269378662, + "learning_rate": 1.6032064128256513e-05, + "loss": 0.0103, + "step": 3402 + }, + { + "epoch": 13.612, + "grad_norm": 0.361162394285202, + "learning_rate": 1.6022044088176354e-05, + "loss": 0.0125, + "step": 3403 + }, + { + "epoch": 13.616, + "grad_norm": 0.40168410539627075, + "learning_rate": 1.6012024048096195e-05, + "loss": 0.0114, + "step": 3404 + }, + { + "epoch": 13.62, + "grad_norm": 0.29243239760398865, + "learning_rate": 1.6002004008016033e-05, + "loss": 0.0055, + "step": 3405 + }, + { + "epoch": 13.624, + "grad_norm": 0.26829758286476135, + "learning_rate": 1.599198396793587e-05, + "loss": 0.0112, + "step": 3406 + }, + { + "epoch": 13.628, + "grad_norm": 0.40442490577697754, + "learning_rate": 1.5981963927855712e-05, + "loss": 0.0115, + "step": 3407 + }, + { + "epoch": 13.632, + 
"grad_norm": 0.31696653366088867, + "learning_rate": 1.597194388777555e-05, + "loss": 0.0109, + "step": 3408 + }, + { + "epoch": 13.636, + "grad_norm": 0.44148167967796326, + "learning_rate": 1.596192384769539e-05, + "loss": 0.0107, + "step": 3409 + }, + { + "epoch": 13.64, + "grad_norm": 0.2826448082923889, + "learning_rate": 1.5951903807615233e-05, + "loss": 0.0107, + "step": 3410 + }, + { + "epoch": 13.644, + "grad_norm": 0.524307906627655, + "learning_rate": 1.594188376753507e-05, + "loss": 0.0111, + "step": 3411 + }, + { + "epoch": 13.648, + "grad_norm": 0.30534476041793823, + "learning_rate": 1.593186372745491e-05, + "loss": 0.0106, + "step": 3412 + }, + { + "epoch": 13.652, + "grad_norm": 0.3145725727081299, + "learning_rate": 1.592184368737475e-05, + "loss": 0.0098, + "step": 3413 + }, + { + "epoch": 13.656, + "grad_norm": 0.21220719814300537, + "learning_rate": 1.591182364729459e-05, + "loss": 0.0111, + "step": 3414 + }, + { + "epoch": 13.66, + "grad_norm": 0.37001675367355347, + "learning_rate": 1.590180360721443e-05, + "loss": 0.0102, + "step": 3415 + }, + { + "epoch": 13.664, + "grad_norm": 0.2675144672393799, + "learning_rate": 1.589178356713427e-05, + "loss": 0.0104, + "step": 3416 + }, + { + "epoch": 13.668, + "grad_norm": 0.4971553683280945, + "learning_rate": 1.5881763527054108e-05, + "loss": 0.0144, + "step": 3417 + }, + { + "epoch": 13.672, + "grad_norm": 0.32960245013237, + "learning_rate": 1.5871743486973946e-05, + "loss": 0.0096, + "step": 3418 + }, + { + "epoch": 13.676, + "grad_norm": 0.1954725980758667, + "learning_rate": 1.586172344689379e-05, + "loss": 0.0084, + "step": 3419 + }, + { + "epoch": 13.68, + "grad_norm": 0.338614821434021, + "learning_rate": 1.585170340681363e-05, + "loss": 0.012, + "step": 3420 + }, + { + "epoch": 13.684, + "grad_norm": 0.27286550402641296, + "learning_rate": 1.5841683366733466e-05, + "loss": 0.0105, + "step": 3421 + }, + { + "epoch": 13.688, + "grad_norm": 0.3043253421783447, + "learning_rate": 
1.5831663326653308e-05, + "loss": 0.0111, + "step": 3422 + }, + { + "epoch": 13.692, + "grad_norm": 0.608754575252533, + "learning_rate": 1.582164328657315e-05, + "loss": 0.0122, + "step": 3423 + }, + { + "epoch": 13.696, + "grad_norm": 0.44034209847450256, + "learning_rate": 1.5811623246492987e-05, + "loss": 0.0103, + "step": 3424 + }, + { + "epoch": 13.7, + "grad_norm": 0.3596172630786896, + "learning_rate": 1.5801603206412828e-05, + "loss": 0.0101, + "step": 3425 + }, + { + "epoch": 13.704, + "grad_norm": 0.1947639137506485, + "learning_rate": 1.5791583166332666e-05, + "loss": 0.0086, + "step": 3426 + }, + { + "epoch": 13.708, + "grad_norm": 0.595519483089447, + "learning_rate": 1.5781563126252504e-05, + "loss": 0.0113, + "step": 3427 + }, + { + "epoch": 13.712, + "grad_norm": 0.30522915720939636, + "learning_rate": 1.5771543086172345e-05, + "loss": 0.011, + "step": 3428 + }, + { + "epoch": 13.716, + "grad_norm": 0.2296917587518692, + "learning_rate": 1.5761523046092187e-05, + "loss": 0.01, + "step": 3429 + }, + { + "epoch": 13.72, + "grad_norm": 0.5215935707092285, + "learning_rate": 1.5751503006012024e-05, + "loss": 0.0099, + "step": 3430 + }, + { + "epoch": 13.724, + "grad_norm": 0.22468657791614532, + "learning_rate": 1.5741482965931866e-05, + "loss": 0.0108, + "step": 3431 + }, + { + "epoch": 13.728, + "grad_norm": 0.25664982199668884, + "learning_rate": 1.5731462925851704e-05, + "loss": 0.0109, + "step": 3432 + }, + { + "epoch": 13.732, + "grad_norm": 0.2053966224193573, + "learning_rate": 1.5721442885771545e-05, + "loss": 0.0096, + "step": 3433 + }, + { + "epoch": 13.736, + "grad_norm": 0.2271842062473297, + "learning_rate": 1.5711422845691383e-05, + "loss": 0.0092, + "step": 3434 + }, + { + "epoch": 13.74, + "grad_norm": 0.41068193316459656, + "learning_rate": 1.5701402805611224e-05, + "loss": 0.0119, + "step": 3435 + }, + { + "epoch": 13.744, + "grad_norm": 0.2704119086265564, + "learning_rate": 1.5691382765531062e-05, + "loss": 0.0096, + "step": 3436 + 
}, + { + "epoch": 13.748, + "grad_norm": 0.37335240840911865, + "learning_rate": 1.56813627254509e-05, + "loss": 0.0106, + "step": 3437 + }, + { + "epoch": 13.752, + "grad_norm": 0.6146287322044373, + "learning_rate": 1.5671342685370745e-05, + "loss": 0.0117, + "step": 3438 + }, + { + "epoch": 13.756, + "grad_norm": 0.22417746484279633, + "learning_rate": 1.5661322645290582e-05, + "loss": 0.0061, + "step": 3439 + }, + { + "epoch": 13.76, + "grad_norm": 0.38575106859207153, + "learning_rate": 1.565130260521042e-05, + "loss": 0.0108, + "step": 3440 + }, + { + "epoch": 13.764, + "grad_norm": 0.2831723690032959, + "learning_rate": 1.564128256513026e-05, + "loss": 0.0103, + "step": 3441 + }, + { + "epoch": 13.768, + "grad_norm": 0.39680731296539307, + "learning_rate": 1.56312625250501e-05, + "loss": 0.0117, + "step": 3442 + }, + { + "epoch": 13.772, + "grad_norm": 0.5540252327919006, + "learning_rate": 1.562124248496994e-05, + "loss": 0.0123, + "step": 3443 + }, + { + "epoch": 13.776, + "grad_norm": 0.34284451603889465, + "learning_rate": 1.5611222444889782e-05, + "loss": 0.011, + "step": 3444 + }, + { + "epoch": 13.78, + "grad_norm": 0.23013213276863098, + "learning_rate": 1.560120240480962e-05, + "loss": 0.011, + "step": 3445 + }, + { + "epoch": 13.784, + "grad_norm": 0.583663284778595, + "learning_rate": 1.5591182364729458e-05, + "loss": 0.0121, + "step": 3446 + }, + { + "epoch": 13.788, + "grad_norm": 0.2305268794298172, + "learning_rate": 1.55811623246493e-05, + "loss": 0.0098, + "step": 3447 + }, + { + "epoch": 13.792, + "grad_norm": 0.2994031608104706, + "learning_rate": 1.557114228456914e-05, + "loss": 0.0102, + "step": 3448 + }, + { + "epoch": 13.796, + "grad_norm": 0.26790478825569153, + "learning_rate": 1.556112224448898e-05, + "loss": 0.009, + "step": 3449 + }, + { + "epoch": 13.8, + "grad_norm": 0.2465956211090088, + "learning_rate": 1.555110220440882e-05, + "loss": 0.0106, + "step": 3450 + }, + { + "epoch": 13.804, + "grad_norm": 0.40356552600860596, + 
"learning_rate": 1.5541082164328658e-05, + "loss": 0.0105, + "step": 3451 + }, + { + "epoch": 13.808, + "grad_norm": 0.2311774492263794, + "learning_rate": 1.5531062124248495e-05, + "loss": 0.0094, + "step": 3452 + }, + { + "epoch": 13.812, + "grad_norm": 0.32201912999153137, + "learning_rate": 1.552104208416834e-05, + "loss": 0.012, + "step": 3453 + }, + { + "epoch": 13.816, + "grad_norm": 0.35122230648994446, + "learning_rate": 1.5511022044088178e-05, + "loss": 0.0101, + "step": 3454 + }, + { + "epoch": 13.82, + "grad_norm": 0.3138692378997803, + "learning_rate": 1.5501002004008016e-05, + "loss": 0.0109, + "step": 3455 + }, + { + "epoch": 13.824, + "grad_norm": 0.3369031846523285, + "learning_rate": 1.5490981963927857e-05, + "loss": 0.0101, + "step": 3456 + }, + { + "epoch": 13.828, + "grad_norm": 0.31963375210762024, + "learning_rate": 1.5480961923847695e-05, + "loss": 0.011, + "step": 3457 + }, + { + "epoch": 13.832, + "grad_norm": 0.42217960953712463, + "learning_rate": 1.5470941883767536e-05, + "loss": 0.0111, + "step": 3458 + }, + { + "epoch": 13.836, + "grad_norm": 0.39974090456962585, + "learning_rate": 1.5460921843687378e-05, + "loss": 0.0114, + "step": 3459 + }, + { + "epoch": 13.84, + "grad_norm": 0.540280818939209, + "learning_rate": 1.5450901803607216e-05, + "loss": 0.0129, + "step": 3460 + }, + { + "epoch": 13.844, + "grad_norm": 0.2044568657875061, + "learning_rate": 1.5440881763527053e-05, + "loss": 0.0095, + "step": 3461 + }, + { + "epoch": 13.848, + "grad_norm": 0.376753032207489, + "learning_rate": 1.5430861723446895e-05, + "loss": 0.0115, + "step": 3462 + }, + { + "epoch": 13.852, + "grad_norm": 0.40467560291290283, + "learning_rate": 1.5420841683366736e-05, + "loss": 0.0122, + "step": 3463 + }, + { + "epoch": 13.856, + "grad_norm": 0.2836839556694031, + "learning_rate": 1.5410821643286574e-05, + "loss": 0.0098, + "step": 3464 + }, + { + "epoch": 13.86, + "grad_norm": 0.5280314683914185, + "learning_rate": 1.5400801603206412e-05, + "loss": 
0.0105, + "step": 3465 + }, + { + "epoch": 13.864, + "grad_norm": 0.24809741973876953, + "learning_rate": 1.5390781563126253e-05, + "loss": 0.0101, + "step": 3466 + }, + { + "epoch": 13.868, + "grad_norm": 0.3241751790046692, + "learning_rate": 1.538076152304609e-05, + "loss": 0.0111, + "step": 3467 + }, + { + "epoch": 13.872, + "grad_norm": 0.2702431380748749, + "learning_rate": 1.5370741482965932e-05, + "loss": 0.0112, + "step": 3468 + }, + { + "epoch": 13.876, + "grad_norm": 0.4810154139995575, + "learning_rate": 1.5360721442885773e-05, + "loss": 0.0118, + "step": 3469 + }, + { + "epoch": 13.88, + "grad_norm": 0.3676247000694275, + "learning_rate": 1.535070140280561e-05, + "loss": 0.0104, + "step": 3470 + }, + { + "epoch": 13.884, + "grad_norm": 0.27984535694122314, + "learning_rate": 1.534068136272545e-05, + "loss": 0.0099, + "step": 3471 + }, + { + "epoch": 13.888, + "grad_norm": 0.20753216743469238, + "learning_rate": 1.533066132264529e-05, + "loss": 0.01, + "step": 3472 + }, + { + "epoch": 13.892, + "grad_norm": 0.22736266255378723, + "learning_rate": 1.5320641282565132e-05, + "loss": 0.0094, + "step": 3473 + }, + { + "epoch": 13.896, + "grad_norm": 0.2514065206050873, + "learning_rate": 1.531062124248497e-05, + "loss": 0.0095, + "step": 3474 + }, + { + "epoch": 13.9, + "grad_norm": 0.39425936341285706, + "learning_rate": 1.530060120240481e-05, + "loss": 0.011, + "step": 3475 + }, + { + "epoch": 13.904, + "grad_norm": 0.3547126054763794, + "learning_rate": 1.529058116232465e-05, + "loss": 0.01, + "step": 3476 + }, + { + "epoch": 13.908, + "grad_norm": 0.20616790652275085, + "learning_rate": 1.5280561122244487e-05, + "loss": 0.0067, + "step": 3477 + }, + { + "epoch": 13.912, + "grad_norm": 0.2141866683959961, + "learning_rate": 1.527054108216433e-05, + "loss": 0.0094, + "step": 3478 + }, + { + "epoch": 13.916, + "grad_norm": 0.3642483055591583, + "learning_rate": 1.526052104208417e-05, + "loss": 0.01, + "step": 3479 + }, + { + "epoch": 13.92, + "grad_norm": 
0.24967001378536224, + "learning_rate": 1.5250501002004009e-05, + "loss": 0.0115, + "step": 3480 + }, + { + "epoch": 13.924, + "grad_norm": 0.31511619687080383, + "learning_rate": 1.5240480961923847e-05, + "loss": 0.009, + "step": 3481 + }, + { + "epoch": 13.928, + "grad_norm": 0.5050347447395325, + "learning_rate": 1.523046092184369e-05, + "loss": 0.0128, + "step": 3482 + }, + { + "epoch": 13.932, + "grad_norm": 0.3271845579147339, + "learning_rate": 1.5220440881763528e-05, + "loss": 0.0102, + "step": 3483 + }, + { + "epoch": 13.936, + "grad_norm": 0.49033862352371216, + "learning_rate": 1.5210420841683367e-05, + "loss": 0.0095, + "step": 3484 + }, + { + "epoch": 13.94, + "grad_norm": 0.34133315086364746, + "learning_rate": 1.5200400801603207e-05, + "loss": 0.0106, + "step": 3485 + }, + { + "epoch": 13.943999999999999, + "grad_norm": 0.29839426279067993, + "learning_rate": 1.5190380761523047e-05, + "loss": 0.0096, + "step": 3486 + }, + { + "epoch": 13.948, + "grad_norm": 0.21854914724826813, + "learning_rate": 1.5180360721442888e-05, + "loss": 0.0096, + "step": 3487 + }, + { + "epoch": 13.952, + "grad_norm": 0.41724851727485657, + "learning_rate": 1.5170340681362727e-05, + "loss": 0.012, + "step": 3488 + }, + { + "epoch": 13.956, + "grad_norm": 0.09979083389043808, + "learning_rate": 1.5160320641282565e-05, + "loss": 0.0041, + "step": 3489 + }, + { + "epoch": 13.96, + "grad_norm": 0.1923419088125229, + "learning_rate": 1.5150300601202405e-05, + "loss": 0.0062, + "step": 3490 + }, + { + "epoch": 13.964, + "grad_norm": 0.2782224416732788, + "learning_rate": 1.5140280561122244e-05, + "loss": 0.0104, + "step": 3491 + }, + { + "epoch": 13.968, + "grad_norm": 0.3307152986526489, + "learning_rate": 1.5130260521042086e-05, + "loss": 0.0108, + "step": 3492 + }, + { + "epoch": 13.972, + "grad_norm": 0.247494637966156, + "learning_rate": 1.5120240480961925e-05, + "loss": 0.006, + "step": 3493 + }, + { + "epoch": 13.975999999999999, + "grad_norm": 0.27174144983291626, + 
"learning_rate": 1.5110220440881765e-05, + "loss": 0.0081, + "step": 3494 + }, + { + "epoch": 13.98, + "grad_norm": 0.26648882031440735, + "learning_rate": 1.5100200400801603e-05, + "loss": 0.0101, + "step": 3495 + }, + { + "epoch": 13.984, + "grad_norm": 0.2658537030220032, + "learning_rate": 1.5090180360721442e-05, + "loss": 0.0098, + "step": 3496 + }, + { + "epoch": 13.988, + "grad_norm": 0.2758139371871948, + "learning_rate": 1.5080160320641284e-05, + "loss": 0.0095, + "step": 3497 + }, + { + "epoch": 13.992, + "grad_norm": 0.3168604075908661, + "learning_rate": 1.5070140280561123e-05, + "loss": 0.0106, + "step": 3498 + }, + { + "epoch": 13.996, + "grad_norm": 0.3637627065181732, + "learning_rate": 1.5060120240480963e-05, + "loss": 0.012, + "step": 3499 + }, + { + "epoch": 14.0, + "grad_norm": 0.3541504144668579, + "learning_rate": 1.5050100200400802e-05, + "loss": 0.0065, + "step": 3500 + }, + { + "epoch": 14.004, + "grad_norm": 0.15899381041526794, + "learning_rate": 1.504008016032064e-05, + "loss": 0.008, + "step": 3501 + }, + { + "epoch": 14.008, + "grad_norm": 0.18751265108585358, + "learning_rate": 1.5030060120240483e-05, + "loss": 0.0079, + "step": 3502 + }, + { + "epoch": 14.012, + "grad_norm": 0.1956561952829361, + "learning_rate": 1.5020040080160321e-05, + "loss": 0.0084, + "step": 3503 + }, + { + "epoch": 14.016, + "grad_norm": 0.17938251793384552, + "learning_rate": 1.501002004008016e-05, + "loss": 0.0093, + "step": 3504 + }, + { + "epoch": 14.02, + "grad_norm": 0.2153725028038025, + "learning_rate": 1.5e-05, + "loss": 0.0086, + "step": 3505 + }, + { + "epoch": 14.024, + "grad_norm": 0.08399572223424911, + "learning_rate": 1.498997995991984e-05, + "loss": 0.0041, + "step": 3506 + }, + { + "epoch": 14.028, + "grad_norm": 0.20708952844142914, + "learning_rate": 1.4979959919839681e-05, + "loss": 0.0078, + "step": 3507 + }, + { + "epoch": 14.032, + "grad_norm": 0.18916714191436768, + "learning_rate": 1.496993987975952e-05, + "loss": 0.0089, + "step": 
3508 + }, + { + "epoch": 14.036, + "grad_norm": 0.14934112131595612, + "learning_rate": 1.4959919839679359e-05, + "loss": 0.0076, + "step": 3509 + }, + { + "epoch": 14.04, + "grad_norm": 0.2596082389354706, + "learning_rate": 1.4949899799599198e-05, + "loss": 0.0092, + "step": 3510 + }, + { + "epoch": 14.044, + "grad_norm": 0.16445767879486084, + "learning_rate": 1.4939879759519038e-05, + "loss": 0.0054, + "step": 3511 + }, + { + "epoch": 14.048, + "grad_norm": 0.2873588800430298, + "learning_rate": 1.492985971943888e-05, + "loss": 0.0098, + "step": 3512 + }, + { + "epoch": 14.052, + "grad_norm": 0.19016903638839722, + "learning_rate": 1.4919839679358719e-05, + "loss": 0.0082, + "step": 3513 + }, + { + "epoch": 14.056, + "grad_norm": 0.1945473998785019, + "learning_rate": 1.4909819639278558e-05, + "loss": 0.0084, + "step": 3514 + }, + { + "epoch": 14.06, + "grad_norm": 0.5994365215301514, + "learning_rate": 1.4899799599198396e-05, + "loss": 0.0127, + "step": 3515 + }, + { + "epoch": 14.064, + "grad_norm": 0.2284667044878006, + "learning_rate": 1.4889779559118236e-05, + "loss": 0.0081, + "step": 3516 + }, + { + "epoch": 14.068, + "grad_norm": 0.1868845373392105, + "learning_rate": 1.4879759519038077e-05, + "loss": 0.0087, + "step": 3517 + }, + { + "epoch": 14.072, + "grad_norm": 0.25140196084976196, + "learning_rate": 1.4869739478957917e-05, + "loss": 0.0087, + "step": 3518 + }, + { + "epoch": 14.076, + "grad_norm": 0.30023810267448425, + "learning_rate": 1.4859719438877756e-05, + "loss": 0.0098, + "step": 3519 + }, + { + "epoch": 14.08, + "grad_norm": 0.3479897379875183, + "learning_rate": 1.4849699398797596e-05, + "loss": 0.0087, + "step": 3520 + }, + { + "epoch": 14.084, + "grad_norm": 0.33798903226852417, + "learning_rate": 1.4839679358717434e-05, + "loss": 0.0078, + "step": 3521 + }, + { + "epoch": 14.088, + "grad_norm": 0.21001707017421722, + "learning_rate": 1.4829659318637277e-05, + "loss": 0.0056, + "step": 3522 + }, + { + "epoch": 14.092, + "grad_norm": 
0.3976731598377228, + "learning_rate": 1.4819639278557115e-05, + "loss": 0.0098, + "step": 3523 + }, + { + "epoch": 14.096, + "grad_norm": 0.26884251832962036, + "learning_rate": 1.4809619238476954e-05, + "loss": 0.008, + "step": 3524 + }, + { + "epoch": 14.1, + "grad_norm": 0.16603799164295197, + "learning_rate": 1.4799599198396794e-05, + "loss": 0.0081, + "step": 3525 + }, + { + "epoch": 14.104, + "grad_norm": 0.25354495644569397, + "learning_rate": 1.4789579158316633e-05, + "loss": 0.0085, + "step": 3526 + }, + { + "epoch": 14.108, + "grad_norm": 0.49945899844169617, + "learning_rate": 1.4779559118236475e-05, + "loss": 0.0098, + "step": 3527 + }, + { + "epoch": 14.112, + "grad_norm": 0.15931391716003418, + "learning_rate": 1.4769539078156314e-05, + "loss": 0.0051, + "step": 3528 + }, + { + "epoch": 14.116, + "grad_norm": 0.2964676022529602, + "learning_rate": 1.4759519038076152e-05, + "loss": 0.0096, + "step": 3529 + }, + { + "epoch": 14.12, + "grad_norm": 0.2884165048599243, + "learning_rate": 1.4749498997995992e-05, + "loss": 0.008, + "step": 3530 + }, + { + "epoch": 14.124, + "grad_norm": 1.5515947341918945, + "learning_rate": 1.4739478957915831e-05, + "loss": 0.0073, + "step": 3531 + }, + { + "epoch": 14.128, + "grad_norm": 0.2911422550678253, + "learning_rate": 1.4729458917835673e-05, + "loss": 0.0107, + "step": 3532 + }, + { + "epoch": 14.132, + "grad_norm": 0.23672667145729065, + "learning_rate": 1.4719438877755512e-05, + "loss": 0.009, + "step": 3533 + }, + { + "epoch": 14.136, + "grad_norm": 0.4554106593132019, + "learning_rate": 1.4709418837675352e-05, + "loss": 0.0095, + "step": 3534 + }, + { + "epoch": 14.14, + "grad_norm": 0.27175405621528625, + "learning_rate": 1.469939879759519e-05, + "loss": 0.0049, + "step": 3535 + }, + { + "epoch": 14.144, + "grad_norm": 0.3456001281738281, + "learning_rate": 1.4689378757515033e-05, + "loss": 0.0107, + "step": 3536 + }, + { + "epoch": 14.148, + "grad_norm": 0.2605019509792328, + "learning_rate": 
1.467935871743487e-05, + "loss": 0.0064, + "step": 3537 + }, + { + "epoch": 14.152, + "grad_norm": 0.26379692554473877, + "learning_rate": 1.466933867735471e-05, + "loss": 0.0084, + "step": 3538 + }, + { + "epoch": 14.156, + "grad_norm": 0.23636199533939362, + "learning_rate": 1.465931863727455e-05, + "loss": 0.0082, + "step": 3539 + }, + { + "epoch": 14.16, + "grad_norm": 0.1825660616159439, + "learning_rate": 1.464929859719439e-05, + "loss": 0.008, + "step": 3540 + }, + { + "epoch": 14.164, + "grad_norm": 0.21577121317386627, + "learning_rate": 1.463927855711423e-05, + "loss": 0.0088, + "step": 3541 + }, + { + "epoch": 14.168, + "grad_norm": 0.18342304229736328, + "learning_rate": 1.462925851703407e-05, + "loss": 0.008, + "step": 3542 + }, + { + "epoch": 14.172, + "grad_norm": 0.16573582589626312, + "learning_rate": 1.4619238476953908e-05, + "loss": 0.0082, + "step": 3543 + }, + { + "epoch": 14.176, + "grad_norm": 0.20480182766914368, + "learning_rate": 1.4609218436873748e-05, + "loss": 0.0081, + "step": 3544 + }, + { + "epoch": 14.18, + "grad_norm": 0.19577351212501526, + "learning_rate": 1.4599198396793587e-05, + "loss": 0.0093, + "step": 3545 + }, + { + "epoch": 14.184, + "grad_norm": 0.23374129831790924, + "learning_rate": 1.4589178356713429e-05, + "loss": 0.0082, + "step": 3546 + }, + { + "epoch": 14.188, + "grad_norm": 0.22724345326423645, + "learning_rate": 1.4579158316633268e-05, + "loss": 0.0089, + "step": 3547 + }, + { + "epoch": 14.192, + "grad_norm": 0.31487712264060974, + "learning_rate": 1.4569138276553108e-05, + "loss": 0.0082, + "step": 3548 + }, + { + "epoch": 14.196, + "grad_norm": 0.2387700378894806, + "learning_rate": 1.4559118236472946e-05, + "loss": 0.0093, + "step": 3549 + }, + { + "epoch": 14.2, + "grad_norm": 0.24899888038635254, + "learning_rate": 1.4549098196392785e-05, + "loss": 0.0086, + "step": 3550 + }, + { + "epoch": 14.204, + "grad_norm": 0.21508151292800903, + "learning_rate": 1.4539078156312627e-05, + "loss": 0.0093, + "step": 
3551 + }, + { + "epoch": 14.208, + "grad_norm": 0.2187836617231369, + "learning_rate": 1.4529058116232466e-05, + "loss": 0.0084, + "step": 3552 + }, + { + "epoch": 14.212, + "grad_norm": 0.2252088487148285, + "learning_rate": 1.4519038076152306e-05, + "loss": 0.0078, + "step": 3553 + }, + { + "epoch": 14.216, + "grad_norm": 0.22216878831386566, + "learning_rate": 1.4509018036072145e-05, + "loss": 0.0088, + "step": 3554 + }, + { + "epoch": 14.22, + "grad_norm": 0.27992987632751465, + "learning_rate": 1.4498997995991983e-05, + "loss": 0.0082, + "step": 3555 + }, + { + "epoch": 14.224, + "grad_norm": 0.21498125791549683, + "learning_rate": 1.4488977955911826e-05, + "loss": 0.0088, + "step": 3556 + }, + { + "epoch": 14.228, + "grad_norm": 0.15717551112174988, + "learning_rate": 1.4478957915831664e-05, + "loss": 0.0074, + "step": 3557 + }, + { + "epoch": 14.232, + "grad_norm": 0.22314034402370453, + "learning_rate": 1.4468937875751504e-05, + "loss": 0.0082, + "step": 3558 + }, + { + "epoch": 14.236, + "grad_norm": 0.28885912895202637, + "learning_rate": 1.4458917835671343e-05, + "loss": 0.0101, + "step": 3559 + }, + { + "epoch": 14.24, + "grad_norm": 0.22253185510635376, + "learning_rate": 1.4448897795591181e-05, + "loss": 0.0084, + "step": 3560 + }, + { + "epoch": 14.244, + "grad_norm": 0.22679786384105682, + "learning_rate": 1.4438877755511024e-05, + "loss": 0.0097, + "step": 3561 + }, + { + "epoch": 14.248, + "grad_norm": 0.22354552149772644, + "learning_rate": 1.4428857715430864e-05, + "loss": 0.0085, + "step": 3562 + }, + { + "epoch": 14.252, + "grad_norm": 0.24927063286304474, + "learning_rate": 1.4418837675350702e-05, + "loss": 0.0096, + "step": 3563 + }, + { + "epoch": 14.256, + "grad_norm": 0.20839311182498932, + "learning_rate": 1.4408817635270541e-05, + "loss": 0.0084, + "step": 3564 + }, + { + "epoch": 14.26, + "grad_norm": 0.15880200266838074, + "learning_rate": 1.439879759519038e-05, + "loss": 0.0084, + "step": 3565 + }, + { + "epoch": 14.264, + 
"grad_norm": 0.1647965908050537, + "learning_rate": 1.4388777555110222e-05, + "loss": 0.0077, + "step": 3566 + }, + { + "epoch": 14.268, + "grad_norm": 0.20082905888557434, + "learning_rate": 1.4378757515030062e-05, + "loss": 0.0092, + "step": 3567 + }, + { + "epoch": 14.272, + "grad_norm": 0.49212315678596497, + "learning_rate": 1.4368737474949901e-05, + "loss": 0.012, + "step": 3568 + }, + { + "epoch": 14.276, + "grad_norm": 0.20353491604328156, + "learning_rate": 1.4358717434869739e-05, + "loss": 0.0085, + "step": 3569 + }, + { + "epoch": 14.28, + "grad_norm": 0.18611657619476318, + "learning_rate": 1.4348697394789579e-05, + "loss": 0.0086, + "step": 3570 + }, + { + "epoch": 14.284, + "grad_norm": 0.2258971929550171, + "learning_rate": 1.433867735470942e-05, + "loss": 0.0081, + "step": 3571 + }, + { + "epoch": 14.288, + "grad_norm": 0.21564999222755432, + "learning_rate": 1.432865731462926e-05, + "loss": 0.0076, + "step": 3572 + }, + { + "epoch": 14.292, + "grad_norm": 0.17343463003635406, + "learning_rate": 1.43186372745491e-05, + "loss": 0.0085, + "step": 3573 + }, + { + "epoch": 14.296, + "grad_norm": 0.367103636264801, + "learning_rate": 1.4308617234468937e-05, + "loss": 0.01, + "step": 3574 + }, + { + "epoch": 14.3, + "grad_norm": 0.21690668165683746, + "learning_rate": 1.4298597194388777e-05, + "loss": 0.0094, + "step": 3575 + }, + { + "epoch": 14.304, + "grad_norm": 0.15850843489170074, + "learning_rate": 1.428857715430862e-05, + "loss": 0.0075, + "step": 3576 + }, + { + "epoch": 14.308, + "grad_norm": 0.31984981894493103, + "learning_rate": 1.4278557114228458e-05, + "loss": 0.0101, + "step": 3577 + }, + { + "epoch": 14.312, + "grad_norm": 0.45315784215927124, + "learning_rate": 1.4268537074148297e-05, + "loss": 0.0115, + "step": 3578 + }, + { + "epoch": 14.316, + "grad_norm": 0.207364022731781, + "learning_rate": 1.4258517034068137e-05, + "loss": 0.0086, + "step": 3579 + }, + { + "epoch": 14.32, + "grad_norm": 0.18384985625743866, + "learning_rate": 
1.4248496993987975e-05, + "loss": 0.0086, + "step": 3580 + }, + { + "epoch": 14.324, + "grad_norm": 0.23308269679546356, + "learning_rate": 1.4238476953907818e-05, + "loss": 0.0089, + "step": 3581 + }, + { + "epoch": 14.328, + "grad_norm": 0.3302237093448639, + "learning_rate": 1.4228456913827657e-05, + "loss": 0.0093, + "step": 3582 + }, + { + "epoch": 14.332, + "grad_norm": 0.23539598286151886, + "learning_rate": 1.4218436873747495e-05, + "loss": 0.0089, + "step": 3583 + }, + { + "epoch": 14.336, + "grad_norm": 0.16788506507873535, + "learning_rate": 1.4208416833667335e-05, + "loss": 0.0072, + "step": 3584 + }, + { + "epoch": 14.34, + "grad_norm": 0.20347057282924652, + "learning_rate": 1.4198396793587174e-05, + "loss": 0.0087, + "step": 3585 + }, + { + "epoch": 14.344, + "grad_norm": 0.2108476608991623, + "learning_rate": 1.4188376753507016e-05, + "loss": 0.0092, + "step": 3586 + }, + { + "epoch": 14.348, + "grad_norm": 0.24422381818294525, + "learning_rate": 1.4178356713426855e-05, + "loss": 0.0096, + "step": 3587 + }, + { + "epoch": 14.352, + "grad_norm": 0.16197939217090607, + "learning_rate": 1.4168336673346693e-05, + "loss": 0.0081, + "step": 3588 + }, + { + "epoch": 14.356, + "grad_norm": 0.20140737295150757, + "learning_rate": 1.4158316633266533e-05, + "loss": 0.0085, + "step": 3589 + }, + { + "epoch": 14.36, + "grad_norm": 0.27182310819625854, + "learning_rate": 1.4148296593186376e-05, + "loss": 0.0097, + "step": 3590 + }, + { + "epoch": 14.364, + "grad_norm": 0.1505798101425171, + "learning_rate": 1.4138276553106213e-05, + "loss": 0.0081, + "step": 3591 + }, + { + "epoch": 14.368, + "grad_norm": 0.17416727542877197, + "learning_rate": 1.4128256513026053e-05, + "loss": 0.0081, + "step": 3592 + }, + { + "epoch": 14.372, + "grad_norm": 0.21804209053516388, + "learning_rate": 1.4118236472945893e-05, + "loss": 0.0083, + "step": 3593 + }, + { + "epoch": 14.376, + "grad_norm": 0.36001965403556824, + "learning_rate": 1.410821643286573e-05, + "loss": 0.0104, + 
"step": 3594 + }, + { + "epoch": 14.38, + "grad_norm": 0.20571143925189972, + "learning_rate": 1.4098196392785574e-05, + "loss": 0.0082, + "step": 3595 + }, + { + "epoch": 14.384, + "grad_norm": 0.19949229061603546, + "learning_rate": 1.4088176352705411e-05, + "loss": 0.0089, + "step": 3596 + }, + { + "epoch": 14.388, + "grad_norm": 0.667675256729126, + "learning_rate": 1.4078156312625251e-05, + "loss": 0.0088, + "step": 3597 + }, + { + "epoch": 14.392, + "grad_norm": 0.167070671916008, + "learning_rate": 1.406813627254509e-05, + "loss": 0.0087, + "step": 3598 + }, + { + "epoch": 14.396, + "grad_norm": 0.3734051585197449, + "learning_rate": 1.405811623246493e-05, + "loss": 0.0095, + "step": 3599 + }, + { + "epoch": 14.4, + "grad_norm": 0.28370603919029236, + "learning_rate": 1.4048096192384771e-05, + "loss": 0.0088, + "step": 3600 + }, + { + "epoch": 14.404, + "grad_norm": 0.30198901891708374, + "learning_rate": 1.4038076152304611e-05, + "loss": 0.0098, + "step": 3601 + }, + { + "epoch": 14.408, + "grad_norm": 0.23166725039482117, + "learning_rate": 1.4028056112224449e-05, + "loss": 0.008, + "step": 3602 + }, + { + "epoch": 14.412, + "grad_norm": 0.19708774983882904, + "learning_rate": 1.4018036072144289e-05, + "loss": 0.009, + "step": 3603 + }, + { + "epoch": 14.416, + "grad_norm": 0.30307242274284363, + "learning_rate": 1.4008016032064128e-05, + "loss": 0.0101, + "step": 3604 + }, + { + "epoch": 14.42, + "grad_norm": 0.16705302894115448, + "learning_rate": 1.399799599198397e-05, + "loss": 0.0076, + "step": 3605 + }, + { + "epoch": 14.424, + "grad_norm": 0.1781936138868332, + "learning_rate": 1.3987975951903809e-05, + "loss": 0.0087, + "step": 3606 + }, + { + "epoch": 14.428, + "grad_norm": 0.2449142336845398, + "learning_rate": 1.3977955911823649e-05, + "loss": 0.0104, + "step": 3607 + }, + { + "epoch": 14.432, + "grad_norm": 0.16610880196094513, + "learning_rate": 1.3967935871743486e-05, + "loss": 0.0082, + "step": 3608 + }, + { + "epoch": 14.436, + "grad_norm": 
0.26270821690559387, + "learning_rate": 1.3957915831663326e-05, + "loss": 0.0096, + "step": 3609 + }, + { + "epoch": 14.44, + "grad_norm": 0.18053866922855377, + "learning_rate": 1.3947895791583167e-05, + "loss": 0.0055, + "step": 3610 + }, + { + "epoch": 14.444, + "grad_norm": 0.17106251418590546, + "learning_rate": 1.3937875751503007e-05, + "loss": 0.0085, + "step": 3611 + }, + { + "epoch": 14.448, + "grad_norm": 1.2041096687316895, + "learning_rate": 1.3927855711422847e-05, + "loss": 0.0099, + "step": 3612 + }, + { + "epoch": 14.452, + "grad_norm": 0.2078244984149933, + "learning_rate": 1.3917835671342686e-05, + "loss": 0.0093, + "step": 3613 + }, + { + "epoch": 14.456, + "grad_norm": 0.2774932384490967, + "learning_rate": 1.3907815631262524e-05, + "loss": 0.0095, + "step": 3614 + }, + { + "epoch": 14.46, + "grad_norm": 0.17771194875240326, + "learning_rate": 1.3897795591182367e-05, + "loss": 0.0075, + "step": 3615 + }, + { + "epoch": 14.464, + "grad_norm": 0.17751413583755493, + "learning_rate": 1.3887775551102205e-05, + "loss": 0.0086, + "step": 3616 + }, + { + "epoch": 14.468, + "grad_norm": 0.289604514837265, + "learning_rate": 1.3877755511022044e-05, + "loss": 0.0094, + "step": 3617 + }, + { + "epoch": 14.472, + "grad_norm": 0.1762779951095581, + "learning_rate": 1.3867735470941884e-05, + "loss": 0.0083, + "step": 3618 + }, + { + "epoch": 14.475999999999999, + "grad_norm": 0.2782403528690338, + "learning_rate": 1.3857715430861724e-05, + "loss": 0.0064, + "step": 3619 + }, + { + "epoch": 14.48, + "grad_norm": 0.22804078459739685, + "learning_rate": 1.3847695390781565e-05, + "loss": 0.0094, + "step": 3620 + }, + { + "epoch": 14.484, + "grad_norm": 0.29062676429748535, + "learning_rate": 1.3837675350701405e-05, + "loss": 0.0109, + "step": 3621 + }, + { + "epoch": 14.488, + "grad_norm": 0.2231641560792923, + "learning_rate": 1.3827655310621242e-05, + "loss": 0.0086, + "step": 3622 + }, + { + "epoch": 14.492, + "grad_norm": 0.1812341958284378, + "learning_rate": 
1.3817635270541082e-05, + "loss": 0.0088, + "step": 3623 + }, + { + "epoch": 14.496, + "grad_norm": 0.1771288812160492, + "learning_rate": 1.3807615230460922e-05, + "loss": 0.009, + "step": 3624 + }, + { + "epoch": 14.5, + "grad_norm": 0.23491171002388, + "learning_rate": 1.3797595190380763e-05, + "loss": 0.0091, + "step": 3625 + }, + { + "epoch": 14.504, + "grad_norm": 0.20434904098510742, + "learning_rate": 1.3787575150300602e-05, + "loss": 0.0086, + "step": 3626 + }, + { + "epoch": 14.508, + "grad_norm": 0.16007202863693237, + "learning_rate": 1.3777555110220442e-05, + "loss": 0.0075, + "step": 3627 + }, + { + "epoch": 14.512, + "grad_norm": 0.21671073138713837, + "learning_rate": 1.376753507014028e-05, + "loss": 0.0085, + "step": 3628 + }, + { + "epoch": 14.516, + "grad_norm": 0.23178523778915405, + "learning_rate": 1.375751503006012e-05, + "loss": 0.009, + "step": 3629 + }, + { + "epoch": 14.52, + "grad_norm": 0.2952883541584015, + "learning_rate": 1.374749498997996e-05, + "loss": 0.0104, + "step": 3630 + }, + { + "epoch": 14.524000000000001, + "grad_norm": 0.22766168415546417, + "learning_rate": 1.37374749498998e-05, + "loss": 0.0082, + "step": 3631 + }, + { + "epoch": 14.528, + "grad_norm": 0.17687074840068817, + "learning_rate": 1.372745490981964e-05, + "loss": 0.0077, + "step": 3632 + }, + { + "epoch": 14.532, + "grad_norm": 0.2766241133213043, + "learning_rate": 1.371743486973948e-05, + "loss": 0.0097, + "step": 3633 + }, + { + "epoch": 14.536, + "grad_norm": 0.2916935980319977, + "learning_rate": 1.3707414829659317e-05, + "loss": 0.011, + "step": 3634 + }, + { + "epoch": 14.54, + "grad_norm": 0.4812479615211487, + "learning_rate": 1.369739478957916e-05, + "loss": 0.0104, + "step": 3635 + }, + { + "epoch": 14.544, + "grad_norm": 0.20633623003959656, + "learning_rate": 1.3687374749498998e-05, + "loss": 0.008, + "step": 3636 + }, + { + "epoch": 14.548, + "grad_norm": 0.20415335893630981, + "learning_rate": 1.3677354709418838e-05, + "loss": 0.0083, + "step": 
3637 + }, + { + "epoch": 14.552, + "grad_norm": 0.23814812302589417, + "learning_rate": 1.3667334669338678e-05, + "loss": 0.0095, + "step": 3638 + }, + { + "epoch": 14.556000000000001, + "grad_norm": 0.33633625507354736, + "learning_rate": 1.3657314629258517e-05, + "loss": 0.0099, + "step": 3639 + }, + { + "epoch": 14.56, + "grad_norm": 0.3303033113479614, + "learning_rate": 1.3647294589178358e-05, + "loss": 0.0094, + "step": 3640 + }, + { + "epoch": 14.564, + "grad_norm": 0.2839348316192627, + "learning_rate": 1.3637274549098198e-05, + "loss": 0.0103, + "step": 3641 + }, + { + "epoch": 14.568, + "grad_norm": 0.2589602470397949, + "learning_rate": 1.3627254509018036e-05, + "loss": 0.0086, + "step": 3642 + }, + { + "epoch": 14.572, + "grad_norm": 0.17141884565353394, + "learning_rate": 1.3617234468937875e-05, + "loss": 0.0085, + "step": 3643 + }, + { + "epoch": 14.576, + "grad_norm": 0.20422735810279846, + "learning_rate": 1.3607214428857717e-05, + "loss": 0.0087, + "step": 3644 + }, + { + "epoch": 14.58, + "grad_norm": 0.36146315932273865, + "learning_rate": 1.3597194388777556e-05, + "loss": 0.0103, + "step": 3645 + }, + { + "epoch": 14.584, + "grad_norm": 0.3737517297267914, + "learning_rate": 1.3587174348697396e-05, + "loss": 0.0085, + "step": 3646 + }, + { + "epoch": 14.588, + "grad_norm": 0.20324692130088806, + "learning_rate": 1.3577154308617236e-05, + "loss": 0.0083, + "step": 3647 + }, + { + "epoch": 14.592, + "grad_norm": 0.21344539523124695, + "learning_rate": 1.3567134268537073e-05, + "loss": 0.009, + "step": 3648 + }, + { + "epoch": 14.596, + "grad_norm": 0.3655132055282593, + "learning_rate": 1.3557114228456916e-05, + "loss": 0.011, + "step": 3649 + }, + { + "epoch": 14.6, + "grad_norm": 0.20782031118869781, + "learning_rate": 1.3547094188376754e-05, + "loss": 0.0089, + "step": 3650 + }, + { + "epoch": 14.604, + "grad_norm": 0.28657761216163635, + "learning_rate": 1.3537074148296594e-05, + "loss": 0.0093, + "step": 3651 + }, + { + "epoch": 14.608, + 
"grad_norm": 0.25337931513786316, + "learning_rate": 1.3527054108216433e-05, + "loss": 0.0093, + "step": 3652 + }, + { + "epoch": 14.612, + "grad_norm": 0.36946016550064087, + "learning_rate": 1.3517034068136273e-05, + "loss": 0.0113, + "step": 3653 + }, + { + "epoch": 14.616, + "grad_norm": 0.18227268755435944, + "learning_rate": 1.3507014028056114e-05, + "loss": 0.0084, + "step": 3654 + }, + { + "epoch": 14.62, + "grad_norm": 0.1899784952402115, + "learning_rate": 1.3496993987975954e-05, + "loss": 0.0085, + "step": 3655 + }, + { + "epoch": 14.624, + "grad_norm": 0.16006070375442505, + "learning_rate": 1.3486973947895792e-05, + "loss": 0.0086, + "step": 3656 + }, + { + "epoch": 14.628, + "grad_norm": 0.2663494348526001, + "learning_rate": 1.3476953907815631e-05, + "loss": 0.0104, + "step": 3657 + }, + { + "epoch": 14.632, + "grad_norm": 0.19320881366729736, + "learning_rate": 1.3466933867735471e-05, + "loss": 0.0092, + "step": 3658 + }, + { + "epoch": 14.636, + "grad_norm": 0.5768705010414124, + "learning_rate": 1.3456913827655312e-05, + "loss": 0.0127, + "step": 3659 + }, + { + "epoch": 14.64, + "grad_norm": 0.15998607873916626, + "learning_rate": 1.3446893787575152e-05, + "loss": 0.0089, + "step": 3660 + }, + { + "epoch": 14.644, + "grad_norm": 0.21522219479084015, + "learning_rate": 1.3436873747494991e-05, + "loss": 0.0079, + "step": 3661 + }, + { + "epoch": 14.648, + "grad_norm": 0.2248634248971939, + "learning_rate": 1.342685370741483e-05, + "loss": 0.0098, + "step": 3662 + }, + { + "epoch": 14.652, + "grad_norm": 0.23990559577941895, + "learning_rate": 1.3416833667334669e-05, + "loss": 0.0096, + "step": 3663 + }, + { + "epoch": 14.656, + "grad_norm": 0.18346965312957764, + "learning_rate": 1.340681362725451e-05, + "loss": 0.0081, + "step": 3664 + }, + { + "epoch": 14.66, + "grad_norm": 0.46722784638404846, + "learning_rate": 1.339679358717435e-05, + "loss": 0.0119, + "step": 3665 + }, + { + "epoch": 14.664, + "grad_norm": 0.19062618911266327, + 
"learning_rate": 1.338677354709419e-05, + "loss": 0.0082, + "step": 3666 + }, + { + "epoch": 14.668, + "grad_norm": 0.40659964084625244, + "learning_rate": 1.3376753507014029e-05, + "loss": 0.0097, + "step": 3667 + }, + { + "epoch": 14.672, + "grad_norm": 0.22992847859859467, + "learning_rate": 1.3366733466933867e-05, + "loss": 0.0094, + "step": 3668 + }, + { + "epoch": 14.676, + "grad_norm": 0.3991800844669342, + "learning_rate": 1.335671342685371e-05, + "loss": 0.0129, + "step": 3669 + }, + { + "epoch": 14.68, + "grad_norm": 0.1878584921360016, + "learning_rate": 1.3346693386773548e-05, + "loss": 0.0085, + "step": 3670 + }, + { + "epoch": 14.684, + "grad_norm": 0.1736784130334854, + "learning_rate": 1.3336673346693387e-05, + "loss": 0.0094, + "step": 3671 + }, + { + "epoch": 14.688, + "grad_norm": 0.18842560052871704, + "learning_rate": 1.3326653306613227e-05, + "loss": 0.0085, + "step": 3672 + }, + { + "epoch": 14.692, + "grad_norm": 0.2932375967502594, + "learning_rate": 1.3316633266533065e-05, + "loss": 0.0094, + "step": 3673 + }, + { + "epoch": 14.696, + "grad_norm": 0.5152665376663208, + "learning_rate": 1.3306613226452908e-05, + "loss": 0.0096, + "step": 3674 + }, + { + "epoch": 14.7, + "grad_norm": 0.34712889790534973, + "learning_rate": 1.3296593186372747e-05, + "loss": 0.0092, + "step": 3675 + }, + { + "epoch": 14.704, + "grad_norm": 0.18002943694591522, + "learning_rate": 1.3286573146292585e-05, + "loss": 0.0089, + "step": 3676 + }, + { + "epoch": 14.708, + "grad_norm": 0.26756277680397034, + "learning_rate": 1.3276553106212425e-05, + "loss": 0.0092, + "step": 3677 + }, + { + "epoch": 14.712, + "grad_norm": 0.24038799107074738, + "learning_rate": 1.3266533066132264e-05, + "loss": 0.01, + "step": 3678 + }, + { + "epoch": 14.716, + "grad_norm": 0.526962161064148, + "learning_rate": 1.3256513026052106e-05, + "loss": 0.0112, + "step": 3679 + }, + { + "epoch": 14.72, + "grad_norm": 0.184894397854805, + "learning_rate": 1.3246492985971945e-05, + "loss": 
0.008, + "step": 3680 + }, + { + "epoch": 14.724, + "grad_norm": 0.2070516049861908, + "learning_rate": 1.3236472945891785e-05, + "loss": 0.0087, + "step": 3681 + }, + { + "epoch": 14.728, + "grad_norm": 0.23932699859142303, + "learning_rate": 1.3226452905811623e-05, + "loss": 0.0099, + "step": 3682 + }, + { + "epoch": 14.732, + "grad_norm": 0.24801121652126312, + "learning_rate": 1.3216432865731462e-05, + "loss": 0.0094, + "step": 3683 + }, + { + "epoch": 14.736, + "grad_norm": 0.21665263175964355, + "learning_rate": 1.3206412825651304e-05, + "loss": 0.0094, + "step": 3684 + }, + { + "epoch": 14.74, + "grad_norm": 0.19703127443790436, + "learning_rate": 1.3196392785571143e-05, + "loss": 0.008, + "step": 3685 + }, + { + "epoch": 14.744, + "grad_norm": 0.40190309286117554, + "learning_rate": 1.3186372745490983e-05, + "loss": 0.0089, + "step": 3686 + }, + { + "epoch": 14.748, + "grad_norm": 0.2351587414741516, + "learning_rate": 1.317635270541082e-05, + "loss": 0.0086, + "step": 3687 + }, + { + "epoch": 14.752, + "grad_norm": 0.26315975189208984, + "learning_rate": 1.316633266533066e-05, + "loss": 0.01, + "step": 3688 + }, + { + "epoch": 14.756, + "grad_norm": 0.33228597044944763, + "learning_rate": 1.3156312625250503e-05, + "loss": 0.0115, + "step": 3689 + }, + { + "epoch": 14.76, + "grad_norm": 0.2187896966934204, + "learning_rate": 1.3146292585170341e-05, + "loss": 0.0086, + "step": 3690 + }, + { + "epoch": 14.764, + "grad_norm": 0.2327001541852951, + "learning_rate": 1.313627254509018e-05, + "loss": 0.0096, + "step": 3691 + }, + { + "epoch": 14.768, + "grad_norm": 0.1458553969860077, + "learning_rate": 1.312625250501002e-05, + "loss": 0.0083, + "step": 3692 + }, + { + "epoch": 14.772, + "grad_norm": 0.2837975323200226, + "learning_rate": 1.3116232464929858e-05, + "loss": 0.0096, + "step": 3693 + }, + { + "epoch": 14.776, + "grad_norm": 0.2698477804660797, + "learning_rate": 1.3106212424849701e-05, + "loss": 0.0095, + "step": 3694 + }, + { + "epoch": 14.78, + 
"grad_norm": 0.6040169596672058, + "learning_rate": 1.309619238476954e-05, + "loss": 0.0119, + "step": 3695 + }, + { + "epoch": 14.784, + "grad_norm": 0.14232851564884186, + "learning_rate": 1.3086172344689379e-05, + "loss": 0.0085, + "step": 3696 + }, + { + "epoch": 14.788, + "grad_norm": 0.24106432497501373, + "learning_rate": 1.3076152304609218e-05, + "loss": 0.0088, + "step": 3697 + }, + { + "epoch": 14.792, + "grad_norm": 0.2642269432544708, + "learning_rate": 1.3066132264529058e-05, + "loss": 0.0098, + "step": 3698 + }, + { + "epoch": 14.796, + "grad_norm": 0.23594169318675995, + "learning_rate": 1.30561122244489e-05, + "loss": 0.0092, + "step": 3699 + }, + { + "epoch": 14.8, + "grad_norm": 0.24775654077529907, + "learning_rate": 1.3046092184368739e-05, + "loss": 0.0092, + "step": 3700 + }, + { + "epoch": 14.804, + "grad_norm": 0.2097250074148178, + "learning_rate": 1.3036072144288577e-05, + "loss": 0.0096, + "step": 3701 + }, + { + "epoch": 14.808, + "grad_norm": 0.2012004554271698, + "learning_rate": 1.3026052104208416e-05, + "loss": 0.0092, + "step": 3702 + }, + { + "epoch": 14.812, + "grad_norm": 0.20894677937030792, + "learning_rate": 1.301603206412826e-05, + "loss": 0.0093, + "step": 3703 + }, + { + "epoch": 14.816, + "grad_norm": 0.23382696509361267, + "learning_rate": 1.3006012024048097e-05, + "loss": 0.009, + "step": 3704 + }, + { + "epoch": 14.82, + "grad_norm": 0.19932790100574493, + "learning_rate": 1.2995991983967937e-05, + "loss": 0.0103, + "step": 3705 + }, + { + "epoch": 14.824, + "grad_norm": 0.28219786286354065, + "learning_rate": 1.2985971943887776e-05, + "loss": 0.01, + "step": 3706 + }, + { + "epoch": 14.828, + "grad_norm": 0.2530341148376465, + "learning_rate": 1.2975951903807614e-05, + "loss": 0.0092, + "step": 3707 + }, + { + "epoch": 14.832, + "grad_norm": 0.26421231031417847, + "learning_rate": 1.2965931863727457e-05, + "loss": 0.0097, + "step": 3708 + }, + { + "epoch": 14.836, + "grad_norm": 0.1763448268175125, + "learning_rate": 
1.2955911823647297e-05, + "loss": 0.0084, + "step": 3709 + }, + { + "epoch": 14.84, + "grad_norm": 0.26100030541419983, + "learning_rate": 1.2945891783567135e-05, + "loss": 0.0102, + "step": 3710 + }, + { + "epoch": 14.844, + "grad_norm": 0.22259047627449036, + "learning_rate": 1.2935871743486974e-05, + "loss": 0.0092, + "step": 3711 + }, + { + "epoch": 14.848, + "grad_norm": 0.19816388189792633, + "learning_rate": 1.2925851703406814e-05, + "loss": 0.0059, + "step": 3712 + }, + { + "epoch": 14.852, + "grad_norm": 0.270816832780838, + "learning_rate": 1.2915831663326655e-05, + "loss": 0.01, + "step": 3713 + }, + { + "epoch": 14.856, + "grad_norm": 0.25951826572418213, + "learning_rate": 1.2905811623246495e-05, + "loss": 0.0098, + "step": 3714 + }, + { + "epoch": 14.86, + "grad_norm": 0.28285837173461914, + "learning_rate": 1.2895791583166333e-05, + "loss": 0.0093, + "step": 3715 + }, + { + "epoch": 14.864, + "grad_norm": 0.42073768377304077, + "learning_rate": 1.2885771543086172e-05, + "loss": 0.0108, + "step": 3716 + }, + { + "epoch": 14.868, + "grad_norm": 0.506669282913208, + "learning_rate": 1.2875751503006012e-05, + "loss": 0.0094, + "step": 3717 + }, + { + "epoch": 14.872, + "grad_norm": 0.18972696363925934, + "learning_rate": 1.2865731462925853e-05, + "loss": 0.009, + "step": 3718 + }, + { + "epoch": 14.876, + "grad_norm": 0.17668847739696503, + "learning_rate": 1.2855711422845693e-05, + "loss": 0.0082, + "step": 3719 + }, + { + "epoch": 14.88, + "grad_norm": 0.30280017852783203, + "learning_rate": 1.2845691382765532e-05, + "loss": 0.0109, + "step": 3720 + }, + { + "epoch": 14.884, + "grad_norm": 0.21172870695590973, + "learning_rate": 1.283567134268537e-05, + "loss": 0.0094, + "step": 3721 + }, + { + "epoch": 14.888, + "grad_norm": 0.17289844155311584, + "learning_rate": 1.282565130260521e-05, + "loss": 0.0074, + "step": 3722 + }, + { + "epoch": 14.892, + "grad_norm": 0.17328234016895294, + "learning_rate": 1.2815631262525051e-05, + "loss": 0.0082, + "step": 
3723 + }, + { + "epoch": 14.896, + "grad_norm": 0.2376927137374878, + "learning_rate": 1.280561122244489e-05, + "loss": 0.01, + "step": 3724 + }, + { + "epoch": 14.9, + "grad_norm": 0.21705985069274902, + "learning_rate": 1.279559118236473e-05, + "loss": 0.0086, + "step": 3725 + }, + { + "epoch": 14.904, + "grad_norm": 0.272956520318985, + "learning_rate": 1.278557114228457e-05, + "loss": 0.0092, + "step": 3726 + }, + { + "epoch": 14.908, + "grad_norm": 0.18439574539661407, + "learning_rate": 1.2775551102204408e-05, + "loss": 0.0084, + "step": 3727 + }, + { + "epoch": 14.912, + "grad_norm": 0.19920729100704193, + "learning_rate": 1.276553106212425e-05, + "loss": 0.0091, + "step": 3728 + }, + { + "epoch": 14.916, + "grad_norm": 0.2559821605682373, + "learning_rate": 1.2755511022044089e-05, + "loss": 0.0091, + "step": 3729 + }, + { + "epoch": 14.92, + "grad_norm": 0.23575206100940704, + "learning_rate": 1.2745490981963928e-05, + "loss": 0.0084, + "step": 3730 + }, + { + "epoch": 14.924, + "grad_norm": 0.2531851828098297, + "learning_rate": 1.2735470941883768e-05, + "loss": 0.0089, + "step": 3731 + }, + { + "epoch": 14.928, + "grad_norm": 0.21447163820266724, + "learning_rate": 1.2725450901803607e-05, + "loss": 0.0099, + "step": 3732 + }, + { + "epoch": 14.932, + "grad_norm": 0.23144927620887756, + "learning_rate": 1.2715430861723449e-05, + "loss": 0.0098, + "step": 3733 + }, + { + "epoch": 14.936, + "grad_norm": 0.16847439110279083, + "learning_rate": 1.2705410821643288e-05, + "loss": 0.0083, + "step": 3734 + }, + { + "epoch": 14.94, + "grad_norm": 0.25669029355049133, + "learning_rate": 1.2695390781563126e-05, + "loss": 0.0097, + "step": 3735 + }, + { + "epoch": 14.943999999999999, + "grad_norm": 0.3748454451560974, + "learning_rate": 1.2685370741482966e-05, + "loss": 0.0097, + "step": 3736 + }, + { + "epoch": 14.948, + "grad_norm": 0.2604009509086609, + "learning_rate": 1.2675350701402805e-05, + "loss": 0.0094, + "step": 3737 + }, + { + "epoch": 14.952, + 
"grad_norm": 0.09197622537612915, + "learning_rate": 1.2665330661322647e-05, + "loss": 0.004, + "step": 3738 + }, + { + "epoch": 14.956, + "grad_norm": 0.2926792800426483, + "learning_rate": 1.2655310621242486e-05, + "loss": 0.0103, + "step": 3739 + }, + { + "epoch": 14.96, + "grad_norm": 0.19025073945522308, + "learning_rate": 1.2645290581162326e-05, + "loss": 0.009, + "step": 3740 + }, + { + "epoch": 14.964, + "grad_norm": 0.17044049501419067, + "learning_rate": 1.2635270541082164e-05, + "loss": 0.006, + "step": 3741 + }, + { + "epoch": 14.968, + "grad_norm": 0.16821922361850739, + "learning_rate": 1.2625250501002003e-05, + "loss": 0.006, + "step": 3742 + }, + { + "epoch": 14.972, + "grad_norm": 0.2913142740726471, + "learning_rate": 1.2615230460921844e-05, + "loss": 0.0097, + "step": 3743 + }, + { + "epoch": 14.975999999999999, + "grad_norm": 0.48879778385162354, + "learning_rate": 1.2605210420841684e-05, + "loss": 0.0101, + "step": 3744 + }, + { + "epoch": 14.98, + "grad_norm": 0.2455979734659195, + "learning_rate": 1.2595190380761524e-05, + "loss": 0.009, + "step": 3745 + }, + { + "epoch": 14.984, + "grad_norm": 0.20019696652889252, + "learning_rate": 1.2585170340681363e-05, + "loss": 0.009, + "step": 3746 + }, + { + "epoch": 14.988, + "grad_norm": 0.30625882744789124, + "learning_rate": 1.2575150300601201e-05, + "loss": 0.0101, + "step": 3747 + }, + { + "epoch": 14.992, + "grad_norm": 0.18524271249771118, + "learning_rate": 1.2565130260521044e-05, + "loss": 0.0093, + "step": 3748 + }, + { + "epoch": 14.996, + "grad_norm": 0.33599841594696045, + "learning_rate": 1.2555110220440882e-05, + "loss": 0.0118, + "step": 3749 + }, + { + "epoch": 15.0, + "grad_norm": 0.17473024129867554, + "learning_rate": 1.2545090180360722e-05, + "loss": 0.006, + "step": 3750 + }, + { + "epoch": 15.004, + "grad_norm": 0.19427764415740967, + "learning_rate": 1.2535070140280561e-05, + "loss": 0.0093, + "step": 3751 + }, + { + "epoch": 15.008, + "grad_norm": 0.1349514275789261, + 
"learning_rate": 1.25250501002004e-05, + "loss": 0.0072, + "step": 3752 + }, + { + "epoch": 15.012, + "grad_norm": 0.11184310913085938, + "learning_rate": 1.2515030060120242e-05, + "loss": 0.0056, + "step": 3753 + }, + { + "epoch": 15.016, + "grad_norm": 0.22410862147808075, + "learning_rate": 1.2505010020040082e-05, + "loss": 0.0085, + "step": 3754 + }, + { + "epoch": 15.02, + "grad_norm": 0.20344886183738708, + "learning_rate": 1.249498997995992e-05, + "loss": 0.0077, + "step": 3755 + }, + { + "epoch": 15.024, + "grad_norm": 0.20372788608074188, + "learning_rate": 1.248496993987976e-05, + "loss": 0.0083, + "step": 3756 + }, + { + "epoch": 15.028, + "grad_norm": 0.31820449233055115, + "learning_rate": 1.24749498997996e-05, + "loss": 0.0081, + "step": 3757 + }, + { + "epoch": 15.032, + "grad_norm": 0.2090875804424286, + "learning_rate": 1.2464929859719438e-05, + "loss": 0.0087, + "step": 3758 + }, + { + "epoch": 15.036, + "grad_norm": 0.15436923503875732, + "learning_rate": 1.245490981963928e-05, + "loss": 0.0069, + "step": 3759 + }, + { + "epoch": 15.04, + "grad_norm": 0.17816013097763062, + "learning_rate": 1.244488977955912e-05, + "loss": 0.0087, + "step": 3760 + }, + { + "epoch": 15.044, + "grad_norm": 0.14754058420658112, + "learning_rate": 1.2434869739478959e-05, + "loss": 0.0072, + "step": 3761 + }, + { + "epoch": 15.048, + "grad_norm": 0.29081836342811584, + "learning_rate": 1.2424849699398798e-05, + "loss": 0.0085, + "step": 3762 + }, + { + "epoch": 15.052, + "grad_norm": 0.14590558409690857, + "learning_rate": 1.2414829659318638e-05, + "loss": 0.0067, + "step": 3763 + }, + { + "epoch": 15.056, + "grad_norm": 0.1646532565355301, + "learning_rate": 1.2404809619238478e-05, + "loss": 0.0072, + "step": 3764 + }, + { + "epoch": 15.06, + "grad_norm": 0.3623543977737427, + "learning_rate": 1.2394789579158317e-05, + "loss": 0.0097, + "step": 3765 + }, + { + "epoch": 15.064, + "grad_norm": 0.074567049741745, + "learning_rate": 1.2384769539078157e-05, + "loss": 
0.0032, + "step": 3766 + }, + { + "epoch": 15.068, + "grad_norm": 0.25378715991973877, + "learning_rate": 1.2374749498997996e-05, + "loss": 0.0085, + "step": 3767 + }, + { + "epoch": 15.072, + "grad_norm": 0.14567115902900696, + "learning_rate": 1.2364729458917836e-05, + "loss": 0.0076, + "step": 3768 + }, + { + "epoch": 15.076, + "grad_norm": 0.13616079092025757, + "learning_rate": 1.2354709418837675e-05, + "loss": 0.0077, + "step": 3769 + }, + { + "epoch": 15.08, + "grad_norm": 0.13720935583114624, + "learning_rate": 1.2344689378757515e-05, + "loss": 0.0076, + "step": 3770 + }, + { + "epoch": 15.084, + "grad_norm": 0.13663603365421295, + "learning_rate": 1.2334669338677356e-05, + "loss": 0.0045, + "step": 3771 + }, + { + "epoch": 15.088, + "grad_norm": 0.16399826109409332, + "learning_rate": 1.2324649298597194e-05, + "loss": 0.0079, + "step": 3772 + }, + { + "epoch": 15.092, + "grad_norm": 0.25261420011520386, + "learning_rate": 1.2314629258517036e-05, + "loss": 0.0087, + "step": 3773 + }, + { + "epoch": 15.096, + "grad_norm": 0.17582829296588898, + "learning_rate": 1.2304609218436875e-05, + "loss": 0.0068, + "step": 3774 + }, + { + "epoch": 15.1, + "grad_norm": 0.18607257306575775, + "learning_rate": 1.2294589178356713e-05, + "loss": 0.0079, + "step": 3775 + }, + { + "epoch": 15.104, + "grad_norm": 0.19465355575084686, + "learning_rate": 1.2284569138276554e-05, + "loss": 0.0067, + "step": 3776 + }, + { + "epoch": 15.108, + "grad_norm": 0.28906580805778503, + "learning_rate": 1.2274549098196394e-05, + "loss": 0.0092, + "step": 3777 + }, + { + "epoch": 15.112, + "grad_norm": 0.2137903869152069, + "learning_rate": 1.2264529058116233e-05, + "loss": 0.0084, + "step": 3778 + }, + { + "epoch": 15.116, + "grad_norm": 0.18388208746910095, + "learning_rate": 1.2254509018036073e-05, + "loss": 0.0075, + "step": 3779 + }, + { + "epoch": 15.12, + "grad_norm": 0.237356498837471, + "learning_rate": 1.2244488977955913e-05, + "loss": 0.008, + "step": 3780 + }, + { + "epoch": 
15.124, + "grad_norm": 0.22582365572452545, + "learning_rate": 1.2234468937875752e-05, + "loss": 0.0094, + "step": 3781 + }, + { + "epoch": 15.128, + "grad_norm": 0.150029718875885, + "learning_rate": 1.2224448897795592e-05, + "loss": 0.008, + "step": 3782 + }, + { + "epoch": 15.132, + "grad_norm": 0.2120167315006256, + "learning_rate": 1.2214428857715431e-05, + "loss": 0.0091, + "step": 3783 + }, + { + "epoch": 15.136, + "grad_norm": 0.18410436809062958, + "learning_rate": 1.2204408817635271e-05, + "loss": 0.0073, + "step": 3784 + }, + { + "epoch": 15.14, + "grad_norm": 0.18777211010456085, + "learning_rate": 1.219438877755511e-05, + "loss": 0.0084, + "step": 3785 + }, + { + "epoch": 15.144, + "grad_norm": 0.1506580412387848, + "learning_rate": 1.218436873747495e-05, + "loss": 0.0077, + "step": 3786 + }, + { + "epoch": 15.148, + "grad_norm": 0.1882610023021698, + "learning_rate": 1.217434869739479e-05, + "loss": 0.0072, + "step": 3787 + }, + { + "epoch": 15.152, + "grad_norm": 0.14596189558506012, + "learning_rate": 1.2164328657314631e-05, + "loss": 0.0074, + "step": 3788 + }, + { + "epoch": 15.156, + "grad_norm": 0.14775827527046204, + "learning_rate": 1.2154308617234469e-05, + "loss": 0.0074, + "step": 3789 + }, + { + "epoch": 15.16, + "grad_norm": 0.1532842367887497, + "learning_rate": 1.2144288577154309e-05, + "loss": 0.0071, + "step": 3790 + }, + { + "epoch": 15.164, + "grad_norm": 0.18244397640228271, + "learning_rate": 1.213426853707415e-05, + "loss": 0.0085, + "step": 3791 + }, + { + "epoch": 15.168, + "grad_norm": 0.17389196157455444, + "learning_rate": 1.2124248496993988e-05, + "loss": 0.0081, + "step": 3792 + }, + { + "epoch": 15.172, + "grad_norm": 0.24255871772766113, + "learning_rate": 1.2114228456913829e-05, + "loss": 0.0095, + "step": 3793 + }, + { + "epoch": 15.176, + "grad_norm": 0.13241663575172424, + "learning_rate": 1.2104208416833669e-05, + "loss": 0.0066, + "step": 3794 + }, + { + "epoch": 15.18, + "grad_norm": 0.2261979728937149, + 
"learning_rate": 1.2094188376753506e-05, + "loss": 0.0105, + "step": 3795 + }, + { + "epoch": 15.184, + "grad_norm": 0.1342063695192337, + "learning_rate": 1.2084168336673348e-05, + "loss": 0.0073, + "step": 3796 + }, + { + "epoch": 15.188, + "grad_norm": 0.18595094978809357, + "learning_rate": 1.2074148296593187e-05, + "loss": 0.0078, + "step": 3797 + }, + { + "epoch": 15.192, + "grad_norm": 0.21512697637081146, + "learning_rate": 1.2064128256513027e-05, + "loss": 0.0083, + "step": 3798 + }, + { + "epoch": 15.196, + "grad_norm": 0.19803418219089508, + "learning_rate": 1.2054108216432867e-05, + "loss": 0.0079, + "step": 3799 + }, + { + "epoch": 15.2, + "grad_norm": 0.2116500288248062, + "learning_rate": 1.2044088176352706e-05, + "loss": 0.0083, + "step": 3800 + }, + { + "epoch": 15.204, + "grad_norm": 0.18894079327583313, + "learning_rate": 1.2034068136272546e-05, + "loss": 0.0075, + "step": 3801 + }, + { + "epoch": 15.208, + "grad_norm": 0.30808115005493164, + "learning_rate": 1.2024048096192385e-05, + "loss": 0.0085, + "step": 3802 + }, + { + "epoch": 15.212, + "grad_norm": 0.17777995765209198, + "learning_rate": 1.2014028056112225e-05, + "loss": 0.0078, + "step": 3803 + }, + { + "epoch": 15.216, + "grad_norm": 0.28307095170021057, + "learning_rate": 1.2004008016032064e-05, + "loss": 0.0089, + "step": 3804 + }, + { + "epoch": 15.22, + "grad_norm": 0.2219671756029129, + "learning_rate": 1.1993987975951906e-05, + "loss": 0.009, + "step": 3805 + }, + { + "epoch": 15.224, + "grad_norm": 0.1308339238166809, + "learning_rate": 1.1983967935871744e-05, + "loss": 0.008, + "step": 3806 + }, + { + "epoch": 15.228, + "grad_norm": 0.1603306233882904, + "learning_rate": 1.1973947895791583e-05, + "loss": 0.0073, + "step": 3807 + }, + { + "epoch": 15.232, + "grad_norm": 0.21934445202350616, + "learning_rate": 1.1963927855711425e-05, + "loss": 0.0089, + "step": 3808 + }, + { + "epoch": 15.236, + "grad_norm": 0.16111411154270172, + "learning_rate": 1.1953907815631262e-05, + 
"loss": 0.0081, + "step": 3809 + }, + { + "epoch": 15.24, + "grad_norm": 0.1615634709596634, + "learning_rate": 1.1943887775551104e-05, + "loss": 0.0078, + "step": 3810 + }, + { + "epoch": 15.244, + "grad_norm": 0.1142813116312027, + "learning_rate": 1.1933867735470943e-05, + "loss": 0.0042, + "step": 3811 + }, + { + "epoch": 15.248, + "grad_norm": 0.20265451073646545, + "learning_rate": 1.1923847695390781e-05, + "loss": 0.0075, + "step": 3812 + }, + { + "epoch": 15.252, + "grad_norm": 0.17693500220775604, + "learning_rate": 1.1913827655310622e-05, + "loss": 0.0091, + "step": 3813 + }, + { + "epoch": 15.256, + "grad_norm": 0.16070616245269775, + "learning_rate": 1.190380761523046e-05, + "loss": 0.0084, + "step": 3814 + }, + { + "epoch": 15.26, + "grad_norm": 0.2477542757987976, + "learning_rate": 1.1893787575150302e-05, + "loss": 0.0087, + "step": 3815 + }, + { + "epoch": 15.264, + "grad_norm": 0.2083113044500351, + "learning_rate": 1.1883767535070141e-05, + "loss": 0.0086, + "step": 3816 + }, + { + "epoch": 15.268, + "grad_norm": 0.20015212893486023, + "learning_rate": 1.1873747494989979e-05, + "loss": 0.0087, + "step": 3817 + }, + { + "epoch": 15.272, + "grad_norm": 0.1395539939403534, + "learning_rate": 1.186372745490982e-05, + "loss": 0.0073, + "step": 3818 + }, + { + "epoch": 15.276, + "grad_norm": 0.1408277451992035, + "learning_rate": 1.185370741482966e-05, + "loss": 0.0081, + "step": 3819 + }, + { + "epoch": 15.28, + "grad_norm": 0.20688875019550323, + "learning_rate": 1.18436873747495e-05, + "loss": 0.0082, + "step": 3820 + }, + { + "epoch": 15.284, + "grad_norm": 0.21537630259990692, + "learning_rate": 1.183366733466934e-05, + "loss": 0.0085, + "step": 3821 + }, + { + "epoch": 15.288, + "grad_norm": 0.22216440737247467, + "learning_rate": 1.1823647294589179e-05, + "loss": 0.0074, + "step": 3822 + }, + { + "epoch": 15.292, + "grad_norm": 0.21135936677455902, + "learning_rate": 1.1813627254509018e-05, + "loss": 0.0091, + "step": 3823 + }, + { + "epoch": 
15.296, + "grad_norm": 0.23614716529846191, + "learning_rate": 1.1803607214428858e-05, + "loss": 0.0082, + "step": 3824 + }, + { + "epoch": 15.3, + "grad_norm": 0.16992735862731934, + "learning_rate": 1.1793587174348698e-05, + "loss": 0.0074, + "step": 3825 + }, + { + "epoch": 15.304, + "grad_norm": 0.21491000056266785, + "learning_rate": 1.1783567134268537e-05, + "loss": 0.0073, + "step": 3826 + }, + { + "epoch": 15.308, + "grad_norm": 0.16719304025173187, + "learning_rate": 1.1773547094188378e-05, + "loss": 0.0069, + "step": 3827 + }, + { + "epoch": 15.312, + "grad_norm": 0.2545108497142792, + "learning_rate": 1.1763527054108216e-05, + "loss": 0.0094, + "step": 3828 + }, + { + "epoch": 15.316, + "grad_norm": 0.16995342075824738, + "learning_rate": 1.1753507014028056e-05, + "loss": 0.0072, + "step": 3829 + }, + { + "epoch": 15.32, + "grad_norm": 0.16017405688762665, + "learning_rate": 1.1743486973947897e-05, + "loss": 0.007, + "step": 3830 + }, + { + "epoch": 15.324, + "grad_norm": 0.2081645429134369, + "learning_rate": 1.1733466933867735e-05, + "loss": 0.009, + "step": 3831 + }, + { + "epoch": 15.328, + "grad_norm": 0.1570480614900589, + "learning_rate": 1.1723446893787576e-05, + "loss": 0.0075, + "step": 3832 + }, + { + "epoch": 15.332, + "grad_norm": 0.23571652173995972, + "learning_rate": 1.1713426853707416e-05, + "loss": 0.0095, + "step": 3833 + }, + { + "epoch": 15.336, + "grad_norm": 0.2799575626850128, + "learning_rate": 1.1703406813627254e-05, + "loss": 0.0107, + "step": 3834 + }, + { + "epoch": 15.34, + "grad_norm": 0.22484329342842102, + "learning_rate": 1.1693386773547095e-05, + "loss": 0.0087, + "step": 3835 + }, + { + "epoch": 15.344, + "grad_norm": 0.2471844106912613, + "learning_rate": 1.1683366733466935e-05, + "loss": 0.009, + "step": 3836 + }, + { + "epoch": 15.348, + "grad_norm": 0.21544387936592102, + "learning_rate": 1.1673346693386774e-05, + "loss": 0.0083, + "step": 3837 + }, + { + "epoch": 15.352, + "grad_norm": 0.21488280594348907, + 
"learning_rate": 1.1663326653306614e-05, + "loss": 0.0088, + "step": 3838 + }, + { + "epoch": 15.356, + "grad_norm": 0.12035142630338669, + "learning_rate": 1.1653306613226453e-05, + "loss": 0.0068, + "step": 3839 + }, + { + "epoch": 15.36, + "grad_norm": 0.20783907175064087, + "learning_rate": 1.1643286573146293e-05, + "loss": 0.0079, + "step": 3840 + }, + { + "epoch": 15.364, + "grad_norm": 0.15776130557060242, + "learning_rate": 1.1633266533066133e-05, + "loss": 0.0078, + "step": 3841 + }, + { + "epoch": 15.368, + "grad_norm": 0.1693926751613617, + "learning_rate": 1.1623246492985972e-05, + "loss": 0.0085, + "step": 3842 + }, + { + "epoch": 15.372, + "grad_norm": 0.20709924399852753, + "learning_rate": 1.1613226452905812e-05, + "loss": 0.0088, + "step": 3843 + }, + { + "epoch": 15.376, + "grad_norm": 0.33956825733184814, + "learning_rate": 1.1603206412825651e-05, + "loss": 0.0091, + "step": 3844 + }, + { + "epoch": 15.38, + "grad_norm": 0.20944558084011078, + "learning_rate": 1.1593186372745491e-05, + "loss": 0.0087, + "step": 3845 + }, + { + "epoch": 15.384, + "grad_norm": 0.16297529637813568, + "learning_rate": 1.158316633266533e-05, + "loss": 0.0078, + "step": 3846 + }, + { + "epoch": 15.388, + "grad_norm": 0.2716394364833832, + "learning_rate": 1.1573146292585172e-05, + "loss": 0.0089, + "step": 3847 + }, + { + "epoch": 15.392, + "grad_norm": 0.15041834115982056, + "learning_rate": 1.156312625250501e-05, + "loss": 0.0076, + "step": 3848 + }, + { + "epoch": 15.396, + "grad_norm": 0.16877737641334534, + "learning_rate": 1.155310621242485e-05, + "loss": 0.0074, + "step": 3849 + }, + { + "epoch": 15.4, + "grad_norm": 0.2795097231864929, + "learning_rate": 1.154308617234469e-05, + "loss": 0.0096, + "step": 3850 + }, + { + "epoch": 15.404, + "grad_norm": 0.13509052991867065, + "learning_rate": 1.1533066132264529e-05, + "loss": 0.0078, + "step": 3851 + }, + { + "epoch": 15.408, + "grad_norm": 0.2292792648077011, + "learning_rate": 1.152304609218437e-05, + "loss": 
0.0084, + "step": 3852 + }, + { + "epoch": 15.412, + "grad_norm": 0.19881927967071533, + "learning_rate": 1.151302605210421e-05, + "loss": 0.0076, + "step": 3853 + }, + { + "epoch": 15.416, + "grad_norm": 0.2702074646949768, + "learning_rate": 1.1503006012024049e-05, + "loss": 0.0095, + "step": 3854 + }, + { + "epoch": 15.42, + "grad_norm": 0.2394787222146988, + "learning_rate": 1.1492985971943889e-05, + "loss": 0.0099, + "step": 3855 + }, + { + "epoch": 15.424, + "grad_norm": 0.13445042073726654, + "learning_rate": 1.1482965931863728e-05, + "loss": 0.0073, + "step": 3856 + }, + { + "epoch": 15.428, + "grad_norm": 0.2362317591905594, + "learning_rate": 1.1472945891783568e-05, + "loss": 0.0074, + "step": 3857 + }, + { + "epoch": 15.432, + "grad_norm": 0.14894196391105652, + "learning_rate": 1.1462925851703407e-05, + "loss": 0.0086, + "step": 3858 + }, + { + "epoch": 15.436, + "grad_norm": 0.17250634729862213, + "learning_rate": 1.1452905811623247e-05, + "loss": 0.009, + "step": 3859 + }, + { + "epoch": 15.44, + "grad_norm": 0.197704017162323, + "learning_rate": 1.1442885771543087e-05, + "loss": 0.0094, + "step": 3860 + }, + { + "epoch": 15.444, + "grad_norm": 0.23201380670070648, + "learning_rate": 1.1432865731462926e-05, + "loss": 0.009, + "step": 3861 + }, + { + "epoch": 15.448, + "grad_norm": 0.26006773114204407, + "learning_rate": 1.1422845691382766e-05, + "loss": 0.009, + "step": 3862 + }, + { + "epoch": 15.452, + "grad_norm": 0.21038317680358887, + "learning_rate": 1.1412825651302605e-05, + "loss": 0.0085, + "step": 3863 + }, + { + "epoch": 15.456, + "grad_norm": 0.2623082399368286, + "learning_rate": 1.1402805611222447e-05, + "loss": 0.0097, + "step": 3864 + }, + { + "epoch": 15.46, + "grad_norm": 0.17351366579532623, + "learning_rate": 1.1392785571142284e-05, + "loss": 0.0081, + "step": 3865 + }, + { + "epoch": 15.464, + "grad_norm": 0.1778712272644043, + "learning_rate": 1.1382765531062124e-05, + "loss": 0.0085, + "step": 3866 + }, + { + "epoch": 15.468, + 
"grad_norm": 0.16661031544208527, + "learning_rate": 1.1372745490981965e-05, + "loss": 0.008, + "step": 3867 + }, + { + "epoch": 15.472, + "grad_norm": 0.1771150380373001, + "learning_rate": 1.1362725450901803e-05, + "loss": 0.0084, + "step": 3868 + }, + { + "epoch": 15.475999999999999, + "grad_norm": 0.2267809361219406, + "learning_rate": 1.1352705410821645e-05, + "loss": 0.0086, + "step": 3869 + }, + { + "epoch": 15.48, + "grad_norm": 0.26965567469596863, + "learning_rate": 1.1342685370741484e-05, + "loss": 0.0091, + "step": 3870 + }, + { + "epoch": 15.484, + "grad_norm": 0.1582583636045456, + "learning_rate": 1.1332665330661322e-05, + "loss": 0.0056, + "step": 3871 + }, + { + "epoch": 15.488, + "grad_norm": 0.24081505835056305, + "learning_rate": 1.1322645290581163e-05, + "loss": 0.0089, + "step": 3872 + }, + { + "epoch": 15.492, + "grad_norm": 0.12798239290714264, + "learning_rate": 1.1312625250501003e-05, + "loss": 0.0049, + "step": 3873 + }, + { + "epoch": 15.496, + "grad_norm": 0.14275750517845154, + "learning_rate": 1.1302605210420842e-05, + "loss": 0.0083, + "step": 3874 + }, + { + "epoch": 15.5, + "grad_norm": 0.22652091085910797, + "learning_rate": 1.1292585170340682e-05, + "loss": 0.0089, + "step": 3875 + }, + { + "epoch": 15.504, + "grad_norm": 0.18880586326122284, + "learning_rate": 1.1282565130260522e-05, + "loss": 0.0083, + "step": 3876 + }, + { + "epoch": 15.508, + "grad_norm": 0.23349466919898987, + "learning_rate": 1.1272545090180361e-05, + "loss": 0.0088, + "step": 3877 + }, + { + "epoch": 15.512, + "grad_norm": 0.2626950740814209, + "learning_rate": 1.12625250501002e-05, + "loss": 0.0096, + "step": 3878 + }, + { + "epoch": 15.516, + "grad_norm": 0.1874002069234848, + "learning_rate": 1.125250501002004e-05, + "loss": 0.0085, + "step": 3879 + }, + { + "epoch": 15.52, + "grad_norm": 0.2023652046918869, + "learning_rate": 1.124248496993988e-05, + "loss": 0.0085, + "step": 3880 + }, + { + "epoch": 15.524000000000001, + "grad_norm": 
0.3214608132839203, + "learning_rate": 1.1232464929859721e-05, + "loss": 0.0097, + "step": 3881 + }, + { + "epoch": 15.528, + "grad_norm": 0.13317032158374786, + "learning_rate": 1.122244488977956e-05, + "loss": 0.0071, + "step": 3882 + }, + { + "epoch": 15.532, + "grad_norm": 0.16016507148742676, + "learning_rate": 1.1212424849699399e-05, + "loss": 0.0081, + "step": 3883 + }, + { + "epoch": 15.536, + "grad_norm": 0.2174006998538971, + "learning_rate": 1.120240480961924e-05, + "loss": 0.008, + "step": 3884 + }, + { + "epoch": 15.54, + "grad_norm": 0.28932812809944153, + "learning_rate": 1.1192384769539078e-05, + "loss": 0.0095, + "step": 3885 + }, + { + "epoch": 15.544, + "grad_norm": 0.20081889629364014, + "learning_rate": 1.118236472945892e-05, + "loss": 0.0083, + "step": 3886 + }, + { + "epoch": 15.548, + "grad_norm": 0.2505418658256531, + "learning_rate": 1.1172344689378759e-05, + "loss": 0.0095, + "step": 3887 + }, + { + "epoch": 15.552, + "grad_norm": 0.19087974727153778, + "learning_rate": 1.1162324649298597e-05, + "loss": 0.0074, + "step": 3888 + }, + { + "epoch": 15.556000000000001, + "grad_norm": 0.2052469551563263, + "learning_rate": 1.1152304609218438e-05, + "loss": 0.0082, + "step": 3889 + }, + { + "epoch": 15.56, + "grad_norm": 0.24596913158893585, + "learning_rate": 1.1142284569138278e-05, + "loss": 0.0084, + "step": 3890 + }, + { + "epoch": 15.564, + "grad_norm": 0.46544262766838074, + "learning_rate": 1.1132264529058117e-05, + "loss": 0.0084, + "step": 3891 + }, + { + "epoch": 15.568, + "grad_norm": 0.20327705144882202, + "learning_rate": 1.1122244488977957e-05, + "loss": 0.0079, + "step": 3892 + }, + { + "epoch": 15.572, + "grad_norm": 0.21792593598365784, + "learning_rate": 1.1112224448897796e-05, + "loss": 0.0094, + "step": 3893 + }, + { + "epoch": 15.576, + "grad_norm": 0.1825850009918213, + "learning_rate": 1.1102204408817636e-05, + "loss": 0.0079, + "step": 3894 + }, + { + "epoch": 15.58, + "grad_norm": 0.18869619071483612, + "learning_rate": 
1.1092184368737475e-05, + "loss": 0.0078, + "step": 3895 + }, + { + "epoch": 15.584, + "grad_norm": 0.26426199078559875, + "learning_rate": 1.1082164328657315e-05, + "loss": 0.0112, + "step": 3896 + }, + { + "epoch": 15.588, + "grad_norm": 0.20740513503551483, + "learning_rate": 1.1072144288577155e-05, + "loss": 0.0094, + "step": 3897 + }, + { + "epoch": 15.592, + "grad_norm": 0.16762283444404602, + "learning_rate": 1.1062124248496994e-05, + "loss": 0.0079, + "step": 3898 + }, + { + "epoch": 15.596, + "grad_norm": 0.28626206517219543, + "learning_rate": 1.1052104208416834e-05, + "loss": 0.0091, + "step": 3899 + }, + { + "epoch": 15.6, + "grad_norm": 0.1966615617275238, + "learning_rate": 1.1042084168336673e-05, + "loss": 0.0076, + "step": 3900 + }, + { + "epoch": 15.604, + "grad_norm": 0.2372523993253708, + "learning_rate": 1.1032064128256515e-05, + "loss": 0.0089, + "step": 3901 + }, + { + "epoch": 15.608, + "grad_norm": 0.11428368836641312, + "learning_rate": 1.1022044088176353e-05, + "loss": 0.0048, + "step": 3902 + }, + { + "epoch": 15.612, + "grad_norm": 0.20153571665287018, + "learning_rate": 1.1012024048096192e-05, + "loss": 0.0087, + "step": 3903 + }, + { + "epoch": 15.616, + "grad_norm": 0.2549794614315033, + "learning_rate": 1.1002004008016033e-05, + "loss": 0.0087, + "step": 3904 + }, + { + "epoch": 15.62, + "grad_norm": 0.11948023736476898, + "learning_rate": 1.0991983967935871e-05, + "loss": 0.0052, + "step": 3905 + }, + { + "epoch": 15.624, + "grad_norm": 0.1888275444507599, + "learning_rate": 1.0981963927855713e-05, + "loss": 0.0085, + "step": 3906 + }, + { + "epoch": 15.628, + "grad_norm": 0.38151490688323975, + "learning_rate": 1.0971943887775552e-05, + "loss": 0.009, + "step": 3907 + }, + { + "epoch": 15.632, + "grad_norm": 0.12330213189125061, + "learning_rate": 1.0961923847695392e-05, + "loss": 0.0047, + "step": 3908 + }, + { + "epoch": 15.636, + "grad_norm": 0.20336763560771942, + "learning_rate": 1.0951903807615231e-05, + "loss": 0.0083, + 
"step": 3909 + }, + { + "epoch": 15.64, + "grad_norm": 0.26230552792549133, + "learning_rate": 1.0941883767535071e-05, + "loss": 0.0104, + "step": 3910 + }, + { + "epoch": 15.644, + "grad_norm": 0.2377796471118927, + "learning_rate": 1.093186372745491e-05, + "loss": 0.0086, + "step": 3911 + }, + { + "epoch": 15.648, + "grad_norm": 0.17421042919158936, + "learning_rate": 1.092184368737475e-05, + "loss": 0.0084, + "step": 3912 + }, + { + "epoch": 15.652, + "grad_norm": 0.21408884227275848, + "learning_rate": 1.091182364729459e-05, + "loss": 0.0089, + "step": 3913 + }, + { + "epoch": 15.656, + "grad_norm": 0.15429489314556122, + "learning_rate": 1.090180360721443e-05, + "loss": 0.0085, + "step": 3914 + }, + { + "epoch": 15.66, + "grad_norm": 0.15924520790576935, + "learning_rate": 1.0891783567134269e-05, + "loss": 0.0055, + "step": 3915 + }, + { + "epoch": 15.664, + "grad_norm": 0.3604001998901367, + "learning_rate": 1.0881763527054109e-05, + "loss": 0.0098, + "step": 3916 + }, + { + "epoch": 15.668, + "grad_norm": 0.1900898814201355, + "learning_rate": 1.0871743486973948e-05, + "loss": 0.0084, + "step": 3917 + }, + { + "epoch": 15.672, + "grad_norm": 0.2091568261384964, + "learning_rate": 1.086172344689379e-05, + "loss": 0.0091, + "step": 3918 + }, + { + "epoch": 15.676, + "grad_norm": 0.32461073994636536, + "learning_rate": 1.0851703406813627e-05, + "loss": 0.0093, + "step": 3919 + }, + { + "epoch": 15.68, + "grad_norm": 0.34765294194221497, + "learning_rate": 1.0841683366733467e-05, + "loss": 0.0097, + "step": 3920 + }, + { + "epoch": 15.684, + "grad_norm": 0.1439502090215683, + "learning_rate": 1.0831663326653308e-05, + "loss": 0.0073, + "step": 3921 + }, + { + "epoch": 15.688, + "grad_norm": 0.11668162047863007, + "learning_rate": 1.0821643286573146e-05, + "loss": 0.0078, + "step": 3922 + }, + { + "epoch": 15.692, + "grad_norm": 0.16795441508293152, + "learning_rate": 1.0811623246492987e-05, + "loss": 0.0083, + "step": 3923 + }, + { + "epoch": 15.696, + 
"grad_norm": 0.32226186990737915, + "learning_rate": 1.0801603206412827e-05, + "loss": 0.0103, + "step": 3924 + }, + { + "epoch": 15.7, + "grad_norm": 0.13408587872982025, + "learning_rate": 1.0791583166332665e-05, + "loss": 0.005, + "step": 3925 + }, + { + "epoch": 15.704, + "grad_norm": 0.12357212603092194, + "learning_rate": 1.0781563126252506e-05, + "loss": 0.0058, + "step": 3926 + }, + { + "epoch": 15.708, + "grad_norm": 0.11612733453512192, + "learning_rate": 1.0771543086172344e-05, + "loss": 0.0045, + "step": 3927 + }, + { + "epoch": 15.712, + "grad_norm": 0.23471392691135406, + "learning_rate": 1.0761523046092185e-05, + "loss": 0.0087, + "step": 3928 + }, + { + "epoch": 15.716, + "grad_norm": 0.1405946910381317, + "learning_rate": 1.0751503006012025e-05, + "loss": 0.008, + "step": 3929 + }, + { + "epoch": 15.72, + "grad_norm": 0.168296217918396, + "learning_rate": 1.0741482965931863e-05, + "loss": 0.0079, + "step": 3930 + }, + { + "epoch": 15.724, + "grad_norm": 0.3210127353668213, + "learning_rate": 1.0731462925851704e-05, + "loss": 0.0106, + "step": 3931 + }, + { + "epoch": 15.728, + "grad_norm": 0.17187538743019104, + "learning_rate": 1.0721442885771544e-05, + "loss": 0.0085, + "step": 3932 + }, + { + "epoch": 15.732, + "grad_norm": 0.15461215376853943, + "learning_rate": 1.0711422845691383e-05, + "loss": 0.009, + "step": 3933 + }, + { + "epoch": 15.736, + "grad_norm": 0.15376779437065125, + "learning_rate": 1.0701402805611223e-05, + "loss": 0.0082, + "step": 3934 + }, + { + "epoch": 15.74, + "grad_norm": 0.3120775818824768, + "learning_rate": 1.0691382765531064e-05, + "loss": 0.0087, + "step": 3935 + }, + { + "epoch": 15.744, + "grad_norm": 0.16850018501281738, + "learning_rate": 1.0681362725450902e-05, + "loss": 0.0085, + "step": 3936 + }, + { + "epoch": 15.748, + "grad_norm": 0.21848592162132263, + "learning_rate": 1.0671342685370742e-05, + "loss": 0.0083, + "step": 3937 + }, + { + "epoch": 15.752, + "grad_norm": 0.16288349032402039, + 
"learning_rate": 1.0661322645290583e-05, + "loss": 0.0073, + "step": 3938 + }, + { + "epoch": 15.756, + "grad_norm": 0.24063876271247864, + "learning_rate": 1.065130260521042e-05, + "loss": 0.008, + "step": 3939 + }, + { + "epoch": 15.76, + "grad_norm": 0.20195181667804718, + "learning_rate": 1.0641282565130262e-05, + "loss": 0.0079, + "step": 3940 + }, + { + "epoch": 15.764, + "grad_norm": 0.16952313482761383, + "learning_rate": 1.06312625250501e-05, + "loss": 0.0082, + "step": 3941 + }, + { + "epoch": 15.768, + "grad_norm": 0.23509082198143005, + "learning_rate": 1.062124248496994e-05, + "loss": 0.009, + "step": 3942 + }, + { + "epoch": 15.772, + "grad_norm": 0.1740461140871048, + "learning_rate": 1.061122244488978e-05, + "loss": 0.0081, + "step": 3943 + }, + { + "epoch": 15.776, + "grad_norm": 0.23396913707256317, + "learning_rate": 1.0601202404809619e-05, + "loss": 0.0093, + "step": 3944 + }, + { + "epoch": 15.78, + "grad_norm": 0.1290149837732315, + "learning_rate": 1.059118236472946e-05, + "loss": 0.0071, + "step": 3945 + }, + { + "epoch": 15.784, + "grad_norm": 0.19442108273506165, + "learning_rate": 1.05811623246493e-05, + "loss": 0.0086, + "step": 3946 + }, + { + "epoch": 15.788, + "grad_norm": 0.24171869456768036, + "learning_rate": 1.0571142284569137e-05, + "loss": 0.0085, + "step": 3947 + }, + { + "epoch": 15.792, + "grad_norm": 0.17903035879135132, + "learning_rate": 1.0561122244488979e-05, + "loss": 0.0082, + "step": 3948 + }, + { + "epoch": 15.796, + "grad_norm": 0.23411180078983307, + "learning_rate": 1.0551102204408818e-05, + "loss": 0.0091, + "step": 3949 + }, + { + "epoch": 15.8, + "grad_norm": 0.6385198831558228, + "learning_rate": 1.0541082164328658e-05, + "loss": 0.0077, + "step": 3950 + }, + { + "epoch": 15.804, + "grad_norm": 0.201161727309227, + "learning_rate": 1.0531062124248498e-05, + "loss": 0.0102, + "step": 3951 + }, + { + "epoch": 15.808, + "grad_norm": 0.17722225189208984, + "learning_rate": 1.0521042084168337e-05, + "loss": 0.0083, 
+ "step": 3952 + }, + { + "epoch": 15.812, + "grad_norm": 0.16090677678585052, + "learning_rate": 1.0511022044088177e-05, + "loss": 0.0084, + "step": 3953 + }, + { + "epoch": 15.816, + "grad_norm": 0.1804235279560089, + "learning_rate": 1.0501002004008016e-05, + "loss": 0.0088, + "step": 3954 + }, + { + "epoch": 15.82, + "grad_norm": 0.18935038149356842, + "learning_rate": 1.0490981963927856e-05, + "loss": 0.0093, + "step": 3955 + }, + { + "epoch": 15.824, + "grad_norm": 0.1825006753206253, + "learning_rate": 1.0480961923847695e-05, + "loss": 0.0093, + "step": 3956 + }, + { + "epoch": 15.828, + "grad_norm": 0.24779827892780304, + "learning_rate": 1.0470941883767535e-05, + "loss": 0.01, + "step": 3957 + }, + { + "epoch": 15.832, + "grad_norm": 0.26754239201545715, + "learning_rate": 1.0460921843687375e-05, + "loss": 0.0092, + "step": 3958 + }, + { + "epoch": 15.836, + "grad_norm": 0.20227602124214172, + "learning_rate": 1.0450901803607214e-05, + "loss": 0.0086, + "step": 3959 + }, + { + "epoch": 15.84, + "grad_norm": 0.28584498167037964, + "learning_rate": 1.0440881763527056e-05, + "loss": 0.0108, + "step": 3960 + }, + { + "epoch": 15.844, + "grad_norm": 0.23494118452072144, + "learning_rate": 1.0430861723446893e-05, + "loss": 0.0086, + "step": 3961 + }, + { + "epoch": 15.848, + "grad_norm": 0.23064729571342468, + "learning_rate": 1.0420841683366733e-05, + "loss": 0.0093, + "step": 3962 + }, + { + "epoch": 15.852, + "grad_norm": 0.3154512643814087, + "learning_rate": 1.0410821643286574e-05, + "loss": 0.0097, + "step": 3963 + }, + { + "epoch": 15.856, + "grad_norm": 0.18013764917850494, + "learning_rate": 1.0400801603206412e-05, + "loss": 0.0083, + "step": 3964 + }, + { + "epoch": 15.86, + "grad_norm": 0.1942710429430008, + "learning_rate": 1.0390781563126253e-05, + "loss": 0.0089, + "step": 3965 + }, + { + "epoch": 15.864, + "grad_norm": 0.24880248308181763, + "learning_rate": 1.0380761523046093e-05, + "loss": 0.0087, + "step": 3966 + }, + { + "epoch": 15.868, + 
"grad_norm": 0.20355558395385742, + "learning_rate": 1.0370741482965933e-05, + "loss": 0.008, + "step": 3967 + }, + { + "epoch": 15.872, + "grad_norm": 0.2536620795726776, + "learning_rate": 1.0360721442885772e-05, + "loss": 0.0085, + "step": 3968 + }, + { + "epoch": 15.876, + "grad_norm": 0.17376026511192322, + "learning_rate": 1.0350701402805612e-05, + "loss": 0.008, + "step": 3969 + }, + { + "epoch": 15.88, + "grad_norm": 0.2233988642692566, + "learning_rate": 1.0340681362725451e-05, + "loss": 0.0091, + "step": 3970 + }, + { + "epoch": 15.884, + "grad_norm": 0.1942223310470581, + "learning_rate": 1.0330661322645291e-05, + "loss": 0.0092, + "step": 3971 + }, + { + "epoch": 15.888, + "grad_norm": 0.1970679759979248, + "learning_rate": 1.032064128256513e-05, + "loss": 0.0083, + "step": 3972 + }, + { + "epoch": 15.892, + "grad_norm": 0.14123409986495972, + "learning_rate": 1.031062124248497e-05, + "loss": 0.0077, + "step": 3973 + }, + { + "epoch": 15.896, + "grad_norm": 0.1591396927833557, + "learning_rate": 1.030060120240481e-05, + "loss": 0.008, + "step": 3974 + }, + { + "epoch": 15.9, + "grad_norm": 0.1460416465997696, + "learning_rate": 1.029058116232465e-05, + "loss": 0.0079, + "step": 3975 + }, + { + "epoch": 15.904, + "grad_norm": 0.1960773915052414, + "learning_rate": 1.0280561122244489e-05, + "loss": 0.0085, + "step": 3976 + }, + { + "epoch": 15.908, + "grad_norm": 0.16314558684825897, + "learning_rate": 1.027054108216433e-05, + "loss": 0.0077, + "step": 3977 + }, + { + "epoch": 15.912, + "grad_norm": 0.3076308071613312, + "learning_rate": 1.0260521042084168e-05, + "loss": 0.0098, + "step": 3978 + }, + { + "epoch": 15.916, + "grad_norm": 0.8374438285827637, + "learning_rate": 1.0250501002004008e-05, + "loss": 0.0114, + "step": 3979 + }, + { + "epoch": 15.92, + "grad_norm": 0.31852617859840393, + "learning_rate": 1.0240480961923849e-05, + "loss": 0.0107, + "step": 3980 + }, + { + "epoch": 15.924, + "grad_norm": 0.19266337156295776, + "learning_rate": 
1.0230460921843687e-05, + "loss": 0.0084, + "step": 3981 + }, + { + "epoch": 15.928, + "grad_norm": 0.20687268674373627, + "learning_rate": 1.0220440881763528e-05, + "loss": 0.0089, + "step": 3982 + }, + { + "epoch": 15.932, + "grad_norm": 0.17380601167678833, + "learning_rate": 1.0210420841683368e-05, + "loss": 0.0083, + "step": 3983 + }, + { + "epoch": 15.936, + "grad_norm": 0.2257053554058075, + "learning_rate": 1.0200400801603206e-05, + "loss": 0.0084, + "step": 3984 + }, + { + "epoch": 15.94, + "grad_norm": 0.23071831464767456, + "learning_rate": 1.0190380761523047e-05, + "loss": 0.0102, + "step": 3985 + }, + { + "epoch": 15.943999999999999, + "grad_norm": 0.21640700101852417, + "learning_rate": 1.0180360721442887e-05, + "loss": 0.0085, + "step": 3986 + }, + { + "epoch": 15.948, + "grad_norm": 0.17279183864593506, + "learning_rate": 1.0170340681362726e-05, + "loss": 0.0082, + "step": 3987 + }, + { + "epoch": 15.952, + "grad_norm": 0.15583649277687073, + "learning_rate": 1.0160320641282566e-05, + "loss": 0.0082, + "step": 3988 + }, + { + "epoch": 15.956, + "grad_norm": 0.30920568108558655, + "learning_rate": 1.0150300601202405e-05, + "loss": 0.0101, + "step": 3989 + }, + { + "epoch": 15.96, + "grad_norm": 0.23646380007266998, + "learning_rate": 1.0140280561122245e-05, + "loss": 0.0092, + "step": 3990 + }, + { + "epoch": 15.964, + "grad_norm": 0.16509555280208588, + "learning_rate": 1.0130260521042084e-05, + "loss": 0.0079, + "step": 3991 + }, + { + "epoch": 15.968, + "grad_norm": 0.23459596931934357, + "learning_rate": 1.0120240480961924e-05, + "loss": 0.0088, + "step": 3992 + }, + { + "epoch": 15.972, + "grad_norm": 0.1768113523721695, + "learning_rate": 1.0110220440881764e-05, + "loss": 0.0088, + "step": 3993 + }, + { + "epoch": 15.975999999999999, + "grad_norm": 0.22669987380504608, + "learning_rate": 1.0100200400801605e-05, + "loss": 0.0087, + "step": 3994 + }, + { + "epoch": 15.98, + "grad_norm": 0.1484929621219635, + "learning_rate": 
1.0090180360721443e-05, + "loss": 0.0078, + "step": 3995 + }, + { + "epoch": 15.984, + "grad_norm": 0.16619373857975006, + "learning_rate": 1.0080160320641282e-05, + "loss": 0.0083, + "step": 3996 + }, + { + "epoch": 15.988, + "grad_norm": 0.14768655598163605, + "learning_rate": 1.0070140280561124e-05, + "loss": 0.007, + "step": 3997 + }, + { + "epoch": 15.992, + "grad_norm": 0.169325053691864, + "learning_rate": 1.0060120240480962e-05, + "loss": 0.0081, + "step": 3998 + }, + { + "epoch": 15.996, + "grad_norm": 0.22410063445568085, + "learning_rate": 1.0050100200400803e-05, + "loss": 0.0086, + "step": 3999 + }, + { + "epoch": 16.0, + "grad_norm": 0.25347474217414856, + "learning_rate": 1.0040080160320642e-05, + "loss": 0.0092, + "step": 4000 + }, + { + "epoch": 16.004, + "grad_norm": 0.1537349820137024, + "learning_rate": 1.003006012024048e-05, + "loss": 0.0079, + "step": 4001 + }, + { + "epoch": 16.008, + "grad_norm": 0.16942495107650757, + "learning_rate": 1.0020040080160322e-05, + "loss": 0.0078, + "step": 4002 + }, + { + "epoch": 16.012, + "grad_norm": 0.15320508182048798, + "learning_rate": 1.0010020040080161e-05, + "loss": 0.0078, + "step": 4003 + }, + { + "epoch": 16.016, + "grad_norm": 0.2386036515235901, + "learning_rate": 1e-05, + "loss": 0.0079, + "step": 4004 + }, + { + "epoch": 16.02, + "grad_norm": 0.14939050376415253, + "learning_rate": 9.98997995991984e-06, + "loss": 0.008, + "step": 4005 + }, + { + "epoch": 16.024, + "grad_norm": 0.14771635830402374, + "learning_rate": 9.97995991983968e-06, + "loss": 0.0074, + "step": 4006 + }, + { + "epoch": 16.028, + "grad_norm": 0.12847012281417847, + "learning_rate": 9.96993987975952e-06, + "loss": 0.0073, + "step": 4007 + }, + { + "epoch": 16.032, + "grad_norm": 0.2023051530122757, + "learning_rate": 9.95991983967936e-06, + "loss": 0.0075, + "step": 4008 + }, + { + "epoch": 16.036, + "grad_norm": 0.1175389364361763, + "learning_rate": 9.949899799599199e-06, + "loss": 0.006, + "step": 4009 + }, + { + "epoch": 
16.04, + "grad_norm": 0.2609016001224518, + "learning_rate": 9.939879759519038e-06, + "loss": 0.0071, + "step": 4010 + }, + { + "epoch": 16.044, + "grad_norm": 0.1463293582201004, + "learning_rate": 9.929859719438878e-06, + "loss": 0.0071, + "step": 4011 + }, + { + "epoch": 16.048, + "grad_norm": 0.15348109602928162, + "learning_rate": 9.919839679358718e-06, + "loss": 0.0076, + "step": 4012 + }, + { + "epoch": 16.052, + "grad_norm": 0.15261180698871613, + "learning_rate": 9.909819639278557e-06, + "loss": 0.0072, + "step": 4013 + }, + { + "epoch": 16.056, + "grad_norm": 0.21835850179195404, + "learning_rate": 9.899799599198398e-06, + "loss": 0.007, + "step": 4014 + }, + { + "epoch": 16.06, + "grad_norm": 0.1864071786403656, + "learning_rate": 9.889779559118236e-06, + "loss": 0.0075, + "step": 4015 + }, + { + "epoch": 16.064, + "grad_norm": 0.23007088899612427, + "learning_rate": 9.879759519038076e-06, + "loss": 0.0098, + "step": 4016 + }, + { + "epoch": 16.068, + "grad_norm": 0.12705695629119873, + "learning_rate": 9.869739478957917e-06, + "loss": 0.0073, + "step": 4017 + }, + { + "epoch": 16.072, + "grad_norm": 0.16708585619926453, + "learning_rate": 9.859719438877755e-06, + "loss": 0.0071, + "step": 4018 + }, + { + "epoch": 16.076, + "grad_norm": 0.1379920393228531, + "learning_rate": 9.849699398797596e-06, + "loss": 0.0075, + "step": 4019 + }, + { + "epoch": 16.08, + "grad_norm": 0.16059952974319458, + "learning_rate": 9.839679358717436e-06, + "loss": 0.0077, + "step": 4020 + }, + { + "epoch": 16.084, + "grad_norm": 0.14642521739006042, + "learning_rate": 9.829659318637276e-06, + "loss": 0.0081, + "step": 4021 + }, + { + "epoch": 16.088, + "grad_norm": 0.15344499051570892, + "learning_rate": 9.819639278557115e-06, + "loss": 0.0079, + "step": 4022 + }, + { + "epoch": 16.092, + "grad_norm": 0.17185145616531372, + "learning_rate": 9.809619238476955e-06, + "loss": 0.0071, + "step": 4023 + }, + { + "epoch": 16.096, + "grad_norm": 0.14292530715465546, + 
"learning_rate": 9.799599198396794e-06, + "loss": 0.0074, + "step": 4024 + }, + { + "epoch": 16.1, + "grad_norm": 0.15338358283042908, + "learning_rate": 9.789579158316634e-06, + "loss": 0.007, + "step": 4025 + }, + { + "epoch": 16.104, + "grad_norm": 0.1363472044467926, + "learning_rate": 9.779559118236473e-06, + "loss": 0.0065, + "step": 4026 + }, + { + "epoch": 16.108, + "grad_norm": 0.18110625445842743, + "learning_rate": 9.769539078156313e-06, + "loss": 0.0075, + "step": 4027 + }, + { + "epoch": 16.112, + "grad_norm": 0.24270915985107422, + "learning_rate": 9.759519038076153e-06, + "loss": 0.0073, + "step": 4028 + }, + { + "epoch": 16.116, + "grad_norm": 0.11325805634260178, + "learning_rate": 9.749498997995992e-06, + "loss": 0.0045, + "step": 4029 + }, + { + "epoch": 16.12, + "grad_norm": 0.6353485584259033, + "learning_rate": 9.739478957915832e-06, + "loss": 0.0083, + "step": 4030 + }, + { + "epoch": 16.124, + "grad_norm": 0.20861081779003143, + "learning_rate": 9.729458917835673e-06, + "loss": 0.008, + "step": 4031 + }, + { + "epoch": 16.128, + "grad_norm": 0.1678023636341095, + "learning_rate": 9.719438877755511e-06, + "loss": 0.0075, + "step": 4032 + }, + { + "epoch": 16.132, + "grad_norm": 0.22754140198230743, + "learning_rate": 9.70941883767535e-06, + "loss": 0.0077, + "step": 4033 + }, + { + "epoch": 16.136, + "grad_norm": 0.16630183160305023, + "learning_rate": 9.699398797595192e-06, + "loss": 0.0082, + "step": 4034 + }, + { + "epoch": 16.14, + "grad_norm": 0.1556076854467392, + "learning_rate": 9.68937875751503e-06, + "loss": 0.0075, + "step": 4035 + }, + { + "epoch": 16.144, + "grad_norm": 0.12683729827404022, + "learning_rate": 9.679358717434871e-06, + "loss": 0.0068, + "step": 4036 + }, + { + "epoch": 16.148, + "grad_norm": 0.17109414935112, + "learning_rate": 9.66933867735471e-06, + "loss": 0.0085, + "step": 4037 + }, + { + "epoch": 16.152, + "grad_norm": 0.20201754570007324, + "learning_rate": 9.659318637274549e-06, + "loss": 0.0071, + "step": 
4038 + }, + { + "epoch": 16.156, + "grad_norm": 0.30220654606819153, + "learning_rate": 9.64929859719439e-06, + "loss": 0.0078, + "step": 4039 + }, + { + "epoch": 16.16, + "grad_norm": 0.17886076867580414, + "learning_rate": 9.63927855711423e-06, + "loss": 0.0081, + "step": 4040 + }, + { + "epoch": 16.164, + "grad_norm": 0.14879950881004333, + "learning_rate": 9.629258517034069e-06, + "loss": 0.0073, + "step": 4041 + }, + { + "epoch": 16.168, + "grad_norm": 0.21824462711811066, + "learning_rate": 9.619238476953909e-06, + "loss": 0.0055, + "step": 4042 + }, + { + "epoch": 16.172, + "grad_norm": 0.2450965940952301, + "learning_rate": 9.609218436873746e-06, + "loss": 0.0088, + "step": 4043 + }, + { + "epoch": 16.176, + "grad_norm": 0.16798727214336395, + "learning_rate": 9.599198396793588e-06, + "loss": 0.008, + "step": 4044 + }, + { + "epoch": 16.18, + "grad_norm": 0.21162693202495575, + "learning_rate": 9.589178356713427e-06, + "loss": 0.0072, + "step": 4045 + }, + { + "epoch": 16.184, + "grad_norm": 0.1760333925485611, + "learning_rate": 9.579158316633267e-06, + "loss": 0.0073, + "step": 4046 + }, + { + "epoch": 16.188, + "grad_norm": 0.2047117054462433, + "learning_rate": 9.569138276553107e-06, + "loss": 0.008, + "step": 4047 + }, + { + "epoch": 16.192, + "grad_norm": 0.18758143484592438, + "learning_rate": 9.559118236472948e-06, + "loss": 0.0069, + "step": 4048 + }, + { + "epoch": 16.196, + "grad_norm": 0.2109622359275818, + "learning_rate": 9.549098196392786e-06, + "loss": 0.0079, + "step": 4049 + }, + { + "epoch": 16.2, + "grad_norm": 0.16541019082069397, + "learning_rate": 9.539078156312625e-06, + "loss": 0.0065, + "step": 4050 + }, + { + "epoch": 16.204, + "grad_norm": 0.12194262444972992, + "learning_rate": 9.529058116232467e-06, + "loss": 0.0067, + "step": 4051 + }, + { + "epoch": 16.208, + "grad_norm": 0.5753403306007385, + "learning_rate": 9.519038076152304e-06, + "loss": 0.0117, + "step": 4052 + }, + { + "epoch": 16.212, + "grad_norm": 
0.2605521082878113, + "learning_rate": 9.509018036072146e-06, + "loss": 0.0094, + "step": 4053 + }, + { + "epoch": 16.216, + "grad_norm": 0.2662070393562317, + "learning_rate": 9.498997995991984e-06, + "loss": 0.0089, + "step": 4054 + }, + { + "epoch": 16.22, + "grad_norm": 0.17253991961479187, + "learning_rate": 9.488977955911823e-06, + "loss": 0.0064, + "step": 4055 + }, + { + "epoch": 16.224, + "grad_norm": 0.1860446333885193, + "learning_rate": 9.478957915831665e-06, + "loss": 0.0077, + "step": 4056 + }, + { + "epoch": 16.228, + "grad_norm": 0.1807558834552765, + "learning_rate": 9.468937875751502e-06, + "loss": 0.0084, + "step": 4057 + }, + { + "epoch": 16.232, + "grad_norm": 0.2518766224384308, + "learning_rate": 9.458917835671344e-06, + "loss": 0.0085, + "step": 4058 + }, + { + "epoch": 16.236, + "grad_norm": 0.18722839653491974, + "learning_rate": 9.448897795591183e-06, + "loss": 0.0073, + "step": 4059 + }, + { + "epoch": 16.24, + "grad_norm": 0.16822689771652222, + "learning_rate": 9.438877755511021e-06, + "loss": 0.0072, + "step": 4060 + }, + { + "epoch": 16.244, + "grad_norm": 0.26538798213005066, + "learning_rate": 9.428857715430862e-06, + "loss": 0.0082, + "step": 4061 + }, + { + "epoch": 16.248, + "grad_norm": 0.24949723482131958, + "learning_rate": 9.418837675350702e-06, + "loss": 0.009, + "step": 4062 + }, + { + "epoch": 16.252, + "grad_norm": 0.1420658975839615, + "learning_rate": 9.408817635270542e-06, + "loss": 0.0066, + "step": 4063 + }, + { + "epoch": 16.256, + "grad_norm": 0.1801360845565796, + "learning_rate": 9.398797595190381e-06, + "loss": 0.0076, + "step": 4064 + }, + { + "epoch": 16.26, + "grad_norm": 0.16072385013103485, + "learning_rate": 9.38877755511022e-06, + "loss": 0.0074, + "step": 4065 + }, + { + "epoch": 16.264, + "grad_norm": 0.1723981499671936, + "learning_rate": 9.37875751503006e-06, + "loss": 0.0079, + "step": 4066 + }, + { + "epoch": 16.268, + "grad_norm": 0.20122207701206207, + "learning_rate": 9.3687374749499e-06, + 
"loss": 0.0085, + "step": 4067 + }, + { + "epoch": 16.272, + "grad_norm": 0.16807159781455994, + "learning_rate": 9.35871743486974e-06, + "loss": 0.0081, + "step": 4068 + }, + { + "epoch": 16.276, + "grad_norm": 0.21395786106586456, + "learning_rate": 9.34869739478958e-06, + "loss": 0.0084, + "step": 4069 + }, + { + "epoch": 16.28, + "grad_norm": 0.1668098419904709, + "learning_rate": 9.338677354709419e-06, + "loss": 0.0071, + "step": 4070 + }, + { + "epoch": 16.284, + "grad_norm": 0.1805601865053177, + "learning_rate": 9.328657314629258e-06, + "loss": 0.0071, + "step": 4071 + }, + { + "epoch": 16.288, + "grad_norm": 0.2028336524963379, + "learning_rate": 9.318637274549098e-06, + "loss": 0.0074, + "step": 4072 + }, + { + "epoch": 16.292, + "grad_norm": 0.20740242302417755, + "learning_rate": 9.30861723446894e-06, + "loss": 0.0076, + "step": 4073 + }, + { + "epoch": 16.296, + "grad_norm": 0.21578271687030792, + "learning_rate": 9.298597194388777e-06, + "loss": 0.0082, + "step": 4074 + }, + { + "epoch": 16.3, + "grad_norm": 0.16717369854450226, + "learning_rate": 9.288577154308618e-06, + "loss": 0.0076, + "step": 4075 + }, + { + "epoch": 16.304, + "grad_norm": 0.1616104692220688, + "learning_rate": 9.278557114228458e-06, + "loss": 0.008, + "step": 4076 + }, + { + "epoch": 16.308, + "grad_norm": 0.16851823031902313, + "learning_rate": 9.268537074148296e-06, + "loss": 0.0072, + "step": 4077 + }, + { + "epoch": 16.312, + "grad_norm": 0.18806764483451843, + "learning_rate": 9.258517034068137e-06, + "loss": 0.008, + "step": 4078 + }, + { + "epoch": 16.316, + "grad_norm": 0.21248526871204376, + "learning_rate": 9.248496993987977e-06, + "loss": 0.0076, + "step": 4079 + }, + { + "epoch": 16.32, + "grad_norm": 0.2105952352285385, + "learning_rate": 9.238476953907816e-06, + "loss": 0.009, + "step": 4080 + }, + { + "epoch": 16.324, + "grad_norm": 0.1982223242521286, + "learning_rate": 9.228456913827656e-06, + "loss": 0.0071, + "step": 4081 + }, + { + "epoch": 16.328, + 
"grad_norm": 0.13595516979694366, + "learning_rate": 9.218436873747496e-06, + "loss": 0.0075, + "step": 4082 + }, + { + "epoch": 16.332, + "grad_norm": 0.16603784263134003, + "learning_rate": 9.208416833667335e-06, + "loss": 0.0079, + "step": 4083 + }, + { + "epoch": 16.336, + "grad_norm": 0.12831614911556244, + "learning_rate": 9.198396793587175e-06, + "loss": 0.0063, + "step": 4084 + }, + { + "epoch": 16.34, + "grad_norm": 0.1970067024230957, + "learning_rate": 9.188376753507014e-06, + "loss": 0.0067, + "step": 4085 + }, + { + "epoch": 16.344, + "grad_norm": 0.1089383140206337, + "learning_rate": 9.178356713426854e-06, + "loss": 0.0045, + "step": 4086 + }, + { + "epoch": 16.348, + "grad_norm": 0.2165975272655487, + "learning_rate": 9.168336673346693e-06, + "loss": 0.008, + "step": 4087 + }, + { + "epoch": 16.352, + "grad_norm": 0.1895694136619568, + "learning_rate": 9.158316633266533e-06, + "loss": 0.007, + "step": 4088 + }, + { + "epoch": 16.356, + "grad_norm": 0.2537732720375061, + "learning_rate": 9.148296593186373e-06, + "loss": 0.0093, + "step": 4089 + }, + { + "epoch": 16.36, + "grad_norm": 0.16854915022850037, + "learning_rate": 9.138276553106214e-06, + "loss": 0.0087, + "step": 4090 + }, + { + "epoch": 16.364, + "grad_norm": 0.1767825335264206, + "learning_rate": 9.128256513026052e-06, + "loss": 0.0075, + "step": 4091 + }, + { + "epoch": 16.368, + "grad_norm": 0.19032791256904602, + "learning_rate": 9.118236472945891e-06, + "loss": 0.0078, + "step": 4092 + }, + { + "epoch": 16.372, + "grad_norm": 0.21050649881362915, + "learning_rate": 9.108216432865733e-06, + "loss": 0.0083, + "step": 4093 + }, + { + "epoch": 16.376, + "grad_norm": 0.1695554405450821, + "learning_rate": 9.09819639278557e-06, + "loss": 0.0079, + "step": 4094 + }, + { + "epoch": 16.38, + "grad_norm": 0.15526901185512543, + "learning_rate": 9.088176352705412e-06, + "loss": 0.0084, + "step": 4095 + }, + { + "epoch": 16.384, + "grad_norm": 0.18327704071998596, + "learning_rate": 
9.078156312625251e-06, + "loss": 0.0076, + "step": 4096 + }, + { + "epoch": 16.388, + "grad_norm": 0.1327105313539505, + "learning_rate": 9.06813627254509e-06, + "loss": 0.007, + "step": 4097 + }, + { + "epoch": 16.392, + "grad_norm": 0.17368905246257782, + "learning_rate": 9.05811623246493e-06, + "loss": 0.0078, + "step": 4098 + }, + { + "epoch": 16.396, + "grad_norm": 0.2445645034313202, + "learning_rate": 9.04809619238477e-06, + "loss": 0.0084, + "step": 4099 + }, + { + "epoch": 16.4, + "grad_norm": 0.1971755027770996, + "learning_rate": 9.03807615230461e-06, + "loss": 0.0084, + "step": 4100 + }, + { + "epoch": 16.404, + "grad_norm": 0.22356528043746948, + "learning_rate": 9.02805611222445e-06, + "loss": 0.0085, + "step": 4101 + }, + { + "epoch": 16.408, + "grad_norm": 0.2354351133108139, + "learning_rate": 9.018036072144289e-06, + "loss": 0.0088, + "step": 4102 + }, + { + "epoch": 16.412, + "grad_norm": 0.12401222437620163, + "learning_rate": 9.008016032064129e-06, + "loss": 0.0047, + "step": 4103 + }, + { + "epoch": 16.416, + "grad_norm": 0.10349331051111221, + "learning_rate": 8.997995991983968e-06, + "loss": 0.005, + "step": 4104 + }, + { + "epoch": 16.42, + "grad_norm": 0.16208389401435852, + "learning_rate": 8.987975951903808e-06, + "loss": 0.0084, + "step": 4105 + }, + { + "epoch": 16.424, + "grad_norm": 0.243452250957489, + "learning_rate": 8.977955911823647e-06, + "loss": 0.0094, + "step": 4106 + }, + { + "epoch": 16.428, + "grad_norm": 0.142629936337471, + "learning_rate": 8.967935871743489e-06, + "loss": 0.0075, + "step": 4107 + }, + { + "epoch": 16.432, + "grad_norm": 0.15112538635730743, + "learning_rate": 8.957915831663327e-06, + "loss": 0.0075, + "step": 4108 + }, + { + "epoch": 16.436, + "grad_norm": 0.15232206881046295, + "learning_rate": 8.947895791583166e-06, + "loss": 0.004, + "step": 4109 + }, + { + "epoch": 16.44, + "grad_norm": 0.20458529889583588, + "learning_rate": 8.937875751503007e-06, + "loss": 0.0081, + "step": 4110 + }, + { + 
"epoch": 16.444, + "grad_norm": 0.1483834683895111, + "learning_rate": 8.927855711422845e-06, + "loss": 0.0086, + "step": 4111 + }, + { + "epoch": 16.448, + "grad_norm": 0.13278226554393768, + "learning_rate": 8.917835671342687e-06, + "loss": 0.0067, + "step": 4112 + }, + { + "epoch": 16.452, + "grad_norm": 0.16911619901657104, + "learning_rate": 8.907815631262526e-06, + "loss": 0.0074, + "step": 4113 + }, + { + "epoch": 16.456, + "grad_norm": 0.2102874517440796, + "learning_rate": 8.897795591182364e-06, + "loss": 0.0089, + "step": 4114 + }, + { + "epoch": 16.46, + "grad_norm": 0.2927187383174896, + "learning_rate": 8.887775551102205e-06, + "loss": 0.0083, + "step": 4115 + }, + { + "epoch": 16.464, + "grad_norm": 0.13784025609493256, + "learning_rate": 8.877755511022045e-06, + "loss": 0.0079, + "step": 4116 + }, + { + "epoch": 16.468, + "grad_norm": 0.15802910923957825, + "learning_rate": 8.867735470941884e-06, + "loss": 0.0077, + "step": 4117 + }, + { + "epoch": 16.472, + "grad_norm": 0.15576040744781494, + "learning_rate": 8.857715430861724e-06, + "loss": 0.007, + "step": 4118 + }, + { + "epoch": 16.476, + "grad_norm": 0.1388392150402069, + "learning_rate": 8.847695390781564e-06, + "loss": 0.0073, + "step": 4119 + }, + { + "epoch": 16.48, + "grad_norm": 0.14177972078323364, + "learning_rate": 8.837675350701403e-06, + "loss": 0.0075, + "step": 4120 + }, + { + "epoch": 16.484, + "grad_norm": 0.19317297637462616, + "learning_rate": 8.827655310621243e-06, + "loss": 0.0072, + "step": 4121 + }, + { + "epoch": 16.488, + "grad_norm": 0.15589751303195953, + "learning_rate": 8.817635270541082e-06, + "loss": 0.0079, + "step": 4122 + }, + { + "epoch": 16.492, + "grad_norm": 0.23068901896476746, + "learning_rate": 8.807615230460922e-06, + "loss": 0.0078, + "step": 4123 + }, + { + "epoch": 16.496, + "grad_norm": 0.2934131920337677, + "learning_rate": 8.797595190380762e-06, + "loss": 0.008, + "step": 4124 + }, + { + "epoch": 16.5, + "grad_norm": 0.20778466761112213, + 
"learning_rate": 8.787575150300601e-06, + "loss": 0.0087, + "step": 4125 + }, + { + "epoch": 16.504, + "grad_norm": 0.1556091457605362, + "learning_rate": 8.77755511022044e-06, + "loss": 0.0071, + "step": 4126 + }, + { + "epoch": 16.508, + "grad_norm": 0.1435934156179428, + "learning_rate": 8.767535070140282e-06, + "loss": 0.0078, + "step": 4127 + }, + { + "epoch": 16.512, + "grad_norm": 0.18272621929645538, + "learning_rate": 8.75751503006012e-06, + "loss": 0.0082, + "step": 4128 + }, + { + "epoch": 16.516, + "grad_norm": 0.18199694156646729, + "learning_rate": 8.747494989979961e-06, + "loss": 0.0073, + "step": 4129 + }, + { + "epoch": 16.52, + "grad_norm": 0.18646559119224548, + "learning_rate": 8.7374749498998e-06, + "loss": 0.0071, + "step": 4130 + }, + { + "epoch": 16.524, + "grad_norm": 0.20238129794597626, + "learning_rate": 8.727454909819639e-06, + "loss": 0.0087, + "step": 4131 + }, + { + "epoch": 16.528, + "grad_norm": 0.17325668036937714, + "learning_rate": 8.71743486973948e-06, + "loss": 0.0049, + "step": 4132 + }, + { + "epoch": 16.532, + "grad_norm": 0.21818231046199799, + "learning_rate": 8.70741482965932e-06, + "loss": 0.008, + "step": 4133 + }, + { + "epoch": 16.536, + "grad_norm": 0.12345243245363235, + "learning_rate": 8.69739478957916e-06, + "loss": 0.0066, + "step": 4134 + }, + { + "epoch": 16.54, + "grad_norm": 0.21801672875881195, + "learning_rate": 8.687374749498999e-06, + "loss": 0.009, + "step": 4135 + }, + { + "epoch": 16.544, + "grad_norm": 0.22668403387069702, + "learning_rate": 8.677354709418838e-06, + "loss": 0.0087, + "step": 4136 + }, + { + "epoch": 16.548000000000002, + "grad_norm": 0.11883547157049179, + "learning_rate": 8.667334669338678e-06, + "loss": 0.0069, + "step": 4137 + }, + { + "epoch": 16.552, + "grad_norm": 0.3186958134174347, + "learning_rate": 8.657314629258518e-06, + "loss": 0.0092, + "step": 4138 + }, + { + "epoch": 16.556, + "grad_norm": 0.1984918713569641, + "learning_rate": 8.647294589178357e-06, + "loss": 
0.0085, + "step": 4139 + }, + { + "epoch": 16.56, + "grad_norm": 0.2220780849456787, + "learning_rate": 8.637274549098197e-06, + "loss": 0.0093, + "step": 4140 + }, + { + "epoch": 16.564, + "grad_norm": 0.11941434442996979, + "learning_rate": 8.627254509018036e-06, + "loss": 0.0067, + "step": 4141 + }, + { + "epoch": 16.568, + "grad_norm": 0.1803411990404129, + "learning_rate": 8.617234468937876e-06, + "loss": 0.0081, + "step": 4142 + }, + { + "epoch": 16.572, + "grad_norm": 0.2058972716331482, + "learning_rate": 8.607214428857715e-06, + "loss": 0.0075, + "step": 4143 + }, + { + "epoch": 16.576, + "grad_norm": 0.23917993903160095, + "learning_rate": 8.597194388777557e-06, + "loss": 0.0091, + "step": 4144 + }, + { + "epoch": 16.58, + "grad_norm": 0.1594621241092682, + "learning_rate": 8.587174348697395e-06, + "loss": 0.0082, + "step": 4145 + }, + { + "epoch": 16.584, + "grad_norm": 0.08639764040708542, + "learning_rate": 8.577154308617234e-06, + "loss": 0.004, + "step": 4146 + }, + { + "epoch": 16.588, + "grad_norm": 0.2543061673641205, + "learning_rate": 8.567134268537076e-06, + "loss": 0.009, + "step": 4147 + }, + { + "epoch": 16.592, + "grad_norm": 0.2126331478357315, + "learning_rate": 8.557114228456913e-06, + "loss": 0.0082, + "step": 4148 + }, + { + "epoch": 16.596, + "grad_norm": 0.2556276023387909, + "learning_rate": 8.547094188376755e-06, + "loss": 0.0088, + "step": 4149 + }, + { + "epoch": 16.6, + "grad_norm": 0.14997954666614532, + "learning_rate": 8.537074148296594e-06, + "loss": 0.008, + "step": 4150 + }, + { + "epoch": 16.604, + "grad_norm": 0.18085747957229614, + "learning_rate": 8.527054108216432e-06, + "loss": 0.008, + "step": 4151 + }, + { + "epoch": 16.608, + "grad_norm": 0.22281257808208466, + "learning_rate": 8.517034068136273e-06, + "loss": 0.0076, + "step": 4152 + }, + { + "epoch": 16.612, + "grad_norm": 0.11172794550657272, + "learning_rate": 8.507014028056113e-06, + "loss": 0.0052, + "step": 4153 + }, + { + "epoch": 16.616, + "grad_norm": 
0.2109166383743286, + "learning_rate": 8.496993987975953e-06, + "loss": 0.0081, + "step": 4154 + }, + { + "epoch": 16.62, + "grad_norm": 0.19735772907733917, + "learning_rate": 8.486973947895792e-06, + "loss": 0.0084, + "step": 4155 + }, + { + "epoch": 16.624, + "grad_norm": 0.1670539826154709, + "learning_rate": 8.47695390781563e-06, + "loss": 0.0076, + "step": 4156 + }, + { + "epoch": 16.628, + "grad_norm": 0.26279884576797485, + "learning_rate": 8.466933867735471e-06, + "loss": 0.0094, + "step": 4157 + }, + { + "epoch": 16.632, + "grad_norm": 0.24270443618297577, + "learning_rate": 8.456913827655311e-06, + "loss": 0.0088, + "step": 4158 + }, + { + "epoch": 16.636, + "grad_norm": 0.2702789604663849, + "learning_rate": 8.44689378757515e-06, + "loss": 0.0094, + "step": 4159 + }, + { + "epoch": 16.64, + "grad_norm": 0.13629254698753357, + "learning_rate": 8.43687374749499e-06, + "loss": 0.007, + "step": 4160 + }, + { + "epoch": 16.644, + "grad_norm": 0.1675589680671692, + "learning_rate": 8.426853707414831e-06, + "loss": 0.0093, + "step": 4161 + }, + { + "epoch": 16.648, + "grad_norm": 0.10212656110525131, + "learning_rate": 8.41683366733467e-06, + "loss": 0.0048, + "step": 4162 + }, + { + "epoch": 16.652, + "grad_norm": 0.39032217860221863, + "learning_rate": 8.406813627254509e-06, + "loss": 0.0074, + "step": 4163 + }, + { + "epoch": 16.656, + "grad_norm": 0.20752616226673126, + "learning_rate": 8.39679358717435e-06, + "loss": 0.0095, + "step": 4164 + }, + { + "epoch": 16.66, + "grad_norm": 0.15666551887989044, + "learning_rate": 8.386773547094188e-06, + "loss": 0.0083, + "step": 4165 + }, + { + "epoch": 16.664, + "grad_norm": 0.17233043909072876, + "learning_rate": 8.37675350701403e-06, + "loss": 0.0076, + "step": 4166 + }, + { + "epoch": 16.668, + "grad_norm": 0.17786544561386108, + "learning_rate": 8.366733466933869e-06, + "loss": 0.008, + "step": 4167 + }, + { + "epoch": 16.672, + "grad_norm": 0.14327113330364227, + "learning_rate": 8.356713426853707e-06, + 
"loss": 0.0077, + "step": 4168 + }, + { + "epoch": 16.676, + "grad_norm": 0.16869743168354034, + "learning_rate": 8.346693386773548e-06, + "loss": 0.0074, + "step": 4169 + }, + { + "epoch": 16.68, + "grad_norm": 0.15478970110416412, + "learning_rate": 8.336673346693386e-06, + "loss": 0.0077, + "step": 4170 + }, + { + "epoch": 16.684, + "grad_norm": 0.29708924889564514, + "learning_rate": 8.326653306613227e-06, + "loss": 0.009, + "step": 4171 + }, + { + "epoch": 16.688, + "grad_norm": 0.1700715571641922, + "learning_rate": 8.316633266533067e-06, + "loss": 0.0086, + "step": 4172 + }, + { + "epoch": 16.692, + "grad_norm": 0.16658703982830048, + "learning_rate": 8.306613226452905e-06, + "loss": 0.0086, + "step": 4173 + }, + { + "epoch": 16.696, + "grad_norm": 0.18442490696907043, + "learning_rate": 8.296593186372746e-06, + "loss": 0.0084, + "step": 4174 + }, + { + "epoch": 16.7, + "grad_norm": 0.2534857392311096, + "learning_rate": 8.286573146292586e-06, + "loss": 0.0094, + "step": 4175 + }, + { + "epoch": 16.704, + "grad_norm": 0.193229541182518, + "learning_rate": 8.276553106212425e-06, + "loss": 0.0084, + "step": 4176 + }, + { + "epoch": 16.708, + "grad_norm": 0.14113031327724457, + "learning_rate": 8.266533066132265e-06, + "loss": 0.0072, + "step": 4177 + }, + { + "epoch": 16.712, + "grad_norm": 0.1453218162059784, + "learning_rate": 8.256513026052104e-06, + "loss": 0.0083, + "step": 4178 + }, + { + "epoch": 16.716, + "grad_norm": 0.17128872871398926, + "learning_rate": 8.246492985971944e-06, + "loss": 0.0084, + "step": 4179 + }, + { + "epoch": 16.72, + "grad_norm": 0.17818258702754974, + "learning_rate": 8.236472945891784e-06, + "loss": 0.0067, + "step": 4180 + }, + { + "epoch": 16.724, + "grad_norm": 0.1808307021856308, + "learning_rate": 8.226452905811623e-06, + "loss": 0.0084, + "step": 4181 + }, + { + "epoch": 16.728, + "grad_norm": 0.11610844731330872, + "learning_rate": 8.216432865731463e-06, + "loss": 0.0048, + "step": 4182 + }, + { + "epoch": 16.732, + 
"grad_norm": 0.25724536180496216, + "learning_rate": 8.206412825651302e-06, + "loss": 0.0097, + "step": 4183 + }, + { + "epoch": 16.736, + "grad_norm": 0.26306334137916565, + "learning_rate": 8.196392785571142e-06, + "loss": 0.0093, + "step": 4184 + }, + { + "epoch": 16.74, + "grad_norm": 0.17018313705921173, + "learning_rate": 8.186372745490982e-06, + "loss": 0.0073, + "step": 4185 + }, + { + "epoch": 16.744, + "grad_norm": 0.20504972338676453, + "learning_rate": 8.176352705410823e-06, + "loss": 0.0085, + "step": 4186 + }, + { + "epoch": 16.748, + "grad_norm": 0.19006863236427307, + "learning_rate": 8.16633266533066e-06, + "loss": 0.0079, + "step": 4187 + }, + { + "epoch": 16.752, + "grad_norm": 0.22111523151397705, + "learning_rate": 8.156312625250502e-06, + "loss": 0.009, + "step": 4188 + }, + { + "epoch": 16.756, + "grad_norm": 0.17327627539634705, + "learning_rate": 8.146292585170342e-06, + "loss": 0.0079, + "step": 4189 + }, + { + "epoch": 16.76, + "grad_norm": 0.20672562718391418, + "learning_rate": 8.13627254509018e-06, + "loss": 0.0095, + "step": 4190 + }, + { + "epoch": 16.764, + "grad_norm": 0.3867163062095642, + "learning_rate": 8.12625250501002e-06, + "loss": 0.0091, + "step": 4191 + }, + { + "epoch": 16.768, + "grad_norm": 0.22634045779705048, + "learning_rate": 8.11623246492986e-06, + "loss": 0.0076, + "step": 4192 + }, + { + "epoch": 16.772, + "grad_norm": 0.27298519015312195, + "learning_rate": 8.1062124248497e-06, + "loss": 0.0097, + "step": 4193 + }, + { + "epoch": 16.776, + "grad_norm": 0.14704295992851257, + "learning_rate": 8.09619238476954e-06, + "loss": 0.0082, + "step": 4194 + }, + { + "epoch": 16.78, + "grad_norm": 0.12649138271808624, + "learning_rate": 8.08617234468938e-06, + "loss": 0.007, + "step": 4195 + }, + { + "epoch": 16.784, + "grad_norm": 0.2581845223903656, + "learning_rate": 8.076152304609219e-06, + "loss": 0.0093, + "step": 4196 + }, + { + "epoch": 16.788, + "grad_norm": 0.19466523826122284, + "learning_rate": 
8.066132264529058e-06, + "loss": 0.008, + "step": 4197 + }, + { + "epoch": 16.792, + "grad_norm": 0.2284821718931198, + "learning_rate": 8.056112224448898e-06, + "loss": 0.009, + "step": 4198 + }, + { + "epoch": 16.796, + "grad_norm": 0.18178433179855347, + "learning_rate": 8.046092184368738e-06, + "loss": 0.0096, + "step": 4199 + }, + { + "epoch": 16.8, + "grad_norm": 0.143863707780838, + "learning_rate": 8.036072144288577e-06, + "loss": 0.005, + "step": 4200 + }, + { + "epoch": 16.804, + "grad_norm": 0.14635920524597168, + "learning_rate": 8.026052104208417e-06, + "loss": 0.0085, + "step": 4201 + }, + { + "epoch": 16.808, + "grad_norm": 0.11327727884054184, + "learning_rate": 8.016032064128256e-06, + "loss": 0.0046, + "step": 4202 + }, + { + "epoch": 16.812, + "grad_norm": 0.19041374325752258, + "learning_rate": 8.006012024048098e-06, + "loss": 0.0089, + "step": 4203 + }, + { + "epoch": 16.816, + "grad_norm": 0.13083264231681824, + "learning_rate": 7.995991983967935e-06, + "loss": 0.0072, + "step": 4204 + }, + { + "epoch": 16.82, + "grad_norm": 0.21465055644512177, + "learning_rate": 7.985971943887775e-06, + "loss": 0.0083, + "step": 4205 + }, + { + "epoch": 16.824, + "grad_norm": 0.20523211359977722, + "learning_rate": 7.975951903807616e-06, + "loss": 0.008, + "step": 4206 + }, + { + "epoch": 16.828, + "grad_norm": 0.14747971296310425, + "learning_rate": 7.965931863727454e-06, + "loss": 0.0076, + "step": 4207 + }, + { + "epoch": 16.832, + "grad_norm": 0.15081685781478882, + "learning_rate": 7.955911823647296e-06, + "loss": 0.0072, + "step": 4208 + }, + { + "epoch": 16.836, + "grad_norm": 0.18735671043395996, + "learning_rate": 7.945891783567135e-06, + "loss": 0.0078, + "step": 4209 + }, + { + "epoch": 16.84, + "grad_norm": 0.2500723898410797, + "learning_rate": 7.935871743486973e-06, + "loss": 0.0099, + "step": 4210 + }, + { + "epoch": 16.844, + "grad_norm": 0.44487854838371277, + "learning_rate": 7.925851703406814e-06, + "loss": 0.0097, + "step": 4211 + }, + { 
+ "epoch": 16.848, + "grad_norm": 0.18985402584075928, + "learning_rate": 7.915831663326654e-06, + "loss": 0.0082, + "step": 4212 + }, + { + "epoch": 16.852, + "grad_norm": 0.16304108500480652, + "learning_rate": 7.905811623246493e-06, + "loss": 0.0079, + "step": 4213 + }, + { + "epoch": 16.856, + "grad_norm": 0.14347808063030243, + "learning_rate": 7.895791583166333e-06, + "loss": 0.008, + "step": 4214 + }, + { + "epoch": 16.86, + "grad_norm": 0.18546195328235626, + "learning_rate": 7.885771543086173e-06, + "loss": 0.0085, + "step": 4215 + }, + { + "epoch": 16.864, + "grad_norm": 0.16561359167099, + "learning_rate": 7.875751503006012e-06, + "loss": 0.0084, + "step": 4216 + }, + { + "epoch": 16.868, + "grad_norm": 0.1679631471633911, + "learning_rate": 7.865731462925852e-06, + "loss": 0.0083, + "step": 4217 + }, + { + "epoch": 16.872, + "grad_norm": 0.1606558859348297, + "learning_rate": 7.855711422845691e-06, + "loss": 0.0077, + "step": 4218 + }, + { + "epoch": 16.876, + "grad_norm": 0.20817658305168152, + "learning_rate": 7.845691382765531e-06, + "loss": 0.0089, + "step": 4219 + }, + { + "epoch": 16.88, + "grad_norm": 0.14732685685157776, + "learning_rate": 7.835671342685372e-06, + "loss": 0.0077, + "step": 4220 + }, + { + "epoch": 16.884, + "grad_norm": 0.20664165914058685, + "learning_rate": 7.82565130260521e-06, + "loss": 0.0081, + "step": 4221 + }, + { + "epoch": 16.888, + "grad_norm": 0.20258696377277374, + "learning_rate": 7.81563126252505e-06, + "loss": 0.0093, + "step": 4222 + }, + { + "epoch": 16.892, + "grad_norm": 0.19178465008735657, + "learning_rate": 7.805611222444891e-06, + "loss": 0.0082, + "step": 4223 + }, + { + "epoch": 16.896, + "grad_norm": 0.30760034918785095, + "learning_rate": 7.795591182364729e-06, + "loss": 0.0096, + "step": 4224 + }, + { + "epoch": 16.9, + "grad_norm": 0.17052769660949707, + "learning_rate": 7.78557114228457e-06, + "loss": 0.0086, + "step": 4225 + }, + { + "epoch": 16.904, + "grad_norm": 0.1625903993844986, + 
"learning_rate": 7.77555110220441e-06, + "loss": 0.0082, + "step": 4226 + }, + { + "epoch": 16.908, + "grad_norm": 0.17967870831489563, + "learning_rate": 7.765531062124248e-06, + "loss": 0.0087, + "step": 4227 + }, + { + "epoch": 16.912, + "grad_norm": 0.2507075071334839, + "learning_rate": 7.755511022044089e-06, + "loss": 0.0091, + "step": 4228 + }, + { + "epoch": 16.916, + "grad_norm": 0.33597537875175476, + "learning_rate": 7.745490981963929e-06, + "loss": 0.0086, + "step": 4229 + }, + { + "epoch": 16.92, + "grad_norm": 0.15713933110237122, + "learning_rate": 7.735470941883768e-06, + "loss": 0.0075, + "step": 4230 + }, + { + "epoch": 16.924, + "grad_norm": 0.23833885788917542, + "learning_rate": 7.725450901803608e-06, + "loss": 0.0087, + "step": 4231 + }, + { + "epoch": 16.928, + "grad_norm": 0.18948806822299957, + "learning_rate": 7.715430861723447e-06, + "loss": 0.0083, + "step": 4232 + }, + { + "epoch": 16.932, + "grad_norm": 0.23685553669929504, + "learning_rate": 7.705410821643287e-06, + "loss": 0.0086, + "step": 4233 + }, + { + "epoch": 16.936, + "grad_norm": 0.22197188436985016, + "learning_rate": 7.695390781563127e-06, + "loss": 0.0092, + "step": 4234 + }, + { + "epoch": 16.94, + "grad_norm": 0.14752846956253052, + "learning_rate": 7.685370741482966e-06, + "loss": 0.0075, + "step": 4235 + }, + { + "epoch": 16.944, + "grad_norm": 0.16525785624980927, + "learning_rate": 7.675350701402806e-06, + "loss": 0.0082, + "step": 4236 + }, + { + "epoch": 16.948, + "grad_norm": 0.18567903339862823, + "learning_rate": 7.665330661322645e-06, + "loss": 0.0079, + "step": 4237 + }, + { + "epoch": 16.951999999999998, + "grad_norm": 0.16079868376255035, + "learning_rate": 7.655310621242485e-06, + "loss": 0.0079, + "step": 4238 + }, + { + "epoch": 16.956, + "grad_norm": 0.17427866160869598, + "learning_rate": 7.645290581162324e-06, + "loss": 0.0088, + "step": 4239 + }, + { + "epoch": 16.96, + "grad_norm": 0.18866820633411407, + "learning_rate": 7.635270541082166e-06, + 
"loss": 0.0085, + "step": 4240 + }, + { + "epoch": 16.964, + "grad_norm": 0.15932509303092957, + "learning_rate": 7.6252505010020045e-06, + "loss": 0.0074, + "step": 4241 + }, + { + "epoch": 16.968, + "grad_norm": 0.2091926783323288, + "learning_rate": 7.615230460921845e-06, + "loss": 0.0093, + "step": 4242 + }, + { + "epoch": 16.972, + "grad_norm": 0.2609868049621582, + "learning_rate": 7.605210420841684e-06, + "loss": 0.0103, + "step": 4243 + }, + { + "epoch": 16.976, + "grad_norm": 0.1595424860715866, + "learning_rate": 7.595190380761523e-06, + "loss": 0.0079, + "step": 4244 + }, + { + "epoch": 16.98, + "grad_norm": 0.1985984891653061, + "learning_rate": 7.585170340681364e-06, + "loss": 0.0089, + "step": 4245 + }, + { + "epoch": 16.984, + "grad_norm": 0.17337600886821747, + "learning_rate": 7.5751503006012024e-06, + "loss": 0.0074, + "step": 4246 + }, + { + "epoch": 16.988, + "grad_norm": 0.18920986354351044, + "learning_rate": 7.565130260521043e-06, + "loss": 0.0083, + "step": 4247 + }, + { + "epoch": 16.992, + "grad_norm": 0.17387863993644714, + "learning_rate": 7.5551102204408825e-06, + "loss": 0.0085, + "step": 4248 + }, + { + "epoch": 16.996, + "grad_norm": 0.20932914316654205, + "learning_rate": 7.545090180360721e-06, + "loss": 0.0081, + "step": 4249 + }, + { + "epoch": 17.0, + "grad_norm": 0.18405738472938538, + "learning_rate": 7.535070140280562e-06, + "loss": 0.0078, + "step": 4250 + }, + { + "epoch": 17.004, + "grad_norm": 0.12063789367675781, + "learning_rate": 7.525050100200401e-06, + "loss": 0.0067, + "step": 4251 + }, + { + "epoch": 17.008, + "grad_norm": 0.13446062803268433, + "learning_rate": 7.515030060120242e-06, + "loss": 0.0067, + "step": 4252 + }, + { + "epoch": 17.012, + "grad_norm": 0.13746428489685059, + "learning_rate": 7.50501002004008e-06, + "loss": 0.0073, + "step": 4253 + }, + { + "epoch": 17.016, + "grad_norm": 0.17416371405124664, + "learning_rate": 7.49498997995992e-06, + "loss": 0.0079, + "step": 4254 + }, + { + "epoch": 17.02, + 
"grad_norm": 0.19402289390563965, + "learning_rate": 7.48496993987976e-06, + "loss": 0.0073, + "step": 4255 + }, + { + "epoch": 17.024, + "grad_norm": 0.14378122985363007, + "learning_rate": 7.474949899799599e-06, + "loss": 0.0065, + "step": 4256 + }, + { + "epoch": 17.028, + "grad_norm": 0.14554685354232788, + "learning_rate": 7.46492985971944e-06, + "loss": 0.0069, + "step": 4257 + }, + { + "epoch": 17.032, + "grad_norm": 0.14952172338962555, + "learning_rate": 7.454909819639279e-06, + "loss": 0.0075, + "step": 4258 + }, + { + "epoch": 17.036, + "grad_norm": 0.08101173490285873, + "learning_rate": 7.444889779559118e-06, + "loss": 0.0038, + "step": 4259 + }, + { + "epoch": 17.04, + "grad_norm": 0.13694711029529572, + "learning_rate": 7.434869739478958e-06, + "loss": 0.0065, + "step": 4260 + }, + { + "epoch": 17.044, + "grad_norm": 0.1457206904888153, + "learning_rate": 7.424849699398798e-06, + "loss": 0.0075, + "step": 4261 + }, + { + "epoch": 17.048, + "grad_norm": 0.1786690503358841, + "learning_rate": 7.414829659318638e-06, + "loss": 0.0067, + "step": 4262 + }, + { + "epoch": 17.052, + "grad_norm": 0.16678613424301147, + "learning_rate": 7.404809619238477e-06, + "loss": 0.007, + "step": 4263 + }, + { + "epoch": 17.056, + "grad_norm": 0.12600558996200562, + "learning_rate": 7.394789579158317e-06, + "loss": 0.0074, + "step": 4264 + }, + { + "epoch": 17.06, + "grad_norm": 0.1651485711336136, + "learning_rate": 7.384769539078157e-06, + "loss": 0.0077, + "step": 4265 + }, + { + "epoch": 17.064, + "grad_norm": 0.12365109473466873, + "learning_rate": 7.374749498997996e-06, + "loss": 0.0065, + "step": 4266 + }, + { + "epoch": 17.068, + "grad_norm": 0.16177420318126678, + "learning_rate": 7.364729458917836e-06, + "loss": 0.007, + "step": 4267 + }, + { + "epoch": 17.072, + "grad_norm": 0.1336231380701065, + "learning_rate": 7.354709418837676e-06, + "loss": 0.0067, + "step": 4268 + }, + { + "epoch": 17.076, + "grad_norm": 0.12120088934898376, + "learning_rate": 
7.344689378757516e-06, + "loss": 0.0044, + "step": 4269 + }, + { + "epoch": 17.08, + "grad_norm": 0.1796593815088272, + "learning_rate": 7.334669338677355e-06, + "loss": 0.0076, + "step": 4270 + }, + { + "epoch": 17.084, + "grad_norm": 0.158540278673172, + "learning_rate": 7.324649298597195e-06, + "loss": 0.0069, + "step": 4271 + }, + { + "epoch": 17.088, + "grad_norm": 0.19492676854133606, + "learning_rate": 7.314629258517035e-06, + "loss": 0.0073, + "step": 4272 + }, + { + "epoch": 17.092, + "grad_norm": 0.19501705467700958, + "learning_rate": 7.304609218436874e-06, + "loss": 0.008, + "step": 4273 + }, + { + "epoch": 17.096, + "grad_norm": 0.16361992061138153, + "learning_rate": 7.294589178356714e-06, + "loss": 0.0068, + "step": 4274 + }, + { + "epoch": 17.1, + "grad_norm": 0.14692838490009308, + "learning_rate": 7.284569138276554e-06, + "loss": 0.007, + "step": 4275 + }, + { + "epoch": 17.104, + "grad_norm": 0.1860724836587906, + "learning_rate": 7.274549098196393e-06, + "loss": 0.0079, + "step": 4276 + }, + { + "epoch": 17.108, + "grad_norm": 0.2329706847667694, + "learning_rate": 7.264529058116233e-06, + "loss": 0.0071, + "step": 4277 + }, + { + "epoch": 17.112, + "grad_norm": 0.16117851436138153, + "learning_rate": 7.254509018036073e-06, + "loss": 0.0077, + "step": 4278 + }, + { + "epoch": 17.116, + "grad_norm": 0.15216809511184692, + "learning_rate": 7.244488977955913e-06, + "loss": 0.0071, + "step": 4279 + }, + { + "epoch": 17.12, + "grad_norm": 0.19427397847175598, + "learning_rate": 7.234468937875752e-06, + "loss": 0.0074, + "step": 4280 + }, + { + "epoch": 17.124, + "grad_norm": 0.23492863774299622, + "learning_rate": 7.2244488977955906e-06, + "loss": 0.0078, + "step": 4281 + }, + { + "epoch": 17.128, + "grad_norm": 0.1491718888282776, + "learning_rate": 7.214428857715432e-06, + "loss": 0.0064, + "step": 4282 + }, + { + "epoch": 17.132, + "grad_norm": 0.14911611378192902, + "learning_rate": 7.204408817635271e-06, + "loss": 0.0069, + "step": 4283 + }, + { 
+ "epoch": 17.136, + "grad_norm": 0.17563241720199585, + "learning_rate": 7.194388777555111e-06, + "loss": 0.0076, + "step": 4284 + }, + { + "epoch": 17.14, + "grad_norm": 0.1660158634185791, + "learning_rate": 7.184368737474951e-06, + "loss": 0.007, + "step": 4285 + }, + { + "epoch": 17.144, + "grad_norm": 0.1971948891878128, + "learning_rate": 7.174348697394789e-06, + "loss": 0.0067, + "step": 4286 + }, + { + "epoch": 17.148, + "grad_norm": 0.15479083359241486, + "learning_rate": 7.16432865731463e-06, + "loss": 0.0066, + "step": 4287 + }, + { + "epoch": 17.152, + "grad_norm": 0.20346534252166748, + "learning_rate": 7.1543086172344685e-06, + "loss": 0.0068, + "step": 4288 + }, + { + "epoch": 17.156, + "grad_norm": 0.22112035751342773, + "learning_rate": 7.14428857715431e-06, + "loss": 0.0071, + "step": 4289 + }, + { + "epoch": 17.16, + "grad_norm": 0.17985723912715912, + "learning_rate": 7.1342685370741486e-06, + "loss": 0.0081, + "step": 4290 + }, + { + "epoch": 17.164, + "grad_norm": 0.20944520831108093, + "learning_rate": 7.124248496993987e-06, + "loss": 0.007, + "step": 4291 + }, + { + "epoch": 17.168, + "grad_norm": 0.1437358558177948, + "learning_rate": 7.114228456913829e-06, + "loss": 0.0084, + "step": 4292 + }, + { + "epoch": 17.172, + "grad_norm": 0.14544442296028137, + "learning_rate": 7.104208416833667e-06, + "loss": 0.0047, + "step": 4293 + }, + { + "epoch": 17.176, + "grad_norm": 0.19183100759983063, + "learning_rate": 7.094188376753508e-06, + "loss": 0.0074, + "step": 4294 + }, + { + "epoch": 17.18, + "grad_norm": 0.16251499950885773, + "learning_rate": 7.0841683366733465e-06, + "loss": 0.0068, + "step": 4295 + }, + { + "epoch": 17.184, + "grad_norm": 0.16117817163467407, + "learning_rate": 7.074148296593188e-06, + "loss": 0.0069, + "step": 4296 + }, + { + "epoch": 17.188, + "grad_norm": 0.15662899613380432, + "learning_rate": 7.0641282565130265e-06, + "loss": 0.0076, + "step": 4297 + }, + { + "epoch": 17.192, + "grad_norm": 0.21028119325637817, + 
"learning_rate": 7.054108216432865e-06, + "loss": 0.0078, + "step": 4298 + }, + { + "epoch": 17.196, + "grad_norm": 0.19951249659061432, + "learning_rate": 7.044088176352706e-06, + "loss": 0.0075, + "step": 4299 + }, + { + "epoch": 17.2, + "grad_norm": 0.17216528952121735, + "learning_rate": 7.034068136272545e-06, + "loss": 0.0069, + "step": 4300 + }, + { + "epoch": 17.204, + "grad_norm": 0.14699934422969818, + "learning_rate": 7.024048096192386e-06, + "loss": 0.0045, + "step": 4301 + }, + { + "epoch": 17.208, + "grad_norm": 0.15212714672088623, + "learning_rate": 7.0140280561122245e-06, + "loss": 0.0079, + "step": 4302 + }, + { + "epoch": 17.212, + "grad_norm": 0.1797187179327011, + "learning_rate": 7.004008016032064e-06, + "loss": 0.0071, + "step": 4303 + }, + { + "epoch": 17.216, + "grad_norm": 0.13188792765140533, + "learning_rate": 6.9939879759519045e-06, + "loss": 0.0068, + "step": 4304 + }, + { + "epoch": 17.22, + "grad_norm": 0.1433761864900589, + "learning_rate": 6.983967935871743e-06, + "loss": 0.0067, + "step": 4305 + }, + { + "epoch": 17.224, + "grad_norm": 0.22716593742370605, + "learning_rate": 6.973947895791584e-06, + "loss": 0.0087, + "step": 4306 + }, + { + "epoch": 17.228, + "grad_norm": 0.09545940160751343, + "learning_rate": 6.963927855711423e-06, + "loss": 0.0042, + "step": 4307 + }, + { + "epoch": 17.232, + "grad_norm": 0.15111321210861206, + "learning_rate": 6.953907815631262e-06, + "loss": 0.0072, + "step": 4308 + }, + { + "epoch": 17.236, + "grad_norm": 0.14687678217887878, + "learning_rate": 6.9438877755511024e-06, + "loss": 0.0069, + "step": 4309 + }, + { + "epoch": 17.24, + "grad_norm": 0.148050919175148, + "learning_rate": 6.933867735470942e-06, + "loss": 0.0068, + "step": 4310 + }, + { + "epoch": 17.244, + "grad_norm": 0.12253053486347198, + "learning_rate": 6.9238476953907825e-06, + "loss": 0.0066, + "step": 4311 + }, + { + "epoch": 17.248, + "grad_norm": 0.19216535985469818, + "learning_rate": 6.913827655310621e-06, + "loss": 0.0082, 
+ "step": 4312 + }, + { + "epoch": 17.252, + "grad_norm": 0.22438839077949524, + "learning_rate": 6.903807615230461e-06, + "loss": 0.0075, + "step": 4313 + }, + { + "epoch": 17.256, + "grad_norm": 0.2512076497077942, + "learning_rate": 6.893787575150301e-06, + "loss": 0.0082, + "step": 4314 + }, + { + "epoch": 17.26, + "grad_norm": 0.15971721708774567, + "learning_rate": 6.88376753507014e-06, + "loss": 0.0068, + "step": 4315 + }, + { + "epoch": 17.264, + "grad_norm": 0.14771561324596405, + "learning_rate": 6.87374749498998e-06, + "loss": 0.0078, + "step": 4316 + }, + { + "epoch": 17.268, + "grad_norm": 0.1611126810312271, + "learning_rate": 6.86372745490982e-06, + "loss": 0.0083, + "step": 4317 + }, + { + "epoch": 17.272, + "grad_norm": 0.2246340811252594, + "learning_rate": 6.853707414829659e-06, + "loss": 0.0079, + "step": 4318 + }, + { + "epoch": 17.276, + "grad_norm": 0.13150276243686676, + "learning_rate": 6.843687374749499e-06, + "loss": 0.0048, + "step": 4319 + }, + { + "epoch": 17.28, + "grad_norm": 0.12884242832660675, + "learning_rate": 6.833667334669339e-06, + "loss": 0.0062, + "step": 4320 + }, + { + "epoch": 17.284, + "grad_norm": 0.16102537512779236, + "learning_rate": 6.823647294589179e-06, + "loss": 0.0063, + "step": 4321 + }, + { + "epoch": 17.288, + "grad_norm": 0.21450555324554443, + "learning_rate": 6.813627254509018e-06, + "loss": 0.0075, + "step": 4322 + }, + { + "epoch": 17.292, + "grad_norm": 0.21209336817264557, + "learning_rate": 6.803607214428858e-06, + "loss": 0.0076, + "step": 4323 + }, + { + "epoch": 17.296, + "grad_norm": 0.1917915791273117, + "learning_rate": 6.793587174348698e-06, + "loss": 0.0074, + "step": 4324 + }, + { + "epoch": 17.3, + "grad_norm": 0.14592121541500092, + "learning_rate": 6.783567134268537e-06, + "loss": 0.0071, + "step": 4325 + }, + { + "epoch": 17.304, + "grad_norm": 0.1582149863243103, + "learning_rate": 6.773547094188377e-06, + "loss": 0.007, + "step": 4326 + }, + { + "epoch": 17.308, + "grad_norm": 
0.21334248781204224, + "learning_rate": 6.763527054108217e-06, + "loss": 0.0081, + "step": 4327 + }, + { + "epoch": 17.312, + "grad_norm": 0.15731792151927948, + "learning_rate": 6.753507014028057e-06, + "loss": 0.0068, + "step": 4328 + }, + { + "epoch": 17.316, + "grad_norm": 0.15384608507156372, + "learning_rate": 6.743486973947896e-06, + "loss": 0.0079, + "step": 4329 + }, + { + "epoch": 17.32, + "grad_norm": 0.23401755094528198, + "learning_rate": 6.7334669338677355e-06, + "loss": 0.008, + "step": 4330 + }, + { + "epoch": 17.324, + "grad_norm": 0.16333438456058502, + "learning_rate": 6.723446893787576e-06, + "loss": 0.0076, + "step": 4331 + }, + { + "epoch": 17.328, + "grad_norm": 0.25040316581726074, + "learning_rate": 6.713426853707415e-06, + "loss": 0.0085, + "step": 4332 + }, + { + "epoch": 17.332, + "grad_norm": 0.22485579550266266, + "learning_rate": 6.703406813627255e-06, + "loss": 0.0087, + "step": 4333 + }, + { + "epoch": 17.336, + "grad_norm": 0.16370800137519836, + "learning_rate": 6.693386773547095e-06, + "loss": 0.0076, + "step": 4334 + }, + { + "epoch": 17.34, + "grad_norm": 0.18383802473545074, + "learning_rate": 6.6833667334669334e-06, + "loss": 0.0074, + "step": 4335 + }, + { + "epoch": 17.344, + "grad_norm": 0.13991597294807434, + "learning_rate": 6.673346693386774e-06, + "loss": 0.0071, + "step": 4336 + }, + { + "epoch": 17.348, + "grad_norm": 0.24963268637657166, + "learning_rate": 6.6633266533066135e-06, + "loss": 0.0084, + "step": 4337 + }, + { + "epoch": 17.352, + "grad_norm": 0.22435139119625092, + "learning_rate": 6.653306613226454e-06, + "loss": 0.0079, + "step": 4338 + }, + { + "epoch": 17.356, + "grad_norm": 0.2332351952791214, + "learning_rate": 6.643286573146293e-06, + "loss": 0.0082, + "step": 4339 + }, + { + "epoch": 17.36, + "grad_norm": 0.1963132917881012, + "learning_rate": 6.633266533066132e-06, + "loss": 0.0069, + "step": 4340 + }, + { + "epoch": 17.364, + "grad_norm": 0.21712207794189453, + "learning_rate": 
6.623246492985973e-06, + "loss": 0.0086, + "step": 4341 + }, + { + "epoch": 17.368, + "grad_norm": 0.12130995839834213, + "learning_rate": 6.613226452905811e-06, + "loss": 0.0038, + "step": 4342 + }, + { + "epoch": 17.372, + "grad_norm": 0.2439410537481308, + "learning_rate": 6.603206412825652e-06, + "loss": 0.0081, + "step": 4343 + }, + { + "epoch": 17.376, + "grad_norm": 0.17727676033973694, + "learning_rate": 6.5931863727454914e-06, + "loss": 0.0071, + "step": 4344 + }, + { + "epoch": 17.38, + "grad_norm": 0.15301989018917084, + "learning_rate": 6.58316633266533e-06, + "loss": 0.0065, + "step": 4345 + }, + { + "epoch": 17.384, + "grad_norm": 0.17886261641979218, + "learning_rate": 6.573146292585171e-06, + "loss": 0.0079, + "step": 4346 + }, + { + "epoch": 17.388, + "grad_norm": 0.21048425137996674, + "learning_rate": 6.56312625250501e-06, + "loss": 0.0077, + "step": 4347 + }, + { + "epoch": 17.392, + "grad_norm": 0.23739394545555115, + "learning_rate": 6.553106212424851e-06, + "loss": 0.0077, + "step": 4348 + }, + { + "epoch": 17.396, + "grad_norm": 0.22822335362434387, + "learning_rate": 6.543086172344689e-06, + "loss": 0.0078, + "step": 4349 + }, + { + "epoch": 17.4, + "grad_norm": 0.24672290682792664, + "learning_rate": 6.533066132264529e-06, + "loss": 0.0077, + "step": 4350 + }, + { + "epoch": 17.404, + "grad_norm": 0.22589755058288574, + "learning_rate": 6.523046092184369e-06, + "loss": 0.008, + "step": 4351 + }, + { + "epoch": 17.408, + "grad_norm": 0.1355731040239334, + "learning_rate": 6.513026052104208e-06, + "loss": 0.0076, + "step": 4352 + }, + { + "epoch": 17.412, + "grad_norm": 0.15726511180400848, + "learning_rate": 6.5030060120240486e-06, + "loss": 0.007, + "step": 4353 + }, + { + "epoch": 17.416, + "grad_norm": 0.14194290339946747, + "learning_rate": 6.492985971943888e-06, + "loss": 0.0069, + "step": 4354 + }, + { + "epoch": 17.42, + "grad_norm": 0.15629169344902039, + "learning_rate": 6.482965931863729e-06, + "loss": 0.0077, + "step": 4355 + }, 
+ { + "epoch": 17.424, + "grad_norm": 0.20658808946609497, + "learning_rate": 6.472945891783567e-06, + "loss": 0.0071, + "step": 4356 + }, + { + "epoch": 17.428, + "grad_norm": 0.13976025581359863, + "learning_rate": 6.462925851703407e-06, + "loss": 0.0074, + "step": 4357 + }, + { + "epoch": 17.432, + "grad_norm": 0.14922749996185303, + "learning_rate": 6.452905811623247e-06, + "loss": 0.0075, + "step": 4358 + }, + { + "epoch": 17.436, + "grad_norm": 0.31609266996383667, + "learning_rate": 6.442885771543086e-06, + "loss": 0.0083, + "step": 4359 + }, + { + "epoch": 17.44, + "grad_norm": 0.17123810946941376, + "learning_rate": 6.4328657314629265e-06, + "loss": 0.0068, + "step": 4360 + }, + { + "epoch": 17.444, + "grad_norm": 0.2661895155906677, + "learning_rate": 6.422845691382766e-06, + "loss": 0.0085, + "step": 4361 + }, + { + "epoch": 17.448, + "grad_norm": 0.20732519030570984, + "learning_rate": 6.412825651302605e-06, + "loss": 0.0082, + "step": 4362 + }, + { + "epoch": 17.452, + "grad_norm": 0.20721253752708435, + "learning_rate": 6.402805611222445e-06, + "loss": 0.0087, + "step": 4363 + }, + { + "epoch": 17.456, + "grad_norm": 0.15347276628017426, + "learning_rate": 6.392785571142285e-06, + "loss": 0.0063, + "step": 4364 + }, + { + "epoch": 17.46, + "grad_norm": 0.19956089556217194, + "learning_rate": 6.382765531062125e-06, + "loss": 0.0082, + "step": 4365 + }, + { + "epoch": 17.464, + "grad_norm": 0.15364766120910645, + "learning_rate": 6.372745490981964e-06, + "loss": 0.0075, + "step": 4366 + }, + { + "epoch": 17.468, + "grad_norm": 0.48688241839408875, + "learning_rate": 6.362725450901804e-06, + "loss": 0.0109, + "step": 4367 + }, + { + "epoch": 17.472, + "grad_norm": 0.19626305997371674, + "learning_rate": 6.352705410821644e-06, + "loss": 0.0085, + "step": 4368 + }, + { + "epoch": 17.476, + "grad_norm": 0.21328666806221008, + "learning_rate": 6.342685370741483e-06, + "loss": 0.0087, + "step": 4369 + }, + { + "epoch": 17.48, + "grad_norm": 
0.2773035764694214, + "learning_rate": 6.332665330661323e-06, + "loss": 0.0107, + "step": 4370 + }, + { + "epoch": 17.484, + "grad_norm": 0.15668006241321564, + "learning_rate": 6.322645290581163e-06, + "loss": 0.0075, + "step": 4371 + }, + { + "epoch": 17.488, + "grad_norm": 0.20465034246444702, + "learning_rate": 6.312625250501002e-06, + "loss": 0.0079, + "step": 4372 + }, + { + "epoch": 17.492, + "grad_norm": 0.1748172491788864, + "learning_rate": 6.302605210420842e-06, + "loss": 0.0074, + "step": 4373 + }, + { + "epoch": 17.496, + "grad_norm": 0.13092002272605896, + "learning_rate": 6.292585170340682e-06, + "loss": 0.0077, + "step": 4374 + }, + { + "epoch": 17.5, + "grad_norm": 0.13474947214126587, + "learning_rate": 6.282565130260522e-06, + "loss": 0.0064, + "step": 4375 + }, + { + "epoch": 17.504, + "grad_norm": 0.17254269123077393, + "learning_rate": 6.272545090180361e-06, + "loss": 0.0076, + "step": 4376 + }, + { + "epoch": 17.508, + "grad_norm": 0.1626187562942505, + "learning_rate": 6.2625250501002e-06, + "loss": 0.008, + "step": 4377 + }, + { + "epoch": 17.512, + "grad_norm": 0.12236899137496948, + "learning_rate": 6.252505010020041e-06, + "loss": 0.0073, + "step": 4378 + }, + { + "epoch": 17.516, + "grad_norm": 0.1585134118795395, + "learning_rate": 6.24248496993988e-06, + "loss": 0.0067, + "step": 4379 + }, + { + "epoch": 17.52, + "grad_norm": 0.2114783227443695, + "learning_rate": 6.232464929859719e-06, + "loss": 0.009, + "step": 4380 + }, + { + "epoch": 17.524, + "grad_norm": 0.17043016850948334, + "learning_rate": 6.22244488977956e-06, + "loss": 0.0076, + "step": 4381 + }, + { + "epoch": 17.528, + "grad_norm": 0.2268737554550171, + "learning_rate": 6.212424849699399e-06, + "loss": 0.0084, + "step": 4382 + }, + { + "epoch": 17.532, + "grad_norm": 0.18764908611774445, + "learning_rate": 6.202404809619239e-06, + "loss": 0.0078, + "step": 4383 + }, + { + "epoch": 17.536, + "grad_norm": 0.1297231912612915, + "learning_rate": 6.192384769539078e-06, + 
"loss": 0.0069, + "step": 4384 + }, + { + "epoch": 17.54, + "grad_norm": 0.134876549243927, + "learning_rate": 6.182364729458918e-06, + "loss": 0.0074, + "step": 4385 + }, + { + "epoch": 17.544, + "grad_norm": 0.1665986031293869, + "learning_rate": 6.1723446893787575e-06, + "loss": 0.008, + "step": 4386 + }, + { + "epoch": 17.548000000000002, + "grad_norm": 0.17875370383262634, + "learning_rate": 6.162324649298597e-06, + "loss": 0.0077, + "step": 4387 + }, + { + "epoch": 17.552, + "grad_norm": 0.11975561082363129, + "learning_rate": 6.1523046092184376e-06, + "loss": 0.0048, + "step": 4388 + }, + { + "epoch": 17.556, + "grad_norm": 0.15035223960876465, + "learning_rate": 6.142284569138277e-06, + "loss": 0.0087, + "step": 4389 + }, + { + "epoch": 17.56, + "grad_norm": 0.1503482609987259, + "learning_rate": 6.132264529058117e-06, + "loss": 0.0077, + "step": 4390 + }, + { + "epoch": 17.564, + "grad_norm": 0.12905697524547577, + "learning_rate": 6.122244488977956e-06, + "loss": 0.0065, + "step": 4391 + }, + { + "epoch": 17.568, + "grad_norm": 0.13570000231266022, + "learning_rate": 6.112224448897796e-06, + "loss": 0.0078, + "step": 4392 + }, + { + "epoch": 17.572, + "grad_norm": 0.12492106854915619, + "learning_rate": 6.1022044088176355e-06, + "loss": 0.0068, + "step": 4393 + }, + { + "epoch": 17.576, + "grad_norm": 0.23991592228412628, + "learning_rate": 6.092184368737475e-06, + "loss": 0.0086, + "step": 4394 + }, + { + "epoch": 17.58, + "grad_norm": 0.15174011886119843, + "learning_rate": 6.0821643286573155e-06, + "loss": 0.0077, + "step": 4395 + }, + { + "epoch": 17.584, + "grad_norm": 0.16359587013721466, + "learning_rate": 6.072144288577154e-06, + "loss": 0.0077, + "step": 4396 + }, + { + "epoch": 17.588, + "grad_norm": 0.2310338318347931, + "learning_rate": 6.062124248496994e-06, + "loss": 0.0084, + "step": 4397 + }, + { + "epoch": 17.592, + "grad_norm": 0.1341230720281601, + "learning_rate": 6.052104208416834e-06, + "loss": 0.0083, + "step": 4398 + }, + { + 
"epoch": 17.596, + "grad_norm": 0.21559903025627136, + "learning_rate": 6.042084168336674e-06, + "loss": 0.0089, + "step": 4399 + }, + { + "epoch": 17.6, + "grad_norm": 0.15158264338970184, + "learning_rate": 6.0320641282565135e-06, + "loss": 0.0073, + "step": 4400 + }, + { + "epoch": 17.604, + "grad_norm": 0.2421329915523529, + "learning_rate": 6.022044088176353e-06, + "loss": 0.0081, + "step": 4401 + }, + { + "epoch": 17.608, + "grad_norm": 0.2801038920879364, + "learning_rate": 6.012024048096193e-06, + "loss": 0.0089, + "step": 4402 + }, + { + "epoch": 17.612, + "grad_norm": 0.1329454481601715, + "learning_rate": 6.002004008016032e-06, + "loss": 0.0078, + "step": 4403 + }, + { + "epoch": 17.616, + "grad_norm": 0.23923999071121216, + "learning_rate": 5.991983967935872e-06, + "loss": 0.0086, + "step": 4404 + }, + { + "epoch": 17.62, + "grad_norm": 0.15022501349449158, + "learning_rate": 5.981963927855712e-06, + "loss": 0.0067, + "step": 4405 + }, + { + "epoch": 17.624, + "grad_norm": 0.1838655024766922, + "learning_rate": 5.971943887775552e-06, + "loss": 0.0073, + "step": 4406 + }, + { + "epoch": 17.628, + "grad_norm": 0.15019722282886505, + "learning_rate": 5.961923847695391e-06, + "loss": 0.0072, + "step": 4407 + }, + { + "epoch": 17.632, + "grad_norm": 0.2248644381761551, + "learning_rate": 5.95190380761523e-06, + "loss": 0.0085, + "step": 4408 + }, + { + "epoch": 17.636, + "grad_norm": 0.25571855902671814, + "learning_rate": 5.941883767535071e-06, + "loss": 0.0098, + "step": 4409 + }, + { + "epoch": 17.64, + "grad_norm": 0.14798125624656677, + "learning_rate": 5.93186372745491e-06, + "loss": 0.0076, + "step": 4410 + }, + { + "epoch": 17.644, + "grad_norm": 0.199660062789917, + "learning_rate": 5.92184368737475e-06, + "loss": 0.0085, + "step": 4411 + }, + { + "epoch": 17.648, + "grad_norm": 0.15380799770355225, + "learning_rate": 5.911823647294589e-06, + "loss": 0.008, + "step": 4412 + }, + { + "epoch": 17.652, + "grad_norm": 0.20436356961727142, + 
"learning_rate": 5.901803607214429e-06, + "loss": 0.0054, + "step": 4413 + }, + { + "epoch": 17.656, + "grad_norm": 0.22345702350139618, + "learning_rate": 5.8917835671342686e-06, + "loss": 0.0082, + "step": 4414 + }, + { + "epoch": 17.66, + "grad_norm": 0.17874562740325928, + "learning_rate": 5.881763527054108e-06, + "loss": 0.0084, + "step": 4415 + }, + { + "epoch": 17.664, + "grad_norm": 0.14801347255706787, + "learning_rate": 5.871743486973949e-06, + "loss": 0.0072, + "step": 4416 + }, + { + "epoch": 17.668, + "grad_norm": 0.20537561178207397, + "learning_rate": 5.861723446893788e-06, + "loss": 0.0079, + "step": 4417 + }, + { + "epoch": 17.672, + "grad_norm": 0.1717330515384674, + "learning_rate": 5.851703406813627e-06, + "loss": 0.0077, + "step": 4418 + }, + { + "epoch": 17.676, + "grad_norm": 0.13762032985687256, + "learning_rate": 5.841683366733467e-06, + "loss": 0.0069, + "step": 4419 + }, + { + "epoch": 17.68, + "grad_norm": 0.16380003094673157, + "learning_rate": 5.831663326653307e-06, + "loss": 0.0077, + "step": 4420 + }, + { + "epoch": 17.684, + "grad_norm": 0.20809531211853027, + "learning_rate": 5.8216432865731465e-06, + "loss": 0.0084, + "step": 4421 + }, + { + "epoch": 17.688, + "grad_norm": 0.24906416237354279, + "learning_rate": 5.811623246492986e-06, + "loss": 0.0077, + "step": 4422 + }, + { + "epoch": 17.692, + "grad_norm": 0.15348020195960999, + "learning_rate": 5.801603206412826e-06, + "loss": 0.0081, + "step": 4423 + }, + { + "epoch": 17.696, + "grad_norm": 0.15915721654891968, + "learning_rate": 5.791583166332665e-06, + "loss": 0.0071, + "step": 4424 + }, + { + "epoch": 17.7, + "grad_norm": 0.1927608698606491, + "learning_rate": 5.781563126252505e-06, + "loss": 0.0071, + "step": 4425 + }, + { + "epoch": 17.704, + "grad_norm": 0.23460881412029266, + "learning_rate": 5.771543086172345e-06, + "loss": 0.0077, + "step": 4426 + }, + { + "epoch": 17.708, + "grad_norm": 0.1741648018360138, + "learning_rate": 5.761523046092185e-06, + "loss": 0.008, + 
"step": 4427 + }, + { + "epoch": 17.712, + "grad_norm": 0.13403233885765076, + "learning_rate": 5.7515030060120245e-06, + "loss": 0.005, + "step": 4428 + }, + { + "epoch": 17.716, + "grad_norm": 0.0899837389588356, + "learning_rate": 5.741482965931864e-06, + "loss": 0.0045, + "step": 4429 + }, + { + "epoch": 17.72, + "grad_norm": 0.15737628936767578, + "learning_rate": 5.731462925851704e-06, + "loss": 0.0073, + "step": 4430 + }, + { + "epoch": 17.724, + "grad_norm": 0.14937400817871094, + "learning_rate": 5.721442885771543e-06, + "loss": 0.0081, + "step": 4431 + }, + { + "epoch": 17.728, + "grad_norm": 0.12557938694953918, + "learning_rate": 5.711422845691383e-06, + "loss": 0.0067, + "step": 4432 + }, + { + "epoch": 17.732, + "grad_norm": 0.20801903307437897, + "learning_rate": 5.701402805611223e-06, + "loss": 0.0083, + "step": 4433 + }, + { + "epoch": 17.736, + "grad_norm": 0.17545443773269653, + "learning_rate": 5.691382765531062e-06, + "loss": 0.0076, + "step": 4434 + }, + { + "epoch": 17.74, + "grad_norm": 0.16065756976604462, + "learning_rate": 5.681362725450902e-06, + "loss": 0.0078, + "step": 4435 + }, + { + "epoch": 17.744, + "grad_norm": 0.15100617706775665, + "learning_rate": 5.671342685370742e-06, + "loss": 0.0067, + "step": 4436 + }, + { + "epoch": 17.748, + "grad_norm": 0.17027676105499268, + "learning_rate": 5.661322645290582e-06, + "loss": 0.0078, + "step": 4437 + }, + { + "epoch": 17.752, + "grad_norm": 0.19848494231700897, + "learning_rate": 5.651302605210421e-06, + "loss": 0.0075, + "step": 4438 + }, + { + "epoch": 17.756, + "grad_norm": 0.15401864051818848, + "learning_rate": 5.641282565130261e-06, + "loss": 0.0073, + "step": 4439 + }, + { + "epoch": 17.76, + "grad_norm": 0.21219518780708313, + "learning_rate": 5.6312625250501e-06, + "loss": 0.0081, + "step": 4440 + }, + { + "epoch": 17.764, + "grad_norm": 0.09641958028078079, + "learning_rate": 5.62124248496994e-06, + "loss": 0.0047, + "step": 4441 + }, + { + "epoch": 17.768, + "grad_norm": 
0.15153850615024567, + "learning_rate": 5.61122244488978e-06, + "loss": 0.0073, + "step": 4442 + }, + { + "epoch": 17.772, + "grad_norm": 0.20264235138893127, + "learning_rate": 5.60120240480962e-06, + "loss": 0.0078, + "step": 4443 + }, + { + "epoch": 17.776, + "grad_norm": 0.14838512241840363, + "learning_rate": 5.59118236472946e-06, + "loss": 0.0071, + "step": 4444 + }, + { + "epoch": 17.78, + "grad_norm": 0.1825137883424759, + "learning_rate": 5.581162324649298e-06, + "loss": 0.0068, + "step": 4445 + }, + { + "epoch": 17.784, + "grad_norm": 0.14169864356517792, + "learning_rate": 5.571142284569139e-06, + "loss": 0.0074, + "step": 4446 + }, + { + "epoch": 17.788, + "grad_norm": 0.20076483488082886, + "learning_rate": 5.561122244488978e-06, + "loss": 0.0085, + "step": 4447 + }, + { + "epoch": 17.792, + "grad_norm": 0.23179060220718384, + "learning_rate": 5.551102204408818e-06, + "loss": 0.0081, + "step": 4448 + }, + { + "epoch": 17.796, + "grad_norm": 0.15632283687591553, + "learning_rate": 5.5410821643286575e-06, + "loss": 0.0075, + "step": 4449 + }, + { + "epoch": 17.8, + "grad_norm": 0.16295547783374786, + "learning_rate": 5.531062124248497e-06, + "loss": 0.0078, + "step": 4450 + }, + { + "epoch": 17.804, + "grad_norm": 0.15852996706962585, + "learning_rate": 5.521042084168337e-06, + "loss": 0.0076, + "step": 4451 + }, + { + "epoch": 17.808, + "grad_norm": 0.24899086356163025, + "learning_rate": 5.511022044088176e-06, + "loss": 0.008, + "step": 4452 + }, + { + "epoch": 17.812, + "grad_norm": 0.26219606399536133, + "learning_rate": 5.501002004008017e-06, + "loss": 0.0079, + "step": 4453 + }, + { + "epoch": 17.816, + "grad_norm": 0.15716837346553802, + "learning_rate": 5.490981963927856e-06, + "loss": 0.0068, + "step": 4454 + }, + { + "epoch": 17.82, + "grad_norm": 0.21653732657432556, + "learning_rate": 5.480961923847696e-06, + "loss": 0.008, + "step": 4455 + }, + { + "epoch": 17.824, + "grad_norm": 0.1806323230266571, + "learning_rate": 5.4709418837675355e-06, 
+ "loss": 0.0086, + "step": 4456 + }, + { + "epoch": 17.828, + "grad_norm": 0.2788422703742981, + "learning_rate": 5.460921843687375e-06, + "loss": 0.009, + "step": 4457 + }, + { + "epoch": 17.832, + "grad_norm": 0.16361011564731598, + "learning_rate": 5.450901803607215e-06, + "loss": 0.0073, + "step": 4458 + }, + { + "epoch": 17.836, + "grad_norm": 0.24502210319042206, + "learning_rate": 5.440881763527054e-06, + "loss": 0.0088, + "step": 4459 + }, + { + "epoch": 17.84, + "grad_norm": 0.18323510885238647, + "learning_rate": 5.430861723446895e-06, + "loss": 0.0073, + "step": 4460 + }, + { + "epoch": 17.844, + "grad_norm": 0.1967632919549942, + "learning_rate": 5.4208416833667335e-06, + "loss": 0.0085, + "step": 4461 + }, + { + "epoch": 17.848, + "grad_norm": 0.19177305698394775, + "learning_rate": 5.410821643286573e-06, + "loss": 0.0079, + "step": 4462 + }, + { + "epoch": 17.852, + "grad_norm": 0.1529979109764099, + "learning_rate": 5.4008016032064135e-06, + "loss": 0.0077, + "step": 4463 + }, + { + "epoch": 17.856, + "grad_norm": 0.16155672073364258, + "learning_rate": 5.390781563126253e-06, + "loss": 0.0083, + "step": 4464 + }, + { + "epoch": 17.86, + "grad_norm": 0.1222570464015007, + "learning_rate": 5.380761523046093e-06, + "loss": 0.0075, + "step": 4465 + }, + { + "epoch": 17.864, + "grad_norm": 0.17447544634342194, + "learning_rate": 5.370741482965931e-06, + "loss": 0.0074, + "step": 4466 + }, + { + "epoch": 17.868, + "grad_norm": 0.18004770576953888, + "learning_rate": 5.360721442885772e-06, + "loss": 0.008, + "step": 4467 + }, + { + "epoch": 17.872, + "grad_norm": 0.12705212831497192, + "learning_rate": 5.350701402805611e-06, + "loss": 0.0065, + "step": 4468 + }, + { + "epoch": 17.876, + "grad_norm": 0.13531920313835144, + "learning_rate": 5.340681362725451e-06, + "loss": 0.0077, + "step": 4469 + }, + { + "epoch": 17.88, + "grad_norm": 0.11091826111078262, + "learning_rate": 5.3306613226452914e-06, + "loss": 0.0038, + "step": 4470 + }, + { + "epoch": 
17.884, + "grad_norm": 0.17355629801750183, + "learning_rate": 5.320641282565131e-06, + "loss": 0.0073, + "step": 4471 + }, + { + "epoch": 17.888, + "grad_norm": 0.2770105302333832, + "learning_rate": 5.31062124248497e-06, + "loss": 0.0092, + "step": 4472 + }, + { + "epoch": 17.892, + "grad_norm": 0.13555404543876648, + "learning_rate": 5.300601202404809e-06, + "loss": 0.007, + "step": 4473 + }, + { + "epoch": 17.896, + "grad_norm": 0.13651050627231598, + "learning_rate": 5.29058116232465e-06, + "loss": 0.0071, + "step": 4474 + }, + { + "epoch": 17.9, + "grad_norm": 0.22955292463302612, + "learning_rate": 5.280561122244489e-06, + "loss": 0.0086, + "step": 4475 + }, + { + "epoch": 17.904, + "grad_norm": 0.13148556649684906, + "learning_rate": 5.270541082164329e-06, + "loss": 0.007, + "step": 4476 + }, + { + "epoch": 17.908, + "grad_norm": 0.1831294298171997, + "learning_rate": 5.2605210420841686e-06, + "loss": 0.0084, + "step": 4477 + }, + { + "epoch": 17.912, + "grad_norm": 0.15592306852340698, + "learning_rate": 5.250501002004008e-06, + "loss": 0.0076, + "step": 4478 + }, + { + "epoch": 17.916, + "grad_norm": 0.20427504181861877, + "learning_rate": 5.240480961923848e-06, + "loss": 0.0083, + "step": 4479 + }, + { + "epoch": 17.92, + "grad_norm": 0.1998140513896942, + "learning_rate": 5.230460921843687e-06, + "loss": 0.0089, + "step": 4480 + }, + { + "epoch": 17.924, + "grad_norm": 0.1798437237739563, + "learning_rate": 5.220440881763528e-06, + "loss": 0.0085, + "step": 4481 + }, + { + "epoch": 17.928, + "grad_norm": 0.1546415388584137, + "learning_rate": 5.2104208416833665e-06, + "loss": 0.0076, + "step": 4482 + }, + { + "epoch": 17.932, + "grad_norm": 0.17189155519008636, + "learning_rate": 5.200400801603206e-06, + "loss": 0.0074, + "step": 4483 + }, + { + "epoch": 17.936, + "grad_norm": 0.26644057035446167, + "learning_rate": 5.1903807615230465e-06, + "loss": 0.0084, + "step": 4484 + }, + { + "epoch": 17.94, + "grad_norm": 0.12923316657543182, + "learning_rate": 
5.180360721442886e-06, + "loss": 0.0067, + "step": 4485 + }, + { + "epoch": 17.944, + "grad_norm": 0.20447267591953278, + "learning_rate": 5.170340681362726e-06, + "loss": 0.0075, + "step": 4486 + }, + { + "epoch": 17.948, + "grad_norm": 0.17053300142288208, + "learning_rate": 5.160320641282565e-06, + "loss": 0.0076, + "step": 4487 + }, + { + "epoch": 17.951999999999998, + "grad_norm": 0.1922687590122223, + "learning_rate": 5.150300601202405e-06, + "loss": 0.0082, + "step": 4488 + }, + { + "epoch": 17.956, + "grad_norm": 0.17536620795726776, + "learning_rate": 5.1402805611222445e-06, + "loss": 0.0081, + "step": 4489 + }, + { + "epoch": 17.96, + "grad_norm": 0.1868142932653427, + "learning_rate": 5.130260521042084e-06, + "loss": 0.0079, + "step": 4490 + }, + { + "epoch": 17.964, + "grad_norm": 0.21813417971134186, + "learning_rate": 5.1202404809619245e-06, + "loss": 0.0082, + "step": 4491 + }, + { + "epoch": 17.968, + "grad_norm": 0.1625450849533081, + "learning_rate": 5.110220440881764e-06, + "loss": 0.0083, + "step": 4492 + }, + { + "epoch": 17.972, + "grad_norm": 0.1685769259929657, + "learning_rate": 5.100200400801603e-06, + "loss": 0.0083, + "step": 4493 + }, + { + "epoch": 17.976, + "grad_norm": 0.16878901422023773, + "learning_rate": 5.090180360721443e-06, + "loss": 0.008, + "step": 4494 + }, + { + "epoch": 17.98, + "grad_norm": 0.26042139530181885, + "learning_rate": 5.080160320641283e-06, + "loss": 0.009, + "step": 4495 + }, + { + "epoch": 17.984, + "grad_norm": 0.22069434821605682, + "learning_rate": 5.0701402805611224e-06, + "loss": 0.0072, + "step": 4496 + }, + { + "epoch": 17.988, + "grad_norm": 0.1699599176645279, + "learning_rate": 5.060120240480962e-06, + "loss": 0.0089, + "step": 4497 + }, + { + "epoch": 17.992, + "grad_norm": 0.1470976173877716, + "learning_rate": 5.0501002004008025e-06, + "loss": 0.0074, + "step": 4498 + }, + { + "epoch": 17.996, + "grad_norm": 0.22268114984035492, + "learning_rate": 5.040080160320641e-06, + "loss": 0.0084, + 
"step": 4499 + }, + { + "epoch": 18.0, + "grad_norm": 0.1696414351463318, + "learning_rate": 5.030060120240481e-06, + "loss": 0.0081, + "step": 4500 + }, + { + "epoch": 18.004, + "grad_norm": 0.18585962057113647, + "learning_rate": 5.020040080160321e-06, + "loss": 0.0072, + "step": 4501 + }, + { + "epoch": 18.008, + "grad_norm": 0.1416616141796112, + "learning_rate": 5.010020040080161e-06, + "loss": 0.0064, + "step": 4502 + }, + { + "epoch": 18.012, + "grad_norm": 0.1328905075788498, + "learning_rate": 5e-06, + "loss": 0.0064, + "step": 4503 + }, + { + "epoch": 18.016, + "grad_norm": 0.16968417167663574, + "learning_rate": 4.98997995991984e-06, + "loss": 0.0073, + "step": 4504 + }, + { + "epoch": 18.02, + "grad_norm": 0.15814343094825745, + "learning_rate": 4.97995991983968e-06, + "loss": 0.0073, + "step": 4505 + }, + { + "epoch": 18.024, + "grad_norm": 0.17646895349025726, + "learning_rate": 4.969939879759519e-06, + "loss": 0.0076, + "step": 4506 + }, + { + "epoch": 18.028, + "grad_norm": 0.14789192378520966, + "learning_rate": 4.959919839679359e-06, + "loss": 0.0064, + "step": 4507 + }, + { + "epoch": 18.032, + "grad_norm": 0.15149499475955963, + "learning_rate": 4.949899799599199e-06, + "loss": 0.0075, + "step": 4508 + }, + { + "epoch": 18.036, + "grad_norm": 0.12191706895828247, + "learning_rate": 4.939879759519038e-06, + "loss": 0.0065, + "step": 4509 + }, + { + "epoch": 18.04, + "grad_norm": 0.1561209261417389, + "learning_rate": 4.9298597194388775e-06, + "loss": 0.0079, + "step": 4510 + }, + { + "epoch": 18.044, + "grad_norm": 0.15725496411323547, + "learning_rate": 4.919839679358718e-06, + "loss": 0.0072, + "step": 4511 + }, + { + "epoch": 18.048, + "grad_norm": 0.14337027072906494, + "learning_rate": 4.9098196392785576e-06, + "loss": 0.0065, + "step": 4512 + }, + { + "epoch": 18.052, + "grad_norm": 0.16354084014892578, + "learning_rate": 4.899799599198397e-06, + "loss": 0.0068, + "step": 4513 + }, + { + "epoch": 18.056, + "grad_norm": 0.16241081058979034, 
+ "learning_rate": 4.889779559118237e-06, + "loss": 0.0066, + "step": 4514 + }, + { + "epoch": 18.06, + "grad_norm": 0.1941567361354828, + "learning_rate": 4.879759519038076e-06, + "loss": 0.0062, + "step": 4515 + }, + { + "epoch": 18.064, + "grad_norm": 0.1292446106672287, + "learning_rate": 4.869739478957916e-06, + "loss": 0.0066, + "step": 4516 + }, + { + "epoch": 18.068, + "grad_norm": 0.09367750585079193, + "learning_rate": 4.8597194388777555e-06, + "loss": 0.0047, + "step": 4517 + }, + { + "epoch": 18.072, + "grad_norm": 0.14819225668907166, + "learning_rate": 4.849699398797596e-06, + "loss": 0.0074, + "step": 4518 + }, + { + "epoch": 18.076, + "grad_norm": 0.17116355895996094, + "learning_rate": 4.8396793587174355e-06, + "loss": 0.0064, + "step": 4519 + }, + { + "epoch": 18.08, + "grad_norm": 0.19013841450214386, + "learning_rate": 4.829659318637274e-06, + "loss": 0.0075, + "step": 4520 + }, + { + "epoch": 18.084, + "grad_norm": 0.17657625675201416, + "learning_rate": 4.819639278557115e-06, + "loss": 0.0077, + "step": 4521 + }, + { + "epoch": 18.088, + "grad_norm": 0.14406649768352509, + "learning_rate": 4.809619238476954e-06, + "loss": 0.0074, + "step": 4522 + }, + { + "epoch": 18.092, + "grad_norm": 0.17921116948127747, + "learning_rate": 4.799599198396794e-06, + "loss": 0.0071, + "step": 4523 + }, + { + "epoch": 18.096, + "grad_norm": 0.22926273941993713, + "learning_rate": 4.7895791583166335e-06, + "loss": 0.0083, + "step": 4524 + }, + { + "epoch": 18.1, + "grad_norm": 0.1705189198255539, + "learning_rate": 4.779559118236474e-06, + "loss": 0.0074, + "step": 4525 + }, + { + "epoch": 18.104, + "grad_norm": 0.15004006028175354, + "learning_rate": 4.769539078156313e-06, + "loss": 0.0063, + "step": 4526 + }, + { + "epoch": 18.108, + "grad_norm": 0.1770288497209549, + "learning_rate": 4.759519038076152e-06, + "loss": 0.0077, + "step": 4527 + }, + { + "epoch": 18.112, + "grad_norm": 0.14755572378635406, + "learning_rate": 4.749498997995992e-06, + "loss": 
0.0065, + "step": 4528 + }, + { + "epoch": 18.116, + "grad_norm": 0.12540753185749054, + "learning_rate": 4.739478957915832e-06, + "loss": 0.0067, + "step": 4529 + }, + { + "epoch": 18.12, + "grad_norm": 0.1722092181444168, + "learning_rate": 4.729458917835672e-06, + "loss": 0.0076, + "step": 4530 + }, + { + "epoch": 18.124, + "grad_norm": 0.15818531811237335, + "learning_rate": 4.719438877755511e-06, + "loss": 0.0065, + "step": 4531 + }, + { + "epoch": 18.128, + "grad_norm": 0.17607039213180542, + "learning_rate": 4.709418837675351e-06, + "loss": 0.0058, + "step": 4532 + }, + { + "epoch": 18.132, + "grad_norm": 0.15239058434963226, + "learning_rate": 4.699398797595191e-06, + "loss": 0.0072, + "step": 4533 + }, + { + "epoch": 18.136, + "grad_norm": 0.1865680068731308, + "learning_rate": 4.68937875751503e-06, + "loss": 0.0074, + "step": 4534 + }, + { + "epoch": 18.14, + "grad_norm": 0.13207893073558807, + "learning_rate": 4.67935871743487e-06, + "loss": 0.0069, + "step": 4535 + }, + { + "epoch": 18.144, + "grad_norm": 0.1481158584356308, + "learning_rate": 4.669338677354709e-06, + "loss": 0.0068, + "step": 4536 + }, + { + "epoch": 18.148, + "grad_norm": 0.1650175005197525, + "learning_rate": 4.659318637274549e-06, + "loss": 0.0084, + "step": 4537 + }, + { + "epoch": 18.152, + "grad_norm": 0.2189430296421051, + "learning_rate": 4.6492985971943886e-06, + "loss": 0.0068, + "step": 4538 + }, + { + "epoch": 18.156, + "grad_norm": 0.17136210203170776, + "learning_rate": 4.639278557114229e-06, + "loss": 0.0071, + "step": 4539 + }, + { + "epoch": 18.16, + "grad_norm": 0.1906738430261612, + "learning_rate": 4.6292585170340686e-06, + "loss": 0.007, + "step": 4540 + }, + { + "epoch": 18.164, + "grad_norm": 0.1356390118598938, + "learning_rate": 4.619238476953908e-06, + "loss": 0.0063, + "step": 4541 + }, + { + "epoch": 18.168, + "grad_norm": 0.1401359885931015, + "learning_rate": 4.609218436873748e-06, + "loss": 0.0073, + "step": 4542 + }, + { + "epoch": 18.172, + "grad_norm": 
0.22201108932495117, + "learning_rate": 4.599198396793587e-06, + "loss": 0.0082, + "step": 4543 + }, + { + "epoch": 18.176, + "grad_norm": 0.17944709956645966, + "learning_rate": 4.589178356713427e-06, + "loss": 0.0065, + "step": 4544 + }, + { + "epoch": 18.18, + "grad_norm": 0.19789981842041016, + "learning_rate": 4.5791583166332665e-06, + "loss": 0.0071, + "step": 4545 + }, + { + "epoch": 18.184, + "grad_norm": 0.2103254646062851, + "learning_rate": 4.569138276553107e-06, + "loss": 0.0074, + "step": 4546 + }, + { + "epoch": 18.188, + "grad_norm": 0.15887366235256195, + "learning_rate": 4.559118236472946e-06, + "loss": 0.0064, + "step": 4547 + }, + { + "epoch": 18.192, + "grad_norm": 0.12371140718460083, + "learning_rate": 4.549098196392785e-06, + "loss": 0.0064, + "step": 4548 + }, + { + "epoch": 18.196, + "grad_norm": 0.20762166380882263, + "learning_rate": 4.539078156312626e-06, + "loss": 0.0078, + "step": 4549 + }, + { + "epoch": 18.2, + "grad_norm": 0.12717363238334656, + "learning_rate": 4.529058116232465e-06, + "loss": 0.0064, + "step": 4550 + }, + { + "epoch": 18.204, + "grad_norm": 0.2185417264699936, + "learning_rate": 4.519038076152305e-06, + "loss": 0.0068, + "step": 4551 + }, + { + "epoch": 18.208, + "grad_norm": 0.1922868937253952, + "learning_rate": 4.5090180360721445e-06, + "loss": 0.0049, + "step": 4552 + }, + { + "epoch": 18.212, + "grad_norm": 0.20411092042922974, + "learning_rate": 4.498997995991984e-06, + "loss": 0.0076, + "step": 4553 + }, + { + "epoch": 18.216, + "grad_norm": 0.18740029633045197, + "learning_rate": 4.488977955911824e-06, + "loss": 0.008, + "step": 4554 + }, + { + "epoch": 18.22, + "grad_norm": 0.14314641058444977, + "learning_rate": 4.478957915831663e-06, + "loss": 0.0068, + "step": 4555 + }, + { + "epoch": 18.224, + "grad_norm": 0.15303204953670502, + "learning_rate": 4.468937875751504e-06, + "loss": 0.0069, + "step": 4556 + }, + { + "epoch": 18.228, + "grad_norm": 0.21865060925483704, + "learning_rate": 
4.458917835671343e-06, + "loss": 0.0068, + "step": 4557 + }, + { + "epoch": 18.232, + "grad_norm": 0.14031033217906952, + "learning_rate": 4.448897795591182e-06, + "loss": 0.0071, + "step": 4558 + }, + { + "epoch": 18.236, + "grad_norm": 0.18095235526561737, + "learning_rate": 4.4388777555110225e-06, + "loss": 0.0071, + "step": 4559 + }, + { + "epoch": 18.24, + "grad_norm": 0.16418349742889404, + "learning_rate": 4.428857715430862e-06, + "loss": 0.0077, + "step": 4560 + }, + { + "epoch": 18.244, + "grad_norm": 0.14331473410129547, + "learning_rate": 4.418837675350702e-06, + "loss": 0.0067, + "step": 4561 + }, + { + "epoch": 18.248, + "grad_norm": 0.15830665826797485, + "learning_rate": 4.408817635270541e-06, + "loss": 0.0074, + "step": 4562 + }, + { + "epoch": 18.252, + "grad_norm": 0.1355806142091751, + "learning_rate": 4.398797595190381e-06, + "loss": 0.0069, + "step": 4563 + }, + { + "epoch": 18.256, + "grad_norm": 0.05929170548915863, + "learning_rate": 4.38877755511022e-06, + "loss": 0.0023, + "step": 4564 + }, + { + "epoch": 18.26, + "grad_norm": 0.15484470129013062, + "learning_rate": 4.37875751503006e-06, + "loss": 0.0061, + "step": 4565 + }, + { + "epoch": 18.264, + "grad_norm": 0.17139868438243866, + "learning_rate": 4.3687374749499e-06, + "loss": 0.006, + "step": 4566 + }, + { + "epoch": 18.268, + "grad_norm": 0.23054589331150055, + "learning_rate": 4.35871743486974e-06, + "loss": 0.008, + "step": 4567 + }, + { + "epoch": 18.272, + "grad_norm": 0.1985149085521698, + "learning_rate": 4.34869739478958e-06, + "loss": 0.0072, + "step": 4568 + }, + { + "epoch": 18.276, + "grad_norm": 0.17223723232746124, + "learning_rate": 4.338677354709419e-06, + "loss": 0.0072, + "step": 4569 + }, + { + "epoch": 18.28, + "grad_norm": 0.14117112755775452, + "learning_rate": 4.328657314629259e-06, + "loss": 0.0065, + "step": 4570 + }, + { + "epoch": 18.284, + "grad_norm": 0.22822153568267822, + "learning_rate": 4.318637274549098e-06, + "loss": 0.0074, + "step": 4571 + }, + { 
+ "epoch": 18.288, + "grad_norm": 0.21039901673793793, + "learning_rate": 4.308617234468938e-06, + "loss": 0.0079, + "step": 4572 + }, + { + "epoch": 18.292, + "grad_norm": 0.14674365520477295, + "learning_rate": 4.298597194388778e-06, + "loss": 0.0065, + "step": 4573 + }, + { + "epoch": 18.296, + "grad_norm": 0.1451415717601776, + "learning_rate": 4.288577154308617e-06, + "loss": 0.0065, + "step": 4574 + }, + { + "epoch": 18.3, + "grad_norm": 0.15624050796031952, + "learning_rate": 4.278557114228457e-06, + "loss": 0.0068, + "step": 4575 + }, + { + "epoch": 18.304, + "grad_norm": 0.1852012276649475, + "learning_rate": 4.268537074148297e-06, + "loss": 0.0077, + "step": 4576 + }, + { + "epoch": 18.308, + "grad_norm": 0.22112013399600983, + "learning_rate": 4.258517034068137e-06, + "loss": 0.0074, + "step": 4577 + }, + { + "epoch": 18.312, + "grad_norm": 0.1755540817975998, + "learning_rate": 4.248496993987976e-06, + "loss": 0.0078, + "step": 4578 + }, + { + "epoch": 18.316, + "grad_norm": 0.16024050116539001, + "learning_rate": 4.238476953907815e-06, + "loss": 0.007, + "step": 4579 + }, + { + "epoch": 18.32, + "grad_norm": 0.18745197355747223, + "learning_rate": 4.2284569138276555e-06, + "loss": 0.0077, + "step": 4580 + }, + { + "epoch": 18.324, + "grad_norm": 0.18656589090824127, + "learning_rate": 4.218436873747495e-06, + "loss": 0.0081, + "step": 4581 + }, + { + "epoch": 18.328, + "grad_norm": 0.16219571232795715, + "learning_rate": 4.208416833667335e-06, + "loss": 0.0072, + "step": 4582 + }, + { + "epoch": 18.332, + "grad_norm": 0.23128695785999298, + "learning_rate": 4.198396793587175e-06, + "loss": 0.0079, + "step": 4583 + }, + { + "epoch": 18.336, + "grad_norm": 0.21390704810619354, + "learning_rate": 4.188376753507015e-06, + "loss": 0.0088, + "step": 4584 + }, + { + "epoch": 18.34, + "grad_norm": 0.23539131879806519, + "learning_rate": 4.1783567134268534e-06, + "loss": 0.0081, + "step": 4585 + }, + { + "epoch": 18.344, + "grad_norm": 0.19893336296081543, + 
"learning_rate": 4.168336673346693e-06, + "loss": 0.0068, + "step": 4586 + }, + { + "epoch": 18.348, + "grad_norm": 0.44295942783355713, + "learning_rate": 4.1583166332665335e-06, + "loss": 0.0096, + "step": 4587 + }, + { + "epoch": 18.352, + "grad_norm": 0.17842140793800354, + "learning_rate": 4.148296593186373e-06, + "loss": 0.0077, + "step": 4588 + }, + { + "epoch": 18.356, + "grad_norm": 0.22559908032417297, + "learning_rate": 4.138276553106213e-06, + "loss": 0.0067, + "step": 4589 + }, + { + "epoch": 18.36, + "grad_norm": 0.19061769545078278, + "learning_rate": 4.128256513026052e-06, + "loss": 0.0076, + "step": 4590 + }, + { + "epoch": 18.364, + "grad_norm": 0.14413903653621674, + "learning_rate": 4.118236472945892e-06, + "loss": 0.0074, + "step": 4591 + }, + { + "epoch": 18.368, + "grad_norm": 0.1512213498353958, + "learning_rate": 4.108216432865731e-06, + "loss": 0.0082, + "step": 4592 + }, + { + "epoch": 18.372, + "grad_norm": 0.1925954520702362, + "learning_rate": 4.098196392785571e-06, + "loss": 0.0071, + "step": 4593 + }, + { + "epoch": 18.376, + "grad_norm": 0.18109238147735596, + "learning_rate": 4.0881763527054114e-06, + "loss": 0.0075, + "step": 4594 + }, + { + "epoch": 18.38, + "grad_norm": 0.1389477699995041, + "learning_rate": 4.078156312625251e-06, + "loss": 0.0066, + "step": 4595 + }, + { + "epoch": 18.384, + "grad_norm": 0.24592560529708862, + "learning_rate": 4.06813627254509e-06, + "loss": 0.0075, + "step": 4596 + }, + { + "epoch": 18.388, + "grad_norm": 0.18193602561950684, + "learning_rate": 4.05811623246493e-06, + "loss": 0.0081, + "step": 4597 + }, + { + "epoch": 18.392, + "grad_norm": 0.26493698358535767, + "learning_rate": 4.04809619238477e-06, + "loss": 0.0081, + "step": 4598 + }, + { + "epoch": 18.396, + "grad_norm": 0.16297456622123718, + "learning_rate": 4.038076152304609e-06, + "loss": 0.0072, + "step": 4599 + }, + { + "epoch": 18.4, + "grad_norm": 0.27221083641052246, + "learning_rate": 4.028056112224449e-06, + "loss": 0.0088, + 
"step": 4600 + }, + { + "epoch": 18.404, + "grad_norm": 0.19877958297729492, + "learning_rate": 4.0180360721442886e-06, + "loss": 0.0083, + "step": 4601 + }, + { + "epoch": 18.408, + "grad_norm": 0.19414174556732178, + "learning_rate": 4.008016032064128e-06, + "loss": 0.0071, + "step": 4602 + }, + { + "epoch": 18.412, + "grad_norm": 0.1475425511598587, + "learning_rate": 3.997995991983968e-06, + "loss": 0.0051, + "step": 4603 + }, + { + "epoch": 18.416, + "grad_norm": 0.13370120525360107, + "learning_rate": 3.987975951903808e-06, + "loss": 0.0073, + "step": 4604 + }, + { + "epoch": 18.42, + "grad_norm": 0.17306354641914368, + "learning_rate": 3.977955911823648e-06, + "loss": 0.0077, + "step": 4605 + }, + { + "epoch": 18.424, + "grad_norm": 0.21375983953475952, + "learning_rate": 3.9679358717434865e-06, + "loss": 0.0068, + "step": 4606 + }, + { + "epoch": 18.428, + "grad_norm": 0.17107589542865753, + "learning_rate": 3.957915831663327e-06, + "loss": 0.0076, + "step": 4607 + }, + { + "epoch": 18.432, + "grad_norm": 0.15007686614990234, + "learning_rate": 3.9478957915831665e-06, + "loss": 0.008, + "step": 4608 + }, + { + "epoch": 18.436, + "grad_norm": 0.13160204887390137, + "learning_rate": 3.937875751503006e-06, + "loss": 0.0064, + "step": 4609 + }, + { + "epoch": 18.44, + "grad_norm": 0.2253139317035675, + "learning_rate": 3.927855711422846e-06, + "loss": 0.0077, + "step": 4610 + }, + { + "epoch": 18.444, + "grad_norm": 0.12682516872882843, + "learning_rate": 3.917835671342686e-06, + "loss": 0.0035, + "step": 4611 + }, + { + "epoch": 18.448, + "grad_norm": 0.17346949875354767, + "learning_rate": 3.907815631262525e-06, + "loss": 0.007, + "step": 4612 + }, + { + "epoch": 18.452, + "grad_norm": 0.24276018142700195, + "learning_rate": 3.8977955911823645e-06, + "loss": 0.0075, + "step": 4613 + }, + { + "epoch": 18.456, + "grad_norm": 0.13902723789215088, + "learning_rate": 3.887775551102205e-06, + "loss": 0.0067, + "step": 4614 + }, + { + "epoch": 18.46, + "grad_norm": 
0.23678214848041534, + "learning_rate": 3.8777555110220445e-06, + "loss": 0.0079, + "step": 4615 + }, + { + "epoch": 18.464, + "grad_norm": 0.11600477248430252, + "learning_rate": 3.867735470941884e-06, + "loss": 0.0069, + "step": 4616 + }, + { + "epoch": 18.468, + "grad_norm": 0.17199131846427917, + "learning_rate": 3.857715430861724e-06, + "loss": 0.0079, + "step": 4617 + }, + { + "epoch": 18.472, + "grad_norm": 0.13820935785770416, + "learning_rate": 3.847695390781563e-06, + "loss": 0.0065, + "step": 4618 + }, + { + "epoch": 18.476, + "grad_norm": 0.2043878436088562, + "learning_rate": 3.837675350701403e-06, + "loss": 0.008, + "step": 4619 + }, + { + "epoch": 18.48, + "grad_norm": 0.1664237231016159, + "learning_rate": 3.8276553106212424e-06, + "loss": 0.0073, + "step": 4620 + }, + { + "epoch": 18.484, + "grad_norm": 0.14281480014324188, + "learning_rate": 3.817635270541083e-06, + "loss": 0.0077, + "step": 4621 + }, + { + "epoch": 18.488, + "grad_norm": 0.15422894060611725, + "learning_rate": 3.8076152304609225e-06, + "loss": 0.0071, + "step": 4622 + }, + { + "epoch": 18.492, + "grad_norm": 0.18872609734535217, + "learning_rate": 3.7975951903807616e-06, + "loss": 0.0075, + "step": 4623 + }, + { + "epoch": 18.496, + "grad_norm": 0.2351578027009964, + "learning_rate": 3.7875751503006012e-06, + "loss": 0.0079, + "step": 4624 + }, + { + "epoch": 18.5, + "grad_norm": 0.15533076226711273, + "learning_rate": 3.7775551102204412e-06, + "loss": 0.007, + "step": 4625 + }, + { + "epoch": 18.504, + "grad_norm": 0.19388796389102936, + "learning_rate": 3.767535070140281e-06, + "loss": 0.0069, + "step": 4626 + }, + { + "epoch": 18.508, + "grad_norm": 0.17046746611595154, + "learning_rate": 3.757515030060121e-06, + "loss": 0.0067, + "step": 4627 + }, + { + "epoch": 18.512, + "grad_norm": 0.15375731885433197, + "learning_rate": 3.74749498997996e-06, + "loss": 0.0074, + "step": 4628 + }, + { + "epoch": 18.516, + "grad_norm": 0.2605665922164917, + "learning_rate": 
3.7374749498997996e-06, + "loss": 0.0073, + "step": 4629 + }, + { + "epoch": 18.52, + "grad_norm": 0.18455125391483307, + "learning_rate": 3.7274549098196396e-06, + "loss": 0.0081, + "step": 4630 + }, + { + "epoch": 18.524, + "grad_norm": 0.21785873174667358, + "learning_rate": 3.717434869739479e-06, + "loss": 0.0082, + "step": 4631 + }, + { + "epoch": 18.528, + "grad_norm": 0.14746534824371338, + "learning_rate": 3.707414829659319e-06, + "loss": 0.0074, + "step": 4632 + }, + { + "epoch": 18.532, + "grad_norm": 0.1808655709028244, + "learning_rate": 3.6973947895791584e-06, + "loss": 0.0073, + "step": 4633 + }, + { + "epoch": 18.536, + "grad_norm": 0.20626331865787506, + "learning_rate": 3.687374749498998e-06, + "loss": 0.0075, + "step": 4634 + }, + { + "epoch": 18.54, + "grad_norm": 0.19164638221263885, + "learning_rate": 3.677354709418838e-06, + "loss": 0.0073, + "step": 4635 + }, + { + "epoch": 18.544, + "grad_norm": 0.12534405291080475, + "learning_rate": 3.6673346693386775e-06, + "loss": 0.0072, + "step": 4636 + }, + { + "epoch": 18.548000000000002, + "grad_norm": 0.1504947692155838, + "learning_rate": 3.6573146292585176e-06, + "loss": 0.007, + "step": 4637 + }, + { + "epoch": 18.552, + "grad_norm": 0.2330176681280136, + "learning_rate": 3.647294589178357e-06, + "loss": 0.0077, + "step": 4638 + }, + { + "epoch": 18.556, + "grad_norm": 0.20846612751483917, + "learning_rate": 3.6372745490981963e-06, + "loss": 0.0087, + "step": 4639 + }, + { + "epoch": 18.56, + "grad_norm": 0.15900227427482605, + "learning_rate": 3.6272545090180363e-06, + "loss": 0.0062, + "step": 4640 + }, + { + "epoch": 18.564, + "grad_norm": 0.10888917744159698, + "learning_rate": 3.617234468937876e-06, + "loss": 0.0062, + "step": 4641 + }, + { + "epoch": 18.568, + "grad_norm": 0.18537414073944092, + "learning_rate": 3.607214428857716e-06, + "loss": 0.0078, + "step": 4642 + }, + { + "epoch": 18.572, + "grad_norm": 0.2435263842344284, + "learning_rate": 3.5971943887775555e-06, + "loss": 0.008, + 
"step": 4643 + }, + { + "epoch": 18.576, + "grad_norm": 0.1557953655719757, + "learning_rate": 3.5871743486973947e-06, + "loss": 0.0066, + "step": 4644 + }, + { + "epoch": 18.58, + "grad_norm": 0.22416846454143524, + "learning_rate": 3.5771543086172343e-06, + "loss": 0.0078, + "step": 4645 + }, + { + "epoch": 18.584, + "grad_norm": 0.1947488784790039, + "learning_rate": 3.5671342685370743e-06, + "loss": 0.0077, + "step": 4646 + }, + { + "epoch": 18.588, + "grad_norm": 0.1567157357931137, + "learning_rate": 3.5571142284569143e-06, + "loss": 0.0064, + "step": 4647 + }, + { + "epoch": 18.592, + "grad_norm": 0.19222493469715118, + "learning_rate": 3.547094188376754e-06, + "loss": 0.0074, + "step": 4648 + }, + { + "epoch": 18.596, + "grad_norm": 0.11136149615049362, + "learning_rate": 3.537074148296594e-06, + "loss": 0.0041, + "step": 4649 + }, + { + "epoch": 18.6, + "grad_norm": 0.22324241697788239, + "learning_rate": 3.5270541082164326e-06, + "loss": 0.0079, + "step": 4650 + }, + { + "epoch": 18.604, + "grad_norm": 0.19871820509433746, + "learning_rate": 3.5170340681362726e-06, + "loss": 0.008, + "step": 4651 + }, + { + "epoch": 18.608, + "grad_norm": 0.14418363571166992, + "learning_rate": 3.5070140280561122e-06, + "loss": 0.0074, + "step": 4652 + }, + { + "epoch": 18.612, + "grad_norm": 0.10854968428611755, + "learning_rate": 3.4969939879759522e-06, + "loss": 0.0045, + "step": 4653 + }, + { + "epoch": 18.616, + "grad_norm": 0.1685395985841751, + "learning_rate": 3.486973947895792e-06, + "loss": 0.0074, + "step": 4654 + }, + { + "epoch": 18.62, + "grad_norm": 0.1491825133562088, + "learning_rate": 3.476953907815631e-06, + "loss": 0.0081, + "step": 4655 + }, + { + "epoch": 18.624, + "grad_norm": 0.10171408206224442, + "learning_rate": 3.466933867735471e-06, + "loss": 0.0042, + "step": 4656 + }, + { + "epoch": 18.628, + "grad_norm": 0.18516850471496582, + "learning_rate": 3.4569138276553106e-06, + "loss": 0.0076, + "step": 4657 + }, + { + "epoch": 18.632, + 
"grad_norm": 0.192645862698555, + "learning_rate": 3.4468937875751506e-06, + "loss": 0.0067, + "step": 4658 + }, + { + "epoch": 18.636, + "grad_norm": 0.17298291623592377, + "learning_rate": 3.43687374749499e-06, + "loss": 0.0089, + "step": 4659 + }, + { + "epoch": 18.64, + "grad_norm": 0.20988403260707855, + "learning_rate": 3.4268537074148294e-06, + "loss": 0.0089, + "step": 4660 + }, + { + "epoch": 18.644, + "grad_norm": 0.13690654933452606, + "learning_rate": 3.4168336673346694e-06, + "loss": 0.0072, + "step": 4661 + }, + { + "epoch": 18.648, + "grad_norm": 0.29314905405044556, + "learning_rate": 3.406813627254509e-06, + "loss": 0.0084, + "step": 4662 + }, + { + "epoch": 18.652, + "grad_norm": 0.18612822890281677, + "learning_rate": 3.396793587174349e-06, + "loss": 0.0072, + "step": 4663 + }, + { + "epoch": 18.656, + "grad_norm": 0.15141285955905914, + "learning_rate": 3.3867735470941886e-06, + "loss": 0.0066, + "step": 4664 + }, + { + "epoch": 18.66, + "grad_norm": 0.13616497814655304, + "learning_rate": 3.3767535070140286e-06, + "loss": 0.0069, + "step": 4665 + }, + { + "epoch": 18.664, + "grad_norm": 0.2509894073009491, + "learning_rate": 3.3667334669338677e-06, + "loss": 0.008, + "step": 4666 + }, + { + "epoch": 18.668, + "grad_norm": 0.2138143926858902, + "learning_rate": 3.3567134268537073e-06, + "loss": 0.0073, + "step": 4667 + }, + { + "epoch": 18.672, + "grad_norm": 0.16789166629314423, + "learning_rate": 3.3466933867735473e-06, + "loss": 0.0077, + "step": 4668 + }, + { + "epoch": 18.676, + "grad_norm": 0.16509471833705902, + "learning_rate": 3.336673346693387e-06, + "loss": 0.0073, + "step": 4669 + }, + { + "epoch": 18.68, + "grad_norm": 0.2040925770998001, + "learning_rate": 3.326653306613227e-06, + "loss": 0.0071, + "step": 4670 + }, + { + "epoch": 18.684, + "grad_norm": 0.17239294946193695, + "learning_rate": 3.316633266533066e-06, + "loss": 0.0077, + "step": 4671 + }, + { + "epoch": 18.688, + "grad_norm": 0.17981763184070587, + "learning_rate": 
3.3066132264529057e-06, + "loss": 0.0073, + "step": 4672 + }, + { + "epoch": 18.692, + "grad_norm": 0.15703114867210388, + "learning_rate": 3.2965931863727457e-06, + "loss": 0.008, + "step": 4673 + }, + { + "epoch": 18.696, + "grad_norm": 0.17290978133678436, + "learning_rate": 3.2865731462925853e-06, + "loss": 0.0085, + "step": 4674 + }, + { + "epoch": 18.7, + "grad_norm": 0.18817895650863647, + "learning_rate": 3.2765531062124253e-06, + "loss": 0.0083, + "step": 4675 + }, + { + "epoch": 18.704, + "grad_norm": 0.1855512112379074, + "learning_rate": 3.2665330661322645e-06, + "loss": 0.0075, + "step": 4676 + }, + { + "epoch": 18.708, + "grad_norm": 0.2292889952659607, + "learning_rate": 3.256513026052104e-06, + "loss": 0.0081, + "step": 4677 + }, + { + "epoch": 18.712, + "grad_norm": 0.15478086471557617, + "learning_rate": 3.246492985971944e-06, + "loss": 0.0072, + "step": 4678 + }, + { + "epoch": 18.716, + "grad_norm": 0.16357959806919098, + "learning_rate": 3.2364729458917837e-06, + "loss": 0.0077, + "step": 4679 + }, + { + "epoch": 18.72, + "grad_norm": 0.17677243053913116, + "learning_rate": 3.2264529058116237e-06, + "loss": 0.0072, + "step": 4680 + }, + { + "epoch": 18.724, + "grad_norm": 0.23427382111549377, + "learning_rate": 3.2164328657314633e-06, + "loss": 0.0089, + "step": 4681 + }, + { + "epoch": 18.728, + "grad_norm": 0.1855786293745041, + "learning_rate": 3.2064128256513024e-06, + "loss": 0.0069, + "step": 4682 + }, + { + "epoch": 18.732, + "grad_norm": 0.17159710824489594, + "learning_rate": 3.1963927855711424e-06, + "loss": 0.0071, + "step": 4683 + }, + { + "epoch": 18.736, + "grad_norm": 0.18755950033664703, + "learning_rate": 3.186372745490982e-06, + "loss": 0.0075, + "step": 4684 + }, + { + "epoch": 18.74, + "grad_norm": 0.16439513862133026, + "learning_rate": 3.176352705410822e-06, + "loss": 0.0072, + "step": 4685 + }, + { + "epoch": 18.744, + "grad_norm": 0.17373932898044586, + "learning_rate": 3.1663326653306616e-06, + "loss": 0.0072, + "step": 
4686 + }, + { + "epoch": 18.748, + "grad_norm": 0.19658909738063812, + "learning_rate": 3.156312625250501e-06, + "loss": 0.008, + "step": 4687 + }, + { + "epoch": 18.752, + "grad_norm": 0.1813708245754242, + "learning_rate": 3.146292585170341e-06, + "loss": 0.0068, + "step": 4688 + }, + { + "epoch": 18.756, + "grad_norm": 0.15275795757770538, + "learning_rate": 3.1362725450901804e-06, + "loss": 0.0066, + "step": 4689 + }, + { + "epoch": 18.76, + "grad_norm": 0.20497362315654755, + "learning_rate": 3.1262525050100204e-06, + "loss": 0.0081, + "step": 4690 + }, + { + "epoch": 18.764, + "grad_norm": 0.1795947104692459, + "learning_rate": 3.1162324649298596e-06, + "loss": 0.0077, + "step": 4691 + }, + { + "epoch": 18.768, + "grad_norm": 0.14779050648212433, + "learning_rate": 3.1062124248496996e-06, + "loss": 0.0072, + "step": 4692 + }, + { + "epoch": 18.772, + "grad_norm": 0.15518996119499207, + "learning_rate": 3.096192384769539e-06, + "loss": 0.0048, + "step": 4693 + }, + { + "epoch": 18.776, + "grad_norm": 0.1515381932258606, + "learning_rate": 3.0861723446893788e-06, + "loss": 0.0068, + "step": 4694 + }, + { + "epoch": 18.78, + "grad_norm": 0.2062983363866806, + "learning_rate": 3.0761523046092188e-06, + "loss": 0.0085, + "step": 4695 + }, + { + "epoch": 18.784, + "grad_norm": 0.1976705640554428, + "learning_rate": 3.0661322645290584e-06, + "loss": 0.0072, + "step": 4696 + }, + { + "epoch": 18.788, + "grad_norm": 0.13696104288101196, + "learning_rate": 3.056112224448898e-06, + "loss": 0.0065, + "step": 4697 + }, + { + "epoch": 18.792, + "grad_norm": 0.19733761250972748, + "learning_rate": 3.0460921843687375e-06, + "loss": 0.0072, + "step": 4698 + }, + { + "epoch": 18.796, + "grad_norm": 0.17642058432102203, + "learning_rate": 3.036072144288577e-06, + "loss": 0.0078, + "step": 4699 + }, + { + "epoch": 18.8, + "grad_norm": 0.17116330564022064, + "learning_rate": 3.026052104208417e-06, + "loss": 0.0069, + "step": 4700 + }, + { + "epoch": 18.804, + "grad_norm": 
0.2022354155778885, + "learning_rate": 3.0160320641282567e-06, + "loss": 0.0079, + "step": 4701 + }, + { + "epoch": 18.808, + "grad_norm": 0.147251158952713, + "learning_rate": 3.0060120240480963e-06, + "loss": 0.0062, + "step": 4702 + }, + { + "epoch": 18.812, + "grad_norm": 0.1599157601594925, + "learning_rate": 2.995991983967936e-06, + "loss": 0.008, + "step": 4703 + }, + { + "epoch": 18.816, + "grad_norm": 0.12859754264354706, + "learning_rate": 2.985971943887776e-06, + "loss": 0.0073, + "step": 4704 + }, + { + "epoch": 18.82, + "grad_norm": 0.2207632064819336, + "learning_rate": 2.975951903807615e-06, + "loss": 0.0078, + "step": 4705 + }, + { + "epoch": 18.824, + "grad_norm": 0.16606071591377258, + "learning_rate": 2.965931863727455e-06, + "loss": 0.0072, + "step": 4706 + }, + { + "epoch": 18.828, + "grad_norm": 0.1599171757698059, + "learning_rate": 2.9559118236472947e-06, + "loss": 0.008, + "step": 4707 + }, + { + "epoch": 18.832, + "grad_norm": 0.14735926687717438, + "learning_rate": 2.9458917835671343e-06, + "loss": 0.0071, + "step": 4708 + }, + { + "epoch": 18.836, + "grad_norm": 0.25691819190979004, + "learning_rate": 2.9358717434869743e-06, + "loss": 0.0079, + "step": 4709 + }, + { + "epoch": 18.84, + "grad_norm": 0.19598759710788727, + "learning_rate": 2.9258517034068135e-06, + "loss": 0.0076, + "step": 4710 + }, + { + "epoch": 18.844, + "grad_norm": 0.19333742558956146, + "learning_rate": 2.9158316633266535e-06, + "loss": 0.0076, + "step": 4711 + }, + { + "epoch": 18.848, + "grad_norm": 0.1969856470823288, + "learning_rate": 2.905811623246493e-06, + "loss": 0.0075, + "step": 4712 + }, + { + "epoch": 18.852, + "grad_norm": 0.1487724334001541, + "learning_rate": 2.8957915831663326e-06, + "loss": 0.0067, + "step": 4713 + }, + { + "epoch": 18.856, + "grad_norm": 0.1891922503709793, + "learning_rate": 2.8857715430861727e-06, + "loss": 0.007, + "step": 4714 + }, + { + "epoch": 18.86, + "grad_norm": 0.1953631490468979, + "learning_rate": 
2.8757515030060122e-06, + "loss": 0.0077, + "step": 4715 + }, + { + "epoch": 18.864, + "grad_norm": 0.17311617732048035, + "learning_rate": 2.865731462925852e-06, + "loss": 0.0072, + "step": 4716 + }, + { + "epoch": 18.868, + "grad_norm": 0.10396530479192734, + "learning_rate": 2.8557114228456914e-06, + "loss": 0.0041, + "step": 4717 + }, + { + "epoch": 18.872, + "grad_norm": 0.1536823958158493, + "learning_rate": 2.845691382765531e-06, + "loss": 0.008, + "step": 4718 + }, + { + "epoch": 18.876, + "grad_norm": 0.1858183890581131, + "learning_rate": 2.835671342685371e-06, + "loss": 0.0076, + "step": 4719 + }, + { + "epoch": 18.88, + "grad_norm": 0.13414987921714783, + "learning_rate": 2.8256513026052106e-06, + "loss": 0.0068, + "step": 4720 + }, + { + "epoch": 18.884, + "grad_norm": 0.11735019087791443, + "learning_rate": 2.81563126252505e-06, + "loss": 0.0059, + "step": 4721 + }, + { + "epoch": 18.888, + "grad_norm": 0.17751845717430115, + "learning_rate": 2.80561122244489e-06, + "loss": 0.0073, + "step": 4722 + }, + { + "epoch": 18.892, + "grad_norm": 0.22168204188346863, + "learning_rate": 2.79559118236473e-06, + "loss": 0.0083, + "step": 4723 + }, + { + "epoch": 18.896, + "grad_norm": 0.18267209827899933, + "learning_rate": 2.7855711422845694e-06, + "loss": 0.0077, + "step": 4724 + }, + { + "epoch": 18.9, + "grad_norm": 0.1117076501250267, + "learning_rate": 2.775551102204409e-06, + "loss": 0.0069, + "step": 4725 + }, + { + "epoch": 18.904, + "grad_norm": 0.12114907801151276, + "learning_rate": 2.7655310621242486e-06, + "loss": 0.007, + "step": 4726 + }, + { + "epoch": 18.908, + "grad_norm": 0.2513674199581146, + "learning_rate": 2.755511022044088e-06, + "loss": 0.0084, + "step": 4727 + }, + { + "epoch": 18.912, + "grad_norm": 0.1959051787853241, + "learning_rate": 2.745490981963928e-06, + "loss": 0.0073, + "step": 4728 + }, + { + "epoch": 18.916, + "grad_norm": 0.2004287838935852, + "learning_rate": 2.7354709418837678e-06, + "loss": 0.0077, + "step": 4729 + }, 
+ { + "epoch": 18.92, + "grad_norm": 0.17070572078227997, + "learning_rate": 2.7254509018036073e-06, + "loss": 0.0068, + "step": 4730 + }, + { + "epoch": 18.924, + "grad_norm": 0.14630602300167084, + "learning_rate": 2.7154308617234474e-06, + "loss": 0.0071, + "step": 4731 + }, + { + "epoch": 18.928, + "grad_norm": 0.16959349811077118, + "learning_rate": 2.7054108216432865e-06, + "loss": 0.0083, + "step": 4732 + }, + { + "epoch": 18.932, + "grad_norm": 0.15642155706882477, + "learning_rate": 2.6953907815631265e-06, + "loss": 0.0075, + "step": 4733 + }, + { + "epoch": 18.936, + "grad_norm": 0.17925404012203217, + "learning_rate": 2.6853707414829657e-06, + "loss": 0.0065, + "step": 4734 + }, + { + "epoch": 18.94, + "grad_norm": 0.19508227705955505, + "learning_rate": 2.6753507014028057e-06, + "loss": 0.008, + "step": 4735 + }, + { + "epoch": 18.944, + "grad_norm": 0.16290459036827087, + "learning_rate": 2.6653306613226457e-06, + "loss": 0.005, + "step": 4736 + }, + { + "epoch": 18.948, + "grad_norm": 0.15462210774421692, + "learning_rate": 2.655310621242485e-06, + "loss": 0.0078, + "step": 4737 + }, + { + "epoch": 18.951999999999998, + "grad_norm": 0.11808783560991287, + "learning_rate": 2.645290581162325e-06, + "loss": 0.0066, + "step": 4738 + }, + { + "epoch": 18.956, + "grad_norm": 0.20686569809913635, + "learning_rate": 2.6352705410821645e-06, + "loss": 0.0078, + "step": 4739 + }, + { + "epoch": 18.96, + "grad_norm": 0.17440447211265564, + "learning_rate": 2.625250501002004e-06, + "loss": 0.0081, + "step": 4740 + }, + { + "epoch": 18.964, + "grad_norm": 0.337189644575119, + "learning_rate": 2.6152304609218437e-06, + "loss": 0.0079, + "step": 4741 + }, + { + "epoch": 18.968, + "grad_norm": 0.18942351639270782, + "learning_rate": 2.6052104208416833e-06, + "loss": 0.0091, + "step": 4742 + }, + { + "epoch": 18.972, + "grad_norm": 0.10684970766305923, + "learning_rate": 2.5951903807615233e-06, + "loss": 0.0046, + "step": 4743 + }, + { + "epoch": 18.976, + "grad_norm": 
0.22341260313987732, + "learning_rate": 2.585170340681363e-06, + "loss": 0.0077, + "step": 4744 + }, + { + "epoch": 18.98, + "grad_norm": 0.24727453291416168, + "learning_rate": 2.5751503006012024e-06, + "loss": 0.0088, + "step": 4745 + }, + { + "epoch": 18.984, + "grad_norm": 0.19121108949184418, + "learning_rate": 2.565130260521042e-06, + "loss": 0.0073, + "step": 4746 + }, + { + "epoch": 18.988, + "grad_norm": 0.2110673189163208, + "learning_rate": 2.555110220440882e-06, + "loss": 0.0077, + "step": 4747 + }, + { + "epoch": 18.992, + "grad_norm": 0.13829149305820465, + "learning_rate": 2.5450901803607216e-06, + "loss": 0.0069, + "step": 4748 + }, + { + "epoch": 18.996, + "grad_norm": 0.1673704981803894, + "learning_rate": 2.5350701402805612e-06, + "loss": 0.0073, + "step": 4749 + }, + { + "epoch": 19.0, + "grad_norm": 0.18628820776939392, + "learning_rate": 2.5250501002004012e-06, + "loss": 0.0076, + "step": 4750 + }, + { + "epoch": 19.004, + "grad_norm": 0.17194223403930664, + "learning_rate": 2.5150300601202404e-06, + "loss": 0.0065, + "step": 4751 + }, + { + "epoch": 19.008, + "grad_norm": 0.14391052722930908, + "learning_rate": 2.5050100200400804e-06, + "loss": 0.0066, + "step": 4752 + }, + { + "epoch": 19.012, + "grad_norm": 0.1288471221923828, + "learning_rate": 2.49498997995992e-06, + "loss": 0.0064, + "step": 4753 + }, + { + "epoch": 19.016, + "grad_norm": 0.158019557595253, + "learning_rate": 2.4849699398797596e-06, + "loss": 0.007, + "step": 4754 + }, + { + "epoch": 19.02, + "grad_norm": 0.19652923941612244, + "learning_rate": 2.4749498997995996e-06, + "loss": 0.0078, + "step": 4755 + }, + { + "epoch": 19.024, + "grad_norm": 0.1863459348678589, + "learning_rate": 2.4649298597194388e-06, + "loss": 0.0073, + "step": 4756 + }, + { + "epoch": 19.028, + "grad_norm": 0.16609062254428864, + "learning_rate": 2.4549098196392788e-06, + "loss": 0.0073, + "step": 4757 + }, + { + "epoch": 19.032, + "grad_norm": 0.1344534158706665, + "learning_rate": 
2.4448897795591184e-06, + "loss": 0.0062, + "step": 4758 + }, + { + "epoch": 19.036, + "grad_norm": 0.11711183190345764, + "learning_rate": 2.434869739478958e-06, + "loss": 0.0058, + "step": 4759 + }, + { + "epoch": 19.04, + "grad_norm": 0.12532389163970947, + "learning_rate": 2.424849699398798e-06, + "loss": 0.0068, + "step": 4760 + }, + { + "epoch": 19.044, + "grad_norm": 0.14596407115459442, + "learning_rate": 2.414829659318637e-06, + "loss": 0.007, + "step": 4761 + }, + { + "epoch": 19.048, + "grad_norm": 0.13948693871498108, + "learning_rate": 2.404809619238477e-06, + "loss": 0.0067, + "step": 4762 + }, + { + "epoch": 19.052, + "grad_norm": 0.15014758706092834, + "learning_rate": 2.3947895791583167e-06, + "loss": 0.0067, + "step": 4763 + }, + { + "epoch": 19.056, + "grad_norm": 0.11399639397859573, + "learning_rate": 2.3847695390781563e-06, + "loss": 0.0039, + "step": 4764 + }, + { + "epoch": 19.06, + "grad_norm": 0.27429330348968506, + "learning_rate": 2.374749498997996e-06, + "loss": 0.0077, + "step": 4765 + }, + { + "epoch": 19.064, + "grad_norm": 0.1491885483264923, + "learning_rate": 2.364729458917836e-06, + "loss": 0.0069, + "step": 4766 + }, + { + "epoch": 19.068, + "grad_norm": 0.1384284943342209, + "learning_rate": 2.3547094188376755e-06, + "loss": 0.0063, + "step": 4767 + }, + { + "epoch": 19.072, + "grad_norm": 0.15902647376060486, + "learning_rate": 2.344689378757515e-06, + "loss": 0.0076, + "step": 4768 + }, + { + "epoch": 19.076, + "grad_norm": 0.17550063133239746, + "learning_rate": 2.3346693386773547e-06, + "loss": 0.0075, + "step": 4769 + }, + { + "epoch": 19.08, + "grad_norm": 0.11770374327898026, + "learning_rate": 2.3246492985971943e-06, + "loss": 0.0065, + "step": 4770 + }, + { + "epoch": 19.084, + "grad_norm": 0.13193221390247345, + "learning_rate": 2.3146292585170343e-06, + "loss": 0.0064, + "step": 4771 + }, + { + "epoch": 19.088, + "grad_norm": 0.20296931266784668, + "learning_rate": 2.304609218436874e-06, + "loss": 0.0082, + "step": 
4772 + }, + { + "epoch": 19.092, + "grad_norm": 0.17178785800933838, + "learning_rate": 2.2945891783567135e-06, + "loss": 0.007, + "step": 4773 + }, + { + "epoch": 19.096, + "grad_norm": 0.18985581398010254, + "learning_rate": 2.2845691382765535e-06, + "loss": 0.0064, + "step": 4774 + }, + { + "epoch": 19.1, + "grad_norm": 0.180537611246109, + "learning_rate": 2.2745490981963926e-06, + "loss": 0.0069, + "step": 4775 + }, + { + "epoch": 19.104, + "grad_norm": 0.13604559004306793, + "learning_rate": 2.2645290581162327e-06, + "loss": 0.0045, + "step": 4776 + }, + { + "epoch": 19.108, + "grad_norm": 0.13354989886283875, + "learning_rate": 2.2545090180360722e-06, + "loss": 0.0061, + "step": 4777 + }, + { + "epoch": 19.112, + "grad_norm": 0.1494477540254593, + "learning_rate": 2.244488977955912e-06, + "loss": 0.0069, + "step": 4778 + }, + { + "epoch": 19.116, + "grad_norm": 0.22611655294895172, + "learning_rate": 2.234468937875752e-06, + "loss": 0.008, + "step": 4779 + }, + { + "epoch": 19.12, + "grad_norm": 0.11629709601402283, + "learning_rate": 2.224448897795591e-06, + "loss": 0.0045, + "step": 4780 + }, + { + "epoch": 19.124, + "grad_norm": 0.16238947212696075, + "learning_rate": 2.214428857715431e-06, + "loss": 0.007, + "step": 4781 + }, + { + "epoch": 19.128, + "grad_norm": 0.15134558081626892, + "learning_rate": 2.2044088176352706e-06, + "loss": 0.0072, + "step": 4782 + }, + { + "epoch": 19.132, + "grad_norm": 0.1289103478193283, + "learning_rate": 2.19438877755511e-06, + "loss": 0.0056, + "step": 4783 + }, + { + "epoch": 19.136, + "grad_norm": 0.13187222182750702, + "learning_rate": 2.18436873747495e-06, + "loss": 0.0062, + "step": 4784 + }, + { + "epoch": 19.14, + "grad_norm": 0.16803032159805298, + "learning_rate": 2.17434869739479e-06, + "loss": 0.0072, + "step": 4785 + }, + { + "epoch": 19.144, + "grad_norm": 0.16196125745773315, + "learning_rate": 2.1643286573146294e-06, + "loss": 0.0071, + "step": 4786 + }, + { + "epoch": 19.148, + "grad_norm": 
0.14935888350009918, + "learning_rate": 2.154308617234469e-06, + "loss": 0.007, + "step": 4787 + }, + { + "epoch": 19.152, + "grad_norm": 0.17535895109176636, + "learning_rate": 2.1442885771543086e-06, + "loss": 0.007, + "step": 4788 + }, + { + "epoch": 19.156, + "grad_norm": 0.12928767502307892, + "learning_rate": 2.1342685370741486e-06, + "loss": 0.0065, + "step": 4789 + }, + { + "epoch": 19.16, + "grad_norm": 0.1758950650691986, + "learning_rate": 2.124248496993988e-06, + "loss": 0.0064, + "step": 4790 + }, + { + "epoch": 19.164, + "grad_norm": 0.1001003235578537, + "learning_rate": 2.1142284569138278e-06, + "loss": 0.0058, + "step": 4791 + }, + { + "epoch": 19.168, + "grad_norm": 0.17448782920837402, + "learning_rate": 2.1042084168336673e-06, + "loss": 0.0064, + "step": 4792 + }, + { + "epoch": 19.172, + "grad_norm": 0.16154441237449646, + "learning_rate": 2.0941883767535074e-06, + "loss": 0.0072, + "step": 4793 + }, + { + "epoch": 19.176, + "grad_norm": 0.16718144714832306, + "learning_rate": 2.0841683366733465e-06, + "loss": 0.007, + "step": 4794 + }, + { + "epoch": 19.18, + "grad_norm": 0.16893278062343597, + "learning_rate": 2.0741482965931865e-06, + "loss": 0.0077, + "step": 4795 + }, + { + "epoch": 19.184, + "grad_norm": 0.14809642732143402, + "learning_rate": 2.064128256513026e-06, + "loss": 0.0062, + "step": 4796 + }, + { + "epoch": 19.188, + "grad_norm": 0.16328303515911102, + "learning_rate": 2.0541082164328657e-06, + "loss": 0.0067, + "step": 4797 + }, + { + "epoch": 19.192, + "grad_norm": 0.17468783259391785, + "learning_rate": 2.0440881763527057e-06, + "loss": 0.0071, + "step": 4798 + }, + { + "epoch": 19.196, + "grad_norm": 0.1770590990781784, + "learning_rate": 2.034068136272545e-06, + "loss": 0.0073, + "step": 4799 + }, + { + "epoch": 19.2, + "grad_norm": 0.20840556919574738, + "learning_rate": 2.024048096192385e-06, + "loss": 0.0077, + "step": 4800 + }, + { + "epoch": 19.204, + "grad_norm": 0.16028790175914764, + "learning_rate": 
2.0140280561122245e-06, + "loss": 0.0075, + "step": 4801 + }, + { + "epoch": 19.208, + "grad_norm": 0.10381535440683365, + "learning_rate": 2.004008016032064e-06, + "loss": 0.0042, + "step": 4802 + }, + { + "epoch": 19.212, + "grad_norm": 0.1724594235420227, + "learning_rate": 1.993987975951904e-06, + "loss": 0.0079, + "step": 4803 + }, + { + "epoch": 19.216, + "grad_norm": 0.17494012415409088, + "learning_rate": 1.9839679358717433e-06, + "loss": 0.0069, + "step": 4804 + }, + { + "epoch": 19.22, + "grad_norm": 0.16234153509140015, + "learning_rate": 1.9739478957915833e-06, + "loss": 0.0069, + "step": 4805 + }, + { + "epoch": 19.224, + "grad_norm": 0.15438240766525269, + "learning_rate": 1.963927855711423e-06, + "loss": 0.0073, + "step": 4806 + }, + { + "epoch": 19.228, + "grad_norm": 0.16121751070022583, + "learning_rate": 1.9539078156312624e-06, + "loss": 0.0074, + "step": 4807 + }, + { + "epoch": 19.232, + "grad_norm": 0.15929311513900757, + "learning_rate": 1.9438877755511025e-06, + "loss": 0.0073, + "step": 4808 + }, + { + "epoch": 19.236, + "grad_norm": 0.1370072364807129, + "learning_rate": 1.933867735470942e-06, + "loss": 0.007, + "step": 4809 + }, + { + "epoch": 19.24, + "grad_norm": 0.15893737971782684, + "learning_rate": 1.9238476953907816e-06, + "loss": 0.0063, + "step": 4810 + }, + { + "epoch": 19.244, + "grad_norm": 0.19493524730205536, + "learning_rate": 1.9138276553106212e-06, + "loss": 0.0072, + "step": 4811 + }, + { + "epoch": 19.248, + "grad_norm": 0.1659090220928192, + "learning_rate": 1.9038076152304612e-06, + "loss": 0.0069, + "step": 4812 + }, + { + "epoch": 19.252, + "grad_norm": 0.14420440793037415, + "learning_rate": 1.8937875751503006e-06, + "loss": 0.0065, + "step": 4813 + }, + { + "epoch": 19.256, + "grad_norm": 0.16298946738243103, + "learning_rate": 1.8837675350701404e-06, + "loss": 0.0065, + "step": 4814 + }, + { + "epoch": 19.26, + "grad_norm": 0.13460707664489746, + "learning_rate": 1.87374749498998e-06, + "loss": 0.0069, + "step": 
4815 + }, + { + "epoch": 19.264, + "grad_norm": 0.17782555520534515, + "learning_rate": 1.8637274549098198e-06, + "loss": 0.0078, + "step": 4816 + }, + { + "epoch": 19.268, + "grad_norm": 0.21626275777816772, + "learning_rate": 1.8537074148296596e-06, + "loss": 0.0083, + "step": 4817 + }, + { + "epoch": 19.272, + "grad_norm": 0.07763174921274185, + "learning_rate": 1.843687374749499e-06, + "loss": 0.0036, + "step": 4818 + }, + { + "epoch": 19.276, + "grad_norm": 0.19351311028003693, + "learning_rate": 1.8336673346693388e-06, + "loss": 0.0074, + "step": 4819 + }, + { + "epoch": 19.28, + "grad_norm": 0.18536561727523804, + "learning_rate": 1.8236472945891786e-06, + "loss": 0.0074, + "step": 4820 + }, + { + "epoch": 19.284, + "grad_norm": 0.15970304608345032, + "learning_rate": 1.8136272545090182e-06, + "loss": 0.0066, + "step": 4821 + }, + { + "epoch": 19.288, + "grad_norm": 0.14813347160816193, + "learning_rate": 1.803607214428858e-06, + "loss": 0.0061, + "step": 4822 + }, + { + "epoch": 19.292, + "grad_norm": 0.1950247585773468, + "learning_rate": 1.7935871743486973e-06, + "loss": 0.0076, + "step": 4823 + }, + { + "epoch": 19.296, + "grad_norm": 0.19264726340770721, + "learning_rate": 1.7835671342685371e-06, + "loss": 0.0069, + "step": 4824 + }, + { + "epoch": 19.3, + "grad_norm": 0.19691282510757446, + "learning_rate": 1.773547094188377e-06, + "loss": 0.0067, + "step": 4825 + }, + { + "epoch": 19.304, + "grad_norm": 0.1652364283800125, + "learning_rate": 1.7635270541082163e-06, + "loss": 0.0065, + "step": 4826 + }, + { + "epoch": 19.308, + "grad_norm": 0.18058018386363983, + "learning_rate": 1.7535070140280561e-06, + "loss": 0.008, + "step": 4827 + }, + { + "epoch": 19.312, + "grad_norm": 0.1553662121295929, + "learning_rate": 1.743486973947896e-06, + "loss": 0.0067, + "step": 4828 + }, + { + "epoch": 19.316, + "grad_norm": 0.12730355560779572, + "learning_rate": 1.7334669338677355e-06, + "loss": 0.0068, + "step": 4829 + }, + { + "epoch": 19.32, + "grad_norm": 
0.1950482428073883, + "learning_rate": 1.7234468937875753e-06, + "loss": 0.008, + "step": 4830 + }, + { + "epoch": 19.324, + "grad_norm": 0.17624934017658234, + "learning_rate": 1.7134268537074147e-06, + "loss": 0.0076, + "step": 4831 + }, + { + "epoch": 19.328, + "grad_norm": 0.1288178265094757, + "learning_rate": 1.7034068136272545e-06, + "loss": 0.0068, + "step": 4832 + }, + { + "epoch": 19.332, + "grad_norm": 0.1886533796787262, + "learning_rate": 1.6933867735470943e-06, + "loss": 0.0063, + "step": 4833 + }, + { + "epoch": 19.336, + "grad_norm": 0.16468536853790283, + "learning_rate": 1.6833667334669339e-06, + "loss": 0.0069, + "step": 4834 + }, + { + "epoch": 19.34, + "grad_norm": 0.17551806569099426, + "learning_rate": 1.6733466933867737e-06, + "loss": 0.0077, + "step": 4835 + }, + { + "epoch": 19.344, + "grad_norm": 0.1710132509469986, + "learning_rate": 1.6633266533066135e-06, + "loss": 0.0076, + "step": 4836 + }, + { + "epoch": 19.348, + "grad_norm": 0.17541347444057465, + "learning_rate": 1.6533066132264529e-06, + "loss": 0.0068, + "step": 4837 + }, + { + "epoch": 19.352, + "grad_norm": 0.1492847353219986, + "learning_rate": 1.6432865731462927e-06, + "loss": 0.0067, + "step": 4838 + }, + { + "epoch": 19.356, + "grad_norm": 0.17386400699615479, + "learning_rate": 1.6332665330661322e-06, + "loss": 0.0074, + "step": 4839 + }, + { + "epoch": 19.36, + "grad_norm": 0.15245819091796875, + "learning_rate": 1.623246492985972e-06, + "loss": 0.0061, + "step": 4840 + }, + { + "epoch": 19.364, + "grad_norm": 0.18433190882205963, + "learning_rate": 1.6132264529058118e-06, + "loss": 0.007, + "step": 4841 + }, + { + "epoch": 19.368, + "grad_norm": 0.16951580345630646, + "learning_rate": 1.6032064128256512e-06, + "loss": 0.0069, + "step": 4842 + }, + { + "epoch": 19.372, + "grad_norm": 0.1776171624660492, + "learning_rate": 1.593186372745491e-06, + "loss": 0.0068, + "step": 4843 + }, + { + "epoch": 19.376, + "grad_norm": 0.2017744779586792, + "learning_rate": 
1.5831663326653308e-06, + "loss": 0.0069, + "step": 4844 + }, + { + "epoch": 19.38, + "grad_norm": 0.22619450092315674, + "learning_rate": 1.5731462925851704e-06, + "loss": 0.0071, + "step": 4845 + }, + { + "epoch": 19.384, + "grad_norm": 0.1772204488515854, + "learning_rate": 1.5631262525050102e-06, + "loss": 0.0069, + "step": 4846 + }, + { + "epoch": 19.388, + "grad_norm": 0.2213824987411499, + "learning_rate": 1.5531062124248498e-06, + "loss": 0.0075, + "step": 4847 + }, + { + "epoch": 19.392, + "grad_norm": 0.21730723977088928, + "learning_rate": 1.5430861723446894e-06, + "loss": 0.0078, + "step": 4848 + }, + { + "epoch": 19.396, + "grad_norm": 0.13313718140125275, + "learning_rate": 1.5330661322645292e-06, + "loss": 0.0063, + "step": 4849 + }, + { + "epoch": 19.4, + "grad_norm": 0.23904912173748016, + "learning_rate": 1.5230460921843688e-06, + "loss": 0.0074, + "step": 4850 + }, + { + "epoch": 19.404, + "grad_norm": 0.1711878627538681, + "learning_rate": 1.5130260521042086e-06, + "loss": 0.0072, + "step": 4851 + }, + { + "epoch": 19.408, + "grad_norm": 0.19314146041870117, + "learning_rate": 1.5030060120240482e-06, + "loss": 0.0066, + "step": 4852 + }, + { + "epoch": 19.412, + "grad_norm": 0.16302086412906647, + "learning_rate": 1.492985971943888e-06, + "loss": 0.0078, + "step": 4853 + }, + { + "epoch": 19.416, + "grad_norm": 0.15653206408023834, + "learning_rate": 1.4829659318637276e-06, + "loss": 0.0067, + "step": 4854 + }, + { + "epoch": 19.42, + "grad_norm": 0.12465237826108932, + "learning_rate": 1.4729458917835671e-06, + "loss": 0.0064, + "step": 4855 + }, + { + "epoch": 19.424, + "grad_norm": 0.19219818711280823, + "learning_rate": 1.4629258517034067e-06, + "loss": 0.0075, + "step": 4856 + }, + { + "epoch": 19.428, + "grad_norm": 0.16947104036808014, + "learning_rate": 1.4529058116232465e-06, + "loss": 0.0056, + "step": 4857 + }, + { + "epoch": 19.432, + "grad_norm": 0.16427640616893768, + "learning_rate": 1.4428857715430863e-06, + "loss": 0.0068, + 
"step": 4858 + }, + { + "epoch": 19.436, + "grad_norm": 0.24073843657970428, + "learning_rate": 1.432865731462926e-06, + "loss": 0.0071, + "step": 4859 + }, + { + "epoch": 19.44, + "grad_norm": 0.22962218523025513, + "learning_rate": 1.4228456913827655e-06, + "loss": 0.008, + "step": 4860 + }, + { + "epoch": 19.444, + "grad_norm": 0.1950879544019699, + "learning_rate": 1.4128256513026053e-06, + "loss": 0.0067, + "step": 4861 + }, + { + "epoch": 19.448, + "grad_norm": 0.16316533088684082, + "learning_rate": 1.402805611222445e-06, + "loss": 0.0074, + "step": 4862 + }, + { + "epoch": 19.452, + "grad_norm": 0.18425200879573822, + "learning_rate": 1.3927855711422847e-06, + "loss": 0.0075, + "step": 4863 + }, + { + "epoch": 19.456, + "grad_norm": 0.1857520490884781, + "learning_rate": 1.3827655310621243e-06, + "loss": 0.0071, + "step": 4864 + }, + { + "epoch": 19.46, + "grad_norm": 0.20833329856395721, + "learning_rate": 1.372745490981964e-06, + "loss": 0.007, + "step": 4865 + }, + { + "epoch": 19.464, + "grad_norm": 0.21447627246379852, + "learning_rate": 1.3627254509018037e-06, + "loss": 0.0074, + "step": 4866 + }, + { + "epoch": 19.468, + "grad_norm": 0.14376424252986908, + "learning_rate": 1.3527054108216433e-06, + "loss": 0.0063, + "step": 4867 + }, + { + "epoch": 19.472, + "grad_norm": 0.17494480311870575, + "learning_rate": 1.3426853707414828e-06, + "loss": 0.0075, + "step": 4868 + }, + { + "epoch": 19.476, + "grad_norm": 0.19328361749649048, + "learning_rate": 1.3326653306613229e-06, + "loss": 0.008, + "step": 4869 + }, + { + "epoch": 19.48, + "grad_norm": 0.15253637731075287, + "learning_rate": 1.3226452905811624e-06, + "loss": 0.0076, + "step": 4870 + }, + { + "epoch": 19.484, + "grad_norm": 0.15423879027366638, + "learning_rate": 1.312625250501002e-06, + "loss": 0.0066, + "step": 4871 + }, + { + "epoch": 19.488, + "grad_norm": 0.16223280131816864, + "learning_rate": 1.3026052104208416e-06, + "loss": 0.0071, + "step": 4872 + }, + { + "epoch": 19.492, + 
"grad_norm": 0.14479829370975494, + "learning_rate": 1.2925851703406814e-06, + "loss": 0.0065, + "step": 4873 + }, + { + "epoch": 19.496, + "grad_norm": 0.1785711795091629, + "learning_rate": 1.282565130260521e-06, + "loss": 0.0078, + "step": 4874 + }, + { + "epoch": 19.5, + "grad_norm": 0.14600437879562378, + "learning_rate": 1.2725450901803608e-06, + "loss": 0.0064, + "step": 4875 + }, + { + "epoch": 19.504, + "grad_norm": 0.19797396659851074, + "learning_rate": 1.2625250501002006e-06, + "loss": 0.0069, + "step": 4876 + }, + { + "epoch": 19.508, + "grad_norm": 0.19664421677589417, + "learning_rate": 1.2525050100200402e-06, + "loss": 0.0082, + "step": 4877 + }, + { + "epoch": 19.512, + "grad_norm": 0.20880340039730072, + "learning_rate": 1.2424849699398798e-06, + "loss": 0.0082, + "step": 4878 + }, + { + "epoch": 19.516, + "grad_norm": 0.14585073292255402, + "learning_rate": 1.2324649298597194e-06, + "loss": 0.0066, + "step": 4879 + }, + { + "epoch": 19.52, + "grad_norm": 0.2517703175544739, + "learning_rate": 1.2224448897795592e-06, + "loss": 0.0073, + "step": 4880 + }, + { + "epoch": 19.524, + "grad_norm": 0.18812379240989685, + "learning_rate": 1.212424849699399e-06, + "loss": 0.008, + "step": 4881 + }, + { + "epoch": 19.528, + "grad_norm": 0.15317322313785553, + "learning_rate": 1.2024048096192386e-06, + "loss": 0.0064, + "step": 4882 + }, + { + "epoch": 19.532, + "grad_norm": 0.22038224339485168, + "learning_rate": 1.1923847695390782e-06, + "loss": 0.0072, + "step": 4883 + }, + { + "epoch": 19.536, + "grad_norm": 0.16358880698680878, + "learning_rate": 1.182364729458918e-06, + "loss": 0.0067, + "step": 4884 + }, + { + "epoch": 19.54, + "grad_norm": 0.17958605289459229, + "learning_rate": 1.1723446893787575e-06, + "loss": 0.0065, + "step": 4885 + }, + { + "epoch": 19.544, + "grad_norm": 0.1586514711380005, + "learning_rate": 1.1623246492985971e-06, + "loss": 0.0061, + "step": 4886 + }, + { + "epoch": 19.548000000000002, + "grad_norm": 0.08892928808927536, + 
"learning_rate": 1.152304609218437e-06, + "loss": 0.0023, + "step": 4887 + }, + { + "epoch": 19.552, + "grad_norm": 0.13370400667190552, + "learning_rate": 1.1422845691382767e-06, + "loss": 0.0057, + "step": 4888 + }, + { + "epoch": 19.556, + "grad_norm": 0.17751452326774597, + "learning_rate": 1.1322645290581163e-06, + "loss": 0.0066, + "step": 4889 + }, + { + "epoch": 19.56, + "grad_norm": 0.20876137912273407, + "learning_rate": 1.122244488977956e-06, + "loss": 0.0066, + "step": 4890 + }, + { + "epoch": 19.564, + "grad_norm": 0.19675932824611664, + "learning_rate": 1.1122244488977955e-06, + "loss": 0.0083, + "step": 4891 + }, + { + "epoch": 19.568, + "grad_norm": 0.157494455575943, + "learning_rate": 1.1022044088176353e-06, + "loss": 0.0071, + "step": 4892 + }, + { + "epoch": 19.572, + "grad_norm": 0.1072336882352829, + "learning_rate": 1.092184368737475e-06, + "loss": 0.0032, + "step": 4893 + }, + { + "epoch": 19.576, + "grad_norm": 0.19063591957092285, + "learning_rate": 1.0821643286573147e-06, + "loss": 0.0073, + "step": 4894 + }, + { + "epoch": 19.58, + "grad_norm": 0.1833014339208603, + "learning_rate": 1.0721442885771543e-06, + "loss": 0.0076, + "step": 4895 + }, + { + "epoch": 19.584, + "grad_norm": 0.19544434547424316, + "learning_rate": 1.062124248496994e-06, + "loss": 0.0083, + "step": 4896 + }, + { + "epoch": 19.588, + "grad_norm": 0.19219093024730682, + "learning_rate": 1.0521042084168337e-06, + "loss": 0.0072, + "step": 4897 + }, + { + "epoch": 19.592, + "grad_norm": 0.18003998696804047, + "learning_rate": 1.0420841683366733e-06, + "loss": 0.0075, + "step": 4898 + }, + { + "epoch": 19.596, + "grad_norm": 0.14643754065036774, + "learning_rate": 1.032064128256513e-06, + "loss": 0.0074, + "step": 4899 + }, + { + "epoch": 19.6, + "grad_norm": 0.14837734401226044, + "learning_rate": 1.0220440881763529e-06, + "loss": 0.0069, + "step": 4900 + }, + { + "epoch": 19.604, + "grad_norm": 0.21540893614292145, + "learning_rate": 1.0120240480961924e-06, + "loss": 
0.0069, + "step": 4901 + }, + { + "epoch": 19.608, + "grad_norm": 0.15125292539596558, + "learning_rate": 1.002004008016032e-06, + "loss": 0.0072, + "step": 4902 + }, + { + "epoch": 19.612, + "grad_norm": 0.2173326164484024, + "learning_rate": 9.919839679358716e-07, + "loss": 0.0074, + "step": 4903 + }, + { + "epoch": 19.616, + "grad_norm": 0.22422145307064056, + "learning_rate": 9.819639278557114e-07, + "loss": 0.0076, + "step": 4904 + }, + { + "epoch": 19.62, + "grad_norm": 0.21010397374629974, + "learning_rate": 9.719438877755512e-07, + "loss": 0.0074, + "step": 4905 + }, + { + "epoch": 19.624, + "grad_norm": 0.16973888874053955, + "learning_rate": 9.619238476953908e-07, + "loss": 0.0065, + "step": 4906 + }, + { + "epoch": 19.628, + "grad_norm": 0.19270877540111542, + "learning_rate": 9.519038076152306e-07, + "loss": 0.0068, + "step": 4907 + }, + { + "epoch": 19.632, + "grad_norm": 0.14899659156799316, + "learning_rate": 9.418837675350702e-07, + "loss": 0.0074, + "step": 4908 + }, + { + "epoch": 19.636, + "grad_norm": 0.18225519359111786, + "learning_rate": 9.318637274549099e-07, + "loss": 0.0065, + "step": 4909 + }, + { + "epoch": 19.64, + "grad_norm": 0.175700843334198, + "learning_rate": 9.218436873747495e-07, + "loss": 0.0075, + "step": 4910 + }, + { + "epoch": 19.644, + "grad_norm": 0.1648159772157669, + "learning_rate": 9.118236472945893e-07, + "loss": 0.0073, + "step": 4911 + }, + { + "epoch": 19.648, + "grad_norm": 0.22431142628192902, + "learning_rate": 9.01803607214429e-07, + "loss": 0.0073, + "step": 4912 + }, + { + "epoch": 19.652, + "grad_norm": 0.19765180349349976, + "learning_rate": 8.917835671342686e-07, + "loss": 0.0072, + "step": 4913 + }, + { + "epoch": 19.656, + "grad_norm": 0.16144563257694244, + "learning_rate": 8.817635270541082e-07, + "loss": 0.007, + "step": 4914 + }, + { + "epoch": 19.66, + "grad_norm": 0.16720423102378845, + "learning_rate": 8.71743486973948e-07, + "loss": 0.0068, + "step": 4915 + }, + { + "epoch": 19.664, + 
"grad_norm": 0.201465904712677, + "learning_rate": 8.617234468937877e-07, + "loss": 0.0077, + "step": 4916 + }, + { + "epoch": 19.668, + "grad_norm": 0.16798268258571625, + "learning_rate": 8.517034068136272e-07, + "loss": 0.0063, + "step": 4917 + }, + { + "epoch": 19.672, + "grad_norm": 0.1629098355770111, + "learning_rate": 8.416833667334669e-07, + "loss": 0.0075, + "step": 4918 + }, + { + "epoch": 19.676, + "grad_norm": 0.165368914604187, + "learning_rate": 8.316633266533067e-07, + "loss": 0.0068, + "step": 4919 + }, + { + "epoch": 19.68, + "grad_norm": 0.21303412318229675, + "learning_rate": 8.216432865731463e-07, + "loss": 0.0078, + "step": 4920 + }, + { + "epoch": 19.684, + "grad_norm": 0.16514278948307037, + "learning_rate": 8.11623246492986e-07, + "loss": 0.0066, + "step": 4921 + }, + { + "epoch": 19.688, + "grad_norm": 0.14475776255130768, + "learning_rate": 8.016032064128256e-07, + "loss": 0.007, + "step": 4922 + }, + { + "epoch": 19.692, + "grad_norm": 0.2859732508659363, + "learning_rate": 7.915831663326654e-07, + "loss": 0.008, + "step": 4923 + }, + { + "epoch": 19.696, + "grad_norm": 0.2173309475183487, + "learning_rate": 7.815631262525051e-07, + "loss": 0.0079, + "step": 4924 + }, + { + "epoch": 19.7, + "grad_norm": 0.14563588798046112, + "learning_rate": 7.715430861723447e-07, + "loss": 0.0072, + "step": 4925 + }, + { + "epoch": 19.704, + "grad_norm": 0.1795637160539627, + "learning_rate": 7.615230460921844e-07, + "loss": 0.007, + "step": 4926 + }, + { + "epoch": 19.708, + "grad_norm": 0.1522742211818695, + "learning_rate": 7.515030060120241e-07, + "loss": 0.0062, + "step": 4927 + }, + { + "epoch": 19.712, + "grad_norm": 0.23236194252967834, + "learning_rate": 7.414829659318638e-07, + "loss": 0.0078, + "step": 4928 + }, + { + "epoch": 19.716, + "grad_norm": 0.15131531655788422, + "learning_rate": 7.314629258517034e-07, + "loss": 0.0063, + "step": 4929 + }, + { + "epoch": 19.72, + "grad_norm": 0.15103590488433838, + "learning_rate": 
7.214428857715432e-07, + "loss": 0.0069, + "step": 4930 + }, + { + "epoch": 19.724, + "grad_norm": 0.15178599953651428, + "learning_rate": 7.114228456913828e-07, + "loss": 0.0066, + "step": 4931 + }, + { + "epoch": 19.728, + "grad_norm": 0.22268012166023254, + "learning_rate": 7.014028056112224e-07, + "loss": 0.0079, + "step": 4932 + }, + { + "epoch": 19.732, + "grad_norm": 0.1779235601425171, + "learning_rate": 6.913827655310621e-07, + "loss": 0.0064, + "step": 4933 + }, + { + "epoch": 19.736, + "grad_norm": 0.17940625548362732, + "learning_rate": 6.813627254509018e-07, + "loss": 0.007, + "step": 4934 + }, + { + "epoch": 19.74, + "grad_norm": 0.17642860114574432, + "learning_rate": 6.713426853707414e-07, + "loss": 0.0062, + "step": 4935 + }, + { + "epoch": 19.744, + "grad_norm": 0.18925072252750397, + "learning_rate": 6.613226452905812e-07, + "loss": 0.0079, + "step": 4936 + }, + { + "epoch": 19.748, + "grad_norm": 0.1245235726237297, + "learning_rate": 6.513026052104208e-07, + "loss": 0.0063, + "step": 4937 + }, + { + "epoch": 19.752, + "grad_norm": 0.21628788113594055, + "learning_rate": 6.412825651302605e-07, + "loss": 0.0081, + "step": 4938 + }, + { + "epoch": 19.756, + "grad_norm": 0.13739117980003357, + "learning_rate": 6.312625250501003e-07, + "loss": 0.0041, + "step": 4939 + }, + { + "epoch": 19.76, + "grad_norm": 0.18183039128780365, + "learning_rate": 6.212424849699399e-07, + "loss": 0.0064, + "step": 4940 + }, + { + "epoch": 19.764, + "grad_norm": 0.18417534232139587, + "learning_rate": 6.112224448897796e-07, + "loss": 0.0076, + "step": 4941 + }, + { + "epoch": 19.768, + "grad_norm": 0.18779101967811584, + "learning_rate": 6.012024048096193e-07, + "loss": 0.0076, + "step": 4942 + }, + { + "epoch": 19.772, + "grad_norm": 0.1617640256881714, + "learning_rate": 5.91182364729459e-07, + "loss": 0.0065, + "step": 4943 + }, + { + "epoch": 19.776, + "grad_norm": 0.15822680294513702, + "learning_rate": 5.811623246492986e-07, + "loss": 0.006, + "step": 4944 + }, 
+ { + "epoch": 19.78, + "grad_norm": 0.17007651925086975, + "learning_rate": 5.711422845691384e-07, + "loss": 0.0069, + "step": 4945 + }, + { + "epoch": 19.784, + "grad_norm": 0.1721976101398468, + "learning_rate": 5.61122244488978e-07, + "loss": 0.0071, + "step": 4946 + }, + { + "epoch": 19.788, + "grad_norm": 0.1397552639245987, + "learning_rate": 5.511022044088177e-07, + "loss": 0.0069, + "step": 4947 + }, + { + "epoch": 19.792, + "grad_norm": 0.18421714007854462, + "learning_rate": 5.410821643286573e-07, + "loss": 0.0048, + "step": 4948 + }, + { + "epoch": 19.796, + "grad_norm": 0.16504515707492828, + "learning_rate": 5.31062124248497e-07, + "loss": 0.0066, + "step": 4949 + }, + { + "epoch": 19.8, + "grad_norm": 0.18747548758983612, + "learning_rate": 5.210420841683366e-07, + "loss": 0.0081, + "step": 4950 + }, + { + "epoch": 19.804, + "grad_norm": 0.17722636461257935, + "learning_rate": 5.110220440881764e-07, + "loss": 0.0071, + "step": 4951 + }, + { + "epoch": 19.808, + "grad_norm": 0.14402155578136444, + "learning_rate": 5.01002004008016e-07, + "loss": 0.0061, + "step": 4952 + }, + { + "epoch": 19.812, + "grad_norm": 0.2421988993883133, + "learning_rate": 4.909819639278557e-07, + "loss": 0.0075, + "step": 4953 + }, + { + "epoch": 19.816, + "grad_norm": 0.1611681580543518, + "learning_rate": 4.809619238476954e-07, + "loss": 0.0063, + "step": 4954 + }, + { + "epoch": 19.82, + "grad_norm": 0.17111510038375854, + "learning_rate": 4.709418837675351e-07, + "loss": 0.0064, + "step": 4955 + }, + { + "epoch": 19.824, + "grad_norm": 0.2002701759338379, + "learning_rate": 4.6092184368737474e-07, + "loss": 0.0066, + "step": 4956 + }, + { + "epoch": 19.828, + "grad_norm": 0.23351456224918365, + "learning_rate": 4.509018036072145e-07, + "loss": 0.0073, + "step": 4957 + }, + { + "epoch": 19.832, + "grad_norm": 0.12465520948171616, + "learning_rate": 4.408817635270541e-07, + "loss": 0.0057, + "step": 4958 + }, + { + "epoch": 19.836, + "grad_norm": 0.2019532173871994, + 
"learning_rate": 4.3086172344689383e-07, + "loss": 0.0068, + "step": 4959 + }, + { + "epoch": 19.84, + "grad_norm": 0.1522103101015091, + "learning_rate": 4.2084168336673347e-07, + "loss": 0.0062, + "step": 4960 + }, + { + "epoch": 19.844, + "grad_norm": 0.22643589973449707, + "learning_rate": 4.1082164328657316e-07, + "loss": 0.0079, + "step": 4961 + }, + { + "epoch": 19.848, + "grad_norm": 0.1775081604719162, + "learning_rate": 4.008016032064128e-07, + "loss": 0.0065, + "step": 4962 + }, + { + "epoch": 19.852, + "grad_norm": 0.17318783700466156, + "learning_rate": 3.9078156312625255e-07, + "loss": 0.007, + "step": 4963 + }, + { + "epoch": 19.856, + "grad_norm": 0.20053575932979584, + "learning_rate": 3.807615230460922e-07, + "loss": 0.0074, + "step": 4964 + }, + { + "epoch": 19.86, + "grad_norm": 0.2197975367307663, + "learning_rate": 3.707414829659319e-07, + "loss": 0.0077, + "step": 4965 + }, + { + "epoch": 19.864, + "grad_norm": 0.11314887553453445, + "learning_rate": 3.607214428857716e-07, + "loss": 0.0039, + "step": 4966 + }, + { + "epoch": 19.868, + "grad_norm": 0.23044568300247192, + "learning_rate": 3.507014028056112e-07, + "loss": 0.0081, + "step": 4967 + }, + { + "epoch": 19.872, + "grad_norm": 0.15248127281665802, + "learning_rate": 3.406813627254509e-07, + "loss": 0.0068, + "step": 4968 + }, + { + "epoch": 19.876, + "grad_norm": 0.16394023597240448, + "learning_rate": 3.306613226452906e-07, + "loss": 0.0066, + "step": 4969 + }, + { + "epoch": 19.88, + "grad_norm": 0.1749032884836197, + "learning_rate": 3.2064128256513025e-07, + "loss": 0.0073, + "step": 4970 + }, + { + "epoch": 19.884, + "grad_norm": 0.17276360094547272, + "learning_rate": 3.1062124248496995e-07, + "loss": 0.007, + "step": 4971 + }, + { + "epoch": 19.888, + "grad_norm": 0.13797195255756378, + "learning_rate": 3.0060120240480964e-07, + "loss": 0.007, + "step": 4972 + }, + { + "epoch": 19.892, + "grad_norm": 0.25307130813598633, + "learning_rate": 2.905811623246493e-07, + "loss": 
0.0085, + "step": 4973 + }, + { + "epoch": 19.896, + "grad_norm": 0.17503662407398224, + "learning_rate": 2.80561122244489e-07, + "loss": 0.0076, + "step": 4974 + }, + { + "epoch": 19.9, + "grad_norm": 0.14293161034584045, + "learning_rate": 2.7054108216432867e-07, + "loss": 0.007, + "step": 4975 + }, + { + "epoch": 19.904, + "grad_norm": 0.1660107970237732, + "learning_rate": 2.605210420841683e-07, + "loss": 0.0065, + "step": 4976 + }, + { + "epoch": 19.908, + "grad_norm": 0.245314821600914, + "learning_rate": 2.50501002004008e-07, + "loss": 0.0092, + "step": 4977 + }, + { + "epoch": 19.912, + "grad_norm": 0.21174436807632446, + "learning_rate": 2.404809619238477e-07, + "loss": 0.007, + "step": 4978 + }, + { + "epoch": 19.916, + "grad_norm": 0.170380100607872, + "learning_rate": 2.3046092184368737e-07, + "loss": 0.0061, + "step": 4979 + }, + { + "epoch": 19.92, + "grad_norm": 0.2893669307231903, + "learning_rate": 2.2044088176352704e-07, + "loss": 0.0086, + "step": 4980 + }, + { + "epoch": 19.924, + "grad_norm": 0.13519108295440674, + "learning_rate": 2.1042084168336673e-07, + "loss": 0.006, + "step": 4981 + }, + { + "epoch": 19.928, + "grad_norm": 0.17455951869487762, + "learning_rate": 2.004008016032064e-07, + "loss": 0.0076, + "step": 4982 + }, + { + "epoch": 19.932, + "grad_norm": 0.2173195630311966, + "learning_rate": 1.903807615230461e-07, + "loss": 0.0086, + "step": 4983 + }, + { + "epoch": 19.936, + "grad_norm": 0.16455821692943573, + "learning_rate": 1.803607214428858e-07, + "loss": 0.0066, + "step": 4984 + }, + { + "epoch": 19.94, + "grad_norm": 0.16139689087867737, + "learning_rate": 1.7034068136272546e-07, + "loss": 0.0075, + "step": 4985 + }, + { + "epoch": 19.944, + "grad_norm": 0.18227294087409973, + "learning_rate": 1.6032064128256513e-07, + "loss": 0.007, + "step": 4986 + }, + { + "epoch": 19.948, + "grad_norm": 0.13132084906101227, + "learning_rate": 1.5030060120240482e-07, + "loss": 0.0043, + "step": 4987 + }, + { + "epoch": 19.951999999999998, 
+ "grad_norm": 0.1532142460346222, + "learning_rate": 1.402805611222445e-07, + "loss": 0.0063, + "step": 4988 + }, + { + "epoch": 19.956, + "grad_norm": 0.18583548069000244, + "learning_rate": 1.3026052104208416e-07, + "loss": 0.0074, + "step": 4989 + }, + { + "epoch": 19.96, + "grad_norm": 0.16315986216068268, + "learning_rate": 1.2024048096192385e-07, + "loss": 0.0066, + "step": 4990 + }, + { + "epoch": 19.964, + "grad_norm": 0.19338533282279968, + "learning_rate": 1.1022044088176352e-07, + "loss": 0.0062, + "step": 4991 + }, + { + "epoch": 19.968, + "grad_norm": 0.1874208003282547, + "learning_rate": 1.002004008016032e-07, + "loss": 0.0072, + "step": 4992 + }, + { + "epoch": 19.972, + "grad_norm": 0.19708944857120514, + "learning_rate": 9.01803607214429e-08, + "loss": 0.0075, + "step": 4993 + }, + { + "epoch": 19.976, + "grad_norm": 0.13537871837615967, + "learning_rate": 8.016032064128256e-08, + "loss": 0.0072, + "step": 4994 + }, + { + "epoch": 19.98, + "grad_norm": 0.200593501329422, + "learning_rate": 7.014028056112224e-08, + "loss": 0.0077, + "step": 4995 + }, + { + "epoch": 19.984, + "grad_norm": 0.146713986992836, + "learning_rate": 6.012024048096193e-08, + "loss": 0.0074, + "step": 4996 + }, + { + "epoch": 19.988, + "grad_norm": 0.20530632138252258, + "learning_rate": 5.01002004008016e-08, + "loss": 0.0066, + "step": 4997 + }, + { + "epoch": 19.992, + "grad_norm": 0.1384311020374298, + "learning_rate": 4.008016032064128e-08, + "loss": 0.0072, + "step": 4998 + }, + { + "epoch": 19.996, + "grad_norm": 0.1999179571866989, + "learning_rate": 3.006012024048096e-08, + "loss": 0.0075, + "step": 4999 + }, + { + "epoch": 20.0, + "grad_norm": 0.22525227069854736, + "learning_rate": 2.004008016032064e-08, + "loss": 0.0047, + "step": 5000 + } + ], + "logging_steps": 1, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + 
"should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.07226802884608e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}