{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 3640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00021978021978021978, "grad_norm": 1.2186660196817165, "learning_rate": 6.999999249142552e-07, "loss": 1.5998, "step": 1 }, { "epoch": 0.00043956043956043956, "grad_norm": 1.0439899552298968, "learning_rate": 6.999996996570566e-07, "loss": 1.6101, "step": 2 }, { "epoch": 0.0006593406593406593, "grad_norm": 1.0721695105594953, "learning_rate": 6.999993242285117e-07, "loss": 1.5086, "step": 3 }, { "epoch": 0.0008791208791208791, "grad_norm": 1.1098815548570793, "learning_rate": 6.999987986287993e-07, "loss": 1.5492, "step": 4 }, { "epoch": 0.001098901098901099, "grad_norm": 1.1307841967232446, "learning_rate": 6.999981228581701e-07, "loss": 1.5344, "step": 5 }, { "epoch": 0.0013186813186813187, "grad_norm": 0.9920620937163445, "learning_rate": 6.999972969169463e-07, "loss": 1.558, "step": 6 }, { "epoch": 0.0015384615384615385, "grad_norm": 1.024132361415856, "learning_rate": 6.999963208055215e-07, "loss": 1.5217, "step": 7 }, { "epoch": 0.0017582417582417582, "grad_norm": 0.9309822643703438, "learning_rate": 6.999951945243612e-07, "loss": 1.541, "step": 8 }, { "epoch": 0.001978021978021978, "grad_norm": 0.8249543682534687, "learning_rate": 6.999939180740022e-07, "loss": 1.5733, "step": 9 }, { "epoch": 0.002197802197802198, "grad_norm": 0.9676935755823561, "learning_rate": 6.999924914550531e-07, "loss": 1.5692, "step": 10 }, { "epoch": 0.0024175824175824176, "grad_norm": 0.8980208205863951, "learning_rate": 6.999909146681941e-07, "loss": 1.565, "step": 11 }, { "epoch": 0.0026373626373626374, "grad_norm": 0.9306418951850254, "learning_rate": 6.999891877141767e-07, "loss": 1.5342, "step": 12 }, { "epoch": 0.002857142857142857, "grad_norm": 0.7374753300540053, "learning_rate": 6.999873105938243e-07, "loss": 1.5431, "step": 13 }, { "epoch": 0.003076923076923077, "grad_norm": 0.6068473940709174, "learning_rate": 6.99985283308032e-07, "loss": 1.5225, "step": 14 }, { "epoch": 0.0032967032967032967, "grad_norm": 0.6019875923751878, "learning_rate": 6.999831058577659e-07, "loss": 1.4669, "step": 15 }, { "epoch": 0.0035164835164835165, "grad_norm": 0.6249268931056989, "learning_rate": 6.999807782440644e-07, "loss": 1.5616, "step": 16 }, { "epoch": 0.0037362637362637363, "grad_norm": 0.5835249297903639, "learning_rate": 6.99978300468037e-07, "loss": 1.5251, "step": 17 }, { "epoch": 0.003956043956043956, "grad_norm": 0.6063786110343348, "learning_rate": 6.999756725308648e-07, "loss": 1.531, "step": 18 }, { "epoch": 0.004175824175824176, "grad_norm": 0.5755540396780645, "learning_rate": 6.999728944338009e-07, "loss": 1.5712, "step": 19 }, { "epoch": 0.004395604395604396, "grad_norm": 0.5687868093148271, "learning_rate": 6.999699661781696e-07, "loss": 1.5716, "step": 20 }, { "epoch": 0.004615384615384616, "grad_norm": 0.5622248804783495, "learning_rate": 6.999668877653669e-07, "loss": 1.4995, "step": 21 }, { "epoch": 0.004835164835164835, "grad_norm": 0.5122047997706676, "learning_rate": 6.999636591968604e-07, "loss": 1.5336, "step": 22 }, { "epoch": 0.005054945054945055, "grad_norm": 0.5376748142392922, "learning_rate": 6.999602804741893e-07, "loss": 1.5249, "step": 23 }, { "epoch": 0.005274725274725275, "grad_norm": 0.4960039885042067, "learning_rate": 6.999567515989641e-07, "loss": 1.5907, "step": 24 }, { "epoch": 0.005494505494505495, "grad_norm": 0.4868968983538848, "learning_rate": 6.999530725728675e-07, "loss": 1.5094, "step": 25 }, { "epoch": 0.005714285714285714, "grad_norm": 0.4509570161471638, "learning_rate": 6.999492433976532e-07, "loss": 1.5193, "step": 26 }, { "epoch": 0.0059340659340659345, "grad_norm": 0.43012066136835875, "learning_rate": 6.999452640751469e-07, "loss": 1.4869, "step": 27 }, { "epoch": 0.006153846153846154, "grad_norm": 0.42798833956354815, "learning_rate": 6.999411346072455e-07, "loss": 1.5537, "step": 28 }, { "epoch": 0.006373626373626374, "grad_norm": 0.42110987188751625, "learning_rate": 6.999368549959178e-07, "loss": 1.4695, "step": 29 }, { "epoch": 0.006593406593406593, "grad_norm": 0.42241293601772917, "learning_rate": 6.999324252432038e-07, "loss": 1.5394, "step": 30 }, { "epoch": 0.006813186813186814, "grad_norm": 0.40830105748996753, "learning_rate": 6.999278453512156e-07, "loss": 1.5436, "step": 31 }, { "epoch": 0.007032967032967033, "grad_norm": 0.42088049458463134, "learning_rate": 6.999231153221364e-07, "loss": 1.517, "step": 32 }, { "epoch": 0.007252747252747253, "grad_norm": 0.4254524125760086, "learning_rate": 6.999182351582212e-07, "loss": 1.4963, "step": 33 }, { "epoch": 0.0074725274725274725, "grad_norm": 0.45211208077565246, "learning_rate": 6.999132048617966e-07, "loss": 1.5334, "step": 34 }, { "epoch": 0.007692307692307693, "grad_norm": 0.38954848298052885, "learning_rate": 6.999080244352607e-07, "loss": 1.4666, "step": 35 }, { "epoch": 0.007912087912087912, "grad_norm": 0.37662572294061986, "learning_rate": 6.999026938810832e-07, "loss": 1.5887, "step": 36 }, { "epoch": 0.008131868131868131, "grad_norm": 0.3800537277157674, "learning_rate": 6.998972132018054e-07, "loss": 1.5428, "step": 37 }, { "epoch": 0.008351648351648353, "grad_norm": 0.39327960840253473, "learning_rate": 6.998915824000401e-07, "loss": 1.5812, "step": 38 }, { "epoch": 0.008571428571428572, "grad_norm": 14.112700324145928, "learning_rate": 6.998858014784716e-07, "loss": 1.5788, "step": 39 }, { "epoch": 0.008791208791208791, "grad_norm": 0.35127700103550985, "learning_rate": 6.99879870439856e-07, "loss": 1.5079, "step": 40 }, { "epoch": 0.00901098901098901, "grad_norm": 0.33547200423608775, "learning_rate": 6.998737892870208e-07, "loss": 1.4971, "step": 41 }, { "epoch": 0.009230769230769232, "grad_norm": 0.3463399443779019, "learning_rate": 6.99867558022865e-07, "loss": 1.5603, "step": 42 }, { "epoch": 0.009450549450549451, "grad_norm": 0.5100467180860531, "learning_rate": 6.998611766503595e-07, "loss": 1.5597, "step": 43 }, { "epoch": 0.00967032967032967, "grad_norm": 0.34297872378392785, "learning_rate": 6.998546451725461e-07, "loss": 1.4844, "step": 44 }, { "epoch": 0.00989010989010989, "grad_norm": 0.36743100193234246, "learning_rate": 6.99847963592539e-07, "loss": 1.5067, "step": 45 }, { "epoch": 0.01010989010989011, "grad_norm": 0.34786527855699156, "learning_rate": 6.998411319135234e-07, "loss": 1.5487, "step": 46 }, { "epoch": 0.01032967032967033, "grad_norm": 0.3520019386286944, "learning_rate": 6.998341501387561e-07, "loss": 1.5092, "step": 47 }, { "epoch": 0.01054945054945055, "grad_norm": 0.34687698726669514, "learning_rate": 6.998270182715658e-07, "loss": 1.5078, "step": 48 }, { "epoch": 0.010769230769230769, "grad_norm": 0.33420958814829355, "learning_rate": 6.998197363153522e-07, "loss": 1.4704, "step": 49 }, { "epoch": 0.01098901098901099, "grad_norm": 0.33024025304703036, "learning_rate": 6.998123042735871e-07, "loss": 1.5104, "step": 50 }, { "epoch": 0.01120879120879121, "grad_norm": 13.292323804745312, "learning_rate": 6.998047221498137e-07, "loss": 1.5458, "step": 51 }, { "epoch": 0.011428571428571429, "grad_norm": 0.3231695044826081, "learning_rate": 6.997969899476462e-07, "loss": 1.5122, "step": 52 }, { "epoch": 0.011648351648351648, "grad_norm": 0.331099451474555, "learning_rate": 6.997891076707711e-07, "loss": 1.5195, "step": 53 }, { "epoch": 0.011868131868131869, "grad_norm": 0.34515503143462656, "learning_rate": 6.997810753229464e-07, "loss": 1.5428, "step": 54 }, { "epoch": 0.012087912087912088, "grad_norm": 0.34544004796040934, "learning_rate": 6.997728929080011e-07, "loss": 1.5598, "step": 55 }, { "epoch": 0.012307692307692308, "grad_norm": 0.32630043594715297, "learning_rate": 6.99764560429836e-07, "loss": 1.5739, "step": 56 }, { "epoch": 0.012527472527472527, "grad_norm": 0.33162041156072575, "learning_rate": 6.997560778924236e-07, "loss": 1.5091, "step": 57 }, { "epoch": 0.012747252747252748, "grad_norm": 0.32637573914797435, "learning_rate": 6.997474452998078e-07, "loss": 1.4946, "step": 58 }, { "epoch": 0.012967032967032967, "grad_norm": 0.3241571015698132, "learning_rate": 6.997386626561041e-07, "loss": 1.4998, "step": 59 }, { "epoch": 0.013186813186813187, "grad_norm": 0.3105444297848675, "learning_rate": 6.997297299654995e-07, "loss": 1.5221, "step": 60 }, { "epoch": 0.013406593406593406, "grad_norm": 0.32279738495572, "learning_rate": 6.997206472322524e-07, "loss": 1.531, "step": 61 }, { "epoch": 0.013626373626373627, "grad_norm": 0.321559866825534, "learning_rate": 6.99711414460693e-07, "loss": 1.5037, "step": 62 }, { "epoch": 0.013846153846153847, "grad_norm": 0.31003333500380825, "learning_rate": 6.997020316552228e-07, "loss": 1.5349, "step": 63 }, { "epoch": 0.014065934065934066, "grad_norm": 0.3226878420193051, "learning_rate": 6.996924988203149e-07, "loss": 1.5072, "step": 64 }, { "epoch": 0.014285714285714285, "grad_norm": 0.30960484322916615, "learning_rate": 6.99682815960514e-07, "loss": 1.4557, "step": 65 }, { "epoch": 0.014505494505494506, "grad_norm": 0.31173035549535516, "learning_rate": 6.996729830804362e-07, "loss": 1.55, "step": 66 }, { "epoch": 0.014725274725274726, "grad_norm": 0.33346114160057794, "learning_rate": 6.996630001847694e-07, "loss": 1.5098, "step": 67 }, { "epoch": 0.014945054945054945, "grad_norm": 0.3262332746170292, "learning_rate": 6.996528672782724e-07, "loss": 1.4954, "step": 68 }, { "epoch": 0.015164835164835164, "grad_norm": 0.3066424118028698, "learning_rate": 6.996425843657762e-07, "loss": 1.5247, "step": 69 }, { "epoch": 0.015384615384615385, "grad_norm": 0.29511754230899856, "learning_rate": 6.996321514521829e-07, "loss": 1.4867, "step": 70 }, { "epoch": 0.015604395604395605, "grad_norm": 0.3029790265153383, "learning_rate": 6.996215685424661e-07, "loss": 1.4409, "step": 71 }, { "epoch": 0.015824175824175824, "grad_norm": 0.3215987443977887, "learning_rate": 6.996108356416715e-07, "loss": 1.482, "step": 72 }, { "epoch": 0.016043956043956045, "grad_norm": 0.30330633610502405, "learning_rate": 6.995999527549153e-07, "loss": 1.5087, "step": 73 }, { "epoch": 0.016263736263736263, "grad_norm": 0.3003227941293912, "learning_rate": 6.995889198873862e-07, "loss": 1.4914, "step": 74 }, { "epoch": 0.016483516483516484, "grad_norm": 0.34686838487569577, "learning_rate": 6.995777370443436e-07, "loss": 1.4711, "step": 75 }, { "epoch": 0.016703296703296705, "grad_norm": 0.28588606950024403, "learning_rate": 6.995664042311191e-07, "loss": 1.5046, "step": 76 }, { "epoch": 0.016923076923076923, "grad_norm": 0.3090307833343753, "learning_rate": 6.995549214531152e-07, "loss": 1.5053, "step": 77 }, { "epoch": 0.017142857142857144, "grad_norm": 0.279920066376002, "learning_rate": 6.995432887158061e-07, "loss": 1.4293, "step": 78 }, { "epoch": 0.01736263736263736, "grad_norm": 0.3205459040940839, "learning_rate": 6.995315060247377e-07, "loss": 1.5683, "step": 79 }, { "epoch": 0.017582417582417582, "grad_norm": 0.2933165344735273, "learning_rate": 6.995195733855271e-07, "loss": 1.496, "step": 80 }, { "epoch": 0.017802197802197803, "grad_norm": 0.28611460012927237, "learning_rate": 6.995074908038631e-07, "loss": 1.5161, "step": 81 }, { "epoch": 0.01802197802197802, "grad_norm": 0.281541569510838, "learning_rate": 6.994952582855058e-07, "loss": 1.5087, "step": 82 }, { "epoch": 0.018241758241758242, "grad_norm": 2.608706443624609, "learning_rate": 6.99482875836287e-07, "loss": 1.5499, "step": 83 }, { "epoch": 0.018461538461538463, "grad_norm": 0.30022043627485206, "learning_rate": 6.994703434621097e-07, "loss": 1.5573, "step": 84 }, { "epoch": 0.01868131868131868, "grad_norm": 0.3925486655499724, "learning_rate": 6.994576611689486e-07, "loss": 1.494, "step": 85 }, { "epoch": 0.018901098901098902, "grad_norm": 0.2843534465338799, "learning_rate": 6.994448289628498e-07, "loss": 1.4763, "step": 86 }, { "epoch": 0.01912087912087912, "grad_norm": 0.2869825542786904, "learning_rate": 6.994318468499308e-07, "loss": 1.51, "step": 87 }, { "epoch": 0.01934065934065934, "grad_norm": 0.3116759886396289, "learning_rate": 6.994187148363806e-07, "loss": 1.4883, "step": 88 }, { "epoch": 0.01956043956043956, "grad_norm": 0.297394744452125, "learning_rate": 6.994054329284599e-07, "loss": 1.5885, "step": 89 }, { "epoch": 0.01978021978021978, "grad_norm": 0.2918854080020701, "learning_rate": 6.993920011325003e-07, "loss": 1.4858, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.2940304534340659, "learning_rate": 6.993784194549055e-07, "loss": 1.4796, "step": 91 }, { "epoch": 0.02021978021978022, "grad_norm": 0.32308529480303205, "learning_rate": 6.993646879021503e-07, "loss": 1.4847, "step": 92 }, { "epoch": 0.02043956043956044, "grad_norm": 0.29725992838234777, "learning_rate": 6.993508064807809e-07, "loss": 1.5481, "step": 93 }, { "epoch": 0.02065934065934066, "grad_norm": 0.2959729110272764, "learning_rate": 6.993367751974151e-07, "loss": 1.4977, "step": 94 }, { "epoch": 0.020879120879120878, "grad_norm": 0.29995140162354467, "learning_rate": 6.993225940587423e-07, "loss": 1.5682, "step": 95 }, { "epoch": 0.0210989010989011, "grad_norm": 0.27708359576077696, "learning_rate": 6.993082630715229e-07, "loss": 1.4819, "step": 96 }, { "epoch": 0.02131868131868132, "grad_norm": 0.2858375043856483, "learning_rate": 6.99293782242589e-07, "loss": 1.4997, "step": 97 }, { "epoch": 0.021538461538461538, "grad_norm": 0.3242764772196802, "learning_rate": 6.992791515788442e-07, "loss": 1.4909, "step": 98 }, { "epoch": 0.02175824175824176, "grad_norm": 0.2855918680306477, "learning_rate": 6.992643710872633e-07, "loss": 1.507, "step": 99 }, { "epoch": 0.02197802197802198, "grad_norm": 0.29129408787298827, "learning_rate": 6.992494407748931e-07, "loss": 1.5033, "step": 100 }, { "epoch": 0.022197802197802197, "grad_norm": 0.2831537449986603, "learning_rate": 6.992343606488509e-07, "loss": 1.4584, "step": 101 }, { "epoch": 0.02241758241758242, "grad_norm": 0.30566588359048896, "learning_rate": 6.99219130716326e-07, "loss": 1.4876, "step": 102 }, { "epoch": 0.022637362637362636, "grad_norm": 0.28003441401871154, "learning_rate": 6.992037509845793e-07, "loss": 1.5287, "step": 103 }, { "epoch": 0.022857142857142857, "grad_norm": 0.2776071350189179, "learning_rate": 6.991882214609427e-07, "loss": 1.5196, "step": 104 }, { "epoch": 0.023076923076923078, "grad_norm": 0.28320343050708896, "learning_rate": 6.991725421528197e-07, "loss": 1.4868, "step": 105 }, { "epoch": 0.023296703296703296, "grad_norm": 0.28999603887230935, "learning_rate": 6.991567130676851e-07, "loss": 1.4722, "step": 106 }, { "epoch": 0.023516483516483517, "grad_norm": 0.3264889773497954, "learning_rate": 6.991407342130853e-07, "loss": 1.5275, "step": 107 }, { "epoch": 0.023736263736263738, "grad_norm": 0.2789118306208069, "learning_rate": 6.991246055966378e-07, "loss": 1.5081, "step": 108 }, { "epoch": 0.023956043956043956, "grad_norm": 0.27960937704917793, "learning_rate": 6.991083272260319e-07, "loss": 1.4807, "step": 109 }, { "epoch": 0.024175824175824177, "grad_norm": 0.28191339204154914, "learning_rate": 6.990918991090279e-07, "loss": 1.5313, "step": 110 }, { "epoch": 0.024395604395604394, "grad_norm": 0.2945719571612475, "learning_rate": 6.990753212534577e-07, "loss": 1.5475, "step": 111 }, { "epoch": 0.024615384615384615, "grad_norm": 0.2751109973724785, "learning_rate": 6.990585936672245e-07, "loss": 1.5284, "step": 112 }, { "epoch": 0.024835164835164836, "grad_norm": 0.2824020087608803, "learning_rate": 6.990417163583031e-07, "loss": 1.5038, "step": 113 }, { "epoch": 0.025054945054945054, "grad_norm": 0.3012447042602539, "learning_rate": 6.990246893347393e-07, "loss": 1.5257, "step": 114 }, { "epoch": 0.025274725274725275, "grad_norm": 0.3373183520802013, "learning_rate": 6.990075126046506e-07, "loss": 1.5136, "step": 115 }, { "epoch": 0.025494505494505496, "grad_norm": 0.26947512348674646, "learning_rate": 6.989901861762256e-07, "loss": 1.4885, "step": 116 }, { "epoch": 0.025714285714285714, "grad_norm": 0.30543440043549336, "learning_rate": 6.989727100577246e-07, "loss": 1.4932, "step": 117 }, { "epoch": 0.025934065934065935, "grad_norm": 0.27272901688360696, "learning_rate": 6.98955084257479e-07, "loss": 1.4918, "step": 118 }, { "epoch": 0.026153846153846153, "grad_norm": 0.2864790696227162, "learning_rate": 6.989373087838915e-07, "loss": 1.5619, "step": 119 }, { "epoch": 0.026373626373626374, "grad_norm": 0.2756479522819835, "learning_rate": 6.989193836454364e-07, "loss": 1.537, "step": 120 }, { "epoch": 0.026593406593406595, "grad_norm": 0.27232865699953096, "learning_rate": 6.989013088506593e-07, "loss": 1.5046, "step": 121 }, { "epoch": 0.026813186813186812, "grad_norm": 0.268028106647397, "learning_rate": 6.988830844081771e-07, "loss": 1.477, "step": 122 }, { "epoch": 0.027032967032967033, "grad_norm": 0.27821815601753497, "learning_rate": 6.988647103266779e-07, "loss": 1.5441, "step": 123 }, { "epoch": 0.027252747252747254, "grad_norm": 0.27697543965729127, "learning_rate": 6.988461866149214e-07, "loss": 1.6002, "step": 124 }, { "epoch": 0.027472527472527472, "grad_norm": 0.29538619865773286, "learning_rate": 6.988275132817383e-07, "loss": 1.5378, "step": 125 }, { "epoch": 0.027692307692307693, "grad_norm": 0.2775367321371363, "learning_rate": 6.988086903360311e-07, "loss": 1.4762, "step": 126 }, { "epoch": 0.02791208791208791, "grad_norm": 0.2702662287092996, "learning_rate": 6.987897177867731e-07, "loss": 1.4599, "step": 127 }, { "epoch": 0.028131868131868132, "grad_norm": 0.28656388789553466, "learning_rate": 6.987705956430092e-07, "loss": 1.5213, "step": 128 }, { "epoch": 0.028351648351648353, "grad_norm": 0.2712634686760554, "learning_rate": 6.987513239138558e-07, "loss": 1.5791, "step": 129 }, { "epoch": 0.02857142857142857, "grad_norm": 0.2905484073509738, "learning_rate": 6.987319026085003e-07, "loss": 1.4899, "step": 130 }, { "epoch": 0.02879120879120879, "grad_norm": 0.2846977667808167, "learning_rate": 6.987123317362014e-07, "loss": 1.4881, "step": 131 }, { "epoch": 0.029010989010989013, "grad_norm": 0.2691320851817523, "learning_rate": 6.986926113062894e-07, "loss": 1.4857, "step": 132 }, { "epoch": 0.02923076923076923, "grad_norm": 0.2911894232325037, "learning_rate": 6.986727413281656e-07, "loss": 1.4855, "step": 133 }, { "epoch": 0.02945054945054945, "grad_norm": 0.305082177212358, "learning_rate": 6.986527218113028e-07, "loss": 1.5369, "step": 134 }, { "epoch": 0.02967032967032967, "grad_norm": 0.27036465289461803, "learning_rate": 6.986325527652449e-07, "loss": 1.5031, "step": 135 }, { "epoch": 0.02989010989010989, "grad_norm": 0.2631354750255763, "learning_rate": 6.986122341996072e-07, "loss": 1.4465, "step": 136 }, { "epoch": 0.03010989010989011, "grad_norm": 0.2752652309390088, "learning_rate": 6.985917661240763e-07, "loss": 1.4614, "step": 137 }, { "epoch": 0.03032967032967033, "grad_norm": 0.32996418437558866, "learning_rate": 6.985711485484101e-07, "loss": 1.5232, "step": 138 }, { "epoch": 0.03054945054945055, "grad_norm": 0.2812242833122952, "learning_rate": 6.985503814824375e-07, "loss": 1.5128, "step": 139 }, { "epoch": 0.03076923076923077, "grad_norm": 0.2728686694891554, "learning_rate": 6.985294649360593e-07, "loss": 1.472, "step": 140 }, { "epoch": 0.03098901098901099, "grad_norm": 0.2710365904840655, "learning_rate": 6.985083989192469e-07, "loss": 1.5455, "step": 141 }, { "epoch": 0.03120879120879121, "grad_norm": 0.29439651848838755, "learning_rate": 6.984871834420429e-07, "loss": 1.5177, "step": 142 }, { "epoch": 0.03142857142857143, "grad_norm": 0.26500957178841184, "learning_rate": 6.98465818514562e-07, "loss": 1.4676, "step": 143 }, { "epoch": 0.03164835164835165, "grad_norm": 0.27673963910531124, "learning_rate": 6.984443041469894e-07, "loss": 1.5072, "step": 144 }, { "epoch": 0.031868131868131866, "grad_norm": 0.2750426966896585, "learning_rate": 6.984226403495816e-07, "loss": 1.4873, "step": 145 }, { "epoch": 0.03208791208791209, "grad_norm": 0.28280202577217106, "learning_rate": 6.984008271326666e-07, "loss": 1.4351, "step": 146 }, { "epoch": 0.03230769230769231, "grad_norm": 0.28194677251705746, "learning_rate": 6.983788645066436e-07, "loss": 1.4858, "step": 147 }, { "epoch": 0.032527472527472526, "grad_norm": 0.2818612120674954, "learning_rate": 6.983567524819828e-07, "loss": 1.4792, "step": 148 }, { "epoch": 0.03274725274725275, "grad_norm": 0.25947122206030515, "learning_rate": 6.98334491069226e-07, "loss": 1.4632, "step": 149 }, { "epoch": 0.03296703296703297, "grad_norm": 0.2743375323362592, "learning_rate": 6.983120802789857e-07, "loss": 1.4429, "step": 150 }, { "epoch": 0.033186813186813185, "grad_norm": 0.34639656463234103, "learning_rate": 6.98289520121946e-07, "loss": 1.4937, "step": 151 }, { "epoch": 0.03340659340659341, "grad_norm": 0.305354077564706, "learning_rate": 6.982668106088623e-07, "loss": 1.4401, "step": 152 }, { "epoch": 0.03362637362637363, "grad_norm": 0.26958390231177176, "learning_rate": 6.982439517505609e-07, "loss": 1.4865, "step": 153 }, { "epoch": 0.033846153846153845, "grad_norm": 0.26179721533422456, "learning_rate": 6.982209435579393e-07, "loss": 1.4573, "step": 154 }, { "epoch": 0.03406593406593406, "grad_norm": 0.26746664471911324, "learning_rate": 6.981977860419664e-07, "loss": 1.47, "step": 155 }, { "epoch": 0.03428571428571429, "grad_norm": 0.26825361946569676, "learning_rate": 6.981744792136822e-07, "loss": 1.5045, "step": 156 }, { "epoch": 0.034505494505494505, "grad_norm": 0.4030036122615661, "learning_rate": 6.981510230841979e-07, "loss": 1.4877, "step": 157 }, { "epoch": 0.03472527472527472, "grad_norm": 0.27170584168506157, "learning_rate": 6.981274176646958e-07, "loss": 1.5494, "step": 158 }, { "epoch": 0.03494505494505495, "grad_norm": 0.2672096935421432, "learning_rate": 6.981036629664294e-07, "loss": 1.4938, "step": 159 }, { "epoch": 0.035164835164835165, "grad_norm": 0.2773005433971945, "learning_rate": 6.980797590007237e-07, "loss": 1.4964, "step": 160 }, { "epoch": 0.03538461538461538, "grad_norm": 0.2694855506049262, "learning_rate": 6.98055705778974e-07, "loss": 1.5027, "step": 161 }, { "epoch": 0.03560439560439561, "grad_norm": 0.27266383003600164, "learning_rate": 6.980315033126479e-07, "loss": 1.5, "step": 162 }, { "epoch": 0.035824175824175825, "grad_norm": 0.9420262239251024, "learning_rate": 6.980071516132831e-07, "loss": 1.4917, "step": 163 }, { "epoch": 0.03604395604395604, "grad_norm": 0.26818601052376817, "learning_rate": 6.979826506924891e-07, "loss": 1.4569, "step": 164 }, { "epoch": 0.03626373626373627, "grad_norm": 0.2620913338518106, "learning_rate": 6.979580005619464e-07, "loss": 1.4557, "step": 165 }, { "epoch": 0.036483516483516484, "grad_norm": 0.266182106227132, "learning_rate": 6.979332012334064e-07, "loss": 1.5161, "step": 166 }, { "epoch": 0.0367032967032967, "grad_norm": 0.2662800539649438, "learning_rate": 6.97908252718692e-07, "loss": 1.4594, "step": 167 }, { "epoch": 0.036923076923076927, "grad_norm": 0.26646844653296586, "learning_rate": 6.978831550296969e-07, "loss": 1.5261, "step": 168 }, { "epoch": 0.037142857142857144, "grad_norm": 0.29971802724846996, "learning_rate": 6.978579081783861e-07, "loss": 1.5052, "step": 169 }, { "epoch": 0.03736263736263736, "grad_norm": 0.27368819727195265, "learning_rate": 6.978325121767956e-07, "loss": 1.5034, "step": 170 }, { "epoch": 0.03758241758241758, "grad_norm": 0.258761161700573, "learning_rate": 6.978069670370325e-07, "loss": 1.5086, "step": 171 }, { "epoch": 0.037802197802197804, "grad_norm": 0.2600022072426382, "learning_rate": 6.977812727712753e-07, "loss": 1.5493, "step": 172 }, { "epoch": 0.03802197802197802, "grad_norm": 0.27417700518264077, "learning_rate": 6.977554293917731e-07, "loss": 1.5198, "step": 173 }, { "epoch": 0.03824175824175824, "grad_norm": 0.27524991954024314, "learning_rate": 6.977294369108466e-07, "loss": 1.4635, "step": 174 }, { "epoch": 0.038461538461538464, "grad_norm": 0.28804055695078534, "learning_rate": 6.977032953408869e-07, "loss": 1.5334, "step": 175 }, { "epoch": 0.03868131868131868, "grad_norm": 0.27813313998794814, "learning_rate": 6.976770046943571e-07, "loss": 1.4716, "step": 176 }, { "epoch": 0.0389010989010989, "grad_norm": 0.25675207046610016, "learning_rate": 6.976505649837905e-07, "loss": 1.4714, "step": 177 }, { "epoch": 0.03912087912087912, "grad_norm": 0.2591895815458457, "learning_rate": 6.976239762217921e-07, "loss": 1.4909, "step": 178 }, { "epoch": 0.03934065934065934, "grad_norm": 0.2674694439581417, "learning_rate": 6.975972384210375e-07, "loss": 1.5039, "step": 179 }, { "epoch": 0.03956043956043956, "grad_norm": 0.26704045084258143, "learning_rate": 6.975703515942736e-07, "loss": 1.4712, "step": 180 }, { "epoch": 0.03978021978021978, "grad_norm": 0.2706980099916859, "learning_rate": 6.975433157543184e-07, "loss": 1.4654, "step": 181 }, { "epoch": 0.04, "grad_norm": 0.2613703776781457, "learning_rate": 6.975161309140605e-07, "loss": 1.5339, "step": 182 }, { "epoch": 0.04021978021978022, "grad_norm": 0.26132683662049644, "learning_rate": 6.974887970864602e-07, "loss": 1.4638, "step": 183 }, { "epoch": 0.04043956043956044, "grad_norm": 0.31062354073651693, "learning_rate": 6.974613142845483e-07, "loss": 1.4769, "step": 184 }, { "epoch": 0.04065934065934066, "grad_norm": 0.2677559086305453, "learning_rate": 6.974336825214269e-07, "loss": 1.527, "step": 185 }, { "epoch": 0.04087912087912088, "grad_norm": 0.2614383595103056, "learning_rate": 6.974059018102691e-07, "loss": 1.5394, "step": 186 }, { "epoch": 0.041098901098901096, "grad_norm": 0.2803770763369622, "learning_rate": 6.973779721643187e-07, "loss": 1.4823, "step": 187 }, { "epoch": 0.04131868131868132, "grad_norm": 0.27070270266026036, "learning_rate": 6.97349893596891e-07, "loss": 1.4939, "step": 188 }, { "epoch": 0.04153846153846154, "grad_norm": 0.25333287693602247, "learning_rate": 6.973216661213718e-07, "loss": 1.4866, "step": 189 }, { "epoch": 0.041758241758241756, "grad_norm": 0.4573189538131726, "learning_rate": 6.972932897512184e-07, "loss": 1.4048, "step": 190 }, { "epoch": 0.04197802197802198, "grad_norm": 0.3000858279410423, "learning_rate": 6.972647644999584e-07, "loss": 1.4757, "step": 191 }, { "epoch": 0.0421978021978022, "grad_norm": 0.2678269758785627, "learning_rate": 6.972360903811911e-07, "loss": 1.4553, "step": 192 }, { "epoch": 0.042417582417582415, "grad_norm": 0.25922908064411393, "learning_rate": 6.972072674085864e-07, "loss": 1.463, "step": 193 }, { "epoch": 0.04263736263736264, "grad_norm": 0.26416460454952456, "learning_rate": 6.971782955958853e-07, "loss": 1.498, "step": 194 }, { "epoch": 0.04285714285714286, "grad_norm": 0.2636566769664566, "learning_rate": 6.971491749568994e-07, "loss": 1.4274, "step": 195 }, { "epoch": 0.043076923076923075, "grad_norm": 0.2613977552257632, "learning_rate": 6.971199055055118e-07, "loss": 1.4527, "step": 196 }, { "epoch": 0.0432967032967033, "grad_norm": 0.7037865083710388, "learning_rate": 6.970904872556761e-07, "loss": 1.4665, "step": 197 }, { "epoch": 0.04351648351648352, "grad_norm": 0.2939214889537684, "learning_rate": 6.970609202214169e-07, "loss": 1.5244, "step": 198 }, { "epoch": 0.043736263736263735, "grad_norm": 0.26993117023093754, "learning_rate": 6.970312044168303e-07, "loss": 1.4594, "step": 199 }, { "epoch": 0.04395604395604396, "grad_norm": 0.29335910638815493, "learning_rate": 6.970013398560824e-07, "loss": 1.5345, "step": 200 }, { "epoch": 0.04417582417582418, "grad_norm": 0.2673832762745383, "learning_rate": 6.969713265534109e-07, "loss": 1.5341, "step": 201 }, { "epoch": 0.044395604395604395, "grad_norm": 0.33706349568368016, "learning_rate": 6.96941164523124e-07, "loss": 1.5072, "step": 202 }, { "epoch": 0.04461538461538461, "grad_norm": 1.7527104868913561, "learning_rate": 6.969108537796012e-07, "loss": 1.4623, "step": 203 }, { "epoch": 0.04483516483516484, "grad_norm": 0.2758247656438932, "learning_rate": 6.968803943372925e-07, "loss": 1.5642, "step": 204 }, { "epoch": 0.045054945054945054, "grad_norm": 0.2811418878265, "learning_rate": 6.968497862107193e-07, "loss": 1.4691, "step": 205 }, { "epoch": 0.04527472527472527, "grad_norm": 0.2718143932122558, "learning_rate": 6.968190294144732e-07, "loss": 1.5535, "step": 206 }, { "epoch": 0.0454945054945055, "grad_norm": 0.2728985339929895, "learning_rate": 6.967881239632172e-07, "loss": 1.5283, "step": 207 }, { "epoch": 0.045714285714285714, "grad_norm": 0.26533504546058206, "learning_rate": 6.96757069871685e-07, "loss": 1.4883, "step": 208 }, { "epoch": 0.04593406593406593, "grad_norm": 0.2638565050651618, "learning_rate": 6.967258671546811e-07, "loss": 1.5245, "step": 209 }, { "epoch": 0.046153846153846156, "grad_norm": 0.2646371262308997, "learning_rate": 6.96694515827081e-07, "loss": 1.472, "step": 210 }, { "epoch": 0.046373626373626374, "grad_norm": 0.2674406666090845, "learning_rate": 6.966630159038311e-07, "loss": 1.4214, "step": 211 }, { "epoch": 0.04659340659340659, "grad_norm": 0.2739895654283086, "learning_rate": 6.966313673999482e-07, "loss": 1.5281, "step": 212 }, { "epoch": 0.046813186813186816, "grad_norm": 0.2578629141398095, "learning_rate": 6.965995703305205e-07, "loss": 1.4542, "step": 213 }, { "epoch": 0.047032967032967034, "grad_norm": 0.265806832621302, "learning_rate": 6.965676247107067e-07, "loss": 1.497, "step": 214 }, { "epoch": 0.04725274725274725, "grad_norm": 0.2747953041522572, "learning_rate": 6.965355305557364e-07, "loss": 1.4623, "step": 215 }, { "epoch": 0.047472527472527476, "grad_norm": 0.2654747254930102, "learning_rate": 6.965032878809099e-07, "loss": 1.4393, "step": 216 }, { "epoch": 0.047692307692307694, "grad_norm": 0.26140208597905573, "learning_rate": 6.964708967015985e-07, "loss": 1.5312, "step": 217 }, { "epoch": 0.04791208791208791, "grad_norm": 0.30084804983668306, "learning_rate": 6.964383570332442e-07, "loss": 1.475, "step": 218 }, { "epoch": 0.04813186813186813, "grad_norm": 0.3027050045757752, "learning_rate": 6.964056688913597e-07, "loss": 1.5316, "step": 219 }, { "epoch": 0.04835164835164835, "grad_norm": 0.2609356819024029, "learning_rate": 6.963728322915288e-07, "loss": 1.5129, "step": 220 }, { "epoch": 0.04857142857142857, "grad_norm": 0.28897418836738115, "learning_rate": 6.963398472494057e-07, "loss": 1.5621, "step": 221 }, { "epoch": 0.04879120879120879, "grad_norm": 0.25256520476991495, "learning_rate": 6.963067137807155e-07, "loss": 1.4543, "step": 222 }, { "epoch": 0.04901098901098901, "grad_norm": 0.2635781648831903, "learning_rate": 6.96273431901254e-07, "loss": 1.5291, "step": 223 }, { "epoch": 0.04923076923076923, "grad_norm": 0.2823487055962416, "learning_rate": 6.962400016268882e-07, "loss": 1.4998, "step": 224 }, { "epoch": 0.04945054945054945, "grad_norm": 0.26684083629955946, "learning_rate": 6.962064229735551e-07, "loss": 1.4757, "step": 225 }, { "epoch": 0.04967032967032967, "grad_norm": 0.32736400051695624, "learning_rate": 6.96172695957263e-07, "loss": 1.4646, "step": 226 }, { "epoch": 0.04989010989010989, "grad_norm": 0.2673703631683696, "learning_rate": 6.961388205940907e-07, "loss": 1.445, "step": 227 }, { "epoch": 0.05010989010989011, "grad_norm": 2.2958669167730785, "learning_rate": 6.961047969001877e-07, "loss": 1.5246, "step": 228 }, { "epoch": 0.05032967032967033, "grad_norm": 0.2591744019844466, "learning_rate": 6.960706248917743e-07, "loss": 1.4993, "step": 229 }, { "epoch": 0.05054945054945055, "grad_norm": 0.272023775827462, "learning_rate": 6.960363045851415e-07, "loss": 1.4788, "step": 230 }, { "epoch": 0.05076923076923077, "grad_norm": 0.2881378705174016, "learning_rate": 6.960018359966512e-07, "loss": 1.4825, "step": 231 }, { "epoch": 0.05098901098901099, "grad_norm": 0.2871013170100303, "learning_rate": 6.959672191427355e-07, "loss": 1.4743, "step": 232 }, { "epoch": 0.05120879120879121, "grad_norm": 0.29645734733942863, "learning_rate": 6.959324540398975e-07, "loss": 1.4721, "step": 233 }, { "epoch": 0.05142857142857143, "grad_norm": 0.25854948360200714, "learning_rate": 6.958975407047111e-07, "loss": 1.563, "step": 234 }, { "epoch": 0.051648351648351645, "grad_norm": 0.28516775227487257, "learning_rate": 6.958624791538206e-07, "loss": 1.4549, "step": 235 }, { "epoch": 0.05186813186813187, "grad_norm": 0.2715940213280979, "learning_rate": 6.95827269403941e-07, "loss": 1.4767, "step": 236 }, { "epoch": 0.05208791208791209, "grad_norm": 0.25990619084628314, "learning_rate": 6.957919114718581e-07, "loss": 1.4342, "step": 237 }, { "epoch": 0.052307692307692305, "grad_norm": 0.258840154005006, "learning_rate": 6.957564053744282e-07, "loss": 1.486, "step": 238 }, { "epoch": 0.05252747252747253, "grad_norm": 0.2535593709769906, "learning_rate": 6.957207511285784e-07, "loss": 1.4045, "step": 239 }, { "epoch": 0.05274725274725275, "grad_norm": 0.29933148068654547, "learning_rate": 6.956849487513061e-07, "loss": 1.4696, "step": 240 }, { "epoch": 0.052967032967032965, "grad_norm": 0.26986020751770984, "learning_rate": 6.9564899825968e-07, "loss": 1.4899, "step": 241 }, { "epoch": 0.05318681318681319, "grad_norm": 0.2516728317277496, "learning_rate": 6.956128996708385e-07, "loss": 1.4936, "step": 242 }, { "epoch": 0.05340659340659341, "grad_norm": 0.2556011054356573, "learning_rate": 6.955766530019911e-07, "loss": 1.4764, "step": 243 }, { "epoch": 0.053626373626373625, "grad_norm": 0.2660489284065188, "learning_rate": 6.955402582704181e-07, "loss": 1.5266, "step": 244 }, { "epoch": 0.05384615384615385, "grad_norm": 0.2598598971211793, "learning_rate": 6.955037154934699e-07, "loss": 1.5083, "step": 245 }, { "epoch": 0.05406593406593407, "grad_norm": 0.2576200896121239, "learning_rate": 6.95467024688568e-07, "loss": 1.501, "step": 246 }, { "epoch": 0.054285714285714284, "grad_norm": 0.2715207303985658, "learning_rate": 6.954301858732039e-07, "loss": 1.4701, "step": 247 }, { "epoch": 0.05450549450549451, "grad_norm": 0.2547077363611182, "learning_rate": 6.9539319906494e-07, "loss": 1.4653, "step": 248 }, { "epoch": 0.054725274725274727, "grad_norm": 0.26056877427332575, "learning_rate": 6.953560642814092e-07, "loss": 1.5032, "step": 249 }, { "epoch": 0.054945054945054944, "grad_norm": 0.2679576435566933, "learning_rate": 6.953187815403152e-07, "loss": 1.478, "step": 250 }, { "epoch": 0.05516483516483516, "grad_norm": 0.2578705664542964, "learning_rate": 6.952813508594318e-07, "loss": 1.5082, "step": 251 }, { "epoch": 0.055384615384615386, "grad_norm": 0.26029743030106334, "learning_rate": 6.952437722566032e-07, "loss": 1.5029, "step": 252 }, { "epoch": 0.055604395604395604, "grad_norm": 0.26016894646276006, "learning_rate": 6.95206045749745e-07, "loss": 1.4943, "step": 253 }, { "epoch": 0.05582417582417582, "grad_norm": 0.2536811282304101, "learning_rate": 6.951681713568424e-07, "loss": 1.4562, "step": 254 }, { "epoch": 0.056043956043956046, "grad_norm": 0.25971527603024036, "learning_rate": 6.951301490959514e-07, "loss": 1.4685, "step": 255 }, { "epoch": 0.056263736263736264, "grad_norm": 0.26670319176313756, "learning_rate": 6.950919789851988e-07, "loss": 1.5116, "step": 256 }, { "epoch": 0.05648351648351648, "grad_norm": 0.25735075761994464, "learning_rate": 6.950536610427815e-07, "loss": 1.4561, "step": 257 }, { "epoch": 0.056703296703296706, "grad_norm": 0.2736555478779235, "learning_rate": 6.950151952869669e-07, "loss": 1.5029, "step": 258 }, { "epoch": 0.05692307692307692, "grad_norm": 0.254819786458596, "learning_rate": 6.94976581736093e-07, "loss": 1.5071, "step": 259 }, { "epoch": 0.05714285714285714, "grad_norm": 0.2603206938823192, "learning_rate": 6.949378204085683e-07, "loss": 1.4552, "step": 260 }, { "epoch": 0.057362637362637366, "grad_norm": 0.26917748367705185, "learning_rate": 6.948989113228717e-07, "loss": 1.513, "step": 261 }, { "epoch": 0.05758241758241758, "grad_norm": 0.26502851157044544, "learning_rate": 6.948598544975524e-07, "loss": 1.5196, "step": 262 }, { "epoch": 0.0578021978021978, "grad_norm": 0.27055374354698203, "learning_rate": 6.948206499512302e-07, "loss": 1.5044, "step": 263 }, { "epoch": 0.058021978021978025, "grad_norm": 0.3208603270233443, "learning_rate": 6.947812977025953e-07, "loss": 1.5076, "step": 264 }, { "epoch": 0.05824175824175824, "grad_norm": 0.2596677957877133, "learning_rate": 6.947417977704084e-07, "loss": 1.4997, "step": 265 }, { "epoch": 0.05846153846153846, "grad_norm": 0.26397969261767407, "learning_rate": 6.947021501735003e-07, "loss": 1.5694, "step": 266 }, { "epoch": 0.05868131868131868, "grad_norm": 0.2855200095017623, "learning_rate": 6.946623549307725e-07, "loss": 1.4998, "step": 267 }, { "epoch": 0.0589010989010989, "grad_norm": 0.28246859273897623, "learning_rate": 6.946224120611967e-07, "loss": 1.4881, "step": 268 }, { "epoch": 0.05912087912087912, "grad_norm": 0.24907848933117777, "learning_rate": 6.945823215838151e-07, "loss": 1.4923, "step": 269 }, { "epoch": 0.05934065934065934, "grad_norm": 0.27291804363093825, "learning_rate": 6.945420835177403e-07, "loss": 1.4702, "step": 270 }, { "epoch": 0.05956043956043956, "grad_norm": 0.2673109560476995, "learning_rate": 6.945016978821552e-07, "loss": 1.523, "step": 271 }, { "epoch": 0.05978021978021978, "grad_norm": 0.26555807629974576, "learning_rate": 6.944611646963128e-07, "loss": 1.4903, "step": 272 }, { "epoch": 0.06, "grad_norm": 0.26523653033189487, "learning_rate": 6.944204839795369e-07, "loss": 1.431, "step": 273 }, { "epoch": 0.06021978021978022, "grad_norm": 0.26418183818079705, "learning_rate": 6.943796557512214e-07, "loss": 1.4915, "step": 274 }, { "epoch": 0.06043956043956044, "grad_norm": 0.26251672112394603, "learning_rate": 6.943386800308305e-07, "loss": 1.5133, "step": 275 }, { "epoch": 0.06065934065934066, "grad_norm": 0.2601657148483874, "learning_rate": 6.942975568378989e-07, "loss": 1.4393, "step": 276 }, { "epoch": 0.06087912087912088, "grad_norm": 0.25383464049284915, "learning_rate": 6.94256286192031e-07, "loss": 1.5193, "step": 277 }, { "epoch": 0.0610989010989011, "grad_norm": 0.2527707719641187, "learning_rate": 6.942148681129026e-07, "loss": 1.502, "step": 278 }, { "epoch": 0.06131868131868132, "grad_norm": 0.2550257292963019, "learning_rate": 6.941733026202587e-07, "loss": 1.5577, "step": 279 }, { "epoch": 0.06153846153846154, "grad_norm": 0.26800959630930515, "learning_rate": 6.941315897339153e-07, "loss": 1.4685, "step": 280 }, { "epoch": 0.06175824175824176, "grad_norm": 0.2639661087333859, "learning_rate": 6.94089729473758e-07, "loss": 1.4654, "step": 281 }, { "epoch": 0.06197802197802198, "grad_norm": 0.3044655793163802, "learning_rate": 6.940477218597434e-07, "loss": 1.5145, "step": 282 }, { "epoch": 0.062197802197802195, "grad_norm": 0.26644488846843883, "learning_rate": 6.940055669118979e-07, "loss": 1.4279, "step": 283 }, { "epoch": 0.06241758241758242, "grad_norm": 0.2531821057834337, "learning_rate": 6.939632646503181e-07, "loss": 1.4871, "step": 284 }, { "epoch": 0.06263736263736264, "grad_norm": 0.26515730237558094, "learning_rate": 6.939208150951713e-07, "loss": 1.4509, "step": 285 }, { "epoch": 0.06285714285714286, "grad_norm": 0.27598146747073726, "learning_rate": 6.938782182666943e-07, "loss": 1.4871, "step": 286 }, { "epoch": 0.06307692307692307, "grad_norm": 0.2661969469148146, "learning_rate": 6.938354741851947e-07, "loss": 1.396, "step": 287 }, { "epoch": 0.0632967032967033, "grad_norm": 0.29842965989187764, "learning_rate": 6.9379258287105e-07, "loss": 1.4267, "step": 288 }, { "epoch": 0.06351648351648352, "grad_norm": 0.25858035630649373, "learning_rate": 6.93749544344708e-07, "loss": 1.5154, "step": 289 }, { "epoch": 0.06373626373626373, "grad_norm": 0.2593531590218418, "learning_rate": 6.937063586266869e-07, "loss": 1.4901, "step": 290 }, { "epoch": 0.06395604395604396, "grad_norm": 0.25858425487633924, "learning_rate": 6.936630257375745e-07, "loss": 1.457, "step": 291 }, { "epoch": 0.06417582417582418, "grad_norm": 0.40235054361683753, "learning_rate": 6.936195456980291e-07, "loss": 1.4533, "step": 292 }, { "epoch": 0.06439560439560439, "grad_norm": 0.25704692703086185, "learning_rate": 6.935759185287796e-07, "loss": 1.4939, "step": 293 }, { "epoch": 0.06461538461538462, "grad_norm": 0.26084450550432003, "learning_rate": 6.935321442506241e-07, "loss": 1.4745, "step": 294 }, { "epoch": 0.06483516483516484, "grad_norm": 0.3204320723154116, "learning_rate": 6.934882228844315e-07, "loss": 1.4833, "step": 295 }, { "epoch": 0.06505494505494505, "grad_norm": 0.2925379455584677, "learning_rate": 6.934441544511407e-07, "loss": 1.5301, "step": 296 }, { "epoch": 0.06527472527472528, "grad_norm": 0.2830144001441581, "learning_rate": 6.933999389717608e-07, "loss": 1.4863, "step": 297 }, { "epoch": 0.0654945054945055, "grad_norm": 0.2893794244145113, "learning_rate": 6.933555764673704e-07, "loss": 1.5546, "step": 298 }, { "epoch": 0.06571428571428571, "grad_norm": 0.26853198296167635, "learning_rate": 6.933110669591191e-07, "loss": 1.4265, "step": 299 }, { "epoch": 0.06593406593406594, "grad_norm": 0.2579360349174298, "learning_rate": 6.932664104682259e-07, "loss": 1.4675, "step": 300 }, { "epoch": 0.06615384615384616, "grad_norm": 0.2634299933254918, "learning_rate": 6.932216070159802e-07, "loss": 1.4939, "step": 301 }, { "epoch": 0.06637362637362637, "grad_norm": 0.255533756023376, "learning_rate": 6.931766566237413e-07, "loss": 1.4855, "step": 302 }, { "epoch": 0.0665934065934066, "grad_norm": 0.2525279781357927, "learning_rate": 6.931315593129387e-07, "loss": 1.4807, "step": 303 }, { "epoch": 0.06681318681318682, "grad_norm": 0.2606954135874035, "learning_rate": 6.930863151050719e-07, "loss": 1.4852, "step": 304 }, { "epoch": 0.06703296703296703, "grad_norm": 0.6770030951043836, "learning_rate": 6.930409240217102e-07, "loss": 1.4383, "step": 305 }, { "epoch": 0.06725274725274726, "grad_norm": 0.31236654031247846, "learning_rate": 6.929953860844933e-07, "loss": 1.4198, "step": 306 }, { "epoch": 0.06747252747252747, "grad_norm": 0.27722631806332093, "learning_rate": 6.929497013151307e-07, "loss": 1.5056, "step": 307 }, { "epoch": 0.06769230769230769, "grad_norm": 0.2537696257468583, "learning_rate": 6.929038697354018e-07, "loss": 1.5424, "step": 308 }, { "epoch": 0.06791208791208792, "grad_norm": 0.25301268518741477, "learning_rate": 6.928578913671562e-07, "loss": 1.5102, "step": 309 }, { "epoch": 0.06813186813186813, "grad_norm": 0.35098513331568687, "learning_rate": 6.928117662323135e-07, "loss": 1.4889, "step": 310 }, { "epoch": 0.06835164835164835, "grad_norm": 0.6935830624249227, "learning_rate": 6.927654943528629e-07, "loss": 1.462, "step": 311 }, { "epoch": 0.06857142857142857, "grad_norm": 0.2617719858115404, "learning_rate": 6.92719075750864e-07, "loss": 1.5075, "step": 312 }, { "epoch": 0.06879120879120879, "grad_norm": 0.2502886113171237, "learning_rate": 6.926725104484462e-07, "loss": 1.4976, "step": 313 }, { "epoch": 0.06901098901098901, "grad_norm": 0.26401853218329807, "learning_rate": 6.926257984678086e-07, "loss": 1.5236, "step": 314 }, { "epoch": 0.06923076923076923, "grad_norm": 0.26577736446286343, "learning_rate": 6.925789398312206e-07, "loss": 1.4676, "step": 315 }, { "epoch": 0.06945054945054945, "grad_norm": 0.27638635956465635, "learning_rate": 6.925319345610214e-07, "loss": 1.4941, "step": 316 }, { "epoch": 0.06967032967032967, "grad_norm": 0.29321419443219365, "learning_rate": 6.9248478267962e-07, "loss": 1.5329, "step": 317 }, { "epoch": 0.0698901098901099, "grad_norm": 0.26616921944946587, "learning_rate": 6.924374842094952e-07, "loss": 1.511, "step": 318 }, { "epoch": 0.0701098901098901, "grad_norm": 0.3436957907926617, "learning_rate": 6.92390039173196e-07, "loss": 1.4564, "step": 319 }, { "epoch": 0.07032967032967033, "grad_norm": 0.2827992534635701, "learning_rate": 6.923424475933411e-07, "loss": 1.4817, "step": 320 }, { "epoch": 0.07054945054945055, "grad_norm": 0.2553823593207318, "learning_rate": 6.922947094926189e-07, "loss": 1.5147, "step": 321 }, { "epoch": 0.07076923076923076, "grad_norm": 8.134751377702411, "learning_rate": 6.922468248937879e-07, "loss": 1.4417, "step": 322 }, { "epoch": 0.07098901098901099, "grad_norm": 0.26967755651153225, "learning_rate": 6.921987938196766e-07, "loss": 1.4807, "step": 323 }, { "epoch": 0.07120879120879121, "grad_norm": 3.5383722866014304, "learning_rate": 6.921506162931828e-07, "loss": 1.4948, "step": 324 }, { "epoch": 0.07142857142857142, "grad_norm": 0.2775193997301435, "learning_rate": 6.921022923372744e-07, "loss": 1.5318, "step": 325 }, { "epoch": 0.07164835164835165, "grad_norm": 0.2713185922837225, "learning_rate": 6.920538219749892e-07, "loss": 1.4545, "step": 326 }, { "epoch": 0.07186813186813187, "grad_norm": 0.25034860762400624, "learning_rate": 6.920052052294348e-07, "loss": 1.4889, "step": 327 }, { "epoch": 0.07208791208791208, "grad_norm": 0.26622578973377264, "learning_rate": 6.919564421237884e-07, "loss": 1.5255, "step": 328 }, { "epoch": 0.07230769230769231, "grad_norm": 0.2514601292842668, "learning_rate": 6.919075326812971e-07, "loss": 1.4863, "step": 329 }, { "epoch": 0.07252747252747253, "grad_norm": 0.24965703280073318, "learning_rate": 6.918584769252778e-07, "loss": 1.4945, "step": 330 }, { "epoch": 0.07274725274725274, "grad_norm": 0.26038456628413265, "learning_rate": 6.918092748791168e-07, "loss": 1.5358, "step": 331 }, { "epoch": 0.07296703296703297, "grad_norm": 0.30474496747461144, "learning_rate": 6.917599265662707e-07, "loss": 1.5037, "step": 332 }, { "epoch": 0.0731868131868132, "grad_norm": 0.2704338207014198, "learning_rate": 6.917104320102656e-07, "loss": 1.5216, "step": 333 }, { "epoch": 0.0734065934065934, "grad_norm": 0.2724832631325722, "learning_rate": 6.916607912346972e-07, "loss": 1.5369, "step": 334 }, { "epoch": 0.07362637362637363, "grad_norm": 0.2592734429969151, "learning_rate": 6.916110042632309e-07, "loss": 1.5068, "step": 335 }, { "epoch": 0.07384615384615385, "grad_norm": 0.26663545112069037, "learning_rate": 6.915610711196021e-07, "loss": 1.5046, "step": 336 }, { "epoch": 0.07406593406593406, "grad_norm": 0.2577845369442996, "learning_rate": 6.915109918276155e-07, "loss": 1.4669, "step": 337 }, { "epoch": 0.07428571428571429, "grad_norm": 0.26062999277192267, "learning_rate": 6.914607664111456e-07, "loss": 1.4814, "step": 338 }, { "epoch": 0.0745054945054945, "grad_norm": 0.4468615031479458, "learning_rate": 6.914103948941367e-07, "loss": 1.499, "step": 339 }, { "epoch": 0.07472527472527472, "grad_norm": 0.2556666659553541, "learning_rate": 6.913598773006028e-07, "loss": 1.4797, "step": 340 }, { "epoch": 0.07494505494505495, "grad_norm": 0.26674340467264024, "learning_rate": 6.913092136546271e-07, "loss": 1.4582, "step": 341 }, { "epoch": 0.07516483516483516, "grad_norm": 0.2584509725069121, "learning_rate": 6.912584039803629e-07, "loss": 1.4678, "step": 342 }, { "epoch": 0.07538461538461538, "grad_norm": 0.256005850312379, "learning_rate": 6.91207448302033e-07, "loss": 1.4405, "step": 343 }, { "epoch": 0.07560439560439561, "grad_norm": 0.37548276718717116, "learning_rate": 6.911563466439296e-07, "loss": 1.4787, "step": 344 }, { "epoch": 0.07582417582417582, "grad_norm": 0.25131728724393987, "learning_rate": 6.911050990304147e-07, "loss": 1.4951, "step": 345 }, { "epoch": 0.07604395604395604, "grad_norm": 0.28063430218461827, "learning_rate": 6.9105370548592e-07, "loss": 1.4831, "step": 346 }, { "epoch": 0.07626373626373627, "grad_norm": 0.2671512942188966, "learning_rate": 6.910021660349463e-07, "loss": 1.4849, "step": 347 }, { "epoch": 0.07648351648351648, "grad_norm": 0.25460261140041673, "learning_rate": 6.909504807020644e-07, "loss": 1.4724, "step": 348 }, { "epoch": 0.0767032967032967, "grad_norm": 0.2700059778290506, "learning_rate": 6.908986495119145e-07, "loss": 1.5273, "step": 349 }, { "epoch": 0.07692307692307693, "grad_norm": 0.25645752419926465, "learning_rate": 6.908466724892063e-07, "loss": 1.4982, "step": 350 }, { "epoch": 0.07714285714285714, "grad_norm": 0.25959474579877234, "learning_rate": 6.907945496587191e-07, "loss": 1.4803, "step": 351 }, { "epoch": 0.07736263736263736, "grad_norm": 0.2620479009009138, "learning_rate": 6.907422810453017e-07, "loss": 1.4585, "step": 352 }, { "epoch": 0.07758241758241759, "grad_norm": 0.2527696861757479, "learning_rate": 6.906898666738724e-07, "loss": 1.5057, "step": 353 }, { "epoch": 0.0778021978021978, "grad_norm": 0.2657712564207219, "learning_rate": 6.906373065694189e-07, "loss": 1.5272, "step": 354 }, { "epoch": 0.07802197802197802, "grad_norm": 0.2811559522852078, "learning_rate": 6.905846007569983e-07, "loss": 1.4933, "step": 355 }, { "epoch": 0.07824175824175825, "grad_norm": 0.2508390205169112, "learning_rate": 6.905317492617375e-07, "loss": 1.4854, "step": 356 }, { "epoch": 0.07846153846153846, "grad_norm": 0.26744348357494585, "learning_rate": 6.904787521088325e-07, "loss": 1.5196, "step": 357 }, { "epoch": 0.07868131868131868, "grad_norm": 0.24717717010181148, "learning_rate": 6.90425609323549e-07, "loss": 1.4946, "step": 358 }, { "epoch": 0.0789010989010989, "grad_norm": 0.2624486339764091, "learning_rate": 6.903723209312222e-07, "loss": 1.4666, "step": 359 }, { "epoch": 0.07912087912087912, "grad_norm": 0.2643903815888448, "learning_rate": 6.903188869572561e-07, "loss": 1.459, "step": 360 }, { "epoch": 0.07934065934065934, "grad_norm": 0.25849130265367753, "learning_rate": 6.902653074271248e-07, "loss": 1.4532, "step": 361 }, { "epoch": 0.07956043956043957, "grad_norm": 0.2620958518337077, "learning_rate": 6.902115823663716e-07, "loss": 1.5331, "step": 362 }, { "epoch": 0.07978021978021978, "grad_norm": 0.2655735580121207, "learning_rate": 6.901577118006088e-07, "loss": 1.4818, "step": 363 }, { "epoch": 0.08, "grad_norm": 0.48161085497749717, "learning_rate": 6.901036957555188e-07, "loss": 1.4941, "step": 364 }, { "epoch": 0.08021978021978023, "grad_norm": 0.25350645445740005, "learning_rate": 6.900495342568526e-07, "loss": 1.5258, "step": 365 }, { "epoch": 0.08043956043956044, "grad_norm": 0.275420280044783, "learning_rate": 6.89995227330431e-07, "loss": 1.4297, "step": 366 }, { "epoch": 0.08065934065934066, "grad_norm": 0.262688338922607, "learning_rate": 6.899407750021441e-07, "loss": 1.5446, "step": 367 }, { "epoch": 0.08087912087912089, "grad_norm": 0.25907908219518205, "learning_rate": 6.898861772979509e-07, "loss": 1.5029, "step": 368 }, { "epoch": 0.0810989010989011, "grad_norm": 0.29498380574337696, "learning_rate": 6.898314342438806e-07, "loss": 1.5394, "step": 369 }, { "epoch": 0.08131868131868132, "grad_norm": 0.26017168929918727, "learning_rate": 6.897765458660305e-07, "loss": 1.5135, "step": 370 }, { "epoch": 0.08153846153846153, "grad_norm": 0.27229468992527034, "learning_rate": 6.897215121905683e-07, "loss": 1.4793, "step": 371 }, { "epoch": 0.08175824175824176, "grad_norm": 0.2560454187610276, "learning_rate": 6.896663332437302e-07, "loss": 1.4526, "step": 372 }, { "epoch": 0.08197802197802198, "grad_norm": 0.24895478307108312, "learning_rate": 6.896110090518217e-07, "loss": 1.426, "step": 373 }, { "epoch": 0.08219780219780219, "grad_norm": 0.2577451016138694, "learning_rate": 6.895555396412185e-07, "loss": 1.5201, "step": 374 }, { "epoch": 0.08241758241758242, "grad_norm": 0.24907178992928014, "learning_rate": 6.894999250383641e-07, "loss": 1.4061, "step": 375 }, { "epoch": 0.08263736263736264, "grad_norm": 0.28771569194362406, "learning_rate": 6.894441652697723e-07, "loss": 1.5247, "step": 376 }, { "epoch": 0.08285714285714285, "grad_norm": 0.24045256690137964, "learning_rate": 6.893882603620256e-07, "loss": 1.4698, "step": 377 }, { "epoch": 0.08307692307692308, "grad_norm": 0.2551056428383154, "learning_rate": 6.893322103417756e-07, "loss": 1.4939, "step": 378 }, { "epoch": 0.0832967032967033, "grad_norm": 0.2570930834107385, "learning_rate": 6.892760152357438e-07, "loss": 1.4728, "step": 379 }, { "epoch": 0.08351648351648351, "grad_norm": 0.2566016081577767, "learning_rate": 6.892196750707201e-07, "loss": 1.4977, "step": 380 }, { "epoch": 0.08373626373626374, "grad_norm": 0.25658848478050506, "learning_rate": 6.891631898735637e-07, "loss": 1.4985, "step": 381 }, { "epoch": 0.08395604395604396, "grad_norm": 0.2706460825991662, "learning_rate": 6.891065596712032e-07, "loss": 1.4121, "step": 382 }, { "epoch": 0.08417582417582417, "grad_norm": 0.3365445903118446, "learning_rate": 6.890497844906363e-07, "loss": 1.4926, "step": 383 }, { "epoch": 0.0843956043956044, "grad_norm": 0.2652784557456132, "learning_rate": 6.889928643589294e-07, "loss": 1.5019, "step": 384 }, { "epoch": 0.08461538461538462, "grad_norm": 0.268707146755482, "learning_rate": 6.889357993032188e-07, "loss": 1.5447, "step": 385 }, { "epoch": 0.08483516483516483, "grad_norm": 0.28103291812562514, "learning_rate": 6.888785893507087e-07, "loss": 1.5017, "step": 386 }, { "epoch": 0.08505494505494506, "grad_norm": 0.25304645911943247, "learning_rate": 6.888212345286738e-07, "loss": 1.4316, "step": 387 }, { "epoch": 0.08527472527472528, "grad_norm": 0.25568766747949867, "learning_rate": 6.887637348644568e-07, "loss": 1.4708, "step": 388 }, { "epoch": 0.08549450549450549, "grad_norm": 0.27101945950458134, "learning_rate": 6.887060903854696e-07, "loss": 1.5363, "step": 389 }, { "epoch": 0.08571428571428572, "grad_norm": 0.24996110145343797, "learning_rate": 6.886483011191937e-07, "loss": 1.4691, "step": 390 }, { "epoch": 0.08593406593406594, "grad_norm": 0.2559733439879277, "learning_rate": 6.885903670931792e-07, "loss": 1.4421, "step": 391 }, { "epoch": 0.08615384615384615, "grad_norm": 0.24899329109658566, "learning_rate": 6.885322883350451e-07, "loss": 1.4893, "step": 392 }, { "epoch": 0.08637362637362637, "grad_norm": 0.2549324828735496, "learning_rate": 6.884740648724797e-07, "loss": 1.4932, "step": 393 }, { "epoch": 0.0865934065934066, "grad_norm": 0.25896256441327564, "learning_rate": 6.8841569673324e-07, "loss": 1.5253, "step": 394 }, { "epoch": 0.08681318681318681, "grad_norm": 0.25886906458445796, "learning_rate": 6.883571839451523e-07, "loss": 1.4753, "step": 395 }, { "epoch": 0.08703296703296703, "grad_norm": 0.25318443906922317, "learning_rate": 6.882985265361118e-07, "loss": 1.448, "step": 396 }, { "epoch": 0.08725274725274726, "grad_norm": 0.2512113574590466, "learning_rate": 6.882397245340823e-07, "loss": 1.4394, "step": 397 }, { "epoch": 0.08747252747252747, "grad_norm": 0.26137115546154455, "learning_rate": 6.881807779670967e-07, "loss": 1.4473, "step": 398 }, { "epoch": 0.0876923076923077, "grad_norm": 0.2593229842385263, "learning_rate": 6.88121686863257e-07, "loss": 1.4174, "step": 399 }, { "epoch": 0.08791208791208792, "grad_norm": 0.25189941519194164, "learning_rate": 6.880624512507341e-07, "loss": 1.5331, "step": 400 }, { "epoch": 0.08813186813186813, "grad_norm": 0.26325410031477786, "learning_rate": 6.880030711577677e-07, "loss": 1.5015, "step": 401 }, { "epoch": 0.08835164835164835, "grad_norm": 0.2649848528360352, "learning_rate": 6.87943546612666e-07, "loss": 1.4982, "step": 402 }, { "epoch": 0.08857142857142856, "grad_norm": 0.2504726168721806, "learning_rate": 6.87883877643807e-07, "loss": 1.4844, "step": 403 }, { "epoch": 0.08879120879120879, "grad_norm": 0.28735708709213764, "learning_rate": 6.878240642796363e-07, "loss": 1.4894, "step": 404 }, { "epoch": 0.08901098901098901, "grad_norm": 0.2747049804656409, "learning_rate": 6.877641065486695e-07, "loss": 1.431, "step": 405 }, { "epoch": 0.08923076923076922, "grad_norm": 0.2478607752612214, "learning_rate": 6.877040044794905e-07, "loss": 1.4954, "step": 406 }, { "epoch": 0.08945054945054945, "grad_norm": 0.2624854251600358, "learning_rate": 6.87643758100752e-07, "loss": 1.47, "step": 407 }, { "epoch": 0.08967032967032967, "grad_norm": 0.24467138857512993, "learning_rate": 6.875833674411754e-07, "loss": 1.4333, "step": 408 }, { "epoch": 0.08989010989010988, "grad_norm": 0.3695908168101729, "learning_rate": 6.875228325295512e-07, "loss": 1.5326, "step": 409 }, { "epoch": 0.09010989010989011, "grad_norm": 0.3967591539462583, "learning_rate": 6.874621533947384e-07, "loss": 1.4744, "step": 410 }, { "epoch": 0.09032967032967033, "grad_norm": 0.29037431096431854, "learning_rate": 6.874013300656651e-07, "loss": 1.4998, "step": 411 }, { "epoch": 0.09054945054945054, "grad_norm": 0.26813923373713594, "learning_rate": 6.873403625713275e-07, "loss": 1.4229, "step": 412 }, { "epoch": 0.09076923076923077, "grad_norm": 0.25477108101131696, "learning_rate": 6.872792509407913e-07, "loss": 1.4572, "step": 413 }, { "epoch": 0.090989010989011, "grad_norm": 0.2533629057160468, "learning_rate": 6.872179952031905e-07, "loss": 1.5205, "step": 414 }, { "epoch": 0.0912087912087912, "grad_norm": 0.25823996012280015, "learning_rate": 6.871565953877276e-07, "loss": 1.5479, "step": 415 }, { "epoch": 0.09142857142857143, "grad_norm": 0.2502767404385213, "learning_rate": 6.870950515236742e-07, "loss": 1.4669, "step": 416 }, { "epoch": 0.09164835164835165, "grad_norm": 0.25614080286852986, "learning_rate": 6.870333636403703e-07, "loss": 1.4709, "step": 417 }, { "epoch": 0.09186813186813186, "grad_norm": 0.25423154440920465, "learning_rate": 6.869715317672249e-07, "loss": 1.4614, "step": 418 }, { "epoch": 0.09208791208791209, "grad_norm": 0.2820748505144496, "learning_rate": 6.869095559337152e-07, "loss": 1.4711, "step": 419 }, { "epoch": 0.09230769230769231, "grad_norm": 0.38836643198774956, "learning_rate": 6.868474361693873e-07, "loss": 1.4723, "step": 420 }, { "epoch": 0.09252747252747252, "grad_norm": 0.2622776612555053, "learning_rate": 6.867851725038559e-07, "loss": 1.4931, "step": 421 }, { "epoch": 0.09274725274725275, "grad_norm": 0.2638414257200676, "learning_rate": 6.867227649668043e-07, "loss": 1.4804, "step": 422 }, { "epoch": 0.09296703296703297, "grad_norm": 0.2538323772231728, "learning_rate": 6.866602135879841e-07, "loss": 1.4465, "step": 423 }, { "epoch": 0.09318681318681318, "grad_norm": 0.25220892076182044, "learning_rate": 6.86597518397216e-07, "loss": 1.5155, "step": 424 }, { "epoch": 0.09340659340659341, "grad_norm": 0.25877234719644127, "learning_rate": 6.865346794243889e-07, "loss": 1.4924, "step": 425 }, { "epoch": 0.09362637362637363, "grad_norm": 0.254282371919666, "learning_rate": 6.864716966994603e-07, "loss": 1.5142, "step": 426 }, { "epoch": 0.09384615384615384, "grad_norm": 0.2699039585810529, "learning_rate": 6.864085702524563e-07, "loss": 1.5293, "step": 427 }, { "epoch": 0.09406593406593407, "grad_norm": 0.26421931723195685, "learning_rate": 6.863453001134715e-07, "loss": 1.4517, "step": 428 }, { "epoch": 0.09428571428571429, "grad_norm": 0.2642785692878364, "learning_rate": 6.862818863126689e-07, "loss": 1.4724, "step": 429 }, { "epoch": 0.0945054945054945, "grad_norm": 0.2503119645036744, "learning_rate": 6.862183288802801e-07, "loss": 1.4846, "step": 430 }, { "epoch": 0.09472527472527473, "grad_norm": 0.2540401369329786, "learning_rate": 6.861546278466052e-07, "loss": 1.4668, "step": 431 }, { "epoch": 0.09494505494505495, "grad_norm": 0.2524060265141518, "learning_rate": 6.860907832420127e-07, "loss": 1.4974, "step": 432 }, { "epoch": 0.09516483516483516, "grad_norm": 0.2533493015247761, "learning_rate": 6.860267950969394e-07, "loss": 1.4708, "step": 433 }, { "epoch": 0.09538461538461539, "grad_norm": 0.3374431842558201, "learning_rate": 6.859626634418909e-07, "loss": 1.4815, "step": 434 }, { "epoch": 0.0956043956043956, "grad_norm": 0.2581770852497789, "learning_rate": 6.858983883074408e-07, "loss": 1.4708, "step": 435 }, { "epoch": 0.09582417582417582, "grad_norm": 0.2522574673385085, "learning_rate": 6.858339697242315e-07, "loss": 1.566, "step": 436 }, { "epoch": 0.09604395604395605, "grad_norm": 0.42697293913143436, "learning_rate": 6.857694077229734e-07, "loss": 1.4585, "step": 437 }, { "epoch": 0.09626373626373626, "grad_norm": 0.26177314911984273, "learning_rate": 6.857047023344456e-07, "loss": 1.4889, "step": 438 }, { "epoch": 0.09648351648351648, "grad_norm": 0.2610139100609117, "learning_rate": 6.856398535894953e-07, "loss": 1.4891, "step": 439 }, { "epoch": 0.0967032967032967, "grad_norm": 0.2495838633528667, "learning_rate": 6.855748615190382e-07, "loss": 1.466, "step": 440 }, { "epoch": 0.09692307692307692, "grad_norm": 0.24574730034749598, "learning_rate": 6.855097261540584e-07, "loss": 1.4826, "step": 441 }, { "epoch": 0.09714285714285714, "grad_norm": 0.2763109377977699, "learning_rate": 6.85444447525608e-07, "loss": 1.5334, "step": 442 }, { "epoch": 0.09736263736263737, "grad_norm": 0.28745607094795184, "learning_rate": 6.853790256648076e-07, "loss": 1.4575, "step": 443 }, { "epoch": 0.09758241758241758, "grad_norm": 0.24475023289100628, "learning_rate": 6.853134606028465e-07, "loss": 1.4609, "step": 444 }, { "epoch": 0.0978021978021978, "grad_norm": 0.2504850925462208, "learning_rate": 6.852477523709813e-07, "loss": 1.4629, "step": 445 }, { "epoch": 0.09802197802197803, "grad_norm": 0.24049767078730194, "learning_rate": 6.851819010005377e-07, "loss": 1.4344, "step": 446 }, { "epoch": 0.09824175824175824, "grad_norm": 0.25202798311689356, "learning_rate": 6.851159065229092e-07, "loss": 1.4478, "step": 447 }, { "epoch": 0.09846153846153846, "grad_norm": 0.2735054530824979, "learning_rate": 6.850497689695579e-07, "loss": 1.5428, "step": 448 }, { "epoch": 0.09868131868131869, "grad_norm": 0.26669562560985255, "learning_rate": 6.849834883720137e-07, "loss": 1.5291, "step": 449 }, { "epoch": 0.0989010989010989, "grad_norm": 0.28016580215239933, "learning_rate": 6.849170647618749e-07, "loss": 1.4678, "step": 450 }, { "epoch": 0.09912087912087912, "grad_norm": 0.2477907212755894, "learning_rate": 6.848504981708079e-07, "loss": 1.4914, "step": 451 }, { "epoch": 0.09934065934065935, "grad_norm": 0.29689288202333614, "learning_rate": 6.847837886305475e-07, "loss": 1.4912, "step": 452 }, { "epoch": 0.09956043956043956, "grad_norm": 0.25483214112335445, "learning_rate": 6.847169361728964e-07, "loss": 1.445, "step": 453 }, { "epoch": 0.09978021978021978, "grad_norm": 0.2593492089307687, "learning_rate": 6.846499408297253e-07, "loss": 1.5105, "step": 454 }, { "epoch": 0.1, "grad_norm": 0.2511983653558936, "learning_rate": 6.845828026329734e-07, "loss": 1.4672, "step": 455 }, { "epoch": 0.10021978021978022, "grad_norm": 0.25016435696472505, "learning_rate": 6.845155216146477e-07, "loss": 1.5156, "step": 456 }, { "epoch": 0.10043956043956044, "grad_norm": 0.2491019652028722, "learning_rate": 6.844480978068236e-07, "loss": 1.5027, "step": 457 }, { "epoch": 0.10065934065934067, "grad_norm": 0.2532570455267634, "learning_rate": 6.843805312416443e-07, "loss": 1.4765, "step": 458 }, { "epoch": 0.10087912087912088, "grad_norm": 0.2513622077483454, "learning_rate": 6.84312821951321e-07, "loss": 1.4851, "step": 459 }, { "epoch": 0.1010989010989011, "grad_norm": 0.299920254513179, "learning_rate": 6.842449699681332e-07, "loss": 1.4649, "step": 460 }, { "epoch": 0.10131868131868133, "grad_norm": 0.25464179536962583, "learning_rate": 6.841769753244283e-07, "loss": 1.5068, "step": 461 }, { "epoch": 0.10153846153846154, "grad_norm": 0.27790285861164804, "learning_rate": 6.841088380526217e-07, "loss": 1.4857, "step": 462 }, { "epoch": 0.10175824175824176, "grad_norm": 0.27302067653462836, "learning_rate": 6.840405581851971e-07, "loss": 1.5504, "step": 463 }, { "epoch": 0.10197802197802198, "grad_norm": 0.27219458588952417, "learning_rate": 6.839721357547054e-07, "loss": 1.5409, "step": 464 }, { "epoch": 0.1021978021978022, "grad_norm": 0.24724735358444033, "learning_rate": 6.839035707937662e-07, "loss": 1.4748, "step": 465 }, { "epoch": 0.10241758241758242, "grad_norm": 0.25664320111975036, "learning_rate": 6.838348633350668e-07, "loss": 1.4888, "step": 466 }, { "epoch": 0.10263736263736263, "grad_norm": 0.27292257984434426, "learning_rate": 6.837660134113625e-07, "loss": 1.507, "step": 467 }, { "epoch": 0.10285714285714286, "grad_norm": 0.25023308630494506, "learning_rate": 6.836970210554762e-07, "loss": 1.4296, "step": 468 }, { "epoch": 0.10307692307692308, "grad_norm": 0.25185551572310205, "learning_rate": 6.836278863002994e-07, "loss": 1.4775, "step": 469 }, { "epoch": 0.10329670329670329, "grad_norm": 0.2536090712788748, "learning_rate": 6.835586091787907e-07, "loss": 1.4453, "step": 470 }, { "epoch": 0.10351648351648352, "grad_norm": 0.2598915530603082, "learning_rate": 6.83489189723977e-07, "loss": 1.4433, "step": 471 }, { "epoch": 0.10373626373626374, "grad_norm": 0.3671535169290183, "learning_rate": 6.834196279689531e-07, "loss": 1.4908, "step": 472 }, { "epoch": 0.10395604395604395, "grad_norm": 0.25483324688785886, "learning_rate": 6.833499239468814e-07, "loss": 1.5017, "step": 473 }, { "epoch": 0.10417582417582417, "grad_norm": 0.29012032430794304, "learning_rate": 6.832800776909922e-07, "loss": 1.4441, "step": 474 }, { "epoch": 0.1043956043956044, "grad_norm": 0.2689171943652517, "learning_rate": 6.832100892345839e-07, "loss": 1.504, "step": 475 }, { "epoch": 0.10461538461538461, "grad_norm": 0.25222277720179376, "learning_rate": 6.83139958611022e-07, "loss": 1.4755, "step": 476 }, { "epoch": 0.10483516483516483, "grad_norm": 0.2669393257393482, "learning_rate": 6.830696858537406e-07, "loss": 1.4486, "step": 477 }, { "epoch": 0.10505494505494506, "grad_norm": 0.28494549510793676, "learning_rate": 6.82999270996241e-07, "loss": 1.5624, "step": 478 }, { "epoch": 0.10527472527472527, "grad_norm": 0.253055576786351, "learning_rate": 6.829287140720925e-07, "loss": 1.465, "step": 479 }, { "epoch": 0.1054945054945055, "grad_norm": 0.2727270214833426, "learning_rate": 6.82858015114932e-07, "loss": 1.4986, "step": 480 }, { "epoch": 0.10571428571428572, "grad_norm": 0.2616853189251014, "learning_rate": 6.827871741584641e-07, "loss": 1.4361, "step": 481 }, { "epoch": 0.10593406593406593, "grad_norm": 0.26911600692056026, "learning_rate": 6.827161912364613e-07, "loss": 1.4822, "step": 482 }, { "epoch": 0.10615384615384615, "grad_norm": 0.2506542803052069, "learning_rate": 6.826450663827636e-07, "loss": 1.4666, "step": 483 }, { "epoch": 0.10637362637362638, "grad_norm": 0.2792852427027902, "learning_rate": 6.825737996312786e-07, "loss": 1.4945, "step": 484 }, { "epoch": 0.10659340659340659, "grad_norm": 0.2485019231447162, "learning_rate": 6.825023910159816e-07, "loss": 1.4846, "step": 485 }, { "epoch": 0.10681318681318681, "grad_norm": 0.2844425014032852, "learning_rate": 6.824308405709159e-07, "loss": 1.4581, "step": 486 }, { "epoch": 0.10703296703296704, "grad_norm": 0.24777904804958625, "learning_rate": 6.823591483301917e-07, "loss": 1.4519, "step": 487 }, { "epoch": 0.10725274725274725, "grad_norm": 0.2449120015806373, "learning_rate": 6.822873143279874e-07, "loss": 1.434, "step": 488 }, { "epoch": 0.10747252747252747, "grad_norm": 0.2651587756968944, "learning_rate": 6.822153385985487e-07, "loss": 1.4872, "step": 489 }, { "epoch": 0.1076923076923077, "grad_norm": 0.2647908048951012, "learning_rate": 6.82143221176189e-07, "loss": 1.4457, "step": 490 }, { "epoch": 0.10791208791208791, "grad_norm": 0.25353637829195075, "learning_rate": 6.820709620952893e-07, "loss": 1.4286, "step": 491 }, { "epoch": 0.10813186813186813, "grad_norm": 0.2700291227118415, "learning_rate": 6.819985613902977e-07, "loss": 1.5072, "step": 492 }, { "epoch": 0.10835164835164836, "grad_norm": 0.2505039082672865, "learning_rate": 6.819260190957305e-07, "loss": 1.4755, "step": 493 }, { "epoch": 0.10857142857142857, "grad_norm": 0.2564507325461924, "learning_rate": 6.81853335246171e-07, "loss": 1.464, "step": 494 }, { "epoch": 0.1087912087912088, "grad_norm": 0.2714965940036086, "learning_rate": 6.8178050987627e-07, "loss": 1.4287, "step": 495 }, { "epoch": 0.10901098901098902, "grad_norm": 0.38581712340759083, "learning_rate": 6.817075430207461e-07, "loss": 1.5313, "step": 496 }, { "epoch": 0.10923076923076923, "grad_norm": 0.26480519030228705, "learning_rate": 6.816344347143849e-07, "loss": 1.436, "step": 497 }, { "epoch": 0.10945054945054945, "grad_norm": 0.25247005640015824, "learning_rate": 6.815611849920399e-07, "loss": 1.4192, "step": 498 }, { "epoch": 0.10967032967032966, "grad_norm": 0.24479858855327907, "learning_rate": 6.814877938886317e-07, "loss": 1.5108, "step": 499 }, { "epoch": 0.10989010989010989, "grad_norm": 0.25245686600030026, "learning_rate": 6.814142614391486e-07, "loss": 1.4044, "step": 500 }, { "epoch": 0.11010989010989011, "grad_norm": 0.2779562368830177, "learning_rate": 6.813405876786458e-07, "loss": 1.4583, "step": 501 }, { "epoch": 0.11032967032967032, "grad_norm": 0.2517484372257996, "learning_rate": 6.812667726422462e-07, "loss": 1.511, "step": 502 }, { "epoch": 0.11054945054945055, "grad_norm": 0.24830708842372914, "learning_rate": 6.811928163651402e-07, "loss": 1.5059, "step": 503 }, { "epoch": 0.11076923076923077, "grad_norm": 0.2596011658819083, "learning_rate": 6.811187188825851e-07, "loss": 1.4828, "step": 504 }, { "epoch": 0.11098901098901098, "grad_norm": 0.25591987109679937, "learning_rate": 6.810444802299058e-07, "loss": 1.4684, "step": 505 }, { "epoch": 0.11120879120879121, "grad_norm": 0.24595512205028225, "learning_rate": 6.809701004424947e-07, "loss": 1.4691, "step": 506 }, { "epoch": 0.11142857142857143, "grad_norm": 0.26389582638107206, "learning_rate": 6.808955795558109e-07, "loss": 1.4806, "step": 507 }, { "epoch": 0.11164835164835164, "grad_norm": 0.24042051551057392, "learning_rate": 6.808209176053813e-07, "loss": 1.3643, "step": 508 }, { "epoch": 0.11186813186813187, "grad_norm": 0.23954361062400112, "learning_rate": 6.807461146267999e-07, "loss": 1.409, "step": 509 }, { "epoch": 0.11208791208791209, "grad_norm": 0.3501248630351167, "learning_rate": 6.806711706557276e-07, "loss": 1.5127, "step": 510 }, { "epoch": 0.1123076923076923, "grad_norm": 0.255586536969081, "learning_rate": 6.805960857278933e-07, "loss": 1.5027, "step": 511 }, { "epoch": 0.11252747252747253, "grad_norm": 0.2604156168407272, "learning_rate": 6.805208598790921e-07, "loss": 1.4921, "step": 512 }, { "epoch": 0.11274725274725275, "grad_norm": 0.25777522687505616, "learning_rate": 6.80445493145187e-07, "loss": 1.4881, "step": 513 }, { "epoch": 0.11296703296703296, "grad_norm": 0.2545150835243321, "learning_rate": 6.803699855621081e-07, "loss": 1.511, "step": 514 }, { "epoch": 0.11318681318681319, "grad_norm": 0.2787260228637967, "learning_rate": 6.802943371658523e-07, "loss": 1.4887, "step": 515 }, { "epoch": 0.11340659340659341, "grad_norm": 0.2783140483999986, "learning_rate": 6.802185479924839e-07, "loss": 1.5523, "step": 516 }, { "epoch": 0.11362637362637362, "grad_norm": 0.2536161073592539, "learning_rate": 6.801426180781342e-07, "loss": 1.4825, "step": 517 }, { "epoch": 0.11384615384615385, "grad_norm": 0.2605049130206773, "learning_rate": 6.800665474590016e-07, "loss": 1.4176, "step": 518 }, { "epoch": 0.11406593406593407, "grad_norm": 0.24106833753743503, "learning_rate": 6.799903361713517e-07, "loss": 1.4833, "step": 519 }, { "epoch": 0.11428571428571428, "grad_norm": 0.25180718354253395, "learning_rate": 6.799139842515172e-07, "loss": 1.5119, "step": 520 }, { "epoch": 0.1145054945054945, "grad_norm": 0.24719275326999504, "learning_rate": 6.798374917358974e-07, "loss": 1.4587, "step": 521 }, { "epoch": 0.11472527472527473, "grad_norm": 0.374620782444536, "learning_rate": 6.797608586609593e-07, "loss": 1.4872, "step": 522 }, { "epoch": 0.11494505494505494, "grad_norm": 0.24043316281503388, "learning_rate": 6.796840850632363e-07, "loss": 1.3724, "step": 523 }, { "epoch": 0.11516483516483517, "grad_norm": 0.24894792679030192, "learning_rate": 6.796071709793292e-07, "loss": 1.4977, "step": 524 }, { "epoch": 0.11538461538461539, "grad_norm": 0.25757890500804637, "learning_rate": 6.795301164459057e-07, "loss": 1.585, "step": 525 }, { "epoch": 0.1156043956043956, "grad_norm": 0.2680353943018703, "learning_rate": 6.794529214997e-07, "loss": 1.4679, "step": 526 }, { "epoch": 0.11582417582417583, "grad_norm": 0.28430578711166316, "learning_rate": 6.79375586177514e-07, "loss": 1.4856, "step": 527 }, { "epoch": 0.11604395604395605, "grad_norm": 0.24729281847640566, "learning_rate": 6.792981105162161e-07, "loss": 1.4634, "step": 528 }, { "epoch": 0.11626373626373626, "grad_norm": 0.26446540265063584, "learning_rate": 6.792204945527416e-07, "loss": 1.5078, "step": 529 }, { "epoch": 0.11648351648351649, "grad_norm": 0.2562857706514639, "learning_rate": 6.791427383240926e-07, "loss": 1.4887, "step": 530 }, { "epoch": 0.1167032967032967, "grad_norm": 0.2699076739620769, "learning_rate": 6.790648418673385e-07, "loss": 1.5422, "step": 531 }, { "epoch": 0.11692307692307692, "grad_norm": 0.24833698988720773, "learning_rate": 6.78986805219615e-07, "loss": 1.4167, "step": 532 }, { "epoch": 0.11714285714285715, "grad_norm": 0.2599219400232677, "learning_rate": 6.789086284181249e-07, "loss": 1.473, "step": 533 }, { "epoch": 0.11736263736263736, "grad_norm": 0.25646038902365914, "learning_rate": 6.788303115001379e-07, "loss": 1.4754, "step": 534 }, { "epoch": 0.11758241758241758, "grad_norm": 0.2570833200962951, "learning_rate": 6.787518545029905e-07, "loss": 1.5218, "step": 535 }, { "epoch": 0.1178021978021978, "grad_norm": 0.24369640759526123, "learning_rate": 6.786732574640857e-07, "loss": 1.4993, "step": 536 }, { "epoch": 0.11802197802197802, "grad_norm": 0.2996102711505013, "learning_rate": 6.785945204208935e-07, "loss": 1.4707, "step": 537 }, { "epoch": 0.11824175824175824, "grad_norm": 4.836968981601045, "learning_rate": 6.785156434109508e-07, "loss": 1.4633, "step": 538 }, { "epoch": 0.11846153846153847, "grad_norm": 0.2521721845885586, "learning_rate": 6.784366264718607e-07, "loss": 1.495, "step": 539 }, { "epoch": 0.11868131868131868, "grad_norm": 0.24759360173294137, "learning_rate": 6.783574696412935e-07, "loss": 1.4902, "step": 540 }, { "epoch": 0.1189010989010989, "grad_norm": 0.2506082057658365, "learning_rate": 6.782781729569861e-07, "loss": 1.4489, "step": 541 }, { "epoch": 0.11912087912087913, "grad_norm": 0.247922029109496, "learning_rate": 6.781987364567418e-07, "loss": 1.5067, "step": 542 }, { "epoch": 0.11934065934065934, "grad_norm": 0.2596877891802961, "learning_rate": 6.781191601784308e-07, "loss": 1.5452, "step": 543 }, { "epoch": 0.11956043956043956, "grad_norm": 0.26471453771120845, "learning_rate": 6.780394441599901e-07, "loss": 1.4403, "step": 544 }, { "epoch": 0.11978021978021978, "grad_norm": 0.26119129531553703, "learning_rate": 6.779595884394228e-07, "loss": 1.4563, "step": 545 }, { "epoch": 0.12, "grad_norm": 0.24934977562527547, "learning_rate": 6.778795930547992e-07, "loss": 1.4777, "step": 546 }, { "epoch": 0.12021978021978022, "grad_norm": 0.32445647857339116, "learning_rate": 6.777994580442556e-07, "loss": 1.4674, "step": 547 }, { "epoch": 0.12043956043956044, "grad_norm": 0.2735917636144104, "learning_rate": 6.777191834459955e-07, "loss": 1.4359, "step": 548 }, { "epoch": 0.12065934065934066, "grad_norm": 0.25710108499917056, "learning_rate": 6.776387692982883e-07, "loss": 1.4893, "step": 549 }, { "epoch": 0.12087912087912088, "grad_norm": 0.2568323094996253, "learning_rate": 6.775582156394702e-07, "loss": 1.4989, "step": 550 }, { "epoch": 0.1210989010989011, "grad_norm": 0.2503252992520465, "learning_rate": 6.774775225079441e-07, "loss": 1.5192, "step": 551 }, { "epoch": 0.12131868131868132, "grad_norm": 0.25233039820591163, "learning_rate": 6.773966899421793e-07, "loss": 1.4429, "step": 552 }, { "epoch": 0.12153846153846154, "grad_norm": 0.26672210036338045, "learning_rate": 6.773157179807115e-07, "loss": 1.4522, "step": 553 }, { "epoch": 0.12175824175824176, "grad_norm": 0.27208276025497646, "learning_rate": 6.772346066621426e-07, "loss": 1.4366, "step": 554 }, { "epoch": 0.12197802197802197, "grad_norm": 0.2494064492885206, "learning_rate": 6.771533560251415e-07, "loss": 1.5145, "step": 555 }, { "epoch": 0.1221978021978022, "grad_norm": 0.26432423181770315, "learning_rate": 6.77071966108443e-07, "loss": 1.4245, "step": 556 }, { "epoch": 0.12241758241758242, "grad_norm": 0.26234707648890465, "learning_rate": 6.769904369508486e-07, "loss": 1.4391, "step": 557 }, { "epoch": 0.12263736263736263, "grad_norm": 0.26429009678438037, "learning_rate": 6.76908768591226e-07, "loss": 1.4905, "step": 558 }, { "epoch": 0.12285714285714286, "grad_norm": 0.32733980284574415, "learning_rate": 6.768269610685095e-07, "loss": 1.5061, "step": 559 }, { "epoch": 0.12307692307692308, "grad_norm": 0.2604183503592994, "learning_rate": 6.767450144216995e-07, "loss": 1.4344, "step": 560 }, { "epoch": 0.1232967032967033, "grad_norm": 0.2545024083799796, "learning_rate": 6.76662928689863e-07, "loss": 1.4102, "step": 561 }, { "epoch": 0.12351648351648352, "grad_norm": 0.2529655914581244, "learning_rate": 6.765807039121328e-07, "loss": 1.4566, "step": 562 }, { "epoch": 0.12373626373626373, "grad_norm": 0.25955218773939964, "learning_rate": 6.764983401277088e-07, "loss": 1.4717, "step": 563 }, { "epoch": 0.12395604395604395, "grad_norm": 0.2560272633314233, "learning_rate": 6.764158373758561e-07, "loss": 1.4886, "step": 564 }, { "epoch": 0.12417582417582418, "grad_norm": 0.24216328593221215, "learning_rate": 6.76333195695907e-07, "loss": 1.4722, "step": 565 }, { "epoch": 0.12439560439560439, "grad_norm": 0.27914603169370267, "learning_rate": 6.762504151272598e-07, "loss": 1.5085, "step": 566 }, { "epoch": 0.12461538461538461, "grad_norm": 0.2672713341139715, "learning_rate": 6.761674957093786e-07, "loss": 1.4944, "step": 567 }, { "epoch": 0.12483516483516484, "grad_norm": 0.2601506203345243, "learning_rate": 6.760844374817941e-07, "loss": 1.5202, "step": 568 }, { "epoch": 0.12505494505494505, "grad_norm": 0.2515362004640614, "learning_rate": 6.76001240484103e-07, "loss": 1.4805, "step": 569 }, { "epoch": 0.12527472527472527, "grad_norm": 0.2608813528356316, "learning_rate": 6.759179047559682e-07, "loss": 1.486, "step": 570 }, { "epoch": 0.1254945054945055, "grad_norm": 0.2589014573963635, "learning_rate": 6.758344303371189e-07, "loss": 1.4856, "step": 571 }, { "epoch": 0.12571428571428572, "grad_norm": 0.24920654811289786, "learning_rate": 6.757508172673502e-07, "loss": 1.4637, "step": 572 }, { "epoch": 0.12593406593406595, "grad_norm": 0.2549398474073354, "learning_rate": 6.756670655865232e-07, "loss": 1.5162, "step": 573 }, { "epoch": 0.12615384615384614, "grad_norm": 0.24662331267290583, "learning_rate": 6.755831753345654e-07, "loss": 1.5235, "step": 574 }, { "epoch": 0.12637362637362637, "grad_norm": 0.26311319396900035, "learning_rate": 6.754991465514703e-07, "loss": 1.4975, "step": 575 }, { "epoch": 0.1265934065934066, "grad_norm": 0.27166576107579626, "learning_rate": 6.754149792772971e-07, "loss": 1.447, "step": 576 }, { "epoch": 0.12681318681318682, "grad_norm": 0.2818718536050693, "learning_rate": 6.753306735521715e-07, "loss": 1.4835, "step": 577 }, { "epoch": 0.12703296703296704, "grad_norm": 0.2538419957229311, "learning_rate": 6.752462294162848e-07, "loss": 1.4437, "step": 578 }, { "epoch": 0.12725274725274724, "grad_norm": 0.25876807777467264, "learning_rate": 6.751616469098947e-07, "loss": 1.4824, "step": 579 }, { "epoch": 0.12747252747252746, "grad_norm": 0.2555996009280035, "learning_rate": 6.750769260733244e-07, "loss": 1.5044, "step": 580 }, { "epoch": 0.1276923076923077, "grad_norm": 0.2573739323380264, "learning_rate": 6.749920669469635e-07, "loss": 1.4234, "step": 581 }, { "epoch": 0.1279120879120879, "grad_norm": 0.2852857286268162, "learning_rate": 6.749070695712671e-07, "loss": 1.4695, "step": 582 }, { "epoch": 0.12813186813186814, "grad_norm": 0.254237091514892, "learning_rate": 6.748219339867565e-07, "loss": 1.5095, "step": 583 }, { "epoch": 0.12835164835164836, "grad_norm": 0.25060909727802133, "learning_rate": 6.747366602340187e-07, "loss": 1.4565, "step": 584 }, { "epoch": 0.12857142857142856, "grad_norm": 0.26823526977337603, "learning_rate": 6.746512483537069e-07, "loss": 1.4477, "step": 585 }, { "epoch": 0.12879120879120878, "grad_norm": 0.2575369033243557, "learning_rate": 6.745656983865397e-07, "loss": 1.5267, "step": 586 }, { "epoch": 0.129010989010989, "grad_norm": 0.25692016695936587, "learning_rate": 6.74480010373302e-07, "loss": 1.6099, "step": 587 }, { "epoch": 0.12923076923076923, "grad_norm": 0.24798901677537402, "learning_rate": 6.74394184354844e-07, "loss": 1.4586, "step": 588 }, { "epoch": 0.12945054945054946, "grad_norm": 0.24918359117104827, "learning_rate": 6.743082203720821e-07, "loss": 1.4707, "step": 589 }, { "epoch": 0.12967032967032968, "grad_norm": 0.3580205505412506, "learning_rate": 6.742221184659983e-07, "loss": 1.4628, "step": 590 }, { "epoch": 0.12989010989010988, "grad_norm": 0.2482170366026315, "learning_rate": 6.741358786776405e-07, "loss": 1.4678, "step": 591 }, { "epoch": 0.1301098901098901, "grad_norm": 0.2621541503303289, "learning_rate": 6.740495010481218e-07, "loss": 1.4722, "step": 592 }, { "epoch": 0.13032967032967033, "grad_norm": 0.2609206727854163, "learning_rate": 6.739629856186219e-07, "loss": 1.4373, "step": 593 }, { "epoch": 0.13054945054945055, "grad_norm": 0.24730244786886074, "learning_rate": 6.738763324303856e-07, "loss": 1.5107, "step": 594 }, { "epoch": 0.13076923076923078, "grad_norm": 0.25360527024214685, "learning_rate": 6.737895415247234e-07, "loss": 1.4751, "step": 595 }, { "epoch": 0.130989010989011, "grad_norm": 0.25135232062801177, "learning_rate": 6.737026129430117e-07, "loss": 1.4944, "step": 596 }, { "epoch": 0.1312087912087912, "grad_norm": 0.24739770278957182, "learning_rate": 6.736155467266922e-07, "loss": 1.4992, "step": 597 }, { "epoch": 0.13142857142857142, "grad_norm": 0.248993357785481, "learning_rate": 6.735283429172724e-07, "loss": 1.4297, "step": 598 }, { "epoch": 0.13164835164835165, "grad_norm": 0.2577381150715164, "learning_rate": 6.734410015563257e-07, "loss": 1.4672, "step": 599 }, { "epoch": 0.13186813186813187, "grad_norm": 0.25352728890385245, "learning_rate": 6.733535226854904e-07, "loss": 1.4939, "step": 600 }, { "epoch": 0.1320879120879121, "grad_norm": 0.28282549033235815, "learning_rate": 6.732659063464709e-07, "loss": 1.4625, "step": 601 }, { "epoch": 0.13230769230769232, "grad_norm": 0.25118396488694805, "learning_rate": 6.73178152581037e-07, "loss": 1.4724, "step": 602 }, { "epoch": 0.13252747252747252, "grad_norm": 0.2643496786353885, "learning_rate": 6.730902614310239e-07, "loss": 1.4778, "step": 603 }, { "epoch": 0.13274725274725274, "grad_norm": 0.2750885070990114, "learning_rate": 6.730022329383325e-07, "loss": 1.4611, "step": 604 }, { "epoch": 0.13296703296703297, "grad_norm": 0.2458917515347148, "learning_rate": 6.72914067144929e-07, "loss": 1.4808, "step": 605 }, { "epoch": 0.1331868131868132, "grad_norm": 0.25874215122652794, "learning_rate": 6.72825764092845e-07, "loss": 1.4708, "step": 606 }, { "epoch": 0.13340659340659342, "grad_norm": 0.3584027575706209, "learning_rate": 6.727373238241778e-07, "loss": 1.5036, "step": 607 }, { "epoch": 0.13362637362637364, "grad_norm": 0.24828060838826704, "learning_rate": 6.726487463810898e-07, "loss": 1.4932, "step": 608 }, { "epoch": 0.13384615384615384, "grad_norm": 0.2464631893675043, "learning_rate": 6.725600318058091e-07, "loss": 1.4668, "step": 609 }, { "epoch": 0.13406593406593406, "grad_norm": 0.24528219819876454, "learning_rate": 6.724711801406289e-07, "loss": 1.4714, "step": 610 }, { "epoch": 0.13428571428571429, "grad_norm": 0.2562755203983173, "learning_rate": 6.723821914279081e-07, "loss": 1.4575, "step": 611 }, { "epoch": 0.1345054945054945, "grad_norm": 0.25387672406745615, "learning_rate": 6.722930657100705e-07, "loss": 1.4835, "step": 612 }, { "epoch": 0.13472527472527474, "grad_norm": 0.2345831827522563, "learning_rate": 6.722038030296056e-07, "loss": 1.4116, "step": 613 }, { "epoch": 0.13494505494505493, "grad_norm": 0.2495181884989296, "learning_rate": 6.721144034290679e-07, "loss": 1.4706, "step": 614 }, { "epoch": 0.13516483516483516, "grad_norm": 0.25425273610397525, "learning_rate": 6.720248669510775e-07, "loss": 1.4549, "step": 615 }, { "epoch": 0.13538461538461538, "grad_norm": 0.25143139259916875, "learning_rate": 6.719351936383192e-07, "loss": 1.4558, "step": 616 }, { "epoch": 0.1356043956043956, "grad_norm": 0.25472415540476356, "learning_rate": 6.718453835335438e-07, "loss": 1.5004, "step": 617 }, { "epoch": 0.13582417582417583, "grad_norm": 0.2622890240277487, "learning_rate": 6.717554366795667e-07, "loss": 1.4762, "step": 618 }, { "epoch": 0.13604395604395605, "grad_norm": 0.24338816027078966, "learning_rate": 6.716653531192689e-07, "loss": 1.4259, "step": 619 }, { "epoch": 0.13626373626373625, "grad_norm": 1.9265731543962334, "learning_rate": 6.71575132895596e-07, "loss": 1.4509, "step": 620 }, { "epoch": 0.13648351648351648, "grad_norm": 0.2442154939987973, "learning_rate": 6.714847760515595e-07, "loss": 1.5357, "step": 621 }, { "epoch": 0.1367032967032967, "grad_norm": 0.25491896578455076, "learning_rate": 6.713942826302354e-07, "loss": 1.4408, "step": 622 }, { "epoch": 0.13692307692307693, "grad_norm": 0.25440094853557665, "learning_rate": 6.713036526747652e-07, "loss": 1.4512, "step": 623 }, { "epoch": 0.13714285714285715, "grad_norm": 0.25297524476493, "learning_rate": 6.712128862283555e-07, "loss": 1.4955, "step": 624 }, { "epoch": 0.13736263736263737, "grad_norm": 0.25034624748662054, "learning_rate": 6.711219833342775e-07, "loss": 1.5209, "step": 625 }, { "epoch": 0.13758241758241757, "grad_norm": 0.30715221340878235, "learning_rate": 6.71030944035868e-07, "loss": 1.4604, "step": 626 }, { "epoch": 0.1378021978021978, "grad_norm": 0.24297220322870838, "learning_rate": 6.709397683765286e-07, "loss": 1.4935, "step": 627 }, { "epoch": 0.13802197802197802, "grad_norm": 0.25807837266666944, "learning_rate": 6.708484563997259e-07, "loss": 1.4928, "step": 628 }, { "epoch": 0.13824175824175824, "grad_norm": 0.25352319005316637, "learning_rate": 6.707570081489913e-07, "loss": 1.4607, "step": 629 }, { "epoch": 0.13846153846153847, "grad_norm": 0.27527236640863634, "learning_rate": 6.706654236679217e-07, "loss": 1.4641, "step": 630 }, { "epoch": 0.1386813186813187, "grad_norm": 0.30420842682976595, "learning_rate": 6.705737030001786e-07, "loss": 1.509, "step": 631 }, { "epoch": 0.1389010989010989, "grad_norm": 0.2635889063342168, "learning_rate": 6.704818461894882e-07, "loss": 1.5144, "step": 632 }, { "epoch": 0.13912087912087912, "grad_norm": 0.27409945062160546, "learning_rate": 6.703898532796419e-07, "loss": 1.5491, "step": 633 }, { "epoch": 0.13934065934065934, "grad_norm": 0.2567530630574621, "learning_rate": 6.702977243144962e-07, "loss": 1.5287, "step": 634 }, { "epoch": 0.13956043956043956, "grad_norm": 0.2605688055972312, "learning_rate": 6.702054593379717e-07, "loss": 1.4475, "step": 635 }, { "epoch": 0.1397802197802198, "grad_norm": 0.3781097534457521, "learning_rate": 6.701130583940549e-07, "loss": 1.4951, "step": 636 }, { "epoch": 0.14, "grad_norm": 0.2885118240134864, "learning_rate": 6.700205215267962e-07, "loss": 1.4084, "step": 637 }, { "epoch": 0.1402197802197802, "grad_norm": 0.27675262439117465, "learning_rate": 6.699278487803111e-07, "loss": 1.4321, "step": 638 }, { "epoch": 0.14043956043956043, "grad_norm": 0.30865092076123496, "learning_rate": 6.698350401987802e-07, "loss": 1.4863, "step": 639 }, { "epoch": 0.14065934065934066, "grad_norm": 0.2591974374403393, "learning_rate": 6.697420958264484e-07, "loss": 1.462, "step": 640 }, { "epoch": 0.14087912087912088, "grad_norm": 0.26120262286985, "learning_rate": 6.696490157076255e-07, "loss": 1.5168, "step": 641 }, { "epoch": 0.1410989010989011, "grad_norm": 0.25326512305755383, "learning_rate": 6.695557998866861e-07, "loss": 1.4377, "step": 642 }, { "epoch": 0.1413186813186813, "grad_norm": 0.2521425114224541, "learning_rate": 6.694624484080696e-07, "loss": 1.4993, "step": 643 }, { "epoch": 0.14153846153846153, "grad_norm": 0.2535659939459957, "learning_rate": 6.693689613162797e-07, "loss": 1.4512, "step": 644 }, { "epoch": 0.14175824175824175, "grad_norm": 0.2561199850385403, "learning_rate": 6.692753386558849e-07, "loss": 1.6039, "step": 645 }, { "epoch": 0.14197802197802198, "grad_norm": 0.255464856674977, "learning_rate": 6.691815804715187e-07, "loss": 1.4365, "step": 646 }, { "epoch": 0.1421978021978022, "grad_norm": 0.25958850730486166, "learning_rate": 6.690876868078785e-07, "loss": 1.5047, "step": 647 }, { "epoch": 0.14241758241758243, "grad_norm": 0.2500783111746471, "learning_rate": 6.689936577097269e-07, "loss": 1.4696, "step": 648 }, { "epoch": 0.14263736263736262, "grad_norm": 0.25760812371648933, "learning_rate": 6.68899493221891e-07, "loss": 1.4919, "step": 649 }, { "epoch": 0.14285714285714285, "grad_norm": 0.26538763002665106, "learning_rate": 6.68805193389262e-07, "loss": 1.4507, "step": 650 }, { "epoch": 0.14307692307692307, "grad_norm": 0.26528655371984755, "learning_rate": 6.68710758256796e-07, "loss": 1.4743, "step": 651 }, { "epoch": 0.1432967032967033, "grad_norm": 0.25406466482981116, "learning_rate": 6.686161878695137e-07, "loss": 1.5057, "step": 652 }, { "epoch": 0.14351648351648352, "grad_norm": 0.247850890598129, "learning_rate": 6.685214822724999e-07, "loss": 1.5243, "step": 653 }, { "epoch": 0.14373626373626375, "grad_norm": 0.2750472401396167, "learning_rate": 6.684266415109042e-07, "loss": 1.476, "step": 654 }, { "epoch": 0.14395604395604394, "grad_norm": 0.2578699382533185, "learning_rate": 6.683316656299404e-07, "loss": 1.5319, "step": 655 }, { "epoch": 0.14417582417582417, "grad_norm": 0.2527601291042012, "learning_rate": 6.682365546748869e-07, "loss": 1.4707, "step": 656 }, { "epoch": 0.1443956043956044, "grad_norm": 0.254402693589395, "learning_rate": 6.681413086910863e-07, "loss": 1.4655, "step": 657 }, { "epoch": 0.14461538461538462, "grad_norm": 0.2578331305103979, "learning_rate": 6.680459277239458e-07, "loss": 1.5281, "step": 658 }, { "epoch": 0.14483516483516484, "grad_norm": 0.2459575646762834, "learning_rate": 6.679504118189368e-07, "loss": 1.4185, "step": 659 }, { "epoch": 0.14505494505494507, "grad_norm": 0.24860288790531948, "learning_rate": 6.678547610215949e-07, "loss": 1.4984, "step": 660 }, { "epoch": 0.14527472527472526, "grad_norm": 0.3380853140654683, "learning_rate": 6.677589753775204e-07, "loss": 1.4717, "step": 661 }, { "epoch": 0.1454945054945055, "grad_norm": 0.27199503422431626, "learning_rate": 6.676630549323777e-07, "loss": 1.4495, "step": 662 }, { "epoch": 0.1457142857142857, "grad_norm": 0.25037691212991126, "learning_rate": 6.67566999731895e-07, "loss": 1.4454, "step": 663 }, { "epoch": 0.14593406593406594, "grad_norm": 0.25256060774128214, "learning_rate": 6.674708098218657e-07, "loss": 1.5024, "step": 664 }, { "epoch": 0.14615384615384616, "grad_norm": 0.2538059396923407, "learning_rate": 6.673744852481466e-07, "loss": 1.4831, "step": 665 }, { "epoch": 0.1463736263736264, "grad_norm": 2.7562989218274083, "learning_rate": 6.672780260566591e-07, "loss": 1.4373, "step": 666 }, { "epoch": 0.14659340659340658, "grad_norm": 0.2552285623766303, "learning_rate": 6.671814322933885e-07, "loss": 1.4442, "step": 667 }, { "epoch": 0.1468131868131868, "grad_norm": 0.26199679489610755, "learning_rate": 6.670847040043846e-07, "loss": 1.4983, "step": 668 }, { "epoch": 0.14703296703296703, "grad_norm": 0.25533200148769086, "learning_rate": 6.66987841235761e-07, "loss": 1.4984, "step": 669 }, { "epoch": 0.14725274725274726, "grad_norm": 0.2481720448897788, "learning_rate": 6.668908440336957e-07, "loss": 1.4718, "step": 670 }, { "epoch": 0.14747252747252748, "grad_norm": 0.25046968974867, "learning_rate": 6.667937124444304e-07, "loss": 1.5561, "step": 671 }, { "epoch": 0.1476923076923077, "grad_norm": 0.25262852955013854, "learning_rate": 6.666964465142714e-07, "loss": 1.4741, "step": 672 }, { "epoch": 0.1479120879120879, "grad_norm": 0.2567999837458799, "learning_rate": 6.665990462895887e-07, "loss": 1.4582, "step": 673 }, { "epoch": 0.14813186813186813, "grad_norm": 0.2782472953324953, "learning_rate": 6.665015118168163e-07, "loss": 1.4744, "step": 674 }, { "epoch": 0.14835164835164835, "grad_norm": 0.25883556005102304, "learning_rate": 6.664038431424524e-07, "loss": 1.4983, "step": 675 }, { "epoch": 0.14857142857142858, "grad_norm": 0.4116410723136531, "learning_rate": 6.663060403130589e-07, "loss": 1.4946, "step": 676 }, { "epoch": 0.1487912087912088, "grad_norm": 0.2521008943237067, "learning_rate": 6.662081033752619e-07, "loss": 1.4725, "step": 677 }, { "epoch": 0.149010989010989, "grad_norm": 0.3065605400222554, "learning_rate": 6.661100323757514e-07, "loss": 1.4687, "step": 678 }, { "epoch": 0.14923076923076922, "grad_norm": 0.3131134849596052, "learning_rate": 6.660118273612812e-07, "loss": 1.5064, "step": 679 }, { "epoch": 0.14945054945054945, "grad_norm": 0.24729196101336837, "learning_rate": 6.659134883786692e-07, "loss": 1.3626, "step": 680 }, { "epoch": 0.14967032967032967, "grad_norm": 0.26349791006173723, "learning_rate": 6.658150154747968e-07, "loss": 1.4969, "step": 681 }, { "epoch": 0.1498901098901099, "grad_norm": 0.2494494282378353, "learning_rate": 6.657164086966096e-07, "loss": 1.4553, "step": 682 }, { "epoch": 0.15010989010989012, "grad_norm": 0.25362773053729387, "learning_rate": 6.656176680911169e-07, "loss": 1.5267, "step": 683 }, { "epoch": 0.15032967032967032, "grad_norm": 0.24941857861908245, "learning_rate": 6.655187937053918e-07, "loss": 1.516, "step": 684 }, { "epoch": 0.15054945054945054, "grad_norm": 0.2523332345790184, "learning_rate": 6.65419785586571e-07, "loss": 1.451, "step": 685 }, { "epoch": 0.15076923076923077, "grad_norm": 0.44281741281359693, "learning_rate": 6.653206437818554e-07, "loss": 1.4983, "step": 686 }, { "epoch": 0.150989010989011, "grad_norm": 0.2580148489364453, "learning_rate": 6.652213683385091e-07, "loss": 1.4659, "step": 687 }, { "epoch": 0.15120879120879122, "grad_norm": 0.27388267317813153, "learning_rate": 6.651219593038603e-07, "loss": 1.4542, "step": 688 }, { "epoch": 0.15142857142857144, "grad_norm": 0.27601761546622605, "learning_rate": 6.650224167253007e-07, "loss": 1.5113, "step": 689 }, { "epoch": 0.15164835164835164, "grad_norm": 0.2540050759966249, "learning_rate": 6.64922740650286e-07, "loss": 1.4595, "step": 690 }, { "epoch": 0.15186813186813186, "grad_norm": 0.2536128366598746, "learning_rate": 6.648229311263348e-07, "loss": 1.461, "step": 691 }, { "epoch": 0.15208791208791209, "grad_norm": 1.173833552368354, "learning_rate": 6.647229882010302e-07, "loss": 1.5149, "step": 692 }, { "epoch": 0.1523076923076923, "grad_norm": 0.2524183503518466, "learning_rate": 6.646229119220181e-07, "loss": 1.463, "step": 693 }, { "epoch": 0.15252747252747254, "grad_norm": 0.25023878320877235, "learning_rate": 6.645227023370085e-07, "loss": 1.4703, "step": 694 }, { "epoch": 0.15274725274725276, "grad_norm": 0.3083160506620809, "learning_rate": 6.644223594937749e-07, "loss": 1.4707, "step": 695 }, { "epoch": 0.15296703296703296, "grad_norm": 0.24693806141253166, "learning_rate": 6.643218834401542e-07, "loss": 1.4681, "step": 696 }, { "epoch": 0.15318681318681318, "grad_norm": 0.24889949450072432, "learning_rate": 6.642212742240469e-07, "loss": 1.4359, "step": 697 }, { "epoch": 0.1534065934065934, "grad_norm": 0.252501631871603, "learning_rate": 6.641205318934168e-07, "loss": 1.463, "step": 698 }, { "epoch": 0.15362637362637363, "grad_norm": 0.2599127968116426, "learning_rate": 6.640196564962912e-07, "loss": 1.4921, "step": 699 }, { "epoch": 0.15384615384615385, "grad_norm": 0.26192494393437693, "learning_rate": 6.639186480807611e-07, "loss": 1.5068, "step": 700 }, { "epoch": 0.15406593406593408, "grad_norm": 0.25497868036668103, "learning_rate": 6.638175066949808e-07, "loss": 1.4696, "step": 701 }, { "epoch": 0.15428571428571428, "grad_norm": 0.2460989820027386, "learning_rate": 6.637162323871675e-07, "loss": 1.4699, "step": 702 }, { "epoch": 0.1545054945054945, "grad_norm": 0.24982022505936596, "learning_rate": 6.636148252056028e-07, "loss": 1.4955, "step": 703 }, { "epoch": 0.15472527472527473, "grad_norm": 0.27317865675245767, "learning_rate": 6.635132851986306e-07, "loss": 1.53, "step": 704 }, { "epoch": 0.15494505494505495, "grad_norm": 0.25048302165674363, "learning_rate": 6.634116124146587e-07, "loss": 1.5136, "step": 705 }, { "epoch": 0.15516483516483517, "grad_norm": 0.26192135430394875, "learning_rate": 6.633098069021581e-07, "loss": 1.4924, "step": 706 }, { "epoch": 0.15538461538461537, "grad_norm": 0.281509310709272, "learning_rate": 6.632078687096632e-07, "loss": 1.4446, "step": 707 }, { "epoch": 0.1556043956043956, "grad_norm": 0.2794053992288377, "learning_rate": 6.631057978857711e-07, "loss": 1.4437, "step": 708 }, { "epoch": 0.15582417582417582, "grad_norm": 0.2520586104467259, "learning_rate": 6.630035944791427e-07, "loss": 1.4722, "step": 709 }, { "epoch": 0.15604395604395604, "grad_norm": 0.24110474613088786, "learning_rate": 6.62901258538502e-07, "loss": 1.472, "step": 710 }, { "epoch": 0.15626373626373627, "grad_norm": 0.24988753659951027, "learning_rate": 6.627987901126361e-07, "loss": 1.4646, "step": 711 }, { "epoch": 0.1564835164835165, "grad_norm": 0.2446989179836303, "learning_rate": 6.626961892503952e-07, "loss": 1.5106, "step": 712 }, { "epoch": 0.1567032967032967, "grad_norm": 0.2916677429885161, "learning_rate": 6.625934560006927e-07, "loss": 1.5192, "step": 713 }, { "epoch": 0.15692307692307692, "grad_norm": 0.32650457597663507, "learning_rate": 6.624905904125053e-07, "loss": 1.3976, "step": 714 }, { "epoch": 0.15714285714285714, "grad_norm": 0.2601942019447628, "learning_rate": 6.623875925348725e-07, "loss": 1.5088, "step": 715 }, { "epoch": 0.15736263736263736, "grad_norm": 0.25807366853009023, "learning_rate": 6.622844624168969e-07, "loss": 1.4225, "step": 716 }, { "epoch": 0.1575824175824176, "grad_norm": 0.30106958162934105, "learning_rate": 6.621812001077443e-07, "loss": 1.467, "step": 717 }, { "epoch": 0.1578021978021978, "grad_norm": 0.26586260319881483, "learning_rate": 6.620778056566433e-07, "loss": 1.5472, "step": 718 }, { "epoch": 0.158021978021978, "grad_norm": 0.2964729009625348, "learning_rate": 6.61974279112886e-07, "loss": 1.5158, "step": 719 }, { "epoch": 0.15824175824175823, "grad_norm": 0.45552473289423245, "learning_rate": 6.618706205258267e-07, "loss": 1.4683, "step": 720 }, { "epoch": 0.15846153846153846, "grad_norm": 0.29935869809134935, "learning_rate": 6.617668299448836e-07, "loss": 1.415, "step": 721 }, { "epoch": 0.15868131868131868, "grad_norm": 0.282983557354629, "learning_rate": 6.616629074195368e-07, "loss": 1.5263, "step": 722 }, { "epoch": 0.1589010989010989, "grad_norm": 0.24751519284333773, "learning_rate": 6.615588529993298e-07, "loss": 1.4847, "step": 723 }, { "epoch": 0.15912087912087913, "grad_norm": 0.2567702507643263, "learning_rate": 6.614546667338693e-07, "loss": 1.4965, "step": 724 }, { "epoch": 0.15934065934065933, "grad_norm": 0.2801887945105601, "learning_rate": 6.613503486728242e-07, "loss": 1.4991, "step": 725 }, { "epoch": 0.15956043956043955, "grad_norm": 0.2607227955687198, "learning_rate": 6.612458988659267e-07, "loss": 1.5175, "step": 726 }, { "epoch": 0.15978021978021978, "grad_norm": 0.25962241272880177, "learning_rate": 6.611413173629719e-07, "loss": 1.4961, "step": 727 }, { "epoch": 0.16, "grad_norm": 0.26264978162906727, "learning_rate": 6.61036604213817e-07, "loss": 1.5129, "step": 728 }, { "epoch": 0.16021978021978023, "grad_norm": 0.2547698297401557, "learning_rate": 6.609317594683826e-07, "loss": 1.5095, "step": 729 }, { "epoch": 0.16043956043956045, "grad_norm": 0.2627553310032375, "learning_rate": 6.608267831766522e-07, "loss": 1.4882, "step": 730 }, { "epoch": 0.16065934065934065, "grad_norm": 0.24763594330752617, "learning_rate": 6.607216753886711e-07, "loss": 1.4772, "step": 731 }, { "epoch": 0.16087912087912087, "grad_norm": 0.2661925378097158, "learning_rate": 6.606164361545481e-07, "loss": 1.4555, "step": 732 }, { "epoch": 0.1610989010989011, "grad_norm": 0.2768756160373414, "learning_rate": 6.605110655244544e-07, "loss": 1.5111, "step": 733 }, { "epoch": 0.16131868131868132, "grad_norm": 0.27331559761307034, "learning_rate": 6.60405563548624e-07, "loss": 1.5633, "step": 734 }, { "epoch": 0.16153846153846155, "grad_norm": 0.27237573228019885, "learning_rate": 6.602999302773531e-07, "loss": 1.4361, "step": 735 }, { "epoch": 0.16175824175824177, "grad_norm": 0.28206585637636306, "learning_rate": 6.601941657610009e-07, "loss": 1.4718, "step": 736 }, { "epoch": 0.16197802197802197, "grad_norm": 0.26533934720128444, "learning_rate": 6.600882700499892e-07, "loss": 1.4645, "step": 737 }, { "epoch": 0.1621978021978022, "grad_norm": 0.26097854908852147, "learning_rate": 6.599822431948019e-07, "loss": 1.5212, "step": 738 }, { "epoch": 0.16241758241758242, "grad_norm": 0.25046327644171784, "learning_rate": 6.598760852459858e-07, "loss": 1.4502, "step": 739 }, { "epoch": 0.16263736263736264, "grad_norm": 0.2576758519604668, "learning_rate": 6.597697962541503e-07, "loss": 1.4647, "step": 740 }, { "epoch": 0.16285714285714287, "grad_norm": 0.2683827822946962, "learning_rate": 6.596633762699666e-07, "loss": 1.4973, "step": 741 }, { "epoch": 0.16307692307692306, "grad_norm": 0.2742508338439112, "learning_rate": 6.595568253441693e-07, "loss": 1.4685, "step": 742 }, { "epoch": 0.1632967032967033, "grad_norm": 0.2531576711174202, "learning_rate": 6.594501435275547e-07, "loss": 1.4534, "step": 743 }, { "epoch": 0.1635164835164835, "grad_norm": 0.3885874890573479, "learning_rate": 6.593433308709817e-07, "loss": 1.5096, "step": 744 }, { "epoch": 0.16373626373626374, "grad_norm": 0.2560369856654694, "learning_rate": 6.592363874253718e-07, "loss": 1.5082, "step": 745 }, { "epoch": 0.16395604395604396, "grad_norm": 0.25046593167757836, "learning_rate": 6.591293132417087e-07, "loss": 1.4847, "step": 746 }, { "epoch": 0.1641758241758242, "grad_norm": 0.251520227691982, "learning_rate": 6.590221083710381e-07, "loss": 1.4886, "step": 747 }, { "epoch": 0.16439560439560438, "grad_norm": 0.2557773067570504, "learning_rate": 6.589147728644686e-07, "loss": 1.4163, "step": 748 }, { "epoch": 0.1646153846153846, "grad_norm": 0.25572096390591553, "learning_rate": 6.588073067731707e-07, "loss": 1.4783, "step": 749 }, { "epoch": 0.16483516483516483, "grad_norm": 0.25899297494207973, "learning_rate": 6.58699710148377e-07, "loss": 1.4259, "step": 750 }, { "epoch": 0.16505494505494506, "grad_norm": 0.2526263606274654, "learning_rate": 6.585919830413829e-07, "loss": 1.494, "step": 751 }, { "epoch": 0.16527472527472528, "grad_norm": 0.2565425985518416, "learning_rate": 6.584841255035456e-07, "loss": 1.4129, "step": 752 }, { "epoch": 0.1654945054945055, "grad_norm": 0.24088172078974396, "learning_rate": 6.583761375862843e-07, "loss": 1.4047, "step": 753 }, { "epoch": 0.1657142857142857, "grad_norm": 0.24566766722942013, "learning_rate": 6.58268019341081e-07, "loss": 1.4837, "step": 754 }, { "epoch": 0.16593406593406593, "grad_norm": 0.2843984476327013, "learning_rate": 6.581597708194791e-07, "loss": 1.4328, "step": 755 }, { "epoch": 0.16615384615384615, "grad_norm": 0.24878333751936998, "learning_rate": 6.580513920730846e-07, "loss": 1.5184, "step": 756 }, { "epoch": 0.16637362637362638, "grad_norm": 0.43823398418556275, "learning_rate": 6.579428831535656e-07, "loss": 1.4559, "step": 757 }, { "epoch": 0.1665934065934066, "grad_norm": 0.2709095406799129, "learning_rate": 6.578342441126517e-07, "loss": 1.4877, "step": 758 }, { "epoch": 0.16681318681318683, "grad_norm": 0.27615866398918093, "learning_rate": 6.577254750021351e-07, "loss": 1.5182, "step": 759 }, { "epoch": 0.16703296703296702, "grad_norm": 0.24791442073327075, "learning_rate": 6.5761657587387e-07, "loss": 1.4094, "step": 760 }, { "epoch": 0.16725274725274725, "grad_norm": 0.2733200136570676, "learning_rate": 6.575075467797722e-07, "loss": 1.4744, "step": 761 }, { "epoch": 0.16747252747252747, "grad_norm": 0.4033652973072014, "learning_rate": 6.573983877718196e-07, "loss": 1.5362, "step": 762 }, { "epoch": 0.1676923076923077, "grad_norm": 0.26085757654455294, "learning_rate": 6.572890989020523e-07, "loss": 1.494, "step": 763 }, { "epoch": 0.16791208791208792, "grad_norm": 0.2522065149512857, "learning_rate": 6.571796802225721e-07, "loss": 1.5137, "step": 764 }, { "epoch": 0.16813186813186815, "grad_norm": 0.2514119880043795, "learning_rate": 6.570701317855428e-07, "loss": 1.4943, "step": 765 }, { "epoch": 0.16835164835164834, "grad_norm": 0.36309687601939433, "learning_rate": 6.569604536431896e-07, "loss": 1.4506, "step": 766 }, { "epoch": 0.16857142857142857, "grad_norm": 0.2357495570393573, "learning_rate": 6.568506458478003e-07, "loss": 1.4307, "step": 767 }, { "epoch": 0.1687912087912088, "grad_norm": 0.2528823695798724, "learning_rate": 6.567407084517239e-07, "loss": 1.4654, "step": 768 }, { "epoch": 0.16901098901098902, "grad_norm": 0.2551977277049756, "learning_rate": 6.566306415073713e-07, "loss": 1.4555, "step": 769 }, { "epoch": 0.16923076923076924, "grad_norm": 0.26677009195582646, "learning_rate": 6.565204450672155e-07, "loss": 1.4539, "step": 770 }, { "epoch": 0.16945054945054944, "grad_norm": 0.24832375944081736, "learning_rate": 6.564101191837908e-07, "loss": 1.4998, "step": 771 }, { "epoch": 0.16967032967032966, "grad_norm": 0.2513859529703173, "learning_rate": 6.562996639096935e-07, "loss": 1.4962, "step": 772 }, { "epoch": 0.16989010989010989, "grad_norm": 0.24532097751680812, "learning_rate": 6.561890792975814e-07, "loss": 1.4674, "step": 773 }, { "epoch": 0.1701098901098901, "grad_norm": 0.26123806133019406, "learning_rate": 6.560783654001741e-07, "loss": 1.5093, "step": 774 }, { "epoch": 0.17032967032967034, "grad_norm": 0.24447838413836245, "learning_rate": 6.559675222702527e-07, "loss": 1.437, "step": 775 }, { "epoch": 0.17054945054945056, "grad_norm": 0.25644416310526846, "learning_rate": 6.558565499606601e-07, "loss": 1.4511, "step": 776 }, { "epoch": 0.17076923076923076, "grad_norm": 0.2548628302557498, "learning_rate": 6.557454485243006e-07, "loss": 1.4297, "step": 777 }, { "epoch": 0.17098901098901098, "grad_norm": 0.26264082635266367, "learning_rate": 6.556342180141401e-07, "loss": 1.4437, "step": 778 }, { "epoch": 0.1712087912087912, "grad_norm": 0.2495974706112277, "learning_rate": 6.555228584832063e-07, "loss": 1.5243, "step": 779 }, { "epoch": 0.17142857142857143, "grad_norm": 0.2742468814162477, "learning_rate": 6.554113699845878e-07, "loss": 1.4201, "step": 780 }, { "epoch": 0.17164835164835165, "grad_norm": 0.25386008822047945, "learning_rate": 6.552997525714353e-07, "loss": 1.5163, "step": 781 }, { "epoch": 0.17186813186813188, "grad_norm": 0.23684642564064617, "learning_rate": 6.551880062969606e-07, "loss": 1.4626, "step": 782 }, { "epoch": 0.17208791208791208, "grad_norm": 0.26592975780850897, "learning_rate": 6.550761312144372e-07, "loss": 1.5183, "step": 783 }, { "epoch": 0.1723076923076923, "grad_norm": 0.25814240172040015, "learning_rate": 6.549641273771998e-07, "loss": 1.5035, "step": 784 }, { "epoch": 0.17252747252747253, "grad_norm": 0.24480519686156274, "learning_rate": 6.548519948386444e-07, "loss": 1.5086, "step": 785 }, { "epoch": 0.17274725274725275, "grad_norm": 0.25949934694857035, "learning_rate": 6.547397336522287e-07, "loss": 1.4474, "step": 786 }, { "epoch": 0.17296703296703297, "grad_norm": 0.2777324804802751, "learning_rate": 6.546273438714714e-07, "loss": 1.51, "step": 787 }, { "epoch": 0.1731868131868132, "grad_norm": 0.259494420828199, "learning_rate": 6.545148255499525e-07, "loss": 1.5005, "step": 788 }, { "epoch": 0.1734065934065934, "grad_norm": 0.256246190619224, "learning_rate": 6.544021787413136e-07, "loss": 1.4771, "step": 789 }, { "epoch": 0.17362637362637362, "grad_norm": 0.2586622385799712, "learning_rate": 6.542894034992573e-07, "loss": 1.551, "step": 790 }, { "epoch": 0.17384615384615384, "grad_norm": 0.2581020576686451, "learning_rate": 6.541764998775477e-07, "loss": 1.4716, "step": 791 }, { "epoch": 0.17406593406593407, "grad_norm": 0.2521819900830967, "learning_rate": 6.540634679300096e-07, "loss": 1.4476, "step": 792 }, { "epoch": 0.1742857142857143, "grad_norm": 0.25954292847728805, "learning_rate": 6.539503077105293e-07, "loss": 1.4297, "step": 793 }, { "epoch": 0.17450549450549452, "grad_norm": 0.5217522634747214, "learning_rate": 6.538370192730544e-07, "loss": 1.4547, "step": 794 }, { "epoch": 0.17472527472527472, "grad_norm": 0.24777676470476245, "learning_rate": 6.537236026715933e-07, "loss": 1.4339, "step": 795 }, { "epoch": 0.17494505494505494, "grad_norm": 0.2522563128550129, "learning_rate": 6.536100579602157e-07, "loss": 1.4643, "step": 796 }, { "epoch": 0.17516483516483516, "grad_norm": 0.24515684833735804, "learning_rate": 6.534963851930525e-07, "loss": 1.4544, "step": 797 }, { "epoch": 0.1753846153846154, "grad_norm": 0.2581061962491689, "learning_rate": 6.533825844242951e-07, "loss": 1.5236, "step": 798 }, { "epoch": 0.1756043956043956, "grad_norm": 0.2527242478069631, "learning_rate": 6.532686557081965e-07, "loss": 1.4979, "step": 799 }, { "epoch": 0.17582417582417584, "grad_norm": 0.2524452532048287, "learning_rate": 6.531545990990706e-07, "loss": 1.4929, "step": 800 }, { "epoch": 0.17604395604395603, "grad_norm": 0.2735738920989129, "learning_rate": 6.53040414651292e-07, "loss": 1.5277, "step": 801 }, { "epoch": 0.17626373626373626, "grad_norm": 0.24114702665372068, "learning_rate": 6.529261024192965e-07, "loss": 1.4867, "step": 802 }, { "epoch": 0.17648351648351648, "grad_norm": 0.25020352180657546, "learning_rate": 6.528116624575806e-07, "loss": 1.4446, "step": 803 }, { "epoch": 0.1767032967032967, "grad_norm": 0.2449838740049229, "learning_rate": 6.52697094820702e-07, "loss": 1.4695, "step": 804 }, { "epoch": 0.17692307692307693, "grad_norm": 0.2654891491654527, "learning_rate": 6.525823995632791e-07, "loss": 1.4923, "step": 805 }, { "epoch": 0.17714285714285713, "grad_norm": 0.2765560877843684, "learning_rate": 6.52467576739991e-07, "loss": 1.5173, "step": 806 }, { "epoch": 0.17736263736263735, "grad_norm": 0.24012827113899465, "learning_rate": 6.523526264055777e-07, "loss": 1.4966, "step": 807 }, { "epoch": 0.17758241758241758, "grad_norm": 0.2526242470250657, "learning_rate": 6.522375486148402e-07, "loss": 1.4905, "step": 808 }, { "epoch": 0.1778021978021978, "grad_norm": 0.2860121568632104, "learning_rate": 6.521223434226399e-07, "loss": 1.4563, "step": 809 }, { "epoch": 0.17802197802197803, "grad_norm": 0.259604810086271, "learning_rate": 6.520070108838993e-07, "loss": 1.4251, "step": 810 }, { "epoch": 0.17824175824175825, "grad_norm": 0.24765284014411357, "learning_rate": 6.518915510536015e-07, "loss": 1.451, "step": 811 }, { "epoch": 0.17846153846153845, "grad_norm": 0.4919211505847288, "learning_rate": 6.5177596398679e-07, "loss": 1.4414, "step": 812 }, { "epoch": 0.17868131868131867, "grad_norm": 0.2477839437882422, "learning_rate": 6.516602497385695e-07, "loss": 1.472, "step": 813 }, { "epoch": 0.1789010989010989, "grad_norm": 0.25991095796059227, "learning_rate": 6.515444083641046e-07, "loss": 1.4993, "step": 814 }, { "epoch": 0.17912087912087912, "grad_norm": 0.26451668595525285, "learning_rate": 6.514284399186213e-07, "loss": 1.4656, "step": 815 }, { "epoch": 0.17934065934065935, "grad_norm": 0.25624340531234613, "learning_rate": 6.513123444574055e-07, "loss": 1.4278, "step": 816 }, { "epoch": 0.17956043956043957, "grad_norm": 0.24061256258328917, "learning_rate": 6.511961220358043e-07, "loss": 1.4867, "step": 817 }, { "epoch": 0.17978021978021977, "grad_norm": 0.2448896146864455, "learning_rate": 6.510797727092248e-07, "loss": 1.4961, "step": 818 }, { "epoch": 0.18, "grad_norm": 0.248983494471306, "learning_rate": 6.509632965331348e-07, "loss": 1.4646, "step": 819 }, { "epoch": 0.18021978021978022, "grad_norm": 0.2524894221390734, "learning_rate": 6.508466935630625e-07, "loss": 1.5152, "step": 820 }, { "epoch": 0.18043956043956044, "grad_norm": 0.27034739087854265, "learning_rate": 6.507299638545966e-07, "loss": 1.5272, "step": 821 }, { "epoch": 0.18065934065934067, "grad_norm": 0.24935185571053273, "learning_rate": 6.506131074633862e-07, "loss": 1.4247, "step": 822 }, { "epoch": 0.1808791208791209, "grad_norm": 0.25373264603118834, "learning_rate": 6.504961244451409e-07, "loss": 1.5418, "step": 823 }, { "epoch": 0.1810989010989011, "grad_norm": 0.23834875950733034, "learning_rate": 6.503790148556308e-07, "loss": 1.4224, "step": 824 }, { "epoch": 0.1813186813186813, "grad_norm": 0.25102572617026075, "learning_rate": 6.502617787506857e-07, "loss": 1.5222, "step": 825 }, { "epoch": 0.18153846153846154, "grad_norm": 0.27305593767207875, "learning_rate": 6.501444161861964e-07, "loss": 1.4816, "step": 826 }, { "epoch": 0.18175824175824176, "grad_norm": 0.24391140874490652, "learning_rate": 6.500269272181136e-07, "loss": 1.4516, "step": 827 }, { "epoch": 0.181978021978022, "grad_norm": 0.25132073954371353, "learning_rate": 6.499093119024486e-07, "loss": 1.4231, "step": 828 }, { "epoch": 0.1821978021978022, "grad_norm": 0.26264618591492184, "learning_rate": 6.497915702952724e-07, "loss": 1.4947, "step": 829 }, { "epoch": 0.1824175824175824, "grad_norm": 0.2532582107048698, "learning_rate": 6.496737024527169e-07, "loss": 1.4384, "step": 830 }, { "epoch": 0.18263736263736263, "grad_norm": 0.2555219574762248, "learning_rate": 6.495557084309736e-07, "loss": 1.4393, "step": 831 }, { "epoch": 0.18285714285714286, "grad_norm": 0.24369500766902816, "learning_rate": 6.494375882862943e-07, "loss": 1.4629, "step": 832 }, { "epoch": 0.18307692307692308, "grad_norm": 0.24796294917899964, "learning_rate": 6.493193420749912e-07, "loss": 1.4816, "step": 833 }, { "epoch": 0.1832967032967033, "grad_norm": 0.24614824664988436, "learning_rate": 6.492009698534362e-07, "loss": 1.5261, "step": 834 }, { "epoch": 0.1835164835164835, "grad_norm": 0.25264208108926695, "learning_rate": 6.490824716780615e-07, "loss": 1.4578, "step": 835 }, { "epoch": 0.18373626373626373, "grad_norm": 0.23119773648091366, "learning_rate": 6.489638476053595e-07, "loss": 1.4633, "step": 836 }, { "epoch": 0.18395604395604395, "grad_norm": 0.6424182090535926, "learning_rate": 6.488450976918823e-07, "loss": 1.4467, "step": 837 }, { "epoch": 0.18417582417582418, "grad_norm": 0.24645636620710157, "learning_rate": 6.487262219942419e-07, "loss": 1.5229, "step": 838 }, { "epoch": 0.1843956043956044, "grad_norm": 0.2502673940161327, "learning_rate": 6.48607220569111e-07, "loss": 1.4655, "step": 839 }, { "epoch": 0.18461538461538463, "grad_norm": 0.26559029081889357, "learning_rate": 6.484880934732213e-07, "loss": 1.4673, "step": 840 }, { "epoch": 0.18483516483516482, "grad_norm": 0.24400080120013196, "learning_rate": 6.483688407633651e-07, "loss": 1.4661, "step": 841 }, { "epoch": 0.18505494505494505, "grad_norm": 0.252351566985222, "learning_rate": 6.482494624963943e-07, "loss": 1.4289, "step": 842 }, { "epoch": 0.18527472527472527, "grad_norm": 0.24945902207623144, "learning_rate": 6.481299587292204e-07, "loss": 1.5083, "step": 843 }, { "epoch": 0.1854945054945055, "grad_norm": 0.2547686116647054, "learning_rate": 6.480103295188153e-07, "loss": 1.4025, "step": 844 }, { "epoch": 0.18571428571428572, "grad_norm": 0.24437959120424912, "learning_rate": 6.478905749222103e-07, "loss": 1.5313, "step": 845 }, { "epoch": 0.18593406593406595, "grad_norm": 0.2668805234876379, "learning_rate": 6.477706949964966e-07, "loss": 1.4854, "step": 846 }, { "epoch": 0.18615384615384614, "grad_norm": 0.24669978610275448, "learning_rate": 6.476506897988254e-07, "loss": 1.4374, "step": 847 }, { "epoch": 0.18637362637362637, "grad_norm": 0.2691489026603144, "learning_rate": 6.475305593864069e-07, "loss": 1.4749, "step": 848 }, { "epoch": 0.1865934065934066, "grad_norm": 0.24556468560377964, "learning_rate": 6.474103038165117e-07, "loss": 1.4873, "step": 849 }, { "epoch": 0.18681318681318682, "grad_norm": 0.27248587243588246, "learning_rate": 6.472899231464699e-07, "loss": 1.5296, "step": 850 }, { "epoch": 0.18703296703296704, "grad_norm": 0.24175210505081607, "learning_rate": 6.471694174336711e-07, "loss": 1.4805, "step": 851 }, { "epoch": 0.18725274725274726, "grad_norm": 0.2533379443925673, "learning_rate": 6.470487867355646e-07, "loss": 1.4816, "step": 852 }, { "epoch": 0.18747252747252746, "grad_norm": 0.2638665301079791, "learning_rate": 6.469280311096593e-07, "loss": 1.4679, "step": 853 }, { "epoch": 0.18769230769230769, "grad_norm": 0.2474694831768457, "learning_rate": 6.468071506135234e-07, "loss": 1.4558, "step": 854 }, { "epoch": 0.1879120879120879, "grad_norm": 0.26113569153035554, "learning_rate": 6.46686145304785e-07, "loss": 1.5425, "step": 855 }, { "epoch": 0.18813186813186814, "grad_norm": 0.2528536810109955, "learning_rate": 6.465650152411316e-07, "loss": 1.5692, "step": 856 }, { "epoch": 0.18835164835164836, "grad_norm": 0.24133013640021078, "learning_rate": 6.4644376048031e-07, "loss": 1.4281, "step": 857 }, { "epoch": 0.18857142857142858, "grad_norm": 0.6294298242866595, "learning_rate": 6.463223810801268e-07, "loss": 1.4473, "step": 858 }, { "epoch": 0.18879120879120878, "grad_norm": 0.25023604464808363, "learning_rate": 6.462008770984475e-07, "loss": 1.4098, "step": 859 }, { "epoch": 0.189010989010989, "grad_norm": 0.24797013473975096, "learning_rate": 6.460792485931976e-07, "loss": 1.4359, "step": 860 }, { "epoch": 0.18923076923076923, "grad_norm": 0.25206812258465444, "learning_rate": 6.459574956223613e-07, "loss": 1.4624, "step": 861 }, { "epoch": 0.18945054945054945, "grad_norm": 0.2489549376961613, "learning_rate": 6.458356182439828e-07, "loss": 1.4618, "step": 862 }, { "epoch": 0.18967032967032968, "grad_norm": 0.250557255187581, "learning_rate": 6.457136165161652e-07, "loss": 1.5019, "step": 863 }, { "epoch": 0.1898901098901099, "grad_norm": 0.2510639666261246, "learning_rate": 6.455914904970709e-07, "loss": 1.4827, "step": 864 }, { "epoch": 0.1901098901098901, "grad_norm": 0.41430231161919934, "learning_rate": 6.454692402449216e-07, "loss": 1.4659, "step": 865 }, { "epoch": 0.19032967032967033, "grad_norm": 0.2465675465651625, "learning_rate": 6.453468658179984e-07, "loss": 1.4819, "step": 866 }, { "epoch": 0.19054945054945055, "grad_norm": 0.24963496466847715, "learning_rate": 6.452243672746415e-07, "loss": 1.5047, "step": 867 }, { "epoch": 0.19076923076923077, "grad_norm": 0.2716313031010093, "learning_rate": 6.4510174467325e-07, "loss": 1.4016, "step": 868 }, { "epoch": 0.190989010989011, "grad_norm": 0.24718273880514013, "learning_rate": 6.449789980722827e-07, "loss": 1.3956, "step": 869 }, { "epoch": 0.1912087912087912, "grad_norm": 0.2482414244879754, "learning_rate": 6.448561275302569e-07, "loss": 1.5006, "step": 870 }, { "epoch": 0.19142857142857142, "grad_norm": 0.29032338053231516, "learning_rate": 6.447331331057494e-07, "loss": 1.4202, "step": 871 }, { "epoch": 0.19164835164835164, "grad_norm": 0.263484713757411, "learning_rate": 6.446100148573957e-07, "loss": 1.4702, "step": 872 }, { "epoch": 0.19186813186813187, "grad_norm": 0.318710014023105, "learning_rate": 6.44486772843891e-07, "loss": 1.4786, "step": 873 }, { "epoch": 0.1920879120879121, "grad_norm": 0.26178198978589434, "learning_rate": 6.443634071239887e-07, "loss": 1.4381, "step": 874 }, { "epoch": 0.19230769230769232, "grad_norm": 0.2616784646721311, "learning_rate": 6.442399177565017e-07, "loss": 1.4749, "step": 875 }, { "epoch": 0.19252747252747252, "grad_norm": 0.24694702837666085, "learning_rate": 6.441163048003017e-07, "loss": 1.4417, "step": 876 }, { "epoch": 0.19274725274725274, "grad_norm": 0.24286260970664117, "learning_rate": 6.439925683143192e-07, "loss": 1.5366, "step": 877 }, { "epoch": 0.19296703296703296, "grad_norm": 0.26143931461528386, "learning_rate": 6.43868708357544e-07, "loss": 1.4638, "step": 878 }, { "epoch": 0.1931868131868132, "grad_norm": 0.2549936055391091, "learning_rate": 6.43744724989024e-07, "loss": 1.488, "step": 879 }, { "epoch": 0.1934065934065934, "grad_norm": 0.2457925274541861, "learning_rate": 6.436206182678667e-07, "loss": 1.4757, "step": 880 }, { "epoch": 0.19362637362637364, "grad_norm": 0.25332057771158173, "learning_rate": 6.434963882532381e-07, "loss": 1.541, "step": 881 }, { "epoch": 0.19384615384615383, "grad_norm": 0.2527740783450117, "learning_rate": 6.433720350043628e-07, "loss": 1.5169, "step": 882 }, { "epoch": 0.19406593406593406, "grad_norm": 0.25737766445240007, "learning_rate": 6.432475585805246e-07, "loss": 1.5362, "step": 883 }, { "epoch": 0.19428571428571428, "grad_norm": 0.24722408218766065, "learning_rate": 6.431229590410655e-07, "loss": 1.4255, "step": 884 }, { "epoch": 0.1945054945054945, "grad_norm": 0.24874655782608956, "learning_rate": 6.429982364453866e-07, "loss": 1.445, "step": 885 }, { "epoch": 0.19472527472527473, "grad_norm": 0.9157681858416035, "learning_rate": 6.428733908529474e-07, "loss": 1.5339, "step": 886 }, { "epoch": 0.19494505494505496, "grad_norm": 0.24661783363809, "learning_rate": 6.427484223232663e-07, "loss": 1.4337, "step": 887 }, { "epoch": 0.19516483516483515, "grad_norm": 0.39788485611443236, "learning_rate": 6.4262333091592e-07, "loss": 1.4669, "step": 888 }, { "epoch": 0.19538461538461538, "grad_norm": 0.2547946401026291, "learning_rate": 6.424981166905441e-07, "loss": 1.4872, "step": 889 }, { "epoch": 0.1956043956043956, "grad_norm": 0.23213374586324453, "learning_rate": 6.423727797068325e-07, "loss": 1.4222, "step": 890 }, { "epoch": 0.19582417582417583, "grad_norm": 0.24623234969287922, "learning_rate": 6.422473200245377e-07, "loss": 1.5105, "step": 891 }, { "epoch": 0.19604395604395605, "grad_norm": 0.2353778095539658, "learning_rate": 6.421217377034706e-07, "loss": 1.452, "step": 892 }, { "epoch": 0.19626373626373628, "grad_norm": 0.2560236768429536, "learning_rate": 6.419960328035009e-07, "loss": 1.4666, "step": 893 }, { "epoch": 0.19648351648351647, "grad_norm": 0.5951869201313986, "learning_rate": 6.418702053845564e-07, "loss": 1.423, "step": 894 }, { "epoch": 0.1967032967032967, "grad_norm": 0.25662714208804754, "learning_rate": 6.417442555066234e-07, "loss": 1.4927, "step": 895 }, { "epoch": 0.19692307692307692, "grad_norm": 0.2530816905689608, "learning_rate": 6.416181832297467e-07, "loss": 1.4384, "step": 896 }, { "epoch": 0.19714285714285715, "grad_norm": 0.23498808230493776, "learning_rate": 6.414919886140292e-07, "loss": 1.4791, "step": 897 }, { "epoch": 0.19736263736263737, "grad_norm": 0.26344742398120496, "learning_rate": 6.413656717196324e-07, "loss": 1.4971, "step": 898 }, { "epoch": 0.19758241758241757, "grad_norm": 0.25143082783374615, "learning_rate": 6.412392326067759e-07, "loss": 1.4597, "step": 899 }, { "epoch": 0.1978021978021978, "grad_norm": 0.23990604547072275, "learning_rate": 6.411126713357377e-07, "loss": 1.4232, "step": 900 }, { "epoch": 0.19802197802197802, "grad_norm": 0.2692939081735677, "learning_rate": 6.409859879668539e-07, "loss": 1.4706, "step": 901 }, { "epoch": 0.19824175824175824, "grad_norm": 0.269185023377902, "learning_rate": 6.40859182560519e-07, "loss": 1.477, "step": 902 }, { "epoch": 0.19846153846153847, "grad_norm": 0.25792597757080377, "learning_rate": 6.407322551771853e-07, "loss": 1.414, "step": 903 }, { "epoch": 0.1986813186813187, "grad_norm": 0.2717337009081469, "learning_rate": 6.406052058773639e-07, "loss": 1.411, "step": 904 }, { "epoch": 0.1989010989010989, "grad_norm": 0.2479518988127083, "learning_rate": 6.404780347216234e-07, "loss": 1.5042, "step": 905 }, { "epoch": 0.1991208791208791, "grad_norm": 0.2459542294208431, "learning_rate": 6.403507417705905e-07, "loss": 1.445, "step": 906 }, { "epoch": 0.19934065934065934, "grad_norm": 0.26575861998920286, "learning_rate": 6.402233270849507e-07, "loss": 1.4916, "step": 907 }, { "epoch": 0.19956043956043956, "grad_norm": 0.27081833099425034, "learning_rate": 6.400957907254468e-07, "loss": 1.4499, "step": 908 }, { "epoch": 0.1997802197802198, "grad_norm": 0.252453291758668, "learning_rate": 6.399681327528796e-07, "loss": 1.4982, "step": 909 }, { "epoch": 0.2, "grad_norm": 0.24537417092917735, "learning_rate": 6.398403532281084e-07, "loss": 1.4778, "step": 910 }, { "epoch": 0.2002197802197802, "grad_norm": 0.2506023943950271, "learning_rate": 6.397124522120501e-07, "loss": 1.5031, "step": 911 }, { "epoch": 0.20043956043956043, "grad_norm": 0.25036021304321665, "learning_rate": 6.395844297656795e-07, "loss": 1.4888, "step": 912 }, { "epoch": 0.20065934065934066, "grad_norm": 0.24254651002406466, "learning_rate": 6.394562859500295e-07, "loss": 1.4459, "step": 913 }, { "epoch": 0.20087912087912088, "grad_norm": 0.25867272996848184, "learning_rate": 6.393280208261904e-07, "loss": 1.545, "step": 914 }, { "epoch": 0.2010989010989011, "grad_norm": 0.24893331006562164, "learning_rate": 6.391996344553111e-07, "loss": 1.5024, "step": 915 }, { "epoch": 0.20131868131868133, "grad_norm": 0.24696705887897696, "learning_rate": 6.390711268985977e-07, "loss": 1.4732, "step": 916 }, { "epoch": 0.20153846153846153, "grad_norm": 0.240259720360503, "learning_rate": 6.389424982173141e-07, "loss": 1.4846, "step": 917 }, { "epoch": 0.20175824175824175, "grad_norm": 0.24233606092879775, "learning_rate": 6.388137484727822e-07, "loss": 1.4527, "step": 918 }, { "epoch": 0.20197802197802198, "grad_norm": 0.24239663106316878, "learning_rate": 6.386848777263814e-07, "loss": 1.4479, "step": 919 }, { "epoch": 0.2021978021978022, "grad_norm": 0.23606123410211843, "learning_rate": 6.38555886039549e-07, "loss": 1.4073, "step": 920 }, { "epoch": 0.20241758241758243, "grad_norm": 0.2501744387518463, "learning_rate": 6.384267734737798e-07, "loss": 1.4662, "step": 921 }, { "epoch": 0.20263736263736265, "grad_norm": 0.25849377503081267, "learning_rate": 6.382975400906261e-07, "loss": 1.4666, "step": 922 }, { "epoch": 0.20285714285714285, "grad_norm": 0.2502574149323045, "learning_rate": 6.381681859516985e-07, "loss": 1.4485, "step": 923 }, { "epoch": 0.20307692307692307, "grad_norm": 0.2508995533000133, "learning_rate": 6.380387111186639e-07, "loss": 1.4335, "step": 924 }, { "epoch": 0.2032967032967033, "grad_norm": 0.2559998901231916, "learning_rate": 6.37909115653248e-07, "loss": 1.5031, "step": 925 }, { "epoch": 0.20351648351648352, "grad_norm": 0.26059348234814095, "learning_rate": 6.377793996172331e-07, "loss": 1.4643, "step": 926 }, { "epoch": 0.20373626373626375, "grad_norm": 0.2595881455088972, "learning_rate": 6.376495630724598e-07, "loss": 1.5158, "step": 927 }, { "epoch": 0.20395604395604397, "grad_norm": 0.2439256330478379, "learning_rate": 6.375196060808253e-07, "loss": 1.4548, "step": 928 }, { "epoch": 0.20417582417582417, "grad_norm": 0.24263873371003117, "learning_rate": 6.373895287042848e-07, "loss": 1.4889, "step": 929 }, { "epoch": 0.2043956043956044, "grad_norm": 0.23460790293609043, "learning_rate": 6.372593310048507e-07, "loss": 1.4443, "step": 930 }, { "epoch": 0.20461538461538462, "grad_norm": 0.2451319771571135, "learning_rate": 6.371290130445929e-07, "loss": 1.5045, "step": 931 }, { "epoch": 0.20483516483516484, "grad_norm": 0.2529639158714934, "learning_rate": 6.369985748856385e-07, "loss": 1.5279, "step": 932 }, { "epoch": 0.20505494505494506, "grad_norm": 0.32201940620813707, "learning_rate": 6.368680165901716e-07, "loss": 1.5645, "step": 933 }, { "epoch": 0.20527472527472526, "grad_norm": 0.2875582546042058, "learning_rate": 6.367373382204343e-07, "loss": 1.4613, "step": 934 }, { "epoch": 0.20549450549450549, "grad_norm": 0.25410679968514366, "learning_rate": 6.366065398387253e-07, "loss": 1.488, "step": 935 }, { "epoch": 0.2057142857142857, "grad_norm": 0.26336986404865037, "learning_rate": 6.364756215074009e-07, "loss": 1.4869, "step": 936 }, { "epoch": 0.20593406593406594, "grad_norm": 0.2508092121036278, "learning_rate": 6.363445832888743e-07, "loss": 1.4721, "step": 937 }, { "epoch": 0.20615384615384616, "grad_norm": 0.25674556894140477, "learning_rate": 6.362134252456162e-07, "loss": 1.5337, "step": 938 }, { "epoch": 0.20637362637362638, "grad_norm": 0.24537133466589953, "learning_rate": 6.36082147440154e-07, "loss": 1.4449, "step": 939 }, { "epoch": 0.20659340659340658, "grad_norm": 0.24937842575749275, "learning_rate": 6.359507499350724e-07, "loss": 1.5097, "step": 940 }, { "epoch": 0.2068131868131868, "grad_norm": 0.2665009293024344, "learning_rate": 6.358192327930133e-07, "loss": 1.4204, "step": 941 }, { "epoch": 0.20703296703296703, "grad_norm": 0.24297154419198846, "learning_rate": 6.356875960766754e-07, "loss": 1.4208, "step": 942 }, { "epoch": 0.20725274725274725, "grad_norm": 0.2438850102539349, "learning_rate": 6.355558398488147e-07, "loss": 1.4701, "step": 943 }, { "epoch": 0.20747252747252748, "grad_norm": 0.2514148533451485, "learning_rate": 6.354239641722438e-07, "loss": 1.5029, "step": 944 }, { "epoch": 0.2076923076923077, "grad_norm": 0.2628877418338969, "learning_rate": 6.352919691098326e-07, "loss": 1.4808, "step": 945 }, { "epoch": 0.2079120879120879, "grad_norm": 0.25494646791834513, "learning_rate": 6.351598547245075e-07, "loss": 1.4681, "step": 946 }, { "epoch": 0.20813186813186813, "grad_norm": 0.25407267277377743, "learning_rate": 6.350276210792524e-07, "loss": 1.4641, "step": 947 }, { "epoch": 0.20835164835164835, "grad_norm": 0.26857491418989893, "learning_rate": 6.348952682371073e-07, "loss": 1.4759, "step": 948 }, { "epoch": 0.20857142857142857, "grad_norm": 0.23972152226069182, "learning_rate": 6.347627962611697e-07, "loss": 1.454, "step": 949 }, { "epoch": 0.2087912087912088, "grad_norm": 0.2565036917274771, "learning_rate": 6.346302052145935e-07, "loss": 1.4978, "step": 950 }, { "epoch": 0.20901098901098902, "grad_norm": 0.258139709502027, "learning_rate": 6.344974951605894e-07, "loss": 1.4461, "step": 951 }, { "epoch": 0.20923076923076922, "grad_norm": 0.25213149359953296, "learning_rate": 6.343646661624251e-07, "loss": 1.433, "step": 952 }, { "epoch": 0.20945054945054944, "grad_norm": 0.2517626189796978, "learning_rate": 6.342317182834248e-07, "loss": 1.4274, "step": 953 }, { "epoch": 0.20967032967032967, "grad_norm": 0.2474446088373573, "learning_rate": 6.340986515869691e-07, "loss": 1.4738, "step": 954 }, { "epoch": 0.2098901098901099, "grad_norm": 0.2583933717951051, "learning_rate": 6.339654661364959e-07, "loss": 1.4963, "step": 955 }, { "epoch": 0.21010989010989012, "grad_norm": 0.2629986391355057, "learning_rate": 6.338321619954993e-07, "loss": 1.4565, "step": 956 }, { "epoch": 0.21032967032967034, "grad_norm": 0.2457716206199386, "learning_rate": 6.336987392275298e-07, "loss": 1.5438, "step": 957 }, { "epoch": 0.21054945054945054, "grad_norm": 0.2772638128277807, "learning_rate": 6.335651978961948e-07, "loss": 1.3944, "step": 958 }, { "epoch": 0.21076923076923076, "grad_norm": 0.25403073870139725, "learning_rate": 6.334315380651581e-07, "loss": 1.5137, "step": 959 }, { "epoch": 0.210989010989011, "grad_norm": 0.24811373405946546, "learning_rate": 6.332977597981402e-07, "loss": 1.463, "step": 960 }, { "epoch": 0.2112087912087912, "grad_norm": 0.2615651322350011, "learning_rate": 6.331638631589177e-07, "loss": 1.4613, "step": 961 }, { "epoch": 0.21142857142857144, "grad_norm": 0.263440486156173, "learning_rate": 6.330298482113238e-07, "loss": 1.4732, "step": 962 }, { "epoch": 0.21164835164835163, "grad_norm": 0.25509452968099045, "learning_rate": 6.328957150192481e-07, "loss": 1.4419, "step": 963 }, { "epoch": 0.21186813186813186, "grad_norm": 0.2558217352425839, "learning_rate": 6.327614636466365e-07, "loss": 1.5063, "step": 964 }, { "epoch": 0.21208791208791208, "grad_norm": 0.24999222039824714, "learning_rate": 6.326270941574915e-07, "loss": 1.4379, "step": 965 }, { "epoch": 0.2123076923076923, "grad_norm": 0.25986839759589353, "learning_rate": 6.324926066158715e-07, "loss": 1.495, "step": 966 }, { "epoch": 0.21252747252747253, "grad_norm": 0.2501101206247595, "learning_rate": 6.323580010858915e-07, "loss": 1.4537, "step": 967 }, { "epoch": 0.21274725274725276, "grad_norm": 0.26059640962348585, "learning_rate": 6.322232776317227e-07, "loss": 1.4483, "step": 968 }, { "epoch": 0.21296703296703295, "grad_norm": 0.3253965606400039, "learning_rate": 6.320884363175926e-07, "loss": 1.5221, "step": 969 }, { "epoch": 0.21318681318681318, "grad_norm": 0.2738405972770376, "learning_rate": 6.319534772077844e-07, "loss": 1.4758, "step": 970 }, { "epoch": 0.2134065934065934, "grad_norm": 0.23998450538869, "learning_rate": 6.318184003666382e-07, "loss": 1.5019, "step": 971 }, { "epoch": 0.21362637362637363, "grad_norm": 0.2750740502569545, "learning_rate": 6.316832058585496e-07, "loss": 1.4747, "step": 972 }, { "epoch": 0.21384615384615385, "grad_norm": 0.2495073621097044, "learning_rate": 6.315478937479705e-07, "loss": 1.4663, "step": 973 }, { "epoch": 0.21406593406593408, "grad_norm": 0.25360133485533454, "learning_rate": 6.314124640994091e-07, "loss": 1.4852, "step": 974 }, { "epoch": 0.21428571428571427, "grad_norm": 0.25701838922538556, "learning_rate": 6.312769169774293e-07, "loss": 1.4635, "step": 975 }, { "epoch": 0.2145054945054945, "grad_norm": 0.245066880216118, "learning_rate": 6.311412524466512e-07, "loss": 1.4443, "step": 976 }, { "epoch": 0.21472527472527472, "grad_norm": 0.27326964827957495, "learning_rate": 6.310054705717509e-07, "loss": 1.4559, "step": 977 }, { "epoch": 0.21494505494505495, "grad_norm": 0.4643132469222394, "learning_rate": 6.308695714174602e-07, "loss": 1.4578, "step": 978 }, { "epoch": 0.21516483516483517, "grad_norm": 0.2553315407913207, "learning_rate": 6.30733555048567e-07, "loss": 1.4741, "step": 979 }, { "epoch": 0.2153846153846154, "grad_norm": 0.2866978785567234, "learning_rate": 6.30597421529915e-07, "loss": 1.4369, "step": 980 }, { "epoch": 0.2156043956043956, "grad_norm": 0.254151772984924, "learning_rate": 6.30461170926404e-07, "loss": 1.4763, "step": 981 }, { "epoch": 0.21582417582417582, "grad_norm": 0.2505294152945484, "learning_rate": 6.303248033029892e-07, "loss": 1.5003, "step": 982 }, { "epoch": 0.21604395604395604, "grad_norm": 0.24788177291163854, "learning_rate": 6.301883187246819e-07, "loss": 1.4565, "step": 983 }, { "epoch": 0.21626373626373627, "grad_norm": 0.25852553005445156, "learning_rate": 6.300517172565491e-07, "loss": 1.4868, "step": 984 }, { "epoch": 0.2164835164835165, "grad_norm": 0.27705295208335157, "learning_rate": 6.299149989637134e-07, "loss": 1.5281, "step": 985 }, { "epoch": 0.21670329670329672, "grad_norm": 0.2486468181676995, "learning_rate": 6.297781639113532e-07, "loss": 1.5065, "step": 986 }, { "epoch": 0.2169230769230769, "grad_norm": 0.25924678229751535, "learning_rate": 6.296412121647026e-07, "loss": 1.4719, "step": 987 }, { "epoch": 0.21714285714285714, "grad_norm": 0.2508896633465167, "learning_rate": 6.295041437890513e-07, "loss": 1.4852, "step": 988 }, { "epoch": 0.21736263736263736, "grad_norm": 0.2564252926625278, "learning_rate": 6.293669588497444e-07, "loss": 1.5041, "step": 989 }, { "epoch": 0.2175824175824176, "grad_norm": 0.24729873883885692, "learning_rate": 6.292296574121828e-07, "loss": 1.462, "step": 990 }, { "epoch": 0.2178021978021978, "grad_norm": 0.24882002931157068, "learning_rate": 6.29092239541823e-07, "loss": 1.4737, "step": 991 }, { "epoch": 0.21802197802197804, "grad_norm": 0.2477338170470924, "learning_rate": 6.289547053041769e-07, "loss": 1.4523, "step": 992 }, { "epoch": 0.21824175824175823, "grad_norm": 0.2524284804610147, "learning_rate": 6.288170547648117e-07, "loss": 1.4448, "step": 993 }, { "epoch": 0.21846153846153846, "grad_norm": 0.2618226168369328, "learning_rate": 6.286792879893504e-07, "loss": 1.4149, "step": 994 }, { "epoch": 0.21868131868131868, "grad_norm": 0.26073755610441146, "learning_rate": 6.285414050434712e-07, "loss": 1.4662, "step": 995 }, { "epoch": 0.2189010989010989, "grad_norm": 0.2509363125644294, "learning_rate": 6.284034059929075e-07, "loss": 1.4694, "step": 996 }, { "epoch": 0.21912087912087913, "grad_norm": 0.2604106550335233, "learning_rate": 6.282652909034486e-07, "loss": 1.465, "step": 997 }, { "epoch": 0.21934065934065933, "grad_norm": 0.2613176452159072, "learning_rate": 6.281270598409384e-07, "loss": 1.5201, "step": 998 }, { "epoch": 0.21956043956043955, "grad_norm": 0.25412235551601126, "learning_rate": 6.279887128712769e-07, "loss": 1.5038, "step": 999 }, { "epoch": 0.21978021978021978, "grad_norm": 0.24241093033928177, "learning_rate": 6.278502500604185e-07, "loss": 1.5119, "step": 1000 }, { "epoch": 0.22, "grad_norm": 0.2564936510368631, "learning_rate": 6.277116714743736e-07, "loss": 1.4621, "step": 1001 }, { "epoch": 0.22021978021978023, "grad_norm": 0.25132189781818387, "learning_rate": 6.275729771792071e-07, "loss": 1.4864, "step": 1002 }, { "epoch": 0.22043956043956045, "grad_norm": 0.262435909568915, "learning_rate": 6.274341672410399e-07, "loss": 1.4876, "step": 1003 }, { "epoch": 0.22065934065934065, "grad_norm": 0.2442607144061378, "learning_rate": 6.27295241726047e-07, "loss": 1.4955, "step": 1004 }, { "epoch": 0.22087912087912087, "grad_norm": 0.24918844109204902, "learning_rate": 6.271562007004594e-07, "loss": 1.4581, "step": 1005 }, { "epoch": 0.2210989010989011, "grad_norm": 0.3007102940008447, "learning_rate": 6.270170442305628e-07, "loss": 1.4649, "step": 1006 }, { "epoch": 0.22131868131868132, "grad_norm": 0.24458518612482621, "learning_rate": 6.268777723826977e-07, "loss": 1.5223, "step": 1007 }, { "epoch": 0.22153846153846155, "grad_norm": 0.24485989164173935, "learning_rate": 6.267383852232599e-07, "loss": 1.4227, "step": 1008 }, { "epoch": 0.22175824175824177, "grad_norm": 0.26994854817969943, "learning_rate": 6.265988828187004e-07, "loss": 1.4892, "step": 1009 }, { "epoch": 0.22197802197802197, "grad_norm": 0.25257673752539994, "learning_rate": 6.264592652355246e-07, "loss": 1.4901, "step": 1010 }, { "epoch": 0.2221978021978022, "grad_norm": 0.44345204761261503, "learning_rate": 6.26319532540293e-07, "loss": 1.4548, "step": 1011 }, { "epoch": 0.22241758241758242, "grad_norm": 0.26091916059194475, "learning_rate": 6.261796847996214e-07, "loss": 1.4834, "step": 1012 }, { "epoch": 0.22263736263736264, "grad_norm": 0.25779038287917105, "learning_rate": 6.260397220801798e-07, "loss": 1.4937, "step": 1013 }, { "epoch": 0.22285714285714286, "grad_norm": 0.24084632378403625, "learning_rate": 6.258996444486935e-07, "loss": 1.438, "step": 1014 }, { "epoch": 0.2230769230769231, "grad_norm": 0.24738965156649764, "learning_rate": 6.257594519719421e-07, "loss": 1.4311, "step": 1015 }, { "epoch": 0.22329670329670329, "grad_norm": 0.2677682094041224, "learning_rate": 6.256191447167607e-07, "loss": 1.4418, "step": 1016 }, { "epoch": 0.2235164835164835, "grad_norm": 0.24732574403308874, "learning_rate": 6.254787227500382e-07, "loss": 1.455, "step": 1017 }, { "epoch": 0.22373626373626374, "grad_norm": 0.2738078752234439, "learning_rate": 6.25338186138719e-07, "loss": 1.4768, "step": 1018 }, { "epoch": 0.22395604395604396, "grad_norm": 0.2530054621084579, "learning_rate": 6.251975349498016e-07, "loss": 1.4965, "step": 1019 }, { "epoch": 0.22417582417582418, "grad_norm": 0.25608936529023674, "learning_rate": 6.250567692503394e-07, "loss": 1.479, "step": 1020 }, { "epoch": 0.2243956043956044, "grad_norm": 0.2530095649283402, "learning_rate": 6.249158891074402e-07, "loss": 1.5014, "step": 1021 }, { "epoch": 0.2246153846153846, "grad_norm": 0.28426157573050004, "learning_rate": 6.247748945882668e-07, "loss": 1.4383, "step": 1022 }, { "epoch": 0.22483516483516483, "grad_norm": 0.2571595455354892, "learning_rate": 6.246337857600359e-07, "loss": 1.4475, "step": 1023 }, { "epoch": 0.22505494505494505, "grad_norm": 0.2431164945851439, "learning_rate": 6.24492562690019e-07, "loss": 1.461, "step": 1024 }, { "epoch": 0.22527472527472528, "grad_norm": 0.2965910957444324, "learning_rate": 6.243512254455423e-07, "loss": 1.4434, "step": 1025 }, { "epoch": 0.2254945054945055, "grad_norm": 0.24318863816189085, "learning_rate": 6.24209774093986e-07, "loss": 1.4125, "step": 1026 }, { "epoch": 0.2257142857142857, "grad_norm": 0.2699868320603675, "learning_rate": 6.24068208702785e-07, "loss": 1.4273, "step": 1027 }, { "epoch": 0.22593406593406593, "grad_norm": 0.2573116416423263, "learning_rate": 6.239265293394283e-07, "loss": 1.4804, "step": 1028 }, { "epoch": 0.22615384615384615, "grad_norm": 0.2537042712591057, "learning_rate": 6.237847360714597e-07, "loss": 1.4497, "step": 1029 }, { "epoch": 0.22637362637362637, "grad_norm": 0.56996514124249, "learning_rate": 6.236428289664767e-07, "loss": 1.453, "step": 1030 }, { "epoch": 0.2265934065934066, "grad_norm": 0.2556228682124189, "learning_rate": 6.235008080921314e-07, "loss": 1.4659, "step": 1031 }, { "epoch": 0.22681318681318682, "grad_norm": 0.2718315386107101, "learning_rate": 6.233586735161303e-07, "loss": 1.4188, "step": 1032 }, { "epoch": 0.22703296703296702, "grad_norm": 0.2462699534901121, "learning_rate": 6.232164253062336e-07, "loss": 1.4673, "step": 1033 }, { "epoch": 0.22725274725274724, "grad_norm": 0.25640174813341954, "learning_rate": 6.230740635302563e-07, "loss": 1.4689, "step": 1034 }, { "epoch": 0.22747252747252747, "grad_norm": 0.2754401724093636, "learning_rate": 6.229315882560671e-07, "loss": 1.4283, "step": 1035 }, { "epoch": 0.2276923076923077, "grad_norm": 0.2898185200896864, "learning_rate": 6.22788999551589e-07, "loss": 1.5236, "step": 1036 }, { "epoch": 0.22791208791208792, "grad_norm": 0.2501607998148379, "learning_rate": 6.226462974847989e-07, "loss": 1.4728, "step": 1037 }, { "epoch": 0.22813186813186814, "grad_norm": 0.26636250011797713, "learning_rate": 6.22503482123728e-07, "loss": 1.5376, "step": 1038 }, { "epoch": 0.22835164835164834, "grad_norm": 0.2502974862297266, "learning_rate": 6.223605535364611e-07, "loss": 1.4333, "step": 1039 }, { "epoch": 0.22857142857142856, "grad_norm": 0.2564062296316927, "learning_rate": 6.222175117911375e-07, "loss": 1.4807, "step": 1040 }, { "epoch": 0.2287912087912088, "grad_norm": 0.25275825535718294, "learning_rate": 6.220743569559499e-07, "loss": 1.4511, "step": 1041 }, { "epoch": 0.229010989010989, "grad_norm": 0.3662792300369227, "learning_rate": 6.219310890991456e-07, "loss": 1.4546, "step": 1042 }, { "epoch": 0.22923076923076924, "grad_norm": 0.24743135427408056, "learning_rate": 6.217877082890251e-07, "loss": 1.4985, "step": 1043 }, { "epoch": 0.22945054945054946, "grad_norm": 0.2444138562542184, "learning_rate": 6.216442145939431e-07, "loss": 1.4754, "step": 1044 }, { "epoch": 0.22967032967032966, "grad_norm": 0.24883832216323296, "learning_rate": 6.215006080823083e-07, "loss": 1.4691, "step": 1045 }, { "epoch": 0.22989010989010988, "grad_norm": 0.2585144888066422, "learning_rate": 6.213568888225825e-07, "loss": 1.5124, "step": 1046 }, { "epoch": 0.2301098901098901, "grad_norm": 0.2519698762792617, "learning_rate": 6.212130568832819e-07, "loss": 1.4571, "step": 1047 }, { "epoch": 0.23032967032967033, "grad_norm": 0.24806043565398622, "learning_rate": 6.210691123329763e-07, "loss": 1.4892, "step": 1048 }, { "epoch": 0.23054945054945056, "grad_norm": 0.2414973297774547, "learning_rate": 6.209250552402888e-07, "loss": 1.4565, "step": 1049 }, { "epoch": 0.23076923076923078, "grad_norm": 0.27596519798931884, "learning_rate": 6.207808856738968e-07, "loss": 1.4836, "step": 1050 }, { "epoch": 0.23098901098901098, "grad_norm": 0.24408299883564805, "learning_rate": 6.206366037025308e-07, "loss": 1.366, "step": 1051 }, { "epoch": 0.2312087912087912, "grad_norm": 0.2568232578421352, "learning_rate": 6.204922093949749e-07, "loss": 1.4863, "step": 1052 }, { "epoch": 0.23142857142857143, "grad_norm": 0.2593019193101645, "learning_rate": 6.203477028200672e-07, "loss": 1.4893, "step": 1053 }, { "epoch": 0.23164835164835165, "grad_norm": 0.27805061915979157, "learning_rate": 6.202030840466987e-07, "loss": 1.4951, "step": 1054 }, { "epoch": 0.23186813186813188, "grad_norm": 0.28336235205970894, "learning_rate": 6.200583531438145e-07, "loss": 1.475, "step": 1055 }, { "epoch": 0.2320879120879121, "grad_norm": 0.2572369675479279, "learning_rate": 6.199135101804126e-07, "loss": 1.5397, "step": 1056 }, { "epoch": 0.2323076923076923, "grad_norm": 0.28325937224786646, "learning_rate": 6.19768555225545e-07, "loss": 1.4759, "step": 1057 }, { "epoch": 0.23252747252747252, "grad_norm": 0.2572233336831093, "learning_rate": 6.196234883483166e-07, "loss": 1.4766, "step": 1058 }, { "epoch": 0.23274725274725275, "grad_norm": 0.24063422185551667, "learning_rate": 6.194783096178858e-07, "loss": 1.4624, "step": 1059 }, { "epoch": 0.23296703296703297, "grad_norm": 0.25066074513711945, "learning_rate": 6.193330191034645e-07, "loss": 1.4517, "step": 1060 }, { "epoch": 0.2331868131868132, "grad_norm": 0.25145707077417156, "learning_rate": 6.191876168743177e-07, "loss": 1.5347, "step": 1061 }, { "epoch": 0.2334065934065934, "grad_norm": 0.25596965013778983, "learning_rate": 6.190421029997638e-07, "loss": 1.4145, "step": 1062 }, { "epoch": 0.23362637362637362, "grad_norm": 0.24480346083152169, "learning_rate": 6.18896477549174e-07, "loss": 1.4719, "step": 1063 }, { "epoch": 0.23384615384615384, "grad_norm": 0.2647756689489498, "learning_rate": 6.187507405919735e-07, "loss": 1.4484, "step": 1064 }, { "epoch": 0.23406593406593407, "grad_norm": 0.24807787727321778, "learning_rate": 6.186048921976398e-07, "loss": 1.4709, "step": 1065 }, { "epoch": 0.2342857142857143, "grad_norm": 0.26266770594278166, "learning_rate": 6.184589324357042e-07, "loss": 1.4805, "step": 1066 }, { "epoch": 0.23450549450549452, "grad_norm": 0.244979882115583, "learning_rate": 6.183128613757507e-07, "loss": 1.4319, "step": 1067 }, { "epoch": 0.2347252747252747, "grad_norm": 0.25745588471583153, "learning_rate": 6.181666790874166e-07, "loss": 1.4383, "step": 1068 }, { "epoch": 0.23494505494505494, "grad_norm": 0.2610718353414187, "learning_rate": 6.180203856403917e-07, "loss": 1.3963, "step": 1069 }, { "epoch": 0.23516483516483516, "grad_norm": 0.2476652541181489, "learning_rate": 6.178739811044196e-07, "loss": 1.5077, "step": 1070 }, { "epoch": 0.2353846153846154, "grad_norm": 0.2507386054259677, "learning_rate": 6.177274655492963e-07, "loss": 1.3914, "step": 1071 }, { "epoch": 0.2356043956043956, "grad_norm": 0.2635488437850268, "learning_rate": 6.175808390448709e-07, "loss": 1.4864, "step": 1072 }, { "epoch": 0.23582417582417584, "grad_norm": 0.3189120592904306, "learning_rate": 6.174341016610453e-07, "loss": 1.4856, "step": 1073 }, { "epoch": 0.23604395604395603, "grad_norm": 0.2514042402305899, "learning_rate": 6.172872534677745e-07, "loss": 1.4953, "step": 1074 }, { "epoch": 0.23626373626373626, "grad_norm": 0.2509635934820503, "learning_rate": 6.171402945350661e-07, "loss": 1.5035, "step": 1075 }, { "epoch": 0.23648351648351648, "grad_norm": 0.2518564268565751, "learning_rate": 6.169932249329803e-07, "loss": 1.4302, "step": 1076 }, { "epoch": 0.2367032967032967, "grad_norm": 0.31047075081107933, "learning_rate": 6.168460447316305e-07, "loss": 1.4227, "step": 1077 }, { "epoch": 0.23692307692307693, "grad_norm": 0.2550929504416796, "learning_rate": 6.166987540011829e-07, "loss": 1.495, "step": 1078 }, { "epoch": 0.23714285714285716, "grad_norm": 0.250243931654738, "learning_rate": 6.165513528118556e-07, "loss": 1.505, "step": 1079 }, { "epoch": 0.23736263736263735, "grad_norm": 0.24949386266821572, "learning_rate": 6.164038412339203e-07, "loss": 1.4828, "step": 1080 }, { "epoch": 0.23758241758241758, "grad_norm": 0.25401637665261445, "learning_rate": 6.162562193377008e-07, "loss": 1.4658, "step": 1081 }, { "epoch": 0.2378021978021978, "grad_norm": 0.2602092323257622, "learning_rate": 6.161084871935735e-07, "loss": 1.4638, "step": 1082 }, { "epoch": 0.23802197802197803, "grad_norm": 0.23971141027923804, "learning_rate": 6.159606448719676e-07, "loss": 1.4253, "step": 1083 }, { "epoch": 0.23824175824175825, "grad_norm": 0.24009592180226755, "learning_rate": 6.158126924433646e-07, "loss": 1.3871, "step": 1084 }, { "epoch": 0.23846153846153847, "grad_norm": 0.26622764617315986, "learning_rate": 6.156646299782986e-07, "loss": 1.5069, "step": 1085 }, { "epoch": 0.23868131868131867, "grad_norm": 0.28647737693092973, "learning_rate": 6.155164575473562e-07, "loss": 1.4419, "step": 1086 }, { "epoch": 0.2389010989010989, "grad_norm": 0.25808251652871966, "learning_rate": 6.153681752211764e-07, "loss": 1.4971, "step": 1087 }, { "epoch": 0.23912087912087912, "grad_norm": 0.25735275747936, "learning_rate": 6.152197830704505e-07, "loss": 1.4938, "step": 1088 }, { "epoch": 0.23934065934065935, "grad_norm": 0.24904505230549445, "learning_rate": 6.150712811659221e-07, "loss": 1.4535, "step": 1089 }, { "epoch": 0.23956043956043957, "grad_norm": 0.253228066051037, "learning_rate": 6.149226695783873e-07, "loss": 1.5039, "step": 1090 }, { "epoch": 0.23978021978021977, "grad_norm": 0.2419367774032695, "learning_rate": 6.147739483786947e-07, "loss": 1.5014, "step": 1091 }, { "epoch": 0.24, "grad_norm": 0.24568896065683954, "learning_rate": 6.146251176377446e-07, "loss": 1.4932, "step": 1092 }, { "epoch": 0.24021978021978022, "grad_norm": 0.24254086343904321, "learning_rate": 6.144761774264899e-07, "loss": 1.4741, "step": 1093 }, { "epoch": 0.24043956043956044, "grad_norm": 0.2503162565465864, "learning_rate": 6.143271278159354e-07, "loss": 1.5192, "step": 1094 }, { "epoch": 0.24065934065934066, "grad_norm": 0.2633170556423953, "learning_rate": 6.141779688771385e-07, "loss": 1.4848, "step": 1095 }, { "epoch": 0.2408791208791209, "grad_norm": 0.2666038663461278, "learning_rate": 6.140287006812085e-07, "loss": 1.4404, "step": 1096 }, { "epoch": 0.24109890109890109, "grad_norm": 0.26634068694512053, "learning_rate": 6.138793232993064e-07, "loss": 1.449, "step": 1097 }, { "epoch": 0.2413186813186813, "grad_norm": 0.24613572287675384, "learning_rate": 6.137298368026461e-07, "loss": 1.4759, "step": 1098 }, { "epoch": 0.24153846153846154, "grad_norm": 0.2522587364888656, "learning_rate": 6.135802412624926e-07, "loss": 1.4832, "step": 1099 }, { "epoch": 0.24175824175824176, "grad_norm": 0.24552154772760235, "learning_rate": 6.134305367501636e-07, "loss": 1.4248, "step": 1100 }, { "epoch": 0.24197802197802198, "grad_norm": 0.2533079304799372, "learning_rate": 6.132807233370282e-07, "loss": 1.4299, "step": 1101 }, { "epoch": 0.2421978021978022, "grad_norm": 0.24640104021676112, "learning_rate": 6.13130801094508e-07, "loss": 1.4303, "step": 1102 }, { "epoch": 0.2424175824175824, "grad_norm": 0.28495899690235366, "learning_rate": 6.129807700940759e-07, "loss": 1.4791, "step": 1103 }, { "epoch": 0.24263736263736263, "grad_norm": 0.24643612019427724, "learning_rate": 6.128306304072571e-07, "loss": 1.4784, "step": 1104 }, { "epoch": 0.24285714285714285, "grad_norm": 0.25865255982401164, "learning_rate": 6.126803821056284e-07, "loss": 1.5042, "step": 1105 }, { "epoch": 0.24307692307692308, "grad_norm": 0.23727486138472725, "learning_rate": 6.125300252608182e-07, "loss": 1.4437, "step": 1106 }, { "epoch": 0.2432967032967033, "grad_norm": 0.30202384726110537, "learning_rate": 6.12379559944507e-07, "loss": 1.4983, "step": 1107 }, { "epoch": 0.24351648351648353, "grad_norm": 0.2805173630023854, "learning_rate": 6.122289862284268e-07, "loss": 1.5131, "step": 1108 }, { "epoch": 0.24373626373626373, "grad_norm": 0.26879845342320885, "learning_rate": 6.120783041843616e-07, "loss": 1.5035, "step": 1109 }, { "epoch": 0.24395604395604395, "grad_norm": 0.25402096552527337, "learning_rate": 6.119275138841464e-07, "loss": 1.4861, "step": 1110 }, { "epoch": 0.24417582417582417, "grad_norm": 0.31932832501073694, "learning_rate": 6.117766153996686e-07, "loss": 1.4874, "step": 1111 }, { "epoch": 0.2443956043956044, "grad_norm": 0.2532955815762932, "learning_rate": 6.116256088028665e-07, "loss": 1.5421, "step": 1112 }, { "epoch": 0.24461538461538462, "grad_norm": 0.2571008324647733, "learning_rate": 6.114744941657302e-07, "loss": 1.5376, "step": 1113 }, { "epoch": 0.24483516483516485, "grad_norm": 0.26672802162222276, "learning_rate": 6.113232715603014e-07, "loss": 1.5261, "step": 1114 }, { "epoch": 0.24505494505494504, "grad_norm": 0.27630820615775087, "learning_rate": 6.111719410586733e-07, "loss": 1.4139, "step": 1115 }, { "epoch": 0.24527472527472527, "grad_norm": 0.25158388736303877, "learning_rate": 6.110205027329901e-07, "loss": 1.4808, "step": 1116 }, { "epoch": 0.2454945054945055, "grad_norm": 0.2536800550823491, "learning_rate": 6.108689566554482e-07, "loss": 1.4362, "step": 1117 }, { "epoch": 0.24571428571428572, "grad_norm": 0.2522730808904053, "learning_rate": 6.107173028982945e-07, "loss": 1.4658, "step": 1118 }, { "epoch": 0.24593406593406594, "grad_norm": 0.25592103393523086, "learning_rate": 6.105655415338279e-07, "loss": 1.5232, "step": 1119 }, { "epoch": 0.24615384615384617, "grad_norm": 0.26320712384616624, "learning_rate": 6.104136726343981e-07, "loss": 1.4594, "step": 1120 }, { "epoch": 0.24637362637362636, "grad_norm": 0.25206434885669243, "learning_rate": 6.102616962724065e-07, "loss": 1.5077, "step": 1121 }, { "epoch": 0.2465934065934066, "grad_norm": 0.2506092463649332, "learning_rate": 6.101096125203055e-07, "loss": 1.4598, "step": 1122 }, { "epoch": 0.2468131868131868, "grad_norm": 0.2535839157735764, "learning_rate": 6.099574214505987e-07, "loss": 1.5111, "step": 1123 }, { "epoch": 0.24703296703296704, "grad_norm": 0.2706976752314015, "learning_rate": 6.098051231358407e-07, "loss": 1.5531, "step": 1124 }, { "epoch": 0.24725274725274726, "grad_norm": 0.261814361613236, "learning_rate": 6.096527176486378e-07, "loss": 1.4746, "step": 1125 }, { "epoch": 0.24747252747252746, "grad_norm": 0.25563828066269206, "learning_rate": 6.095002050616468e-07, "loss": 1.4351, "step": 1126 }, { "epoch": 0.24769230769230768, "grad_norm": 0.24262746175929104, "learning_rate": 6.093475854475757e-07, "loss": 1.4482, "step": 1127 }, { "epoch": 0.2479120879120879, "grad_norm": 0.24905697752627842, "learning_rate": 6.091948588791837e-07, "loss": 1.4243, "step": 1128 }, { "epoch": 0.24813186813186813, "grad_norm": 0.27964225780152474, "learning_rate": 6.09042025429281e-07, "loss": 1.4902, "step": 1129 }, { "epoch": 0.24835164835164836, "grad_norm": 0.25082380315304037, "learning_rate": 6.088890851707283e-07, "loss": 1.4631, "step": 1130 }, { "epoch": 0.24857142857142858, "grad_norm": 0.24560773808871394, "learning_rate": 6.08736038176438e-07, "loss": 1.4844, "step": 1131 }, { "epoch": 0.24879120879120878, "grad_norm": 0.24299561284907775, "learning_rate": 6.085828845193726e-07, "loss": 1.4254, "step": 1132 }, { "epoch": 0.249010989010989, "grad_norm": 0.2549622820375491, "learning_rate": 6.084296242725458e-07, "loss": 1.5371, "step": 1133 }, { "epoch": 0.24923076923076923, "grad_norm": 0.2610460521985358, "learning_rate": 6.082762575090224e-07, "loss": 1.5026, "step": 1134 }, { "epoch": 0.24945054945054945, "grad_norm": 0.25629212629555614, "learning_rate": 6.081227843019173e-07, "loss": 1.4537, "step": 1135 }, { "epoch": 0.24967032967032968, "grad_norm": 0.24685471940099946, "learning_rate": 6.079692047243967e-07, "loss": 1.4827, "step": 1136 }, { "epoch": 0.2498901098901099, "grad_norm": 0.2453785314020694, "learning_rate": 6.078155188496775e-07, "loss": 1.4802, "step": 1137 }, { "epoch": 0.2501098901098901, "grad_norm": 0.24864026929057834, "learning_rate": 6.076617267510269e-07, "loss": 1.4681, "step": 1138 }, { "epoch": 0.25032967032967035, "grad_norm": 0.257752415113131, "learning_rate": 6.075078285017632e-07, "loss": 1.5005, "step": 1139 }, { "epoch": 0.25054945054945055, "grad_norm": 0.3517741283967574, "learning_rate": 6.073538241752547e-07, "loss": 1.4816, "step": 1140 }, { "epoch": 0.25076923076923074, "grad_norm": 0.26452520257594947, "learning_rate": 6.07199713844921e-07, "loss": 1.4731, "step": 1141 }, { "epoch": 0.250989010989011, "grad_norm": 0.25064377542417143, "learning_rate": 6.070454975842316e-07, "loss": 1.4347, "step": 1142 }, { "epoch": 0.2512087912087912, "grad_norm": 0.2542937350562004, "learning_rate": 6.068911754667068e-07, "loss": 1.4986, "step": 1143 }, { "epoch": 0.25142857142857145, "grad_norm": 0.2519101012965945, "learning_rate": 6.067367475659176e-07, "loss": 1.4823, "step": 1144 }, { "epoch": 0.25164835164835164, "grad_norm": 0.25828895174954947, "learning_rate": 6.065822139554849e-07, "loss": 1.4537, "step": 1145 }, { "epoch": 0.2518681318681319, "grad_norm": 0.24608984776859877, "learning_rate": 6.064275747090804e-07, "loss": 1.4811, "step": 1146 }, { "epoch": 0.2520879120879121, "grad_norm": 0.3663766605797682, "learning_rate": 6.06272829900426e-07, "loss": 1.4824, "step": 1147 }, { "epoch": 0.2523076923076923, "grad_norm": 0.24301132196881353, "learning_rate": 6.061179796032939e-07, "loss": 1.5269, "step": 1148 }, { "epoch": 0.25252747252747254, "grad_norm": 0.24987778657281365, "learning_rate": 6.059630238915066e-07, "loss": 1.4716, "step": 1149 }, { "epoch": 0.25274725274725274, "grad_norm": 0.25596706986657414, "learning_rate": 6.058079628389371e-07, "loss": 1.4299, "step": 1150 }, { "epoch": 0.252967032967033, "grad_norm": 0.24944191907968008, "learning_rate": 6.056527965195081e-07, "loss": 1.4441, "step": 1151 }, { "epoch": 0.2531868131868132, "grad_norm": 0.25309000759960226, "learning_rate": 6.05497525007193e-07, "loss": 1.5258, "step": 1152 }, { "epoch": 0.2534065934065934, "grad_norm": 0.24681891625672514, "learning_rate": 6.053421483760152e-07, "loss": 1.4661, "step": 1153 }, { "epoch": 0.25362637362637364, "grad_norm": 0.25797776885383356, "learning_rate": 6.05186666700048e-07, "loss": 1.4855, "step": 1154 }, { "epoch": 0.25384615384615383, "grad_norm": 0.251472266688807, "learning_rate": 6.050310800534152e-07, "loss": 1.4079, "step": 1155 }, { "epoch": 0.2540659340659341, "grad_norm": 0.25504410785351683, "learning_rate": 6.048753885102899e-07, "loss": 1.5241, "step": 1156 }, { "epoch": 0.2542857142857143, "grad_norm": 0.2659572613583344, "learning_rate": 6.047195921448962e-07, "loss": 1.5127, "step": 1157 }, { "epoch": 0.2545054945054945, "grad_norm": 0.24619056708730233, "learning_rate": 6.045636910315074e-07, "loss": 1.5726, "step": 1158 }, { "epoch": 0.25472527472527473, "grad_norm": 0.24478826490368982, "learning_rate": 6.044076852444471e-07, "loss": 1.4612, "step": 1159 }, { "epoch": 0.2549450549450549, "grad_norm": 0.29208441870154067, "learning_rate": 6.042515748580885e-07, "loss": 1.5348, "step": 1160 }, { "epoch": 0.2551648351648352, "grad_norm": 0.24538584809024377, "learning_rate": 6.040953599468552e-07, "loss": 1.5186, "step": 1161 }, { "epoch": 0.2553846153846154, "grad_norm": 0.25643685300585894, "learning_rate": 6.039390405852199e-07, "loss": 1.4723, "step": 1162 }, { "epoch": 0.25560439560439563, "grad_norm": 0.2547927574158795, "learning_rate": 6.03782616847706e-07, "loss": 1.3975, "step": 1163 }, { "epoch": 0.2558241758241758, "grad_norm": 0.25904888762465, "learning_rate": 6.036260888088857e-07, "loss": 1.486, "step": 1164 }, { "epoch": 0.256043956043956, "grad_norm": 0.2590186582179336, "learning_rate": 6.034694565433816e-07, "loss": 1.4909, "step": 1165 }, { "epoch": 0.2562637362637363, "grad_norm": 0.2514821084980522, "learning_rate": 6.033127201258656e-07, "loss": 1.4762, "step": 1166 }, { "epoch": 0.25648351648351647, "grad_norm": 0.25495314239700223, "learning_rate": 6.031558796310597e-07, "loss": 1.4726, "step": 1167 }, { "epoch": 0.2567032967032967, "grad_norm": 0.24749835721393953, "learning_rate": 6.029989351337348e-07, "loss": 1.4428, "step": 1168 }, { "epoch": 0.2569230769230769, "grad_norm": 0.2539325966425873, "learning_rate": 6.028418867087122e-07, "loss": 1.5169, "step": 1169 }, { "epoch": 0.2571428571428571, "grad_norm": 0.24306333874593364, "learning_rate": 6.026847344308623e-07, "loss": 1.5018, "step": 1170 }, { "epoch": 0.25736263736263737, "grad_norm": 0.24297781880095035, "learning_rate": 6.025274783751049e-07, "loss": 1.4832, "step": 1171 }, { "epoch": 0.25758241758241757, "grad_norm": 0.25627900845877255, "learning_rate": 6.023701186164098e-07, "loss": 1.4379, "step": 1172 }, { "epoch": 0.2578021978021978, "grad_norm": 0.24866316311975892, "learning_rate": 6.022126552297955e-07, "loss": 1.4023, "step": 1173 }, { "epoch": 0.258021978021978, "grad_norm": 0.24874008895024652, "learning_rate": 6.020550882903304e-07, "loss": 1.4933, "step": 1174 }, { "epoch": 0.25824175824175827, "grad_norm": 0.25464878977353284, "learning_rate": 6.018974178731322e-07, "loss": 1.4762, "step": 1175 }, { "epoch": 0.25846153846153846, "grad_norm": 0.24715180656412153, "learning_rate": 6.01739644053368e-07, "loss": 1.4166, "step": 1176 }, { "epoch": 0.25868131868131866, "grad_norm": 0.24382487413214948, "learning_rate": 6.015817669062539e-07, "loss": 1.4211, "step": 1177 }, { "epoch": 0.2589010989010989, "grad_norm": 0.26210602072240047, "learning_rate": 6.014237865070555e-07, "loss": 1.5183, "step": 1178 }, { "epoch": 0.2591208791208791, "grad_norm": 0.24780245259611958, "learning_rate": 6.012657029310876e-07, "loss": 1.4403, "step": 1179 }, { "epoch": 0.25934065934065936, "grad_norm": 0.23982738635279338, "learning_rate": 6.01107516253714e-07, "loss": 1.4952, "step": 1180 }, { "epoch": 0.25956043956043956, "grad_norm": 0.25313364511518943, "learning_rate": 6.009492265503482e-07, "loss": 1.472, "step": 1181 }, { "epoch": 0.25978021978021976, "grad_norm": 0.2523675011130071, "learning_rate": 6.00790833896452e-07, "loss": 1.5084, "step": 1182 }, { "epoch": 0.26, "grad_norm": 0.2423015383390281, "learning_rate": 6.006323383675369e-07, "loss": 1.4853, "step": 1183 }, { "epoch": 0.2602197802197802, "grad_norm": 0.26489790333011937, "learning_rate": 6.004737400391632e-07, "loss": 1.5596, "step": 1184 }, { "epoch": 0.26043956043956046, "grad_norm": 0.2418984060776012, "learning_rate": 6.003150389869404e-07, "loss": 1.4923, "step": 1185 }, { "epoch": 0.26065934065934065, "grad_norm": 0.2678745582741978, "learning_rate": 6.001562352865266e-07, "loss": 1.5146, "step": 1186 }, { "epoch": 0.26087912087912085, "grad_norm": 0.2520122665809618, "learning_rate": 5.999973290136292e-07, "loss": 1.4754, "step": 1187 }, { "epoch": 0.2610989010989011, "grad_norm": 0.248200953070769, "learning_rate": 5.998383202440043e-07, "loss": 1.5028, "step": 1188 }, { "epoch": 0.2613186813186813, "grad_norm": 0.24778067156462122, "learning_rate": 5.99679209053457e-07, "loss": 1.4667, "step": 1189 }, { "epoch": 0.26153846153846155, "grad_norm": 0.2722630605507682, "learning_rate": 5.99519995517841e-07, "loss": 1.5367, "step": 1190 }, { "epoch": 0.26175824175824175, "grad_norm": 0.2662416031120894, "learning_rate": 5.993606797130592e-07, "loss": 1.4845, "step": 1191 }, { "epoch": 0.261978021978022, "grad_norm": 0.2671806445602299, "learning_rate": 5.992012617150628e-07, "loss": 1.4437, "step": 1192 }, { "epoch": 0.2621978021978022, "grad_norm": 0.2524153192817131, "learning_rate": 5.99041741599852e-07, "loss": 1.4888, "step": 1193 }, { "epoch": 0.2624175824175824, "grad_norm": 0.24092041229443598, "learning_rate": 5.988821194434758e-07, "loss": 1.5152, "step": 1194 }, { "epoch": 0.26263736263736265, "grad_norm": 0.2568200801688485, "learning_rate": 5.987223953220311e-07, "loss": 1.4709, "step": 1195 }, { "epoch": 0.26285714285714284, "grad_norm": 0.2588652850938266, "learning_rate": 5.985625693116645e-07, "loss": 1.4862, "step": 1196 }, { "epoch": 0.2630769230769231, "grad_norm": 0.24963390531839405, "learning_rate": 5.984026414885704e-07, "loss": 1.4743, "step": 1197 }, { "epoch": 0.2632967032967033, "grad_norm": 0.2448874702757865, "learning_rate": 5.982426119289921e-07, "loss": 1.4444, "step": 1198 }, { "epoch": 0.2635164835164835, "grad_norm": 0.25669812321946955, "learning_rate": 5.980824807092211e-07, "loss": 1.47, "step": 1199 }, { "epoch": 0.26373626373626374, "grad_norm": 0.2421719234121139, "learning_rate": 5.979222479055976e-07, "loss": 1.4505, "step": 1200 }, { "epoch": 0.26395604395604394, "grad_norm": 0.27151723758011004, "learning_rate": 5.977619135945102e-07, "loss": 1.5311, "step": 1201 }, { "epoch": 0.2641758241758242, "grad_norm": 0.2542599355346271, "learning_rate": 5.976014778523959e-07, "loss": 1.5004, "step": 1202 }, { "epoch": 0.2643956043956044, "grad_norm": 0.2511556108189597, "learning_rate": 5.974409407557399e-07, "loss": 1.4541, "step": 1203 }, { "epoch": 0.26461538461538464, "grad_norm": 0.25551017572279605, "learning_rate": 5.97280302381076e-07, "loss": 1.44, "step": 1204 }, { "epoch": 0.26483516483516484, "grad_norm": 0.2553798629811734, "learning_rate": 5.971195628049859e-07, "loss": 1.4905, "step": 1205 }, { "epoch": 0.26505494505494503, "grad_norm": 0.2642029756555833, "learning_rate": 5.969587221041e-07, "loss": 1.4666, "step": 1206 }, { "epoch": 0.2652747252747253, "grad_norm": 0.2853320795960418, "learning_rate": 5.967977803550965e-07, "loss": 1.4327, "step": 1207 }, { "epoch": 0.2654945054945055, "grad_norm": 0.2405682250669, "learning_rate": 5.96636737634702e-07, "loss": 1.499, "step": 1208 }, { "epoch": 0.26571428571428574, "grad_norm": 0.24965478664594806, "learning_rate": 5.964755940196912e-07, "loss": 1.4309, "step": 1209 }, { "epoch": 0.26593406593406593, "grad_norm": 0.2477960693702363, "learning_rate": 5.96314349586887e-07, "loss": 1.4645, "step": 1210 }, { "epoch": 0.26615384615384613, "grad_norm": 0.25929811027231725, "learning_rate": 5.961530044131599e-07, "loss": 1.5014, "step": 1211 }, { "epoch": 0.2663736263736264, "grad_norm": 0.2696274977295813, "learning_rate": 5.95991558575429e-07, "loss": 1.5063, "step": 1212 }, { "epoch": 0.2665934065934066, "grad_norm": 0.24561139748105792, "learning_rate": 5.958300121506612e-07, "loss": 1.4739, "step": 1213 }, { "epoch": 0.26681318681318683, "grad_norm": 0.2549922263907234, "learning_rate": 5.956683652158714e-07, "loss": 1.4736, "step": 1214 }, { "epoch": 0.26703296703296703, "grad_norm": 0.27698993736333094, "learning_rate": 5.955066178481222e-07, "loss": 1.4859, "step": 1215 }, { "epoch": 0.2672527472527473, "grad_norm": 0.24492865588973883, "learning_rate": 5.95344770124524e-07, "loss": 1.4527, "step": 1216 }, { "epoch": 0.2674725274725275, "grad_norm": 0.24360866308103699, "learning_rate": 5.951828221222356e-07, "loss": 1.4194, "step": 1217 }, { "epoch": 0.2676923076923077, "grad_norm": 0.2422559900610527, "learning_rate": 5.950207739184632e-07, "loss": 1.4682, "step": 1218 }, { "epoch": 0.2679120879120879, "grad_norm": 0.2526136818543357, "learning_rate": 5.948586255904606e-07, "loss": 1.3901, "step": 1219 }, { "epoch": 0.2681318681318681, "grad_norm": 0.2453029494258157, "learning_rate": 5.9469637721553e-07, "loss": 1.45, "step": 1220 }, { "epoch": 0.2683516483516484, "grad_norm": 0.2448937162362778, "learning_rate": 5.945340288710202e-07, "loss": 1.4972, "step": 1221 }, { "epoch": 0.26857142857142857, "grad_norm": 0.248643964311628, "learning_rate": 5.943715806343291e-07, "loss": 1.4648, "step": 1222 }, { "epoch": 0.26879120879120877, "grad_norm": 0.2427660684241653, "learning_rate": 5.942090325829009e-07, "loss": 1.5058, "step": 1223 }, { "epoch": 0.269010989010989, "grad_norm": 0.3000472982461334, "learning_rate": 5.94046384794228e-07, "loss": 1.4523, "step": 1224 }, { "epoch": 0.2692307692307692, "grad_norm": 0.2510219939968679, "learning_rate": 5.938836373458505e-07, "loss": 1.5323, "step": 1225 }, { "epoch": 0.26945054945054947, "grad_norm": 0.25057047923701725, "learning_rate": 5.937207903153555e-07, "loss": 1.4894, "step": 1226 }, { "epoch": 0.26967032967032967, "grad_norm": 0.25916602966015984, "learning_rate": 5.935578437803782e-07, "loss": 1.4736, "step": 1227 }, { "epoch": 0.26989010989010986, "grad_norm": 0.2528863643291575, "learning_rate": 5.933947978186005e-07, "loss": 1.4519, "step": 1228 }, { "epoch": 0.2701098901098901, "grad_norm": 0.24951097085657922, "learning_rate": 5.932316525077523e-07, "loss": 1.5032, "step": 1229 }, { "epoch": 0.2703296703296703, "grad_norm": 0.2553958538809859, "learning_rate": 5.930684079256109e-07, "loss": 1.5227, "step": 1230 }, { "epoch": 0.27054945054945057, "grad_norm": 0.24481741189448933, "learning_rate": 5.929050641500002e-07, "loss": 1.4637, "step": 1231 }, { "epoch": 0.27076923076923076, "grad_norm": 0.33419297581145896, "learning_rate": 5.927416212587922e-07, "loss": 1.4128, "step": 1232 }, { "epoch": 0.270989010989011, "grad_norm": 0.2449685437448095, "learning_rate": 5.925780793299058e-07, "loss": 1.4575, "step": 1233 }, { "epoch": 0.2712087912087912, "grad_norm": 0.24577871034173174, "learning_rate": 5.924144384413071e-07, "loss": 1.4361, "step": 1234 }, { "epoch": 0.2714285714285714, "grad_norm": 0.26212461832473327, "learning_rate": 5.922506986710095e-07, "loss": 1.4193, "step": 1235 }, { "epoch": 0.27164835164835166, "grad_norm": 0.24295507349598566, "learning_rate": 5.920868600970731e-07, "loss": 1.4989, "step": 1236 }, { "epoch": 0.27186813186813186, "grad_norm": 0.24841010846308056, "learning_rate": 5.919229227976061e-07, "loss": 1.4311, "step": 1237 }, { "epoch": 0.2720879120879121, "grad_norm": 0.2728685246760535, "learning_rate": 5.917588868507625e-07, "loss": 1.4732, "step": 1238 }, { "epoch": 0.2723076923076923, "grad_norm": 0.241692791019892, "learning_rate": 5.915947523347445e-07, "loss": 1.5371, "step": 1239 }, { "epoch": 0.2725274725274725, "grad_norm": 0.26786092040486226, "learning_rate": 5.914305193278002e-07, "loss": 1.4705, "step": 1240 }, { "epoch": 0.27274725274725276, "grad_norm": 0.25055350348334465, "learning_rate": 5.912661879082256e-07, "loss": 1.556, "step": 1241 }, { "epoch": 0.27296703296703295, "grad_norm": 0.2559995740907689, "learning_rate": 5.91101758154363e-07, "loss": 1.4899, "step": 1242 }, { "epoch": 0.2731868131868132, "grad_norm": 0.247667585677873, "learning_rate": 5.909372301446021e-07, "loss": 1.4979, "step": 1243 }, { "epoch": 0.2734065934065934, "grad_norm": 0.24606259239820927, "learning_rate": 5.907726039573788e-07, "loss": 1.5447, "step": 1244 }, { "epoch": 0.27362637362637365, "grad_norm": 0.2574004117847577, "learning_rate": 5.906078796711764e-07, "loss": 1.4683, "step": 1245 }, { "epoch": 0.27384615384615385, "grad_norm": 0.24736881033961086, "learning_rate": 5.904430573645246e-07, "loss": 1.4195, "step": 1246 }, { "epoch": 0.27406593406593405, "grad_norm": 0.24530580572464508, "learning_rate": 5.902781371159999e-07, "loss": 1.4444, "step": 1247 }, { "epoch": 0.2742857142857143, "grad_norm": 0.34801615245113166, "learning_rate": 5.901131190042257e-07, "loss": 1.4543, "step": 1248 }, { "epoch": 0.2745054945054945, "grad_norm": 0.27425721649670726, "learning_rate": 5.899480031078717e-07, "loss": 1.4905, "step": 1249 }, { "epoch": 0.27472527472527475, "grad_norm": 0.24909979543101832, "learning_rate": 5.897827895056544e-07, "loss": 1.4948, "step": 1250 }, { "epoch": 0.27494505494505495, "grad_norm": 0.2453928589008479, "learning_rate": 5.896174782763371e-07, "loss": 1.4192, "step": 1251 }, { "epoch": 0.27516483516483514, "grad_norm": 0.24784211470781856, "learning_rate": 5.894520694987292e-07, "loss": 1.5016, "step": 1252 }, { "epoch": 0.2753846153846154, "grad_norm": 0.25385193474328976, "learning_rate": 5.89286563251687e-07, "loss": 1.4749, "step": 1253 }, { "epoch": 0.2756043956043956, "grad_norm": 0.2500375390716796, "learning_rate": 5.891209596141132e-07, "loss": 1.4689, "step": 1254 }, { "epoch": 0.27582417582417584, "grad_norm": 0.25183689524162134, "learning_rate": 5.889552586649564e-07, "loss": 1.4391, "step": 1255 }, { "epoch": 0.27604395604395604, "grad_norm": 0.25229187969337635, "learning_rate": 5.887894604832125e-07, "loss": 1.4815, "step": 1256 }, { "epoch": 0.27626373626373624, "grad_norm": 0.24928781380181728, "learning_rate": 5.88623565147923e-07, "loss": 1.4695, "step": 1257 }, { "epoch": 0.2764835164835165, "grad_norm": 0.2562942501214982, "learning_rate": 5.884575727381761e-07, "loss": 1.4697, "step": 1258 }, { "epoch": 0.2767032967032967, "grad_norm": 0.24559651092598303, "learning_rate": 5.882914833331062e-07, "loss": 1.4488, "step": 1259 }, { "epoch": 0.27692307692307694, "grad_norm": 0.35499717285209875, "learning_rate": 5.881252970118937e-07, "loss": 1.4326, "step": 1260 }, { "epoch": 0.27714285714285714, "grad_norm": 0.278990004685268, "learning_rate": 5.879590138537657e-07, "loss": 1.4719, "step": 1261 }, { "epoch": 0.2773626373626374, "grad_norm": 0.40140864415897365, "learning_rate": 5.877926339379949e-07, "loss": 1.3861, "step": 1262 }, { "epoch": 0.2775824175824176, "grad_norm": 0.29577977795576144, "learning_rate": 5.876261573439008e-07, "loss": 1.4429, "step": 1263 }, { "epoch": 0.2778021978021978, "grad_norm": 0.25780139038569383, "learning_rate": 5.874595841508483e-07, "loss": 1.492, "step": 1264 }, { "epoch": 0.27802197802197803, "grad_norm": 0.25949464208174866, "learning_rate": 5.872929144382486e-07, "loss": 1.4194, "step": 1265 }, { "epoch": 0.27824175824175823, "grad_norm": 0.23896647886805938, "learning_rate": 5.871261482855592e-07, "loss": 1.4672, "step": 1266 }, { "epoch": 0.2784615384615385, "grad_norm": 0.29873658112030305, "learning_rate": 5.869592857722832e-07, "loss": 1.5212, "step": 1267 }, { "epoch": 0.2786813186813187, "grad_norm": 0.24995422539987305, "learning_rate": 5.867923269779698e-07, "loss": 1.5121, "step": 1268 }, { "epoch": 0.2789010989010989, "grad_norm": 0.24988830906918102, "learning_rate": 5.866252719822142e-07, "loss": 1.5253, "step": 1269 }, { "epoch": 0.27912087912087913, "grad_norm": 0.2822070269451855, "learning_rate": 5.864581208646571e-07, "loss": 1.4793, "step": 1270 }, { "epoch": 0.2793406593406593, "grad_norm": 0.24384360636722566, "learning_rate": 5.862908737049855e-07, "loss": 1.409, "step": 1271 }, { "epoch": 0.2795604395604396, "grad_norm": 0.3010815653589461, "learning_rate": 5.861235305829318e-07, "loss": 1.4737, "step": 1272 }, { "epoch": 0.2797802197802198, "grad_norm": 0.2426473360780779, "learning_rate": 5.859560915782743e-07, "loss": 1.4597, "step": 1273 }, { "epoch": 0.28, "grad_norm": 0.24031709265797813, "learning_rate": 5.857885567708372e-07, "loss": 1.4473, "step": 1274 }, { "epoch": 0.2802197802197802, "grad_norm": 0.24887282661998375, "learning_rate": 5.856209262404901e-07, "loss": 1.5392, "step": 1275 }, { "epoch": 0.2804395604395604, "grad_norm": 0.33045530720916494, "learning_rate": 5.854532000671482e-07, "loss": 1.4809, "step": 1276 }, { "epoch": 0.2806593406593407, "grad_norm": 0.2598270734406072, "learning_rate": 5.852853783307725e-07, "loss": 1.5037, "step": 1277 }, { "epoch": 0.28087912087912087, "grad_norm": 0.2599588112358863, "learning_rate": 5.851174611113696e-07, "loss": 1.5084, "step": 1278 }, { "epoch": 0.2810989010989011, "grad_norm": 0.2603727300609368, "learning_rate": 5.849494484889913e-07, "loss": 1.4728, "step": 1279 }, { "epoch": 0.2813186813186813, "grad_norm": 0.25999787393649765, "learning_rate": 5.847813405437353e-07, "loss": 1.4216, "step": 1280 }, { "epoch": 0.2815384615384615, "grad_norm": 0.24769042216447004, "learning_rate": 5.846131373557442e-07, "loss": 1.4689, "step": 1281 }, { "epoch": 0.28175824175824177, "grad_norm": 0.2540554170799448, "learning_rate": 5.844448390052066e-07, "loss": 1.4834, "step": 1282 }, { "epoch": 0.28197802197802196, "grad_norm": 0.26568707123092805, "learning_rate": 5.84276445572356e-07, "loss": 1.4566, "step": 1283 }, { "epoch": 0.2821978021978022, "grad_norm": 0.26055271950319026, "learning_rate": 5.841079571374717e-07, "loss": 1.5031, "step": 1284 }, { "epoch": 0.2824175824175824, "grad_norm": 0.2452788727718544, "learning_rate": 5.839393737808778e-07, "loss": 1.474, "step": 1285 }, { "epoch": 0.2826373626373626, "grad_norm": 0.24755361748064456, "learning_rate": 5.83770695582944e-07, "loss": 1.4287, "step": 1286 }, { "epoch": 0.28285714285714286, "grad_norm": 0.408446084926422, "learning_rate": 5.836019226240848e-07, "loss": 1.4857, "step": 1287 }, { "epoch": 0.28307692307692306, "grad_norm": 0.24802297923693442, "learning_rate": 5.834330549847604e-07, "loss": 1.4467, "step": 1288 }, { "epoch": 0.2832967032967033, "grad_norm": 0.2456449266928603, "learning_rate": 5.832640927454759e-07, "loss": 1.4625, "step": 1289 }, { "epoch": 0.2835164835164835, "grad_norm": 0.24884916776155724, "learning_rate": 5.830950359867815e-07, "loss": 1.4963, "step": 1290 }, { "epoch": 0.28373626373626376, "grad_norm": 0.2525680171064067, "learning_rate": 5.829258847892723e-07, "loss": 1.4822, "step": 1291 }, { "epoch": 0.28395604395604396, "grad_norm": 0.24423421079466054, "learning_rate": 5.827566392335887e-07, "loss": 1.447, "step": 1292 }, { "epoch": 0.28417582417582415, "grad_norm": 0.269983032180082, "learning_rate": 5.82587299400416e-07, "loss": 1.5456, "step": 1293 }, { "epoch": 0.2843956043956044, "grad_norm": 0.2559594861722576, "learning_rate": 5.824178653704844e-07, "loss": 1.4456, "step": 1294 }, { "epoch": 0.2846153846153846, "grad_norm": 0.2436519307438797, "learning_rate": 5.822483372245688e-07, "loss": 1.5017, "step": 1295 }, { "epoch": 0.28483516483516486, "grad_norm": 0.2520568394304604, "learning_rate": 5.820787150434893e-07, "loss": 1.4413, "step": 1296 }, { "epoch": 0.28505494505494505, "grad_norm": 0.24142164071110692, "learning_rate": 5.819089989081109e-07, "loss": 1.481, "step": 1297 }, { "epoch": 0.28527472527472525, "grad_norm": 0.25596033369892546, "learning_rate": 5.817391888993431e-07, "loss": 1.4318, "step": 1298 }, { "epoch": 0.2854945054945055, "grad_norm": 0.2544386478012293, "learning_rate": 5.8156928509814e-07, "loss": 1.4276, "step": 1299 }, { "epoch": 0.2857142857142857, "grad_norm": 0.25212373912829383, "learning_rate": 5.813992875855011e-07, "loss": 1.4799, "step": 1300 }, { "epoch": 0.28593406593406595, "grad_norm": 0.25407121739201727, "learning_rate": 5.812291964424698e-07, "loss": 1.4665, "step": 1301 }, { "epoch": 0.28615384615384615, "grad_norm": 0.2511055574424577, "learning_rate": 5.810590117501346e-07, "loss": 1.4749, "step": 1302 }, { "epoch": 0.2863736263736264, "grad_norm": 0.2772291435018925, "learning_rate": 5.808887335896284e-07, "loss": 1.534, "step": 1303 }, { "epoch": 0.2865934065934066, "grad_norm": 0.248031946299549, "learning_rate": 5.80718362042129e-07, "loss": 1.4937, "step": 1304 }, { "epoch": 0.2868131868131868, "grad_norm": 0.25396776910396807, "learning_rate": 5.805478971888583e-07, "loss": 1.4854, "step": 1305 }, { "epoch": 0.28703296703296705, "grad_norm": 0.24410515270199715, "learning_rate": 5.803773391110826e-07, "loss": 1.4456, "step": 1306 }, { "epoch": 0.28725274725274724, "grad_norm": 0.2770756880635025, "learning_rate": 5.802066878901133e-07, "loss": 1.4769, "step": 1307 }, { "epoch": 0.2874725274725275, "grad_norm": 0.25348000723592246, "learning_rate": 5.800359436073057e-07, "loss": 1.4376, "step": 1308 }, { "epoch": 0.2876923076923077, "grad_norm": 0.2484354440295395, "learning_rate": 5.798651063440592e-07, "loss": 1.4513, "step": 1309 }, { "epoch": 0.2879120879120879, "grad_norm": 0.2517249240776544, "learning_rate": 5.796941761818182e-07, "loss": 1.4351, "step": 1310 }, { "epoch": 0.28813186813186814, "grad_norm": 0.2504832272346342, "learning_rate": 5.795231532020711e-07, "loss": 1.4538, "step": 1311 }, { "epoch": 0.28835164835164834, "grad_norm": 0.3019003841272291, "learning_rate": 5.793520374863504e-07, "loss": 1.5249, "step": 1312 }, { "epoch": 0.2885714285714286, "grad_norm": 0.2630909033801328, "learning_rate": 5.791808291162331e-07, "loss": 1.5838, "step": 1313 }, { "epoch": 0.2887912087912088, "grad_norm": 0.24143116292557143, "learning_rate": 5.790095281733399e-07, "loss": 1.4717, "step": 1314 }, { "epoch": 0.289010989010989, "grad_norm": 0.25895879947502787, "learning_rate": 5.788381347393361e-07, "loss": 1.5802, "step": 1315 }, { "epoch": 0.28923076923076924, "grad_norm": 0.24263969650420297, "learning_rate": 5.78666648895931e-07, "loss": 1.5295, "step": 1316 }, { "epoch": 0.28945054945054943, "grad_norm": 0.2418927429342586, "learning_rate": 5.784950707248779e-07, "loss": 1.4659, "step": 1317 }, { "epoch": 0.2896703296703297, "grad_norm": 0.3974036191577161, "learning_rate": 5.783234003079739e-07, "loss": 1.4877, "step": 1318 }, { "epoch": 0.2898901098901099, "grad_norm": 0.24524436489741314, "learning_rate": 5.781516377270604e-07, "loss": 1.4382, "step": 1319 }, { "epoch": 0.29010989010989013, "grad_norm": 0.2851478272657233, "learning_rate": 5.779797830640227e-07, "loss": 1.5369, "step": 1320 }, { "epoch": 0.29032967032967033, "grad_norm": 0.2503039752750346, "learning_rate": 5.778078364007899e-07, "loss": 1.4435, "step": 1321 }, { "epoch": 0.2905494505494505, "grad_norm": 0.24884356273385935, "learning_rate": 5.776357978193347e-07, "loss": 1.4543, "step": 1322 }, { "epoch": 0.2907692307692308, "grad_norm": 0.25888769163782055, "learning_rate": 5.774636674016741e-07, "loss": 1.4053, "step": 1323 }, { "epoch": 0.290989010989011, "grad_norm": 0.2462852035495459, "learning_rate": 5.772914452298688e-07, "loss": 1.4834, "step": 1324 }, { "epoch": 0.29120879120879123, "grad_norm": 0.25411395270956194, "learning_rate": 5.771191313860229e-07, "loss": 1.5166, "step": 1325 }, { "epoch": 0.2914285714285714, "grad_norm": 0.2513998510012807, "learning_rate": 5.769467259522846e-07, "loss": 1.4944, "step": 1326 }, { "epoch": 0.2916483516483516, "grad_norm": 0.43988168100265335, "learning_rate": 5.767742290108453e-07, "loss": 1.4793, "step": 1327 }, { "epoch": 0.2918681318681319, "grad_norm": 0.2534106539853273, "learning_rate": 5.766016406439406e-07, "loss": 1.4667, "step": 1328 }, { "epoch": 0.29208791208791207, "grad_norm": 0.2394330470229594, "learning_rate": 5.764289609338495e-07, "loss": 1.4811, "step": 1329 }, { "epoch": 0.2923076923076923, "grad_norm": 0.2585764308429949, "learning_rate": 5.76256189962894e-07, "loss": 1.4908, "step": 1330 }, { "epoch": 0.2925274725274725, "grad_norm": 0.2416542791979125, "learning_rate": 5.760833278134404e-07, "loss": 1.5639, "step": 1331 }, { "epoch": 0.2927472527472528, "grad_norm": 0.24692480043414214, "learning_rate": 5.759103745678979e-07, "loss": 1.4709, "step": 1332 }, { "epoch": 0.29296703296703297, "grad_norm": 0.3458281045720652, "learning_rate": 5.757373303087195e-07, "loss": 1.4075, "step": 1333 }, { "epoch": 0.29318681318681317, "grad_norm": 0.25017970446322996, "learning_rate": 5.755641951184015e-07, "loss": 1.4471, "step": 1334 }, { "epoch": 0.2934065934065934, "grad_norm": 0.2573881075194735, "learning_rate": 5.753909690794832e-07, "loss": 1.4943, "step": 1335 }, { "epoch": 0.2936263736263736, "grad_norm": 0.24695784811193833, "learning_rate": 5.752176522745478e-07, "loss": 1.4421, "step": 1336 }, { "epoch": 0.29384615384615387, "grad_norm": 0.24586543625747756, "learning_rate": 5.750442447862213e-07, "loss": 1.4503, "step": 1337 }, { "epoch": 0.29406593406593406, "grad_norm": 0.24646862660755073, "learning_rate": 5.748707466971731e-07, "loss": 1.4443, "step": 1338 }, { "epoch": 0.29428571428571426, "grad_norm": 0.26282677786531006, "learning_rate": 5.746971580901159e-07, "loss": 1.4756, "step": 1339 }, { "epoch": 0.2945054945054945, "grad_norm": 0.261353131292857, "learning_rate": 5.745234790478053e-07, "loss": 1.4198, "step": 1340 }, { "epoch": 0.2947252747252747, "grad_norm": 0.24357210900308515, "learning_rate": 5.743497096530402e-07, "loss": 1.5065, "step": 1341 }, { "epoch": 0.29494505494505496, "grad_norm": 0.2517316952398751, "learning_rate": 5.741758499886627e-07, "loss": 1.5051, "step": 1342 }, { "epoch": 0.29516483516483516, "grad_norm": 0.2510808672786306, "learning_rate": 5.740019001375576e-07, "loss": 1.4981, "step": 1343 }, { "epoch": 0.2953846153846154, "grad_norm": 0.25127079169990896, "learning_rate": 5.738278601826528e-07, "loss": 1.5269, "step": 1344 }, { "epoch": 0.2956043956043956, "grad_norm": 0.2584965663841474, "learning_rate": 5.736537302069193e-07, "loss": 1.5491, "step": 1345 }, { "epoch": 0.2958241758241758, "grad_norm": 0.259841902528515, "learning_rate": 5.734795102933711e-07, "loss": 1.4774, "step": 1346 }, { "epoch": 0.29604395604395606, "grad_norm": 0.26010438725480944, "learning_rate": 5.733052005250646e-07, "loss": 1.4402, "step": 1347 }, { "epoch": 0.29626373626373625, "grad_norm": 0.2504879713131675, "learning_rate": 5.731308009850997e-07, "loss": 1.4833, "step": 1348 }, { "epoch": 0.2964835164835165, "grad_norm": 0.2509677515197842, "learning_rate": 5.729563117566183e-07, "loss": 1.5006, "step": 1349 }, { "epoch": 0.2967032967032967, "grad_norm": 0.24838804541143591, "learning_rate": 5.727817329228059e-07, "loss": 1.464, "step": 1350 }, { "epoch": 0.2969230769230769, "grad_norm": 0.24959313046978843, "learning_rate": 5.726070645668903e-07, "loss": 1.4541, "step": 1351 }, { "epoch": 0.29714285714285715, "grad_norm": 0.26537308150910344, "learning_rate": 5.724323067721417e-07, "loss": 1.4479, "step": 1352 }, { "epoch": 0.29736263736263735, "grad_norm": 0.2509782256447158, "learning_rate": 5.722574596218733e-07, "loss": 1.4574, "step": 1353 }, { "epoch": 0.2975824175824176, "grad_norm": 0.2659576892006521, "learning_rate": 5.720825231994411e-07, "loss": 1.4676, "step": 1354 }, { "epoch": 0.2978021978021978, "grad_norm": 0.25523298060847843, "learning_rate": 5.719074975882435e-07, "loss": 1.439, "step": 1355 }, { "epoch": 0.298021978021978, "grad_norm": 0.2775273273573927, "learning_rate": 5.717323828717207e-07, "loss": 1.4567, "step": 1356 }, { "epoch": 0.29824175824175825, "grad_norm": 0.25879672991156294, "learning_rate": 5.715571791333565e-07, "loss": 1.5381, "step": 1357 }, { "epoch": 0.29846153846153844, "grad_norm": 0.24054171393461082, "learning_rate": 5.713818864566764e-07, "loss": 1.4311, "step": 1358 }, { "epoch": 0.2986813186813187, "grad_norm": 0.25140462321818235, "learning_rate": 5.712065049252486e-07, "loss": 1.5125, "step": 1359 }, { "epoch": 0.2989010989010989, "grad_norm": 0.2626765262508115, "learning_rate": 5.710310346226838e-07, "loss": 1.4802, "step": 1360 }, { "epoch": 0.29912087912087915, "grad_norm": 0.25202646435155157, "learning_rate": 5.708554756326345e-07, "loss": 1.4718, "step": 1361 }, { "epoch": 0.29934065934065934, "grad_norm": 0.24388182954928223, "learning_rate": 5.70679828038796e-07, "loss": 1.4558, "step": 1362 }, { "epoch": 0.29956043956043954, "grad_norm": 0.2414756659678707, "learning_rate": 5.705040919249056e-07, "loss": 1.4651, "step": 1363 }, { "epoch": 0.2997802197802198, "grad_norm": 0.24735696929962378, "learning_rate": 5.703282673747426e-07, "loss": 1.5122, "step": 1364 }, { "epoch": 0.3, "grad_norm": 0.2792398861015185, "learning_rate": 5.70152354472129e-07, "loss": 1.4243, "step": 1365 }, { "epoch": 0.30021978021978024, "grad_norm": 0.2581459142568791, "learning_rate": 5.699763533009285e-07, "loss": 1.4987, "step": 1366 }, { "epoch": 0.30043956043956044, "grad_norm": 0.3093871599270535, "learning_rate": 5.698002639450471e-07, "loss": 1.4171, "step": 1367 }, { "epoch": 0.30065934065934063, "grad_norm": 0.2572564961720901, "learning_rate": 5.696240864884327e-07, "loss": 1.4722, "step": 1368 }, { "epoch": 0.3008791208791209, "grad_norm": 0.262863970873206, "learning_rate": 5.69447821015075e-07, "loss": 1.5212, "step": 1369 }, { "epoch": 0.3010989010989011, "grad_norm": 0.26222057362657936, "learning_rate": 5.692714676090062e-07, "loss": 1.5192, "step": 1370 }, { "epoch": 0.30131868131868134, "grad_norm": 0.3071417614804383, "learning_rate": 5.690950263543e-07, "loss": 1.5249, "step": 1371 }, { "epoch": 0.30153846153846153, "grad_norm": 0.25010627254354717, "learning_rate": 5.689184973350722e-07, "loss": 1.4781, "step": 1372 }, { "epoch": 0.3017582417582418, "grad_norm": 0.2553041392298195, "learning_rate": 5.6874188063548e-07, "loss": 1.4998, "step": 1373 }, { "epoch": 0.301978021978022, "grad_norm": 0.2639407112810674, "learning_rate": 5.685651763397232e-07, "loss": 1.4387, "step": 1374 }, { "epoch": 0.3021978021978022, "grad_norm": 0.249452213087093, "learning_rate": 5.683883845320428e-07, "loss": 1.4466, "step": 1375 }, { "epoch": 0.30241758241758243, "grad_norm": 0.2545854400749403, "learning_rate": 5.682115052967213e-07, "loss": 1.5069, "step": 1376 }, { "epoch": 0.30263736263736263, "grad_norm": 0.2721401982511904, "learning_rate": 5.680345387180835e-07, "loss": 1.4467, "step": 1377 }, { "epoch": 0.3028571428571429, "grad_norm": 0.26793985557990907, "learning_rate": 5.678574848804957e-07, "loss": 1.4538, "step": 1378 }, { "epoch": 0.3030769230769231, "grad_norm": 0.2822194318720025, "learning_rate": 5.676803438683652e-07, "loss": 1.5214, "step": 1379 }, { "epoch": 0.3032967032967033, "grad_norm": 0.23572713357279437, "learning_rate": 5.675031157661414e-07, "loss": 1.4715, "step": 1380 }, { "epoch": 0.3035164835164835, "grad_norm": 0.26595924002700233, "learning_rate": 5.673258006583154e-07, "loss": 1.4978, "step": 1381 }, { "epoch": 0.3037362637362637, "grad_norm": 0.23755624844769707, "learning_rate": 5.671483986294193e-07, "loss": 1.4383, "step": 1382 }, { "epoch": 0.303956043956044, "grad_norm": 0.30137774105205695, "learning_rate": 5.669709097640269e-07, "loss": 1.5048, "step": 1383 }, { "epoch": 0.30417582417582417, "grad_norm": 0.3534728549437255, "learning_rate": 5.667933341467533e-07, "loss": 1.4428, "step": 1384 }, { "epoch": 0.30439560439560437, "grad_norm": 0.2560030593834548, "learning_rate": 5.66615671862255e-07, "loss": 1.4715, "step": 1385 }, { "epoch": 0.3046153846153846, "grad_norm": 0.2506556454171206, "learning_rate": 5.664379229952297e-07, "loss": 1.4632, "step": 1386 }, { "epoch": 0.3048351648351648, "grad_norm": 0.2472721096073287, "learning_rate": 5.662600876304167e-07, "loss": 1.4685, "step": 1387 }, { "epoch": 0.30505494505494507, "grad_norm": 0.42497518640623017, "learning_rate": 5.66082165852596e-07, "loss": 1.4645, "step": 1388 }, { "epoch": 0.30527472527472527, "grad_norm": 0.2449598067042146, "learning_rate": 5.659041577465897e-07, "loss": 1.4671, "step": 1389 }, { "epoch": 0.3054945054945055, "grad_norm": 0.2628152734817873, "learning_rate": 5.657260633972598e-07, "loss": 1.4671, "step": 1390 }, { "epoch": 0.3057142857142857, "grad_norm": 0.2502257772986754, "learning_rate": 5.655478828895106e-07, "loss": 1.4535, "step": 1391 }, { "epoch": 0.3059340659340659, "grad_norm": 0.25471252931559857, "learning_rate": 5.653696163082866e-07, "loss": 1.4988, "step": 1392 }, { "epoch": 0.30615384615384617, "grad_norm": 0.24385797600271722, "learning_rate": 5.651912637385738e-07, "loss": 1.4096, "step": 1393 }, { "epoch": 0.30637362637362636, "grad_norm": 0.25676546116826904, "learning_rate": 5.650128252653993e-07, "loss": 1.4715, "step": 1394 }, { "epoch": 0.3065934065934066, "grad_norm": 0.24784394586038405, "learning_rate": 5.648343009738307e-07, "loss": 1.48, "step": 1395 }, { "epoch": 0.3068131868131868, "grad_norm": 0.2457472609882082, "learning_rate": 5.646556909489769e-07, "loss": 1.5201, "step": 1396 }, { "epoch": 0.307032967032967, "grad_norm": 0.2458655155769294, "learning_rate": 5.644769952759875e-07, "loss": 1.4737, "step": 1397 }, { "epoch": 0.30725274725274726, "grad_norm": 0.25223849576167445, "learning_rate": 5.642982140400529e-07, "loss": 1.5003, "step": 1398 }, { "epoch": 0.30747252747252746, "grad_norm": 0.2552847391712624, "learning_rate": 5.641193473264045e-07, "loss": 1.4565, "step": 1399 }, { "epoch": 0.3076923076923077, "grad_norm": 0.2459471016037614, "learning_rate": 5.63940395220314e-07, "loss": 1.4169, "step": 1400 }, { "epoch": 0.3079120879120879, "grad_norm": 0.2483389831046875, "learning_rate": 5.637613578070945e-07, "loss": 1.4946, "step": 1401 }, { "epoch": 0.30813186813186816, "grad_norm": 0.2633400739347006, "learning_rate": 5.635822351720991e-07, "loss": 1.4627, "step": 1402 }, { "epoch": 0.30835164835164836, "grad_norm": 0.258757752980863, "learning_rate": 5.634030274007219e-07, "loss": 1.4718, "step": 1403 }, { "epoch": 0.30857142857142855, "grad_norm": 0.5070997448693416, "learning_rate": 5.632237345783975e-07, "loss": 1.4994, "step": 1404 }, { "epoch": 0.3087912087912088, "grad_norm": 0.30790057285886036, "learning_rate": 5.630443567906011e-07, "loss": 1.4706, "step": 1405 }, { "epoch": 0.309010989010989, "grad_norm": 0.2631032173876283, "learning_rate": 5.628648941228482e-07, "loss": 1.4788, "step": 1406 }, { "epoch": 0.30923076923076925, "grad_norm": 0.5276387218909935, "learning_rate": 5.626853466606951e-07, "loss": 1.458, "step": 1407 }, { "epoch": 0.30945054945054945, "grad_norm": 0.24859726884660674, "learning_rate": 5.625057144897381e-07, "loss": 1.4666, "step": 1408 }, { "epoch": 0.30967032967032965, "grad_norm": 0.25281773514583455, "learning_rate": 5.623259976956144e-07, "loss": 1.4884, "step": 1409 }, { "epoch": 0.3098901098901099, "grad_norm": 0.24749271519094207, "learning_rate": 5.621461963640012e-07, "loss": 1.5311, "step": 1410 }, { "epoch": 0.3101098901098901, "grad_norm": 0.25232121505427474, "learning_rate": 5.619663105806159e-07, "loss": 1.4901, "step": 1411 }, { "epoch": 0.31032967032967035, "grad_norm": 0.2434137284776949, "learning_rate": 5.617863404312165e-07, "loss": 1.5244, "step": 1412 }, { "epoch": 0.31054945054945055, "grad_norm": 0.26079053162019655, "learning_rate": 5.616062860016009e-07, "loss": 1.4994, "step": 1413 }, { "epoch": 0.31076923076923074, "grad_norm": 0.2504660744024973, "learning_rate": 5.614261473776074e-07, "loss": 1.4593, "step": 1414 }, { "epoch": 0.310989010989011, "grad_norm": 0.2449194007940803, "learning_rate": 5.612459246451143e-07, "loss": 1.4826, "step": 1415 }, { "epoch": 0.3112087912087912, "grad_norm": 0.24300282511180893, "learning_rate": 5.610656178900403e-07, "loss": 1.4438, "step": 1416 }, { "epoch": 0.31142857142857144, "grad_norm": 0.24648580582833773, "learning_rate": 5.608852271983437e-07, "loss": 1.4502, "step": 1417 }, { "epoch": 0.31164835164835164, "grad_norm": 0.2523865024641673, "learning_rate": 5.607047526560231e-07, "loss": 1.4736, "step": 1418 }, { "epoch": 0.3118681318681319, "grad_norm": 0.25679243551564807, "learning_rate": 5.605241943491169e-07, "loss": 1.4476, "step": 1419 }, { "epoch": 0.3120879120879121, "grad_norm": 0.263785746954739, "learning_rate": 5.603435523637038e-07, "loss": 1.4684, "step": 1420 }, { "epoch": 0.3123076923076923, "grad_norm": 0.24918462298015576, "learning_rate": 5.601628267859018e-07, "loss": 1.4207, "step": 1421 }, { "epoch": 0.31252747252747254, "grad_norm": 0.23788222017014576, "learning_rate": 5.599820177018695e-07, "loss": 1.4753, "step": 1422 }, { "epoch": 0.31274725274725274, "grad_norm": 0.24615322583325078, "learning_rate": 5.598011251978045e-07, "loss": 1.4569, "step": 1423 }, { "epoch": 0.312967032967033, "grad_norm": 0.28121091251243596, "learning_rate": 5.596201493599449e-07, "loss": 1.4872, "step": 1424 }, { "epoch": 0.3131868131868132, "grad_norm": 0.2470932136083059, "learning_rate": 5.594390902745679e-07, "loss": 1.4697, "step": 1425 }, { "epoch": 0.3134065934065934, "grad_norm": 0.25181414935091184, "learning_rate": 5.592579480279909e-07, "loss": 1.4594, "step": 1426 }, { "epoch": 0.31362637362637363, "grad_norm": 0.24000622688772374, "learning_rate": 5.590767227065706e-07, "loss": 1.4319, "step": 1427 }, { "epoch": 0.31384615384615383, "grad_norm": 0.27023095618555476, "learning_rate": 5.588954143967035e-07, "loss": 1.5004, "step": 1428 }, { "epoch": 0.3140659340659341, "grad_norm": 0.25649542568927564, "learning_rate": 5.587140231848255e-07, "loss": 1.4952, "step": 1429 }, { "epoch": 0.3142857142857143, "grad_norm": 0.25434607909533713, "learning_rate": 5.585325491574123e-07, "loss": 1.4152, "step": 1430 }, { "epoch": 0.31450549450549453, "grad_norm": 0.29623888881495286, "learning_rate": 5.583509924009786e-07, "loss": 1.4615, "step": 1431 }, { "epoch": 0.31472527472527473, "grad_norm": 0.24808496218296028, "learning_rate": 5.581693530020793e-07, "loss": 1.4673, "step": 1432 }, { "epoch": 0.3149450549450549, "grad_norm": 0.2431664108867739, "learning_rate": 5.579876310473079e-07, "loss": 1.4419, "step": 1433 }, { "epoch": 0.3151648351648352, "grad_norm": 0.25625680181131755, "learning_rate": 5.578058266232975e-07, "loss": 1.5086, "step": 1434 }, { "epoch": 0.3153846153846154, "grad_norm": 0.25239128493632057, "learning_rate": 5.576239398167211e-07, "loss": 1.3956, "step": 1435 }, { "epoch": 0.3156043956043956, "grad_norm": 0.3177987113062903, "learning_rate": 5.574419707142899e-07, "loss": 1.5333, "step": 1436 }, { "epoch": 0.3158241758241758, "grad_norm": 0.24921101440713225, "learning_rate": 5.572599194027553e-07, "loss": 1.4625, "step": 1437 }, { "epoch": 0.316043956043956, "grad_norm": 0.2527575768482531, "learning_rate": 5.570777859689074e-07, "loss": 1.4016, "step": 1438 }, { "epoch": 0.3162637362637363, "grad_norm": 0.25944300953619764, "learning_rate": 5.568955704995754e-07, "loss": 1.5011, "step": 1439 }, { "epoch": 0.31648351648351647, "grad_norm": 0.3000210968151791, "learning_rate": 5.567132730816281e-07, "loss": 1.43, "step": 1440 }, { "epoch": 0.3167032967032967, "grad_norm": 0.2436095655406767, "learning_rate": 5.565308938019729e-07, "loss": 1.4973, "step": 1441 }, { "epoch": 0.3169230769230769, "grad_norm": 0.2513959320546392, "learning_rate": 5.563484327475562e-07, "loss": 1.4844, "step": 1442 }, { "epoch": 0.3171428571428571, "grad_norm": 0.24366656363672098, "learning_rate": 5.561658900053638e-07, "loss": 1.5005, "step": 1443 }, { "epoch": 0.31736263736263737, "grad_norm": 0.24275773764369232, "learning_rate": 5.5598326566242e-07, "loss": 1.5112, "step": 1444 }, { "epoch": 0.31758241758241756, "grad_norm": 0.27869479147088283, "learning_rate": 5.558005598057883e-07, "loss": 1.4517, "step": 1445 }, { "epoch": 0.3178021978021978, "grad_norm": 0.44836788997017724, "learning_rate": 5.55617772522571e-07, "loss": 1.4828, "step": 1446 }, { "epoch": 0.318021978021978, "grad_norm": 0.2524172023696438, "learning_rate": 5.55434903899909e-07, "loss": 1.4908, "step": 1447 }, { "epoch": 0.31824175824175827, "grad_norm": 0.25560366190871997, "learning_rate": 5.552519540249823e-07, "loss": 1.4537, "step": 1448 }, { "epoch": 0.31846153846153846, "grad_norm": 0.24590257703234258, "learning_rate": 5.550689229850094e-07, "loss": 1.4444, "step": 1449 }, { "epoch": 0.31868131868131866, "grad_norm": 0.23869349912438817, "learning_rate": 5.548858108672478e-07, "loss": 1.3799, "step": 1450 }, { "epoch": 0.3189010989010989, "grad_norm": 0.25425221516964575, "learning_rate": 5.54702617758993e-07, "loss": 1.4499, "step": 1451 }, { "epoch": 0.3191208791208791, "grad_norm": 0.24652127780233524, "learning_rate": 5.5451934374758e-07, "loss": 1.4834, "step": 1452 }, { "epoch": 0.31934065934065936, "grad_norm": 0.25748748540312894, "learning_rate": 5.543359889203816e-07, "loss": 1.5082, "step": 1453 }, { "epoch": 0.31956043956043956, "grad_norm": 0.24735774192630483, "learning_rate": 5.541525533648098e-07, "loss": 1.4448, "step": 1454 }, { "epoch": 0.31978021978021975, "grad_norm": 0.25548677018599053, "learning_rate": 5.539690371683143e-07, "loss": 1.5432, "step": 1455 }, { "epoch": 0.32, "grad_norm": 0.24170529129716967, "learning_rate": 5.537854404183839e-07, "loss": 1.4369, "step": 1456 }, { "epoch": 0.3202197802197802, "grad_norm": 0.2499469498321262, "learning_rate": 5.536017632025456e-07, "loss": 1.4728, "step": 1457 }, { "epoch": 0.32043956043956046, "grad_norm": 0.25214039364600493, "learning_rate": 5.534180056083646e-07, "loss": 1.4431, "step": 1458 }, { "epoch": 0.32065934065934065, "grad_norm": 0.25028272901560455, "learning_rate": 5.532341677234446e-07, "loss": 1.4518, "step": 1459 }, { "epoch": 0.3208791208791209, "grad_norm": 0.2563486330300104, "learning_rate": 5.530502496354277e-07, "loss": 1.4964, "step": 1460 }, { "epoch": 0.3210989010989011, "grad_norm": 0.24113311014075664, "learning_rate": 5.528662514319939e-07, "loss": 1.4403, "step": 1461 }, { "epoch": 0.3213186813186813, "grad_norm": 0.24337623711648143, "learning_rate": 5.526821732008617e-07, "loss": 1.3784, "step": 1462 }, { "epoch": 0.32153846153846155, "grad_norm": 0.25167443291990105, "learning_rate": 5.524980150297874e-07, "loss": 1.5231, "step": 1463 }, { "epoch": 0.32175824175824175, "grad_norm": 0.24809838716659088, "learning_rate": 5.523137770065658e-07, "loss": 1.4718, "step": 1464 }, { "epoch": 0.321978021978022, "grad_norm": 0.2511115922667999, "learning_rate": 5.521294592190295e-07, "loss": 1.4306, "step": 1465 }, { "epoch": 0.3221978021978022, "grad_norm": 0.23752395664987544, "learning_rate": 5.519450617550492e-07, "loss": 1.3631, "step": 1466 }, { "epoch": 0.3224175824175824, "grad_norm": 0.2384592070302975, "learning_rate": 5.517605847025338e-07, "loss": 1.4627, "step": 1467 }, { "epoch": 0.32263736263736265, "grad_norm": 0.2592006138141254, "learning_rate": 5.515760281494296e-07, "loss": 1.4825, "step": 1468 }, { "epoch": 0.32285714285714284, "grad_norm": 0.25791435149494757, "learning_rate": 5.513913921837216e-07, "loss": 1.5405, "step": 1469 }, { "epoch": 0.3230769230769231, "grad_norm": 0.25262378690212317, "learning_rate": 5.512066768934318e-07, "loss": 1.4445, "step": 1470 }, { "epoch": 0.3232967032967033, "grad_norm": 0.2618464186806256, "learning_rate": 5.510218823666206e-07, "loss": 1.4505, "step": 1471 }, { "epoch": 0.32351648351648354, "grad_norm": 0.2516345721409484, "learning_rate": 5.50837008691386e-07, "loss": 1.4902, "step": 1472 }, { "epoch": 0.32373626373626374, "grad_norm": 0.2462564886830425, "learning_rate": 5.506520559558638e-07, "loss": 1.4555, "step": 1473 }, { "epoch": 0.32395604395604394, "grad_norm": 0.24897616895697583, "learning_rate": 5.504670242482272e-07, "loss": 1.4917, "step": 1474 }, { "epoch": 0.3241758241758242, "grad_norm": 0.26007246117108856, "learning_rate": 5.502819136566875e-07, "loss": 1.499, "step": 1475 }, { "epoch": 0.3243956043956044, "grad_norm": 0.24427868150389598, "learning_rate": 5.500967242694933e-07, "loss": 1.4759, "step": 1476 }, { "epoch": 0.32461538461538464, "grad_norm": 0.26643060912685457, "learning_rate": 5.499114561749307e-07, "loss": 1.5278, "step": 1477 }, { "epoch": 0.32483516483516484, "grad_norm": 0.25911203533432375, "learning_rate": 5.497261094613236e-07, "loss": 1.4835, "step": 1478 }, { "epoch": 0.32505494505494503, "grad_norm": 0.24657749336443374, "learning_rate": 5.495406842170334e-07, "loss": 1.4352, "step": 1479 }, { "epoch": 0.3252747252747253, "grad_norm": 0.29035923784862794, "learning_rate": 5.493551805304585e-07, "loss": 1.5027, "step": 1480 }, { "epoch": 0.3254945054945055, "grad_norm": 0.24385608355519042, "learning_rate": 5.491695984900351e-07, "loss": 1.4728, "step": 1481 }, { "epoch": 0.32571428571428573, "grad_norm": 0.280604665862483, "learning_rate": 5.489839381842367e-07, "loss": 1.4319, "step": 1482 }, { "epoch": 0.32593406593406593, "grad_norm": 0.24708258531599805, "learning_rate": 5.487981997015739e-07, "loss": 1.4393, "step": 1483 }, { "epoch": 0.3261538461538461, "grad_norm": 0.26137202452625863, "learning_rate": 5.486123831305949e-07, "loss": 1.4754, "step": 1484 }, { "epoch": 0.3263736263736264, "grad_norm": 0.2470221269140314, "learning_rate": 5.484264885598847e-07, "loss": 1.4455, "step": 1485 }, { "epoch": 0.3265934065934066, "grad_norm": 0.24638699029361294, "learning_rate": 5.482405160780657e-07, "loss": 1.4493, "step": 1486 }, { "epoch": 0.32681318681318683, "grad_norm": 0.2522195644263799, "learning_rate": 5.480544657737979e-07, "loss": 1.4263, "step": 1487 }, { "epoch": 0.327032967032967, "grad_norm": 0.24994236548654739, "learning_rate": 5.478683377357777e-07, "loss": 1.4574, "step": 1488 }, { "epoch": 0.3272527472527473, "grad_norm": 0.2542060320773594, "learning_rate": 5.476821320527388e-07, "loss": 1.5226, "step": 1489 }, { "epoch": 0.3274725274725275, "grad_norm": 0.2912830493896432, "learning_rate": 5.474958488134518e-07, "loss": 1.4136, "step": 1490 }, { "epoch": 0.32769230769230767, "grad_norm": 0.2586908381805732, "learning_rate": 5.473094881067247e-07, "loss": 1.446, "step": 1491 }, { "epoch": 0.3279120879120879, "grad_norm": 0.36209478940224404, "learning_rate": 5.471230500214021e-07, "loss": 1.5041, "step": 1492 }, { "epoch": 0.3281318681318681, "grad_norm": 0.25839569965035974, "learning_rate": 5.469365346463655e-07, "loss": 1.5342, "step": 1493 }, { "epoch": 0.3283516483516484, "grad_norm": 0.24023299334045561, "learning_rate": 5.467499420705332e-07, "loss": 1.466, "step": 1494 }, { "epoch": 0.32857142857142857, "grad_norm": 0.3501365700384398, "learning_rate": 5.465632723828604e-07, "loss": 1.4431, "step": 1495 }, { "epoch": 0.32879120879120877, "grad_norm": 0.24418008706073047, "learning_rate": 5.463765256723392e-07, "loss": 1.523, "step": 1496 }, { "epoch": 0.329010989010989, "grad_norm": 0.2559257648559177, "learning_rate": 5.461897020279981e-07, "loss": 1.4329, "step": 1497 }, { "epoch": 0.3292307692307692, "grad_norm": 0.2500029574376721, "learning_rate": 5.460028015389027e-07, "loss": 1.4854, "step": 1498 }, { "epoch": 0.32945054945054947, "grad_norm": 0.30864129210705915, "learning_rate": 5.458158242941547e-07, "loss": 1.5429, "step": 1499 }, { "epoch": 0.32967032967032966, "grad_norm": 0.24016461750834422, "learning_rate": 5.456287703828928e-07, "loss": 1.5318, "step": 1500 }, { "epoch": 0.3298901098901099, "grad_norm": 0.2538073310748625, "learning_rate": 5.454416398942922e-07, "loss": 1.4958, "step": 1501 }, { "epoch": 0.3301098901098901, "grad_norm": 0.24784466552667925, "learning_rate": 5.452544329175643e-07, "loss": 1.4459, "step": 1502 }, { "epoch": 0.3303296703296703, "grad_norm": 0.24349725851625248, "learning_rate": 5.450671495419574e-07, "loss": 1.5067, "step": 1503 }, { "epoch": 0.33054945054945056, "grad_norm": 0.24921253640470473, "learning_rate": 5.44879789856756e-07, "loss": 1.4844, "step": 1504 }, { "epoch": 0.33076923076923076, "grad_norm": 0.2565216850423278, "learning_rate": 5.446923539512809e-07, "loss": 1.4879, "step": 1505 }, { "epoch": 0.330989010989011, "grad_norm": 0.2538660241757986, "learning_rate": 5.445048419148894e-07, "loss": 1.4429, "step": 1506 }, { "epoch": 0.3312087912087912, "grad_norm": 0.2501390660018792, "learning_rate": 5.443172538369752e-07, "loss": 1.4767, "step": 1507 }, { "epoch": 0.3314285714285714, "grad_norm": 0.27314855713848507, "learning_rate": 5.441295898069678e-07, "loss": 1.4765, "step": 1508 }, { "epoch": 0.33164835164835166, "grad_norm": 0.24685113051896446, "learning_rate": 5.439418499143333e-07, "loss": 1.4548, "step": 1509 }, { "epoch": 0.33186813186813185, "grad_norm": 0.24626827370484014, "learning_rate": 5.437540342485739e-07, "loss": 1.4816, "step": 1510 }, { "epoch": 0.3320879120879121, "grad_norm": 0.24065483927609677, "learning_rate": 5.435661428992277e-07, "loss": 1.4732, "step": 1511 }, { "epoch": 0.3323076923076923, "grad_norm": 0.25897101893577673, "learning_rate": 5.433781759558694e-07, "loss": 1.463, "step": 1512 }, { "epoch": 0.3325274725274725, "grad_norm": 0.2535357142888248, "learning_rate": 5.431901335081091e-07, "loss": 1.4513, "step": 1513 }, { "epoch": 0.33274725274725275, "grad_norm": 0.23725764627316678, "learning_rate": 5.430020156455932e-07, "loss": 1.4642, "step": 1514 }, { "epoch": 0.33296703296703295, "grad_norm": 0.2528769432262968, "learning_rate": 5.428138224580043e-07, "loss": 1.4734, "step": 1515 }, { "epoch": 0.3331868131868132, "grad_norm": 0.243999952788361, "learning_rate": 5.426255540350604e-07, "loss": 1.4582, "step": 1516 }, { "epoch": 0.3334065934065934, "grad_norm": 0.2492316620333035, "learning_rate": 5.424372104665158e-07, "loss": 1.4326, "step": 1517 }, { "epoch": 0.33362637362637365, "grad_norm": 0.2353644179244723, "learning_rate": 5.422487918421602e-07, "loss": 1.4451, "step": 1518 }, { "epoch": 0.33384615384615385, "grad_norm": 0.2444805165573388, "learning_rate": 5.420602982518198e-07, "loss": 1.4774, "step": 1519 }, { "epoch": 0.33406593406593404, "grad_norm": 0.2534291025257919, "learning_rate": 5.418717297853555e-07, "loss": 1.4196, "step": 1520 }, { "epoch": 0.3342857142857143, "grad_norm": 0.2509051106833752, "learning_rate": 5.416830865326649e-07, "loss": 1.457, "step": 1521 }, { "epoch": 0.3345054945054945, "grad_norm": 0.24734925028850036, "learning_rate": 5.414943685836806e-07, "loss": 1.4824, "step": 1522 }, { "epoch": 0.33472527472527475, "grad_norm": 0.27155469459810494, "learning_rate": 5.413055760283712e-07, "loss": 1.4774, "step": 1523 }, { "epoch": 0.33494505494505494, "grad_norm": 0.24800887928970217, "learning_rate": 5.411167089567405e-07, "loss": 1.4984, "step": 1524 }, { "epoch": 0.33516483516483514, "grad_norm": 0.25307644663373857, "learning_rate": 5.409277674588281e-07, "loss": 1.433, "step": 1525 }, { "epoch": 0.3353846153846154, "grad_norm": 0.24470622096744218, "learning_rate": 5.407387516247091e-07, "loss": 1.4192, "step": 1526 }, { "epoch": 0.3356043956043956, "grad_norm": 0.25432007620145153, "learning_rate": 5.405496615444938e-07, "loss": 1.4579, "step": 1527 }, { "epoch": 0.33582417582417584, "grad_norm": 0.2554028206333398, "learning_rate": 5.40360497308328e-07, "loss": 1.4741, "step": 1528 }, { "epoch": 0.33604395604395604, "grad_norm": 0.2547609382189186, "learning_rate": 5.401712590063932e-07, "loss": 1.4119, "step": 1529 }, { "epoch": 0.3362637362637363, "grad_norm": 0.2532464316349404, "learning_rate": 5.399819467289056e-07, "loss": 1.4755, "step": 1530 }, { "epoch": 0.3364835164835165, "grad_norm": 0.24845126535571804, "learning_rate": 5.397925605661171e-07, "loss": 1.4979, "step": 1531 }, { "epoch": 0.3367032967032967, "grad_norm": 0.25299051344855855, "learning_rate": 5.396031006083145e-07, "loss": 1.4916, "step": 1532 }, { "epoch": 0.33692307692307694, "grad_norm": 0.24362075788128457, "learning_rate": 5.394135669458202e-07, "loss": 1.4479, "step": 1533 }, { "epoch": 0.33714285714285713, "grad_norm": 0.2456546265907831, "learning_rate": 5.392239596689915e-07, "loss": 1.4508, "step": 1534 }, { "epoch": 0.3373626373626374, "grad_norm": 0.25775963168034405, "learning_rate": 5.390342788682207e-07, "loss": 1.4284, "step": 1535 }, { "epoch": 0.3375824175824176, "grad_norm": 0.26861833752807923, "learning_rate": 5.388445246339352e-07, "loss": 1.5127, "step": 1536 }, { "epoch": 0.3378021978021978, "grad_norm": 0.25490000077366065, "learning_rate": 5.386546970565977e-07, "loss": 1.486, "step": 1537 }, { "epoch": 0.33802197802197803, "grad_norm": 0.23792265683948896, "learning_rate": 5.384647962267054e-07, "loss": 1.4162, "step": 1538 }, { "epoch": 0.33824175824175823, "grad_norm": 0.29004751691553654, "learning_rate": 5.382748222347907e-07, "loss": 1.5103, "step": 1539 }, { "epoch": 0.3384615384615385, "grad_norm": 0.2754869834655155, "learning_rate": 5.380847751714209e-07, "loss": 1.4724, "step": 1540 }, { "epoch": 0.3386813186813187, "grad_norm": 0.23558145026194702, "learning_rate": 5.37894655127198e-07, "loss": 1.4317, "step": 1541 }, { "epoch": 0.3389010989010989, "grad_norm": 0.27225269837790794, "learning_rate": 5.37704462192759e-07, "loss": 1.4369, "step": 1542 }, { "epoch": 0.3391208791208791, "grad_norm": 0.2554558886374575, "learning_rate": 5.375141964587753e-07, "loss": 1.4098, "step": 1543 }, { "epoch": 0.3393406593406593, "grad_norm": 0.23347401876322993, "learning_rate": 5.373238580159532e-07, "loss": 1.4225, "step": 1544 }, { "epoch": 0.3395604395604396, "grad_norm": 0.26102492919944953, "learning_rate": 5.371334469550339e-07, "loss": 1.4749, "step": 1545 }, { "epoch": 0.33978021978021977, "grad_norm": 0.3129338614884883, "learning_rate": 5.369429633667929e-07, "loss": 1.4221, "step": 1546 }, { "epoch": 0.34, "grad_norm": 0.2675264151861766, "learning_rate": 5.367524073420402e-07, "loss": 1.452, "step": 1547 }, { "epoch": 0.3402197802197802, "grad_norm": 0.3121086486667206, "learning_rate": 5.365617789716208e-07, "loss": 1.4714, "step": 1548 }, { "epoch": 0.3404395604395604, "grad_norm": 0.2529556901038112, "learning_rate": 5.363710783464138e-07, "loss": 1.3853, "step": 1549 }, { "epoch": 0.34065934065934067, "grad_norm": 0.24490213684593154, "learning_rate": 5.361803055573327e-07, "loss": 1.4663, "step": 1550 }, { "epoch": 0.34087912087912087, "grad_norm": 0.25835859916848847, "learning_rate": 5.359894606953256e-07, "loss": 1.4538, "step": 1551 }, { "epoch": 0.3410989010989011, "grad_norm": 0.2542381749964743, "learning_rate": 5.35798543851375e-07, "loss": 1.4712, "step": 1552 }, { "epoch": 0.3413186813186813, "grad_norm": 0.2547685273733034, "learning_rate": 5.356075551164975e-07, "loss": 1.5018, "step": 1553 }, { "epoch": 0.3415384615384615, "grad_norm": 0.25012508464326705, "learning_rate": 5.354164945817442e-07, "loss": 1.4672, "step": 1554 }, { "epoch": 0.34175824175824177, "grad_norm": 0.23845597850221073, "learning_rate": 5.352253623382002e-07, "loss": 1.5004, "step": 1555 }, { "epoch": 0.34197802197802196, "grad_norm": 0.2830684758224161, "learning_rate": 5.350341584769852e-07, "loss": 1.4355, "step": 1556 }, { "epoch": 0.3421978021978022, "grad_norm": 0.26053788268240063, "learning_rate": 5.348428830892524e-07, "loss": 1.5193, "step": 1557 }, { "epoch": 0.3424175824175824, "grad_norm": 0.2619656313595603, "learning_rate": 5.346515362661894e-07, "loss": 1.4591, "step": 1558 }, { "epoch": 0.34263736263736266, "grad_norm": 0.24384775270031842, "learning_rate": 5.344601180990184e-07, "loss": 1.473, "step": 1559 }, { "epoch": 0.34285714285714286, "grad_norm": 0.24529546851627845, "learning_rate": 5.342686286789946e-07, "loss": 1.4294, "step": 1560 }, { "epoch": 0.34307692307692306, "grad_norm": 0.255705642784143, "learning_rate": 5.34077068097408e-07, "loss": 1.5235, "step": 1561 }, { "epoch": 0.3432967032967033, "grad_norm": 0.2402994273303898, "learning_rate": 5.338854364455819e-07, "loss": 1.436, "step": 1562 }, { "epoch": 0.3435164835164835, "grad_norm": 0.2628094041683089, "learning_rate": 5.336937338148743e-07, "loss": 1.5406, "step": 1563 }, { "epoch": 0.34373626373626376, "grad_norm": 0.25726952374456596, "learning_rate": 5.33501960296676e-07, "loss": 1.4792, "step": 1564 }, { "epoch": 0.34395604395604396, "grad_norm": 0.2489143983098038, "learning_rate": 5.333101159824125e-07, "loss": 1.4696, "step": 1565 }, { "epoch": 0.34417582417582415, "grad_norm": 0.25393076226792394, "learning_rate": 5.331182009635425e-07, "loss": 1.4839, "step": 1566 }, { "epoch": 0.3443956043956044, "grad_norm": 0.24833238790122336, "learning_rate": 5.329262153315586e-07, "loss": 1.4813, "step": 1567 }, { "epoch": 0.3446153846153846, "grad_norm": 0.24877647344306816, "learning_rate": 5.327341591779871e-07, "loss": 1.4242, "step": 1568 }, { "epoch": 0.34483516483516485, "grad_norm": 0.24242360833877535, "learning_rate": 5.325420325943879e-07, "loss": 1.3694, "step": 1569 }, { "epoch": 0.34505494505494505, "grad_norm": 0.2511563121796833, "learning_rate": 5.323498356723542e-07, "loss": 1.3864, "step": 1570 }, { "epoch": 0.34527472527472525, "grad_norm": 0.24910280728725698, "learning_rate": 5.321575685035133e-07, "loss": 1.4293, "step": 1571 }, { "epoch": 0.3454945054945055, "grad_norm": 0.24148226911307036, "learning_rate": 5.319652311795255e-07, "loss": 1.47, "step": 1572 }, { "epoch": 0.3457142857142857, "grad_norm": 0.26042792170550655, "learning_rate": 5.317728237920848e-07, "loss": 1.5616, "step": 1573 }, { "epoch": 0.34593406593406595, "grad_norm": 0.25812273513162637, "learning_rate": 5.315803464329184e-07, "loss": 1.4829, "step": 1574 }, { "epoch": 0.34615384615384615, "grad_norm": 0.24473162545303032, "learning_rate": 5.313877991937871e-07, "loss": 1.4404, "step": 1575 }, { "epoch": 0.3463736263736264, "grad_norm": 0.25852660374979847, "learning_rate": 5.311951821664847e-07, "loss": 1.4336, "step": 1576 }, { "epoch": 0.3465934065934066, "grad_norm": 0.24409302924608858, "learning_rate": 5.310024954428387e-07, "loss": 1.476, "step": 1577 }, { "epoch": 0.3468131868131868, "grad_norm": 0.24349521727617168, "learning_rate": 5.308097391147093e-07, "loss": 1.5268, "step": 1578 }, { "epoch": 0.34703296703296704, "grad_norm": 0.3046686062942096, "learning_rate": 5.306169132739905e-07, "loss": 1.4584, "step": 1579 }, { "epoch": 0.34725274725274724, "grad_norm": 0.2547874869155784, "learning_rate": 5.304240180126089e-07, "loss": 1.4195, "step": 1580 }, { "epoch": 0.3474725274725275, "grad_norm": 0.26855475616874347, "learning_rate": 5.302310534225243e-07, "loss": 1.4895, "step": 1581 }, { "epoch": 0.3476923076923077, "grad_norm": 0.24765425237779218, "learning_rate": 5.300380195957299e-07, "loss": 1.4945, "step": 1582 }, { "epoch": 0.3479120879120879, "grad_norm": 0.2679634206275912, "learning_rate": 5.298449166242515e-07, "loss": 1.4574, "step": 1583 }, { "epoch": 0.34813186813186814, "grad_norm": 0.23905809227530828, "learning_rate": 5.29651744600148e-07, "loss": 1.4459, "step": 1584 }, { "epoch": 0.34835164835164834, "grad_norm": 0.2632656267120424, "learning_rate": 5.294585036155113e-07, "loss": 1.3949, "step": 1585 }, { "epoch": 0.3485714285714286, "grad_norm": 0.25489983481984224, "learning_rate": 5.292651937624662e-07, "loss": 1.4168, "step": 1586 }, { "epoch": 0.3487912087912088, "grad_norm": 0.24517758466897013, "learning_rate": 5.290718151331703e-07, "loss": 1.4415, "step": 1587 }, { "epoch": 0.34901098901098904, "grad_norm": 0.25513692791169457, "learning_rate": 5.288783678198137e-07, "loss": 1.4955, "step": 1588 }, { "epoch": 0.34923076923076923, "grad_norm": 0.2567853058561654, "learning_rate": 5.286848519146195e-07, "loss": 1.4635, "step": 1589 }, { "epoch": 0.34945054945054943, "grad_norm": 0.2536613534859628, "learning_rate": 5.284912675098436e-07, "loss": 1.4606, "step": 1590 }, { "epoch": 0.3496703296703297, "grad_norm": 0.24887742764255438, "learning_rate": 5.282976146977745e-07, "loss": 1.4145, "step": 1591 }, { "epoch": 0.3498901098901099, "grad_norm": 0.2652381331710101, "learning_rate": 5.281038935707332e-07, "loss": 1.529, "step": 1592 }, { "epoch": 0.35010989010989013, "grad_norm": 0.27378876042200684, "learning_rate": 5.279101042210732e-07, "loss": 1.5016, "step": 1593 }, { "epoch": 0.35032967032967033, "grad_norm": 0.2624307840502721, "learning_rate": 5.277162467411808e-07, "loss": 1.466, "step": 1594 }, { "epoch": 0.3505494505494505, "grad_norm": 0.2665690066369142, "learning_rate": 5.275223212234747e-07, "loss": 1.5186, "step": 1595 }, { "epoch": 0.3507692307692308, "grad_norm": 0.24693046472316255, "learning_rate": 5.273283277604057e-07, "loss": 1.5041, "step": 1596 }, { "epoch": 0.350989010989011, "grad_norm": 0.26296307979312056, "learning_rate": 5.271342664444574e-07, "loss": 1.4657, "step": 1597 }, { "epoch": 0.3512087912087912, "grad_norm": 0.24454598187445337, "learning_rate": 5.269401373681455e-07, "loss": 1.489, "step": 1598 }, { "epoch": 0.3514285714285714, "grad_norm": 0.24554422049185623, "learning_rate": 5.267459406240181e-07, "loss": 1.4633, "step": 1599 }, { "epoch": 0.3516483516483517, "grad_norm": 0.25108540014016983, "learning_rate": 5.265516763046558e-07, "loss": 1.4525, "step": 1600 }, { "epoch": 0.3518681318681319, "grad_norm": 0.26604553347994064, "learning_rate": 5.26357344502671e-07, "loss": 1.4961, "step": 1601 }, { "epoch": 0.35208791208791207, "grad_norm": 0.25438004120268287, "learning_rate": 5.261629453107084e-07, "loss": 1.5024, "step": 1602 }, { "epoch": 0.3523076923076923, "grad_norm": 0.2485671186455379, "learning_rate": 5.25968478821445e-07, "loss": 1.4366, "step": 1603 }, { "epoch": 0.3525274725274725, "grad_norm": 0.2576362245656226, "learning_rate": 5.257739451275897e-07, "loss": 1.5253, "step": 1604 }, { "epoch": 0.35274725274725277, "grad_norm": 0.24371850011055635, "learning_rate": 5.255793443218833e-07, "loss": 1.4072, "step": 1605 }, { "epoch": 0.35296703296703297, "grad_norm": 0.247289572853217, "learning_rate": 5.253846764970993e-07, "loss": 1.4588, "step": 1606 }, { "epoch": 0.35318681318681316, "grad_norm": 0.2530801955847355, "learning_rate": 5.251899417460424e-07, "loss": 1.4445, "step": 1607 }, { "epoch": 0.3534065934065934, "grad_norm": 0.2505612898934513, "learning_rate": 5.249951401615491e-07, "loss": 1.4632, "step": 1608 }, { "epoch": 0.3536263736263736, "grad_norm": 0.24772309343233298, "learning_rate": 5.248002718364885e-07, "loss": 1.4974, "step": 1609 }, { "epoch": 0.35384615384615387, "grad_norm": 0.266177888918143, "learning_rate": 5.24605336863761e-07, "loss": 1.545, "step": 1610 }, { "epoch": 0.35406593406593406, "grad_norm": 0.2533717626020377, "learning_rate": 5.244103353362991e-07, "loss": 1.4836, "step": 1611 }, { "epoch": 0.35428571428571426, "grad_norm": 0.2592502180167789, "learning_rate": 5.242152673470665e-07, "loss": 1.4776, "step": 1612 }, { "epoch": 0.3545054945054945, "grad_norm": 0.2969363779571865, "learning_rate": 5.24020132989059e-07, "loss": 1.5055, "step": 1613 }, { "epoch": 0.3547252747252747, "grad_norm": 0.25396152465741845, "learning_rate": 5.238249323553043e-07, "loss": 1.5134, "step": 1614 }, { "epoch": 0.35494505494505496, "grad_norm": 0.2652262284261434, "learning_rate": 5.236296655388607e-07, "loss": 1.4805, "step": 1615 }, { "epoch": 0.35516483516483516, "grad_norm": 0.24838897986825226, "learning_rate": 5.234343326328194e-07, "loss": 1.5331, "step": 1616 }, { "epoch": 0.3553846153846154, "grad_norm": 0.24000930493173722, "learning_rate": 5.232389337303021e-07, "loss": 1.4756, "step": 1617 }, { "epoch": 0.3556043956043956, "grad_norm": 0.25131057253330996, "learning_rate": 5.230434689244622e-07, "loss": 1.461, "step": 1618 }, { "epoch": 0.3558241758241758, "grad_norm": 0.2502103746681586, "learning_rate": 5.228479383084847e-07, "loss": 1.5073, "step": 1619 }, { "epoch": 0.35604395604395606, "grad_norm": 0.30898660360946234, "learning_rate": 5.226523419755858e-07, "loss": 1.4893, "step": 1620 }, { "epoch": 0.35626373626373625, "grad_norm": 0.24204538896547698, "learning_rate": 5.224566800190131e-07, "loss": 1.4025, "step": 1621 }, { "epoch": 0.3564835164835165, "grad_norm": 0.26778749718029793, "learning_rate": 5.222609525320456e-07, "loss": 1.4515, "step": 1622 }, { "epoch": 0.3567032967032967, "grad_norm": 0.2543818679452358, "learning_rate": 5.220651596079932e-07, "loss": 1.448, "step": 1623 }, { "epoch": 0.3569230769230769, "grad_norm": 0.23790757421239175, "learning_rate": 5.218693013401976e-07, "loss": 1.4724, "step": 1624 }, { "epoch": 0.35714285714285715, "grad_norm": 0.2524267519855943, "learning_rate": 5.216733778220308e-07, "loss": 1.4919, "step": 1625 }, { "epoch": 0.35736263736263735, "grad_norm": 0.25258120831921677, "learning_rate": 5.214773891468966e-07, "loss": 1.4999, "step": 1626 }, { "epoch": 0.3575824175824176, "grad_norm": 0.25253733179722276, "learning_rate": 5.212813354082297e-07, "loss": 1.5095, "step": 1627 }, { "epoch": 0.3578021978021978, "grad_norm": 0.25846443644643013, "learning_rate": 5.210852166994957e-07, "loss": 1.4919, "step": 1628 }, { "epoch": 0.35802197802197805, "grad_norm": 0.2547355568107916, "learning_rate": 5.208890331141913e-07, "loss": 1.4914, "step": 1629 }, { "epoch": 0.35824175824175825, "grad_norm": 0.2550941584430843, "learning_rate": 5.206927847458439e-07, "loss": 1.4769, "step": 1630 }, { "epoch": 0.35846153846153844, "grad_norm": 0.2525393773988044, "learning_rate": 5.20496471688012e-07, "loss": 1.4887, "step": 1631 }, { "epoch": 0.3586813186813187, "grad_norm": 0.2512907518837691, "learning_rate": 5.203000940342849e-07, "loss": 1.5132, "step": 1632 }, { "epoch": 0.3589010989010989, "grad_norm": 0.24874642160707364, "learning_rate": 5.201036518782827e-07, "loss": 1.4425, "step": 1633 }, { "epoch": 0.35912087912087914, "grad_norm": 0.24342043499629476, "learning_rate": 5.199071453136563e-07, "loss": 1.4835, "step": 1634 }, { "epoch": 0.35934065934065934, "grad_norm": 0.24306007021206974, "learning_rate": 5.19710574434087e-07, "loss": 1.4775, "step": 1635 }, { "epoch": 0.35956043956043954, "grad_norm": 0.26260245456911935, "learning_rate": 5.195139393332873e-07, "loss": 1.4983, "step": 1636 }, { "epoch": 0.3597802197802198, "grad_norm": 0.250414658674308, "learning_rate": 5.193172401049997e-07, "loss": 1.4913, "step": 1637 }, { "epoch": 0.36, "grad_norm": 0.26081144251159283, "learning_rate": 5.191204768429979e-07, "loss": 1.4465, "step": 1638 }, { "epoch": 0.36021978021978024, "grad_norm": 0.3376410662457479, "learning_rate": 5.189236496410856e-07, "loss": 1.4447, "step": 1639 }, { "epoch": 0.36043956043956044, "grad_norm": 0.24503990004583534, "learning_rate": 5.187267585930972e-07, "loss": 1.5, "step": 1640 }, { "epoch": 0.36065934065934063, "grad_norm": 0.2617886668056767, "learning_rate": 5.185298037928977e-07, "loss": 1.4788, "step": 1641 }, { "epoch": 0.3608791208791209, "grad_norm": 0.2624974611371695, "learning_rate": 5.18332785334382e-07, "loss": 1.4284, "step": 1642 }, { "epoch": 0.3610989010989011, "grad_norm": 0.25429901880516215, "learning_rate": 5.181357033114761e-07, "loss": 1.498, "step": 1643 }, { "epoch": 0.36131868131868133, "grad_norm": 0.24410133896726638, "learning_rate": 5.179385578181356e-07, "loss": 1.469, "step": 1644 }, { "epoch": 0.36153846153846153, "grad_norm": 0.23753990605159025, "learning_rate": 5.177413489483467e-07, "loss": 1.4206, "step": 1645 }, { "epoch": 0.3617582417582418, "grad_norm": 0.24188579178071082, "learning_rate": 5.175440767961257e-07, "loss": 1.4974, "step": 1646 }, { "epoch": 0.361978021978022, "grad_norm": 0.24624653475079045, "learning_rate": 5.173467414555191e-07, "loss": 1.4652, "step": 1647 }, { "epoch": 0.3621978021978022, "grad_norm": 0.289364946335418, "learning_rate": 5.171493430206037e-07, "loss": 1.4478, "step": 1648 }, { "epoch": 0.36241758241758243, "grad_norm": 0.26847709016957916, "learning_rate": 5.169518815854861e-07, "loss": 1.5022, "step": 1649 }, { "epoch": 0.3626373626373626, "grad_norm": 0.2570575009886609, "learning_rate": 5.167543572443029e-07, "loss": 1.4544, "step": 1650 }, { "epoch": 0.3628571428571429, "grad_norm": 0.257286953135937, "learning_rate": 5.165567700912213e-07, "loss": 1.4779, "step": 1651 }, { "epoch": 0.3630769230769231, "grad_norm": 0.25155373502651135, "learning_rate": 5.163591202204376e-07, "loss": 1.4467, "step": 1652 }, { "epoch": 0.36329670329670327, "grad_norm": 0.2435159741570224, "learning_rate": 5.161614077261784e-07, "loss": 1.4807, "step": 1653 }, { "epoch": 0.3635164835164835, "grad_norm": 0.24590872121935767, "learning_rate": 5.159636327027003e-07, "loss": 1.5235, "step": 1654 }, { "epoch": 0.3637362637362637, "grad_norm": 0.2625138461633969, "learning_rate": 5.157657952442896e-07, "loss": 1.4511, "step": 1655 }, { "epoch": 0.363956043956044, "grad_norm": 0.24395697878331868, "learning_rate": 5.155678954452621e-07, "loss": 1.464, "step": 1656 }, { "epoch": 0.36417582417582417, "grad_norm": 0.24486474113797835, "learning_rate": 5.153699333999635e-07, "loss": 1.4713, "step": 1657 }, { "epoch": 0.3643956043956044, "grad_norm": 0.27316487002629064, "learning_rate": 5.151719092027696e-07, "loss": 1.4838, "step": 1658 }, { "epoch": 0.3646153846153846, "grad_norm": 0.2444490217388486, "learning_rate": 5.149738229480853e-07, "loss": 1.4095, "step": 1659 }, { "epoch": 0.3648351648351648, "grad_norm": 0.2607606967463366, "learning_rate": 5.14775674730345e-07, "loss": 1.4843, "step": 1660 }, { "epoch": 0.36505494505494507, "grad_norm": 0.26827223696239494, "learning_rate": 5.145774646440129e-07, "loss": 1.5493, "step": 1661 }, { "epoch": 0.36527472527472526, "grad_norm": 0.2648935836241543, "learning_rate": 5.143791927835829e-07, "loss": 1.4539, "step": 1662 }, { "epoch": 0.3654945054945055, "grad_norm": 0.2367344002413746, "learning_rate": 5.141808592435779e-07, "loss": 1.388, "step": 1663 }, { "epoch": 0.3657142857142857, "grad_norm": 0.27471419900946775, "learning_rate": 5.139824641185505e-07, "loss": 1.4936, "step": 1664 }, { "epoch": 0.3659340659340659, "grad_norm": 0.24806728311125092, "learning_rate": 5.137840075030826e-07, "loss": 1.4664, "step": 1665 }, { "epoch": 0.36615384615384616, "grad_norm": 0.26246681906181907, "learning_rate": 5.135854894917853e-07, "loss": 1.4827, "step": 1666 }, { "epoch": 0.36637362637362636, "grad_norm": 0.2675634521745081, "learning_rate": 5.133869101792993e-07, "loss": 1.5009, "step": 1667 }, { "epoch": 0.3665934065934066, "grad_norm": 0.2433570578602176, "learning_rate": 5.13188269660294e-07, "loss": 1.4872, "step": 1668 }, { "epoch": 0.3668131868131868, "grad_norm": 0.24948785671905935, "learning_rate": 5.129895680294683e-07, "loss": 1.4696, "step": 1669 }, { "epoch": 0.367032967032967, "grad_norm": 0.2554563221150702, "learning_rate": 5.127908053815505e-07, "loss": 1.5169, "step": 1670 }, { "epoch": 0.36725274725274726, "grad_norm": 0.258765505529143, "learning_rate": 5.125919818112974e-07, "loss": 1.4698, "step": 1671 }, { "epoch": 0.36747252747252745, "grad_norm": 0.24230009065253344, "learning_rate": 5.12393097413495e-07, "loss": 1.3884, "step": 1672 }, { "epoch": 0.3676923076923077, "grad_norm": 0.2548695647565594, "learning_rate": 5.121941522829588e-07, "loss": 1.4249, "step": 1673 }, { "epoch": 0.3679120879120879, "grad_norm": 0.25626680131633506, "learning_rate": 5.119951465145327e-07, "loss": 1.4579, "step": 1674 }, { "epoch": 0.36813186813186816, "grad_norm": 0.24812312088905314, "learning_rate": 5.117960802030897e-07, "loss": 1.5091, "step": 1675 }, { "epoch": 0.36835164835164835, "grad_norm": 0.24947351298920709, "learning_rate": 5.115969534435317e-07, "loss": 1.473, "step": 1676 }, { "epoch": 0.36857142857142855, "grad_norm": 0.25146348516505296, "learning_rate": 5.113977663307894e-07, "loss": 1.4645, "step": 1677 }, { "epoch": 0.3687912087912088, "grad_norm": 0.2633496405142336, "learning_rate": 5.11198518959822e-07, "loss": 1.503, "step": 1678 }, { "epoch": 0.369010989010989, "grad_norm": 0.25333672830246573, "learning_rate": 5.109992114256181e-07, "loss": 1.5201, "step": 1679 }, { "epoch": 0.36923076923076925, "grad_norm": 0.257925734816316, "learning_rate": 5.107998438231941e-07, "loss": 1.4488, "step": 1680 }, { "epoch": 0.36945054945054945, "grad_norm": 0.33885595813935115, "learning_rate": 5.106004162475958e-07, "loss": 1.4063, "step": 1681 }, { "epoch": 0.36967032967032964, "grad_norm": 0.25336712226363317, "learning_rate": 5.104009287938972e-07, "loss": 1.525, "step": 1682 }, { "epoch": 0.3698901098901099, "grad_norm": 0.25680866951052767, "learning_rate": 5.102013815572008e-07, "loss": 1.475, "step": 1683 }, { "epoch": 0.3701098901098901, "grad_norm": 0.24826865203303666, "learning_rate": 5.100017746326379e-07, "loss": 1.4953, "step": 1684 }, { "epoch": 0.37032967032967035, "grad_norm": 0.2479780259362285, "learning_rate": 5.09802108115368e-07, "loss": 1.5105, "step": 1685 }, { "epoch": 0.37054945054945054, "grad_norm": 0.2379512224951037, "learning_rate": 5.096023821005793e-07, "loss": 1.5068, "step": 1686 }, { "epoch": 0.3707692307692308, "grad_norm": 0.27283424923387284, "learning_rate": 5.094025966834879e-07, "loss": 1.5117, "step": 1687 }, { "epoch": 0.370989010989011, "grad_norm": 0.2715125554339445, "learning_rate": 5.092027519593385e-07, "loss": 1.4377, "step": 1688 }, { "epoch": 0.3712087912087912, "grad_norm": 0.24841593700618475, "learning_rate": 5.090028480234041e-07, "loss": 1.4514, "step": 1689 }, { "epoch": 0.37142857142857144, "grad_norm": 0.24937969008749136, "learning_rate": 5.088028849709859e-07, "loss": 1.5119, "step": 1690 }, { "epoch": 0.37164835164835164, "grad_norm": 0.2524058970252941, "learning_rate": 5.086028628974133e-07, "loss": 1.4801, "step": 1691 }, { "epoch": 0.3718681318681319, "grad_norm": 0.2475098724614424, "learning_rate": 5.084027818980437e-07, "loss": 1.5341, "step": 1692 }, { "epoch": 0.3720879120879121, "grad_norm": 0.2627371302017303, "learning_rate": 5.082026420682629e-07, "loss": 1.484, "step": 1693 }, { "epoch": 0.3723076923076923, "grad_norm": 0.24337253730307487, "learning_rate": 5.080024435034843e-07, "loss": 1.4469, "step": 1694 }, { "epoch": 0.37252747252747254, "grad_norm": 0.27099817798239256, "learning_rate": 5.078021862991496e-07, "loss": 1.5334, "step": 1695 }, { "epoch": 0.37274725274725273, "grad_norm": 0.2352378169507532, "learning_rate": 5.076018705507285e-07, "loss": 1.3952, "step": 1696 }, { "epoch": 0.372967032967033, "grad_norm": 0.24398355494635385, "learning_rate": 5.074014963537183e-07, "loss": 1.4562, "step": 1697 }, { "epoch": 0.3731868131868132, "grad_norm": 0.24711736541024606, "learning_rate": 5.072010638036447e-07, "loss": 1.4462, "step": 1698 }, { "epoch": 0.3734065934065934, "grad_norm": 0.28791534650705003, "learning_rate": 5.070005729960605e-07, "loss": 1.5435, "step": 1699 }, { "epoch": 0.37362637362637363, "grad_norm": 0.23699801997694145, "learning_rate": 5.068000240265469e-07, "loss": 1.4298, "step": 1700 }, { "epoch": 0.37384615384615383, "grad_norm": 0.2677719563086052, "learning_rate": 5.065994169907127e-07, "loss": 1.4486, "step": 1701 }, { "epoch": 0.3740659340659341, "grad_norm": 0.24943279185346431, "learning_rate": 5.063987519841939e-07, "loss": 1.4759, "step": 1702 }, { "epoch": 0.3742857142857143, "grad_norm": 0.25568632667159863, "learning_rate": 5.061980291026549e-07, "loss": 1.4719, "step": 1703 }, { "epoch": 0.37450549450549453, "grad_norm": 0.24438577886394727, "learning_rate": 5.05997248441787e-07, "loss": 1.4523, "step": 1704 }, { "epoch": 0.3747252747252747, "grad_norm": 0.2586426162558187, "learning_rate": 5.057964100973096e-07, "loss": 1.5342, "step": 1705 }, { "epoch": 0.3749450549450549, "grad_norm": 0.24637124850913075, "learning_rate": 5.055955141649691e-07, "loss": 1.4716, "step": 1706 }, { "epoch": 0.3751648351648352, "grad_norm": 0.2626533839286568, "learning_rate": 5.053945607405396e-07, "loss": 1.5064, "step": 1707 }, { "epoch": 0.37538461538461537, "grad_norm": 0.2681991834944331, "learning_rate": 5.05193549919823e-07, "loss": 1.4551, "step": 1708 }, { "epoch": 0.3756043956043956, "grad_norm": 0.2489845477272852, "learning_rate": 5.049924817986477e-07, "loss": 1.4625, "step": 1709 }, { "epoch": 0.3758241758241758, "grad_norm": 0.24985291717622093, "learning_rate": 5.047913564728703e-07, "loss": 1.4062, "step": 1710 }, { "epoch": 0.376043956043956, "grad_norm": 0.24988919932688364, "learning_rate": 5.045901740383739e-07, "loss": 1.5092, "step": 1711 }, { "epoch": 0.37626373626373627, "grad_norm": 0.25271003886000754, "learning_rate": 5.043889345910693e-07, "loss": 1.4787, "step": 1712 }, { "epoch": 0.37648351648351647, "grad_norm": 0.2505745971018397, "learning_rate": 5.041876382268945e-07, "loss": 1.4293, "step": 1713 }, { "epoch": 0.3767032967032967, "grad_norm": 0.24482954309101237, "learning_rate": 5.039862850418143e-07, "loss": 1.4475, "step": 1714 }, { "epoch": 0.3769230769230769, "grad_norm": 0.24170670857954743, "learning_rate": 5.037848751318209e-07, "loss": 1.4701, "step": 1715 }, { "epoch": 0.37714285714285717, "grad_norm": 0.28057626991866186, "learning_rate": 5.035834085929334e-07, "loss": 1.478, "step": 1716 }, { "epoch": 0.37736263736263737, "grad_norm": 0.242264466938029, "learning_rate": 5.033818855211981e-07, "loss": 1.4919, "step": 1717 }, { "epoch": 0.37758241758241756, "grad_norm": 0.24614319025443693, "learning_rate": 5.031803060126877e-07, "loss": 1.4668, "step": 1718 }, { "epoch": 0.3778021978021978, "grad_norm": 0.24586533904885063, "learning_rate": 5.029786701635025e-07, "loss": 1.4821, "step": 1719 }, { "epoch": 0.378021978021978, "grad_norm": 0.25410869040339945, "learning_rate": 5.027769780697694e-07, "loss": 1.462, "step": 1720 }, { "epoch": 0.37824175824175826, "grad_norm": 0.30108580189647816, "learning_rate": 5.025752298276416e-07, "loss": 1.5574, "step": 1721 }, { "epoch": 0.37846153846153846, "grad_norm": 0.2604357948050287, "learning_rate": 5.023734255333e-07, "loss": 1.51, "step": 1722 }, { "epoch": 0.37868131868131866, "grad_norm": 0.2488240596006886, "learning_rate": 5.021715652829517e-07, "loss": 1.4783, "step": 1723 }, { "epoch": 0.3789010989010989, "grad_norm": 0.24486155407460908, "learning_rate": 5.019696491728302e-07, "loss": 1.4731, "step": 1724 }, { "epoch": 0.3791208791208791, "grad_norm": 0.24814047886420185, "learning_rate": 5.017676772991964e-07, "loss": 1.4932, "step": 1725 }, { "epoch": 0.37934065934065936, "grad_norm": 0.25141982006108415, "learning_rate": 5.015656497583367e-07, "loss": 1.4612, "step": 1726 }, { "epoch": 0.37956043956043956, "grad_norm": 0.24246762504972255, "learning_rate": 5.013635666465653e-07, "loss": 1.4322, "step": 1727 }, { "epoch": 0.3797802197802198, "grad_norm": 0.25178380724344507, "learning_rate": 5.011614280602222e-07, "loss": 1.4421, "step": 1728 }, { "epoch": 0.38, "grad_norm": 0.24372812872379204, "learning_rate": 5.009592340956736e-07, "loss": 1.4522, "step": 1729 }, { "epoch": 0.3802197802197802, "grad_norm": 0.25813976594211274, "learning_rate": 5.007569848493125e-07, "loss": 1.5024, "step": 1730 }, { "epoch": 0.38043956043956045, "grad_norm": 0.24717820104448904, "learning_rate": 5.005546804175584e-07, "loss": 1.5187, "step": 1731 }, { "epoch": 0.38065934065934065, "grad_norm": 0.24230905820266857, "learning_rate": 5.003523208968566e-07, "loss": 1.4505, "step": 1732 }, { "epoch": 0.3808791208791209, "grad_norm": 0.24157743771302173, "learning_rate": 5.001499063836791e-07, "loss": 1.445, "step": 1733 }, { "epoch": 0.3810989010989011, "grad_norm": 0.2463804707054971, "learning_rate": 4.99947436974524e-07, "loss": 1.4571, "step": 1734 }, { "epoch": 0.3813186813186813, "grad_norm": 0.251794984418654, "learning_rate": 4.997449127659155e-07, "loss": 1.4895, "step": 1735 }, { "epoch": 0.38153846153846155, "grad_norm": 0.2528992624714515, "learning_rate": 4.995423338544038e-07, "loss": 1.4259, "step": 1736 }, { "epoch": 0.38175824175824175, "grad_norm": 0.2435411471713826, "learning_rate": 4.993397003365654e-07, "loss": 1.5032, "step": 1737 }, { "epoch": 0.381978021978022, "grad_norm": 0.2517055998896912, "learning_rate": 4.99137012309003e-07, "loss": 1.5022, "step": 1738 }, { "epoch": 0.3821978021978022, "grad_norm": 0.26009088996567187, "learning_rate": 4.989342698683447e-07, "loss": 1.5164, "step": 1739 }, { "epoch": 0.3824175824175824, "grad_norm": 0.254362818031432, "learning_rate": 4.987314731112453e-07, "loss": 1.479, "step": 1740 }, { "epoch": 0.38263736263736264, "grad_norm": 0.2597714051834542, "learning_rate": 4.985286221343846e-07, "loss": 1.4416, "step": 1741 }, { "epoch": 0.38285714285714284, "grad_norm": 0.25528039678124415, "learning_rate": 4.983257170344692e-07, "loss": 1.5145, "step": 1742 }, { "epoch": 0.3830769230769231, "grad_norm": 0.2547350729810254, "learning_rate": 4.981227579082307e-07, "loss": 1.4664, "step": 1743 }, { "epoch": 0.3832967032967033, "grad_norm": 0.23974405079595368, "learning_rate": 4.97919744852427e-07, "loss": 1.3935, "step": 1744 }, { "epoch": 0.38351648351648354, "grad_norm": 0.2601908249110836, "learning_rate": 4.977166779638414e-07, "loss": 1.4609, "step": 1745 }, { "epoch": 0.38373626373626374, "grad_norm": 0.256488928208101, "learning_rate": 4.97513557339283e-07, "loss": 1.4967, "step": 1746 }, { "epoch": 0.38395604395604394, "grad_norm": 0.24641004140866646, "learning_rate": 4.973103830755865e-07, "loss": 1.4455, "step": 1747 }, { "epoch": 0.3841758241758242, "grad_norm": 0.5478959950888969, "learning_rate": 4.971071552696123e-07, "loss": 1.4091, "step": 1748 }, { "epoch": 0.3843956043956044, "grad_norm": 0.2472976803178337, "learning_rate": 4.969038740182459e-07, "loss": 1.4679, "step": 1749 }, { "epoch": 0.38461538461538464, "grad_norm": 0.24927157233560374, "learning_rate": 4.967005394183987e-07, "loss": 1.54, "step": 1750 }, { "epoch": 0.38483516483516483, "grad_norm": 0.3448194022458905, "learning_rate": 4.964971515670075e-07, "loss": 1.4847, "step": 1751 }, { "epoch": 0.38505494505494503, "grad_norm": 0.2542207946374973, "learning_rate": 4.962937105610342e-07, "loss": 1.4905, "step": 1752 }, { "epoch": 0.3852747252747253, "grad_norm": 0.2468556554844839, "learning_rate": 4.960902164974662e-07, "loss": 1.4631, "step": 1753 }, { "epoch": 0.3854945054945055, "grad_norm": 0.2539802160889825, "learning_rate": 4.958866694733165e-07, "loss": 1.4537, "step": 1754 }, { "epoch": 0.38571428571428573, "grad_norm": 0.24328159268134317, "learning_rate": 4.95683069585623e-07, "loss": 1.4392, "step": 1755 }, { "epoch": 0.38593406593406593, "grad_norm": 0.25444687986670267, "learning_rate": 4.954794169314485e-07, "loss": 1.4786, "step": 1756 }, { "epoch": 0.3861538461538462, "grad_norm": 0.2734835499805895, "learning_rate": 4.952757116078817e-07, "loss": 1.482, "step": 1757 }, { "epoch": 0.3863736263736264, "grad_norm": 0.24566099112744974, "learning_rate": 4.950719537120362e-07, "loss": 1.4364, "step": 1758 }, { "epoch": 0.3865934065934066, "grad_norm": 0.2596874713017255, "learning_rate": 4.948681433410498e-07, "loss": 1.5016, "step": 1759 }, { "epoch": 0.3868131868131868, "grad_norm": 0.246167195794601, "learning_rate": 4.946642805920866e-07, "loss": 1.4643, "step": 1760 }, { "epoch": 0.387032967032967, "grad_norm": 0.2574672715077331, "learning_rate": 4.944603655623351e-07, "loss": 1.48, "step": 1761 }, { "epoch": 0.3872527472527473, "grad_norm": 0.24588561989161187, "learning_rate": 4.942563983490084e-07, "loss": 1.4308, "step": 1762 }, { "epoch": 0.3874725274725275, "grad_norm": 0.2546874864594375, "learning_rate": 4.94052379049345e-07, "loss": 1.4329, "step": 1763 }, { "epoch": 0.38769230769230767, "grad_norm": 0.25113981074630753, "learning_rate": 4.938483077606077e-07, "loss": 1.5257, "step": 1764 }, { "epoch": 0.3879120879120879, "grad_norm": 0.24634928100053097, "learning_rate": 4.936441845800848e-07, "loss": 1.5301, "step": 1765 }, { "epoch": 0.3881318681318681, "grad_norm": 0.27798264903802494, "learning_rate": 4.934400096050888e-07, "loss": 1.4742, "step": 1766 }, { "epoch": 0.38835164835164837, "grad_norm": 0.2595476892119092, "learning_rate": 4.93235782932957e-07, "loss": 1.4621, "step": 1767 }, { "epoch": 0.38857142857142857, "grad_norm": 0.25005757368351345, "learning_rate": 4.930315046610513e-07, "loss": 1.5062, "step": 1768 }, { "epoch": 0.38879120879120876, "grad_norm": 0.2475965750531368, "learning_rate": 4.928271748867587e-07, "loss": 1.4672, "step": 1769 }, { "epoch": 0.389010989010989, "grad_norm": 0.25355442753356366, "learning_rate": 4.926227937074897e-07, "loss": 1.4644, "step": 1770 }, { "epoch": 0.3892307692307692, "grad_norm": 0.2448924646560554, "learning_rate": 4.924183612206803e-07, "loss": 1.443, "step": 1771 }, { "epoch": 0.38945054945054947, "grad_norm": 0.27020637123372326, "learning_rate": 4.922138775237905e-07, "loss": 1.5124, "step": 1772 }, { "epoch": 0.38967032967032966, "grad_norm": 0.2429161843850925, "learning_rate": 4.92009342714305e-07, "loss": 1.4743, "step": 1773 }, { "epoch": 0.3898901098901099, "grad_norm": 0.252594183781848, "learning_rate": 4.918047568897325e-07, "loss": 1.4622, "step": 1774 }, { "epoch": 0.3901098901098901, "grad_norm": 0.24048192930828624, "learning_rate": 4.916001201476062e-07, "loss": 1.4698, "step": 1775 }, { "epoch": 0.3903296703296703, "grad_norm": 0.2422641983900522, "learning_rate": 4.913954325854836e-07, "loss": 1.4791, "step": 1776 }, { "epoch": 0.39054945054945056, "grad_norm": 0.25461158110345045, "learning_rate": 4.911906943009464e-07, "loss": 1.4913, "step": 1777 }, { "epoch": 0.39076923076923076, "grad_norm": 0.29321949968392325, "learning_rate": 4.909859053916006e-07, "loss": 1.4653, "step": 1778 }, { "epoch": 0.390989010989011, "grad_norm": 0.25663682866390836, "learning_rate": 4.907810659550759e-07, "loss": 1.5317, "step": 1779 }, { "epoch": 0.3912087912087912, "grad_norm": 0.24724952499445593, "learning_rate": 4.905761760890269e-07, "loss": 1.4515, "step": 1780 }, { "epoch": 0.3914285714285714, "grad_norm": 0.26411372708333347, "learning_rate": 4.903712358911313e-07, "loss": 1.4773, "step": 1781 }, { "epoch": 0.39164835164835166, "grad_norm": 0.24846185378366778, "learning_rate": 4.901662454590915e-07, "loss": 1.4742, "step": 1782 }, { "epoch": 0.39186813186813185, "grad_norm": 0.2668387394027364, "learning_rate": 4.899612048906334e-07, "loss": 1.4363, "step": 1783 }, { "epoch": 0.3920879120879121, "grad_norm": 0.2611779062632974, "learning_rate": 4.89756114283507e-07, "loss": 1.4804, "step": 1784 }, { "epoch": 0.3923076923076923, "grad_norm": 0.2677177467642176, "learning_rate": 4.895509737354864e-07, "loss": 1.454, "step": 1785 }, { "epoch": 0.39252747252747255, "grad_norm": 0.30826115217999084, "learning_rate": 4.893457833443692e-07, "loss": 1.4808, "step": 1786 }, { "epoch": 0.39274725274725275, "grad_norm": 0.2793667782925087, "learning_rate": 4.891405432079765e-07, "loss": 1.4545, "step": 1787 }, { "epoch": 0.39296703296703295, "grad_norm": 0.24327235516279463, "learning_rate": 4.889352534241537e-07, "loss": 1.4653, "step": 1788 }, { "epoch": 0.3931868131868132, "grad_norm": 0.25491835794406864, "learning_rate": 4.887299140907695e-07, "loss": 1.4747, "step": 1789 }, { "epoch": 0.3934065934065934, "grad_norm": 0.2540225940638022, "learning_rate": 4.885245253057165e-07, "loss": 1.5261, "step": 1790 }, { "epoch": 0.39362637362637365, "grad_norm": 0.26796825571352123, "learning_rate": 4.883190871669105e-07, "loss": 1.474, "step": 1791 }, { "epoch": 0.39384615384615385, "grad_norm": 0.2531601757943148, "learning_rate": 4.881135997722911e-07, "loss": 1.4956, "step": 1792 }, { "epoch": 0.39406593406593404, "grad_norm": 0.28754737023725047, "learning_rate": 4.879080632198215e-07, "loss": 1.4302, "step": 1793 }, { "epoch": 0.3942857142857143, "grad_norm": 0.2676420335773287, "learning_rate": 4.877024776074877e-07, "loss": 1.4763, "step": 1794 }, { "epoch": 0.3945054945054945, "grad_norm": 0.24716390305532507, "learning_rate": 4.874968430332998e-07, "loss": 1.4721, "step": 1795 }, { "epoch": 0.39472527472527474, "grad_norm": 0.23344381632056682, "learning_rate": 4.87291159595291e-07, "loss": 1.4322, "step": 1796 }, { "epoch": 0.39494505494505494, "grad_norm": 0.24377427229410936, "learning_rate": 4.870854273915178e-07, "loss": 1.4341, "step": 1797 }, { "epoch": 0.39516483516483514, "grad_norm": 0.24609118241522102, "learning_rate": 4.868796465200599e-07, "loss": 1.463, "step": 1798 }, { "epoch": 0.3953846153846154, "grad_norm": 0.26229641967712986, "learning_rate": 4.866738170790202e-07, "loss": 1.4789, "step": 1799 }, { "epoch": 0.3956043956043956, "grad_norm": 0.25776871029947374, "learning_rate": 4.864679391665245e-07, "loss": 1.5444, "step": 1800 }, { "epoch": 0.39582417582417584, "grad_norm": 0.23892518567766813, "learning_rate": 4.862620128807225e-07, "loss": 1.4507, "step": 1801 }, { "epoch": 0.39604395604395604, "grad_norm": 0.24817053150800028, "learning_rate": 4.860560383197859e-07, "loss": 1.4869, "step": 1802 }, { "epoch": 0.3962637362637363, "grad_norm": 0.25572874720505023, "learning_rate": 4.858500155819105e-07, "loss": 1.4768, "step": 1803 }, { "epoch": 0.3964835164835165, "grad_norm": 0.3168810818857704, "learning_rate": 4.856439447653142e-07, "loss": 1.4569, "step": 1804 }, { "epoch": 0.3967032967032967, "grad_norm": 0.24009418394509754, "learning_rate": 4.854378259682381e-07, "loss": 1.4379, "step": 1805 }, { "epoch": 0.39692307692307693, "grad_norm": 0.25161774876511417, "learning_rate": 4.852316592889464e-07, "loss": 1.4404, "step": 1806 }, { "epoch": 0.39714285714285713, "grad_norm": 0.25938765062006036, "learning_rate": 4.85025444825726e-07, "loss": 1.4366, "step": 1807 }, { "epoch": 0.3973626373626374, "grad_norm": 0.24821673122609372, "learning_rate": 4.848191826768863e-07, "loss": 1.4356, "step": 1808 }, { "epoch": 0.3975824175824176, "grad_norm": 0.2482062446031562, "learning_rate": 4.846128729407597e-07, "loss": 1.4395, "step": 1809 }, { "epoch": 0.3978021978021978, "grad_norm": 0.2669109598969336, "learning_rate": 4.844065157157014e-07, "loss": 1.4541, "step": 1810 }, { "epoch": 0.39802197802197803, "grad_norm": 0.25771016681731657, "learning_rate": 4.842001111000891e-07, "loss": 1.4473, "step": 1811 }, { "epoch": 0.3982417582417582, "grad_norm": 0.3009492967522756, "learning_rate": 4.839936591923229e-07, "loss": 1.4814, "step": 1812 }, { "epoch": 0.3984615384615385, "grad_norm": 0.2464490607634403, "learning_rate": 4.837871600908257e-07, "loss": 1.5183, "step": 1813 }, { "epoch": 0.3986813186813187, "grad_norm": 0.259549663136423, "learning_rate": 4.835806138940428e-07, "loss": 1.4927, "step": 1814 }, { "epoch": 0.3989010989010989, "grad_norm": 0.2566912365495615, "learning_rate": 4.833740207004421e-07, "loss": 1.4423, "step": 1815 }, { "epoch": 0.3991208791208791, "grad_norm": 0.25162225407409183, "learning_rate": 4.831673806085138e-07, "loss": 1.4719, "step": 1816 }, { "epoch": 0.3993406593406593, "grad_norm": 0.2565978320181227, "learning_rate": 4.829606937167702e-07, "loss": 1.4203, "step": 1817 }, { "epoch": 0.3995604395604396, "grad_norm": 0.2447236818504802, "learning_rate": 4.827539601237463e-07, "loss": 1.4745, "step": 1818 }, { "epoch": 0.39978021978021977, "grad_norm": 0.25277170852857506, "learning_rate": 4.825471799279993e-07, "loss": 1.4642, "step": 1819 }, { "epoch": 0.4, "grad_norm": 0.24220794243076613, "learning_rate": 4.823403532281084e-07, "loss": 1.5, "step": 1820 }, { "epoch": 0.4002197802197802, "grad_norm": 0.2427399800900977, "learning_rate": 4.821334801226752e-07, "loss": 1.4454, "step": 1821 }, { "epoch": 0.4004395604395604, "grad_norm": 0.24036850678664107, "learning_rate": 4.819265607103233e-07, "loss": 1.5058, "step": 1822 }, { "epoch": 0.40065934065934067, "grad_norm": 0.26847581950085464, "learning_rate": 4.817195950896983e-07, "loss": 1.393, "step": 1823 }, { "epoch": 0.40087912087912086, "grad_norm": 0.268721336380857, "learning_rate": 4.815125833594679e-07, "loss": 1.4737, "step": 1824 }, { "epoch": 0.4010989010989011, "grad_norm": 0.2910363742169384, "learning_rate": 4.813055256183222e-07, "loss": 1.5296, "step": 1825 }, { "epoch": 0.4013186813186813, "grad_norm": 0.24118564870426512, "learning_rate": 4.810984219649725e-07, "loss": 1.4804, "step": 1826 }, { "epoch": 0.4015384615384615, "grad_norm": 0.2566750028500243, "learning_rate": 4.808912724981524e-07, "loss": 1.4949, "step": 1827 }, { "epoch": 0.40175824175824176, "grad_norm": 0.23867865799133473, "learning_rate": 4.806840773166171e-07, "loss": 1.4066, "step": 1828 }, { "epoch": 0.40197802197802196, "grad_norm": 0.25533959891940117, "learning_rate": 4.804768365191441e-07, "loss": 1.5586, "step": 1829 }, { "epoch": 0.4021978021978022, "grad_norm": 0.2422944624106141, "learning_rate": 4.802695502045322e-07, "loss": 1.4104, "step": 1830 }, { "epoch": 0.4024175824175824, "grad_norm": 0.24466760588270278, "learning_rate": 4.80062218471602e-07, "loss": 1.4668, "step": 1831 }, { "epoch": 0.40263736263736266, "grad_norm": 0.2466937832986439, "learning_rate": 4.798548414191957e-07, "loss": 1.4212, "step": 1832 }, { "epoch": 0.40285714285714286, "grad_norm": 0.24768306549436672, "learning_rate": 4.796474191461773e-07, "loss": 1.4515, "step": 1833 }, { "epoch": 0.40307692307692305, "grad_norm": 0.2504352894051246, "learning_rate": 4.794399517514322e-07, "loss": 1.4858, "step": 1834 }, { "epoch": 0.4032967032967033, "grad_norm": 0.24699744571501392, "learning_rate": 4.792324393338674e-07, "loss": 1.5157, "step": 1835 }, { "epoch": 0.4035164835164835, "grad_norm": 0.24744382663721826, "learning_rate": 4.79024881992411e-07, "loss": 1.4565, "step": 1836 }, { "epoch": 0.40373626373626376, "grad_norm": 0.26316227935316405, "learning_rate": 4.788172798260132e-07, "loss": 1.4838, "step": 1837 }, { "epoch": 0.40395604395604395, "grad_norm": 0.25257085263895485, "learning_rate": 4.786096329336451e-07, "loss": 1.4919, "step": 1838 }, { "epoch": 0.40417582417582415, "grad_norm": 0.2625332035565822, "learning_rate": 4.784019414142992e-07, "loss": 1.4807, "step": 1839 }, { "epoch": 0.4043956043956044, "grad_norm": 0.24705421740064612, "learning_rate": 4.781942053669891e-07, "loss": 1.4699, "step": 1840 }, { "epoch": 0.4046153846153846, "grad_norm": 0.2429821361794914, "learning_rate": 4.7798642489075e-07, "loss": 1.5306, "step": 1841 }, { "epoch": 0.40483516483516485, "grad_norm": 0.26020097251838, "learning_rate": 4.777786000846383e-07, "loss": 1.4924, "step": 1842 }, { "epoch": 0.40505494505494505, "grad_norm": 0.24070971549065806, "learning_rate": 4.77570731047731e-07, "loss": 1.4587, "step": 1843 }, { "epoch": 0.4052747252747253, "grad_norm": 0.23694437718701358, "learning_rate": 4.773628178791265e-07, "loss": 1.4419, "step": 1844 }, { "epoch": 0.4054945054945055, "grad_norm": 0.24633286127674098, "learning_rate": 4.771548606779445e-07, "loss": 1.4264, "step": 1845 }, { "epoch": 0.4057142857142857, "grad_norm": 0.2585750366583406, "learning_rate": 4.769468595433256e-07, "loss": 1.533, "step": 1846 }, { "epoch": 0.40593406593406595, "grad_norm": 0.2622541535395647, "learning_rate": 4.767388145744307e-07, "loss": 1.4448, "step": 1847 }, { "epoch": 0.40615384615384614, "grad_norm": 0.2525586985577036, "learning_rate": 4.7653072587044237e-07, "loss": 1.4515, "step": 1848 }, { "epoch": 0.4063736263736264, "grad_norm": 0.24274686923220623, "learning_rate": 4.763225935305637e-07, "loss": 1.4441, "step": 1849 }, { "epoch": 0.4065934065934066, "grad_norm": 0.28403044996668436, "learning_rate": 4.7611441765401884e-07, "loss": 1.507, "step": 1850 }, { "epoch": 0.4068131868131868, "grad_norm": 0.24752559644983235, "learning_rate": 4.7590619834005223e-07, "loss": 1.4437, "step": 1851 }, { "epoch": 0.40703296703296704, "grad_norm": 0.2554203155855423, "learning_rate": 4.7569793568792933e-07, "loss": 1.4766, "step": 1852 }, { "epoch": 0.40725274725274724, "grad_norm": 0.25857678651541904, "learning_rate": 4.754896297969364e-07, "loss": 1.4844, "step": 1853 }, { "epoch": 0.4074725274725275, "grad_norm": 0.23871671831128147, "learning_rate": 4.7528128076637984e-07, "loss": 1.4997, "step": 1854 }, { "epoch": 0.4076923076923077, "grad_norm": 0.250982771013428, "learning_rate": 4.750728886955871e-07, "loss": 1.4352, "step": 1855 }, { "epoch": 0.40791208791208794, "grad_norm": 0.2492415405681126, "learning_rate": 4.748644536839058e-07, "loss": 1.4324, "step": 1856 }, { "epoch": 0.40813186813186814, "grad_norm": 0.31408680888199064, "learning_rate": 4.746559758307045e-07, "loss": 1.4883, "step": 1857 }, { "epoch": 0.40835164835164833, "grad_norm": 0.252977338230396, "learning_rate": 4.7444745523537166e-07, "loss": 1.4003, "step": 1858 }, { "epoch": 0.4085714285714286, "grad_norm": 0.27105566114926655, "learning_rate": 4.74238891997316e-07, "loss": 1.4994, "step": 1859 }, { "epoch": 0.4087912087912088, "grad_norm": 0.25952788816711503, "learning_rate": 4.7403028621596746e-07, "loss": 1.451, "step": 1860 }, { "epoch": 0.40901098901098903, "grad_norm": 0.2541118783054669, "learning_rate": 4.738216379907753e-07, "loss": 1.5014, "step": 1861 }, { "epoch": 0.40923076923076923, "grad_norm": 0.24630810400822947, "learning_rate": 4.7361294742120965e-07, "loss": 1.4538, "step": 1862 }, { "epoch": 0.40945054945054943, "grad_norm": 0.23836644075251986, "learning_rate": 4.7340421460676044e-07, "loss": 1.499, "step": 1863 }, { "epoch": 0.4096703296703297, "grad_norm": 0.2859251633068821, "learning_rate": 4.731954396469379e-07, "loss": 1.4851, "step": 1864 }, { "epoch": 0.4098901098901099, "grad_norm": 0.251065508090773, "learning_rate": 4.729866226412724e-07, "loss": 1.4631, "step": 1865 }, { "epoch": 0.41010989010989013, "grad_norm": 0.2577852475566407, "learning_rate": 4.727777636893142e-07, "loss": 1.4355, "step": 1866 }, { "epoch": 0.4103296703296703, "grad_norm": 0.2649747543641412, "learning_rate": 4.725688628906338e-07, "loss": 1.4417, "step": 1867 }, { "epoch": 0.4105494505494505, "grad_norm": 0.2972787725085871, "learning_rate": 4.723599203448213e-07, "loss": 1.5141, "step": 1868 }, { "epoch": 0.4107692307692308, "grad_norm": 0.2505888516758264, "learning_rate": 4.7215093615148703e-07, "loss": 1.4643, "step": 1869 }, { "epoch": 0.41098901098901097, "grad_norm": 0.2555008453693315, "learning_rate": 4.7194191041026096e-07, "loss": 1.4753, "step": 1870 }, { "epoch": 0.4112087912087912, "grad_norm": 0.24984656958376497, "learning_rate": 4.7173284322079305e-07, "loss": 1.4604, "step": 1871 }, { "epoch": 0.4114285714285714, "grad_norm": 0.24267064075432038, "learning_rate": 4.715237346827529e-07, "loss": 1.468, "step": 1872 }, { "epoch": 0.4116483516483517, "grad_norm": 0.2474257707883197, "learning_rate": 4.7131458489582984e-07, "loss": 1.4087, "step": 1873 }, { "epoch": 0.41186813186813187, "grad_norm": 0.2595786871517908, "learning_rate": 4.711053939597328e-07, "loss": 1.4757, "step": 1874 }, { "epoch": 0.41208791208791207, "grad_norm": 0.26515303845851107, "learning_rate": 4.7089616197419055e-07, "loss": 1.5202, "step": 1875 }, { "epoch": 0.4123076923076923, "grad_norm": 0.2580574704773794, "learning_rate": 4.706868890389511e-07, "loss": 1.4987, "step": 1876 }, { "epoch": 0.4125274725274725, "grad_norm": 0.24629621536998408, "learning_rate": 4.704775752537823e-07, "loss": 1.4456, "step": 1877 }, { "epoch": 0.41274725274725277, "grad_norm": 0.27158152619748616, "learning_rate": 4.702682207184712e-07, "loss": 1.4959, "step": 1878 }, { "epoch": 0.41296703296703297, "grad_norm": 0.242480743299886, "learning_rate": 4.7005882553282453e-07, "loss": 1.4061, "step": 1879 }, { "epoch": 0.41318681318681316, "grad_norm": 0.2561528857614439, "learning_rate": 4.6984938979666826e-07, "loss": 1.456, "step": 1880 }, { "epoch": 0.4134065934065934, "grad_norm": 0.26569408665037497, "learning_rate": 4.6963991360984765e-07, "loss": 1.5047, "step": 1881 }, { "epoch": 0.4136263736263736, "grad_norm": 0.25976395589489604, "learning_rate": 4.694303970722274e-07, "loss": 1.4173, "step": 1882 }, { "epoch": 0.41384615384615386, "grad_norm": 0.2509515485922121, "learning_rate": 4.692208402836912e-07, "loss": 1.4098, "step": 1883 }, { "epoch": 0.41406593406593406, "grad_norm": 0.25501761312969995, "learning_rate": 4.6901124334414223e-07, "loss": 1.5109, "step": 1884 }, { "epoch": 0.4142857142857143, "grad_norm": 0.29388399646471647, "learning_rate": 4.6880160635350265e-07, "loss": 1.4561, "step": 1885 }, { "epoch": 0.4145054945054945, "grad_norm": 0.25062995423747175, "learning_rate": 4.6859192941171365e-07, "loss": 1.4908, "step": 1886 }, { "epoch": 0.4147252747252747, "grad_norm": 0.24776975594772918, "learning_rate": 4.6838221261873564e-07, "loss": 1.4095, "step": 1887 }, { "epoch": 0.41494505494505496, "grad_norm": 0.24406414113091413, "learning_rate": 4.6817245607454785e-07, "loss": 1.4596, "step": 1888 }, { "epoch": 0.41516483516483516, "grad_norm": 0.24152548526912102, "learning_rate": 4.679626598791485e-07, "loss": 1.4641, "step": 1889 }, { "epoch": 0.4153846153846154, "grad_norm": 0.2591887155483339, "learning_rate": 4.6775282413255485e-07, "loss": 1.4536, "step": 1890 }, { "epoch": 0.4156043956043956, "grad_norm": 0.25372519647690656, "learning_rate": 4.67542948934803e-07, "loss": 1.4672, "step": 1891 }, { "epoch": 0.4158241758241758, "grad_norm": 0.24984707519866892, "learning_rate": 4.6733303438594764e-07, "loss": 1.4889, "step": 1892 }, { "epoch": 0.41604395604395605, "grad_norm": 0.2465455207121106, "learning_rate": 4.6712308058606217e-07, "loss": 1.4577, "step": 1893 }, { "epoch": 0.41626373626373625, "grad_norm": 0.2866705542456264, "learning_rate": 4.669130876352393e-07, "loss": 1.478, "step": 1894 }, { "epoch": 0.4164835164835165, "grad_norm": 0.2449021470786593, "learning_rate": 4.6670305563358984e-07, "loss": 1.4696, "step": 1895 }, { "epoch": 0.4167032967032967, "grad_norm": 0.24194367792760602, "learning_rate": 4.6649298468124336e-07, "loss": 1.4133, "step": 1896 }, { "epoch": 0.4169230769230769, "grad_norm": 0.23805554230224935, "learning_rate": 4.662828748783479e-07, "loss": 1.4735, "step": 1897 }, { "epoch": 0.41714285714285715, "grad_norm": 0.24487829405021447, "learning_rate": 4.6607272632507026e-07, "loss": 1.4194, "step": 1898 }, { "epoch": 0.41736263736263735, "grad_norm": 0.24576913269889125, "learning_rate": 4.658625391215956e-07, "loss": 1.4756, "step": 1899 }, { "epoch": 0.4175824175824176, "grad_norm": 0.24345437401255496, "learning_rate": 4.6565231336812736e-07, "loss": 1.489, "step": 1900 }, { "epoch": 0.4178021978021978, "grad_norm": 0.24951438935633438, "learning_rate": 4.654420491648876e-07, "loss": 1.4564, "step": 1901 }, { "epoch": 0.41802197802197805, "grad_norm": 0.25116043650662734, "learning_rate": 4.652317466121166e-07, "loss": 1.5066, "step": 1902 }, { "epoch": 0.41824175824175824, "grad_norm": 0.26962698285180703, "learning_rate": 4.6502140581007296e-07, "loss": 1.474, "step": 1903 }, { "epoch": 0.41846153846153844, "grad_norm": 0.24739657362303075, "learning_rate": 4.648110268590333e-07, "loss": 1.4814, "step": 1904 }, { "epoch": 0.4186813186813187, "grad_norm": 0.25940239165573875, "learning_rate": 4.646006098592928e-07, "loss": 1.5056, "step": 1905 }, { "epoch": 0.4189010989010989, "grad_norm": 0.24657560022021907, "learning_rate": 4.643901549111645e-07, "loss": 1.4625, "step": 1906 }, { "epoch": 0.41912087912087914, "grad_norm": 0.26164647648356376, "learning_rate": 4.641796621149796e-07, "loss": 1.4912, "step": 1907 }, { "epoch": 0.41934065934065934, "grad_norm": 0.2457366206140155, "learning_rate": 4.639691315710874e-07, "loss": 1.479, "step": 1908 }, { "epoch": 0.41956043956043954, "grad_norm": 0.24829951402302813, "learning_rate": 4.637585633798552e-07, "loss": 1.4507, "step": 1909 }, { "epoch": 0.4197802197802198, "grad_norm": 0.24137696225989796, "learning_rate": 4.6354795764166805e-07, "loss": 1.4788, "step": 1910 }, { "epoch": 0.42, "grad_norm": 0.25059062683734157, "learning_rate": 4.633373144569293e-07, "loss": 1.4602, "step": 1911 }, { "epoch": 0.42021978021978024, "grad_norm": 0.2502811070900914, "learning_rate": 4.6312663392605954e-07, "loss": 1.4299, "step": 1912 }, { "epoch": 0.42043956043956043, "grad_norm": 0.24843538159409193, "learning_rate": 4.629159161494979e-07, "loss": 1.4624, "step": 1913 }, { "epoch": 0.4206593406593407, "grad_norm": 0.25679900310546494, "learning_rate": 4.6270516122770063e-07, "loss": 1.523, "step": 1914 }, { "epoch": 0.4208791208791209, "grad_norm": 0.2551073468372462, "learning_rate": 4.624943692611422e-07, "loss": 1.4654, "step": 1915 }, { "epoch": 0.4210989010989011, "grad_norm": 0.25520050194715943, "learning_rate": 4.622835403503143e-07, "loss": 1.4168, "step": 1916 }, { "epoch": 0.42131868131868133, "grad_norm": 0.2502794084468964, "learning_rate": 4.6207267459572644e-07, "loss": 1.434, "step": 1917 }, { "epoch": 0.42153846153846153, "grad_norm": 0.2711830636029782, "learning_rate": 4.6186177209790574e-07, "loss": 1.511, "step": 1918 }, { "epoch": 0.4217582417582418, "grad_norm": 0.2563225325946048, "learning_rate": 4.616508329573968e-07, "loss": 1.4495, "step": 1919 }, { "epoch": 0.421978021978022, "grad_norm": 0.3447193135994813, "learning_rate": 4.614398572747616e-07, "loss": 1.4517, "step": 1920 }, { "epoch": 0.4221978021978022, "grad_norm": 0.2679357815994255, "learning_rate": 4.612288451505797e-07, "loss": 1.4658, "step": 1921 }, { "epoch": 0.4224175824175824, "grad_norm": 0.2526983489954876, "learning_rate": 4.6101779668544784e-07, "loss": 1.438, "step": 1922 }, { "epoch": 0.4226373626373626, "grad_norm": 0.2425244700743695, "learning_rate": 4.608067119799802e-07, "loss": 1.4182, "step": 1923 }, { "epoch": 0.4228571428571429, "grad_norm": 0.2480925558781557, "learning_rate": 4.6059559113480823e-07, "loss": 1.4514, "step": 1924 }, { "epoch": 0.4230769230769231, "grad_norm": 0.24793658474452754, "learning_rate": 4.603844342505807e-07, "loss": 1.441, "step": 1925 }, { "epoch": 0.42329670329670327, "grad_norm": 0.24733060094121687, "learning_rate": 4.6017324142796337e-07, "loss": 1.4692, "step": 1926 }, { "epoch": 0.4235164835164835, "grad_norm": 0.2509952033182183, "learning_rate": 4.5996201276763916e-07, "loss": 1.434, "step": 1927 }, { "epoch": 0.4237362637362637, "grad_norm": 0.24111925275808668, "learning_rate": 4.5975074837030834e-07, "loss": 1.4882, "step": 1928 }, { "epoch": 0.42395604395604397, "grad_norm": 0.24589477494009745, "learning_rate": 4.595394483366878e-07, "loss": 1.4403, "step": 1929 }, { "epoch": 0.42417582417582417, "grad_norm": 0.23500968708420283, "learning_rate": 4.5932811276751186e-07, "loss": 1.5122, "step": 1930 }, { "epoch": 0.4243956043956044, "grad_norm": 0.2509348334848281, "learning_rate": 4.5911674176353126e-07, "loss": 1.4871, "step": 1931 }, { "epoch": 0.4246153846153846, "grad_norm": 0.2487398588487112, "learning_rate": 4.589053354255142e-07, "loss": 1.4703, "step": 1932 }, { "epoch": 0.4248351648351648, "grad_norm": 0.264975782675525, "learning_rate": 4.586938938542455e-07, "loss": 1.4318, "step": 1933 }, { "epoch": 0.42505494505494507, "grad_norm": 0.24653741023793363, "learning_rate": 4.584824171505265e-07, "loss": 1.4367, "step": 1934 }, { "epoch": 0.42527472527472526, "grad_norm": 0.244534014831125, "learning_rate": 4.582709054151755e-07, "loss": 1.51, "step": 1935 }, { "epoch": 0.4254945054945055, "grad_norm": 0.2434156563546049, "learning_rate": 4.5805935874902776e-07, "loss": 1.4115, "step": 1936 }, { "epoch": 0.4257142857142857, "grad_norm": 0.4409375634032495, "learning_rate": 4.578477772529349e-07, "loss": 1.449, "step": 1937 }, { "epoch": 0.4259340659340659, "grad_norm": 0.28008372588235186, "learning_rate": 4.5763616102776506e-07, "loss": 1.4542, "step": 1938 }, { "epoch": 0.42615384615384616, "grad_norm": 0.24733105124087043, "learning_rate": 4.5742451017440327e-07, "loss": 1.4666, "step": 1939 }, { "epoch": 0.42637362637362636, "grad_norm": 0.26214161289244853, "learning_rate": 4.5721282479375063e-07, "loss": 1.4603, "step": 1940 }, { "epoch": 0.4265934065934066, "grad_norm": 0.2600237795584968, "learning_rate": 4.570011049867252e-07, "loss": 1.4887, "step": 1941 }, { "epoch": 0.4268131868131868, "grad_norm": 0.2635085113676006, "learning_rate": 4.56789350854261e-07, "loss": 1.4473, "step": 1942 }, { "epoch": 0.42703296703296706, "grad_norm": 0.25549916126245564, "learning_rate": 4.565775624973087e-07, "loss": 1.5072, "step": 1943 }, { "epoch": 0.42725274725274726, "grad_norm": 0.2529790262311076, "learning_rate": 4.563657400168352e-07, "loss": 1.4892, "step": 1944 }, { "epoch": 0.42747252747252745, "grad_norm": 0.23877180896457764, "learning_rate": 4.561538835138238e-07, "loss": 1.4678, "step": 1945 }, { "epoch": 0.4276923076923077, "grad_norm": 0.252072813103344, "learning_rate": 4.559419930892736e-07, "loss": 1.4576, "step": 1946 }, { "epoch": 0.4279120879120879, "grad_norm": 0.267557026769763, "learning_rate": 4.557300688442003e-07, "loss": 1.5399, "step": 1947 }, { "epoch": 0.42813186813186815, "grad_norm": 0.2853942500311004, "learning_rate": 4.555181108796356e-07, "loss": 1.4752, "step": 1948 }, { "epoch": 0.42835164835164835, "grad_norm": 0.26206350182419685, "learning_rate": 4.553061192966273e-07, "loss": 1.5009, "step": 1949 }, { "epoch": 0.42857142857142855, "grad_norm": 0.24792865769176314, "learning_rate": 4.5509409419623896e-07, "loss": 1.4927, "step": 1950 }, { "epoch": 0.4287912087912088, "grad_norm": 0.3788666994107071, "learning_rate": 4.548820356795507e-07, "loss": 1.5395, "step": 1951 }, { "epoch": 0.429010989010989, "grad_norm": 0.250827362039741, "learning_rate": 4.5466994384765794e-07, "loss": 1.4867, "step": 1952 }, { "epoch": 0.42923076923076925, "grad_norm": 0.25333586193161073, "learning_rate": 4.5445781880167235e-07, "loss": 1.4877, "step": 1953 }, { "epoch": 0.42945054945054945, "grad_norm": 0.33638463275251274, "learning_rate": 4.5424566064272126e-07, "loss": 1.439, "step": 1954 }, { "epoch": 0.42967032967032964, "grad_norm": 0.24798145046955322, "learning_rate": 4.5403346947194805e-07, "loss": 1.4297, "step": 1955 }, { "epoch": 0.4298901098901099, "grad_norm": 0.266951146502588, "learning_rate": 4.5382124539051156e-07, "loss": 1.4988, "step": 1956 }, { "epoch": 0.4301098901098901, "grad_norm": 0.2487390391782417, "learning_rate": 4.5360898849958626e-07, "loss": 1.4884, "step": 1957 }, { "epoch": 0.43032967032967034, "grad_norm": 0.2622662025890682, "learning_rate": 4.533966989003626e-07, "loss": 1.4731, "step": 1958 }, { "epoch": 0.43054945054945054, "grad_norm": 0.24065722376539453, "learning_rate": 4.5318437669404633e-07, "loss": 1.4385, "step": 1959 }, { "epoch": 0.4307692307692308, "grad_norm": 0.2482545867959584, "learning_rate": 4.529720219818589e-07, "loss": 1.4443, "step": 1960 }, { "epoch": 0.430989010989011, "grad_norm": 0.25570330945984554, "learning_rate": 4.527596348650373e-07, "loss": 1.4119, "step": 1961 }, { "epoch": 0.4312087912087912, "grad_norm": 0.2410226952956021, "learning_rate": 4.525472154448337e-07, "loss": 1.4648, "step": 1962 }, { "epoch": 0.43142857142857144, "grad_norm": 0.25257701450368264, "learning_rate": 4.5233476382251605e-07, "loss": 1.5602, "step": 1963 }, { "epoch": 0.43164835164835164, "grad_norm": 0.2346379087711012, "learning_rate": 4.5212228009936735e-07, "loss": 1.4428, "step": 1964 }, { "epoch": 0.4318681318681319, "grad_norm": 0.3076212419062858, "learning_rate": 4.519097643766859e-07, "loss": 1.4448, "step": 1965 }, { "epoch": 0.4320879120879121, "grad_norm": 0.2558029366853796, "learning_rate": 4.5169721675578544e-07, "loss": 1.5036, "step": 1966 }, { "epoch": 0.4323076923076923, "grad_norm": 0.2523538767794331, "learning_rate": 4.5148463733799495e-07, "loss": 1.4498, "step": 1967 }, { "epoch": 0.43252747252747253, "grad_norm": 0.2730193900422513, "learning_rate": 4.5127202622465826e-07, "loss": 1.4951, "step": 1968 }, { "epoch": 0.43274725274725273, "grad_norm": 0.2706074518453508, "learning_rate": 4.5105938351713463e-07, "loss": 1.4692, "step": 1969 }, { "epoch": 0.432967032967033, "grad_norm": 0.2528399436655525, "learning_rate": 4.508467093167983e-07, "loss": 1.4844, "step": 1970 }, { "epoch": 0.4331868131868132, "grad_norm": 0.25125033559172794, "learning_rate": 4.5063400372503836e-07, "loss": 1.538, "step": 1971 }, { "epoch": 0.43340659340659343, "grad_norm": 0.24891370801079335, "learning_rate": 4.5042126684325906e-07, "loss": 1.3944, "step": 1972 }, { "epoch": 0.43362637362637363, "grad_norm": 0.24812753341113256, "learning_rate": 4.502084987728794e-07, "loss": 1.5183, "step": 1973 }, { "epoch": 0.4338461538461538, "grad_norm": 0.2701235151023572, "learning_rate": 4.4999569961533354e-07, "loss": 1.4777, "step": 1974 }, { "epoch": 0.4340659340659341, "grad_norm": 0.24538484487503745, "learning_rate": 4.4978286947206996e-07, "loss": 1.4778, "step": 1975 }, { "epoch": 0.4342857142857143, "grad_norm": 0.26435441514855984, "learning_rate": 4.495700084445526e-07, "loss": 1.4732, "step": 1976 }, { "epoch": 0.4345054945054945, "grad_norm": 0.2889115556355036, "learning_rate": 4.4935711663425935e-07, "loss": 1.4655, "step": 1977 }, { "epoch": 0.4347252747252747, "grad_norm": 0.24587544643910925, "learning_rate": 4.491441941426834e-07, "loss": 1.4345, "step": 1978 }, { "epoch": 0.4349450549450549, "grad_norm": 0.2634869755023885, "learning_rate": 4.489312410713323e-07, "loss": 1.4412, "step": 1979 }, { "epoch": 0.4351648351648352, "grad_norm": 0.2472304319512106, "learning_rate": 4.4871825752172813e-07, "loss": 1.4419, "step": 1980 }, { "epoch": 0.43538461538461537, "grad_norm": 0.2439296820321806, "learning_rate": 4.485052435954076e-07, "loss": 1.4787, "step": 1981 }, { "epoch": 0.4356043956043956, "grad_norm": 0.2420349645564503, "learning_rate": 4.4829219939392197e-07, "loss": 1.4406, "step": 1982 }, { "epoch": 0.4358241758241758, "grad_norm": 0.24806407871145997, "learning_rate": 4.4807912501883677e-07, "loss": 1.4712, "step": 1983 }, { "epoch": 0.43604395604395607, "grad_norm": 0.25960102110521244, "learning_rate": 4.47866020571732e-07, "loss": 1.4789, "step": 1984 }, { "epoch": 0.43626373626373627, "grad_norm": 0.31118938866936896, "learning_rate": 4.476528861542019e-07, "loss": 1.518, "step": 1985 }, { "epoch": 0.43648351648351646, "grad_norm": 0.24322874340836062, "learning_rate": 4.474397218678554e-07, "loss": 1.4709, "step": 1986 }, { "epoch": 0.4367032967032967, "grad_norm": 0.25468814898355213, "learning_rate": 4.4722652781431483e-07, "loss": 1.4672, "step": 1987 }, { "epoch": 0.4369230769230769, "grad_norm": 0.2557318512527907, "learning_rate": 4.4701330409521767e-07, "loss": 1.4958, "step": 1988 }, { "epoch": 0.43714285714285717, "grad_norm": 0.24240681824502205, "learning_rate": 4.4680005081221494e-07, "loss": 1.4409, "step": 1989 }, { "epoch": 0.43736263736263736, "grad_norm": 0.5309155135537983, "learning_rate": 4.46586768066972e-07, "loss": 1.5051, "step": 1990 }, { "epoch": 0.43758241758241756, "grad_norm": 0.30257411107460197, "learning_rate": 4.463734559611681e-07, "loss": 1.461, "step": 1991 }, { "epoch": 0.4378021978021978, "grad_norm": 0.24386625263667058, "learning_rate": 4.461601145964964e-07, "loss": 1.5288, "step": 1992 }, { "epoch": 0.438021978021978, "grad_norm": 0.29251326962414803, "learning_rate": 4.459467440746645e-07, "loss": 1.4851, "step": 1993 }, { "epoch": 0.43824175824175826, "grad_norm": 0.25691611671310566, "learning_rate": 4.457333444973934e-07, "loss": 1.4672, "step": 1994 }, { "epoch": 0.43846153846153846, "grad_norm": 0.24535319812993825, "learning_rate": 4.4551991596641816e-07, "loss": 1.4394, "step": 1995 }, { "epoch": 0.43868131868131865, "grad_norm": 0.2843181282489842, "learning_rate": 4.4530645858348756e-07, "loss": 1.4789, "step": 1996 }, { "epoch": 0.4389010989010989, "grad_norm": 0.2820459116215806, "learning_rate": 4.450929724503642e-07, "loss": 1.4839, "step": 1997 }, { "epoch": 0.4391208791208791, "grad_norm": 0.25298341155850945, "learning_rate": 4.448794576688244e-07, "loss": 1.4933, "step": 1998 }, { "epoch": 0.43934065934065936, "grad_norm": 0.288869264881916, "learning_rate": 4.446659143406581e-07, "loss": 1.4681, "step": 1999 }, { "epoch": 0.43956043956043955, "grad_norm": 0.2419493068887366, "learning_rate": 4.4445234256766873e-07, "loss": 1.4561, "step": 2000 }, { "epoch": 0.4397802197802198, "grad_norm": 0.24830821012270685, "learning_rate": 4.4423874245167355e-07, "loss": 1.4202, "step": 2001 }, { "epoch": 0.44, "grad_norm": 0.23866381009239548, "learning_rate": 4.440251140945033e-07, "loss": 1.4589, "step": 2002 }, { "epoch": 0.4402197802197802, "grad_norm": 0.2650481763620387, "learning_rate": 4.438114575980018e-07, "loss": 1.4912, "step": 2003 }, { "epoch": 0.44043956043956045, "grad_norm": 0.2672582520895399, "learning_rate": 4.4359777306402663e-07, "loss": 1.4875, "step": 2004 }, { "epoch": 0.44065934065934065, "grad_norm": 0.23896969422896724, "learning_rate": 4.433840605944488e-07, "loss": 1.5128, "step": 2005 }, { "epoch": 0.4408791208791209, "grad_norm": 0.2434239236442947, "learning_rate": 4.4317032029115234e-07, "loss": 1.4578, "step": 2006 }, { "epoch": 0.4410989010989011, "grad_norm": 0.24524504092689825, "learning_rate": 4.429565522560348e-07, "loss": 1.4882, "step": 2007 }, { "epoch": 0.4413186813186813, "grad_norm": 0.26319249674895484, "learning_rate": 4.427427565910069e-07, "loss": 1.5003, "step": 2008 }, { "epoch": 0.44153846153846155, "grad_norm": 0.24421254101152262, "learning_rate": 4.425289333979924e-07, "loss": 1.4428, "step": 2009 }, { "epoch": 0.44175824175824174, "grad_norm": 0.26257616504930686, "learning_rate": 4.4231508277892833e-07, "loss": 1.5429, "step": 2010 }, { "epoch": 0.441978021978022, "grad_norm": 0.2714002271767539, "learning_rate": 4.421012048357646e-07, "loss": 1.5147, "step": 2011 }, { "epoch": 0.4421978021978022, "grad_norm": 0.26141776740071254, "learning_rate": 4.418872996704645e-07, "loss": 1.4218, "step": 2012 }, { "epoch": 0.44241758241758244, "grad_norm": 0.2582404154466726, "learning_rate": 4.4167336738500395e-07, "loss": 1.517, "step": 2013 }, { "epoch": 0.44263736263736264, "grad_norm": 0.24120479739895342, "learning_rate": 4.414594080813721e-07, "loss": 1.4686, "step": 2014 }, { "epoch": 0.44285714285714284, "grad_norm": 0.2495583973180263, "learning_rate": 4.412454218615705e-07, "loss": 1.4777, "step": 2015 }, { "epoch": 0.4430769230769231, "grad_norm": 0.25250100386632984, "learning_rate": 4.4103140882761413e-07, "loss": 1.4964, "step": 2016 }, { "epoch": 0.4432967032967033, "grad_norm": 0.2602533544692144, "learning_rate": 4.4081736908153045e-07, "loss": 1.4982, "step": 2017 }, { "epoch": 0.44351648351648354, "grad_norm": 0.2448456408148941, "learning_rate": 4.406033027253595e-07, "loss": 1.4978, "step": 2018 }, { "epoch": 0.44373626373626374, "grad_norm": 0.26427459374956785, "learning_rate": 4.403892098611543e-07, "loss": 1.4515, "step": 2019 }, { "epoch": 0.44395604395604393, "grad_norm": 0.24692002348678865, "learning_rate": 4.4017509059098033e-07, "loss": 1.5037, "step": 2020 }, { "epoch": 0.4441758241758242, "grad_norm": 0.2429174226027057, "learning_rate": 4.399609450169158e-07, "loss": 1.4219, "step": 2021 }, { "epoch": 0.4443956043956044, "grad_norm": 0.2595607253689343, "learning_rate": 4.397467732410513e-07, "loss": 1.4642, "step": 2022 }, { "epoch": 0.44461538461538463, "grad_norm": 0.24262057566586, "learning_rate": 4.3953257536549e-07, "loss": 1.4686, "step": 2023 }, { "epoch": 0.44483516483516483, "grad_norm": 0.24935581520690986, "learning_rate": 4.393183514923476e-07, "loss": 1.4374, "step": 2024 }, { "epoch": 0.44505494505494503, "grad_norm": 0.25301345209781323, "learning_rate": 4.3910410172375197e-07, "loss": 1.4045, "step": 2025 }, { "epoch": 0.4452747252747253, "grad_norm": 0.3266191260053457, "learning_rate": 4.388898261618434e-07, "loss": 1.4664, "step": 2026 }, { "epoch": 0.4454945054945055, "grad_norm": 0.24422716200018552, "learning_rate": 4.3867552490877474e-07, "loss": 1.3922, "step": 2027 }, { "epoch": 0.44571428571428573, "grad_norm": 0.35083311768377134, "learning_rate": 4.3846119806671065e-07, "loss": 1.4702, "step": 2028 }, { "epoch": 0.4459340659340659, "grad_norm": 0.2561190306795904, "learning_rate": 4.382468457378284e-07, "loss": 1.4395, "step": 2029 }, { "epoch": 0.4461538461538462, "grad_norm": 0.30220352060657174, "learning_rate": 4.38032468024317e-07, "loss": 1.4538, "step": 2030 }, { "epoch": 0.4463736263736264, "grad_norm": 0.3318615787325118, "learning_rate": 4.378180650283781e-07, "loss": 1.4895, "step": 2031 }, { "epoch": 0.44659340659340657, "grad_norm": 0.23559489549850618, "learning_rate": 4.3760363685222495e-07, "loss": 1.4998, "step": 2032 }, { "epoch": 0.4468131868131868, "grad_norm": 0.2555988521994486, "learning_rate": 4.373891835980828e-07, "loss": 1.5352, "step": 2033 }, { "epoch": 0.447032967032967, "grad_norm": 0.24972905143409474, "learning_rate": 4.3717470536818903e-07, "loss": 1.5623, "step": 2034 }, { "epoch": 0.4472527472527473, "grad_norm": 0.2415536218839497, "learning_rate": 4.36960202264793e-07, "loss": 1.3939, "step": 2035 }, { "epoch": 0.44747252747252747, "grad_norm": 0.2620784864455322, "learning_rate": 4.367456743901559e-07, "loss": 1.4449, "step": 2036 }, { "epoch": 0.44769230769230767, "grad_norm": 0.2560134164059784, "learning_rate": 4.3653112184655035e-07, "loss": 1.3946, "step": 2037 }, { "epoch": 0.4479120879120879, "grad_norm": 0.2706183432557722, "learning_rate": 4.363165447362613e-07, "loss": 1.4553, "step": 2038 }, { "epoch": 0.4481318681318681, "grad_norm": 0.24148864334358544, "learning_rate": 4.3610194316158514e-07, "loss": 1.4521, "step": 2039 }, { "epoch": 0.44835164835164837, "grad_norm": 0.2477428186822883, "learning_rate": 4.358873172248298e-07, "loss": 1.4963, "step": 2040 }, { "epoch": 0.44857142857142857, "grad_norm": 0.2444094548120749, "learning_rate": 4.35672667028315e-07, "loss": 1.4787, "step": 2041 }, { "epoch": 0.4487912087912088, "grad_norm": 0.2819388152739819, "learning_rate": 4.3545799267437193e-07, "loss": 1.4535, "step": 2042 }, { "epoch": 0.449010989010989, "grad_norm": 0.25331600255846914, "learning_rate": 4.3524329426534356e-07, "loss": 1.4079, "step": 2043 }, { "epoch": 0.4492307692307692, "grad_norm": 0.2600742009308652, "learning_rate": 4.350285719035839e-07, "loss": 1.3983, "step": 2044 }, { "epoch": 0.44945054945054946, "grad_norm": 0.24806827114783264, "learning_rate": 4.3481382569145853e-07, "loss": 1.4367, "step": 2045 }, { "epoch": 0.44967032967032966, "grad_norm": 0.2527522514484014, "learning_rate": 4.3459905573134455e-07, "loss": 1.4561, "step": 2046 }, { "epoch": 0.4498901098901099, "grad_norm": 0.2416568168819981, "learning_rate": 4.343842621256304e-07, "loss": 1.4361, "step": 2047 }, { "epoch": 0.4501098901098901, "grad_norm": 0.25004779077616274, "learning_rate": 4.341694449767157e-07, "loss": 1.4807, "step": 2048 }, { "epoch": 0.4503296703296703, "grad_norm": 0.256423919404246, "learning_rate": 4.33954604387011e-07, "loss": 1.4729, "step": 2049 }, { "epoch": 0.45054945054945056, "grad_norm": 0.3029008617306128, "learning_rate": 4.337397404589385e-07, "loss": 1.478, "step": 2050 }, { "epoch": 0.45076923076923076, "grad_norm": 0.23925969015310744, "learning_rate": 4.335248532949315e-07, "loss": 1.428, "step": 2051 }, { "epoch": 0.450989010989011, "grad_norm": 0.27330281176762683, "learning_rate": 4.333099429974338e-07, "loss": 1.4692, "step": 2052 }, { "epoch": 0.4512087912087912, "grad_norm": 0.25284322084006117, "learning_rate": 4.3309500966890095e-07, "loss": 1.4964, "step": 2053 }, { "epoch": 0.4514285714285714, "grad_norm": 0.25212111305231066, "learning_rate": 4.3288005341179923e-07, "loss": 1.5439, "step": 2054 }, { "epoch": 0.45164835164835165, "grad_norm": 0.27897296674893207, "learning_rate": 4.3266507432860537e-07, "loss": 1.5189, "step": 2055 }, { "epoch": 0.45186813186813185, "grad_norm": 0.24574501087694758, "learning_rate": 4.3245007252180775e-07, "loss": 1.4427, "step": 2056 }, { "epoch": 0.4520879120879121, "grad_norm": 0.2567189836504206, "learning_rate": 4.322350480939052e-07, "loss": 1.4366, "step": 2057 }, { "epoch": 0.4523076923076923, "grad_norm": 0.2605807299139044, "learning_rate": 4.3202000114740727e-07, "loss": 1.4846, "step": 2058 }, { "epoch": 0.45252747252747255, "grad_norm": 0.2453753849791273, "learning_rate": 4.3180493178483435e-07, "loss": 1.4598, "step": 2059 }, { "epoch": 0.45274725274725275, "grad_norm": 0.24328461589670727, "learning_rate": 4.3158984010871765e-07, "loss": 1.4737, "step": 2060 }, { "epoch": 0.45296703296703295, "grad_norm": 0.25501626902401436, "learning_rate": 4.313747262215987e-07, "loss": 1.4738, "step": 2061 }, { "epoch": 0.4531868131868132, "grad_norm": 0.26784819737481885, "learning_rate": 4.3115959022602984e-07, "loss": 1.4225, "step": 2062 }, { "epoch": 0.4534065934065934, "grad_norm": 0.24525130988311333, "learning_rate": 4.30944432224574e-07, "loss": 1.5193, "step": 2063 }, { "epoch": 0.45362637362637365, "grad_norm": 0.3612247288721671, "learning_rate": 4.307292523198043e-07, "loss": 1.496, "step": 2064 }, { "epoch": 0.45384615384615384, "grad_norm": 0.23592061965966898, "learning_rate": 4.3051405061430465e-07, "loss": 1.4753, "step": 2065 }, { "epoch": 0.45406593406593404, "grad_norm": 0.25423611392034606, "learning_rate": 4.3029882721066934e-07, "loss": 1.5484, "step": 2066 }, { "epoch": 0.4542857142857143, "grad_norm": 0.2935670369186098, "learning_rate": 4.3008358221150267e-07, "loss": 1.4633, "step": 2067 }, { "epoch": 0.4545054945054945, "grad_norm": 0.24337580307309942, "learning_rate": 4.2986831571941944e-07, "loss": 1.4539, "step": 2068 }, { "epoch": 0.45472527472527474, "grad_norm": 0.25121797435260773, "learning_rate": 4.2965302783704474e-07, "loss": 1.4811, "step": 2069 }, { "epoch": 0.45494505494505494, "grad_norm": 0.24855948195337907, "learning_rate": 4.29437718667014e-07, "loss": 1.4963, "step": 2070 }, { "epoch": 0.4551648351648352, "grad_norm": 0.25810073656964916, "learning_rate": 4.2922238831197235e-07, "loss": 1.5138, "step": 2071 }, { "epoch": 0.4553846153846154, "grad_norm": 0.24979152300750987, "learning_rate": 4.290070368745754e-07, "loss": 1.5022, "step": 2072 }, { "epoch": 0.4556043956043956, "grad_norm": 0.28878714910143466, "learning_rate": 4.2879166445748865e-07, "loss": 1.4649, "step": 2073 }, { "epoch": 0.45582417582417584, "grad_norm": 0.24946059870744922, "learning_rate": 4.285762711633877e-07, "loss": 1.493, "step": 2074 }, { "epoch": 0.45604395604395603, "grad_norm": 0.2757308433732612, "learning_rate": 4.28360857094958e-07, "loss": 1.4789, "step": 2075 }, { "epoch": 0.4562637362637363, "grad_norm": 0.2596881745657374, "learning_rate": 4.2814542235489494e-07, "loss": 1.4998, "step": 2076 }, { "epoch": 0.4564835164835165, "grad_norm": 0.2603748068429344, "learning_rate": 4.2792996704590387e-07, "loss": 1.4733, "step": 2077 }, { "epoch": 0.4567032967032967, "grad_norm": 0.2443662480772662, "learning_rate": 4.277144912706998e-07, "loss": 1.4996, "step": 2078 }, { "epoch": 0.45692307692307693, "grad_norm": 0.2614697763154464, "learning_rate": 4.274989951320075e-07, "loss": 1.4367, "step": 2079 }, { "epoch": 0.45714285714285713, "grad_norm": 0.24731871090892066, "learning_rate": 4.272834787325615e-07, "loss": 1.4672, "step": 2080 }, { "epoch": 0.4573626373626374, "grad_norm": 0.2560989472472663, "learning_rate": 4.2706794217510603e-07, "loss": 1.4906, "step": 2081 }, { "epoch": 0.4575824175824176, "grad_norm": 0.24534994109634842, "learning_rate": 4.2685238556239495e-07, "loss": 1.4648, "step": 2082 }, { "epoch": 0.4578021978021978, "grad_norm": 0.28760353790968335, "learning_rate": 4.2663680899719143e-07, "loss": 1.4789, "step": 2083 }, { "epoch": 0.458021978021978, "grad_norm": 0.2396213350672438, "learning_rate": 4.264212125822686e-07, "loss": 1.4612, "step": 2084 }, { "epoch": 0.4582417582417582, "grad_norm": 0.26144684638887217, "learning_rate": 4.262055964204085e-07, "loss": 1.4013, "step": 2085 }, { "epoch": 0.4584615384615385, "grad_norm": 0.2607553202482003, "learning_rate": 4.259899606144031e-07, "loss": 1.5135, "step": 2086 }, { "epoch": 0.4586813186813187, "grad_norm": 0.2594931474385825, "learning_rate": 4.2577430526705343e-07, "loss": 1.416, "step": 2087 }, { "epoch": 0.4589010989010989, "grad_norm": 0.2611756491498896, "learning_rate": 4.255586304811699e-07, "loss": 1.5464, "step": 2088 }, { "epoch": 0.4591208791208791, "grad_norm": 0.25004996383787975, "learning_rate": 4.253429363595723e-07, "loss": 1.4762, "step": 2089 }, { "epoch": 0.4593406593406593, "grad_norm": 0.25427160923801684, "learning_rate": 4.2512722300508943e-07, "loss": 1.5437, "step": 2090 }, { "epoch": 0.45956043956043957, "grad_norm": 0.2610733982322852, "learning_rate": 4.2491149052055936e-07, "loss": 1.5014, "step": 2091 }, { "epoch": 0.45978021978021977, "grad_norm": 0.2561664692941615, "learning_rate": 4.2469573900882946e-07, "loss": 1.4396, "step": 2092 }, { "epoch": 0.46, "grad_norm": 0.26370019260727595, "learning_rate": 4.2447996857275583e-07, "loss": 1.5059, "step": 2093 }, { "epoch": 0.4602197802197802, "grad_norm": 0.3261297040343998, "learning_rate": 4.242641793152038e-07, "loss": 1.5002, "step": 2094 }, { "epoch": 0.4604395604395604, "grad_norm": 0.28908805699260925, "learning_rate": 4.240483713390477e-07, "loss": 1.5241, "step": 2095 }, { "epoch": 0.46065934065934067, "grad_norm": 0.26358486161361233, "learning_rate": 4.2383254474717064e-07, "loss": 1.5232, "step": 2096 }, { "epoch": 0.46087912087912086, "grad_norm": 0.24265924214246137, "learning_rate": 4.2361669964246477e-07, "loss": 1.4652, "step": 2097 }, { "epoch": 0.4610989010989011, "grad_norm": 0.23999489946480565, "learning_rate": 4.2340083612783094e-07, "loss": 1.4139, "step": 2098 }, { "epoch": 0.4613186813186813, "grad_norm": 0.2511447918001245, "learning_rate": 4.231849543061788e-07, "loss": 1.4413, "step": 2099 }, { "epoch": 0.46153846153846156, "grad_norm": 0.2434633609202519, "learning_rate": 4.229690542804267e-07, "loss": 1.448, "step": 2100 }, { "epoch": 0.46175824175824176, "grad_norm": 0.2680332101972384, "learning_rate": 4.2275313615350195e-07, "loss": 1.4894, "step": 2101 }, { "epoch": 0.46197802197802196, "grad_norm": 0.2553793825205718, "learning_rate": 4.225372000283399e-07, "loss": 1.4996, "step": 2102 }, { "epoch": 0.4621978021978022, "grad_norm": 0.25530929239675787, "learning_rate": 4.223212460078851e-07, "loss": 1.4353, "step": 2103 }, { "epoch": 0.4624175824175824, "grad_norm": 0.2467225152119547, "learning_rate": 4.2210527419509034e-07, "loss": 1.4512, "step": 2104 }, { "epoch": 0.46263736263736266, "grad_norm": 0.2542087333372453, "learning_rate": 4.218892846929167e-07, "loss": 1.4923, "step": 2105 }, { "epoch": 0.46285714285714286, "grad_norm": 0.24250588824628316, "learning_rate": 4.2167327760433415e-07, "loss": 1.4893, "step": 2106 }, { "epoch": 0.46307692307692305, "grad_norm": 0.248356494344754, "learning_rate": 4.214572530323209e-07, "loss": 1.4826, "step": 2107 }, { "epoch": 0.4632967032967033, "grad_norm": 0.25129696629053866, "learning_rate": 4.21241211079863e-07, "loss": 1.5164, "step": 2108 }, { "epoch": 0.4635164835164835, "grad_norm": 0.24500177287742092, "learning_rate": 4.210251518499555e-07, "loss": 1.412, "step": 2109 }, { "epoch": 0.46373626373626375, "grad_norm": 0.255636589279567, "learning_rate": 4.208090754456011e-07, "loss": 1.5499, "step": 2110 }, { "epoch": 0.46395604395604395, "grad_norm": 0.3803441252872352, "learning_rate": 4.205929819698114e-07, "loss": 1.4686, "step": 2111 }, { "epoch": 0.4641758241758242, "grad_norm": 0.24126324776431443, "learning_rate": 4.203768715256053e-07, "loss": 1.3918, "step": 2112 }, { "epoch": 0.4643956043956044, "grad_norm": 0.2496857026325197, "learning_rate": 4.2016074421601047e-07, "loss": 1.5016, "step": 2113 }, { "epoch": 0.4646153846153846, "grad_norm": 0.25998269132170193, "learning_rate": 4.199446001440621e-07, "loss": 1.4559, "step": 2114 }, { "epoch": 0.46483516483516485, "grad_norm": 0.25144116621936335, "learning_rate": 4.1972843941280375e-07, "loss": 1.4639, "step": 2115 }, { "epoch": 0.46505494505494505, "grad_norm": 0.24650473826043623, "learning_rate": 4.1951226212528674e-07, "loss": 1.5229, "step": 2116 }, { "epoch": 0.4652747252747253, "grad_norm": 0.2540484387629659, "learning_rate": 4.192960683845703e-07, "loss": 1.4339, "step": 2117 }, { "epoch": 0.4654945054945055, "grad_norm": 0.2752498752239846, "learning_rate": 4.1907985829372146e-07, "loss": 1.4687, "step": 2118 }, { "epoch": 0.4657142857142857, "grad_norm": 0.24540759562663214, "learning_rate": 4.188636319558154e-07, "loss": 1.4865, "step": 2119 }, { "epoch": 0.46593406593406594, "grad_norm": 0.2658690120881403, "learning_rate": 4.1864738947393443e-07, "loss": 1.4461, "step": 2120 }, { "epoch": 0.46615384615384614, "grad_norm": 0.38623091924961017, "learning_rate": 4.184311309511691e-07, "loss": 1.4963, "step": 2121 }, { "epoch": 0.4663736263736264, "grad_norm": 0.24199241715554962, "learning_rate": 4.182148564906173e-07, "loss": 1.4266, "step": 2122 }, { "epoch": 0.4665934065934066, "grad_norm": 0.24330896768729043, "learning_rate": 4.1799856619538456e-07, "loss": 1.3831, "step": 2123 }, { "epoch": 0.4668131868131868, "grad_norm": 0.24032074539738865, "learning_rate": 4.177822601685843e-07, "loss": 1.4185, "step": 2124 }, { "epoch": 0.46703296703296704, "grad_norm": 0.24960556255798652, "learning_rate": 4.1756593851333657e-07, "loss": 1.4808, "step": 2125 }, { "epoch": 0.46725274725274724, "grad_norm": 0.23711489610297176, "learning_rate": 4.1734960133277e-07, "loss": 1.4657, "step": 2126 }, { "epoch": 0.4674725274725275, "grad_norm": 0.32068962980375065, "learning_rate": 4.1713324873001985e-07, "loss": 1.4832, "step": 2127 }, { "epoch": 0.4676923076923077, "grad_norm": 0.2518391727634862, "learning_rate": 4.1691688080822906e-07, "loss": 1.4646, "step": 2128 }, { "epoch": 0.46791208791208794, "grad_norm": 0.24932634371386272, "learning_rate": 4.1670049767054757e-07, "loss": 1.4596, "step": 2129 }, { "epoch": 0.46813186813186813, "grad_norm": 0.251421232967717, "learning_rate": 4.16484099420133e-07, "loss": 1.509, "step": 2130 }, { "epoch": 0.46835164835164833, "grad_norm": 0.25696894067799475, "learning_rate": 4.162676861601498e-07, "loss": 1.4876, "step": 2131 }, { "epoch": 0.4685714285714286, "grad_norm": 0.23856366540520546, "learning_rate": 4.160512579937697e-07, "loss": 1.5087, "step": 2132 }, { "epoch": 0.4687912087912088, "grad_norm": 0.24999786642137214, "learning_rate": 4.1583481502417166e-07, "loss": 1.5268, "step": 2133 }, { "epoch": 0.46901098901098903, "grad_norm": 0.2463367846663434, "learning_rate": 4.156183573545415e-07, "loss": 1.4424, "step": 2134 }, { "epoch": 0.46923076923076923, "grad_norm": 0.28294377177431385, "learning_rate": 4.1540188508807225e-07, "loss": 1.4708, "step": 2135 }, { "epoch": 0.4694505494505494, "grad_norm": 0.25320109960323733, "learning_rate": 4.151853983279637e-07, "loss": 1.5059, "step": 2136 }, { "epoch": 0.4696703296703297, "grad_norm": 0.25529501987391623, "learning_rate": 4.1496889717742263e-07, "loss": 1.4886, "step": 2137 }, { "epoch": 0.4698901098901099, "grad_norm": 0.26722265943532014, "learning_rate": 4.147523817396629e-07, "loss": 1.5239, "step": 2138 }, { "epoch": 0.4701098901098901, "grad_norm": 0.2638999649207356, "learning_rate": 4.1453585211790465e-07, "loss": 1.5156, "step": 2139 }, { "epoch": 0.4703296703296703, "grad_norm": 0.25046035356861307, "learning_rate": 4.143193084153753e-07, "loss": 1.484, "step": 2140 }, { "epoch": 0.4705494505494506, "grad_norm": 0.2503101049980358, "learning_rate": 4.141027507353087e-07, "loss": 1.4616, "step": 2141 }, { "epoch": 0.4707692307692308, "grad_norm": 0.2605648383058907, "learning_rate": 4.1388617918094563e-07, "loss": 1.4535, "step": 2142 }, { "epoch": 0.47098901098901097, "grad_norm": 0.24344463446649003, "learning_rate": 4.1366959385553316e-07, "loss": 1.4759, "step": 2143 }, { "epoch": 0.4712087912087912, "grad_norm": 0.2721819098807181, "learning_rate": 4.134529948623249e-07, "loss": 1.5411, "step": 2144 }, { "epoch": 0.4714285714285714, "grad_norm": 0.2454384678281813, "learning_rate": 4.132363823045816e-07, "loss": 1.3788, "step": 2145 }, { "epoch": 0.47164835164835167, "grad_norm": 0.2521026799104069, "learning_rate": 4.130197562855696e-07, "loss": 1.4809, "step": 2146 }, { "epoch": 0.47186813186813187, "grad_norm": 0.24389100413775666, "learning_rate": 4.1280311690856234e-07, "loss": 1.4973, "step": 2147 }, { "epoch": 0.47208791208791206, "grad_norm": 0.24801335859937837, "learning_rate": 4.125864642768391e-07, "loss": 1.4622, "step": 2148 }, { "epoch": 0.4723076923076923, "grad_norm": 0.24582502995608502, "learning_rate": 4.12369798493686e-07, "loss": 1.441, "step": 2149 }, { "epoch": 0.4725274725274725, "grad_norm": 0.2556859453671942, "learning_rate": 4.121531196623951e-07, "loss": 1.4526, "step": 2150 }, { "epoch": 0.47274725274725277, "grad_norm": 0.6082569291110256, "learning_rate": 4.1193642788626474e-07, "loss": 1.4471, "step": 2151 }, { "epoch": 0.47296703296703296, "grad_norm": 0.24879058608612206, "learning_rate": 4.117197232685995e-07, "loss": 1.4823, "step": 2152 }, { "epoch": 0.47318681318681316, "grad_norm": 0.267231589560563, "learning_rate": 4.1150300591271e-07, "loss": 1.5142, "step": 2153 }, { "epoch": 0.4734065934065934, "grad_norm": 0.27099962653455645, "learning_rate": 4.1128627592191295e-07, "loss": 1.4974, "step": 2154 }, { "epoch": 0.4736263736263736, "grad_norm": 0.27173473889903393, "learning_rate": 4.1106953339953103e-07, "loss": 1.4799, "step": 2155 }, { "epoch": 0.47384615384615386, "grad_norm": 0.24540001875933956, "learning_rate": 4.10852778448893e-07, "loss": 1.453, "step": 2156 }, { "epoch": 0.47406593406593406, "grad_norm": 0.24341896446192007, "learning_rate": 4.106360111733337e-07, "loss": 1.422, "step": 2157 }, { "epoch": 0.4742857142857143, "grad_norm": 0.2605932533495345, "learning_rate": 4.1041923167619345e-07, "loss": 1.5059, "step": 2158 }, { "epoch": 0.4745054945054945, "grad_norm": 0.3739400105072832, "learning_rate": 4.102024400608186e-07, "loss": 1.434, "step": 2159 }, { "epoch": 0.4747252747252747, "grad_norm": 0.24029918941121878, "learning_rate": 4.099856364305614e-07, "loss": 1.4364, "step": 2160 }, { "epoch": 0.47494505494505496, "grad_norm": 0.25843652143322837, "learning_rate": 4.0976882088877955e-07, "loss": 1.4552, "step": 2161 }, { "epoch": 0.47516483516483515, "grad_norm": 0.36890007621991705, "learning_rate": 4.0955199353883666e-07, "loss": 1.4771, "step": 2162 }, { "epoch": 0.4753846153846154, "grad_norm": 0.2559317403094861, "learning_rate": 4.0933515448410197e-07, "loss": 1.471, "step": 2163 }, { "epoch": 0.4756043956043956, "grad_norm": 0.25008407923538156, "learning_rate": 4.0911830382795e-07, "loss": 1.5048, "step": 2164 }, { "epoch": 0.4758241758241758, "grad_norm": 0.2423485199780464, "learning_rate": 4.089014416737613e-07, "loss": 1.5425, "step": 2165 }, { "epoch": 0.47604395604395605, "grad_norm": 0.24572023579073374, "learning_rate": 4.086845681249215e-07, "loss": 1.4656, "step": 2166 }, { "epoch": 0.47626373626373625, "grad_norm": 0.27440521210145385, "learning_rate": 4.084676832848216e-07, "loss": 1.491, "step": 2167 }, { "epoch": 0.4764835164835165, "grad_norm": 0.3107747060908553, "learning_rate": 4.082507872568585e-07, "loss": 1.5429, "step": 2168 }, { "epoch": 0.4767032967032967, "grad_norm": 0.2537307127475986, "learning_rate": 4.08033880144434e-07, "loss": 1.4688, "step": 2169 }, { "epoch": 0.47692307692307695, "grad_norm": 0.2518383654657694, "learning_rate": 4.078169620509552e-07, "loss": 1.4755, "step": 2170 }, { "epoch": 0.47714285714285715, "grad_norm": 0.31769063812828074, "learning_rate": 4.076000330798345e-07, "loss": 1.4752, "step": 2171 }, { "epoch": 0.47736263736263734, "grad_norm": 0.26660323364676236, "learning_rate": 4.0738309333448964e-07, "loss": 1.4554, "step": 2172 }, { "epoch": 0.4775824175824176, "grad_norm": 0.25529946819906274, "learning_rate": 4.0716614291834327e-07, "loss": 1.5257, "step": 2173 }, { "epoch": 0.4778021978021978, "grad_norm": 0.25148741629104937, "learning_rate": 4.0694918193482317e-07, "loss": 1.4256, "step": 2174 }, { "epoch": 0.47802197802197804, "grad_norm": 0.23587639375669317, "learning_rate": 4.0673221048736234e-07, "loss": 1.4262, "step": 2175 }, { "epoch": 0.47824175824175824, "grad_norm": 0.25877167767567394, "learning_rate": 4.0651522867939863e-07, "loss": 1.4829, "step": 2176 }, { "epoch": 0.47846153846153844, "grad_norm": 0.24940377754896279, "learning_rate": 4.0629823661437485e-07, "loss": 1.4375, "step": 2177 }, { "epoch": 0.4786813186813187, "grad_norm": 0.25485570465304636, "learning_rate": 4.060812343957385e-07, "loss": 1.4751, "step": 2178 }, { "epoch": 0.4789010989010989, "grad_norm": 0.24195630123328987, "learning_rate": 4.0586422212694226e-07, "loss": 1.4817, "step": 2179 }, { "epoch": 0.47912087912087914, "grad_norm": 0.2646037576766741, "learning_rate": 4.0564719991144344e-07, "loss": 1.4383, "step": 2180 }, { "epoch": 0.47934065934065934, "grad_norm": 0.24472048765588794, "learning_rate": 4.0543016785270417e-07, "loss": 1.4679, "step": 2181 }, { "epoch": 0.47956043956043953, "grad_norm": 0.28942181978522935, "learning_rate": 4.052131260541911e-07, "loss": 1.4702, "step": 2182 }, { "epoch": 0.4797802197802198, "grad_norm": 0.24832404652400986, "learning_rate": 4.049960746193755e-07, "loss": 1.4315, "step": 2183 }, { "epoch": 0.48, "grad_norm": 0.4369544456809934, "learning_rate": 4.0477901365173375e-07, "loss": 1.4129, "step": 2184 }, { "epoch": 0.48021978021978023, "grad_norm": 0.2541583125518289, "learning_rate": 4.045619432547461e-07, "loss": 1.4702, "step": 2185 }, { "epoch": 0.48043956043956043, "grad_norm": 0.2523588685878539, "learning_rate": 4.043448635318975e-07, "loss": 1.4745, "step": 2186 }, { "epoch": 0.4806593406593407, "grad_norm": 0.2510926996602045, "learning_rate": 4.041277745866776e-07, "loss": 1.4902, "step": 2187 }, { "epoch": 0.4808791208791209, "grad_norm": 0.2592318111699245, "learning_rate": 4.039106765225803e-07, "loss": 1.49, "step": 2188 }, { "epoch": 0.4810989010989011, "grad_norm": 0.2511407064232407, "learning_rate": 4.036935694431038e-07, "loss": 1.4597, "step": 2189 }, { "epoch": 0.48131868131868133, "grad_norm": 0.24553195643338055, "learning_rate": 4.034764534517504e-07, "loss": 1.4164, "step": 2190 }, { "epoch": 0.4815384615384615, "grad_norm": 0.2544932429204096, "learning_rate": 4.032593286520272e-07, "loss": 1.4233, "step": 2191 }, { "epoch": 0.4817582417582418, "grad_norm": 0.246372819528969, "learning_rate": 4.030421951474449e-07, "loss": 1.4254, "step": 2192 }, { "epoch": 0.481978021978022, "grad_norm": 0.2444111471649207, "learning_rate": 4.028250530415188e-07, "loss": 1.502, "step": 2193 }, { "epoch": 0.48219780219780217, "grad_norm": 0.2484790732312108, "learning_rate": 4.0260790243776806e-07, "loss": 1.4628, "step": 2194 }, { "epoch": 0.4824175824175824, "grad_norm": 0.24749135295669722, "learning_rate": 4.0239074343971586e-07, "loss": 1.4266, "step": 2195 }, { "epoch": 0.4826373626373626, "grad_norm": 0.24861043417869974, "learning_rate": 4.021735761508896e-07, "loss": 1.4871, "step": 2196 }, { "epoch": 0.4828571428571429, "grad_norm": 0.2531956580470661, "learning_rate": 4.0195640067482035e-07, "loss": 1.4946, "step": 2197 }, { "epoch": 0.48307692307692307, "grad_norm": 0.25469456597604356, "learning_rate": 4.017392171150434e-07, "loss": 1.5239, "step": 2198 }, { "epoch": 0.4832967032967033, "grad_norm": 0.25846203750265634, "learning_rate": 4.015220255750977e-07, "loss": 1.4277, "step": 2199 }, { "epoch": 0.4835164835164835, "grad_norm": 0.24535480350801261, "learning_rate": 4.0130482615852604e-07, "loss": 1.4681, "step": 2200 }, { "epoch": 0.4837362637362637, "grad_norm": 0.2474661039500049, "learning_rate": 4.010876189688748e-07, "loss": 1.4978, "step": 2201 }, { "epoch": 0.48395604395604397, "grad_norm": 0.24811461610371785, "learning_rate": 4.008704041096943e-07, "loss": 1.4826, "step": 2202 }, { "epoch": 0.48417582417582417, "grad_norm": 0.23865921226506684, "learning_rate": 4.006531816845386e-07, "loss": 1.4128, "step": 2203 }, { "epoch": 0.4843956043956044, "grad_norm": 0.2536119805482763, "learning_rate": 4.004359517969651e-07, "loss": 1.5137, "step": 2204 }, { "epoch": 0.4846153846153846, "grad_norm": 0.2538459095239332, "learning_rate": 4.002187145505347e-07, "loss": 1.4692, "step": 2205 }, { "epoch": 0.4848351648351648, "grad_norm": 0.25037515727381765, "learning_rate": 4.000014700488123e-07, "loss": 1.486, "step": 2206 }, { "epoch": 0.48505494505494506, "grad_norm": 0.2534854405761311, "learning_rate": 3.9978421839536565e-07, "loss": 1.4716, "step": 2207 }, { "epoch": 0.48527472527472526, "grad_norm": 0.2516800776138757, "learning_rate": 3.995669596937664e-07, "loss": 1.4109, "step": 2208 }, { "epoch": 0.4854945054945055, "grad_norm": 0.26224105682457977, "learning_rate": 3.9934969404758916e-07, "loss": 1.502, "step": 2209 }, { "epoch": 0.4857142857142857, "grad_norm": 0.33275220074258516, "learning_rate": 3.9913242156041224e-07, "loss": 1.445, "step": 2210 }, { "epoch": 0.4859340659340659, "grad_norm": 0.2517760073190495, "learning_rate": 3.9891514233581694e-07, "loss": 1.5186, "step": 2211 }, { "epoch": 0.48615384615384616, "grad_norm": 0.24466627661225412, "learning_rate": 3.986978564773877e-07, "loss": 1.4935, "step": 2212 }, { "epoch": 0.48637362637362636, "grad_norm": 0.25131614810160363, "learning_rate": 3.984805640887124e-07, "loss": 1.4478, "step": 2213 }, { "epoch": 0.4865934065934066, "grad_norm": 0.2586676888317208, "learning_rate": 3.982632652733819e-07, "loss": 1.4601, "step": 2214 }, { "epoch": 0.4868131868131868, "grad_norm": 0.2484649524850849, "learning_rate": 3.980459601349902e-07, "loss": 1.4408, "step": 2215 }, { "epoch": 0.48703296703296706, "grad_norm": 0.25411892758832755, "learning_rate": 3.978286487771342e-07, "loss": 1.4455, "step": 2216 }, { "epoch": 0.48725274725274725, "grad_norm": 0.2527913746887048, "learning_rate": 3.9761133130341364e-07, "loss": 1.4259, "step": 2217 }, { "epoch": 0.48747252747252745, "grad_norm": 0.2524472976197228, "learning_rate": 3.9739400781743154e-07, "loss": 1.4366, "step": 2218 }, { "epoch": 0.4876923076923077, "grad_norm": 0.24749176289072802, "learning_rate": 3.9717667842279346e-07, "loss": 1.4817, "step": 2219 }, { "epoch": 0.4879120879120879, "grad_norm": 0.23411649445207888, "learning_rate": 3.96959343223108e-07, "loss": 1.4662, "step": 2220 }, { "epoch": 0.48813186813186815, "grad_norm": 0.259536911380817, "learning_rate": 3.967420023219864e-07, "loss": 1.48, "step": 2221 }, { "epoch": 0.48835164835164835, "grad_norm": 0.2492732422110916, "learning_rate": 3.965246558230426e-07, "loss": 1.4856, "step": 2222 }, { "epoch": 0.48857142857142855, "grad_norm": 0.2572209250914964, "learning_rate": 3.963073038298935e-07, "loss": 1.4741, "step": 2223 }, { "epoch": 0.4887912087912088, "grad_norm": 0.25895725275313164, "learning_rate": 3.9608994644615794e-07, "loss": 1.4347, "step": 2224 }, { "epoch": 0.489010989010989, "grad_norm": 0.245615821628319, "learning_rate": 3.9587258377545804e-07, "loss": 1.4458, "step": 2225 }, { "epoch": 0.48923076923076925, "grad_norm": 0.2563831201035074, "learning_rate": 3.9565521592141815e-07, "loss": 1.5211, "step": 2226 }, { "epoch": 0.48945054945054944, "grad_norm": 0.2541876466901656, "learning_rate": 3.9543784298766525e-07, "loss": 1.4274, "step": 2227 }, { "epoch": 0.4896703296703297, "grad_norm": 0.25398626933791496, "learning_rate": 3.952204650778282e-07, "loss": 1.4797, "step": 2228 }, { "epoch": 0.4898901098901099, "grad_norm": 0.25758327785635093, "learning_rate": 3.950030822955391e-07, "loss": 1.4612, "step": 2229 }, { "epoch": 0.4901098901098901, "grad_norm": 0.2541624023396581, "learning_rate": 3.9478569474443164e-07, "loss": 1.4715, "step": 2230 }, { "epoch": 0.49032967032967034, "grad_norm": 0.2442921249089427, "learning_rate": 3.9456830252814204e-07, "loss": 1.5103, "step": 2231 }, { "epoch": 0.49054945054945054, "grad_norm": 0.255858430011158, "learning_rate": 3.943509057503088e-07, "loss": 1.4637, "step": 2232 }, { "epoch": 0.4907692307692308, "grad_norm": 0.2507519521381344, "learning_rate": 3.9413350451457257e-07, "loss": 1.4694, "step": 2233 }, { "epoch": 0.490989010989011, "grad_norm": 0.2573155043460483, "learning_rate": 3.9391609892457617e-07, "loss": 1.4416, "step": 2234 }, { "epoch": 0.4912087912087912, "grad_norm": 0.28593458043217906, "learning_rate": 3.9369868908396425e-07, "loss": 1.4554, "step": 2235 }, { "epoch": 0.49142857142857144, "grad_norm": 0.24396185472777643, "learning_rate": 3.934812750963838e-07, "loss": 1.4465, "step": 2236 }, { "epoch": 0.49164835164835163, "grad_norm": 0.24446906800643653, "learning_rate": 3.932638570654836e-07, "loss": 1.4438, "step": 2237 }, { "epoch": 0.4918681318681319, "grad_norm": 0.23084238338734298, "learning_rate": 3.930464350949144e-07, "loss": 1.4375, "step": 2238 }, { "epoch": 0.4920879120879121, "grad_norm": 0.2528309575959943, "learning_rate": 3.9282900928832897e-07, "loss": 1.5037, "step": 2239 }, { "epoch": 0.49230769230769234, "grad_norm": 0.24839907705789768, "learning_rate": 3.9261157974938167e-07, "loss": 1.4749, "step": 2240 }, { "epoch": 0.49252747252747253, "grad_norm": 0.2492622242980725, "learning_rate": 3.9239414658172877e-07, "loss": 1.4654, "step": 2241 }, { "epoch": 0.49274725274725273, "grad_norm": 0.25282608627910996, "learning_rate": 3.921767098890282e-07, "loss": 1.4292, "step": 2242 }, { "epoch": 0.492967032967033, "grad_norm": 0.24870397375672293, "learning_rate": 3.919592697749395e-07, "loss": 1.4669, "step": 2243 }, { "epoch": 0.4931868131868132, "grad_norm": 0.2512220677317805, "learning_rate": 3.9174182634312436e-07, "loss": 1.4564, "step": 2244 }, { "epoch": 0.49340659340659343, "grad_norm": 0.24628718210590606, "learning_rate": 3.915243796972452e-07, "loss": 1.4781, "step": 2245 }, { "epoch": 0.4936263736263736, "grad_norm": 0.2611441766796232, "learning_rate": 3.913069299409667e-07, "loss": 1.5186, "step": 2246 }, { "epoch": 0.4938461538461538, "grad_norm": 0.2397190545356055, "learning_rate": 3.9108947717795463e-07, "loss": 1.4307, "step": 2247 }, { "epoch": 0.4940659340659341, "grad_norm": 0.23909845500613314, "learning_rate": 3.908720215118764e-07, "loss": 1.4208, "step": 2248 }, { "epoch": 0.4942857142857143, "grad_norm": 0.28326644231504894, "learning_rate": 3.906545630464007e-07, "loss": 1.4224, "step": 2249 }, { "epoch": 0.4945054945054945, "grad_norm": 0.24633437801473324, "learning_rate": 3.9043710188519735e-07, "loss": 1.4887, "step": 2250 }, { "epoch": 0.4947252747252747, "grad_norm": 0.23904916861557024, "learning_rate": 3.90219638131938e-07, "loss": 1.4118, "step": 2251 }, { "epoch": 0.4949450549450549, "grad_norm": 0.25677412191404325, "learning_rate": 3.90002171890295e-07, "loss": 1.5036, "step": 2252 }, { "epoch": 0.49516483516483517, "grad_norm": 0.25005759102015573, "learning_rate": 3.8978470326394214e-07, "loss": 1.4814, "step": 2253 }, { "epoch": 0.49538461538461537, "grad_norm": 0.2581993772151251, "learning_rate": 3.8956723235655426e-07, "loss": 1.4622, "step": 2254 }, { "epoch": 0.4956043956043956, "grad_norm": 0.2609294834516711, "learning_rate": 3.893497592718074e-07, "loss": 1.4481, "step": 2255 }, { "epoch": 0.4958241758241758, "grad_norm": 0.25723709715413734, "learning_rate": 3.891322841133786e-07, "loss": 1.4517, "step": 2256 }, { "epoch": 0.49604395604395607, "grad_norm": 0.25155775919517426, "learning_rate": 3.889148069849457e-07, "loss": 1.5171, "step": 2257 }, { "epoch": 0.49626373626373627, "grad_norm": 0.2620557653630709, "learning_rate": 3.8869732799018784e-07, "loss": 1.4489, "step": 2258 }, { "epoch": 0.49648351648351646, "grad_norm": 0.25002331863705707, "learning_rate": 3.884798472327848e-07, "loss": 1.5076, "step": 2259 }, { "epoch": 0.4967032967032967, "grad_norm": 0.2656186939595916, "learning_rate": 3.8826236481641695e-07, "loss": 1.5147, "step": 2260 }, { "epoch": 0.4969230769230769, "grad_norm": 0.26114526317827286, "learning_rate": 3.880448808447662e-07, "loss": 1.4475, "step": 2261 }, { "epoch": 0.49714285714285716, "grad_norm": 0.24379211496905664, "learning_rate": 3.878273954215143e-07, "loss": 1.4233, "step": 2262 }, { "epoch": 0.49736263736263736, "grad_norm": 0.242500199968574, "learning_rate": 3.876099086503445e-07, "loss": 1.4413, "step": 2263 }, { "epoch": 0.49758241758241756, "grad_norm": 0.2546738974714412, "learning_rate": 3.873924206349403e-07, "loss": 1.4875, "step": 2264 }, { "epoch": 0.4978021978021978, "grad_norm": 0.2514541612117798, "learning_rate": 3.871749314789855e-07, "loss": 1.4893, "step": 2265 }, { "epoch": 0.498021978021978, "grad_norm": 0.24914408880178057, "learning_rate": 3.8695744128616505e-07, "loss": 1.3969, "step": 2266 }, { "epoch": 0.49824175824175826, "grad_norm": 0.2474244656953845, "learning_rate": 3.867399501601641e-07, "loss": 1.486, "step": 2267 }, { "epoch": 0.49846153846153846, "grad_norm": 0.24438263305872812, "learning_rate": 3.865224582046682e-07, "loss": 1.419, "step": 2268 }, { "epoch": 0.4986813186813187, "grad_norm": 0.2629258652096395, "learning_rate": 3.8630496552336336e-07, "loss": 1.5157, "step": 2269 }, { "epoch": 0.4989010989010989, "grad_norm": 0.2409382566658195, "learning_rate": 3.8608747221993594e-07, "loss": 1.4329, "step": 2270 }, { "epoch": 0.4991208791208791, "grad_norm": 0.24623568377332633, "learning_rate": 3.858699783980726e-07, "loss": 1.4415, "step": 2271 }, { "epoch": 0.49934065934065935, "grad_norm": 0.2539052101987305, "learning_rate": 3.8565248416146007e-07, "loss": 1.4709, "step": 2272 }, { "epoch": 0.49956043956043955, "grad_norm": 0.24133238747541236, "learning_rate": 3.8543498961378567e-07, "loss": 1.4745, "step": 2273 }, { "epoch": 0.4997802197802198, "grad_norm": 0.2557776072848505, "learning_rate": 3.852174948587365e-07, "loss": 1.4667, "step": 2274 }, { "epoch": 0.5, "grad_norm": 0.5003240500311481, "learning_rate": 3.85e-07, "loss": 1.4184, "step": 2275 }, { "epoch": 0.5002197802197802, "grad_norm": 0.24841473830266936, "learning_rate": 3.8478250514126357e-07, "loss": 1.4785, "step": 2276 }, { "epoch": 0.5004395604395604, "grad_norm": 0.24033681185054634, "learning_rate": 3.8456501038621437e-07, "loss": 1.4664, "step": 2277 }, { "epoch": 0.5006593406593407, "grad_norm": 0.25073011844641474, "learning_rate": 3.8434751583853986e-07, "loss": 1.4608, "step": 2278 }, { "epoch": 0.5008791208791209, "grad_norm": 0.26313066571800764, "learning_rate": 3.8413002160192746e-07, "loss": 1.4739, "step": 2279 }, { "epoch": 0.5010989010989011, "grad_norm": 0.24628034670922833, "learning_rate": 3.8391252778006415e-07, "loss": 1.4514, "step": 2280 }, { "epoch": 0.5013186813186813, "grad_norm": 0.25819247247267896, "learning_rate": 3.836950344766366e-07, "loss": 1.4775, "step": 2281 }, { "epoch": 0.5015384615384615, "grad_norm": 0.32968273982251245, "learning_rate": 3.8347754179533187e-07, "loss": 1.4397, "step": 2282 }, { "epoch": 0.5017582417582418, "grad_norm": 0.25337506380009656, "learning_rate": 3.8326004983983584e-07, "loss": 1.4456, "step": 2283 }, { "epoch": 0.501978021978022, "grad_norm": 0.2509531393309275, "learning_rate": 3.83042558713835e-07, "loss": 1.4818, "step": 2284 }, { "epoch": 0.5021978021978022, "grad_norm": 0.2554138807544182, "learning_rate": 3.8282506852101446e-07, "loss": 1.4299, "step": 2285 }, { "epoch": 0.5024175824175824, "grad_norm": 0.24884345900933796, "learning_rate": 3.8260757936505983e-07, "loss": 1.4719, "step": 2286 }, { "epoch": 0.5026373626373626, "grad_norm": 0.2567318242500976, "learning_rate": 3.8239009134965556e-07, "loss": 1.5205, "step": 2287 }, { "epoch": 0.5028571428571429, "grad_norm": 0.28827141519980987, "learning_rate": 3.821726045784856e-07, "loss": 1.4994, "step": 2288 }, { "epoch": 0.5030769230769231, "grad_norm": 0.25402458703562336, "learning_rate": 3.8195511915523386e-07, "loss": 1.4988, "step": 2289 }, { "epoch": 0.5032967032967033, "grad_norm": 0.2568810361409908, "learning_rate": 3.8173763518358304e-07, "loss": 1.5159, "step": 2290 }, { "epoch": 0.5035164835164835, "grad_norm": 0.2509320041521234, "learning_rate": 3.8152015276721535e-07, "loss": 1.4442, "step": 2291 }, { "epoch": 0.5037362637362638, "grad_norm": 0.24210277332773947, "learning_rate": 3.813026720098121e-07, "loss": 1.473, "step": 2292 }, { "epoch": 0.503956043956044, "grad_norm": 0.23952405663008583, "learning_rate": 3.810851930150542e-07, "loss": 1.3874, "step": 2293 }, { "epoch": 0.5041758241758242, "grad_norm": 0.24393557987251238, "learning_rate": 3.8086771588662143e-07, "loss": 1.4558, "step": 2294 }, { "epoch": 0.5043956043956044, "grad_norm": 0.23736793767225717, "learning_rate": 3.806502407281926e-07, "loss": 1.4779, "step": 2295 }, { "epoch": 0.5046153846153846, "grad_norm": 0.2532361984171458, "learning_rate": 3.804327676434458e-07, "loss": 1.4287, "step": 2296 }, { "epoch": 0.5048351648351649, "grad_norm": 0.26654186227997895, "learning_rate": 3.8021529673605785e-07, "loss": 1.5512, "step": 2297 }, { "epoch": 0.5050549450549451, "grad_norm": 0.2427767085812242, "learning_rate": 3.799978281097051e-07, "loss": 1.4286, "step": 2298 }, { "epoch": 0.5052747252747253, "grad_norm": 0.26393950646934833, "learning_rate": 3.7978036186806204e-07, "loss": 1.5118, "step": 2299 }, { "epoch": 0.5054945054945055, "grad_norm": 0.27442419825958025, "learning_rate": 3.7956289811480263e-07, "loss": 1.5133, "step": 2300 }, { "epoch": 0.5057142857142857, "grad_norm": 0.25219356781392155, "learning_rate": 3.7934543695359947e-07, "loss": 1.4664, "step": 2301 }, { "epoch": 0.505934065934066, "grad_norm": 0.2552058890419149, "learning_rate": 3.7912797848812355e-07, "loss": 1.4977, "step": 2302 }, { "epoch": 0.5061538461538462, "grad_norm": 0.24694517782102704, "learning_rate": 3.789105228220453e-07, "loss": 1.4942, "step": 2303 }, { "epoch": 0.5063736263736264, "grad_norm": 0.25489159416406026, "learning_rate": 3.7869307005903325e-07, "loss": 1.4696, "step": 2304 }, { "epoch": 0.5065934065934066, "grad_norm": 0.25201538035174836, "learning_rate": 3.784756203027548e-07, "loss": 1.4254, "step": 2305 }, { "epoch": 0.5068131868131868, "grad_norm": 0.2738964359143157, "learning_rate": 3.7825817365687573e-07, "loss": 1.4718, "step": 2306 }, { "epoch": 0.5070329670329671, "grad_norm": 0.251695545088705, "learning_rate": 3.780407302250605e-07, "loss": 1.4369, "step": 2307 }, { "epoch": 0.5072527472527473, "grad_norm": 0.24599154493338135, "learning_rate": 3.7782329011097185e-07, "loss": 1.4624, "step": 2308 }, { "epoch": 0.5074725274725275, "grad_norm": 0.24667844501736866, "learning_rate": 3.776058534182713e-07, "loss": 1.4633, "step": 2309 }, { "epoch": 0.5076923076923077, "grad_norm": 0.29337737433785255, "learning_rate": 3.773884202506184e-07, "loss": 1.4483, "step": 2310 }, { "epoch": 0.5079120879120879, "grad_norm": 0.25017157663708117, "learning_rate": 3.7717099071167107e-07, "loss": 1.4527, "step": 2311 }, { "epoch": 0.5081318681318682, "grad_norm": 0.2493125023934487, "learning_rate": 3.7695356490508547e-07, "loss": 1.5364, "step": 2312 }, { "epoch": 0.5083516483516484, "grad_norm": 0.2552680950194569, "learning_rate": 3.767361429345164e-07, "loss": 1.3963, "step": 2313 }, { "epoch": 0.5085714285714286, "grad_norm": 0.2527747074759333, "learning_rate": 3.7651872490361625e-07, "loss": 1.4298, "step": 2314 }, { "epoch": 0.5087912087912088, "grad_norm": 0.270320247819433, "learning_rate": 3.763013109160358e-07, "loss": 1.4305, "step": 2315 }, { "epoch": 0.509010989010989, "grad_norm": 0.2501930406554101, "learning_rate": 3.760839010754239e-07, "loss": 1.4205, "step": 2316 }, { "epoch": 0.5092307692307693, "grad_norm": 0.24627255884646854, "learning_rate": 3.7586649548542736e-07, "loss": 1.4634, "step": 2317 }, { "epoch": 0.5094505494505495, "grad_norm": 0.2594413697324097, "learning_rate": 3.756490942496912e-07, "loss": 1.4296, "step": 2318 }, { "epoch": 0.5096703296703297, "grad_norm": 0.241443154074072, "learning_rate": 3.75431697471858e-07, "loss": 1.4214, "step": 2319 }, { "epoch": 0.5098901098901099, "grad_norm": 0.32001564128841054, "learning_rate": 3.7521430525556846e-07, "loss": 1.491, "step": 2320 }, { "epoch": 0.5101098901098902, "grad_norm": 0.24272898905556806, "learning_rate": 3.7499691770446083e-07, "loss": 1.432, "step": 2321 }, { "epoch": 0.5103296703296704, "grad_norm": 0.24399153979616894, "learning_rate": 3.7477953492217173e-07, "loss": 1.4484, "step": 2322 }, { "epoch": 0.5105494505494506, "grad_norm": 0.2424576699450368, "learning_rate": 3.7456215701233473e-07, "loss": 1.5475, "step": 2323 }, { "epoch": 0.5107692307692308, "grad_norm": 0.25864494452289266, "learning_rate": 3.743447840785818e-07, "loss": 1.4577, "step": 2324 }, { "epoch": 0.510989010989011, "grad_norm": 0.24964225806607046, "learning_rate": 3.7412741622454205e-07, "loss": 1.4903, "step": 2325 }, { "epoch": 0.5112087912087913, "grad_norm": 0.24474201190457495, "learning_rate": 3.7391005355384205e-07, "loss": 1.4525, "step": 2326 }, { "epoch": 0.5114285714285715, "grad_norm": 0.24910505365666538, "learning_rate": 3.7369269617010655e-07, "loss": 1.4498, "step": 2327 }, { "epoch": 0.5116483516483517, "grad_norm": 0.2393738418338155, "learning_rate": 3.7347534417695734e-07, "loss": 1.4433, "step": 2328 }, { "epoch": 0.5118681318681318, "grad_norm": 0.24786669020425164, "learning_rate": 3.7325799767801365e-07, "loss": 1.4739, "step": 2329 }, { "epoch": 0.512087912087912, "grad_norm": 0.2495661564832968, "learning_rate": 3.7304065677689206e-07, "loss": 1.4453, "step": 2330 }, { "epoch": 0.5123076923076924, "grad_norm": 0.25039328270074224, "learning_rate": 3.728233215772065e-07, "loss": 1.4564, "step": 2331 }, { "epoch": 0.5125274725274725, "grad_norm": 0.24682075686811833, "learning_rate": 3.726059921825685e-07, "loss": 1.4693, "step": 2332 }, { "epoch": 0.5127472527472527, "grad_norm": 0.2539640526472173, "learning_rate": 3.723886686965864e-07, "loss": 1.51, "step": 2333 }, { "epoch": 0.5129670329670329, "grad_norm": 0.238444959442042, "learning_rate": 3.7217135122286586e-07, "loss": 1.3958, "step": 2334 }, { "epoch": 0.5131868131868131, "grad_norm": 0.24104629604246275, "learning_rate": 3.719540398650099e-07, "loss": 1.4758, "step": 2335 }, { "epoch": 0.5134065934065934, "grad_norm": 0.3270237259906149, "learning_rate": 3.717367347266181e-07, "loss": 1.5671, "step": 2336 }, { "epoch": 0.5136263736263736, "grad_norm": 0.2456132693039993, "learning_rate": 3.7151943591128765e-07, "loss": 1.4872, "step": 2337 }, { "epoch": 0.5138461538461538, "grad_norm": 0.2448230376335242, "learning_rate": 3.7130214352261233e-07, "loss": 1.4548, "step": 2338 }, { "epoch": 0.514065934065934, "grad_norm": 0.24191589322092183, "learning_rate": 3.7108485766418315e-07, "loss": 1.4354, "step": 2339 }, { "epoch": 0.5142857142857142, "grad_norm": 0.26527616434014223, "learning_rate": 3.7086757843958796e-07, "loss": 1.4766, "step": 2340 }, { "epoch": 0.5145054945054945, "grad_norm": 0.2401466273453965, "learning_rate": 3.706503059524108e-07, "loss": 1.4633, "step": 2341 }, { "epoch": 0.5147252747252747, "grad_norm": 0.2476514638882906, "learning_rate": 3.704330403062336e-07, "loss": 1.491, "step": 2342 }, { "epoch": 0.5149450549450549, "grad_norm": 0.25424091238816915, "learning_rate": 3.702157816046344e-07, "loss": 1.4508, "step": 2343 }, { "epoch": 0.5151648351648351, "grad_norm": 0.2404497274348749, "learning_rate": 3.6999852995118784e-07, "loss": 1.4898, "step": 2344 }, { "epoch": 0.5153846153846153, "grad_norm": 0.2575112655941573, "learning_rate": 3.6978128544946524e-07, "loss": 1.4813, "step": 2345 }, { "epoch": 0.5156043956043956, "grad_norm": 0.35965713375661085, "learning_rate": 3.6956404820303496e-07, "loss": 1.4244, "step": 2346 }, { "epoch": 0.5158241758241758, "grad_norm": 0.25568518865603723, "learning_rate": 3.693468183154614e-07, "loss": 1.4547, "step": 2347 }, { "epoch": 0.516043956043956, "grad_norm": 0.254028026851914, "learning_rate": 3.6912959589030573e-07, "loss": 1.484, "step": 2348 }, { "epoch": 0.5162637362637362, "grad_norm": 0.26235289216398056, "learning_rate": 3.689123810311252e-07, "loss": 1.5028, "step": 2349 }, { "epoch": 0.5164835164835165, "grad_norm": 0.24235662101047564, "learning_rate": 3.68695173841474e-07, "loss": 1.4727, "step": 2350 }, { "epoch": 0.5167032967032967, "grad_norm": 0.24435031157677387, "learning_rate": 3.6847797442490226e-07, "loss": 1.4743, "step": 2351 }, { "epoch": 0.5169230769230769, "grad_norm": 0.2460452477834752, "learning_rate": 3.6826078288495654e-07, "loss": 1.4337, "step": 2352 }, { "epoch": 0.5171428571428571, "grad_norm": 0.23683375278787086, "learning_rate": 3.680435993251796e-07, "loss": 1.4824, "step": 2353 }, { "epoch": 0.5173626373626373, "grad_norm": 0.26491693796020277, "learning_rate": 3.678264238491105e-07, "loss": 1.48, "step": 2354 }, { "epoch": 0.5175824175824176, "grad_norm": 0.27656080165220287, "learning_rate": 3.6760925656028407e-07, "loss": 1.4828, "step": 2355 }, { "epoch": 0.5178021978021978, "grad_norm": 0.2538831484175234, "learning_rate": 3.67392097562232e-07, "loss": 1.4679, "step": 2356 }, { "epoch": 0.518021978021978, "grad_norm": 0.24301818447540044, "learning_rate": 3.671749469584811e-07, "loss": 1.5261, "step": 2357 }, { "epoch": 0.5182417582417582, "grad_norm": 0.6295033281908831, "learning_rate": 3.669578048525551e-07, "loss": 1.482, "step": 2358 }, { "epoch": 0.5184615384615384, "grad_norm": 0.24685397208179988, "learning_rate": 3.667406713479729e-07, "loss": 1.4516, "step": 2359 }, { "epoch": 0.5186813186813187, "grad_norm": 0.24234241454696095, "learning_rate": 3.6652354654824957e-07, "loss": 1.467, "step": 2360 }, { "epoch": 0.5189010989010989, "grad_norm": 0.2453436935073063, "learning_rate": 3.6630643055689623e-07, "loss": 1.477, "step": 2361 }, { "epoch": 0.5191208791208791, "grad_norm": 0.25091548919956697, "learning_rate": 3.6608932347741974e-07, "loss": 1.474, "step": 2362 }, { "epoch": 0.5193406593406593, "grad_norm": 0.26462417732227744, "learning_rate": 3.6587222541332244e-07, "loss": 1.4865, "step": 2363 }, { "epoch": 0.5195604395604395, "grad_norm": 0.23547926738031363, "learning_rate": 3.656551364681026e-07, "loss": 1.4548, "step": 2364 }, { "epoch": 0.5197802197802198, "grad_norm": 0.25248058408821283, "learning_rate": 3.65438056745254e-07, "loss": 1.4881, "step": 2365 }, { "epoch": 0.52, "grad_norm": 0.24639762570360496, "learning_rate": 3.652209863482663e-07, "loss": 1.4333, "step": 2366 }, { "epoch": 0.5202197802197802, "grad_norm": 0.2384335820996811, "learning_rate": 3.6500392538062447e-07, "loss": 1.4922, "step": 2367 }, { "epoch": 0.5204395604395604, "grad_norm": 0.23938737681849095, "learning_rate": 3.6478687394580897e-07, "loss": 1.4098, "step": 2368 }, { "epoch": 0.5206593406593406, "grad_norm": 0.25775289904758764, "learning_rate": 3.645698321472959e-07, "loss": 1.5393, "step": 2369 }, { "epoch": 0.5208791208791209, "grad_norm": 0.2534509396457383, "learning_rate": 3.643528000885565e-07, "loss": 1.5131, "step": 2370 }, { "epoch": 0.5210989010989011, "grad_norm": 0.25161087694382234, "learning_rate": 3.641357778730578e-07, "loss": 1.5225, "step": 2371 }, { "epoch": 0.5213186813186813, "grad_norm": 0.2531132919378715, "learning_rate": 3.6391876560426156e-07, "loss": 1.3885, "step": 2372 }, { "epoch": 0.5215384615384615, "grad_norm": 0.27678118391000595, "learning_rate": 3.637017633856253e-07, "loss": 1.4796, "step": 2373 }, { "epoch": 0.5217582417582417, "grad_norm": 0.3372078287193749, "learning_rate": 3.6348477132060146e-07, "loss": 1.4785, "step": 2374 }, { "epoch": 0.521978021978022, "grad_norm": 0.3654889146461077, "learning_rate": 3.6326778951263765e-07, "loss": 1.4112, "step": 2375 }, { "epoch": 0.5221978021978022, "grad_norm": 0.2539579037334113, "learning_rate": 3.6305081806517676e-07, "loss": 1.5093, "step": 2376 }, { "epoch": 0.5224175824175824, "grad_norm": 0.24155137687989628, "learning_rate": 3.628338570816568e-07, "loss": 1.4028, "step": 2377 }, { "epoch": 0.5226373626373626, "grad_norm": 0.29159047625470025, "learning_rate": 3.626169066655105e-07, "loss": 1.5091, "step": 2378 }, { "epoch": 0.5228571428571429, "grad_norm": 0.25993978508890914, "learning_rate": 3.623999669201655e-07, "loss": 1.4699, "step": 2379 }, { "epoch": 0.5230769230769231, "grad_norm": 0.25977530015379996, "learning_rate": 3.621830379490449e-07, "loss": 1.4666, "step": 2380 }, { "epoch": 0.5232967032967033, "grad_norm": 0.2549597264906517, "learning_rate": 3.61966119855566e-07, "loss": 1.4371, "step": 2381 }, { "epoch": 0.5235164835164835, "grad_norm": 0.28112919059338826, "learning_rate": 3.6174921274314146e-07, "loss": 1.427, "step": 2382 }, { "epoch": 0.5237362637362637, "grad_norm": 0.25240146760536947, "learning_rate": 3.615323167151783e-07, "loss": 1.4486, "step": 2383 }, { "epoch": 0.523956043956044, "grad_norm": 0.2664580405551836, "learning_rate": 3.6131543187507853e-07, "loss": 1.4713, "step": 2384 }, { "epoch": 0.5241758241758242, "grad_norm": 0.2500998703829329, "learning_rate": 3.610985583262387e-07, "loss": 1.5272, "step": 2385 }, { "epoch": 0.5243956043956044, "grad_norm": 0.2525705996196096, "learning_rate": 3.6088169617204997e-07, "loss": 1.4412, "step": 2386 }, { "epoch": 0.5246153846153846, "grad_norm": 0.25000485621733065, "learning_rate": 3.6066484551589807e-07, "loss": 1.4597, "step": 2387 }, { "epoch": 0.5248351648351648, "grad_norm": 0.2410643399310685, "learning_rate": 3.604480064611634e-07, "loss": 1.4532, "step": 2388 }, { "epoch": 0.5250549450549451, "grad_norm": 0.24446312178585083, "learning_rate": 3.6023117911122043e-07, "loss": 1.5031, "step": 2389 }, { "epoch": 0.5252747252747253, "grad_norm": 0.3362095219381454, "learning_rate": 3.6001436356943865e-07, "loss": 1.5374, "step": 2390 }, { "epoch": 0.5254945054945055, "grad_norm": 0.30150323793043726, "learning_rate": 3.597975599391814e-07, "loss": 1.4848, "step": 2391 }, { "epoch": 0.5257142857142857, "grad_norm": 0.24788575059663162, "learning_rate": 3.595807683238066e-07, "loss": 1.4805, "step": 2392 }, { "epoch": 0.5259340659340659, "grad_norm": 0.2607277845856362, "learning_rate": 3.5936398882666633e-07, "loss": 1.4144, "step": 2393 }, { "epoch": 0.5261538461538462, "grad_norm": 0.24685913931142686, "learning_rate": 3.591472215511069e-07, "loss": 1.4582, "step": 2394 }, { "epoch": 0.5263736263736264, "grad_norm": 0.5962223978399409, "learning_rate": 3.58930466600469e-07, "loss": 1.4715, "step": 2395 }, { "epoch": 0.5265934065934066, "grad_norm": 0.26155916145065927, "learning_rate": 3.587137240780871e-07, "loss": 1.4348, "step": 2396 }, { "epoch": 0.5268131868131868, "grad_norm": 0.25033706471393447, "learning_rate": 3.5849699408729007e-07, "loss": 1.4629, "step": 2397 }, { "epoch": 0.527032967032967, "grad_norm": 0.24636054493260834, "learning_rate": 3.582802767314005e-07, "loss": 1.4718, "step": 2398 }, { "epoch": 0.5272527472527473, "grad_norm": 0.26061079364117784, "learning_rate": 3.580635721137353e-07, "loss": 1.425, "step": 2399 }, { "epoch": 0.5274725274725275, "grad_norm": 0.27835332660121304, "learning_rate": 3.5784688033760484e-07, "loss": 1.4561, "step": 2400 }, { "epoch": 0.5276923076923077, "grad_norm": 0.24729440919091983, "learning_rate": 3.5763020150631404e-07, "loss": 1.4787, "step": 2401 }, { "epoch": 0.5279120879120879, "grad_norm": 0.24420608479735176, "learning_rate": 3.574135357231609e-07, "loss": 1.4738, "step": 2402 }, { "epoch": 0.5281318681318682, "grad_norm": 0.2557867687587444, "learning_rate": 3.5719688309143775e-07, "loss": 1.509, "step": 2403 }, { "epoch": 0.5283516483516484, "grad_norm": 0.25074565043670655, "learning_rate": 3.569802437144304e-07, "loss": 1.524, "step": 2404 }, { "epoch": 0.5285714285714286, "grad_norm": 0.2461662436249225, "learning_rate": 3.5676361769541846e-07, "loss": 1.3901, "step": 2405 }, { "epoch": 0.5287912087912088, "grad_norm": 0.24870628819652119, "learning_rate": 3.565470051376751e-07, "loss": 1.4639, "step": 2406 }, { "epoch": 0.529010989010989, "grad_norm": 0.2578878580655407, "learning_rate": 3.5633040614446693e-07, "loss": 1.4337, "step": 2407 }, { "epoch": 0.5292307692307693, "grad_norm": 0.24416041120687818, "learning_rate": 3.561138208190543e-07, "loss": 1.4747, "step": 2408 }, { "epoch": 0.5294505494505495, "grad_norm": 0.25531378396760357, "learning_rate": 3.558972492646912e-07, "loss": 1.4904, "step": 2409 }, { "epoch": 0.5296703296703297, "grad_norm": 0.2363396760204816, "learning_rate": 3.556806915846247e-07, "loss": 1.4463, "step": 2410 }, { "epoch": 0.5298901098901099, "grad_norm": 0.24943120497323146, "learning_rate": 3.554641478820954e-07, "loss": 1.4236, "step": 2411 }, { "epoch": 0.5301098901098901, "grad_norm": 0.24732171440482204, "learning_rate": 3.552476182603372e-07, "loss": 1.4489, "step": 2412 }, { "epoch": 0.5303296703296704, "grad_norm": 0.24306256922666883, "learning_rate": 3.5503110282257725e-07, "loss": 1.4377, "step": 2413 }, { "epoch": 0.5305494505494506, "grad_norm": 0.24425677091309583, "learning_rate": 3.5481460167203626e-07, "loss": 1.4757, "step": 2414 }, { "epoch": 0.5307692307692308, "grad_norm": 0.2529037931280104, "learning_rate": 3.545981149119277e-07, "loss": 1.4868, "step": 2415 }, { "epoch": 0.530989010989011, "grad_norm": 0.24632188014095946, "learning_rate": 3.5438164264545854e-07, "loss": 1.4856, "step": 2416 }, { "epoch": 0.5312087912087912, "grad_norm": 0.2513676700213414, "learning_rate": 3.5416518497582844e-07, "loss": 1.4474, "step": 2417 }, { "epoch": 0.5314285714285715, "grad_norm": 0.24316858435167316, "learning_rate": 3.5394874200623025e-07, "loss": 1.4611, "step": 2418 }, { "epoch": 0.5316483516483517, "grad_norm": 0.30859414643629013, "learning_rate": 3.537323138398502e-07, "loss": 1.4665, "step": 2419 }, { "epoch": 0.5318681318681319, "grad_norm": 0.25132634887271454, "learning_rate": 3.5351590057986706e-07, "loss": 1.4642, "step": 2420 }, { "epoch": 0.5320879120879121, "grad_norm": 0.24990117582506366, "learning_rate": 3.5329950232945247e-07, "loss": 1.4652, "step": 2421 }, { "epoch": 0.5323076923076923, "grad_norm": 0.25409742429800175, "learning_rate": 3.5308311919177103e-07, "loss": 1.471, "step": 2422 }, { "epoch": 0.5325274725274726, "grad_norm": 0.32455414234173374, "learning_rate": 3.5286675126998003e-07, "loss": 1.4346, "step": 2423 }, { "epoch": 0.5327472527472528, "grad_norm": 0.3291216222833815, "learning_rate": 3.5265039866722997e-07, "loss": 1.4416, "step": 2424 }, { "epoch": 0.532967032967033, "grad_norm": 0.39783825599437156, "learning_rate": 3.5243406148666347e-07, "loss": 1.475, "step": 2425 }, { "epoch": 0.5331868131868132, "grad_norm": 0.24359859701096273, "learning_rate": 3.522177398314159e-07, "loss": 1.5323, "step": 2426 }, { "epoch": 0.5334065934065934, "grad_norm": 0.2505957093291331, "learning_rate": 3.5200143380461553e-07, "loss": 1.5131, "step": 2427 }, { "epoch": 0.5336263736263737, "grad_norm": 0.2476887249243456, "learning_rate": 3.517851435093827e-07, "loss": 1.4365, "step": 2428 }, { "epoch": 0.5338461538461539, "grad_norm": 0.251634764989001, "learning_rate": 3.5156886904883096e-07, "loss": 1.5045, "step": 2429 }, { "epoch": 0.5340659340659341, "grad_norm": 0.24693035470009495, "learning_rate": 3.5135261052606556e-07, "loss": 1.4867, "step": 2430 }, { "epoch": 0.5342857142857143, "grad_norm": 0.2510114323493616, "learning_rate": 3.511363680441847e-07, "loss": 1.4704, "step": 2431 }, { "epoch": 0.5345054945054946, "grad_norm": 0.24413830266218428, "learning_rate": 3.5092014170627847e-07, "loss": 1.4791, "step": 2432 }, { "epoch": 0.5347252747252748, "grad_norm": 0.2549529871977361, "learning_rate": 3.5070393161542974e-07, "loss": 1.4105, "step": 2433 }, { "epoch": 0.534945054945055, "grad_norm": 0.24085825020125307, "learning_rate": 3.504877378747133e-07, "loss": 1.4169, "step": 2434 }, { "epoch": 0.5351648351648352, "grad_norm": 0.2460619199237114, "learning_rate": 3.5027156058719635e-07, "loss": 1.4424, "step": 2435 }, { "epoch": 0.5353846153846153, "grad_norm": 0.26898264466723765, "learning_rate": 3.50055399855938e-07, "loss": 1.4813, "step": 2436 }, { "epoch": 0.5356043956043957, "grad_norm": 0.23253205429484922, "learning_rate": 3.4983925578398957e-07, "loss": 1.416, "step": 2437 }, { "epoch": 0.5358241758241759, "grad_norm": 0.31045840835273486, "learning_rate": 3.496231284743946e-07, "loss": 1.4473, "step": 2438 }, { "epoch": 0.536043956043956, "grad_norm": 0.24932598271297657, "learning_rate": 3.494070180301886e-07, "loss": 1.4535, "step": 2439 }, { "epoch": 0.5362637362637362, "grad_norm": 0.25065753370385874, "learning_rate": 3.491909245543989e-07, "loss": 1.4426, "step": 2440 }, { "epoch": 0.5364835164835164, "grad_norm": 0.26493054254675447, "learning_rate": 3.4897484815004457e-07, "loss": 1.5287, "step": 2441 }, { "epoch": 0.5367032967032968, "grad_norm": 0.2609973851037939, "learning_rate": 3.4875878892013703e-07, "loss": 1.4704, "step": 2442 }, { "epoch": 0.536923076923077, "grad_norm": 0.25556647371724434, "learning_rate": 3.4854274696767927e-07, "loss": 1.4117, "step": 2443 }, { "epoch": 0.5371428571428571, "grad_norm": 0.26521611235385095, "learning_rate": 3.4832672239566584e-07, "loss": 1.4343, "step": 2444 }, { "epoch": 0.5373626373626373, "grad_norm": 0.24173566804949828, "learning_rate": 3.481107153070833e-07, "loss": 1.435, "step": 2445 }, { "epoch": 0.5375824175824175, "grad_norm": 0.24424325127841187, "learning_rate": 3.4789472580490975e-07, "loss": 1.45, "step": 2446 }, { "epoch": 0.5378021978021978, "grad_norm": 0.244678264719358, "learning_rate": 3.476787539921149e-07, "loss": 1.4613, "step": 2447 }, { "epoch": 0.538021978021978, "grad_norm": 0.25787588823262375, "learning_rate": 3.4746279997166003e-07, "loss": 1.5005, "step": 2448 }, { "epoch": 0.5382417582417582, "grad_norm": 0.23894036858444787, "learning_rate": 3.472468638464981e-07, "loss": 1.4903, "step": 2449 }, { "epoch": 0.5384615384615384, "grad_norm": 0.2637014488813191, "learning_rate": 3.470309457195733e-07, "loss": 1.4404, "step": 2450 }, { "epoch": 0.5386813186813186, "grad_norm": 0.24646079245948196, "learning_rate": 3.468150456938213e-07, "loss": 1.5028, "step": 2451 }, { "epoch": 0.5389010989010989, "grad_norm": 0.2573309172419129, "learning_rate": 3.46599163872169e-07, "loss": 1.4448, "step": 2452 }, { "epoch": 0.5391208791208791, "grad_norm": 0.2589449655414008, "learning_rate": 3.463833003575352e-07, "loss": 1.445, "step": 2453 }, { "epoch": 0.5393406593406593, "grad_norm": 0.5852414658697316, "learning_rate": 3.4616745525282934e-07, "loss": 1.4688, "step": 2454 }, { "epoch": 0.5395604395604395, "grad_norm": 0.2519595272985753, "learning_rate": 3.459516286609524e-07, "loss": 1.4515, "step": 2455 }, { "epoch": 0.5397802197802197, "grad_norm": 0.2900347008805642, "learning_rate": 3.457358206847962e-07, "loss": 1.4487, "step": 2456 }, { "epoch": 0.54, "grad_norm": 0.25661106951172385, "learning_rate": 3.455200314272441e-07, "loss": 1.4514, "step": 2457 }, { "epoch": 0.5402197802197802, "grad_norm": 0.25732154087119125, "learning_rate": 3.453042609911706e-07, "loss": 1.4282, "step": 2458 }, { "epoch": 0.5404395604395604, "grad_norm": 0.24889408132909518, "learning_rate": 3.4508850947944057e-07, "loss": 1.4445, "step": 2459 }, { "epoch": 0.5406593406593406, "grad_norm": 0.2701462310034605, "learning_rate": 3.4487277699491056e-07, "loss": 1.4432, "step": 2460 }, { "epoch": 0.5408791208791209, "grad_norm": 0.2553952628077091, "learning_rate": 3.4465706364042764e-07, "loss": 1.5246, "step": 2461 }, { "epoch": 0.5410989010989011, "grad_norm": 0.27429146561848944, "learning_rate": 3.4444136951883004e-07, "loss": 1.4802, "step": 2462 }, { "epoch": 0.5413186813186813, "grad_norm": 0.3847981198035457, "learning_rate": 3.442256947329466e-07, "loss": 1.426, "step": 2463 }, { "epoch": 0.5415384615384615, "grad_norm": 0.2505304096279959, "learning_rate": 3.4401003938559693e-07, "loss": 1.4251, "step": 2464 }, { "epoch": 0.5417582417582417, "grad_norm": 0.2811376482113325, "learning_rate": 3.437944035795915e-07, "loss": 1.4811, "step": 2465 }, { "epoch": 0.541978021978022, "grad_norm": 0.25685745384779063, "learning_rate": 3.4357878741773145e-07, "loss": 1.5058, "step": 2466 }, { "epoch": 0.5421978021978022, "grad_norm": 0.2589697373282326, "learning_rate": 3.433631910028086e-07, "loss": 1.4322, "step": 2467 }, { "epoch": 0.5424175824175824, "grad_norm": 0.2653156802744679, "learning_rate": 3.431476144376051e-07, "loss": 1.4406, "step": 2468 }, { "epoch": 0.5426373626373626, "grad_norm": 0.2425175948787124, "learning_rate": 3.4293205782489406e-07, "loss": 1.4273, "step": 2469 }, { "epoch": 0.5428571428571428, "grad_norm": 0.28131479717165775, "learning_rate": 3.4271652126743866e-07, "loss": 1.4875, "step": 2470 }, { "epoch": 0.5430769230769231, "grad_norm": 0.2377766036111552, "learning_rate": 3.4250100486799256e-07, "loss": 1.4524, "step": 2471 }, { "epoch": 0.5432967032967033, "grad_norm": 0.2811295000176585, "learning_rate": 3.422855087293002e-07, "loss": 1.4719, "step": 2472 }, { "epoch": 0.5435164835164835, "grad_norm": 0.2533281053392722, "learning_rate": 3.4207003295409617e-07, "loss": 1.4821, "step": 2473 }, { "epoch": 0.5437362637362637, "grad_norm": 0.2523850823471961, "learning_rate": 3.418545776451051e-07, "loss": 1.4996, "step": 2474 }, { "epoch": 0.5439560439560439, "grad_norm": 0.26054012455088077, "learning_rate": 3.416391429050421e-07, "loss": 1.4399, "step": 2475 }, { "epoch": 0.5441758241758242, "grad_norm": 0.2623042118045894, "learning_rate": 3.4142372883661224e-07, "loss": 1.4756, "step": 2476 }, { "epoch": 0.5443956043956044, "grad_norm": 0.2523932975509699, "learning_rate": 3.412083355425113e-07, "loss": 1.5235, "step": 2477 }, { "epoch": 0.5446153846153846, "grad_norm": 0.24750151171386292, "learning_rate": 3.4099296312542465e-07, "loss": 1.4234, "step": 2478 }, { "epoch": 0.5448351648351648, "grad_norm": 0.24840472821833404, "learning_rate": 3.407776116880277e-07, "loss": 1.4051, "step": 2479 }, { "epoch": 0.545054945054945, "grad_norm": 0.2432141910966809, "learning_rate": 3.40562281332986e-07, "loss": 1.4631, "step": 2480 }, { "epoch": 0.5452747252747253, "grad_norm": 0.24425875081241077, "learning_rate": 3.4034697216295514e-07, "loss": 1.4807, "step": 2481 }, { "epoch": 0.5454945054945055, "grad_norm": 0.24913540952082738, "learning_rate": 3.401316842805806e-07, "loss": 1.5285, "step": 2482 }, { "epoch": 0.5457142857142857, "grad_norm": 0.2561140740036112, "learning_rate": 3.399164177884974e-07, "loss": 1.4678, "step": 2483 }, { "epoch": 0.5459340659340659, "grad_norm": 0.24527619813444076, "learning_rate": 3.3970117278933075e-07, "loss": 1.4615, "step": 2484 }, { "epoch": 0.5461538461538461, "grad_norm": 0.24093477258527357, "learning_rate": 3.394859493856954e-07, "loss": 1.4754, "step": 2485 }, { "epoch": 0.5463736263736264, "grad_norm": 0.2483508001877219, "learning_rate": 3.392707476801957e-07, "loss": 1.4846, "step": 2486 }, { "epoch": 0.5465934065934066, "grad_norm": 0.251498999901592, "learning_rate": 3.3905556777542603e-07, "loss": 1.5126, "step": 2487 }, { "epoch": 0.5468131868131868, "grad_norm": 0.292984358149494, "learning_rate": 3.388404097739702e-07, "loss": 1.4384, "step": 2488 }, { "epoch": 0.547032967032967, "grad_norm": 0.2471393152596394, "learning_rate": 3.386252737784014e-07, "loss": 1.4826, "step": 2489 }, { "epoch": 0.5472527472527473, "grad_norm": 0.26047142753984537, "learning_rate": 3.384101598912823e-07, "loss": 1.4981, "step": 2490 }, { "epoch": 0.5474725274725275, "grad_norm": 0.2461878978044061, "learning_rate": 3.3819506821516553e-07, "loss": 1.47, "step": 2491 }, { "epoch": 0.5476923076923077, "grad_norm": 0.23805161395936614, "learning_rate": 3.3797999885259277e-07, "loss": 1.4635, "step": 2492 }, { "epoch": 0.5479120879120879, "grad_norm": 0.35282272725126546, "learning_rate": 3.377649519060948e-07, "loss": 1.5261, "step": 2493 }, { "epoch": 0.5481318681318681, "grad_norm": 0.2466539109687882, "learning_rate": 3.3754992747819224e-07, "loss": 1.45, "step": 2494 }, { "epoch": 0.5483516483516484, "grad_norm": 0.2603889600133571, "learning_rate": 3.3733492567139457e-07, "loss": 1.4689, "step": 2495 }, { "epoch": 0.5485714285714286, "grad_norm": 0.25709555281243185, "learning_rate": 3.371199465882008e-07, "loss": 1.4679, "step": 2496 }, { "epoch": 0.5487912087912088, "grad_norm": 0.2568533673561895, "learning_rate": 3.36904990331099e-07, "loss": 1.4868, "step": 2497 }, { "epoch": 0.549010989010989, "grad_norm": 0.24963793309196405, "learning_rate": 3.366900570025661e-07, "loss": 1.4055, "step": 2498 }, { "epoch": 0.5492307692307692, "grad_norm": 0.2475589390440301, "learning_rate": 3.364751467050686e-07, "loss": 1.4755, "step": 2499 }, { "epoch": 0.5494505494505495, "grad_norm": 0.2747454061930324, "learning_rate": 3.362602595410613e-07, "loss": 1.4102, "step": 2500 }, { "epoch": 0.5496703296703297, "grad_norm": 0.2627576856680716, "learning_rate": 3.3604539561298903e-07, "loss": 1.47, "step": 2501 }, { "epoch": 0.5498901098901099, "grad_norm": 0.26130260962110075, "learning_rate": 3.3583055502328437e-07, "loss": 1.4188, "step": 2502 }, { "epoch": 0.5501098901098901, "grad_norm": 0.2533795146156557, "learning_rate": 3.3561573787436954e-07, "loss": 1.4421, "step": 2503 }, { "epoch": 0.5503296703296703, "grad_norm": 0.2549711093400903, "learning_rate": 3.354009442686555e-07, "loss": 1.4564, "step": 2504 }, { "epoch": 0.5505494505494506, "grad_norm": 0.2596310640387029, "learning_rate": 3.351861743085415e-07, "loss": 1.4048, "step": 2505 }, { "epoch": 0.5507692307692308, "grad_norm": 0.2585193367500666, "learning_rate": 3.3497142809641615e-07, "loss": 1.4362, "step": 2506 }, { "epoch": 0.550989010989011, "grad_norm": 0.25759289383156936, "learning_rate": 3.347567057346565e-07, "loss": 1.44, "step": 2507 }, { "epoch": 0.5512087912087912, "grad_norm": 0.25242840136946804, "learning_rate": 3.345420073256281e-07, "loss": 1.4454, "step": 2508 }, { "epoch": 0.5514285714285714, "grad_norm": 0.2774655042786759, "learning_rate": 3.343273329716851e-07, "loss": 1.4512, "step": 2509 }, { "epoch": 0.5516483516483517, "grad_norm": 0.24697466261756673, "learning_rate": 3.341126827751701e-07, "loss": 1.5146, "step": 2510 }, { "epoch": 0.5518681318681319, "grad_norm": 0.24608028817615893, "learning_rate": 3.338980568384149e-07, "loss": 1.4418, "step": 2511 }, { "epoch": 0.5520879120879121, "grad_norm": 0.24884221696868708, "learning_rate": 3.3368345526373874e-07, "loss": 1.4368, "step": 2512 }, { "epoch": 0.5523076923076923, "grad_norm": 0.2744378881294568, "learning_rate": 3.334688781534497e-07, "loss": 1.4112, "step": 2513 }, { "epoch": 0.5525274725274725, "grad_norm": 0.24812712111009483, "learning_rate": 3.332543256098443e-07, "loss": 1.4973, "step": 2514 }, { "epoch": 0.5527472527472528, "grad_norm": 0.24336979659611635, "learning_rate": 3.3303979773520694e-07, "loss": 1.403, "step": 2515 }, { "epoch": 0.552967032967033, "grad_norm": 0.2492977419816484, "learning_rate": 3.32825294631811e-07, "loss": 1.5108, "step": 2516 }, { "epoch": 0.5531868131868132, "grad_norm": 0.26839021597873497, "learning_rate": 3.3261081640191725e-07, "loss": 1.5203, "step": 2517 }, { "epoch": 0.5534065934065934, "grad_norm": 0.24322649847388625, "learning_rate": 3.3239636314777514e-07, "loss": 1.4795, "step": 2518 }, { "epoch": 0.5536263736263737, "grad_norm": 0.24895096900469302, "learning_rate": 3.321819349716218e-07, "loss": 1.4335, "step": 2519 }, { "epoch": 0.5538461538461539, "grad_norm": 0.2459517510639334, "learning_rate": 3.3196753197568287e-07, "loss": 1.526, "step": 2520 }, { "epoch": 0.5540659340659341, "grad_norm": 0.26015872791263117, "learning_rate": 3.317531542621715e-07, "loss": 1.4665, "step": 2521 }, { "epoch": 0.5542857142857143, "grad_norm": 0.25941471154155876, "learning_rate": 3.315388019332893e-07, "loss": 1.4784, "step": 2522 }, { "epoch": 0.5545054945054945, "grad_norm": 0.24569955453892808, "learning_rate": 3.313244750912253e-07, "loss": 1.4537, "step": 2523 }, { "epoch": 0.5547252747252748, "grad_norm": 0.24186931977538126, "learning_rate": 3.311101738381565e-07, "loss": 1.5036, "step": 2524 }, { "epoch": 0.554945054945055, "grad_norm": 0.2704259669906531, "learning_rate": 3.3089589827624796e-07, "loss": 1.4056, "step": 2525 }, { "epoch": 0.5551648351648352, "grad_norm": 0.2607421854076853, "learning_rate": 3.3068164850765246e-07, "loss": 1.4222, "step": 2526 }, { "epoch": 0.5553846153846154, "grad_norm": 0.2434896276585296, "learning_rate": 3.3046742463451004e-07, "loss": 1.4041, "step": 2527 }, { "epoch": 0.5556043956043956, "grad_norm": 0.26658832252410564, "learning_rate": 3.302532267589487e-07, "loss": 1.4563, "step": 2528 }, { "epoch": 0.5558241758241759, "grad_norm": 0.24760698091118363, "learning_rate": 3.3003905498308414e-07, "loss": 1.4811, "step": 2529 }, { "epoch": 0.5560439560439561, "grad_norm": 0.2448022617942516, "learning_rate": 3.298249094090196e-07, "loss": 1.4732, "step": 2530 }, { "epoch": 0.5562637362637363, "grad_norm": 0.2676645906836815, "learning_rate": 3.2961079013884575e-07, "loss": 1.5212, "step": 2531 }, { "epoch": 0.5564835164835165, "grad_norm": 0.2592275178329446, "learning_rate": 3.2939669727464054e-07, "loss": 1.4354, "step": 2532 }, { "epoch": 0.5567032967032967, "grad_norm": 0.24202438550836206, "learning_rate": 3.2918263091846964e-07, "loss": 1.4716, "step": 2533 }, { "epoch": 0.556923076923077, "grad_norm": 0.25619769090494693, "learning_rate": 3.289685911723858e-07, "loss": 1.3794, "step": 2534 }, { "epoch": 0.5571428571428572, "grad_norm": 0.25163360088271464, "learning_rate": 3.2875457813842944e-07, "loss": 1.5385, "step": 2535 }, { "epoch": 0.5573626373626374, "grad_norm": 0.2474039155565428, "learning_rate": 3.2854059191862796e-07, "loss": 1.4646, "step": 2536 }, { "epoch": 0.5575824175824176, "grad_norm": 0.24790481799111025, "learning_rate": 3.2832663261499604e-07, "loss": 1.4936, "step": 2537 }, { "epoch": 0.5578021978021978, "grad_norm": 0.24333738553491652, "learning_rate": 3.2811270032953557e-07, "loss": 1.4335, "step": 2538 }, { "epoch": 0.5580219780219781, "grad_norm": 0.2605555266761169, "learning_rate": 3.2789879516423534e-07, "loss": 1.4951, "step": 2539 }, { "epoch": 0.5582417582417583, "grad_norm": 0.23841468143353503, "learning_rate": 3.276849172210717e-07, "loss": 1.4632, "step": 2540 }, { "epoch": 0.5584615384615385, "grad_norm": 0.2415403860543191, "learning_rate": 3.2747106660200766e-07, "loss": 1.4545, "step": 2541 }, { "epoch": 0.5586813186813187, "grad_norm": 0.26275919709985424, "learning_rate": 3.272572434089931e-07, "loss": 1.5114, "step": 2542 }, { "epoch": 0.5589010989010988, "grad_norm": 0.23853140946010676, "learning_rate": 3.2704344774396524e-07, "loss": 1.437, "step": 2543 }, { "epoch": 0.5591208791208792, "grad_norm": 0.2441808103072464, "learning_rate": 3.2682967970884754e-07, "loss": 1.4607, "step": 2544 }, { "epoch": 0.5593406593406594, "grad_norm": 0.2523839705339858, "learning_rate": 3.266159394055512e-07, "loss": 1.5381, "step": 2545 }, { "epoch": 0.5595604395604395, "grad_norm": 0.2730961053168548, "learning_rate": 3.264022269359734e-07, "loss": 1.4453, "step": 2546 }, { "epoch": 0.5597802197802197, "grad_norm": 0.24842846462662363, "learning_rate": 3.2618854240199823e-07, "loss": 1.4806, "step": 2547 }, { "epoch": 0.56, "grad_norm": 0.2449526251039824, "learning_rate": 3.259748859054967e-07, "loss": 1.5009, "step": 2548 }, { "epoch": 0.5602197802197803, "grad_norm": 0.25274673887224086, "learning_rate": 3.2576125754832634e-07, "loss": 1.4966, "step": 2549 }, { "epoch": 0.5604395604395604, "grad_norm": 0.23865412819854098, "learning_rate": 3.2554765743233125e-07, "loss": 1.465, "step": 2550 }, { "epoch": 0.5606593406593406, "grad_norm": 0.2541274318400704, "learning_rate": 3.2533408565934194e-07, "loss": 1.5231, "step": 2551 }, { "epoch": 0.5608791208791208, "grad_norm": 0.381478326789079, "learning_rate": 3.2512054233117564e-07, "loss": 1.542, "step": 2552 }, { "epoch": 0.5610989010989011, "grad_norm": 0.2503718315460643, "learning_rate": 3.2490702754963574e-07, "loss": 1.488, "step": 2553 }, { "epoch": 0.5613186813186813, "grad_norm": 0.3229378528011373, "learning_rate": 3.246935414165125e-07, "loss": 1.4384, "step": 2554 }, { "epoch": 0.5615384615384615, "grad_norm": 0.2474829544995195, "learning_rate": 3.244800840335819e-07, "loss": 1.4971, "step": 2555 }, { "epoch": 0.5617582417582417, "grad_norm": 2.3272715985664276, "learning_rate": 3.242666555026066e-07, "loss": 1.4205, "step": 2556 }, { "epoch": 0.5619780219780219, "grad_norm": 0.2532428428215496, "learning_rate": 3.2405325592533553e-07, "loss": 1.4453, "step": 2557 }, { "epoch": 0.5621978021978022, "grad_norm": 0.25886253529821535, "learning_rate": 3.2383988540350355e-07, "loss": 1.5173, "step": 2558 }, { "epoch": 0.5624175824175824, "grad_norm": 0.2518593405781085, "learning_rate": 3.2362654403883196e-07, "loss": 1.4558, "step": 2559 }, { "epoch": 0.5626373626373626, "grad_norm": 0.24827338476745162, "learning_rate": 3.23413231933028e-07, "loss": 1.5319, "step": 2560 }, { "epoch": 0.5628571428571428, "grad_norm": 0.3072532487745244, "learning_rate": 3.231999491877851e-07, "loss": 1.433, "step": 2561 }, { "epoch": 0.563076923076923, "grad_norm": 0.2585982268843787, "learning_rate": 3.2298669590478237e-07, "loss": 1.4268, "step": 2562 }, { "epoch": 0.5632967032967033, "grad_norm": 0.2629033786882321, "learning_rate": 3.2277347218568505e-07, "loss": 1.5091, "step": 2563 }, { "epoch": 0.5635164835164835, "grad_norm": 0.24837050939337624, "learning_rate": 3.2256027813214466e-07, "loss": 1.4253, "step": 2564 }, { "epoch": 0.5637362637362637, "grad_norm": 0.2566859245028406, "learning_rate": 3.2234711384579806e-07, "loss": 1.4301, "step": 2565 }, { "epoch": 0.5639560439560439, "grad_norm": 0.24792427739814885, "learning_rate": 3.2213397942826803e-07, "loss": 1.4601, "step": 2566 }, { "epoch": 0.5641758241758241, "grad_norm": 0.2443964933568627, "learning_rate": 3.2192087498116327e-07, "loss": 1.4502, "step": 2567 }, { "epoch": 0.5643956043956044, "grad_norm": 0.24082959777065138, "learning_rate": 3.2170780060607796e-07, "loss": 1.4533, "step": 2568 }, { "epoch": 0.5646153846153846, "grad_norm": 0.24382212241251466, "learning_rate": 3.2149475640459243e-07, "loss": 1.4929, "step": 2569 }, { "epoch": 0.5648351648351648, "grad_norm": 0.25018130599851895, "learning_rate": 3.212817424782719e-07, "loss": 1.4488, "step": 2570 }, { "epoch": 0.565054945054945, "grad_norm": 0.25574900908295944, "learning_rate": 3.210687589286677e-07, "loss": 1.4997, "step": 2571 }, { "epoch": 0.5652747252747252, "grad_norm": 0.27781853993394434, "learning_rate": 3.208558058573167e-07, "loss": 1.5537, "step": 2572 }, { "epoch": 0.5654945054945055, "grad_norm": 0.2601298765458066, "learning_rate": 3.206428833657407e-07, "loss": 1.4737, "step": 2573 }, { "epoch": 0.5657142857142857, "grad_norm": 0.25377617743154335, "learning_rate": 3.204299915554474e-07, "loss": 1.4547, "step": 2574 }, { "epoch": 0.5659340659340659, "grad_norm": 0.25072831180917604, "learning_rate": 3.2021713052792997e-07, "loss": 1.4487, "step": 2575 }, { "epoch": 0.5661538461538461, "grad_norm": 0.2433127043909169, "learning_rate": 3.2000430038466655e-07, "loss": 1.4171, "step": 2576 }, { "epoch": 0.5663736263736264, "grad_norm": 0.25400901437083023, "learning_rate": 3.1979150122712053e-07, "loss": 1.4724, "step": 2577 }, { "epoch": 0.5665934065934066, "grad_norm": 0.2486947090902992, "learning_rate": 3.1957873315674087e-07, "loss": 1.4275, "step": 2578 }, { "epoch": 0.5668131868131868, "grad_norm": 0.24727789962424468, "learning_rate": 3.1936599627496157e-07, "loss": 1.4607, "step": 2579 }, { "epoch": 0.567032967032967, "grad_norm": 0.24360407672577455, "learning_rate": 3.1915329068320176e-07, "loss": 1.4809, "step": 2580 }, { "epoch": 0.5672527472527472, "grad_norm": 0.26404224889084515, "learning_rate": 3.189406164828653e-07, "loss": 1.4369, "step": 2581 }, { "epoch": 0.5674725274725275, "grad_norm": 0.30193832823623906, "learning_rate": 3.187279737753416e-07, "loss": 1.4478, "step": 2582 }, { "epoch": 0.5676923076923077, "grad_norm": 0.254953736010794, "learning_rate": 3.18515362662005e-07, "loss": 1.4535, "step": 2583 }, { "epoch": 0.5679120879120879, "grad_norm": 0.25779652046949264, "learning_rate": 3.1830278324421455e-07, "loss": 1.446, "step": 2584 }, { "epoch": 0.5681318681318681, "grad_norm": 0.2791095963130846, "learning_rate": 3.180902356233141e-07, "loss": 1.476, "step": 2585 }, { "epoch": 0.5683516483516483, "grad_norm": 0.25209541243231776, "learning_rate": 3.1787771990063274e-07, "loss": 1.4546, "step": 2586 }, { "epoch": 0.5685714285714286, "grad_norm": 0.2463060068160242, "learning_rate": 3.176652361774839e-07, "loss": 1.4493, "step": 2587 }, { "epoch": 0.5687912087912088, "grad_norm": 0.2640472167615685, "learning_rate": 3.174527845551662e-07, "loss": 1.5246, "step": 2588 }, { "epoch": 0.569010989010989, "grad_norm": 0.25538103608343604, "learning_rate": 3.172403651349627e-07, "loss": 1.4742, "step": 2589 }, { "epoch": 0.5692307692307692, "grad_norm": 0.246145099871271, "learning_rate": 3.170279780181411e-07, "loss": 1.4781, "step": 2590 }, { "epoch": 0.5694505494505494, "grad_norm": 0.25269496869080404, "learning_rate": 3.1681562330595376e-07, "loss": 1.4693, "step": 2591 }, { "epoch": 0.5696703296703297, "grad_norm": 0.25688121599076813, "learning_rate": 3.1660330109963746e-07, "loss": 1.4672, "step": 2592 }, { "epoch": 0.5698901098901099, "grad_norm": 0.2407271097018519, "learning_rate": 3.163910115004138e-07, "loss": 1.4054, "step": 2593 }, { "epoch": 0.5701098901098901, "grad_norm": 0.2597704242979883, "learning_rate": 3.161787546094886e-07, "loss": 1.4555, "step": 2594 }, { "epoch": 0.5703296703296703, "grad_norm": 0.23975496843985386, "learning_rate": 3.1596653052805204e-07, "loss": 1.4669, "step": 2595 }, { "epoch": 0.5705494505494505, "grad_norm": 0.24621303519748397, "learning_rate": 3.1575433935727873e-07, "loss": 1.4984, "step": 2596 }, { "epoch": 0.5707692307692308, "grad_norm": 0.24101580541246978, "learning_rate": 3.155421811983277e-07, "loss": 1.4704, "step": 2597 }, { "epoch": 0.570989010989011, "grad_norm": 0.25761613526285804, "learning_rate": 3.1533005615234205e-07, "loss": 1.5003, "step": 2598 }, { "epoch": 0.5712087912087912, "grad_norm": 0.23982293041807357, "learning_rate": 3.1511796432044935e-07, "loss": 1.3696, "step": 2599 }, { "epoch": 0.5714285714285714, "grad_norm": 0.2551912185766934, "learning_rate": 3.1490590580376097e-07, "loss": 1.5132, "step": 2600 }, { "epoch": 0.5716483516483516, "grad_norm": 0.35324427853948154, "learning_rate": 3.1469388070337284e-07, "loss": 1.4856, "step": 2601 }, { "epoch": 0.5718681318681319, "grad_norm": 0.2488082673436781, "learning_rate": 3.144818891203644e-07, "loss": 1.488, "step": 2602 }, { "epoch": 0.5720879120879121, "grad_norm": 0.25294317307726777, "learning_rate": 3.1426993115579973e-07, "loss": 1.4542, "step": 2603 }, { "epoch": 0.5723076923076923, "grad_norm": 0.2572557661095975, "learning_rate": 3.1405800691072645e-07, "loss": 1.4924, "step": 2604 }, { "epoch": 0.5725274725274725, "grad_norm": 0.24894955663998664, "learning_rate": 3.1384611648617633e-07, "loss": 1.5289, "step": 2605 }, { "epoch": 0.5727472527472528, "grad_norm": 0.2478132076775968, "learning_rate": 3.1363425998316467e-07, "loss": 1.483, "step": 2606 }, { "epoch": 0.572967032967033, "grad_norm": 0.24078033897355672, "learning_rate": 3.1342243750269125e-07, "loss": 1.4472, "step": 2607 }, { "epoch": 0.5731868131868132, "grad_norm": 0.2475037589947621, "learning_rate": 3.13210649145739e-07, "loss": 1.4196, "step": 2608 }, { "epoch": 0.5734065934065934, "grad_norm": 0.23372037850745242, "learning_rate": 3.1299889501327483e-07, "loss": 1.4447, "step": 2609 }, { "epoch": 0.5736263736263736, "grad_norm": 0.2585502894539701, "learning_rate": 3.127871752062494e-07, "loss": 1.4731, "step": 2610 }, { "epoch": 0.5738461538461539, "grad_norm": 0.25331621953124167, "learning_rate": 3.125754898255967e-07, "loss": 1.5192, "step": 2611 }, { "epoch": 0.5740659340659341, "grad_norm": 0.2832784069839386, "learning_rate": 3.1236383897223487e-07, "loss": 1.4283, "step": 2612 }, { "epoch": 0.5742857142857143, "grad_norm": 0.2562379215472176, "learning_rate": 3.1215222274706507e-07, "loss": 1.5193, "step": 2613 }, { "epoch": 0.5745054945054945, "grad_norm": 0.2565464755119887, "learning_rate": 3.119406412509722e-07, "loss": 1.4648, "step": 2614 }, { "epoch": 0.5747252747252747, "grad_norm": 0.24239285072109812, "learning_rate": 3.1172909458482447e-07, "loss": 1.3815, "step": 2615 }, { "epoch": 0.574945054945055, "grad_norm": 0.24237000060226999, "learning_rate": 3.1151758284947356e-07, "loss": 1.4599, "step": 2616 }, { "epoch": 0.5751648351648352, "grad_norm": 0.24201060162258123, "learning_rate": 3.1130610614575453e-07, "loss": 1.4926, "step": 2617 }, { "epoch": 0.5753846153846154, "grad_norm": 0.24110541902147878, "learning_rate": 3.1109466457448574e-07, "loss": 1.4269, "step": 2618 }, { "epoch": 0.5756043956043956, "grad_norm": 0.5151905357191532, "learning_rate": 3.108832582364687e-07, "loss": 1.4174, "step": 2619 }, { "epoch": 0.5758241758241758, "grad_norm": 0.264400423492667, "learning_rate": 3.1067188723248823e-07, "loss": 1.5061, "step": 2620 }, { "epoch": 0.5760439560439561, "grad_norm": 0.2533826514388564, "learning_rate": 3.1046055166331217e-07, "loss": 1.4184, "step": 2621 }, { "epoch": 0.5762637362637363, "grad_norm": 0.2762190273399301, "learning_rate": 3.102492516296917e-07, "loss": 1.4942, "step": 2622 }, { "epoch": 0.5764835164835165, "grad_norm": 0.24797308297241358, "learning_rate": 3.100379872323609e-07, "loss": 1.4833, "step": 2623 }, { "epoch": 0.5767032967032967, "grad_norm": 0.2447992141664247, "learning_rate": 3.098267585720367e-07, "loss": 1.4154, "step": 2624 }, { "epoch": 0.5769230769230769, "grad_norm": 0.25456065851251086, "learning_rate": 3.096155657494194e-07, "loss": 1.5042, "step": 2625 }, { "epoch": 0.5771428571428572, "grad_norm": 0.24192886052994556, "learning_rate": 3.094044088651917e-07, "loss": 1.4607, "step": 2626 }, { "epoch": 0.5773626373626374, "grad_norm": 0.2680401406754433, "learning_rate": 3.091932880200199e-07, "loss": 1.4202, "step": 2627 }, { "epoch": 0.5775824175824176, "grad_norm": 0.3005987432819412, "learning_rate": 3.089822033145522e-07, "loss": 1.428, "step": 2628 }, { "epoch": 0.5778021978021978, "grad_norm": 0.26120398677275525, "learning_rate": 3.087711548494204e-07, "loss": 1.4763, "step": 2629 }, { "epoch": 0.578021978021978, "grad_norm": 0.32458714972291736, "learning_rate": 3.085601427252384e-07, "loss": 1.4499, "step": 2630 }, { "epoch": 0.5782417582417583, "grad_norm": 0.27004227251398977, "learning_rate": 3.0834916704260314e-07, "loss": 1.4768, "step": 2631 }, { "epoch": 0.5784615384615385, "grad_norm": 0.243766314754373, "learning_rate": 3.081382279020942e-07, "loss": 1.4529, "step": 2632 }, { "epoch": 0.5786813186813187, "grad_norm": 0.24606129828590848, "learning_rate": 3.079273254042736e-07, "loss": 1.4939, "step": 2633 }, { "epoch": 0.5789010989010989, "grad_norm": 0.2500961939135463, "learning_rate": 3.077164596496857e-07, "loss": 1.3928, "step": 2634 }, { "epoch": 0.5791208791208792, "grad_norm": 0.24275683762503814, "learning_rate": 3.0750563073885774e-07, "loss": 1.4908, "step": 2635 }, { "epoch": 0.5793406593406594, "grad_norm": 0.255648446292445, "learning_rate": 3.0729483877229925e-07, "loss": 1.4695, "step": 2636 }, { "epoch": 0.5795604395604396, "grad_norm": 0.2505917464893049, "learning_rate": 3.070840838505021e-07, "loss": 1.4361, "step": 2637 }, { "epoch": 0.5797802197802198, "grad_norm": 0.4164525347416848, "learning_rate": 3.068733660739404e-07, "loss": 1.4378, "step": 2638 }, { "epoch": 0.58, "grad_norm": 0.25534232957880826, "learning_rate": 3.066626855430708e-07, "loss": 1.4871, "step": 2639 }, { "epoch": 0.5802197802197803, "grad_norm": 0.2625756004252582, "learning_rate": 3.064520423583319e-07, "loss": 1.4771, "step": 2640 }, { "epoch": 0.5804395604395605, "grad_norm": 0.24767947681665137, "learning_rate": 3.062414366201448e-07, "loss": 1.435, "step": 2641 }, { "epoch": 0.5806593406593407, "grad_norm": 0.24937095071964677, "learning_rate": 3.060308684289126e-07, "loss": 1.4655, "step": 2642 }, { "epoch": 0.5808791208791209, "grad_norm": 0.2349785411716748, "learning_rate": 3.0582033788502044e-07, "loss": 1.4373, "step": 2643 }, { "epoch": 0.581098901098901, "grad_norm": 0.24143193424826362, "learning_rate": 3.0560984508883556e-07, "loss": 1.4567, "step": 2644 }, { "epoch": 0.5813186813186814, "grad_norm": 0.2403897129083357, "learning_rate": 3.0539939014070724e-07, "loss": 1.4898, "step": 2645 }, { "epoch": 0.5815384615384616, "grad_norm": 0.2506923828615349, "learning_rate": 3.051889731409667e-07, "loss": 1.438, "step": 2646 }, { "epoch": 0.5817582417582418, "grad_norm": 0.25845481960208516, "learning_rate": 3.049785941899271e-07, "loss": 1.4776, "step": 2647 }, { "epoch": 0.581978021978022, "grad_norm": 0.29332390853994833, "learning_rate": 3.0476825338788345e-07, "loss": 1.4469, "step": 2648 }, { "epoch": 0.5821978021978022, "grad_norm": 0.3250187148849675, "learning_rate": 3.0455795083511254e-07, "loss": 1.4956, "step": 2649 }, { "epoch": 0.5824175824175825, "grad_norm": 0.24149785917052716, "learning_rate": 3.043476866318727e-07, "loss": 1.502, "step": 2650 }, { "epoch": 0.5826373626373627, "grad_norm": 0.26100227997137226, "learning_rate": 3.0413746087840447e-07, "loss": 1.5276, "step": 2651 }, { "epoch": 0.5828571428571429, "grad_norm": 0.24741217343462235, "learning_rate": 3.0392727367492983e-07, "loss": 1.4491, "step": 2652 }, { "epoch": 0.583076923076923, "grad_norm": 0.2365057831730838, "learning_rate": 3.037171251216522e-07, "loss": 1.4954, "step": 2653 }, { "epoch": 0.5832967032967032, "grad_norm": 0.27826573782475317, "learning_rate": 3.0350701531875673e-07, "loss": 1.4737, "step": 2654 }, { "epoch": 0.5835164835164836, "grad_norm": 0.24485011334493212, "learning_rate": 3.032969443664101e-07, "loss": 1.5342, "step": 2655 }, { "epoch": 0.5837362637362637, "grad_norm": 0.2520291467226276, "learning_rate": 3.030869123647606e-07, "loss": 1.5136, "step": 2656 }, { "epoch": 0.583956043956044, "grad_norm": 0.24737609327471743, "learning_rate": 3.028769194139378e-07, "loss": 1.5458, "step": 2657 }, { "epoch": 0.5841758241758241, "grad_norm": 0.27859430322534867, "learning_rate": 3.0266696561405246e-07, "loss": 1.4492, "step": 2658 }, { "epoch": 0.5843956043956045, "grad_norm": 0.24858071059972395, "learning_rate": 3.0245705106519705e-07, "loss": 1.4589, "step": 2659 }, { "epoch": 0.5846153846153846, "grad_norm": 0.251028121597539, "learning_rate": 3.022471758674451e-07, "loss": 1.4367, "step": 2660 }, { "epoch": 0.5848351648351648, "grad_norm": 0.23276113173526022, "learning_rate": 3.0203734012085155e-07, "loss": 1.4627, "step": 2661 }, { "epoch": 0.585054945054945, "grad_norm": 0.2978633736080548, "learning_rate": 3.018275439254522e-07, "loss": 1.4127, "step": 2662 }, { "epoch": 0.5852747252747252, "grad_norm": 0.2410866841392644, "learning_rate": 3.0161778738126445e-07, "loss": 1.3678, "step": 2663 }, { "epoch": 0.5854945054945055, "grad_norm": 0.24165254504816364, "learning_rate": 3.0140807058828634e-07, "loss": 1.476, "step": 2664 }, { "epoch": 0.5857142857142857, "grad_norm": 0.2624777663014652, "learning_rate": 3.011983936464974e-07, "loss": 1.4766, "step": 2665 }, { "epoch": 0.5859340659340659, "grad_norm": 0.2634247230469347, "learning_rate": 3.009887566558577e-07, "loss": 1.4598, "step": 2666 }, { "epoch": 0.5861538461538461, "grad_norm": 0.25875444607367315, "learning_rate": 3.007791597163088e-07, "loss": 1.509, "step": 2667 }, { "epoch": 0.5863736263736263, "grad_norm": 0.26630387524065624, "learning_rate": 3.0056960292777275e-07, "loss": 1.461, "step": 2668 }, { "epoch": 0.5865934065934066, "grad_norm": 0.24078612966483726, "learning_rate": 3.003600863901524e-07, "loss": 1.4968, "step": 2669 }, { "epoch": 0.5868131868131868, "grad_norm": 0.24566965714964653, "learning_rate": 3.0015061020333173e-07, "loss": 1.4463, "step": 2670 }, { "epoch": 0.587032967032967, "grad_norm": 0.26238688810875044, "learning_rate": 2.9994117446717546e-07, "loss": 1.4323, "step": 2671 }, { "epoch": 0.5872527472527472, "grad_norm": 0.26934926974738016, "learning_rate": 2.9973177928152886e-07, "loss": 1.4642, "step": 2672 }, { "epoch": 0.5874725274725274, "grad_norm": 0.252353065643034, "learning_rate": 2.9952242474621774e-07, "loss": 1.433, "step": 2673 }, { "epoch": 0.5876923076923077, "grad_norm": 0.2526591795699064, "learning_rate": 2.993131109610488e-07, "loss": 1.4873, "step": 2674 }, { "epoch": 0.5879120879120879, "grad_norm": 0.24745614937116045, "learning_rate": 2.991038380258095e-07, "loss": 1.4721, "step": 2675 }, { "epoch": 0.5881318681318681, "grad_norm": 0.25320330697301335, "learning_rate": 2.9889460604026724e-07, "loss": 1.503, "step": 2676 }, { "epoch": 0.5883516483516483, "grad_norm": 0.25404437739800473, "learning_rate": 2.9868541510417015e-07, "loss": 1.5045, "step": 2677 }, { "epoch": 0.5885714285714285, "grad_norm": 0.24604682796478008, "learning_rate": 2.9847626531724715e-07, "loss": 1.478, "step": 2678 }, { "epoch": 0.5887912087912088, "grad_norm": 0.24484594462571194, "learning_rate": 2.9826715677920693e-07, "loss": 1.433, "step": 2679 }, { "epoch": 0.589010989010989, "grad_norm": 0.24938898195160425, "learning_rate": 2.980580895897391e-07, "loss": 1.4724, "step": 2680 }, { "epoch": 0.5892307692307692, "grad_norm": 0.24404664218132727, "learning_rate": 2.97849063848513e-07, "loss": 1.4203, "step": 2681 }, { "epoch": 0.5894505494505494, "grad_norm": 0.25727167626009134, "learning_rate": 2.9764007965517875e-07, "loss": 1.5806, "step": 2682 }, { "epoch": 0.5896703296703296, "grad_norm": 0.25741622544742127, "learning_rate": 2.974311371093664e-07, "loss": 1.4805, "step": 2683 }, { "epoch": 0.5898901098901099, "grad_norm": 0.25109130456679774, "learning_rate": 2.972222363106858e-07, "loss": 1.4668, "step": 2684 }, { "epoch": 0.5901098901098901, "grad_norm": 0.25413762077941837, "learning_rate": 2.970133773587276e-07, "loss": 1.3821, "step": 2685 }, { "epoch": 0.5903296703296703, "grad_norm": 0.2481136357766227, "learning_rate": 2.968045603530621e-07, "loss": 1.4982, "step": 2686 }, { "epoch": 0.5905494505494505, "grad_norm": 0.26060840911471206, "learning_rate": 2.9659578539323966e-07, "loss": 1.536, "step": 2687 }, { "epoch": 0.5907692307692308, "grad_norm": 0.24894375397809232, "learning_rate": 2.963870525787903e-07, "loss": 1.4488, "step": 2688 }, { "epoch": 0.590989010989011, "grad_norm": 0.24549929315416463, "learning_rate": 2.961783620092246e-07, "loss": 1.4934, "step": 2689 }, { "epoch": 0.5912087912087912, "grad_norm": 0.24841004174419143, "learning_rate": 2.959697137840326e-07, "loss": 1.4358, "step": 2690 }, { "epoch": 0.5914285714285714, "grad_norm": 0.25265455873382414, "learning_rate": 2.9576110800268403e-07, "loss": 1.4453, "step": 2691 }, { "epoch": 0.5916483516483516, "grad_norm": 0.25304837017926046, "learning_rate": 2.9555254476462854e-07, "loss": 1.4942, "step": 2692 }, { "epoch": 0.5918681318681319, "grad_norm": 0.2564751955741434, "learning_rate": 2.953440241692954e-07, "loss": 1.5028, "step": 2693 }, { "epoch": 0.5920879120879121, "grad_norm": 0.4637593906261774, "learning_rate": 2.9513554631609403e-07, "loss": 1.449, "step": 2694 }, { "epoch": 0.5923076923076923, "grad_norm": 0.26801487565820803, "learning_rate": 2.9492711130441296e-07, "loss": 1.5407, "step": 2695 }, { "epoch": 0.5925274725274725, "grad_norm": 0.25455198449595984, "learning_rate": 2.947187192336202e-07, "loss": 1.4823, "step": 2696 }, { "epoch": 0.5927472527472527, "grad_norm": 0.3107751988612622, "learning_rate": 2.945103702030637e-07, "loss": 1.4964, "step": 2697 }, { "epoch": 0.592967032967033, "grad_norm": 0.2472063244291013, "learning_rate": 2.9430206431207066e-07, "loss": 1.5277, "step": 2698 }, { "epoch": 0.5931868131868132, "grad_norm": 0.24236934943271227, "learning_rate": 2.9409380165994786e-07, "loss": 1.4392, "step": 2699 }, { "epoch": 0.5934065934065934, "grad_norm": 0.25532961404856547, "learning_rate": 2.938855823459812e-07, "loss": 1.4332, "step": 2700 }, { "epoch": 0.5936263736263736, "grad_norm": 0.24241481637153153, "learning_rate": 2.936774064694363e-07, "loss": 1.4496, "step": 2701 }, { "epoch": 0.5938461538461538, "grad_norm": 0.24352597552449362, "learning_rate": 2.934692741295577e-07, "loss": 1.5056, "step": 2702 }, { "epoch": 0.5940659340659341, "grad_norm": 0.3906887755320128, "learning_rate": 2.9326118542556934e-07, "loss": 1.4237, "step": 2703 }, { "epoch": 0.5942857142857143, "grad_norm": 0.2394689170130002, "learning_rate": 2.9305314045667453e-07, "loss": 1.4957, "step": 2704 }, { "epoch": 0.5945054945054945, "grad_norm": 0.2568718639849738, "learning_rate": 2.9284513932205545e-07, "loss": 1.4349, "step": 2705 }, { "epoch": 0.5947252747252747, "grad_norm": 0.27977339720146194, "learning_rate": 2.926371821208735e-07, "loss": 1.449, "step": 2706 }, { "epoch": 0.5949450549450549, "grad_norm": 0.2544818020840947, "learning_rate": 2.924292689522691e-07, "loss": 1.4792, "step": 2707 }, { "epoch": 0.5951648351648352, "grad_norm": 0.24755744130994373, "learning_rate": 2.9222139991536177e-07, "loss": 1.5612, "step": 2708 }, { "epoch": 0.5953846153846154, "grad_norm": 0.2578695246691789, "learning_rate": 2.920135751092499e-07, "loss": 1.5198, "step": 2709 }, { "epoch": 0.5956043956043956, "grad_norm": 0.2872310702737208, "learning_rate": 2.918057946330109e-07, "loss": 1.4424, "step": 2710 }, { "epoch": 0.5958241758241758, "grad_norm": 0.2566019432078165, "learning_rate": 2.9159805858570093e-07, "loss": 1.4996, "step": 2711 }, { "epoch": 0.596043956043956, "grad_norm": 0.3174090061953062, "learning_rate": 2.9139036706635494e-07, "loss": 1.4493, "step": 2712 }, { "epoch": 0.5962637362637363, "grad_norm": 0.4448257107050925, "learning_rate": 2.911827201739867e-07, "loss": 1.4859, "step": 2713 }, { "epoch": 0.5964835164835165, "grad_norm": 0.25015361228476474, "learning_rate": 2.9097511800758894e-07, "loss": 1.4481, "step": 2714 }, { "epoch": 0.5967032967032967, "grad_norm": 0.3752665868215014, "learning_rate": 2.907675606661327e-07, "loss": 1.4822, "step": 2715 }, { "epoch": 0.5969230769230769, "grad_norm": 0.24421542260831594, "learning_rate": 2.9056004824856784e-07, "loss": 1.4789, "step": 2716 }, { "epoch": 0.5971428571428572, "grad_norm": 0.23402351613022046, "learning_rate": 2.903525808538226e-07, "loss": 1.4615, "step": 2717 }, { "epoch": 0.5973626373626374, "grad_norm": 0.2551073910167814, "learning_rate": 2.9014515858080424e-07, "loss": 1.4488, "step": 2718 }, { "epoch": 0.5975824175824176, "grad_norm": 0.2924320056245985, "learning_rate": 2.8993778152839806e-07, "loss": 1.425, "step": 2719 }, { "epoch": 0.5978021978021978, "grad_norm": 0.25662166205007986, "learning_rate": 2.8973044979546783e-07, "loss": 1.501, "step": 2720 }, { "epoch": 0.598021978021978, "grad_norm": 0.2562770092861731, "learning_rate": 2.89523163480856e-07, "loss": 1.4995, "step": 2721 }, { "epoch": 0.5982417582417583, "grad_norm": 0.2426367136724267, "learning_rate": 2.893159226833829e-07, "loss": 1.4782, "step": 2722 }, { "epoch": 0.5984615384615385, "grad_norm": 0.24777685332988814, "learning_rate": 2.8910872750184777e-07, "loss": 1.4702, "step": 2723 }, { "epoch": 0.5986813186813187, "grad_norm": 0.240135100898524, "learning_rate": 2.8890157803502757e-07, "loss": 1.4864, "step": 2724 }, { "epoch": 0.5989010989010989, "grad_norm": 0.29277346818395716, "learning_rate": 2.886944743816779e-07, "loss": 1.4546, "step": 2725 }, { "epoch": 0.5991208791208791, "grad_norm": 0.257649706301125, "learning_rate": 2.88487416640532e-07, "loss": 1.539, "step": 2726 }, { "epoch": 0.5993406593406594, "grad_norm": 0.2619500410076014, "learning_rate": 2.882804049103017e-07, "loss": 1.5068, "step": 2727 }, { "epoch": 0.5995604395604396, "grad_norm": 0.2528374253627584, "learning_rate": 2.880734392896768e-07, "loss": 1.4843, "step": 2728 }, { "epoch": 0.5997802197802198, "grad_norm": 0.5273316811041006, "learning_rate": 2.878665198773248e-07, "loss": 1.4479, "step": 2729 }, { "epoch": 0.6, "grad_norm": 0.2627518876587247, "learning_rate": 2.876596467718916e-07, "loss": 1.5063, "step": 2730 }, { "epoch": 0.6002197802197802, "grad_norm": 0.25536190043986196, "learning_rate": 2.874528200720007e-07, "loss": 1.5356, "step": 2731 }, { "epoch": 0.6004395604395605, "grad_norm": 0.23583426044581032, "learning_rate": 2.872460398762536e-07, "loss": 1.4199, "step": 2732 }, { "epoch": 0.6006593406593407, "grad_norm": 0.25871441334810275, "learning_rate": 2.870393062832298e-07, "loss": 1.5, "step": 2733 }, { "epoch": 0.6008791208791209, "grad_norm": 0.2536571432183577, "learning_rate": 2.868326193914862e-07, "loss": 1.4921, "step": 2734 }, { "epoch": 0.6010989010989011, "grad_norm": 0.25024205420416806, "learning_rate": 2.8662597929955785e-07, "loss": 1.3915, "step": 2735 }, { "epoch": 0.6013186813186813, "grad_norm": 0.24610958354668805, "learning_rate": 2.864193861059572e-07, "loss": 1.4565, "step": 2736 }, { "epoch": 0.6015384615384616, "grad_norm": 0.2503297851221268, "learning_rate": 2.8621283990917424e-07, "loss": 1.4075, "step": 2737 }, { "epoch": 0.6017582417582418, "grad_norm": 0.2538166839525025, "learning_rate": 2.8600634080767706e-07, "loss": 1.4641, "step": 2738 }, { "epoch": 0.601978021978022, "grad_norm": 0.25979470001679406, "learning_rate": 2.85799888899911e-07, "loss": 1.535, "step": 2739 }, { "epoch": 0.6021978021978022, "grad_norm": 0.24576730845308514, "learning_rate": 2.8559348428429866e-07, "loss": 1.4609, "step": 2740 }, { "epoch": 0.6024175824175824, "grad_norm": 0.2534655548238086, "learning_rate": 2.853871270592403e-07, "loss": 1.4879, "step": 2741 }, { "epoch": 0.6026373626373627, "grad_norm": 0.2789829200576066, "learning_rate": 2.8518081732311365e-07, "loss": 1.4383, "step": 2742 }, { "epoch": 0.6028571428571429, "grad_norm": 0.24500911334681083, "learning_rate": 2.8497455517427406e-07, "loss": 1.4767, "step": 2743 }, { "epoch": 0.6030769230769231, "grad_norm": 0.24063586795282615, "learning_rate": 2.847683407110536e-07, "loss": 1.409, "step": 2744 }, { "epoch": 0.6032967032967033, "grad_norm": 0.253914208061666, "learning_rate": 2.8456217403176183e-07, "loss": 1.5053, "step": 2745 }, { "epoch": 0.6035164835164836, "grad_norm": 0.24074409688538426, "learning_rate": 2.8435605523468575e-07, "loss": 1.4828, "step": 2746 }, { "epoch": 0.6037362637362638, "grad_norm": 0.2553825030734294, "learning_rate": 2.8414998441808944e-07, "loss": 1.5263, "step": 2747 }, { "epoch": 0.603956043956044, "grad_norm": 0.2382423781962556, "learning_rate": 2.8394396168021405e-07, "loss": 1.4564, "step": 2748 }, { "epoch": 0.6041758241758242, "grad_norm": 0.24876751520215948, "learning_rate": 2.8373798711927757e-07, "loss": 1.4817, "step": 2749 }, { "epoch": 0.6043956043956044, "grad_norm": 0.2516003827718687, "learning_rate": 2.835320608334755e-07, "loss": 1.4137, "step": 2750 }, { "epoch": 0.6046153846153847, "grad_norm": 0.2749483301934398, "learning_rate": 2.8332618292097993e-07, "loss": 1.4397, "step": 2751 }, { "epoch": 0.6048351648351649, "grad_norm": 0.286111946427972, "learning_rate": 2.831203534799401e-07, "loss": 1.4399, "step": 2752 }, { "epoch": 0.6050549450549451, "grad_norm": 0.29218144959981274, "learning_rate": 2.829145726084821e-07, "loss": 1.4436, "step": 2753 }, { "epoch": 0.6052747252747253, "grad_norm": 0.2525068779790069, "learning_rate": 2.827088404047089e-07, "loss": 1.4637, "step": 2754 }, { "epoch": 0.6054945054945055, "grad_norm": 0.2543220711372504, "learning_rate": 2.825031569667003e-07, "loss": 1.4778, "step": 2755 }, { "epoch": 0.6057142857142858, "grad_norm": 0.24789127883030665, "learning_rate": 2.8229752239251236e-07, "loss": 1.462, "step": 2756 }, { "epoch": 0.605934065934066, "grad_norm": 0.25967686683388536, "learning_rate": 2.820919367801786e-07, "loss": 1.5454, "step": 2757 }, { "epoch": 0.6061538461538462, "grad_norm": 0.2604169620992817, "learning_rate": 2.8188640022770883e-07, "loss": 1.5082, "step": 2758 }, { "epoch": 0.6063736263736264, "grad_norm": 0.26860630268928987, "learning_rate": 2.8168091283308945e-07, "loss": 1.5118, "step": 2759 }, { "epoch": 0.6065934065934065, "grad_norm": 0.39784011679850734, "learning_rate": 2.814754746942835e-07, "loss": 1.4461, "step": 2760 }, { "epoch": 0.6068131868131869, "grad_norm": 0.24390577596970886, "learning_rate": 2.8127008590923033e-07, "loss": 1.4586, "step": 2761 }, { "epoch": 0.607032967032967, "grad_norm": 0.23898812807040618, "learning_rate": 2.810647465758463e-07, "loss": 1.4653, "step": 2762 }, { "epoch": 0.6072527472527472, "grad_norm": 0.2630961492357384, "learning_rate": 2.808594567920236e-07, "loss": 1.483, "step": 2763 }, { "epoch": 0.6074725274725274, "grad_norm": 0.2549766101813333, "learning_rate": 2.806542166556309e-07, "loss": 1.4526, "step": 2764 }, { "epoch": 0.6076923076923076, "grad_norm": 0.2492678697325956, "learning_rate": 2.8044902626451363e-07, "loss": 1.5081, "step": 2765 }, { "epoch": 0.607912087912088, "grad_norm": 0.5191154837257347, "learning_rate": 2.802438857164928e-07, "loss": 1.4791, "step": 2766 }, { "epoch": 0.6081318681318681, "grad_norm": 0.24638747553873072, "learning_rate": 2.8003879510936665e-07, "loss": 1.5079, "step": 2767 }, { "epoch": 0.6083516483516483, "grad_norm": 0.25502158688573806, "learning_rate": 2.7983375454090857e-07, "loss": 1.4368, "step": 2768 }, { "epoch": 0.6085714285714285, "grad_norm": 0.24980133419717593, "learning_rate": 2.796287641088687e-07, "loss": 1.5135, "step": 2769 }, { "epoch": 0.6087912087912087, "grad_norm": 0.242376182426195, "learning_rate": 2.7942382391097324e-07, "loss": 1.4315, "step": 2770 }, { "epoch": 0.609010989010989, "grad_norm": 0.2580739428886733, "learning_rate": 2.7921893404492396e-07, "loss": 1.4545, "step": 2771 }, { "epoch": 0.6092307692307692, "grad_norm": 0.26204011355952916, "learning_rate": 2.790140946083994e-07, "loss": 1.4674, "step": 2772 }, { "epoch": 0.6094505494505494, "grad_norm": 0.25198584266570817, "learning_rate": 2.7880930569905366e-07, "loss": 1.4641, "step": 2773 }, { "epoch": 0.6096703296703296, "grad_norm": 0.28798028115276636, "learning_rate": 2.7860456741451645e-07, "loss": 1.5141, "step": 2774 }, { "epoch": 0.6098901098901099, "grad_norm": 0.24340120570712515, "learning_rate": 2.7839987985239386e-07, "loss": 1.4365, "step": 2775 }, { "epoch": 0.6101098901098901, "grad_norm": 0.2570269674608408, "learning_rate": 2.781952431102675e-07, "loss": 1.4716, "step": 2776 }, { "epoch": 0.6103296703296703, "grad_norm": 0.2387418227608774, "learning_rate": 2.7799065728569497e-07, "loss": 1.4219, "step": 2777 }, { "epoch": 0.6105494505494505, "grad_norm": 0.26350495969877236, "learning_rate": 2.777861224762095e-07, "loss": 1.4539, "step": 2778 }, { "epoch": 0.6107692307692307, "grad_norm": 0.24599371937262848, "learning_rate": 2.7758163877931974e-07, "loss": 1.4554, "step": 2779 }, { "epoch": 0.610989010989011, "grad_norm": 0.2504081196618615, "learning_rate": 2.7737720629251027e-07, "loss": 1.4543, "step": 2780 }, { "epoch": 0.6112087912087912, "grad_norm": 0.2543328826681762, "learning_rate": 2.771728251132414e-07, "loss": 1.4969, "step": 2781 }, { "epoch": 0.6114285714285714, "grad_norm": 0.2466344005097448, "learning_rate": 2.769684953389486e-07, "loss": 1.4792, "step": 2782 }, { "epoch": 0.6116483516483516, "grad_norm": 0.2441171202539346, "learning_rate": 2.76764217067043e-07, "loss": 1.4599, "step": 2783 }, { "epoch": 0.6118681318681318, "grad_norm": 0.25282448539558716, "learning_rate": 2.765599903949112e-07, "loss": 1.4729, "step": 2784 }, { "epoch": 0.6120879120879121, "grad_norm": 0.23823618339636068, "learning_rate": 2.763558154199151e-07, "loss": 1.4864, "step": 2785 }, { "epoch": 0.6123076923076923, "grad_norm": 0.2392802927769334, "learning_rate": 2.761516922393922e-07, "loss": 1.432, "step": 2786 }, { "epoch": 0.6125274725274725, "grad_norm": 0.24138215208784955, "learning_rate": 2.7594762095065507e-07, "loss": 1.4992, "step": 2787 }, { "epoch": 0.6127472527472527, "grad_norm": 0.24723985590687675, "learning_rate": 2.757436016509917e-07, "loss": 1.3818, "step": 2788 }, { "epoch": 0.6129670329670329, "grad_norm": 0.24340468458681574, "learning_rate": 2.75539634437665e-07, "loss": 1.4829, "step": 2789 }, { "epoch": 0.6131868131868132, "grad_norm": 0.3274566947817604, "learning_rate": 2.753357194079133e-07, "loss": 1.4621, "step": 2790 }, { "epoch": 0.6134065934065934, "grad_norm": 0.24919490504124517, "learning_rate": 2.751318566589501e-07, "loss": 1.4903, "step": 2791 }, { "epoch": 0.6136263736263736, "grad_norm": 0.25916066602605464, "learning_rate": 2.74928046287964e-07, "loss": 1.4575, "step": 2792 }, { "epoch": 0.6138461538461538, "grad_norm": 0.24608490593825452, "learning_rate": 2.747242883921182e-07, "loss": 1.5024, "step": 2793 }, { "epoch": 0.614065934065934, "grad_norm": 0.2441503092999619, "learning_rate": 2.7452058306855143e-07, "loss": 1.4518, "step": 2794 }, { "epoch": 0.6142857142857143, "grad_norm": 0.2592931844175535, "learning_rate": 2.7431693041437706e-07, "loss": 1.3783, "step": 2795 }, { "epoch": 0.6145054945054945, "grad_norm": 0.2992583932379197, "learning_rate": 2.741133305266834e-07, "loss": 1.4372, "step": 2796 }, { "epoch": 0.6147252747252747, "grad_norm": 0.24221330064764257, "learning_rate": 2.7390978350253374e-07, "loss": 1.4672, "step": 2797 }, { "epoch": 0.6149450549450549, "grad_norm": 0.28105113913083424, "learning_rate": 2.737062894389659e-07, "loss": 1.465, "step": 2798 }, { "epoch": 0.6151648351648351, "grad_norm": 0.2665490478140696, "learning_rate": 2.735028484329926e-07, "loss": 1.4044, "step": 2799 }, { "epoch": 0.6153846153846154, "grad_norm": 0.24423573775881333, "learning_rate": 2.7329946058160124e-07, "loss": 1.4441, "step": 2800 }, { "epoch": 0.6156043956043956, "grad_norm": 0.24854542416073258, "learning_rate": 2.730961259817542e-07, "loss": 1.5131, "step": 2801 }, { "epoch": 0.6158241758241758, "grad_norm": 0.2568503196130161, "learning_rate": 2.728928447303877e-07, "loss": 1.435, "step": 2802 }, { "epoch": 0.616043956043956, "grad_norm": 0.25751972570130277, "learning_rate": 2.7268961692441346e-07, "loss": 1.4825, "step": 2803 }, { "epoch": 0.6162637362637363, "grad_norm": 0.26758436474037745, "learning_rate": 2.724864426607169e-07, "loss": 1.488, "step": 2804 }, { "epoch": 0.6164835164835165, "grad_norm": 0.25463513002359184, "learning_rate": 2.722833220361586e-07, "loss": 1.4665, "step": 2805 }, { "epoch": 0.6167032967032967, "grad_norm": 0.24033732821235326, "learning_rate": 2.72080255147573e-07, "loss": 1.4403, "step": 2806 }, { "epoch": 0.6169230769230769, "grad_norm": 0.2537429863828503, "learning_rate": 2.718772420917693e-07, "loss": 1.4958, "step": 2807 }, { "epoch": 0.6171428571428571, "grad_norm": 0.3070515269029011, "learning_rate": 2.716742829655309e-07, "loss": 1.533, "step": 2808 }, { "epoch": 0.6173626373626374, "grad_norm": 0.26070131619033404, "learning_rate": 2.714713778656153e-07, "loss": 1.4411, "step": 2809 }, { "epoch": 0.6175824175824176, "grad_norm": 0.2448312377581891, "learning_rate": 2.7126852688875474e-07, "loss": 1.4124, "step": 2810 }, { "epoch": 0.6178021978021978, "grad_norm": 0.24533074910693486, "learning_rate": 2.710657301316552e-07, "loss": 1.4092, "step": 2811 }, { "epoch": 0.618021978021978, "grad_norm": 0.2622020381668056, "learning_rate": 2.70862987690997e-07, "loss": 1.4131, "step": 2812 }, { "epoch": 0.6182417582417582, "grad_norm": 0.24164843532840727, "learning_rate": 2.706602996634345e-07, "loss": 1.4463, "step": 2813 }, { "epoch": 0.6184615384615385, "grad_norm": 0.26821984758842227, "learning_rate": 2.7045766614559613e-07, "loss": 1.4513, "step": 2814 }, { "epoch": 0.6186813186813187, "grad_norm": 0.2527769764049168, "learning_rate": 2.702550872340846e-07, "loss": 1.4803, "step": 2815 }, { "epoch": 0.6189010989010989, "grad_norm": 0.24481534745646374, "learning_rate": 2.7005256302547604e-07, "loss": 1.4737, "step": 2816 }, { "epoch": 0.6191208791208791, "grad_norm": 0.24772386003665042, "learning_rate": 2.698500936163209e-07, "loss": 1.4522, "step": 2817 }, { "epoch": 0.6193406593406593, "grad_norm": 0.2505975677436217, "learning_rate": 2.696476791031434e-07, "loss": 1.5338, "step": 2818 }, { "epoch": 0.6195604395604396, "grad_norm": 0.25392803101198486, "learning_rate": 2.6944531958244165e-07, "loss": 1.5052, "step": 2819 }, { "epoch": 0.6197802197802198, "grad_norm": 0.23309877944596225, "learning_rate": 2.6924301515068745e-07, "loss": 1.4745, "step": 2820 }, { "epoch": 0.62, "grad_norm": 0.26377695945308144, "learning_rate": 2.690407659043264e-07, "loss": 1.5323, "step": 2821 }, { "epoch": 0.6202197802197802, "grad_norm": 0.23633592621511978, "learning_rate": 2.688385719397779e-07, "loss": 1.3892, "step": 2822 }, { "epoch": 0.6204395604395604, "grad_norm": 0.2463089099013308, "learning_rate": 2.686364333534347e-07, "loss": 1.453, "step": 2823 }, { "epoch": 0.6206593406593407, "grad_norm": 0.2534261083630188, "learning_rate": 2.684343502416632e-07, "loss": 1.5113, "step": 2824 }, { "epoch": 0.6208791208791209, "grad_norm": 0.24994646596974002, "learning_rate": 2.682323227008038e-07, "loss": 1.451, "step": 2825 }, { "epoch": 0.6210989010989011, "grad_norm": 0.30067136753213547, "learning_rate": 2.680303508271698e-07, "loss": 1.4584, "step": 2826 }, { "epoch": 0.6213186813186813, "grad_norm": 0.25560280194093127, "learning_rate": 2.678284347170484e-07, "loss": 1.4125, "step": 2827 }, { "epoch": 0.6215384615384615, "grad_norm": 0.2941711091966941, "learning_rate": 2.676265744667e-07, "loss": 1.5021, "step": 2828 }, { "epoch": 0.6217582417582418, "grad_norm": 0.24605546671769116, "learning_rate": 2.6742477017235833e-07, "loss": 1.4621, "step": 2829 }, { "epoch": 0.621978021978022, "grad_norm": 0.2563288605501652, "learning_rate": 2.672230219302307e-07, "loss": 1.4661, "step": 2830 }, { "epoch": 0.6221978021978022, "grad_norm": 0.2483960316808464, "learning_rate": 2.670213298364975e-07, "loss": 1.48, "step": 2831 }, { "epoch": 0.6224175824175824, "grad_norm": 0.24389055588115652, "learning_rate": 2.6681969398731225e-07, "loss": 1.4399, "step": 2832 }, { "epoch": 0.6226373626373627, "grad_norm": 0.2461762059616389, "learning_rate": 2.6661811447880197e-07, "loss": 1.4945, "step": 2833 }, { "epoch": 0.6228571428571429, "grad_norm": 0.2471990459257803, "learning_rate": 2.664165914070665e-07, "loss": 1.522, "step": 2834 }, { "epoch": 0.6230769230769231, "grad_norm": 0.258707408088636, "learning_rate": 2.66215124868179e-07, "loss": 1.5095, "step": 2835 }, { "epoch": 0.6232967032967033, "grad_norm": 0.2507656223835275, "learning_rate": 2.660137149581857e-07, "loss": 1.4768, "step": 2836 }, { "epoch": 0.6235164835164835, "grad_norm": 0.26040518587086814, "learning_rate": 2.658123617731056e-07, "loss": 1.4672, "step": 2837 }, { "epoch": 0.6237362637362638, "grad_norm": 0.36898059415392886, "learning_rate": 2.6561106540893057e-07, "loss": 1.5357, "step": 2838 }, { "epoch": 0.623956043956044, "grad_norm": 0.2652248897625437, "learning_rate": 2.654098259616261e-07, "loss": 1.4572, "step": 2839 }, { "epoch": 0.6241758241758242, "grad_norm": 0.2489821663440469, "learning_rate": 2.6520864352712976e-07, "loss": 1.4843, "step": 2840 }, { "epoch": 0.6243956043956044, "grad_norm": 0.2594474675887204, "learning_rate": 2.650075182013523e-07, "loss": 1.4407, "step": 2841 }, { "epoch": 0.6246153846153846, "grad_norm": 0.25712986434896845, "learning_rate": 2.6480645008017706e-07, "loss": 1.4515, "step": 2842 }, { "epoch": 0.6248351648351649, "grad_norm": 0.2573780357581436, "learning_rate": 2.6460543925946023e-07, "loss": 1.4536, "step": 2843 }, { "epoch": 0.6250549450549451, "grad_norm": 0.30665596993707006, "learning_rate": 2.64404485835031e-07, "loss": 1.4529, "step": 2844 }, { "epoch": 0.6252747252747253, "grad_norm": 0.23967905051755029, "learning_rate": 2.6420358990269047e-07, "loss": 1.5155, "step": 2845 }, { "epoch": 0.6254945054945055, "grad_norm": 0.29993653997854336, "learning_rate": 2.640027515582131e-07, "loss": 1.424, "step": 2846 }, { "epoch": 0.6257142857142857, "grad_norm": 0.24506179692758245, "learning_rate": 2.6380197089734514e-07, "loss": 1.456, "step": 2847 }, { "epoch": 0.625934065934066, "grad_norm": 0.24575536037400184, "learning_rate": 2.6360124801580614e-07, "loss": 1.4429, "step": 2848 }, { "epoch": 0.6261538461538462, "grad_norm": 0.24513992249774819, "learning_rate": 2.634005830092874e-07, "loss": 1.4885, "step": 2849 }, { "epoch": 0.6263736263736264, "grad_norm": 0.2447623203675496, "learning_rate": 2.6319997597345303e-07, "loss": 1.4452, "step": 2850 }, { "epoch": 0.6265934065934066, "grad_norm": 0.24424616248784922, "learning_rate": 2.6299942700393944e-07, "loss": 1.4282, "step": 2851 }, { "epoch": 0.6268131868131868, "grad_norm": 0.2393103125713913, "learning_rate": 2.627989361963554e-07, "loss": 1.4944, "step": 2852 }, { "epoch": 0.6270329670329671, "grad_norm": 0.24372315941926556, "learning_rate": 2.6259850364628164e-07, "loss": 1.4597, "step": 2853 }, { "epoch": 0.6272527472527473, "grad_norm": 0.2576497581670632, "learning_rate": 2.623981294492716e-07, "loss": 1.4798, "step": 2854 }, { "epoch": 0.6274725274725275, "grad_norm": 0.25662975162566215, "learning_rate": 2.621978137008504e-07, "loss": 1.4821, "step": 2855 }, { "epoch": 0.6276923076923077, "grad_norm": 0.24280530676974668, "learning_rate": 2.619975564965158e-07, "loss": 1.4673, "step": 2856 }, { "epoch": 0.6279120879120879, "grad_norm": 0.23964867825847486, "learning_rate": 2.6179735793173716e-07, "loss": 1.499, "step": 2857 }, { "epoch": 0.6281318681318682, "grad_norm": 0.5861739492202969, "learning_rate": 2.615972181019562e-07, "loss": 1.408, "step": 2858 }, { "epoch": 0.6283516483516484, "grad_norm": 0.24986249129666405, "learning_rate": 2.613971371025867e-07, "loss": 1.4493, "step": 2859 }, { "epoch": 0.6285714285714286, "grad_norm": 0.24015338506617812, "learning_rate": 2.611971150290141e-07, "loss": 1.4288, "step": 2860 }, { "epoch": 0.6287912087912088, "grad_norm": 0.2463092478272792, "learning_rate": 2.6099715197659596e-07, "loss": 1.4322, "step": 2861 }, { "epoch": 0.6290109890109891, "grad_norm": 0.2532701143533487, "learning_rate": 2.607972480406615e-07, "loss": 1.4637, "step": 2862 }, { "epoch": 0.6292307692307693, "grad_norm": 0.2552728846307907, "learning_rate": 2.6059740331651217e-07, "loss": 1.4707, "step": 2863 }, { "epoch": 0.6294505494505495, "grad_norm": 0.2553736055829221, "learning_rate": 2.603976178994208e-07, "loss": 1.4731, "step": 2864 }, { "epoch": 0.6296703296703297, "grad_norm": 0.25576625836644434, "learning_rate": 2.6019789188463193e-07, "loss": 1.4295, "step": 2865 }, { "epoch": 0.6298901098901099, "grad_norm": 0.23985643407427887, "learning_rate": 2.599982253673621e-07, "loss": 1.4582, "step": 2866 }, { "epoch": 0.6301098901098902, "grad_norm": 0.25161103928396245, "learning_rate": 2.597986184427991e-07, "loss": 1.4415, "step": 2867 }, { "epoch": 0.6303296703296704, "grad_norm": 0.24200350717316507, "learning_rate": 2.595990712061028e-07, "loss": 1.5023, "step": 2868 }, { "epoch": 0.6305494505494506, "grad_norm": 0.2449181858527255, "learning_rate": 2.5939958375240426e-07, "loss": 1.4137, "step": 2869 }, { "epoch": 0.6307692307692307, "grad_norm": 0.24963028357508124, "learning_rate": 2.59200156176806e-07, "loss": 1.5189, "step": 2870 }, { "epoch": 0.630989010989011, "grad_norm": 0.2544803480741481, "learning_rate": 2.5900078857438207e-07, "loss": 1.4613, "step": 2871 }, { "epoch": 0.6312087912087913, "grad_norm": 0.25710601023148333, "learning_rate": 2.5880148104017794e-07, "loss": 1.4881, "step": 2872 }, { "epoch": 0.6314285714285715, "grad_norm": 0.2660596997956752, "learning_rate": 2.586022336692106e-07, "loss": 1.4239, "step": 2873 }, { "epoch": 0.6316483516483516, "grad_norm": 0.24672583608519805, "learning_rate": 2.584030465564683e-07, "loss": 1.4013, "step": 2874 }, { "epoch": 0.6318681318681318, "grad_norm": 0.25768288683848095, "learning_rate": 2.5820391979691025e-07, "loss": 1.4666, "step": 2875 }, { "epoch": 0.632087912087912, "grad_norm": 0.25865658897524546, "learning_rate": 2.580048534854673e-07, "loss": 1.529, "step": 2876 }, { "epoch": 0.6323076923076923, "grad_norm": 0.24769300741875816, "learning_rate": 2.5780584771704116e-07, "loss": 1.4807, "step": 2877 }, { "epoch": 0.6325274725274725, "grad_norm": 0.2628685955578374, "learning_rate": 2.5760690258650496e-07, "loss": 1.4682, "step": 2878 }, { "epoch": 0.6327472527472527, "grad_norm": 0.2486143115957994, "learning_rate": 2.5740801818870267e-07, "loss": 1.4692, "step": 2879 }, { "epoch": 0.6329670329670329, "grad_norm": 0.30566491823381436, "learning_rate": 2.572091946184496e-07, "loss": 1.4942, "step": 2880 }, { "epoch": 0.6331868131868131, "grad_norm": 0.26502102734665955, "learning_rate": 2.570104319705317e-07, "loss": 1.4832, "step": 2881 }, { "epoch": 0.6334065934065934, "grad_norm": 0.2558474918238458, "learning_rate": 2.56811730339706e-07, "loss": 1.512, "step": 2882 }, { "epoch": 0.6336263736263736, "grad_norm": 0.23618937873946327, "learning_rate": 2.5661308982070065e-07, "loss": 1.4216, "step": 2883 }, { "epoch": 0.6338461538461538, "grad_norm": 0.24607635970302394, "learning_rate": 2.564145105082147e-07, "loss": 1.4578, "step": 2884 }, { "epoch": 0.634065934065934, "grad_norm": 0.37591479413491696, "learning_rate": 2.5621599249691746e-07, "loss": 1.4868, "step": 2885 }, { "epoch": 0.6342857142857142, "grad_norm": 0.25925714314608084, "learning_rate": 2.5601753588144953e-07, "loss": 1.5097, "step": 2886 }, { "epoch": 0.6345054945054945, "grad_norm": 0.25206041921771516, "learning_rate": 2.558191407564221e-07, "loss": 1.3957, "step": 2887 }, { "epoch": 0.6347252747252747, "grad_norm": 0.2388974492827397, "learning_rate": 2.5562080721641715e-07, "loss": 1.5077, "step": 2888 }, { "epoch": 0.6349450549450549, "grad_norm": 0.2352967262259724, "learning_rate": 2.5542253535598717e-07, "loss": 1.4807, "step": 2889 }, { "epoch": 0.6351648351648351, "grad_norm": 0.2446569128586504, "learning_rate": 2.552243252696551e-07, "loss": 1.4399, "step": 2890 }, { "epoch": 0.6353846153846154, "grad_norm": 0.2365234128733907, "learning_rate": 2.550261770519148e-07, "loss": 1.4485, "step": 2891 }, { "epoch": 0.6356043956043956, "grad_norm": 0.24301965126548045, "learning_rate": 2.5482809079723024e-07, "loss": 1.4666, "step": 2892 }, { "epoch": 0.6358241758241758, "grad_norm": 0.24242178670602155, "learning_rate": 2.5463006660003637e-07, "loss": 1.5414, "step": 2893 }, { "epoch": 0.636043956043956, "grad_norm": 0.24207991875232743, "learning_rate": 2.5443210455473786e-07, "loss": 1.4618, "step": 2894 }, { "epoch": 0.6362637362637362, "grad_norm": 0.2567225455233669, "learning_rate": 2.542342047557105e-07, "loss": 1.4766, "step": 2895 }, { "epoch": 0.6364835164835165, "grad_norm": 0.24731626801041443, "learning_rate": 2.5403636729729964e-07, "loss": 1.4465, "step": 2896 }, { "epoch": 0.6367032967032967, "grad_norm": 0.24926976310952406, "learning_rate": 2.538385922738215e-07, "loss": 1.4898, "step": 2897 }, { "epoch": 0.6369230769230769, "grad_norm": 0.23383539155294128, "learning_rate": 2.5364087977956243e-07, "loss": 1.4101, "step": 2898 }, { "epoch": 0.6371428571428571, "grad_norm": 0.251464325919328, "learning_rate": 2.534432299087787e-07, "loss": 1.4793, "step": 2899 }, { "epoch": 0.6373626373626373, "grad_norm": 0.24509098112180402, "learning_rate": 2.5324564275569706e-07, "loss": 1.4745, "step": 2900 }, { "epoch": 0.6375824175824176, "grad_norm": 0.2663807872672918, "learning_rate": 2.5304811841451395e-07, "loss": 1.4217, "step": 2901 }, { "epoch": 0.6378021978021978, "grad_norm": 0.23696124194719534, "learning_rate": 2.5285065697939634e-07, "loss": 1.4688, "step": 2902 }, { "epoch": 0.638021978021978, "grad_norm": 0.2456102335966778, "learning_rate": 2.526532585444808e-07, "loss": 1.5261, "step": 2903 }, { "epoch": 0.6382417582417582, "grad_norm": 0.24972928092226704, "learning_rate": 2.524559232038743e-07, "loss": 1.4612, "step": 2904 }, { "epoch": 0.6384615384615384, "grad_norm": 0.25201685389695233, "learning_rate": 2.522586510516534e-07, "loss": 1.44, "step": 2905 }, { "epoch": 0.6386813186813187, "grad_norm": 0.255514240779577, "learning_rate": 2.5206144218186436e-07, "loss": 1.5007, "step": 2906 }, { "epoch": 0.6389010989010989, "grad_norm": 0.25577694608065626, "learning_rate": 2.5186429668852386e-07, "loss": 1.421, "step": 2907 }, { "epoch": 0.6391208791208791, "grad_norm": 0.25157469910805375, "learning_rate": 2.5166721466561795e-07, "loss": 1.4542, "step": 2908 }, { "epoch": 0.6393406593406593, "grad_norm": 0.25938177309484367, "learning_rate": 2.5147019620710245e-07, "loss": 1.511, "step": 2909 }, { "epoch": 0.6395604395604395, "grad_norm": 0.24075538377136294, "learning_rate": 2.512732414069028e-07, "loss": 1.455, "step": 2910 }, { "epoch": 0.6397802197802198, "grad_norm": 0.259048723673585, "learning_rate": 2.5107635035891437e-07, "loss": 1.4618, "step": 2911 }, { "epoch": 0.64, "grad_norm": 0.25283089613057796, "learning_rate": 2.508795231570021e-07, "loss": 1.4847, "step": 2912 }, { "epoch": 0.6402197802197802, "grad_norm": 0.32406040698220695, "learning_rate": 2.5068275989500023e-07, "loss": 1.5001, "step": 2913 }, { "epoch": 0.6404395604395604, "grad_norm": 0.27582907409618784, "learning_rate": 2.504860606667128e-07, "loss": 1.5091, "step": 2914 }, { "epoch": 0.6406593406593407, "grad_norm": 0.2524924066748553, "learning_rate": 2.5028942556591296e-07, "loss": 1.4731, "step": 2915 }, { "epoch": 0.6408791208791209, "grad_norm": 0.25315946602881495, "learning_rate": 2.5009285468634377e-07, "loss": 1.4732, "step": 2916 }, { "epoch": 0.6410989010989011, "grad_norm": 0.2472126417658668, "learning_rate": 2.498963481217174e-07, "loss": 1.4972, "step": 2917 }, { "epoch": 0.6413186813186813, "grad_norm": 0.26297434701382966, "learning_rate": 2.496999059657151e-07, "loss": 1.3957, "step": 2918 }, { "epoch": 0.6415384615384615, "grad_norm": 0.2540569813398366, "learning_rate": 2.4950352831198806e-07, "loss": 1.4581, "step": 2919 }, { "epoch": 0.6417582417582418, "grad_norm": 0.2592848693101979, "learning_rate": 2.493072152541561e-07, "loss": 1.5138, "step": 2920 }, { "epoch": 0.641978021978022, "grad_norm": 0.3090859085566191, "learning_rate": 2.4911096688580873e-07, "loss": 1.4268, "step": 2921 }, { "epoch": 0.6421978021978022, "grad_norm": 0.23717743116654658, "learning_rate": 2.4891478330050427e-07, "loss": 1.4456, "step": 2922 }, { "epoch": 0.6424175824175824, "grad_norm": 0.25147376891162104, "learning_rate": 2.4871866459177035e-07, "loss": 1.4794, "step": 2923 }, { "epoch": 0.6426373626373626, "grad_norm": 0.2460244860737173, "learning_rate": 2.485226108531034e-07, "loss": 1.3746, "step": 2924 }, { "epoch": 0.6428571428571429, "grad_norm": 0.25053272659868825, "learning_rate": 2.483266221779692e-07, "loss": 1.4533, "step": 2925 }, { "epoch": 0.6430769230769231, "grad_norm": 0.24555094406174524, "learning_rate": 2.4813069865980243e-07, "loss": 1.4907, "step": 2926 }, { "epoch": 0.6432967032967033, "grad_norm": 0.2508731121060695, "learning_rate": 2.479348403920067e-07, "loss": 1.4715, "step": 2927 }, { "epoch": 0.6435164835164835, "grad_norm": 0.2694951088347539, "learning_rate": 2.4773904746795437e-07, "loss": 1.4947, "step": 2928 }, { "epoch": 0.6437362637362637, "grad_norm": 0.2503538129609662, "learning_rate": 2.475433199809869e-07, "loss": 1.4644, "step": 2929 }, { "epoch": 0.643956043956044, "grad_norm": 0.24790099011997746, "learning_rate": 2.4734765802441417e-07, "loss": 1.4962, "step": 2930 }, { "epoch": 0.6441758241758242, "grad_norm": 0.2535367449362057, "learning_rate": 2.4715206169151537e-07, "loss": 1.5242, "step": 2931 }, { "epoch": 0.6443956043956044, "grad_norm": 0.24765899273957198, "learning_rate": 2.4695653107553787e-07, "loss": 1.3793, "step": 2932 }, { "epoch": 0.6446153846153846, "grad_norm": 0.2568970873517436, "learning_rate": 2.467610662696979e-07, "loss": 1.397, "step": 2933 }, { "epoch": 0.6448351648351648, "grad_norm": 0.24359852655681025, "learning_rate": 2.4656566736718055e-07, "loss": 1.4776, "step": 2934 }, { "epoch": 0.6450549450549451, "grad_norm": 0.2521438960939031, "learning_rate": 2.463703344611391e-07, "loss": 1.5206, "step": 2935 }, { "epoch": 0.6452747252747253, "grad_norm": 0.2533804782220817, "learning_rate": 2.4617506764469583e-07, "loss": 1.431, "step": 2936 }, { "epoch": 0.6454945054945055, "grad_norm": 0.24411108105400223, "learning_rate": 2.4597986701094094e-07, "loss": 1.4873, "step": 2937 }, { "epoch": 0.6457142857142857, "grad_norm": 0.2749327049552055, "learning_rate": 2.457847326529337e-07, "loss": 1.4897, "step": 2938 }, { "epoch": 0.6459340659340659, "grad_norm": 0.2556229087342906, "learning_rate": 2.455896646637011e-07, "loss": 1.4577, "step": 2939 }, { "epoch": 0.6461538461538462, "grad_norm": 0.2429621105813027, "learning_rate": 2.45394663136239e-07, "loss": 1.4333, "step": 2940 }, { "epoch": 0.6463736263736264, "grad_norm": 0.28905077326746476, "learning_rate": 2.4519972816351147e-07, "loss": 1.4174, "step": 2941 }, { "epoch": 0.6465934065934066, "grad_norm": 0.2692161782606739, "learning_rate": 2.450048598384509e-07, "loss": 1.464, "step": 2942 }, { "epoch": 0.6468131868131868, "grad_norm": 0.2599451377641455, "learning_rate": 2.4481005825395777e-07, "loss": 1.4712, "step": 2943 }, { "epoch": 0.6470329670329671, "grad_norm": 0.25847217362042313, "learning_rate": 2.446153235029006e-07, "loss": 1.5089, "step": 2944 }, { "epoch": 0.6472527472527473, "grad_norm": 0.25025000456975677, "learning_rate": 2.4442065567811653e-07, "loss": 1.4325, "step": 2945 }, { "epoch": 0.6474725274725275, "grad_norm": 0.3062559101992876, "learning_rate": 2.4422605487241036e-07, "loss": 1.4408, "step": 2946 }, { "epoch": 0.6476923076923077, "grad_norm": 0.2544567029041009, "learning_rate": 2.440315211785551e-07, "loss": 1.5209, "step": 2947 }, { "epoch": 0.6479120879120879, "grad_norm": 0.2517821459407482, "learning_rate": 2.438370546892916e-07, "loss": 1.4349, "step": 2948 }, { "epoch": 0.6481318681318682, "grad_norm": 0.25776996022413107, "learning_rate": 2.4364265549732907e-07, "loss": 1.453, "step": 2949 }, { "epoch": 0.6483516483516484, "grad_norm": 0.2627200763149241, "learning_rate": 2.434483236953441e-07, "loss": 1.4635, "step": 2950 }, { "epoch": 0.6485714285714286, "grad_norm": 0.24327742804716065, "learning_rate": 2.4325405937598185e-07, "loss": 1.4517, "step": 2951 }, { "epoch": 0.6487912087912088, "grad_norm": 0.2504827525899091, "learning_rate": 2.430598626318546e-07, "loss": 1.4397, "step": 2952 }, { "epoch": 0.649010989010989, "grad_norm": 0.2425369524665089, "learning_rate": 2.4286573355554274e-07, "loss": 1.4879, "step": 2953 }, { "epoch": 0.6492307692307693, "grad_norm": 0.25325488583168404, "learning_rate": 2.4267167223959435e-07, "loss": 1.4791, "step": 2954 }, { "epoch": 0.6494505494505495, "grad_norm": 0.25234062890244585, "learning_rate": 2.424776787765254e-07, "loss": 1.5006, "step": 2955 }, { "epoch": 0.6496703296703297, "grad_norm": 0.24206717094628213, "learning_rate": 2.42283753258819e-07, "loss": 1.446, "step": 2956 }, { "epoch": 0.6498901098901099, "grad_norm": 0.24610605739098584, "learning_rate": 2.420898957789267e-07, "loss": 1.4242, "step": 2957 }, { "epoch": 0.6501098901098901, "grad_norm": 0.27691333257297623, "learning_rate": 2.4189610642926674e-07, "loss": 1.4453, "step": 2958 }, { "epoch": 0.6503296703296704, "grad_norm": 0.24930504957336627, "learning_rate": 2.4170238530222535e-07, "loss": 1.4866, "step": 2959 }, { "epoch": 0.6505494505494506, "grad_norm": 0.255335178655049, "learning_rate": 2.4150873249015626e-07, "loss": 1.5051, "step": 2960 }, { "epoch": 0.6507692307692308, "grad_norm": 0.24921237889027203, "learning_rate": 2.413151480853805e-07, "loss": 1.4839, "step": 2961 }, { "epoch": 0.650989010989011, "grad_norm": 0.23625646421617658, "learning_rate": 2.4112163218018643e-07, "loss": 1.4502, "step": 2962 }, { "epoch": 0.6512087912087912, "grad_norm": 0.26607537504914247, "learning_rate": 2.409281848668298e-07, "loss": 1.527, "step": 2963 }, { "epoch": 0.6514285714285715, "grad_norm": 0.37042601653975316, "learning_rate": 2.407348062375338e-07, "loss": 1.4136, "step": 2964 }, { "epoch": 0.6516483516483517, "grad_norm": 0.24518878874398756, "learning_rate": 2.405414963844886e-07, "loss": 1.5283, "step": 2965 }, { "epoch": 0.6518681318681319, "grad_norm": 0.2598143109317973, "learning_rate": 2.40348255399852e-07, "loss": 1.4527, "step": 2966 }, { "epoch": 0.6520879120879121, "grad_norm": 0.25024726976257977, "learning_rate": 2.401550833757486e-07, "loss": 1.4782, "step": 2967 }, { "epoch": 0.6523076923076923, "grad_norm": 0.26792275526618126, "learning_rate": 2.3996198040427027e-07, "loss": 1.4535, "step": 2968 }, { "epoch": 0.6525274725274726, "grad_norm": 0.3656491138960403, "learning_rate": 2.397689465774757e-07, "loss": 1.4651, "step": 2969 }, { "epoch": 0.6527472527472528, "grad_norm": 0.24484044654345521, "learning_rate": 2.3957598198739124e-07, "loss": 1.4874, "step": 2970 }, { "epoch": 0.652967032967033, "grad_norm": 0.25598445315108104, "learning_rate": 2.3938308672600956e-07, "loss": 1.4653, "step": 2971 }, { "epoch": 0.6531868131868132, "grad_norm": 0.25291595516474, "learning_rate": 2.391902608852906e-07, "loss": 1.438, "step": 2972 }, { "epoch": 0.6534065934065935, "grad_norm": 0.2573687623226722, "learning_rate": 2.3899750455716123e-07, "loss": 1.4504, "step": 2973 }, { "epoch": 0.6536263736263737, "grad_norm": 0.2640884074388109, "learning_rate": 2.388048178335152e-07, "loss": 1.4663, "step": 2974 }, { "epoch": 0.6538461538461539, "grad_norm": 0.24004062990968525, "learning_rate": 2.386122008062129e-07, "loss": 1.396, "step": 2975 }, { "epoch": 0.654065934065934, "grad_norm": 0.25327911391656, "learning_rate": 2.3841965356708157e-07, "loss": 1.5331, "step": 2976 }, { "epoch": 0.6542857142857142, "grad_norm": 0.24644399792474514, "learning_rate": 2.3822717620791528e-07, "loss": 1.442, "step": 2977 }, { "epoch": 0.6545054945054946, "grad_norm": 0.24948516403498913, "learning_rate": 2.3803476882047442e-07, "loss": 1.4755, "step": 2978 }, { "epoch": 0.6547252747252748, "grad_norm": 0.24073357890270214, "learning_rate": 2.3784243149648666e-07, "loss": 1.4767, "step": 2979 }, { "epoch": 0.654945054945055, "grad_norm": 0.283618267596697, "learning_rate": 2.376501643276457e-07, "loss": 1.4586, "step": 2980 }, { "epoch": 0.6551648351648351, "grad_norm": 0.2468768745864606, "learning_rate": 2.3745796740561218e-07, "loss": 1.4543, "step": 2981 }, { "epoch": 0.6553846153846153, "grad_norm": 0.2568937663514108, "learning_rate": 2.3726584082201295e-07, "loss": 1.492, "step": 2982 }, { "epoch": 0.6556043956043957, "grad_norm": 0.2618632259781446, "learning_rate": 2.3707378466844135e-07, "loss": 1.4513, "step": 2983 }, { "epoch": 0.6558241758241758, "grad_norm": 0.25599881617870923, "learning_rate": 2.3688179903645752e-07, "loss": 1.5482, "step": 2984 }, { "epoch": 0.656043956043956, "grad_norm": 0.2571918931916502, "learning_rate": 2.366898840175875e-07, "loss": 1.4192, "step": 2985 }, { "epoch": 0.6562637362637362, "grad_norm": 0.2412963502691558, "learning_rate": 2.3649803970332405e-07, "loss": 1.5063, "step": 2986 }, { "epoch": 0.6564835164835164, "grad_norm": 0.23989128783862962, "learning_rate": 2.3630626618512578e-07, "loss": 1.4124, "step": 2987 }, { "epoch": 0.6567032967032967, "grad_norm": 0.24540190318494906, "learning_rate": 2.3611456355441795e-07, "loss": 1.4372, "step": 2988 }, { "epoch": 0.6569230769230769, "grad_norm": 0.2498617140330551, "learning_rate": 2.3592293190259204e-07, "loss": 1.4829, "step": 2989 }, { "epoch": 0.6571428571428571, "grad_norm": 0.2523377042802845, "learning_rate": 2.3573137132100543e-07, "loss": 1.4521, "step": 2990 }, { "epoch": 0.6573626373626373, "grad_norm": 0.24784159630196054, "learning_rate": 2.3553988190098174e-07, "loss": 1.5054, "step": 2991 }, { "epoch": 0.6575824175824175, "grad_norm": 0.2473577182913839, "learning_rate": 2.3534846373381062e-07, "loss": 1.554, "step": 2992 }, { "epoch": 0.6578021978021978, "grad_norm": 0.2465708419356356, "learning_rate": 2.351571169107477e-07, "loss": 1.4286, "step": 2993 }, { "epoch": 0.658021978021978, "grad_norm": 0.2570133814109051, "learning_rate": 2.349658415230149e-07, "loss": 1.469, "step": 2994 }, { "epoch": 0.6582417582417582, "grad_norm": 0.24891790017703455, "learning_rate": 2.3477463766179964e-07, "loss": 1.4992, "step": 2995 }, { "epoch": 0.6584615384615384, "grad_norm": 0.24462414762608164, "learning_rate": 2.3458350541825577e-07, "loss": 1.4019, "step": 2996 }, { "epoch": 0.6586813186813186, "grad_norm": 0.2545435603975285, "learning_rate": 2.3439244488350255e-07, "loss": 1.4369, "step": 2997 }, { "epoch": 0.6589010989010989, "grad_norm": 0.24277545462796876, "learning_rate": 2.3420145614862501e-07, "loss": 1.4799, "step": 2998 }, { "epoch": 0.6591208791208791, "grad_norm": 0.25252291174055863, "learning_rate": 2.3401053930467438e-07, "loss": 1.4394, "step": 2999 }, { "epoch": 0.6593406593406593, "grad_norm": 0.2578725036009162, "learning_rate": 2.3381969444266735e-07, "loss": 1.4569, "step": 3000 }, { "epoch": 0.6595604395604395, "grad_norm": 0.24158085775251778, "learning_rate": 2.3362892165358633e-07, "loss": 1.3909, "step": 3001 }, { "epoch": 0.6597802197802198, "grad_norm": 0.25588298250955477, "learning_rate": 2.3343822102837913e-07, "loss": 1.5275, "step": 3002 }, { "epoch": 0.66, "grad_norm": 0.24381431471343762, "learning_rate": 2.3324759265795963e-07, "loss": 1.4082, "step": 3003 }, { "epoch": 0.6602197802197802, "grad_norm": 0.2536951003775051, "learning_rate": 2.330570366332071e-07, "loss": 1.4535, "step": 3004 }, { "epoch": 0.6604395604395604, "grad_norm": 0.26507366301011936, "learning_rate": 2.328665530449661e-07, "loss": 1.4707, "step": 3005 }, { "epoch": 0.6606593406593406, "grad_norm": 0.24127812506832325, "learning_rate": 2.3267614198404682e-07, "loss": 1.4423, "step": 3006 }, { "epoch": 0.6608791208791209, "grad_norm": 0.2557160869294038, "learning_rate": 2.3248580354122476e-07, "loss": 1.4679, "step": 3007 }, { "epoch": 0.6610989010989011, "grad_norm": 0.24916186623065067, "learning_rate": 2.3229553780724108e-07, "loss": 1.4679, "step": 3008 }, { "epoch": 0.6613186813186813, "grad_norm": 0.25841460090363655, "learning_rate": 2.3210534487280205e-07, "loss": 1.4534, "step": 3009 }, { "epoch": 0.6615384615384615, "grad_norm": 0.2376517574484778, "learning_rate": 2.3191522482857908e-07, "loss": 1.4277, "step": 3010 }, { "epoch": 0.6617582417582417, "grad_norm": 0.29047109814999295, "learning_rate": 2.3172517776520933e-07, "loss": 1.5039, "step": 3011 }, { "epoch": 0.661978021978022, "grad_norm": 0.2854690411185694, "learning_rate": 2.3153520377329458e-07, "loss": 1.4527, "step": 3012 }, { "epoch": 0.6621978021978022, "grad_norm": 0.25168645703946924, "learning_rate": 2.3134530294340232e-07, "loss": 1.4274, "step": 3013 }, { "epoch": 0.6624175824175824, "grad_norm": 0.24783582545489324, "learning_rate": 2.3115547536606478e-07, "loss": 1.5132, "step": 3014 }, { "epoch": 0.6626373626373626, "grad_norm": 0.24510804262015817, "learning_rate": 2.3096572113177937e-07, "loss": 1.5068, "step": 3015 }, { "epoch": 0.6628571428571428, "grad_norm": 0.2555103275469262, "learning_rate": 2.307760403310086e-07, "loss": 1.4884, "step": 3016 }, { "epoch": 0.6630769230769231, "grad_norm": 0.24574180794453893, "learning_rate": 2.3058643305417975e-07, "loss": 1.4446, "step": 3017 }, { "epoch": 0.6632967032967033, "grad_norm": 0.2447077574083961, "learning_rate": 2.3039689939168542e-07, "loss": 1.5006, "step": 3018 }, { "epoch": 0.6635164835164835, "grad_norm": 0.34544343390585736, "learning_rate": 2.3020743943388296e-07, "loss": 1.511, "step": 3019 }, { "epoch": 0.6637362637362637, "grad_norm": 0.250237284650321, "learning_rate": 2.3001805327109445e-07, "loss": 1.4755, "step": 3020 }, { "epoch": 0.6639560439560439, "grad_norm": 0.24431944270074715, "learning_rate": 2.2982874099360686e-07, "loss": 1.5218, "step": 3021 }, { "epoch": 0.6641758241758242, "grad_norm": 0.24619630632580405, "learning_rate": 2.296395026916719e-07, "loss": 1.5287, "step": 3022 }, { "epoch": 0.6643956043956044, "grad_norm": 0.2421460507560475, "learning_rate": 2.2945033845550628e-07, "loss": 1.4595, "step": 3023 }, { "epoch": 0.6646153846153846, "grad_norm": 0.2829390316329465, "learning_rate": 2.29261248375291e-07, "loss": 1.4412, "step": 3024 }, { "epoch": 0.6648351648351648, "grad_norm": 0.25538296977530495, "learning_rate": 2.2907223254117183e-07, "loss": 1.3967, "step": 3025 }, { "epoch": 0.665054945054945, "grad_norm": 0.3169874856464177, "learning_rate": 2.2888329104325956e-07, "loss": 1.5162, "step": 3026 }, { "epoch": 0.6652747252747253, "grad_norm": 0.24591002130587566, "learning_rate": 2.2869442397162882e-07, "loss": 1.436, "step": 3027 }, { "epoch": 0.6654945054945055, "grad_norm": 1.2438378204165965, "learning_rate": 2.2850563141631935e-07, "loss": 1.5281, "step": 3028 }, { "epoch": 0.6657142857142857, "grad_norm": 0.23552149623893676, "learning_rate": 2.283169134673351e-07, "loss": 1.4341, "step": 3029 }, { "epoch": 0.6659340659340659, "grad_norm": 0.43744069766685717, "learning_rate": 2.2812827021464457e-07, "loss": 1.4989, "step": 3030 }, { "epoch": 0.6661538461538462, "grad_norm": 0.27395660410861833, "learning_rate": 2.2793970174818025e-07, "loss": 1.4418, "step": 3031 }, { "epoch": 0.6663736263736264, "grad_norm": 0.2605035534891694, "learning_rate": 2.2775120815783972e-07, "loss": 1.4687, "step": 3032 }, { "epoch": 0.6665934065934066, "grad_norm": 0.29914621715832324, "learning_rate": 2.275627895334842e-07, "loss": 1.4947, "step": 3033 }, { "epoch": 0.6668131868131868, "grad_norm": 0.24979565628141243, "learning_rate": 2.273744459649396e-07, "loss": 1.4918, "step": 3034 }, { "epoch": 0.667032967032967, "grad_norm": 0.24666021686537823, "learning_rate": 2.2718617754199578e-07, "loss": 1.427, "step": 3035 }, { "epoch": 0.6672527472527473, "grad_norm": 0.26473206776587377, "learning_rate": 2.269979843544067e-07, "loss": 1.3772, "step": 3036 }, { "epoch": 0.6674725274725275, "grad_norm": 0.24762613360330796, "learning_rate": 2.268098664918909e-07, "loss": 1.4845, "step": 3037 }, { "epoch": 0.6676923076923077, "grad_norm": 0.2755099801753309, "learning_rate": 2.2662182404413068e-07, "loss": 1.5255, "step": 3038 }, { "epoch": 0.6679120879120879, "grad_norm": 0.24234455204365682, "learning_rate": 2.2643385710077234e-07, "loss": 1.5065, "step": 3039 }, { "epoch": 0.6681318681318681, "grad_norm": 0.24716690624443044, "learning_rate": 2.2624596575142615e-07, "loss": 1.43, "step": 3040 }, { "epoch": 0.6683516483516484, "grad_norm": 0.25925718573281836, "learning_rate": 2.2605815008566662e-07, "loss": 1.5404, "step": 3041 }, { "epoch": 0.6685714285714286, "grad_norm": 0.24592395223821256, "learning_rate": 2.2587041019303217e-07, "loss": 1.5795, "step": 3042 }, { "epoch": 0.6687912087912088, "grad_norm": 0.4111149414258364, "learning_rate": 2.2568274616302478e-07, "loss": 1.4284, "step": 3043 }, { "epoch": 0.669010989010989, "grad_norm": 0.2681271119853714, "learning_rate": 2.254951580851105e-07, "loss": 1.5025, "step": 3044 }, { "epoch": 0.6692307692307692, "grad_norm": 0.2564820006540057, "learning_rate": 2.2530764604871915e-07, "loss": 1.4669, "step": 3045 }, { "epoch": 0.6694505494505495, "grad_norm": 0.23993998136149258, "learning_rate": 2.2512021014324398e-07, "loss": 1.4381, "step": 3046 }, { "epoch": 0.6696703296703297, "grad_norm": 0.24597328410825253, "learning_rate": 2.2493285045804263e-07, "loss": 1.4429, "step": 3047 }, { "epoch": 0.6698901098901099, "grad_norm": 0.2442446102577392, "learning_rate": 2.2474556708243567e-07, "loss": 1.4278, "step": 3048 }, { "epoch": 0.6701098901098901, "grad_norm": 0.2660171934620519, "learning_rate": 2.245583601057079e-07, "loss": 1.4914, "step": 3049 }, { "epoch": 0.6703296703296703, "grad_norm": 0.24590583067691427, "learning_rate": 2.2437122961710723e-07, "loss": 1.4517, "step": 3050 }, { "epoch": 0.6705494505494506, "grad_norm": 0.27129946221444673, "learning_rate": 2.2418417570584525e-07, "loss": 1.524, "step": 3051 }, { "epoch": 0.6707692307692308, "grad_norm": 0.25371149060523107, "learning_rate": 2.2399719846109736e-07, "loss": 1.5097, "step": 3052 }, { "epoch": 0.670989010989011, "grad_norm": 0.25645313108582296, "learning_rate": 2.238102979720019e-07, "loss": 1.4682, "step": 3053 }, { "epoch": 0.6712087912087912, "grad_norm": 0.25008491242513803, "learning_rate": 2.2362347432766087e-07, "loss": 1.4428, "step": 3054 }, { "epoch": 0.6714285714285714, "grad_norm": 0.24918179303638485, "learning_rate": 2.2343672761713958e-07, "loss": 1.4284, "step": 3055 }, { "epoch": 0.6716483516483517, "grad_norm": 0.2641300455781301, "learning_rate": 2.2325005792946676e-07, "loss": 1.4456, "step": 3056 }, { "epoch": 0.6718681318681319, "grad_norm": 0.25355982281227657, "learning_rate": 2.2306346535363454e-07, "loss": 1.4435, "step": 3057 }, { "epoch": 0.6720879120879121, "grad_norm": 0.23953670821996706, "learning_rate": 2.2287694997859788e-07, "loss": 1.4761, "step": 3058 }, { "epoch": 0.6723076923076923, "grad_norm": 0.26255369200191203, "learning_rate": 2.2269051189327533e-07, "loss": 1.4265, "step": 3059 }, { "epoch": 0.6725274725274726, "grad_norm": 0.30347224252128724, "learning_rate": 2.2250415118654814e-07, "loss": 1.4215, "step": 3060 }, { "epoch": 0.6727472527472528, "grad_norm": 0.23973036656762103, "learning_rate": 2.2231786794726128e-07, "loss": 1.4776, "step": 3061 }, { "epoch": 0.672967032967033, "grad_norm": 0.2696345520772091, "learning_rate": 2.2213166226422238e-07, "loss": 1.452, "step": 3062 }, { "epoch": 0.6731868131868132, "grad_norm": 0.2364206023120924, "learning_rate": 2.2194553422620203e-07, "loss": 1.4749, "step": 3063 }, { "epoch": 0.6734065934065934, "grad_norm": 0.24853105640171716, "learning_rate": 2.2175948392193416e-07, "loss": 1.4835, "step": 3064 }, { "epoch": 0.6736263736263737, "grad_norm": 0.2512307002156439, "learning_rate": 2.215735114401153e-07, "loss": 1.4624, "step": 3065 }, { "epoch": 0.6738461538461539, "grad_norm": 0.25526373880125336, "learning_rate": 2.2138761686940516e-07, "loss": 1.5062, "step": 3066 }, { "epoch": 0.6740659340659341, "grad_norm": 0.24535367297223465, "learning_rate": 2.212018002984261e-07, "loss": 1.4196, "step": 3067 }, { "epoch": 0.6742857142857143, "grad_norm": 0.2574168943162774, "learning_rate": 2.2101606181576338e-07, "loss": 1.4473, "step": 3068 }, { "epoch": 0.6745054945054945, "grad_norm": 0.26110707797614324, "learning_rate": 2.2083040150996495e-07, "loss": 1.534, "step": 3069 }, { "epoch": 0.6747252747252748, "grad_norm": 0.2525884240847623, "learning_rate": 2.206448194695415e-07, "loss": 1.4394, "step": 3070 }, { "epoch": 0.674945054945055, "grad_norm": 0.24358300659593868, "learning_rate": 2.2045931578296653e-07, "loss": 1.5019, "step": 3071 }, { "epoch": 0.6751648351648352, "grad_norm": 0.2434315489452012, "learning_rate": 2.2027389053867627e-07, "loss": 1.4753, "step": 3072 }, { "epoch": 0.6753846153846154, "grad_norm": 0.3123842926702313, "learning_rate": 2.2008854382506927e-07, "loss": 1.4943, "step": 3073 }, { "epoch": 0.6756043956043956, "grad_norm": 0.24783645902626378, "learning_rate": 2.199032757305068e-07, "loss": 1.4517, "step": 3074 }, { "epoch": 0.6758241758241759, "grad_norm": 0.2461502951028342, "learning_rate": 2.1971808634331246e-07, "loss": 1.4843, "step": 3075 }, { "epoch": 0.6760439560439561, "grad_norm": 0.24477843238407343, "learning_rate": 2.1953297575177276e-07, "loss": 1.5018, "step": 3076 }, { "epoch": 0.6762637362637363, "grad_norm": 0.24202176622759503, "learning_rate": 2.193479440441363e-07, "loss": 1.3793, "step": 3077 }, { "epoch": 0.6764835164835165, "grad_norm": 0.2553382699920082, "learning_rate": 2.1916299130861392e-07, "loss": 1.5, "step": 3078 }, { "epoch": 0.6767032967032967, "grad_norm": 0.2590305026210981, "learning_rate": 2.1897811763337939e-07, "loss": 1.49, "step": 3079 }, { "epoch": 0.676923076923077, "grad_norm": 0.26940706711649237, "learning_rate": 2.1879332310656815e-07, "loss": 1.5435, "step": 3080 }, { "epoch": 0.6771428571428572, "grad_norm": 0.23610551384248166, "learning_rate": 2.1860860781627842e-07, "loss": 1.4784, "step": 3081 }, { "epoch": 0.6773626373626374, "grad_norm": 0.25544121629166344, "learning_rate": 2.1842397185057037e-07, "loss": 1.4369, "step": 3082 }, { "epoch": 0.6775824175824176, "grad_norm": 0.25572194309840396, "learning_rate": 2.1823941529746632e-07, "loss": 1.4368, "step": 3083 }, { "epoch": 0.6778021978021977, "grad_norm": 0.2508399245102378, "learning_rate": 2.180549382449509e-07, "loss": 1.5001, "step": 3084 }, { "epoch": 0.6780219780219781, "grad_norm": 0.2445312473823318, "learning_rate": 2.1787054078097058e-07, "loss": 1.4513, "step": 3085 }, { "epoch": 0.6782417582417583, "grad_norm": 0.2506420509392925, "learning_rate": 2.1768622299343422e-07, "loss": 1.4933, "step": 3086 }, { "epoch": 0.6784615384615384, "grad_norm": 0.25456041414152586, "learning_rate": 2.1750198497021261e-07, "loss": 1.4729, "step": 3087 }, { "epoch": 0.6786813186813186, "grad_norm": 0.25097074076004683, "learning_rate": 2.1731782679913836e-07, "loss": 1.4494, "step": 3088 }, { "epoch": 0.678901098901099, "grad_norm": 0.24085724506774758, "learning_rate": 2.17133748568006e-07, "loss": 1.4898, "step": 3089 }, { "epoch": 0.6791208791208792, "grad_norm": 0.24075355484106975, "learning_rate": 2.1694975036457223e-07, "loss": 1.4266, "step": 3090 }, { "epoch": 0.6793406593406593, "grad_norm": 0.25466765182508455, "learning_rate": 2.167658322765553e-07, "loss": 1.4302, "step": 3091 }, { "epoch": 0.6795604395604395, "grad_norm": 0.24993986538291837, "learning_rate": 2.1658199439163543e-07, "loss": 1.4778, "step": 3092 }, { "epoch": 0.6797802197802197, "grad_norm": 0.24157887696162691, "learning_rate": 2.1639823679745443e-07, "loss": 1.4915, "step": 3093 }, { "epoch": 0.68, "grad_norm": 0.25949850421442, "learning_rate": 2.16214559581616e-07, "loss": 1.459, "step": 3094 }, { "epoch": 0.6802197802197802, "grad_norm": 0.23773255396933846, "learning_rate": 2.1603096283168567e-07, "loss": 1.4366, "step": 3095 }, { "epoch": 0.6804395604395604, "grad_norm": 0.24714077246063373, "learning_rate": 2.1584744663519024e-07, "loss": 1.475, "step": 3096 }, { "epoch": 0.6806593406593406, "grad_norm": 0.24648925557271095, "learning_rate": 2.156640110796183e-07, "loss": 1.4609, "step": 3097 }, { "epoch": 0.6808791208791208, "grad_norm": 0.2594198583463061, "learning_rate": 2.1548065625242005e-07, "loss": 1.514, "step": 3098 }, { "epoch": 0.6810989010989011, "grad_norm": 0.29286364110074287, "learning_rate": 2.152973822410069e-07, "loss": 1.4118, "step": 3099 }, { "epoch": 0.6813186813186813, "grad_norm": 0.2423648161204909, "learning_rate": 2.1511418913275227e-07, "loss": 1.4327, "step": 3100 }, { "epoch": 0.6815384615384615, "grad_norm": 0.23993786329091552, "learning_rate": 2.1493107701499052e-07, "loss": 1.5077, "step": 3101 }, { "epoch": 0.6817582417582417, "grad_norm": 0.3411989665514392, "learning_rate": 2.1474804597501768e-07, "loss": 1.4289, "step": 3102 }, { "epoch": 0.6819780219780219, "grad_norm": 0.25536913747651535, "learning_rate": 2.1456509610009107e-07, "loss": 1.3766, "step": 3103 }, { "epoch": 0.6821978021978022, "grad_norm": 0.2532263164782981, "learning_rate": 2.14382227477429e-07, "loss": 1.4468, "step": 3104 }, { "epoch": 0.6824175824175824, "grad_norm": 0.24029046044235383, "learning_rate": 2.1419944019421166e-07, "loss": 1.5588, "step": 3105 }, { "epoch": 0.6826373626373626, "grad_norm": 0.23969928997050355, "learning_rate": 2.1401673433758002e-07, "loss": 1.4366, "step": 3106 }, { "epoch": 0.6828571428571428, "grad_norm": 0.2585854322539309, "learning_rate": 2.138341099946363e-07, "loss": 1.4755, "step": 3107 }, { "epoch": 0.683076923076923, "grad_norm": 0.2442193789475446, "learning_rate": 2.136515672524438e-07, "loss": 1.5113, "step": 3108 }, { "epoch": 0.6832967032967033, "grad_norm": 0.27456272290932193, "learning_rate": 2.1346910619802707e-07, "loss": 1.4706, "step": 3109 }, { "epoch": 0.6835164835164835, "grad_norm": 0.24814010870811984, "learning_rate": 2.1328672691837182e-07, "loss": 1.4417, "step": 3110 }, { "epoch": 0.6837362637362637, "grad_norm": 0.23617749416609635, "learning_rate": 2.1310442950042455e-07, "loss": 1.4359, "step": 3111 }, { "epoch": 0.6839560439560439, "grad_norm": 0.24831668058757458, "learning_rate": 2.1292221403109274e-07, "loss": 1.4452, "step": 3112 }, { "epoch": 0.6841758241758241, "grad_norm": 0.25205386687745585, "learning_rate": 2.127400805972448e-07, "loss": 1.457, "step": 3113 }, { "epoch": 0.6843956043956044, "grad_norm": 0.2577114555677272, "learning_rate": 2.1255802928571008e-07, "loss": 1.4333, "step": 3114 }, { "epoch": 0.6846153846153846, "grad_norm": 0.24382561172249276, "learning_rate": 2.1237606018327903e-07, "loss": 1.5009, "step": 3115 }, { "epoch": 0.6848351648351648, "grad_norm": 0.2847541174438912, "learning_rate": 2.1219417337670233e-07, "loss": 1.4895, "step": 3116 }, { "epoch": 0.685054945054945, "grad_norm": 0.27838387006735205, "learning_rate": 2.1201236895269212e-07, "loss": 1.4323, "step": 3117 }, { "epoch": 0.6852747252747253, "grad_norm": 0.2759133069945561, "learning_rate": 2.1183064699792062e-07, "loss": 1.5442, "step": 3118 }, { "epoch": 0.6854945054945055, "grad_norm": 0.2528145717777085, "learning_rate": 2.1164900759902124e-07, "loss": 1.3942, "step": 3119 }, { "epoch": 0.6857142857142857, "grad_norm": 0.25217428292549016, "learning_rate": 2.1146745084258773e-07, "loss": 1.4769, "step": 3120 }, { "epoch": 0.6859340659340659, "grad_norm": 0.24806848979131116, "learning_rate": 2.1128597681517453e-07, "loss": 1.4779, "step": 3121 }, { "epoch": 0.6861538461538461, "grad_norm": 0.25201626753038225, "learning_rate": 2.1110458560329664e-07, "loss": 1.4908, "step": 3122 }, { "epoch": 0.6863736263736264, "grad_norm": 0.34458576673155794, "learning_rate": 2.1092327729342943e-07, "loss": 1.486, "step": 3123 }, { "epoch": 0.6865934065934066, "grad_norm": 0.2519743257027312, "learning_rate": 2.1074205197200905e-07, "loss": 1.5106, "step": 3124 }, { "epoch": 0.6868131868131868, "grad_norm": 0.24061342136295233, "learning_rate": 2.105609097254321e-07, "loss": 1.4173, "step": 3125 }, { "epoch": 0.687032967032967, "grad_norm": 0.2434514127876388, "learning_rate": 2.1037985064005515e-07, "loss": 1.5073, "step": 3126 }, { "epoch": 0.6872527472527472, "grad_norm": 0.24435642631768628, "learning_rate": 2.101988748021955e-07, "loss": 1.4492, "step": 3127 }, { "epoch": 0.6874725274725275, "grad_norm": 0.24286132884219838, "learning_rate": 2.1001798229813049e-07, "loss": 1.4344, "step": 3128 }, { "epoch": 0.6876923076923077, "grad_norm": 0.26170499726834145, "learning_rate": 2.0983717321409808e-07, "loss": 1.4172, "step": 3129 }, { "epoch": 0.6879120879120879, "grad_norm": 0.23227327559356964, "learning_rate": 2.096564476362963e-07, "loss": 1.4432, "step": 3130 }, { "epoch": 0.6881318681318681, "grad_norm": 0.27840101443572146, "learning_rate": 2.09475805650883e-07, "loss": 1.3829, "step": 3131 }, { "epoch": 0.6883516483516483, "grad_norm": 0.24187472135404398, "learning_rate": 2.0929524734397695e-07, "loss": 1.4699, "step": 3132 }, { "epoch": 0.6885714285714286, "grad_norm": 0.2527146797146545, "learning_rate": 2.0911477280165625e-07, "loss": 1.4175, "step": 3133 }, { "epoch": 0.6887912087912088, "grad_norm": 0.2609213836624885, "learning_rate": 2.0893438210995972e-07, "loss": 1.4425, "step": 3134 }, { "epoch": 0.689010989010989, "grad_norm": 0.24496251146873171, "learning_rate": 2.0875407535488563e-07, "loss": 1.403, "step": 3135 }, { "epoch": 0.6892307692307692, "grad_norm": 0.250920865990223, "learning_rate": 2.0857385262239268e-07, "loss": 1.5082, "step": 3136 }, { "epoch": 0.6894505494505494, "grad_norm": 0.24564768858946562, "learning_rate": 2.0839371399839924e-07, "loss": 1.4635, "step": 3137 }, { "epoch": 0.6896703296703297, "grad_norm": 0.24187048441058168, "learning_rate": 2.0821365956878357e-07, "loss": 1.4062, "step": 3138 }, { "epoch": 0.6898901098901099, "grad_norm": 0.24735457834632849, "learning_rate": 2.0803368941938404e-07, "loss": 1.5286, "step": 3139 }, { "epoch": 0.6901098901098901, "grad_norm": 0.25833072189664436, "learning_rate": 2.0785380363599882e-07, "loss": 1.4721, "step": 3140 }, { "epoch": 0.6903296703296703, "grad_norm": 0.24399907985969946, "learning_rate": 2.0767400230438554e-07, "loss": 1.4639, "step": 3141 }, { "epoch": 0.6905494505494505, "grad_norm": 0.2517968074441635, "learning_rate": 2.0749428551026188e-07, "loss": 1.3812, "step": 3142 }, { "epoch": 0.6907692307692308, "grad_norm": 0.24466899540903284, "learning_rate": 2.0731465333930481e-07, "loss": 1.4336, "step": 3143 }, { "epoch": 0.690989010989011, "grad_norm": 0.2750572732307114, "learning_rate": 2.0713510587715174e-07, "loss": 1.4307, "step": 3144 }, { "epoch": 0.6912087912087912, "grad_norm": 0.2413244511686587, "learning_rate": 2.0695564320939895e-07, "loss": 1.5065, "step": 3145 }, { "epoch": 0.6914285714285714, "grad_norm": 0.24815369423796488, "learning_rate": 2.0677626542160242e-07, "loss": 1.4443, "step": 3146 }, { "epoch": 0.6916483516483517, "grad_norm": 0.2646080787831905, "learning_rate": 2.06596972599278e-07, "loss": 1.5057, "step": 3147 }, { "epoch": 0.6918681318681319, "grad_norm": 0.2425595954547546, "learning_rate": 2.0641776482790086e-07, "loss": 1.4369, "step": 3148 }, { "epoch": 0.6920879120879121, "grad_norm": 0.2527111061408441, "learning_rate": 2.0623864219290548e-07, "loss": 1.4684, "step": 3149 }, { "epoch": 0.6923076923076923, "grad_norm": 0.25041609236608553, "learning_rate": 2.0605960477968594e-07, "loss": 1.4906, "step": 3150 }, { "epoch": 0.6925274725274725, "grad_norm": 0.2485469344657923, "learning_rate": 2.0588065267359564e-07, "loss": 1.4239, "step": 3151 }, { "epoch": 0.6927472527472528, "grad_norm": 0.26629831155357364, "learning_rate": 2.0570178595994708e-07, "loss": 1.4996, "step": 3152 }, { "epoch": 0.692967032967033, "grad_norm": 0.23641471142457823, "learning_rate": 2.055230047240125e-07, "loss": 1.482, "step": 3153 }, { "epoch": 0.6931868131868132, "grad_norm": 0.3667053908025628, "learning_rate": 2.05344309051023e-07, "loss": 1.4338, "step": 3154 }, { "epoch": 0.6934065934065934, "grad_norm": 0.24496616092781123, "learning_rate": 2.0516569902616923e-07, "loss": 1.4402, "step": 3155 }, { "epoch": 0.6936263736263736, "grad_norm": 0.25579099163610536, "learning_rate": 2.049871747346007e-07, "loss": 1.5221, "step": 3156 }, { "epoch": 0.6938461538461539, "grad_norm": 0.24817376173997746, "learning_rate": 2.0480873626142614e-07, "loss": 1.428, "step": 3157 }, { "epoch": 0.6940659340659341, "grad_norm": 0.40343921679542905, "learning_rate": 2.0463038369171346e-07, "loss": 1.5188, "step": 3158 }, { "epoch": 0.6942857142857143, "grad_norm": 0.24999018687260025, "learning_rate": 2.044521171104895e-07, "loss": 1.4645, "step": 3159 }, { "epoch": 0.6945054945054945, "grad_norm": 0.24075434555395248, "learning_rate": 2.0427393660274025e-07, "loss": 1.5099, "step": 3160 }, { "epoch": 0.6947252747252747, "grad_norm": 0.2533563131813355, "learning_rate": 2.0409584225341038e-07, "loss": 1.4723, "step": 3161 }, { "epoch": 0.694945054945055, "grad_norm": 0.24333315308734801, "learning_rate": 2.0391783414740392e-07, "loss": 1.4673, "step": 3162 }, { "epoch": 0.6951648351648352, "grad_norm": 0.24180432171033167, "learning_rate": 2.0373991236958331e-07, "loss": 1.4257, "step": 3163 }, { "epoch": 0.6953846153846154, "grad_norm": 0.25739434860090893, "learning_rate": 2.035620770047703e-07, "loss": 1.4347, "step": 3164 }, { "epoch": 0.6956043956043956, "grad_norm": 0.3074683582796377, "learning_rate": 2.0338432813774507e-07, "loss": 1.4102, "step": 3165 }, { "epoch": 0.6958241758241758, "grad_norm": 0.23928778515501578, "learning_rate": 2.0320666585324678e-07, "loss": 1.4264, "step": 3166 }, { "epoch": 0.6960439560439561, "grad_norm": 0.2475993981417254, "learning_rate": 2.0302909023597305e-07, "loss": 1.4806, "step": 3167 }, { "epoch": 0.6962637362637363, "grad_norm": 0.23846983202090427, "learning_rate": 2.028516013705807e-07, "loss": 1.4911, "step": 3168 }, { "epoch": 0.6964835164835165, "grad_norm": 0.25884262787185724, "learning_rate": 2.0267419934168447e-07, "loss": 1.481, "step": 3169 }, { "epoch": 0.6967032967032967, "grad_norm": 0.33077262673832125, "learning_rate": 2.0249688423385854e-07, "loss": 1.4652, "step": 3170 }, { "epoch": 0.696923076923077, "grad_norm": 0.26993218056623297, "learning_rate": 2.023196561316348e-07, "loss": 1.422, "step": 3171 }, { "epoch": 0.6971428571428572, "grad_norm": 0.24238165964977457, "learning_rate": 2.0214251511950433e-07, "loss": 1.4782, "step": 3172 }, { "epoch": 0.6973626373626374, "grad_norm": 0.25281497035301775, "learning_rate": 2.0196546128191636e-07, "loss": 1.4714, "step": 3173 }, { "epoch": 0.6975824175824176, "grad_norm": 0.23806553817481058, "learning_rate": 2.017884947032787e-07, "loss": 1.4722, "step": 3174 }, { "epoch": 0.6978021978021978, "grad_norm": 0.2621306314371981, "learning_rate": 2.0161161546795733e-07, "loss": 1.5219, "step": 3175 }, { "epoch": 0.6980219780219781, "grad_norm": 0.2510491892136605, "learning_rate": 2.0143482366027676e-07, "loss": 1.5622, "step": 3176 }, { "epoch": 0.6982417582417583, "grad_norm": 0.24834781690274416, "learning_rate": 2.0125811936451996e-07, "loss": 1.4336, "step": 3177 }, { "epoch": 0.6984615384615385, "grad_norm": 0.2404310866256325, "learning_rate": 2.0108150266492785e-07, "loss": 1.3799, "step": 3178 }, { "epoch": 0.6986813186813187, "grad_norm": 0.24294667141443566, "learning_rate": 2.009049736457e-07, "loss": 1.4843, "step": 3179 }, { "epoch": 0.6989010989010989, "grad_norm": 0.24572665064557875, "learning_rate": 2.0072853239099384e-07, "loss": 1.4512, "step": 3180 }, { "epoch": 0.6991208791208792, "grad_norm": 0.2695810921788662, "learning_rate": 2.0055217898492498e-07, "loss": 1.484, "step": 3181 }, { "epoch": 0.6993406593406594, "grad_norm": 5.373659850927592, "learning_rate": 2.0037591351156742e-07, "loss": 1.4724, "step": 3182 }, { "epoch": 0.6995604395604396, "grad_norm": 0.24471537583157932, "learning_rate": 2.0019973605495293e-07, "loss": 1.4605, "step": 3183 }, { "epoch": 0.6997802197802198, "grad_norm": 0.2356772956889882, "learning_rate": 2.000236466990714e-07, "loss": 1.4751, "step": 3184 }, { "epoch": 0.7, "grad_norm": 0.2542023525404883, "learning_rate": 1.9984764552787103e-07, "loss": 1.4768, "step": 3185 }, { "epoch": 0.7002197802197803, "grad_norm": 0.2693191620754359, "learning_rate": 1.996717326252573e-07, "loss": 1.4816, "step": 3186 }, { "epoch": 0.7004395604395605, "grad_norm": 0.24633212397883253, "learning_rate": 1.9949590807509449e-07, "loss": 1.5305, "step": 3187 }, { "epoch": 0.7006593406593407, "grad_norm": 0.24945030856824707, "learning_rate": 1.9932017196120401e-07, "loss": 1.5288, "step": 3188 }, { "epoch": 0.7008791208791209, "grad_norm": 0.25189285712767506, "learning_rate": 1.9914452436736552e-07, "loss": 1.4602, "step": 3189 }, { "epoch": 0.701098901098901, "grad_norm": 0.2523811608389368, "learning_rate": 1.9896896537731632e-07, "loss": 1.3812, "step": 3190 }, { "epoch": 0.7013186813186814, "grad_norm": 0.24492554039973877, "learning_rate": 1.987934950747513e-07, "loss": 1.514, "step": 3191 }, { "epoch": 0.7015384615384616, "grad_norm": 0.7800093696725251, "learning_rate": 1.9861811354332364e-07, "loss": 1.4468, "step": 3192 }, { "epoch": 0.7017582417582418, "grad_norm": 0.2547839687640688, "learning_rate": 1.9844282086664354e-07, "loss": 1.4991, "step": 3193 }, { "epoch": 0.701978021978022, "grad_norm": 0.25480512592699334, "learning_rate": 1.9826761712827927e-07, "loss": 1.4161, "step": 3194 }, { "epoch": 0.7021978021978021, "grad_norm": 0.24873346798027937, "learning_rate": 1.9809250241175666e-07, "loss": 1.4332, "step": 3195 }, { "epoch": 0.7024175824175825, "grad_norm": 0.2536388255891338, "learning_rate": 1.9791747680055873e-07, "loss": 1.5154, "step": 3196 }, { "epoch": 0.7026373626373627, "grad_norm": 0.24990942350035344, "learning_rate": 1.9774254037812657e-07, "loss": 1.4742, "step": 3197 }, { "epoch": 0.7028571428571428, "grad_norm": 0.24481102545247088, "learning_rate": 1.9756769322785836e-07, "loss": 1.4048, "step": 3198 }, { "epoch": 0.703076923076923, "grad_norm": 0.26701590931205094, "learning_rate": 1.9739293543310976e-07, "loss": 1.4619, "step": 3199 }, { "epoch": 0.7032967032967034, "grad_norm": 0.24977058514541342, "learning_rate": 1.9721826707719403e-07, "loss": 1.4717, "step": 3200 }, { "epoch": 0.7035164835164835, "grad_norm": 0.2504927141253575, "learning_rate": 1.9704368824338156e-07, "loss": 1.5187, "step": 3201 }, { "epoch": 0.7037362637362637, "grad_norm": 0.24157986300437764, "learning_rate": 1.968691990149003e-07, "loss": 1.5005, "step": 3202 }, { "epoch": 0.7039560439560439, "grad_norm": 0.24422773100869952, "learning_rate": 1.9669479947493536e-07, "loss": 1.5058, "step": 3203 }, { "epoch": 0.7041758241758241, "grad_norm": 0.2636538207992709, "learning_rate": 1.96520489706629e-07, "loss": 1.5121, "step": 3204 }, { "epoch": 0.7043956043956044, "grad_norm": 0.2508846912155803, "learning_rate": 1.9634626979308063e-07, "loss": 1.5071, "step": 3205 }, { "epoch": 0.7046153846153846, "grad_norm": 0.24934544618780202, "learning_rate": 1.9617213981734727e-07, "loss": 1.4475, "step": 3206 }, { "epoch": 0.7048351648351648, "grad_norm": 0.2834563128653475, "learning_rate": 1.959980998624425e-07, "loss": 1.4617, "step": 3207 }, { "epoch": 0.705054945054945, "grad_norm": 0.24364443488014928, "learning_rate": 1.958241500113373e-07, "loss": 1.4504, "step": 3208 }, { "epoch": 0.7052747252747252, "grad_norm": 0.24498371324935125, "learning_rate": 1.9565029034695976e-07, "loss": 1.4348, "step": 3209 }, { "epoch": 0.7054945054945055, "grad_norm": 0.24224691125853023, "learning_rate": 1.9547652095219466e-07, "loss": 1.4163, "step": 3210 }, { "epoch": 0.7057142857142857, "grad_norm": 0.24619339270884996, "learning_rate": 1.953028419098841e-07, "loss": 1.4669, "step": 3211 }, { "epoch": 0.7059340659340659, "grad_norm": 0.2432722588425196, "learning_rate": 1.951292533028269e-07, "loss": 1.4541, "step": 3212 }, { "epoch": 0.7061538461538461, "grad_norm": 0.2469630623778641, "learning_rate": 1.9495575521377875e-07, "loss": 1.464, "step": 3213 }, { "epoch": 0.7063736263736263, "grad_norm": 0.25267432543083657, "learning_rate": 1.9478234772545232e-07, "loss": 1.4606, "step": 3214 }, { "epoch": 0.7065934065934066, "grad_norm": 0.2605139268164092, "learning_rate": 1.9460903092051678e-07, "loss": 1.5297, "step": 3215 }, { "epoch": 0.7068131868131868, "grad_norm": 0.24803353950597865, "learning_rate": 1.944358048815985e-07, "loss": 1.4619, "step": 3216 }, { "epoch": 0.707032967032967, "grad_norm": 0.25667653637217674, "learning_rate": 1.9426266969128044e-07, "loss": 1.4205, "step": 3217 }, { "epoch": 0.7072527472527472, "grad_norm": 0.23818064685961463, "learning_rate": 1.9408962543210208e-07, "loss": 1.4516, "step": 3218 }, { "epoch": 0.7074725274725274, "grad_norm": 0.25442353898695175, "learning_rate": 1.9391667218655967e-07, "loss": 1.4197, "step": 3219 }, { "epoch": 0.7076923076923077, "grad_norm": 0.24937386993306204, "learning_rate": 1.93743810037106e-07, "loss": 1.4257, "step": 3220 }, { "epoch": 0.7079120879120879, "grad_norm": 0.2691867333141872, "learning_rate": 1.9357103906615056e-07, "loss": 1.4567, "step": 3221 }, { "epoch": 0.7081318681318681, "grad_norm": 0.31630300982249876, "learning_rate": 1.9339835935605933e-07, "loss": 1.4658, "step": 3222 }, { "epoch": 0.7083516483516483, "grad_norm": 0.23962892426204127, "learning_rate": 1.932257709891546e-07, "loss": 1.4942, "step": 3223 }, { "epoch": 0.7085714285714285, "grad_norm": 0.41089038044383347, "learning_rate": 1.9305327404771547e-07, "loss": 1.4822, "step": 3224 }, { "epoch": 0.7087912087912088, "grad_norm": 0.24171709799232335, "learning_rate": 1.9288086861397704e-07, "loss": 1.4951, "step": 3225 }, { "epoch": 0.709010989010989, "grad_norm": 0.25245628249702773, "learning_rate": 1.9270855477013123e-07, "loss": 1.4492, "step": 3226 }, { "epoch": 0.7092307692307692, "grad_norm": 0.26284560030310655, "learning_rate": 1.925363325983259e-07, "loss": 1.4917, "step": 3227 }, { "epoch": 0.7094505494505494, "grad_norm": 0.2463516949941388, "learning_rate": 1.9236420218066538e-07, "loss": 1.4079, "step": 3228 }, { "epoch": 0.7096703296703297, "grad_norm": 0.263921226755463, "learning_rate": 1.9219216359921018e-07, "loss": 1.4824, "step": 3229 }, { "epoch": 0.7098901098901099, "grad_norm": 0.24651817675689766, "learning_rate": 1.9202021693597735e-07, "loss": 1.4845, "step": 3230 }, { "epoch": 0.7101098901098901, "grad_norm": 0.24910206911441665, "learning_rate": 1.9184836227293953e-07, "loss": 1.4456, "step": 3231 }, { "epoch": 0.7103296703296703, "grad_norm": 0.2505853015151859, "learning_rate": 1.9167659969202608e-07, "loss": 1.4469, "step": 3232 }, { "epoch": 0.7105494505494505, "grad_norm": 0.2544684201809747, "learning_rate": 1.9150492927512218e-07, "loss": 1.4046, "step": 3233 }, { "epoch": 0.7107692307692308, "grad_norm": 0.2465100264312349, "learning_rate": 1.9133335110406892e-07, "loss": 1.4689, "step": 3234 }, { "epoch": 0.710989010989011, "grad_norm": 0.2426853563471961, "learning_rate": 1.9116186526066384e-07, "loss": 1.5355, "step": 3235 }, { "epoch": 0.7112087912087912, "grad_norm": 0.24803171720248252, "learning_rate": 1.9099047182666014e-07, "loss": 1.4461, "step": 3236 }, { "epoch": 0.7114285714285714, "grad_norm": 0.2780107540933846, "learning_rate": 1.90819170883767e-07, "loss": 1.4616, "step": 3237 }, { "epoch": 0.7116483516483516, "grad_norm": 0.27470184243471857, "learning_rate": 1.9064796251364956e-07, "loss": 1.5052, "step": 3238 }, { "epoch": 0.7118681318681319, "grad_norm": 0.25169211517097295, "learning_rate": 1.904768467979288e-07, "loss": 1.4661, "step": 3239 }, { "epoch": 0.7120879120879121, "grad_norm": 0.25545898180429133, "learning_rate": 1.903058238181817e-07, "loss": 1.5073, "step": 3240 }, { "epoch": 0.7123076923076923, "grad_norm": 0.34123367359757256, "learning_rate": 1.9013489365594084e-07, "loss": 1.4718, "step": 3241 }, { "epoch": 0.7125274725274725, "grad_norm": 0.24916513219850814, "learning_rate": 1.8996405639269446e-07, "loss": 1.4685, "step": 3242 }, { "epoch": 0.7127472527472527, "grad_norm": 0.287545194200866, "learning_rate": 1.8979331210988678e-07, "loss": 1.4795, "step": 3243 }, { "epoch": 0.712967032967033, "grad_norm": 0.2412527323583216, "learning_rate": 1.8962266088891736e-07, "loss": 1.4452, "step": 3244 }, { "epoch": 0.7131868131868132, "grad_norm": 0.24399314333194297, "learning_rate": 1.8945210281114183e-07, "loss": 1.4066, "step": 3245 }, { "epoch": 0.7134065934065934, "grad_norm": 0.23808242277076305, "learning_rate": 1.8928163795787096e-07, "loss": 1.4539, "step": 3246 }, { "epoch": 0.7136263736263736, "grad_norm": 0.253134156732432, "learning_rate": 1.891112664103715e-07, "loss": 1.4865, "step": 3247 }, { "epoch": 0.7138461538461538, "grad_norm": 0.2461659703341645, "learning_rate": 1.8894098824986546e-07, "loss": 1.4467, "step": 3248 }, { "epoch": 0.7140659340659341, "grad_norm": 0.2689641006317081, "learning_rate": 1.887708035575302e-07, "loss": 1.4629, "step": 3249 }, { "epoch": 0.7142857142857143, "grad_norm": 0.23989033150090913, "learning_rate": 1.8860071241449896e-07, "loss": 1.4704, "step": 3250 }, { "epoch": 0.7145054945054945, "grad_norm": 0.26677787884863696, "learning_rate": 1.8843071490185996e-07, "loss": 1.4694, "step": 3251 }, { "epoch": 0.7147252747252747, "grad_norm": 0.23844986824915337, "learning_rate": 1.8826081110065702e-07, "loss": 1.4545, "step": 3252 }, { "epoch": 0.7149450549450549, "grad_norm": 0.31345042350401403, "learning_rate": 1.880910010918891e-07, "loss": 1.4333, "step": 3253 }, { "epoch": 0.7151648351648352, "grad_norm": 0.2555583725561702, "learning_rate": 1.8792128495651053e-07, "loss": 1.4538, "step": 3254 }, { "epoch": 0.7153846153846154, "grad_norm": 0.24811479341230933, "learning_rate": 1.877516627754312e-07, "loss": 1.5049, "step": 3255 }, { "epoch": 0.7156043956043956, "grad_norm": 0.24447189966855204, "learning_rate": 1.8758213462951568e-07, "loss": 1.487, "step": 3256 }, { "epoch": 0.7158241758241758, "grad_norm": 0.24154809271473499, "learning_rate": 1.87412700599584e-07, "loss": 1.5263, "step": 3257 }, { "epoch": 0.7160439560439561, "grad_norm": 0.24063646509027245, "learning_rate": 1.872433607664112e-07, "loss": 1.4076, "step": 3258 }, { "epoch": 0.7162637362637363, "grad_norm": 0.2498250801060971, "learning_rate": 1.8707411521072768e-07, "loss": 1.4191, "step": 3259 }, { "epoch": 0.7164835164835165, "grad_norm": 0.25350785794448677, "learning_rate": 1.8690496401321855e-07, "loss": 1.4553, "step": 3260 }, { "epoch": 0.7167032967032967, "grad_norm": 0.2407961693858166, "learning_rate": 1.8673590725452402e-07, "loss": 1.447, "step": 3261 }, { "epoch": 0.7169230769230769, "grad_norm": 0.24641495109494188, "learning_rate": 1.865669450152396e-07, "loss": 1.4332, "step": 3262 }, { "epoch": 0.7171428571428572, "grad_norm": 0.24588475335334106, "learning_rate": 1.8639807737591512e-07, "loss": 1.4299, "step": 3263 }, { "epoch": 0.7173626373626374, "grad_norm": 0.2624717335867807, "learning_rate": 1.8622930441705606e-07, "loss": 1.5054, "step": 3264 }, { "epoch": 0.7175824175824176, "grad_norm": 0.27990391708380047, "learning_rate": 1.860606262191222e-07, "loss": 1.461, "step": 3265 }, { "epoch": 0.7178021978021978, "grad_norm": 0.2595034808065735, "learning_rate": 1.8589204286252833e-07, "loss": 1.4493, "step": 3266 }, { "epoch": 0.718021978021978, "grad_norm": 0.24735307720367514, "learning_rate": 1.8572355442764399e-07, "loss": 1.4956, "step": 3267 }, { "epoch": 0.7182417582417583, "grad_norm": 0.2610167548710171, "learning_rate": 1.855551609947934e-07, "loss": 1.4545, "step": 3268 }, { "epoch": 0.7184615384615385, "grad_norm": 0.24621173602374435, "learning_rate": 1.8538686264425571e-07, "loss": 1.4619, "step": 3269 }, { "epoch": 0.7186813186813187, "grad_norm": 0.2736164815114475, "learning_rate": 1.852186594562647e-07, "loss": 1.4773, "step": 3270 }, { "epoch": 0.7189010989010989, "grad_norm": 0.25480669335083556, "learning_rate": 1.8505055151100872e-07, "loss": 1.5123, "step": 3271 }, { "epoch": 0.7191208791208791, "grad_norm": 0.24798224721324247, "learning_rate": 1.848825388886304e-07, "loss": 1.5256, "step": 3272 }, { "epoch": 0.7193406593406594, "grad_norm": 0.25733916159075726, "learning_rate": 1.847146216692274e-07, "loss": 1.4671, "step": 3273 }, { "epoch": 0.7195604395604396, "grad_norm": 0.24376604468862545, "learning_rate": 1.8454679993285176e-07, "loss": 1.4894, "step": 3274 }, { "epoch": 0.7197802197802198, "grad_norm": 0.24600997729651705, "learning_rate": 1.8437907375950997e-07, "loss": 1.5186, "step": 3275 }, { "epoch": 0.72, "grad_norm": 0.2488959774772532, "learning_rate": 1.8421144322916273e-07, "loss": 1.4553, "step": 3276 }, { "epoch": 0.7202197802197802, "grad_norm": 0.24938165662904727, "learning_rate": 1.8404390842172562e-07, "loss": 1.449, "step": 3277 }, { "epoch": 0.7204395604395605, "grad_norm": 0.32252047564284203, "learning_rate": 1.8387646941706818e-07, "loss": 1.4665, "step": 3278 }, { "epoch": 0.7206593406593407, "grad_norm": 0.2525303889820265, "learning_rate": 1.8370912629501456e-07, "loss": 1.432, "step": 3279 }, { "epoch": 0.7208791208791209, "grad_norm": 0.23499246121617587, "learning_rate": 1.835418791353429e-07, "loss": 1.3934, "step": 3280 }, { "epoch": 0.7210989010989011, "grad_norm": 0.24906913844441317, "learning_rate": 1.8337472801778592e-07, "loss": 1.4236, "step": 3281 }, { "epoch": 0.7213186813186813, "grad_norm": 0.24678602622920362, "learning_rate": 1.8320767302203028e-07, "loss": 1.4371, "step": 3282 }, { "epoch": 0.7215384615384616, "grad_norm": 0.3113074610143378, "learning_rate": 1.8304071422771679e-07, "loss": 1.4679, "step": 3283 }, { "epoch": 0.7217582417582418, "grad_norm": 0.26082664228768687, "learning_rate": 1.828738517144407e-07, "loss": 1.4577, "step": 3284 }, { "epoch": 0.721978021978022, "grad_norm": 0.24495495503093082, "learning_rate": 1.8270708556175134e-07, "loss": 1.4281, "step": 3285 }, { "epoch": 0.7221978021978022, "grad_norm": 0.2582850342960555, "learning_rate": 1.8254041584915174e-07, "loss": 1.4721, "step": 3286 }, { "epoch": 0.7224175824175825, "grad_norm": 0.241793919075375, "learning_rate": 1.8237384265609912e-07, "loss": 1.4833, "step": 3287 }, { "epoch": 0.7226373626373627, "grad_norm": 0.2501095050095039, "learning_rate": 1.8220736606200497e-07, "loss": 1.4916, "step": 3288 }, { "epoch": 0.7228571428571429, "grad_norm": 0.26555359339606643, "learning_rate": 1.8204098614623432e-07, "loss": 1.4344, "step": 3289 }, { "epoch": 0.7230769230769231, "grad_norm": 0.24798733506828943, "learning_rate": 1.8187470298810632e-07, "loss": 1.4475, "step": 3290 }, { "epoch": 0.7232967032967033, "grad_norm": 0.25674027566194535, "learning_rate": 1.8170851666689384e-07, "loss": 1.4907, "step": 3291 }, { "epoch": 0.7235164835164836, "grad_norm": 0.25240289939628696, "learning_rate": 1.8154242726182383e-07, "loss": 1.4166, "step": 3292 }, { "epoch": 0.7237362637362638, "grad_norm": 0.25343258434035865, "learning_rate": 1.8137643485207693e-07, "loss": 1.4977, "step": 3293 }, { "epoch": 0.723956043956044, "grad_norm": 0.25973109361538593, "learning_rate": 1.8121053951678747e-07, "loss": 1.4771, "step": 3294 }, { "epoch": 0.7241758241758242, "grad_norm": 0.24549839642838567, "learning_rate": 1.8104474133504357e-07, "loss": 1.505, "step": 3295 }, { "epoch": 0.7243956043956044, "grad_norm": 0.2621369442543476, "learning_rate": 1.8087904038588693e-07, "loss": 1.4428, "step": 3296 }, { "epoch": 0.7246153846153847, "grad_norm": 0.24132826419125739, "learning_rate": 1.8071343674831296e-07, "loss": 1.4258, "step": 3297 }, { "epoch": 0.7248351648351649, "grad_norm": 0.24003552269956913, "learning_rate": 1.8054793050127078e-07, "loss": 1.4788, "step": 3298 }, { "epoch": 0.725054945054945, "grad_norm": 0.2544725057915693, "learning_rate": 1.8038252172366285e-07, "loss": 1.4748, "step": 3299 }, { "epoch": 0.7252747252747253, "grad_norm": 0.25386484420295485, "learning_rate": 1.8021721049434555e-07, "loss": 1.4935, "step": 3300 }, { "epoch": 0.7254945054945054, "grad_norm": 0.2435757370023608, "learning_rate": 1.800519968921284e-07, "loss": 1.4653, "step": 3301 }, { "epoch": 0.7257142857142858, "grad_norm": 0.27279085070738307, "learning_rate": 1.798868809957743e-07, "loss": 1.4265, "step": 3302 }, { "epoch": 0.725934065934066, "grad_norm": 0.26117927967675364, "learning_rate": 1.797218628840001e-07, "loss": 1.4879, "step": 3303 }, { "epoch": 0.7261538461538461, "grad_norm": 0.2473784528154045, "learning_rate": 1.7955694263547543e-07, "loss": 1.5087, "step": 3304 }, { "epoch": 0.7263736263736263, "grad_norm": 0.2681737124631539, "learning_rate": 1.7939212032882368e-07, "loss": 1.468, "step": 3305 }, { "epoch": 0.7265934065934065, "grad_norm": 0.2527039926528341, "learning_rate": 1.7922739604262114e-07, "loss": 1.4972, "step": 3306 }, { "epoch": 0.7268131868131869, "grad_norm": 0.2623933772391432, "learning_rate": 1.7906276985539777e-07, "loss": 1.4191, "step": 3307 }, { "epoch": 0.727032967032967, "grad_norm": 0.24264095108199527, "learning_rate": 1.7889824184563687e-07, "loss": 1.4532, "step": 3308 }, { "epoch": 0.7272527472527472, "grad_norm": 0.2585626294135146, "learning_rate": 1.787338120917744e-07, "loss": 1.5527, "step": 3309 }, { "epoch": 0.7274725274725274, "grad_norm": 0.2441638996498909, "learning_rate": 1.7856948067219985e-07, "loss": 1.4354, "step": 3310 }, { "epoch": 0.7276923076923076, "grad_norm": 0.24615369620341251, "learning_rate": 1.7840524766525566e-07, "loss": 1.4313, "step": 3311 }, { "epoch": 0.727912087912088, "grad_norm": 0.24637571327402003, "learning_rate": 1.7824111314923744e-07, "loss": 1.4676, "step": 3312 }, { "epoch": 0.7281318681318681, "grad_norm": 0.2741600516349602, "learning_rate": 1.7807707720239397e-07, "loss": 1.4246, "step": 3313 }, { "epoch": 0.7283516483516483, "grad_norm": 0.24482069265418338, "learning_rate": 1.7791313990292673e-07, "loss": 1.454, "step": 3314 }, { "epoch": 0.7285714285714285, "grad_norm": 0.25074974889991747, "learning_rate": 1.7774930132899053e-07, "loss": 1.4506, "step": 3315 }, { "epoch": 0.7287912087912088, "grad_norm": 0.2444664324215279, "learning_rate": 1.7758556155869278e-07, "loss": 1.4875, "step": 3316 }, { "epoch": 0.729010989010989, "grad_norm": 0.2477842161344449, "learning_rate": 1.7742192067009405e-07, "loss": 1.4896, "step": 3317 }, { "epoch": 0.7292307692307692, "grad_norm": 0.2554882555390125, "learning_rate": 1.772583787412077e-07, "loss": 1.4367, "step": 3318 }, { "epoch": 0.7294505494505494, "grad_norm": 0.2588058927240242, "learning_rate": 1.770949358499998e-07, "loss": 1.4709, "step": 3319 }, { "epoch": 0.7296703296703296, "grad_norm": 0.24129779231256632, "learning_rate": 1.769315920743892e-07, "loss": 1.4206, "step": 3320 }, { "epoch": 0.7298901098901099, "grad_norm": 0.23705292466120687, "learning_rate": 1.7676834749224764e-07, "loss": 1.4891, "step": 3321 }, { "epoch": 0.7301098901098901, "grad_norm": 0.24185560692826077, "learning_rate": 1.7660520218139944e-07, "loss": 1.483, "step": 3322 }, { "epoch": 0.7303296703296703, "grad_norm": 0.2402788640388191, "learning_rate": 1.7644215621962188e-07, "loss": 1.4908, "step": 3323 }, { "epoch": 0.7305494505494505, "grad_norm": 0.24367930657678058, "learning_rate": 1.7627920968464443e-07, "loss": 1.393, "step": 3324 }, { "epoch": 0.7307692307692307, "grad_norm": 0.29088630455111597, "learning_rate": 1.761163626541496e-07, "loss": 1.4339, "step": 3325 }, { "epoch": 0.730989010989011, "grad_norm": 0.24828865102198766, "learning_rate": 1.7595361520577196e-07, "loss": 1.5337, "step": 3326 }, { "epoch": 0.7312087912087912, "grad_norm": 0.23701206977489214, "learning_rate": 1.7579096741709917e-07, "loss": 1.4834, "step": 3327 }, { "epoch": 0.7314285714285714, "grad_norm": 0.25408991367745354, "learning_rate": 1.75628419365671e-07, "loss": 1.5045, "step": 3328 }, { "epoch": 0.7316483516483516, "grad_norm": 0.3536005652886286, "learning_rate": 1.7546597112897966e-07, "loss": 1.4548, "step": 3329 }, { "epoch": 0.7318681318681318, "grad_norm": 0.2491959389437149, "learning_rate": 1.7530362278447014e-07, "loss": 1.4917, "step": 3330 }, { "epoch": 0.7320879120879121, "grad_norm": 0.2516841603708008, "learning_rate": 1.7514137440953927e-07, "loss": 1.4304, "step": 3331 }, { "epoch": 0.7323076923076923, "grad_norm": 0.3148516129988252, "learning_rate": 1.7497922608153681e-07, "loss": 1.5283, "step": 3332 }, { "epoch": 0.7325274725274725, "grad_norm": 0.24297610594375227, "learning_rate": 1.7481717787776442e-07, "loss": 1.4926, "step": 3333 }, { "epoch": 0.7327472527472527, "grad_norm": 0.24651349476658602, "learning_rate": 1.74655229875476e-07, "loss": 1.4798, "step": 3334 }, { "epoch": 0.7329670329670329, "grad_norm": 0.24363110015164513, "learning_rate": 1.74493382151878e-07, "loss": 1.4844, "step": 3335 }, { "epoch": 0.7331868131868132, "grad_norm": 0.24571049245301715, "learning_rate": 1.743316347841286e-07, "loss": 1.4756, "step": 3336 }, { "epoch": 0.7334065934065934, "grad_norm": 0.30127997312691257, "learning_rate": 1.7416998784933863e-07, "loss": 1.4834, "step": 3337 }, { "epoch": 0.7336263736263736, "grad_norm": 0.24090887689107635, "learning_rate": 1.7400844142457091e-07, "loss": 1.457, "step": 3338 }, { "epoch": 0.7338461538461538, "grad_norm": 0.2506063848987723, "learning_rate": 1.738469955868401e-07, "loss": 1.462, "step": 3339 }, { "epoch": 0.734065934065934, "grad_norm": 0.26223452210156184, "learning_rate": 1.7368565041311317e-07, "loss": 1.5086, "step": 3340 }, { "epoch": 0.7342857142857143, "grad_norm": 0.2511184231142407, "learning_rate": 1.7352440598030876e-07, "loss": 1.5082, "step": 3341 }, { "epoch": 0.7345054945054945, "grad_norm": 0.2746536075154917, "learning_rate": 1.7336326236529798e-07, "loss": 1.4161, "step": 3342 }, { "epoch": 0.7347252747252747, "grad_norm": 0.2634029891295733, "learning_rate": 1.7320221964490353e-07, "loss": 1.4605, "step": 3343 }, { "epoch": 0.7349450549450549, "grad_norm": 0.232507129441169, "learning_rate": 1.730412778959e-07, "loss": 1.3648, "step": 3344 }, { "epoch": 0.7351648351648352, "grad_norm": 0.23945792218875195, "learning_rate": 1.7288043719501395e-07, "loss": 1.4142, "step": 3345 }, { "epoch": 0.7353846153846154, "grad_norm": 0.24304528671522965, "learning_rate": 1.72719697618924e-07, "loss": 1.448, "step": 3346 }, { "epoch": 0.7356043956043956, "grad_norm": 0.24245404420714622, "learning_rate": 1.7255905924426006e-07, "loss": 1.4324, "step": 3347 }, { "epoch": 0.7358241758241758, "grad_norm": 0.27729188749805217, "learning_rate": 1.7239852214760411e-07, "loss": 1.4664, "step": 3348 }, { "epoch": 0.736043956043956, "grad_norm": 0.251772724522178, "learning_rate": 1.7223808640548984e-07, "loss": 1.5219, "step": 3349 }, { "epoch": 0.7362637362637363, "grad_norm": 0.2583832288599042, "learning_rate": 1.7207775209440237e-07, "loss": 1.477, "step": 3350 }, { "epoch": 0.7364835164835165, "grad_norm": 0.2496458226077847, "learning_rate": 1.7191751929077894e-07, "loss": 1.4877, "step": 3351 }, { "epoch": 0.7367032967032967, "grad_norm": 0.2506660296488448, "learning_rate": 1.7175738807100788e-07, "loss": 1.4495, "step": 3352 }, { "epoch": 0.7369230769230769, "grad_norm": 0.2461831193132943, "learning_rate": 1.7159735851142952e-07, "loss": 1.4803, "step": 3353 }, { "epoch": 0.7371428571428571, "grad_norm": 0.3196415414113024, "learning_rate": 1.714374306883355e-07, "loss": 1.4885, "step": 3354 }, { "epoch": 0.7373626373626374, "grad_norm": 0.24597553253439683, "learning_rate": 1.7127760467796877e-07, "loss": 1.4544, "step": 3355 }, { "epoch": 0.7375824175824176, "grad_norm": 0.25955877802464666, "learning_rate": 1.7111788055652426e-07, "loss": 1.4241, "step": 3356 }, { "epoch": 0.7378021978021978, "grad_norm": 0.3085525455010409, "learning_rate": 1.7095825840014792e-07, "loss": 1.446, "step": 3357 }, { "epoch": 0.738021978021978, "grad_norm": 0.24592311041502127, "learning_rate": 1.7079873828493717e-07, "loss": 1.4611, "step": 3358 }, { "epoch": 0.7382417582417582, "grad_norm": 0.24841376538151683, "learning_rate": 1.7063932028694074e-07, "loss": 1.4753, "step": 3359 }, { "epoch": 0.7384615384615385, "grad_norm": 0.2713539897488017, "learning_rate": 1.7048000448215886e-07, "loss": 1.4949, "step": 3360 }, { "epoch": 0.7386813186813187, "grad_norm": 0.24746011975549406, "learning_rate": 1.70320790946543e-07, "loss": 1.5289, "step": 3361 }, { "epoch": 0.7389010989010989, "grad_norm": 0.23917557293883, "learning_rate": 1.701616797559957e-07, "loss": 1.4783, "step": 3362 }, { "epoch": 0.7391208791208791, "grad_norm": 0.235060805618452, "learning_rate": 1.7000267098637086e-07, "loss": 1.45, "step": 3363 }, { "epoch": 0.7393406593406593, "grad_norm": 0.2500362528853272, "learning_rate": 1.698437647134735e-07, "loss": 1.4598, "step": 3364 }, { "epoch": 0.7395604395604396, "grad_norm": 0.23903024188249605, "learning_rate": 1.696849610130596e-07, "loss": 1.4895, "step": 3365 }, { "epoch": 0.7397802197802198, "grad_norm": 0.2683642338537404, "learning_rate": 1.6952625996083677e-07, "loss": 1.5428, "step": 3366 }, { "epoch": 0.74, "grad_norm": 0.24651225838741586, "learning_rate": 1.6936766163246304e-07, "loss": 1.4814, "step": 3367 }, { "epoch": 0.7402197802197802, "grad_norm": 0.24757725955012502, "learning_rate": 1.69209166103548e-07, "loss": 1.4704, "step": 3368 }, { "epoch": 0.7404395604395604, "grad_norm": 0.24913547631859492, "learning_rate": 1.6905077344965188e-07, "loss": 1.4637, "step": 3369 }, { "epoch": 0.7406593406593407, "grad_norm": 0.24699608292536634, "learning_rate": 1.688924837462859e-07, "loss": 1.4833, "step": 3370 }, { "epoch": 0.7408791208791209, "grad_norm": 0.25031556402939964, "learning_rate": 1.6873429706891243e-07, "loss": 1.4265, "step": 3371 }, { "epoch": 0.7410989010989011, "grad_norm": 0.2472417163494106, "learning_rate": 1.6857621349294455e-07, "loss": 1.4265, "step": 3372 }, { "epoch": 0.7413186813186813, "grad_norm": 0.24054463349031055, "learning_rate": 1.684182330937462e-07, "loss": 1.4242, "step": 3373 }, { "epoch": 0.7415384615384616, "grad_norm": 0.25049026957787496, "learning_rate": 1.6826035594663202e-07, "loss": 1.4479, "step": 3374 }, { "epoch": 0.7417582417582418, "grad_norm": 0.3432479096196123, "learning_rate": 1.6810258212686767e-07, "loss": 1.4528, "step": 3375 }, { "epoch": 0.741978021978022, "grad_norm": 0.43450980079736995, "learning_rate": 1.6794491170966954e-07, "loss": 1.4056, "step": 3376 }, { "epoch": 0.7421978021978022, "grad_norm": 0.2524522000964982, "learning_rate": 1.6778734477020451e-07, "loss": 1.4629, "step": 3377 }, { "epoch": 0.7424175824175824, "grad_norm": 0.28158437297803574, "learning_rate": 1.6762988138359027e-07, "loss": 1.4661, "step": 3378 }, { "epoch": 0.7426373626373627, "grad_norm": 0.2438021232880787, "learning_rate": 1.6747252162489492e-07, "loss": 1.4857, "step": 3379 }, { "epoch": 0.7428571428571429, "grad_norm": 0.24487210679330199, "learning_rate": 1.6731526556913763e-07, "loss": 1.4632, "step": 3380 }, { "epoch": 0.7430769230769231, "grad_norm": 0.2375924533319818, "learning_rate": 1.6715811329128773e-07, "loss": 1.4816, "step": 3381 }, { "epoch": 0.7432967032967033, "grad_norm": 0.25019045466570355, "learning_rate": 1.6700106486626512e-07, "loss": 1.4484, "step": 3382 }, { "epoch": 0.7435164835164835, "grad_norm": 0.248879208187517, "learning_rate": 1.6684412036894041e-07, "loss": 1.4562, "step": 3383 }, { "epoch": 0.7437362637362638, "grad_norm": 0.2533580101247259, "learning_rate": 1.6668727987413432e-07, "loss": 1.4878, "step": 3384 }, { "epoch": 0.743956043956044, "grad_norm": 0.23959289705268724, "learning_rate": 1.6653054345661842e-07, "loss": 1.4892, "step": 3385 }, { "epoch": 0.7441758241758242, "grad_norm": 0.243596496979219, "learning_rate": 1.6637391119111432e-07, "loss": 1.4503, "step": 3386 }, { "epoch": 0.7443956043956044, "grad_norm": 0.25527158467697086, "learning_rate": 1.6621738315229406e-07, "loss": 1.49, "step": 3387 }, { "epoch": 0.7446153846153846, "grad_norm": 0.24111902811215588, "learning_rate": 1.6606095941478005e-07, "loss": 1.4166, "step": 3388 }, { "epoch": 0.7448351648351649, "grad_norm": 0.259317487676881, "learning_rate": 1.6590464005314483e-07, "loss": 1.4944, "step": 3389 }, { "epoch": 0.7450549450549451, "grad_norm": 5.822964868824899, "learning_rate": 1.6574842514191139e-07, "loss": 1.4883, "step": 3390 }, { "epoch": 0.7452747252747253, "grad_norm": 0.24580329806796744, "learning_rate": 1.655923147555529e-07, "loss": 1.4484, "step": 3391 }, { "epoch": 0.7454945054945055, "grad_norm": 0.31420614026945887, "learning_rate": 1.654363089684926e-07, "loss": 1.4186, "step": 3392 }, { "epoch": 0.7457142857142857, "grad_norm": 0.24336437785507056, "learning_rate": 1.6528040785510382e-07, "loss": 1.4657, "step": 3393 }, { "epoch": 0.745934065934066, "grad_norm": 0.24724684614368012, "learning_rate": 1.6512461148971e-07, "loss": 1.5491, "step": 3394 }, { "epoch": 0.7461538461538462, "grad_norm": 0.2586628625291868, "learning_rate": 1.6496891994658489e-07, "loss": 1.5383, "step": 3395 }, { "epoch": 0.7463736263736264, "grad_norm": 0.24906770603882725, "learning_rate": 1.6481333329995197e-07, "loss": 1.4988, "step": 3396 }, { "epoch": 0.7465934065934066, "grad_norm": 0.255615007808465, "learning_rate": 1.6465785162398475e-07, "loss": 1.5019, "step": 3397 }, { "epoch": 0.7468131868131868, "grad_norm": 0.2510150542694438, "learning_rate": 1.6450247499280692e-07, "loss": 1.4584, "step": 3398 }, { "epoch": 0.7470329670329671, "grad_norm": 0.25952766439093616, "learning_rate": 1.6434720348049183e-07, "loss": 1.4725, "step": 3399 }, { "epoch": 0.7472527472527473, "grad_norm": 0.2694863141977391, "learning_rate": 1.6419203716106294e-07, "loss": 1.4251, "step": 3400 }, { "epoch": 0.7474725274725275, "grad_norm": 0.27898197115846796, "learning_rate": 1.6403697610849337e-07, "loss": 1.4128, "step": 3401 }, { "epoch": 0.7476923076923077, "grad_norm": 0.3037927593183233, "learning_rate": 1.6388202039670614e-07, "loss": 1.4882, "step": 3402 }, { "epoch": 0.747912087912088, "grad_norm": 0.2493692644682538, "learning_rate": 1.6372717009957397e-07, "loss": 1.5281, "step": 3403 }, { "epoch": 0.7481318681318682, "grad_norm": 0.23686739433759432, "learning_rate": 1.635724252909196e-07, "loss": 1.5028, "step": 3404 }, { "epoch": 0.7483516483516484, "grad_norm": 0.2515016455629176, "learning_rate": 1.6341778604451498e-07, "loss": 1.4553, "step": 3405 }, { "epoch": 0.7485714285714286, "grad_norm": 0.25018838268203325, "learning_rate": 1.6326325243408235e-07, "loss": 1.477, "step": 3406 }, { "epoch": 0.7487912087912088, "grad_norm": 0.3274730094658662, "learning_rate": 1.6310882453329314e-07, "loss": 1.425, "step": 3407 }, { "epoch": 0.7490109890109891, "grad_norm": 0.24745487892889775, "learning_rate": 1.6295450241576836e-07, "loss": 1.4169, "step": 3408 }, { "epoch": 0.7492307692307693, "grad_norm": 0.24659486656899832, "learning_rate": 1.6280028615507903e-07, "loss": 1.459, "step": 3409 }, { "epoch": 0.7494505494505495, "grad_norm": 0.23935687943994277, "learning_rate": 1.626461758247453e-07, "loss": 1.4627, "step": 3410 }, { "epoch": 0.7496703296703296, "grad_norm": 0.24158440391563418, "learning_rate": 1.6249217149823694e-07, "loss": 1.4688, "step": 3411 }, { "epoch": 0.7498901098901098, "grad_norm": 0.2563783065798018, "learning_rate": 1.6233827324897304e-07, "loss": 1.5046, "step": 3412 }, { "epoch": 0.7501098901098902, "grad_norm": 0.24403422210786213, "learning_rate": 1.621844811503225e-07, "loss": 1.5023, "step": 3413 }, { "epoch": 0.7503296703296704, "grad_norm": 0.2449530891458622, "learning_rate": 1.6203079527560321e-07, "loss": 1.4354, "step": 3414 }, { "epoch": 0.7505494505494505, "grad_norm": 0.24346123409489204, "learning_rate": 1.618772156980827e-07, "loss": 1.4633, "step": 3415 }, { "epoch": 0.7507692307692307, "grad_norm": 0.2586465298697483, "learning_rate": 1.6172374249097774e-07, "loss": 1.4764, "step": 3416 }, { "epoch": 0.7509890109890109, "grad_norm": 0.26093534928694234, "learning_rate": 1.6157037572745423e-07, "loss": 1.4304, "step": 3417 }, { "epoch": 0.7512087912087912, "grad_norm": 0.24764290586163387, "learning_rate": 1.6141711548062745e-07, "loss": 1.4488, "step": 3418 }, { "epoch": 0.7514285714285714, "grad_norm": 0.2540610271643636, "learning_rate": 1.6126396182356206e-07, "loss": 1.4131, "step": 3419 }, { "epoch": 0.7516483516483516, "grad_norm": 0.23576589677329232, "learning_rate": 1.6111091482927167e-07, "loss": 1.4519, "step": 3420 }, { "epoch": 0.7518681318681318, "grad_norm": 0.24835772431958147, "learning_rate": 1.6095797457071904e-07, "loss": 1.4788, "step": 3421 }, { "epoch": 0.752087912087912, "grad_norm": 0.2574199723783817, "learning_rate": 1.6080514112081625e-07, "loss": 1.4617, "step": 3422 }, { "epoch": 0.7523076923076923, "grad_norm": 0.2476977553732144, "learning_rate": 1.6065241455242424e-07, "loss": 1.5329, "step": 3423 }, { "epoch": 0.7525274725274725, "grad_norm": 0.2602043876638605, "learning_rate": 1.604997949383532e-07, "loss": 1.4821, "step": 3424 }, { "epoch": 0.7527472527472527, "grad_norm": 0.2760198955284463, "learning_rate": 1.6034728235136223e-07, "loss": 1.4769, "step": 3425 }, { "epoch": 0.7529670329670329, "grad_norm": 0.4087552429938026, "learning_rate": 1.601948768641593e-07, "loss": 1.4092, "step": 3426 }, { "epoch": 0.7531868131868132, "grad_norm": 0.24167683016679026, "learning_rate": 1.6004257854940138e-07, "loss": 1.4417, "step": 3427 }, { "epoch": 0.7534065934065934, "grad_norm": 0.2645530578878678, "learning_rate": 1.5989038747969457e-07, "loss": 1.4437, "step": 3428 }, { "epoch": 0.7536263736263736, "grad_norm": 0.2536499249107328, "learning_rate": 1.5973830372759342e-07, "loss": 1.5188, "step": 3429 }, { "epoch": 0.7538461538461538, "grad_norm": 0.25005127243745934, "learning_rate": 1.595863273656019e-07, "loss": 1.5286, "step": 3430 }, { "epoch": 0.754065934065934, "grad_norm": 0.250870371649544, "learning_rate": 1.5943445846617214e-07, "loss": 1.4469, "step": 3431 }, { "epoch": 0.7542857142857143, "grad_norm": 0.24240831220869535, "learning_rate": 1.5928269710170544e-07, "loss": 1.5119, "step": 3432 }, { "epoch": 0.7545054945054945, "grad_norm": 0.25008920693077885, "learning_rate": 1.5913104334455182e-07, "loss": 1.4287, "step": 3433 }, { "epoch": 0.7547252747252747, "grad_norm": 0.27701456778254857, "learning_rate": 1.5897949726700984e-07, "loss": 1.4594, "step": 3434 }, { "epoch": 0.7549450549450549, "grad_norm": 0.2636801355696161, "learning_rate": 1.5882805894132682e-07, "loss": 1.4497, "step": 3435 }, { "epoch": 0.7551648351648351, "grad_norm": 0.25439861863278274, "learning_rate": 1.586767284396986e-07, "loss": 1.4414, "step": 3436 }, { "epoch": 0.7553846153846154, "grad_norm": 0.24747572642399232, "learning_rate": 1.5852550583426973e-07, "loss": 1.4624, "step": 3437 }, { "epoch": 0.7556043956043956, "grad_norm": 0.3514579941733955, "learning_rate": 1.583743911971335e-07, "loss": 1.4549, "step": 3438 }, { "epoch": 0.7558241758241758, "grad_norm": 0.24662463724587547, "learning_rate": 1.5822338460033142e-07, "loss": 1.4864, "step": 3439 }, { "epoch": 0.756043956043956, "grad_norm": 0.2568050695942072, "learning_rate": 1.5807248611585354e-07, "loss": 1.4916, "step": 3440 }, { "epoch": 0.7562637362637362, "grad_norm": 0.26561380930110146, "learning_rate": 1.5792169581563847e-07, "loss": 1.467, "step": 3441 }, { "epoch": 0.7564835164835165, "grad_norm": 0.24657508351391255, "learning_rate": 1.5777101377157312e-07, "loss": 1.4409, "step": 3442 }, { "epoch": 0.7567032967032967, "grad_norm": 0.250822224928817, "learning_rate": 1.5762044005549306e-07, "loss": 1.4361, "step": 3443 }, { "epoch": 0.7569230769230769, "grad_norm": 0.2720373682723322, "learning_rate": 1.5746997473918184e-07, "loss": 1.4137, "step": 3444 }, { "epoch": 0.7571428571428571, "grad_norm": 0.2535506633968077, "learning_rate": 1.5731961789437168e-07, "loss": 1.5107, "step": 3445 }, { "epoch": 0.7573626373626373, "grad_norm": 0.24674063795003515, "learning_rate": 1.5716936959274292e-07, "loss": 1.5388, "step": 3446 }, { "epoch": 0.7575824175824176, "grad_norm": 0.35941569129434386, "learning_rate": 1.5701922990592402e-07, "loss": 1.4322, "step": 3447 }, { "epoch": 0.7578021978021978, "grad_norm": 0.24791786193462975, "learning_rate": 1.56869198905492e-07, "loss": 1.5379, "step": 3448 }, { "epoch": 0.758021978021978, "grad_norm": 0.25728322482594124, "learning_rate": 1.5671927666297175e-07, "loss": 1.4963, "step": 3449 }, { "epoch": 0.7582417582417582, "grad_norm": 0.2596196568363513, "learning_rate": 1.565694632498365e-07, "loss": 1.4646, "step": 3450 }, { "epoch": 0.7584615384615384, "grad_norm": 0.24793286961565922, "learning_rate": 1.564197587375074e-07, "loss": 1.497, "step": 3451 }, { "epoch": 0.7586813186813187, "grad_norm": 0.2345649016314522, "learning_rate": 1.5627016319735388e-07, "loss": 1.443, "step": 3452 }, { "epoch": 0.7589010989010989, "grad_norm": 0.2536264010182477, "learning_rate": 1.5612067670069351e-07, "loss": 1.4511, "step": 3453 }, { "epoch": 0.7591208791208791, "grad_norm": 0.2506823976400012, "learning_rate": 1.5597129931879156e-07, "loss": 1.5272, "step": 3454 }, { "epoch": 0.7593406593406593, "grad_norm": 0.2754415001707102, "learning_rate": 1.558220311228615e-07, "loss": 1.5006, "step": 3455 }, { "epoch": 0.7595604395604396, "grad_norm": 0.2389276474612393, "learning_rate": 1.5567287218406453e-07, "loss": 1.4615, "step": 3456 }, { "epoch": 0.7597802197802198, "grad_norm": 0.2571718798103615, "learning_rate": 1.5552382257351018e-07, "loss": 1.4553, "step": 3457 }, { "epoch": 0.76, "grad_norm": 0.24655091055769965, "learning_rate": 1.5537488236225542e-07, "loss": 1.4649, "step": 3458 }, { "epoch": 0.7602197802197802, "grad_norm": 0.24302876484845864, "learning_rate": 1.5522605162130522e-07, "loss": 1.4474, "step": 3459 }, { "epoch": 0.7604395604395604, "grad_norm": 0.3185694212030723, "learning_rate": 1.5507733042161257e-07, "loss": 1.4567, "step": 3460 }, { "epoch": 0.7606593406593407, "grad_norm": 0.2696515727530926, "learning_rate": 1.549287188340778e-07, "loss": 1.4754, "step": 3461 }, { "epoch": 0.7608791208791209, "grad_norm": 0.2673114015606836, "learning_rate": 1.547802169295495e-07, "loss": 1.5218, "step": 3462 }, { "epoch": 0.7610989010989011, "grad_norm": 0.23610970983581625, "learning_rate": 1.5463182477882358e-07, "loss": 1.4978, "step": 3463 }, { "epoch": 0.7613186813186813, "grad_norm": 0.2730698187830436, "learning_rate": 1.544835424526438e-07, "loss": 1.4913, "step": 3464 }, { "epoch": 0.7615384615384615, "grad_norm": 0.25254305724161885, "learning_rate": 1.5433537002170144e-07, "loss": 1.507, "step": 3465 }, { "epoch": 0.7617582417582418, "grad_norm": 0.25786668272947233, "learning_rate": 1.541873075566354e-07, "loss": 1.4795, "step": 3466 }, { "epoch": 0.761978021978022, "grad_norm": 0.2637700902156982, "learning_rate": 1.5403935512803235e-07, "loss": 1.5609, "step": 3467 }, { "epoch": 0.7621978021978022, "grad_norm": 0.25065676260386205, "learning_rate": 1.538915128064265e-07, "loss": 1.4574, "step": 3468 }, { "epoch": 0.7624175824175824, "grad_norm": 0.25594108668524185, "learning_rate": 1.5374378066229924e-07, "loss": 1.4698, "step": 3469 }, { "epoch": 0.7626373626373626, "grad_norm": 0.2610440771977634, "learning_rate": 1.5359615876607972e-07, "loss": 1.4879, "step": 3470 }, { "epoch": 0.7628571428571429, "grad_norm": 0.25432582156452654, "learning_rate": 1.534486471881443e-07, "loss": 1.4248, "step": 3471 }, { "epoch": 0.7630769230769231, "grad_norm": 0.26147745724715327, "learning_rate": 1.5330124599881714e-07, "loss": 1.4193, "step": 3472 }, { "epoch": 0.7632967032967033, "grad_norm": 0.2428130190593821, "learning_rate": 1.531539552683694e-07, "loss": 1.4906, "step": 3473 }, { "epoch": 0.7635164835164835, "grad_norm": 0.2728233192533336, "learning_rate": 1.5300677506701962e-07, "loss": 1.4934, "step": 3474 }, { "epoch": 0.7637362637362637, "grad_norm": 0.2518794128485732, "learning_rate": 1.5285970546493395e-07, "loss": 1.4364, "step": 3475 }, { "epoch": 0.763956043956044, "grad_norm": 0.24789489049670058, "learning_rate": 1.5271274653222537e-07, "loss": 1.5031, "step": 3476 }, { "epoch": 0.7641758241758242, "grad_norm": 0.2520873445398746, "learning_rate": 1.5256589833895455e-07, "loss": 1.4572, "step": 3477 }, { "epoch": 0.7643956043956044, "grad_norm": 0.25574820315312446, "learning_rate": 1.5241916095512905e-07, "loss": 1.5286, "step": 3478 }, { "epoch": 0.7646153846153846, "grad_norm": 0.2414238185517599, "learning_rate": 1.522725344507037e-07, "loss": 1.464, "step": 3479 }, { "epoch": 0.7648351648351648, "grad_norm": 0.24641018639956744, "learning_rate": 1.521260188955804e-07, "loss": 1.4514, "step": 3480 }, { "epoch": 0.7650549450549451, "grad_norm": 0.24799631724452226, "learning_rate": 1.5197961435960828e-07, "loss": 1.4701, "step": 3481 }, { "epoch": 0.7652747252747253, "grad_norm": 0.23412073254313034, "learning_rate": 1.5183332091258343e-07, "loss": 1.4642, "step": 3482 }, { "epoch": 0.7654945054945055, "grad_norm": 0.25836737209752714, "learning_rate": 1.5168713862424925e-07, "loss": 1.3991, "step": 3483 }, { "epoch": 0.7657142857142857, "grad_norm": 0.4351468390241667, "learning_rate": 1.5154106756429578e-07, "loss": 1.4956, "step": 3484 }, { "epoch": 0.765934065934066, "grad_norm": 0.2524831931161781, "learning_rate": 1.513951078023601e-07, "loss": 1.4345, "step": 3485 }, { "epoch": 0.7661538461538462, "grad_norm": 0.2675055175327928, "learning_rate": 1.5124925940802654e-07, "loss": 1.4711, "step": 3486 }, { "epoch": 0.7663736263736264, "grad_norm": 0.24495792608121375, "learning_rate": 1.51103522450826e-07, "loss": 1.4758, "step": 3487 }, { "epoch": 0.7665934065934066, "grad_norm": 0.2522590634894788, "learning_rate": 1.5095789700023637e-07, "loss": 1.4539, "step": 3488 }, { "epoch": 0.7668131868131868, "grad_norm": 0.25286682540893707, "learning_rate": 1.508123831256823e-07, "loss": 1.4253, "step": 3489 }, { "epoch": 0.7670329670329671, "grad_norm": 0.23925401562789175, "learning_rate": 1.5066698089653542e-07, "loss": 1.4544, "step": 3490 }, { "epoch": 0.7672527472527473, "grad_norm": 0.25340387447587676, "learning_rate": 1.5052169038211415e-07, "loss": 1.4535, "step": 3491 }, { "epoch": 0.7674725274725275, "grad_norm": 0.24347089266421007, "learning_rate": 1.503765116516834e-07, "loss": 1.4148, "step": 3492 }, { "epoch": 0.7676923076923077, "grad_norm": 0.23447059043429536, "learning_rate": 1.5023144477445505e-07, "loss": 1.452, "step": 3493 }, { "epoch": 0.7679120879120879, "grad_norm": 0.2514153456877283, "learning_rate": 1.500864898195874e-07, "loss": 1.4588, "step": 3494 }, { "epoch": 0.7681318681318682, "grad_norm": 0.2592288169580409, "learning_rate": 1.4994164685618558e-07, "loss": 1.4572, "step": 3495 }, { "epoch": 0.7683516483516484, "grad_norm": 0.25138327897053964, "learning_rate": 1.4979691595330134e-07, "loss": 1.4905, "step": 3496 }, { "epoch": 0.7685714285714286, "grad_norm": 0.24374688066574657, "learning_rate": 1.4965229717993287e-07, "loss": 1.3699, "step": 3497 }, { "epoch": 0.7687912087912088, "grad_norm": 0.23783679830155766, "learning_rate": 1.495077906050251e-07, "loss": 1.3991, "step": 3498 }, { "epoch": 0.769010989010989, "grad_norm": 0.2876140351160437, "learning_rate": 1.4936339629746926e-07, "loss": 1.4186, "step": 3499 }, { "epoch": 0.7692307692307693, "grad_norm": 0.2563523347998719, "learning_rate": 1.4921911432610313e-07, "loss": 1.5102, "step": 3500 }, { "epoch": 0.7694505494505495, "grad_norm": 0.24859372497912346, "learning_rate": 1.4907494475971111e-07, "loss": 1.4803, "step": 3501 }, { "epoch": 0.7696703296703297, "grad_norm": 0.2518190377848507, "learning_rate": 1.4893088766702374e-07, "loss": 1.4816, "step": 3502 }, { "epoch": 0.7698901098901099, "grad_norm": 0.2460201616848149, "learning_rate": 1.4878694311671812e-07, "loss": 1.4837, "step": 3503 }, { "epoch": 0.7701098901098901, "grad_norm": 0.3125759802208812, "learning_rate": 1.4864311117741748e-07, "loss": 1.4507, "step": 3504 }, { "epoch": 0.7703296703296704, "grad_norm": 0.23312905930707495, "learning_rate": 1.4849939191769167e-07, "loss": 1.505, "step": 3505 }, { "epoch": 0.7705494505494506, "grad_norm": 0.24369591311036734, "learning_rate": 1.4835578540605674e-07, "loss": 1.4383, "step": 3506 }, { "epoch": 0.7707692307692308, "grad_norm": 0.24574200737848517, "learning_rate": 1.4821229171097481e-07, "loss": 1.4348, "step": 3507 }, { "epoch": 0.770989010989011, "grad_norm": 0.25948030707170733, "learning_rate": 1.480689109008544e-07, "loss": 1.4927, "step": 3508 }, { "epoch": 0.7712087912087912, "grad_norm": 0.24646101665815814, "learning_rate": 1.4792564304405008e-07, "loss": 1.4658, "step": 3509 }, { "epoch": 0.7714285714285715, "grad_norm": 0.25183416888382154, "learning_rate": 1.4778248820886256e-07, "loss": 1.4892, "step": 3510 }, { "epoch": 0.7716483516483517, "grad_norm": 0.26214956967582087, "learning_rate": 1.4763944646353896e-07, "loss": 1.4403, "step": 3511 }, { "epoch": 0.7718681318681319, "grad_norm": 0.24312505162483267, "learning_rate": 1.4749651787627206e-07, "loss": 1.4669, "step": 3512 }, { "epoch": 0.772087912087912, "grad_norm": 0.27184280319665866, "learning_rate": 1.4735370251520106e-07, "loss": 1.4458, "step": 3513 }, { "epoch": 0.7723076923076924, "grad_norm": 0.2528196050262525, "learning_rate": 1.4721100044841088e-07, "loss": 1.4496, "step": 3514 }, { "epoch": 0.7725274725274726, "grad_norm": 0.24368904732271113, "learning_rate": 1.470684117439328e-07, "loss": 1.4656, "step": 3515 }, { "epoch": 0.7727472527472528, "grad_norm": 0.2545030683078165, "learning_rate": 1.469259364697436e-07, "loss": 1.4787, "step": 3516 }, { "epoch": 0.772967032967033, "grad_norm": 0.2505895633602117, "learning_rate": 1.4678357469376635e-07, "loss": 1.4548, "step": 3517 }, { "epoch": 0.7731868131868131, "grad_norm": 0.2467224646872098, "learning_rate": 1.466413264838698e-07, "loss": 1.4153, "step": 3518 }, { "epoch": 0.7734065934065935, "grad_norm": 0.25185570838648613, "learning_rate": 1.464991919078686e-07, "loss": 1.4617, "step": 3519 }, { "epoch": 0.7736263736263737, "grad_norm": 0.2534599780017951, "learning_rate": 1.463571710335233e-07, "loss": 1.4327, "step": 3520 }, { "epoch": 0.7738461538461539, "grad_norm": 0.25144705278300167, "learning_rate": 1.4621526392854032e-07, "loss": 1.5118, "step": 3521 }, { "epoch": 0.774065934065934, "grad_norm": 0.26670095797124216, "learning_rate": 1.4607347066057164e-07, "loss": 1.439, "step": 3522 }, { "epoch": 0.7742857142857142, "grad_norm": 0.2413791122321761, "learning_rate": 1.4593179129721504e-07, "loss": 1.478, "step": 3523 }, { "epoch": 0.7745054945054946, "grad_norm": 0.25088589122680605, "learning_rate": 1.4579022590601395e-07, "loss": 1.4302, "step": 3524 }, { "epoch": 0.7747252747252747, "grad_norm": 0.26053424455776625, "learning_rate": 1.456487745544577e-07, "loss": 1.4311, "step": 3525 }, { "epoch": 0.774945054945055, "grad_norm": 0.25485413495907105, "learning_rate": 1.45507437309981e-07, "loss": 1.5281, "step": 3526 }, { "epoch": 0.7751648351648351, "grad_norm": 0.2659615784811842, "learning_rate": 1.4536621423996413e-07, "loss": 1.4063, "step": 3527 }, { "epoch": 0.7753846153846153, "grad_norm": 0.23943128667617608, "learning_rate": 1.452251054117332e-07, "loss": 1.4649, "step": 3528 }, { "epoch": 0.7756043956043956, "grad_norm": 0.25526777049689364, "learning_rate": 1.4508411089255965e-07, "loss": 1.455, "step": 3529 }, { "epoch": 0.7758241758241758, "grad_norm": 0.2558591110689858, "learning_rate": 1.4494323074966062e-07, "loss": 1.4661, "step": 3530 }, { "epoch": 0.776043956043956, "grad_norm": 0.25419760736204466, "learning_rate": 1.4480246505019842e-07, "loss": 1.4577, "step": 3531 }, { "epoch": 0.7762637362637362, "grad_norm": 0.25903526768170637, "learning_rate": 1.4466181386128103e-07, "loss": 1.4744, "step": 3532 }, { "epoch": 0.7764835164835164, "grad_norm": 0.24828421408389714, "learning_rate": 1.4452127724996183e-07, "loss": 1.479, "step": 3533 }, { "epoch": 0.7767032967032967, "grad_norm": 0.26867652179704915, "learning_rate": 1.4438085528323935e-07, "loss": 1.5028, "step": 3534 }, { "epoch": 0.7769230769230769, "grad_norm": 0.25106634632110353, "learning_rate": 1.4424054802805773e-07, "loss": 1.4358, "step": 3535 }, { "epoch": 0.7771428571428571, "grad_norm": 0.25370969667004695, "learning_rate": 1.441003555513065e-07, "loss": 1.5298, "step": 3536 }, { "epoch": 0.7773626373626373, "grad_norm": 0.2598201877282118, "learning_rate": 1.4396027791982015e-07, "loss": 1.4975, "step": 3537 }, { "epoch": 0.7775824175824175, "grad_norm": 0.2553860475676706, "learning_rate": 1.438203152003786e-07, "loss": 1.5021, "step": 3538 }, { "epoch": 0.7778021978021978, "grad_norm": 0.45663122512373583, "learning_rate": 1.436804674597069e-07, "loss": 1.4772, "step": 3539 }, { "epoch": 0.778021978021978, "grad_norm": 0.2552533439140228, "learning_rate": 1.4354073476447547e-07, "loss": 1.4803, "step": 3540 }, { "epoch": 0.7782417582417582, "grad_norm": 0.2479704540195358, "learning_rate": 1.434011171812997e-07, "loss": 1.5289, "step": 3541 }, { "epoch": 0.7784615384615384, "grad_norm": 0.32542840344395213, "learning_rate": 1.4326161477674004e-07, "loss": 1.5266, "step": 3542 }, { "epoch": 0.7786813186813187, "grad_norm": 0.23565302383184014, "learning_rate": 1.431222276173023e-07, "loss": 1.4597, "step": 3543 }, { "epoch": 0.7789010989010989, "grad_norm": 0.3867720198894138, "learning_rate": 1.429829557694372e-07, "loss": 1.424, "step": 3544 }, { "epoch": 0.7791208791208791, "grad_norm": 0.2597633759561389, "learning_rate": 1.4284379929954057e-07, "loss": 1.4179, "step": 3545 }, { "epoch": 0.7793406593406593, "grad_norm": 0.30002084123346406, "learning_rate": 1.4270475827395293e-07, "loss": 1.4433, "step": 3546 }, { "epoch": 0.7795604395604395, "grad_norm": 0.25167569917621246, "learning_rate": 1.4256583275896016e-07, "loss": 1.4922, "step": 3547 }, { "epoch": 0.7797802197802198, "grad_norm": 0.24780269228434898, "learning_rate": 1.4242702282079278e-07, "loss": 1.4483, "step": 3548 }, { "epoch": 0.78, "grad_norm": 0.23782809847618502, "learning_rate": 1.4228832852562644e-07, "loss": 1.5192, "step": 3549 }, { "epoch": 0.7802197802197802, "grad_norm": 0.25015173000447616, "learning_rate": 1.421497499395814e-07, "loss": 1.4726, "step": 3550 }, { "epoch": 0.7804395604395604, "grad_norm": 0.25407649625995454, "learning_rate": 1.4201128712872312e-07, "loss": 1.4719, "step": 3551 }, { "epoch": 0.7806593406593406, "grad_norm": 0.24941603461830075, "learning_rate": 1.4187294015906157e-07, "loss": 1.5027, "step": 3552 }, { "epoch": 0.7808791208791209, "grad_norm": 0.23807140103975025, "learning_rate": 1.417347090965514e-07, "loss": 1.4064, "step": 3553 }, { "epoch": 0.7810989010989011, "grad_norm": 0.24191487472968665, "learning_rate": 1.4159659400709242e-07, "loss": 1.4771, "step": 3554 }, { "epoch": 0.7813186813186813, "grad_norm": 0.25806951471456535, "learning_rate": 1.414585949565289e-07, "loss": 1.4518, "step": 3555 }, { "epoch": 0.7815384615384615, "grad_norm": 0.26102687643833583, "learning_rate": 1.4132071201064965e-07, "loss": 1.4944, "step": 3556 }, { "epoch": 0.7817582417582417, "grad_norm": 0.23708380309095017, "learning_rate": 1.4118294523518827e-07, "loss": 1.4228, "step": 3557 }, { "epoch": 0.781978021978022, "grad_norm": 0.2471786128723923, "learning_rate": 1.4104529469582307e-07, "loss": 1.4799, "step": 3558 }, { "epoch": 0.7821978021978022, "grad_norm": 0.24500193059953368, "learning_rate": 1.4090776045817697e-07, "loss": 1.473, "step": 3559 }, { "epoch": 0.7824175824175824, "grad_norm": 0.25561153064545017, "learning_rate": 1.407703425878172e-07, "loss": 1.4672, "step": 3560 }, { "epoch": 0.7826373626373626, "grad_norm": 0.25072978364339477, "learning_rate": 1.406330411502557e-07, "loss": 1.3961, "step": 3561 }, { "epoch": 0.7828571428571428, "grad_norm": 0.2503499710724997, "learning_rate": 1.4049585621094887e-07, "loss": 1.4371, "step": 3562 }, { "epoch": 0.7830769230769231, "grad_norm": 0.25685892700179125, "learning_rate": 1.4035878783529743e-07, "loss": 1.5071, "step": 3563 }, { "epoch": 0.7832967032967033, "grad_norm": 0.24035617397967948, "learning_rate": 1.4022183608864683e-07, "loss": 1.462, "step": 3564 }, { "epoch": 0.7835164835164835, "grad_norm": 0.2558478125326631, "learning_rate": 1.4008500103628658e-07, "loss": 1.4597, "step": 3565 }, { "epoch": 0.7837362637362637, "grad_norm": 0.24857759745695338, "learning_rate": 1.3994828274345092e-07, "loss": 1.4438, "step": 3566 }, { "epoch": 0.7839560439560439, "grad_norm": 0.25543980960816365, "learning_rate": 1.3981168127531812e-07, "loss": 1.4608, "step": 3567 }, { "epoch": 0.7841758241758242, "grad_norm": 0.29150447172254046, "learning_rate": 1.3967519669701077e-07, "loss": 1.5015, "step": 3568 }, { "epoch": 0.7843956043956044, "grad_norm": 0.24935243330679485, "learning_rate": 1.3953882907359602e-07, "loss": 1.4433, "step": 3569 }, { "epoch": 0.7846153846153846, "grad_norm": 0.2514029301929969, "learning_rate": 1.3940257847008497e-07, "loss": 1.4389, "step": 3570 }, { "epoch": 0.7848351648351648, "grad_norm": 0.25925198028207674, "learning_rate": 1.392664449514331e-07, "loss": 1.4709, "step": 3571 }, { "epoch": 0.7850549450549451, "grad_norm": 0.2723531342337533, "learning_rate": 1.3913042858253984e-07, "loss": 1.448, "step": 3572 }, { "epoch": 0.7852747252747253, "grad_norm": 0.24850649632846272, "learning_rate": 1.3899452942824903e-07, "loss": 1.4978, "step": 3573 }, { "epoch": 0.7854945054945055, "grad_norm": 0.2689628321688233, "learning_rate": 1.3885874755334867e-07, "loss": 1.5149, "step": 3574 }, { "epoch": 0.7857142857142857, "grad_norm": 0.24957677048899574, "learning_rate": 1.3872308302257062e-07, "loss": 1.4822, "step": 3575 }, { "epoch": 0.7859340659340659, "grad_norm": 0.24824761818350033, "learning_rate": 1.385875359005909e-07, "loss": 1.4686, "step": 3576 }, { "epoch": 0.7861538461538462, "grad_norm": 0.24653686749157777, "learning_rate": 1.384521062520294e-07, "loss": 1.4865, "step": 3577 }, { "epoch": 0.7863736263736264, "grad_norm": 0.24079307159403177, "learning_rate": 1.3831679414145045e-07, "loss": 1.5198, "step": 3578 }, { "epoch": 0.7865934065934066, "grad_norm": 0.2635919590559695, "learning_rate": 1.3818159963336183e-07, "loss": 1.4742, "step": 3579 }, { "epoch": 0.7868131868131868, "grad_norm": 0.2751772003342524, "learning_rate": 1.3804652279221552e-07, "loss": 1.4842, "step": 3580 }, { "epoch": 0.787032967032967, "grad_norm": 0.2431755192493663, "learning_rate": 1.3791156368240744e-07, "loss": 1.4473, "step": 3581 }, { "epoch": 0.7872527472527473, "grad_norm": 0.25128470024797633, "learning_rate": 1.3777672236827717e-07, "loss": 1.5358, "step": 3582 }, { "epoch": 0.7874725274725275, "grad_norm": 0.25448077071732245, "learning_rate": 1.3764199891410846e-07, "loss": 1.4844, "step": 3583 }, { "epoch": 0.7876923076923077, "grad_norm": 1.0570616444023322, "learning_rate": 1.3750739338412858e-07, "loss": 1.4446, "step": 3584 }, { "epoch": 0.7879120879120879, "grad_norm": 0.23944048780271676, "learning_rate": 1.3737290584250865e-07, "loss": 1.464, "step": 3585 }, { "epoch": 0.7881318681318681, "grad_norm": 0.24227352066455082, "learning_rate": 1.372385363533636e-07, "loss": 1.4377, "step": 3586 }, { "epoch": 0.7883516483516484, "grad_norm": 0.2537578306005774, "learning_rate": 1.37104284980752e-07, "loss": 1.495, "step": 3587 }, { "epoch": 0.7885714285714286, "grad_norm": 0.2620923166657437, "learning_rate": 1.3697015178867622e-07, "loss": 1.4803, "step": 3588 }, { "epoch": 0.7887912087912088, "grad_norm": 0.24691711029696287, "learning_rate": 1.3683613684108231e-07, "loss": 1.4204, "step": 3589 }, { "epoch": 0.789010989010989, "grad_norm": 0.24829109602928742, "learning_rate": 1.3670224020185976e-07, "loss": 1.4059, "step": 3590 }, { "epoch": 0.7892307692307692, "grad_norm": 0.2591759361401097, "learning_rate": 1.3656846193484182e-07, "loss": 1.5086, "step": 3591 }, { "epoch": 0.7894505494505495, "grad_norm": 0.25291516492263455, "learning_rate": 1.3643480210380517e-07, "loss": 1.4726, "step": 3592 }, { "epoch": 0.7896703296703297, "grad_norm": 0.6249091491515365, "learning_rate": 1.3630126077247027e-07, "loss": 1.4475, "step": 3593 }, { "epoch": 0.7898901098901099, "grad_norm": 0.23734670216213488, "learning_rate": 1.361678380045008e-07, "loss": 1.487, "step": 3594 }, { "epoch": 0.7901098901098901, "grad_norm": 0.3959081821447881, "learning_rate": 1.3603453386350402e-07, "loss": 1.4392, "step": 3595 }, { "epoch": 0.7903296703296703, "grad_norm": 0.24372053188157605, "learning_rate": 1.3590134841303083e-07, "loss": 1.4699, "step": 3596 }, { "epoch": 0.7905494505494506, "grad_norm": 0.24807847698118313, "learning_rate": 1.357682817165752e-07, "loss": 1.4987, "step": 3597 }, { "epoch": 0.7907692307692308, "grad_norm": 0.2497931550368373, "learning_rate": 1.3563533383757485e-07, "loss": 1.4807, "step": 3598 }, { "epoch": 0.790989010989011, "grad_norm": 0.25284357241917227, "learning_rate": 1.3550250483941054e-07, "loss": 1.517, "step": 3599 }, { "epoch": 0.7912087912087912, "grad_norm": 0.42253741591653593, "learning_rate": 1.3536979478540658e-07, "loss": 1.4666, "step": 3600 }, { "epoch": 0.7914285714285715, "grad_norm": 0.2532715902024033, "learning_rate": 1.352372037388303e-07, "loss": 1.4652, "step": 3601 }, { "epoch": 0.7916483516483517, "grad_norm": 0.25926879674229014, "learning_rate": 1.3510473176289269e-07, "loss": 1.4912, "step": 3602 }, { "epoch": 0.7918681318681319, "grad_norm": 0.26097516478868577, "learning_rate": 1.3497237892074762e-07, "loss": 1.5148, "step": 3603 }, { "epoch": 0.7920879120879121, "grad_norm": 0.254543043371435, "learning_rate": 1.3484014527549244e-07, "loss": 1.5257, "step": 3604 }, { "epoch": 0.7923076923076923, "grad_norm": 0.2457397860178966, "learning_rate": 1.3470803089016746e-07, "loss": 1.4766, "step": 3605 }, { "epoch": 0.7925274725274726, "grad_norm": 0.4144715230446353, "learning_rate": 1.3457603582775611e-07, "loss": 1.5179, "step": 3606 }, { "epoch": 0.7927472527472528, "grad_norm": 0.247610218313042, "learning_rate": 1.3444416015118525e-07, "loss": 1.5228, "step": 3607 }, { "epoch": 0.792967032967033, "grad_norm": 0.25391350551724173, "learning_rate": 1.3431240392332454e-07, "loss": 1.5207, "step": 3608 }, { "epoch": 0.7931868131868132, "grad_norm": 0.25184527811361523, "learning_rate": 1.3418076720698675e-07, "loss": 1.4727, "step": 3609 }, { "epoch": 0.7934065934065934, "grad_norm": 0.24622976983362402, "learning_rate": 1.3404925006492758e-07, "loss": 1.4735, "step": 3610 }, { "epoch": 0.7936263736263737, "grad_norm": 0.25390713793587044, "learning_rate": 1.3391785255984596e-07, "loss": 1.4771, "step": 3611 }, { "epoch": 0.7938461538461539, "grad_norm": 0.38109727986093606, "learning_rate": 1.337865747543838e-07, "loss": 1.4558, "step": 3612 }, { "epoch": 0.7940659340659341, "grad_norm": 0.2641541808340066, "learning_rate": 1.336554167111256e-07, "loss": 1.4885, "step": 3613 }, { "epoch": 0.7942857142857143, "grad_norm": 0.27480749669000204, "learning_rate": 1.3352437849259907e-07, "loss": 1.4257, "step": 3614 }, { "epoch": 0.7945054945054945, "grad_norm": 0.2401052566258336, "learning_rate": 1.3339346016127466e-07, "loss": 1.4629, "step": 3615 }, { "epoch": 0.7947252747252748, "grad_norm": 0.2564716645167739, "learning_rate": 1.3326266177956567e-07, "loss": 1.3667, "step": 3616 }, { "epoch": 0.794945054945055, "grad_norm": 0.2644254892320189, "learning_rate": 1.331319834098284e-07, "loss": 1.4852, "step": 3617 }, { "epoch": 0.7951648351648352, "grad_norm": 0.23510699478496122, "learning_rate": 1.3300142511436154e-07, "loss": 1.4449, "step": 3618 }, { "epoch": 0.7953846153846154, "grad_norm": 0.24680782985585087, "learning_rate": 1.3287098695540706e-07, "loss": 1.4294, "step": 3619 }, { "epoch": 0.7956043956043956, "grad_norm": 0.23889386781148805, "learning_rate": 1.3274066899514926e-07, "loss": 1.47, "step": 3620 }, { "epoch": 0.7958241758241759, "grad_norm": 0.25622950788305665, "learning_rate": 1.3261047129571515e-07, "loss": 1.5286, "step": 3621 }, { "epoch": 0.7960439560439561, "grad_norm": 0.25262759088704845, "learning_rate": 1.3248039391917472e-07, "loss": 1.4826, "step": 3622 }, { "epoch": 0.7962637362637363, "grad_norm": 0.25194531805796505, "learning_rate": 1.323504369275403e-07, "loss": 1.4836, "step": 3623 }, { "epoch": 0.7964835164835165, "grad_norm": 0.25171370581313124, "learning_rate": 1.3222060038276684e-07, "loss": 1.4469, "step": 3624 }, { "epoch": 0.7967032967032966, "grad_norm": 0.24556699791763342, "learning_rate": 1.3209088434675204e-07, "loss": 1.4828, "step": 3625 }, { "epoch": 0.796923076923077, "grad_norm": 0.2521356125431271, "learning_rate": 1.3196128888133607e-07, "loss": 1.4993, "step": 3626 }, { "epoch": 0.7971428571428572, "grad_norm": 0.2553718391138483, "learning_rate": 1.3183181404830155e-07, "loss": 1.495, "step": 3627 }, { "epoch": 0.7973626373626373, "grad_norm": 0.25627697102082586, "learning_rate": 1.3170245990937372e-07, "loss": 1.4891, "step": 3628 }, { "epoch": 0.7975824175824175, "grad_norm": 0.25474200716456835, "learning_rate": 1.3157322652622018e-07, "loss": 1.451, "step": 3629 }, { "epoch": 0.7978021978021979, "grad_norm": 0.2780876430660836, "learning_rate": 1.3144411396045093e-07, "loss": 1.4847, "step": 3630 }, { "epoch": 0.798021978021978, "grad_norm": 0.24647331582578247, "learning_rate": 1.313151222736186e-07, "loss": 1.5069, "step": 3631 }, { "epoch": 0.7982417582417582, "grad_norm": 0.2439146931874856, "learning_rate": 1.3118625152721788e-07, "loss": 1.4613, "step": 3632 }, { "epoch": 0.7984615384615384, "grad_norm": 0.248085550020872, "learning_rate": 1.3105750178268592e-07, "loss": 1.4373, "step": 3633 }, { "epoch": 0.7986813186813186, "grad_norm": 0.25150351290048434, "learning_rate": 1.3092887310140235e-07, "loss": 1.4231, "step": 3634 }, { "epoch": 0.798901098901099, "grad_norm": 0.24013690468453455, "learning_rate": 1.3080036554468878e-07, "loss": 1.4962, "step": 3635 }, { "epoch": 0.7991208791208791, "grad_norm": 0.25278653321754047, "learning_rate": 1.3067197917380946e-07, "loss": 1.4942, "step": 3636 }, { "epoch": 0.7993406593406593, "grad_norm": 0.2569306306755292, "learning_rate": 1.3054371404997055e-07, "loss": 1.4595, "step": 3637 }, { "epoch": 0.7995604395604395, "grad_norm": 0.2573964718132754, "learning_rate": 1.3041557023432052e-07, "loss": 1.4758, "step": 3638 }, { "epoch": 0.7997802197802197, "grad_norm": 0.24776350556113183, "learning_rate": 1.3028754778794997e-07, "loss": 1.4398, "step": 3639 }, { "epoch": 0.8, "grad_norm": 0.24989334557464338, "learning_rate": 1.301596467718916e-07, "loss": 1.4161, "step": 3640 } ], "logging_steps": 1, "max_steps": 4550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 910, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6845111312777216e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }