diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,57092 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001226993865030675, + "grad_norm": 7.532712243050341, + "learning_rate": 8.163265306122451e-08, + "loss": 0.8895, + "step": 1 + }, + { + "epoch": 0.000245398773006135, + "grad_norm": 7.804554457698843, + "learning_rate": 1.6326530612244901e-07, + "loss": 0.8743, + "step": 2 + }, + { + "epoch": 0.0003680981595092025, + "grad_norm": 13.111797711410347, + "learning_rate": 2.4489795918367347e-07, + "loss": 1.3506, + "step": 3 + }, + { + "epoch": 0.00049079754601227, + "grad_norm": 7.837921653516651, + "learning_rate": 3.2653061224489803e-07, + "loss": 0.8568, + "step": 4 + }, + { + "epoch": 0.0006134969325153375, + "grad_norm": 8.141746556269675, + "learning_rate": 4.0816326530612243e-07, + "loss": 0.9279, + "step": 5 + }, + { + "epoch": 0.000736196319018405, + "grad_norm": 12.221393338847722, + "learning_rate": 4.897959183673469e-07, + "loss": 1.3375, + "step": 6 + }, + { + "epoch": 0.0008588957055214724, + "grad_norm": 8.928439635180318, + "learning_rate": 5.714285714285715e-07, + "loss": 0.8634, + "step": 7 + }, + { + "epoch": 0.00098159509202454, + "grad_norm": 7.280559979079106, + "learning_rate": 6.530612244897961e-07, + "loss": 0.8221, + "step": 8 + }, + { + "epoch": 0.0011042944785276073, + "grad_norm": 6.49526337925753, + "learning_rate": 7.346938775510205e-07, + "loss": 0.8067, + "step": 9 + }, + { + "epoch": 0.001226993865030675, + "grad_norm": 5.924609788484046, + "learning_rate": 8.163265306122449e-07, + "loss": 0.8125, + "step": 10 + }, + { + "epoch": 0.0013496932515337423, + "grad_norm": 5.83323003277381, + "learning_rate": 8.979591836734694e-07, + "loss": 0.7828, + "step": 11 + }, + { + "epoch": 0.00147239263803681, + "grad_norm": 5.16685304235967, + "learning_rate": 9.795918367346939e-07, + "loss": 0.8306, + "step": 12 + }, + { + "epoch": 0.0015950920245398773, + "grad_norm": 10.623162533404457, + "learning_rate": 1.0612244897959184e-06, + "loss": 1.3015, + "step": 13 + }, + { + "epoch": 0.0017177914110429449, + "grad_norm": 5.774915901481005, + "learning_rate": 1.142857142857143e-06, + "loss": 0.8135, + "step": 14 + }, + { + "epoch": 0.0018404907975460123, + "grad_norm": 4.1437536403244835, + "learning_rate": 1.2244897959183673e-06, + "loss": 0.8297, + "step": 15 + }, + { + "epoch": 0.00196319018404908, + "grad_norm": 3.21624167079745, + "learning_rate": 1.3061224489795921e-06, + "loss": 0.8088, + "step": 16 + }, + { + "epoch": 0.0020858895705521473, + "grad_norm": 2.736606110451633, + "learning_rate": 1.3877551020408165e-06, + "loss": 0.7864, + "step": 17 + }, + { + "epoch": 0.0022085889570552146, + "grad_norm": 2.883880659771355, + "learning_rate": 1.469387755102041e-06, + "loss": 0.7951, + "step": 18 + }, + { + "epoch": 0.002331288343558282, + "grad_norm": 2.006115968542541, + "learning_rate": 1.5510204081632654e-06, + "loss": 0.723, + "step": 19 + }, + { + "epoch": 0.00245398773006135, + "grad_norm": 2.074021446211109, + "learning_rate": 1.6326530612244897e-06, + "loss": 0.7451, + "step": 20 + }, + { + "epoch": 0.0025766871165644172, + "grad_norm": 2.0268832222549333, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.7892, + "step": 21 + }, + { + "epoch": 0.0026993865030674846, + 
"grad_norm": 2.204624379222282, + "learning_rate": 1.7959183673469388e-06, + "loss": 0.7271, + "step": 22 + }, + { + "epoch": 0.002822085889570552, + "grad_norm": 2.0750421020665106, + "learning_rate": 1.8775510204081634e-06, + "loss": 0.6879, + "step": 23 + }, + { + "epoch": 0.00294478527607362, + "grad_norm": 3.3188834249438783, + "learning_rate": 1.9591836734693877e-06, + "loss": 0.7394, + "step": 24 + }, + { + "epoch": 0.003067484662576687, + "grad_norm": 4.992871158215294, + "learning_rate": 2.0408163265306125e-06, + "loss": 1.1554, + "step": 25 + }, + { + "epoch": 0.0031901840490797546, + "grad_norm": 2.4832295492601815, + "learning_rate": 2.122448979591837e-06, + "loss": 0.679, + "step": 26 + }, + { + "epoch": 0.003312883435582822, + "grad_norm": 4.116997286262095, + "learning_rate": 2.2040816326530616e-06, + "loss": 1.0825, + "step": 27 + }, + { + "epoch": 0.0034355828220858898, + "grad_norm": 2.1923343682875194, + "learning_rate": 2.285714285714286e-06, + "loss": 0.7219, + "step": 28 + }, + { + "epoch": 0.003558282208588957, + "grad_norm": 2.531784542022958, + "learning_rate": 2.3673469387755103e-06, + "loss": 0.7423, + "step": 29 + }, + { + "epoch": 0.0036809815950920245, + "grad_norm": 3.3959264509008102, + "learning_rate": 2.4489795918367347e-06, + "loss": 1.0463, + "step": 30 + }, + { + "epoch": 0.003803680981595092, + "grad_norm": 2.6820720640441436, + "learning_rate": 2.530612244897959e-06, + "loss": 0.7186, + "step": 31 + }, + { + "epoch": 0.00392638036809816, + "grad_norm": 2.398899235654956, + "learning_rate": 2.6122448979591842e-06, + "loss": 0.741, + "step": 32 + }, + { + "epoch": 0.004049079754601227, + "grad_norm": 1.914092782536963, + "learning_rate": 2.6938775510204086e-06, + "loss": 0.7345, + "step": 33 + }, + { + "epoch": 0.0041717791411042945, + "grad_norm": 2.812473140153217, + "learning_rate": 2.775510204081633e-06, + "loss": 0.7293, + "step": 34 + }, + { + "epoch": 0.004294478527607362, + "grad_norm": 1.520573808215215, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.6738, + "step": 35 + }, + { + "epoch": 0.004417177914110429, + "grad_norm": 3.281915068640219, + "learning_rate": 2.938775510204082e-06, + "loss": 0.7192, + "step": 36 + }, + { + "epoch": 0.004539877300613497, + "grad_norm": 1.7070990742008378, + "learning_rate": 3.0204081632653064e-06, + "loss": 0.6979, + "step": 37 + }, + { + "epoch": 0.004662576687116564, + "grad_norm": 2.7833323980430262, + "learning_rate": 3.1020408163265307e-06, + "loss": 0.9976, + "step": 38 + }, + { + "epoch": 0.004785276073619632, + "grad_norm": 1.9302719012359224, + "learning_rate": 3.183673469387755e-06, + "loss": 0.6989, + "step": 39 + }, + { + "epoch": 0.0049079754601227, + "grad_norm": 1.8391889440972482, + "learning_rate": 3.2653061224489794e-06, + "loss": 0.7065, + "step": 40 + }, + { + "epoch": 0.005030674846625767, + "grad_norm": 2.070345081810847, + "learning_rate": 3.3469387755102046e-06, + "loss": 0.6529, + "step": 41 + }, + { + "epoch": 0.0051533742331288344, + "grad_norm": 1.8667514690607716, + "learning_rate": 3.428571428571429e-06, + "loss": 0.6913, + "step": 42 + }, + { + "epoch": 0.005276073619631902, + "grad_norm": 1.7895283203461791, + "learning_rate": 3.5102040816326533e-06, + "loss": 0.6785, + "step": 43 + }, + { + "epoch": 0.005398773006134969, + "grad_norm": 2.2182598673945835, + "learning_rate": 3.5918367346938777e-06, + "loss": 0.9666, + "step": 44 + }, + { + "epoch": 0.005521472392638037, + "grad_norm": 2.1792896868220755, + "learning_rate": 3.6734693877551024e-06, + "loss": 0.96, + 
"step": 45 + }, + { + "epoch": 0.005644171779141104, + "grad_norm": 1.5379283405394724, + "learning_rate": 3.7551020408163268e-06, + "loss": 0.6313, + "step": 46 + }, + { + "epoch": 0.005766871165644172, + "grad_norm": 1.395718004597691, + "learning_rate": 3.836734693877551e-06, + "loss": 0.6355, + "step": 47 + }, + { + "epoch": 0.00588957055214724, + "grad_norm": 1.64845698139455, + "learning_rate": 3.9183673469387755e-06, + "loss": 0.6698, + "step": 48 + }, + { + "epoch": 0.0060122699386503066, + "grad_norm": 1.6685532719499405, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6652, + "step": 49 + }, + { + "epoch": 0.006134969325153374, + "grad_norm": 1.489412419210689, + "learning_rate": 4.081632653061225e-06, + "loss": 0.7161, + "step": 50 + }, + { + "epoch": 0.006257668711656441, + "grad_norm": 2.197941749201486, + "learning_rate": 4.163265306122449e-06, + "loss": 0.8767, + "step": 51 + }, + { + "epoch": 0.006380368098159509, + "grad_norm": 1.4321704541234217, + "learning_rate": 4.244897959183674e-06, + "loss": 0.6829, + "step": 52 + }, + { + "epoch": 0.006503067484662577, + "grad_norm": 1.556722506300042, + "learning_rate": 4.326530612244899e-06, + "loss": 0.6805, + "step": 53 + }, + { + "epoch": 0.006625766871165644, + "grad_norm": 1.572871418425109, + "learning_rate": 4.408163265306123e-06, + "loss": 0.6281, + "step": 54 + }, + { + "epoch": 0.006748466257668712, + "grad_norm": 1.3563273732285377, + "learning_rate": 4.489795918367348e-06, + "loss": 0.6648, + "step": 55 + }, + { + "epoch": 0.0068711656441717795, + "grad_norm": 2.12848469814007, + "learning_rate": 4.571428571428572e-06, + "loss": 0.8806, + "step": 56 + }, + { + "epoch": 0.0069938650306748465, + "grad_norm": 1.3583930428952373, + "learning_rate": 4.653061224489796e-06, + "loss": 0.6434, + "step": 57 + }, + { + "epoch": 0.007116564417177914, + "grad_norm": 1.5694560739603567, + "learning_rate": 4.734693877551021e-06, + "loss": 0.679, + "step": 58 + }, + { + "epoch": 0.007239263803680981, + "grad_norm": 1.7290758172436085, + "learning_rate": 4.816326530612245e-06, + "loss": 0.8741, + "step": 59 + }, + { + "epoch": 0.007361963190184049, + "grad_norm": 1.5631937577126824, + "learning_rate": 4.897959183673469e-06, + "loss": 0.6621, + "step": 60 + }, + { + "epoch": 0.007484662576687117, + "grad_norm": 1.4768787168186843, + "learning_rate": 4.979591836734694e-06, + "loss": 0.6387, + "step": 61 + }, + { + "epoch": 0.007607361963190184, + "grad_norm": 1.7771600831404846, + "learning_rate": 5.061224489795918e-06, + "loss": 0.6598, + "step": 62 + }, + { + "epoch": 0.007730061349693252, + "grad_norm": 1.5087816322923382, + "learning_rate": 5.142857142857142e-06, + "loss": 0.6616, + "step": 63 + }, + { + "epoch": 0.00785276073619632, + "grad_norm": 1.3928386650227516, + "learning_rate": 5.2244897959183684e-06, + "loss": 0.6499, + "step": 64 + }, + { + "epoch": 0.007975460122699387, + "grad_norm": 1.3237324982898058, + "learning_rate": 5.306122448979593e-06, + "loss": 0.6459, + "step": 65 + }, + { + "epoch": 0.008098159509202453, + "grad_norm": 4.007871343516544, + "learning_rate": 5.387755102040817e-06, + "loss": 0.6816, + "step": 66 + }, + { + "epoch": 0.008220858895705521, + "grad_norm": 2.1418384348024495, + "learning_rate": 5.4693877551020415e-06, + "loss": 0.9139, + "step": 67 + }, + { + "epoch": 0.008343558282208589, + "grad_norm": 1.2459718877377637, + "learning_rate": 5.551020408163266e-06, + "loss": 0.6821, + "step": 68 + }, + { + "epoch": 0.008466257668711657, + "grad_norm": 2.1089340325444157, + "learning_rate": 
5.63265306122449e-06, + "loss": 0.6621, + "step": 69 + }, + { + "epoch": 0.008588957055214725, + "grad_norm": 1.5944832889856118, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.6517, + "step": 70 + }, + { + "epoch": 0.00871165644171779, + "grad_norm": 1.4064731884262425, + "learning_rate": 5.795918367346939e-06, + "loss": 0.6361, + "step": 71 + }, + { + "epoch": 0.008834355828220859, + "grad_norm": 1.5097954672392377, + "learning_rate": 5.877551020408164e-06, + "loss": 0.6437, + "step": 72 + }, + { + "epoch": 0.008957055214723926, + "grad_norm": 1.4721727276455243, + "learning_rate": 5.959183673469388e-06, + "loss": 0.6377, + "step": 73 + }, + { + "epoch": 0.009079754601226994, + "grad_norm": 1.4346550994475913, + "learning_rate": 6.040816326530613e-06, + "loss": 0.663, + "step": 74 + }, + { + "epoch": 0.009202453987730062, + "grad_norm": 3.5819831406672384, + "learning_rate": 6.122448979591837e-06, + "loss": 0.926, + "step": 75 + }, + { + "epoch": 0.009325153374233128, + "grad_norm": 1.718904125534879, + "learning_rate": 6.2040816326530614e-06, + "loss": 0.6288, + "step": 76 + }, + { + "epoch": 0.009447852760736196, + "grad_norm": 1.3821322462585206, + "learning_rate": 6.285714285714286e-06, + "loss": 0.627, + "step": 77 + }, + { + "epoch": 0.009570552147239264, + "grad_norm": 1.4703732514547576, + "learning_rate": 6.36734693877551e-06, + "loss": 0.6404, + "step": 78 + }, + { + "epoch": 0.009693251533742332, + "grad_norm": 1.3944676218790335, + "learning_rate": 6.4489795918367345e-06, + "loss": 0.6334, + "step": 79 + }, + { + "epoch": 0.0098159509202454, + "grad_norm": 1.533786231671802, + "learning_rate": 6.530612244897959e-06, + "loss": 0.605, + "step": 80 + }, + { + "epoch": 0.009938650306748465, + "grad_norm": 1.4534305092091941, + "learning_rate": 6.612244897959185e-06, + "loss": 0.6739, + "step": 81 + }, + { + "epoch": 0.010061349693251533, + "grad_norm": 1.5761349576516228, + "learning_rate": 6.693877551020409e-06, + "loss": 0.7028, + "step": 82 + }, + { + "epoch": 0.010184049079754601, + "grad_norm": 1.308152110183147, + "learning_rate": 6.7755102040816336e-06, + "loss": 0.6449, + "step": 83 + }, + { + "epoch": 0.010306748466257669, + "grad_norm": 1.4382682202309465, + "learning_rate": 6.857142857142858e-06, + "loss": 0.657, + "step": 84 + }, + { + "epoch": 0.010429447852760737, + "grad_norm": 1.7045638588633918, + "learning_rate": 6.938775510204082e-06, + "loss": 0.6413, + "step": 85 + }, + { + "epoch": 0.010552147239263805, + "grad_norm": 4.812237750210511, + "learning_rate": 7.020408163265307e-06, + "loss": 0.924, + "step": 86 + }, + { + "epoch": 0.01067484662576687, + "grad_norm": 1.2954837909896975, + "learning_rate": 7.102040816326531e-06, + "loss": 0.6472, + "step": 87 + }, + { + "epoch": 0.010797546012269938, + "grad_norm": 1.6295579604341712, + "learning_rate": 7.183673469387755e-06, + "loss": 0.5921, + "step": 88 + }, + { + "epoch": 0.010920245398773006, + "grad_norm": 1.326086257457675, + "learning_rate": 7.2653061224489805e-06, + "loss": 0.6801, + "step": 89 + }, + { + "epoch": 0.011042944785276074, + "grad_norm": 1.8611352611773524, + "learning_rate": 7.346938775510205e-06, + "loss": 0.642, + "step": 90 + }, + { + "epoch": 0.011165644171779142, + "grad_norm": 4.4003437938530645, + "learning_rate": 7.428571428571429e-06, + "loss": 0.9129, + "step": 91 + }, + { + "epoch": 0.011288343558282208, + "grad_norm": 1.2759868796659357, + "learning_rate": 7.5102040816326536e-06, + "loss": 0.6086, + "step": 92 + }, + { + "epoch": 0.011411042944785276, + "grad_norm": 
1.5024341471814642, + "learning_rate": 7.591836734693878e-06, + "loss": 0.6727, + "step": 93 + }, + { + "epoch": 0.011533742331288344, + "grad_norm": 1.3803484124503267, + "learning_rate": 7.673469387755102e-06, + "loss": 0.6657, + "step": 94 + }, + { + "epoch": 0.011656441717791411, + "grad_norm": 1.4816517876067286, + "learning_rate": 7.755102040816327e-06, + "loss": 0.6485, + "step": 95 + }, + { + "epoch": 0.01177914110429448, + "grad_norm": 1.336383365104349, + "learning_rate": 7.836734693877551e-06, + "loss": 0.6113, + "step": 96 + }, + { + "epoch": 0.011901840490797545, + "grad_norm": 2.8106935490719116, + "learning_rate": 7.918367346938776e-06, + "loss": 0.8615, + "step": 97 + }, + { + "epoch": 0.012024539877300613, + "grad_norm": 1.321271004651003, + "learning_rate": 8.000000000000001e-06, + "loss": 0.6547, + "step": 98 + }, + { + "epoch": 0.012147239263803681, + "grad_norm": 1.343752580881755, + "learning_rate": 8.081632653061225e-06, + "loss": 0.6091, + "step": 99 + }, + { + "epoch": 0.012269938650306749, + "grad_norm": 1.521944309460903, + "learning_rate": 8.16326530612245e-06, + "loss": 0.6409, + "step": 100 + }, + { + "epoch": 0.012392638036809817, + "grad_norm": 1.4214962448549209, + "learning_rate": 8.244897959183674e-06, + "loss": 0.6645, + "step": 101 + }, + { + "epoch": 0.012515337423312883, + "grad_norm": 1.5956541015298145, + "learning_rate": 8.326530612244899e-06, + "loss": 0.708, + "step": 102 + }, + { + "epoch": 0.01263803680981595, + "grad_norm": 2.3225030500704875, + "learning_rate": 8.408163265306122e-06, + "loss": 0.8803, + "step": 103 + }, + { + "epoch": 0.012760736196319018, + "grad_norm": 1.442563551684642, + "learning_rate": 8.489795918367347e-06, + "loss": 0.5846, + "step": 104 + }, + { + "epoch": 0.012883435582822086, + "grad_norm": 1.3470044998761659, + "learning_rate": 8.571428571428571e-06, + "loss": 0.6276, + "step": 105 + }, + { + "epoch": 0.013006134969325154, + "grad_norm": 1.5878606484798579, + "learning_rate": 8.653061224489798e-06, + "loss": 0.6664, + "step": 106 + }, + { + "epoch": 0.01312883435582822, + "grad_norm": 1.3787019094192, + "learning_rate": 8.734693877551021e-06, + "loss": 0.6612, + "step": 107 + }, + { + "epoch": 0.013251533742331288, + "grad_norm": 3.6222512946282754, + "learning_rate": 8.816326530612247e-06, + "loss": 0.9101, + "step": 108 + }, + { + "epoch": 0.013374233128834356, + "grad_norm": 1.745171983165382, + "learning_rate": 8.89795918367347e-06, + "loss": 0.8261, + "step": 109 + }, + { + "epoch": 0.013496932515337423, + "grad_norm": 1.4706093202151633, + "learning_rate": 8.979591836734695e-06, + "loss": 0.6962, + "step": 110 + }, + { + "epoch": 0.013619631901840491, + "grad_norm": 1.42833884838061, + "learning_rate": 9.061224489795919e-06, + "loss": 0.6793, + "step": 111 + }, + { + "epoch": 0.013742331288343559, + "grad_norm": 1.3429185507604358, + "learning_rate": 9.142857142857144e-06, + "loss": 0.6015, + "step": 112 + }, + { + "epoch": 0.013865030674846625, + "grad_norm": 1.666035574261893, + "learning_rate": 9.224489795918367e-06, + "loss": 0.6034, + "step": 113 + }, + { + "epoch": 0.013987730061349693, + "grad_norm": 1.4402780771919435, + "learning_rate": 9.306122448979593e-06, + "loss": 0.6812, + "step": 114 + }, + { + "epoch": 0.01411042944785276, + "grad_norm": 1.3498749606749698, + "learning_rate": 9.387755102040818e-06, + "loss": 0.6396, + "step": 115 + }, + { + "epoch": 0.014233128834355829, + "grad_norm": 1.1863599063365857, + "learning_rate": 9.469387755102041e-06, + "loss": 0.6037, + "step": 116 + }, + { + 
"epoch": 0.014355828220858896, + "grad_norm": 1.2921132582721828, + "learning_rate": 9.551020408163266e-06, + "loss": 0.612, + "step": 117 + }, + { + "epoch": 0.014478527607361963, + "grad_norm": 1.401389895693612, + "learning_rate": 9.63265306122449e-06, + "loss": 0.5599, + "step": 118 + }, + { + "epoch": 0.01460122699386503, + "grad_norm": 1.2494940421846765, + "learning_rate": 9.714285714285715e-06, + "loss": 0.6361, + "step": 119 + }, + { + "epoch": 0.014723926380368098, + "grad_norm": 1.7052707503589923, + "learning_rate": 9.795918367346939e-06, + "loss": 0.6384, + "step": 120 + }, + { + "epoch": 0.014846625766871166, + "grad_norm": 1.517286823248137, + "learning_rate": 9.877551020408164e-06, + "loss": 0.6135, + "step": 121 + }, + { + "epoch": 0.014969325153374234, + "grad_norm": 1.3241784125539682, + "learning_rate": 9.959183673469387e-06, + "loss": 0.6527, + "step": 122 + }, + { + "epoch": 0.0150920245398773, + "grad_norm": 1.957682527515417, + "learning_rate": 1.0040816326530614e-05, + "loss": 0.6411, + "step": 123 + }, + { + "epoch": 0.015214723926380368, + "grad_norm": 1.281118154332832, + "learning_rate": 1.0122448979591836e-05, + "loss": 0.6868, + "step": 124 + }, + { + "epoch": 0.015337423312883436, + "grad_norm": 1.5141880977684326, + "learning_rate": 1.0204081632653063e-05, + "loss": 0.6535, + "step": 125 + }, + { + "epoch": 0.015460122699386503, + "grad_norm": 1.464975267553152, + "learning_rate": 1.0285714285714285e-05, + "loss": 0.5731, + "step": 126 + }, + { + "epoch": 0.015582822085889571, + "grad_norm": 1.4412477265363501, + "learning_rate": 1.0367346938775512e-05, + "loss": 0.6245, + "step": 127 + }, + { + "epoch": 0.01570552147239264, + "grad_norm": 1.7676284412587377, + "learning_rate": 1.0448979591836737e-05, + "loss": 0.6527, + "step": 128 + }, + { + "epoch": 0.015828220858895705, + "grad_norm": 1.3927912071770023, + "learning_rate": 1.053061224489796e-05, + "loss": 0.6093, + "step": 129 + }, + { + "epoch": 0.015950920245398775, + "grad_norm": 6.303947100055873, + "learning_rate": 1.0612244897959186e-05, + "loss": 0.9843, + "step": 130 + }, + { + "epoch": 0.01607361963190184, + "grad_norm": 1.181267506560879, + "learning_rate": 1.0693877551020409e-05, + "loss": 0.5896, + "step": 131 + }, + { + "epoch": 0.016196319018404907, + "grad_norm": 1.4682172527552286, + "learning_rate": 1.0775510204081634e-05, + "loss": 0.599, + "step": 132 + }, + { + "epoch": 0.016319018404907976, + "grad_norm": 1.3809356055040654, + "learning_rate": 1.0857142857142858e-05, + "loss": 0.6291, + "step": 133 + }, + { + "epoch": 0.016441717791411042, + "grad_norm": 1.4453870558438366, + "learning_rate": 1.0938775510204083e-05, + "loss": 0.6523, + "step": 134 + }, + { + "epoch": 0.016564417177914112, + "grad_norm": 1.4719820192087334, + "learning_rate": 1.1020408163265306e-05, + "loss": 0.659, + "step": 135 + }, + { + "epoch": 0.016687116564417178, + "grad_norm": 1.4475219204160377, + "learning_rate": 1.1102040816326532e-05, + "loss": 0.6195, + "step": 136 + }, + { + "epoch": 0.016809815950920244, + "grad_norm": 1.5471662104632906, + "learning_rate": 1.1183673469387757e-05, + "loss": 0.6702, + "step": 137 + }, + { + "epoch": 0.016932515337423314, + "grad_norm": 3.8139389790226352, + "learning_rate": 1.126530612244898e-05, + "loss": 0.9219, + "step": 138 + }, + { + "epoch": 0.01705521472392638, + "grad_norm": 1.4530346546542248, + "learning_rate": 1.1346938775510206e-05, + "loss": 0.6396, + "step": 139 + }, + { + "epoch": 0.01717791411042945, + "grad_norm": 1.396452888661188, + "learning_rate": 
1.1428571428571429e-05, + "loss": 0.6124, + "step": 140 + }, + { + "epoch": 0.017300613496932515, + "grad_norm": 2.568116727718259, + "learning_rate": 1.1510204081632654e-05, + "loss": 0.9117, + "step": 141 + }, + { + "epoch": 0.01742331288343558, + "grad_norm": 2.204263167502245, + "learning_rate": 1.1591836734693878e-05, + "loss": 0.8372, + "step": 142 + }, + { + "epoch": 0.01754601226993865, + "grad_norm": 2.1092982810789294, + "learning_rate": 1.1673469387755103e-05, + "loss": 0.6131, + "step": 143 + }, + { + "epoch": 0.017668711656441717, + "grad_norm": 1.5495612959635343, + "learning_rate": 1.1755102040816328e-05, + "loss": 0.6547, + "step": 144 + }, + { + "epoch": 0.017791411042944787, + "grad_norm": 1.505048978898222, + "learning_rate": 1.1836734693877552e-05, + "loss": 0.5818, + "step": 145 + }, + { + "epoch": 0.017914110429447853, + "grad_norm": 2.6283333536780837, + "learning_rate": 1.1918367346938777e-05, + "loss": 0.8645, + "step": 146 + }, + { + "epoch": 0.01803680981595092, + "grad_norm": 2.0548813048090935, + "learning_rate": 1.2e-05, + "loss": 0.8307, + "step": 147 + }, + { + "epoch": 0.01815950920245399, + "grad_norm": 1.5427337557071954, + "learning_rate": 1.2081632653061225e-05, + "loss": 0.6484, + "step": 148 + }, + { + "epoch": 0.018282208588957054, + "grad_norm": 1.5244795275343168, + "learning_rate": 1.2163265306122449e-05, + "loss": 0.6435, + "step": 149 + }, + { + "epoch": 0.018404907975460124, + "grad_norm": 1.7278676441780128, + "learning_rate": 1.2244897959183674e-05, + "loss": 0.6586, + "step": 150 + }, + { + "epoch": 0.01852760736196319, + "grad_norm": 1.4080107593349593, + "learning_rate": 1.2326530612244898e-05, + "loss": 0.6347, + "step": 151 + }, + { + "epoch": 0.018650306748466256, + "grad_norm": 1.4130873894938603, + "learning_rate": 1.2408163265306123e-05, + "loss": 0.6503, + "step": 152 + }, + { + "epoch": 0.018773006134969326, + "grad_norm": 3.647605106146775, + "learning_rate": 1.248979591836735e-05, + "loss": 0.8897, + "step": 153 + }, + { + "epoch": 0.018895705521472392, + "grad_norm": 1.6361629938893059, + "learning_rate": 1.2571428571428572e-05, + "loss": 0.6429, + "step": 154 + }, + { + "epoch": 0.01901840490797546, + "grad_norm": 1.7017110006881724, + "learning_rate": 1.2653061224489798e-05, + "loss": 0.6128, + "step": 155 + }, + { + "epoch": 0.019141104294478527, + "grad_norm": 1.7355731968689359, + "learning_rate": 1.273469387755102e-05, + "loss": 0.8569, + "step": 156 + }, + { + "epoch": 0.019263803680981594, + "grad_norm": 1.6503832331965018, + "learning_rate": 1.2816326530612247e-05, + "loss": 0.6469, + "step": 157 + }, + { + "epoch": 0.019386503067484663, + "grad_norm": 1.4367755896286278, + "learning_rate": 1.2897959183673469e-05, + "loss": 0.6059, + "step": 158 + }, + { + "epoch": 0.01950920245398773, + "grad_norm": 1.704478513406631, + "learning_rate": 1.2979591836734696e-05, + "loss": 0.6857, + "step": 159 + }, + { + "epoch": 0.0196319018404908, + "grad_norm": 1.4601974205904196, + "learning_rate": 1.3061224489795918e-05, + "loss": 0.6353, + "step": 160 + }, + { + "epoch": 0.019754601226993865, + "grad_norm": 1.9819466136415722, + "learning_rate": 1.3142857142857145e-05, + "loss": 0.6441, + "step": 161 + }, + { + "epoch": 0.01987730061349693, + "grad_norm": 1.3516186870498679, + "learning_rate": 1.322448979591837e-05, + "loss": 0.6498, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 1.3107526106267928, + "learning_rate": 1.3306122448979593e-05, + "loss": 0.6276, + "step": 163 + }, + { + "epoch": 0.020122699386503066, + 
"grad_norm": 1.280699194751243, + "learning_rate": 1.3387755102040818e-05, + "loss": 0.6955, + "step": 164 + }, + { + "epoch": 0.020245398773006136, + "grad_norm": 1.642409489525067, + "learning_rate": 1.3469387755102042e-05, + "loss": 0.6225, + "step": 165 + }, + { + "epoch": 0.020368098159509202, + "grad_norm": 1.6954682487653476, + "learning_rate": 1.3551020408163267e-05, + "loss": 0.623, + "step": 166 + }, + { + "epoch": 0.020490797546012268, + "grad_norm": 1.398989790761581, + "learning_rate": 1.363265306122449e-05, + "loss": 0.6612, + "step": 167 + }, + { + "epoch": 0.020613496932515338, + "grad_norm": 1.5319200869336889, + "learning_rate": 1.3714285714285716e-05, + "loss": 0.6388, + "step": 168 + }, + { + "epoch": 0.020736196319018404, + "grad_norm": 1.4910689279949894, + "learning_rate": 1.3795918367346941e-05, + "loss": 0.6589, + "step": 169 + }, + { + "epoch": 0.020858895705521473, + "grad_norm": 1.218502034435007, + "learning_rate": 1.3877551020408165e-05, + "loss": 0.654, + "step": 170 + }, + { + "epoch": 0.02098159509202454, + "grad_norm": 1.4973504433725373, + "learning_rate": 1.395918367346939e-05, + "loss": 0.6477, + "step": 171 + }, + { + "epoch": 0.02110429447852761, + "grad_norm": 1.4593246216575244, + "learning_rate": 1.4040816326530613e-05, + "loss": 0.6748, + "step": 172 + }, + { + "epoch": 0.021226993865030675, + "grad_norm": 2.2884277112315794, + "learning_rate": 1.4122448979591838e-05, + "loss": 0.8944, + "step": 173 + }, + { + "epoch": 0.02134969325153374, + "grad_norm": 1.3036141883570258, + "learning_rate": 1.4204081632653062e-05, + "loss": 0.6469, + "step": 174 + }, + { + "epoch": 0.02147239263803681, + "grad_norm": 1.3286418741426709, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.6553, + "step": 175 + }, + { + "epoch": 0.021595092024539877, + "grad_norm": 1.245466876420374, + "learning_rate": 1.436734693877551e-05, + "loss": 0.6029, + "step": 176 + }, + { + "epoch": 0.021717791411042946, + "grad_norm": 1.3272045093307772, + "learning_rate": 1.4448979591836736e-05, + "loss": 0.5775, + "step": 177 + }, + { + "epoch": 0.021840490797546012, + "grad_norm": 2.9489506127898086, + "learning_rate": 1.4530612244897961e-05, + "loss": 0.5853, + "step": 178 + }, + { + "epoch": 0.02196319018404908, + "grad_norm": 1.7184920825042445, + "learning_rate": 1.4612244897959185e-05, + "loss": 0.866, + "step": 179 + }, + { + "epoch": 0.022085889570552148, + "grad_norm": 1.239805535838934, + "learning_rate": 1.469387755102041e-05, + "loss": 0.6541, + "step": 180 + }, + { + "epoch": 0.022208588957055214, + "grad_norm": 1.4113577356359175, + "learning_rate": 1.4775510204081633e-05, + "loss": 0.6493, + "step": 181 + }, + { + "epoch": 0.022331288343558284, + "grad_norm": 1.5159818241328895, + "learning_rate": 1.4857142857142858e-05, + "loss": 0.8091, + "step": 182 + }, + { + "epoch": 0.02245398773006135, + "grad_norm": 1.770433123334387, + "learning_rate": 1.4938775510204082e-05, + "loss": 0.8515, + "step": 183 + }, + { + "epoch": 0.022576687116564416, + "grad_norm": 1.4038628238556905, + "learning_rate": 1.5020408163265307e-05, + "loss": 0.6159, + "step": 184 + }, + { + "epoch": 0.022699386503067485, + "grad_norm": 1.4448646740790585, + "learning_rate": 1.510204081632653e-05, + "loss": 0.8175, + "step": 185 + }, + { + "epoch": 0.02282208588957055, + "grad_norm": 1.4834843187836027, + "learning_rate": 1.5183673469387756e-05, + "loss": 0.8212, + "step": 186 + }, + { + "epoch": 0.02294478527607362, + "grad_norm": 1.3864497288329045, + "learning_rate": 1.526530612244898e-05, + 
"loss": 0.6707, + "step": 187 + }, + { + "epoch": 0.023067484662576687, + "grad_norm": 1.5661990347066268, + "learning_rate": 1.5346938775510204e-05, + "loss": 0.6587, + "step": 188 + }, + { + "epoch": 0.023190184049079753, + "grad_norm": 1.712981343064409, + "learning_rate": 1.542857142857143e-05, + "loss": 0.8091, + "step": 189 + }, + { + "epoch": 0.023312883435582823, + "grad_norm": 1.2813254309534483, + "learning_rate": 1.5510204081632655e-05, + "loss": 0.6163, + "step": 190 + }, + { + "epoch": 0.02343558282208589, + "grad_norm": 1.391034795435677, + "learning_rate": 1.559183673469388e-05, + "loss": 0.5863, + "step": 191 + }, + { + "epoch": 0.02355828220858896, + "grad_norm": 1.4091370134492118, + "learning_rate": 1.5673469387755102e-05, + "loss": 0.6922, + "step": 192 + }, + { + "epoch": 0.023680981595092025, + "grad_norm": 1.2053266863317205, + "learning_rate": 1.575510204081633e-05, + "loss": 0.6616, + "step": 193 + }, + { + "epoch": 0.02380368098159509, + "grad_norm": 1.740193943419144, + "learning_rate": 1.5836734693877552e-05, + "loss": 0.8366, + "step": 194 + }, + { + "epoch": 0.02392638036809816, + "grad_norm": 1.4114396074340245, + "learning_rate": 1.5918367346938776e-05, + "loss": 0.811, + "step": 195 + }, + { + "epoch": 0.024049079754601226, + "grad_norm": 1.6339810754901538, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.6871, + "step": 196 + }, + { + "epoch": 0.024171779141104296, + "grad_norm": 1.709007657128296, + "learning_rate": 1.6081632653061226e-05, + "loss": 0.6277, + "step": 197 + }, + { + "epoch": 0.024294478527607362, + "grad_norm": 1.4663948390611898, + "learning_rate": 1.616326530612245e-05, + "loss": 0.6702, + "step": 198 + }, + { + "epoch": 0.024417177914110428, + "grad_norm": 1.3786495672167534, + "learning_rate": 1.6244897959183673e-05, + "loss": 0.6815, + "step": 199 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 1.4574408125448683, + "learning_rate": 1.63265306122449e-05, + "loss": 0.6376, + "step": 200 + }, + { + "epoch": 0.024662576687116564, + "grad_norm": 2.0922932561657417, + "learning_rate": 1.6408163265306124e-05, + "loss": 0.8337, + "step": 201 + }, + { + "epoch": 0.024785276073619633, + "grad_norm": 1.530660555888044, + "learning_rate": 1.6489795918367347e-05, + "loss": 0.603, + "step": 202 + }, + { + "epoch": 0.0249079754601227, + "grad_norm": 1.4449443600206506, + "learning_rate": 1.6571428571428574e-05, + "loss": 0.6645, + "step": 203 + }, + { + "epoch": 0.025030674846625765, + "grad_norm": 1.2254088570910127, + "learning_rate": 1.6653061224489797e-05, + "loss": 0.605, + "step": 204 + }, + { + "epoch": 0.025153374233128835, + "grad_norm": 1.4986860988325637, + "learning_rate": 1.673469387755102e-05, + "loss": 0.6492, + "step": 205 + }, + { + "epoch": 0.0252760736196319, + "grad_norm": 1.448536402062234, + "learning_rate": 1.6816326530612244e-05, + "loss": 0.6804, + "step": 206 + }, + { + "epoch": 0.02539877300613497, + "grad_norm": 1.9893702098961508, + "learning_rate": 1.689795918367347e-05, + "loss": 0.8417, + "step": 207 + }, + { + "epoch": 0.025521472392638037, + "grad_norm": 1.4143547850515343, + "learning_rate": 1.6979591836734695e-05, + "loss": 0.6141, + "step": 208 + }, + { + "epoch": 0.025644171779141103, + "grad_norm": 1.430497413941066, + "learning_rate": 1.7061224489795922e-05, + "loss": 0.665, + "step": 209 + }, + { + "epoch": 0.025766871165644172, + "grad_norm": 1.6171572904636204, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.873, + "step": 210 + }, + { + "epoch": 0.02588957055214724, + "grad_norm": 
1.3273046054607374, + "learning_rate": 1.722448979591837e-05, + "loss": 0.7516, + "step": 211 + }, + { + "epoch": 0.026012269938650308, + "grad_norm": 1.570291255017974, + "learning_rate": 1.7306122448979596e-05, + "loss": 0.6797, + "step": 212 + }, + { + "epoch": 0.026134969325153374, + "grad_norm": 1.419640519429817, + "learning_rate": 1.738775510204082e-05, + "loss": 0.6625, + "step": 213 + }, + { + "epoch": 0.02625766871165644, + "grad_norm": 1.2585093812388664, + "learning_rate": 1.7469387755102043e-05, + "loss": 0.6154, + "step": 214 + }, + { + "epoch": 0.02638036809815951, + "grad_norm": 4.61988502489677, + "learning_rate": 1.7551020408163266e-05, + "loss": 0.7076, + "step": 215 + }, + { + "epoch": 0.026503067484662576, + "grad_norm": 1.3907121746860462, + "learning_rate": 1.7632653061224493e-05, + "loss": 0.6525, + "step": 216 + }, + { + "epoch": 0.026625766871165645, + "grad_norm": 1.4806410731019528, + "learning_rate": 1.7714285714285717e-05, + "loss": 0.6771, + "step": 217 + }, + { + "epoch": 0.02674846625766871, + "grad_norm": 1.3076957297953051, + "learning_rate": 1.779591836734694e-05, + "loss": 0.6771, + "step": 218 + }, + { + "epoch": 0.02687116564417178, + "grad_norm": 2.837229018945487, + "learning_rate": 1.7877551020408164e-05, + "loss": 0.6891, + "step": 219 + }, + { + "epoch": 0.026993865030674847, + "grad_norm": 1.3238059855431614, + "learning_rate": 1.795918367346939e-05, + "loss": 0.6488, + "step": 220 + }, + { + "epoch": 0.027116564417177913, + "grad_norm": 1.1602166615056917, + "learning_rate": 1.8040816326530614e-05, + "loss": 0.5803, + "step": 221 + }, + { + "epoch": 0.027239263803680983, + "grad_norm": 3.4054890986047788, + "learning_rate": 1.8122448979591837e-05, + "loss": 0.8617, + "step": 222 + }, + { + "epoch": 0.02736196319018405, + "grad_norm": 1.196738948956846, + "learning_rate": 1.8204081632653064e-05, + "loss": 0.6411, + "step": 223 + }, + { + "epoch": 0.027484662576687118, + "grad_norm": 1.4776170800901354, + "learning_rate": 1.8285714285714288e-05, + "loss": 0.6558, + "step": 224 + }, + { + "epoch": 0.027607361963190184, + "grad_norm": 2.1357813524229456, + "learning_rate": 1.836734693877551e-05, + "loss": 0.6805, + "step": 225 + }, + { + "epoch": 0.02773006134969325, + "grad_norm": 1.3816958746988512, + "learning_rate": 1.8448979591836735e-05, + "loss": 0.6801, + "step": 226 + }, + { + "epoch": 0.02785276073619632, + "grad_norm": 2.412672723474732, + "learning_rate": 1.853061224489796e-05, + "loss": 0.8602, + "step": 227 + }, + { + "epoch": 0.027975460122699386, + "grad_norm": 1.905545387477857, + "learning_rate": 1.8612244897959185e-05, + "loss": 0.8006, + "step": 228 + }, + { + "epoch": 0.028098159509202456, + "grad_norm": 1.7007644337806336, + "learning_rate": 1.869387755102041e-05, + "loss": 0.6362, + "step": 229 + }, + { + "epoch": 0.02822085889570552, + "grad_norm": 1.3916999161261947, + "learning_rate": 1.8775510204081636e-05, + "loss": 0.6398, + "step": 230 + }, + { + "epoch": 0.028343558282208588, + "grad_norm": 1.299954470268507, + "learning_rate": 1.885714285714286e-05, + "loss": 0.6625, + "step": 231 + }, + { + "epoch": 0.028466257668711657, + "grad_norm": 1.364273630197294, + "learning_rate": 1.8938775510204083e-05, + "loss": 0.6333, + "step": 232 + }, + { + "epoch": 0.028588957055214723, + "grad_norm": 2.070976840386248, + "learning_rate": 1.9020408163265306e-05, + "loss": 0.8774, + "step": 233 + }, + { + "epoch": 0.028711656441717793, + "grad_norm": 1.5153307256252029, + "learning_rate": 1.9102040816326533e-05, + "loss": 0.6517, + 
"step": 234 + }, + { + "epoch": 0.02883435582822086, + "grad_norm": 1.3908992972039818, + "learning_rate": 1.9183673469387756e-05, + "loss": 0.6209, + "step": 235 + }, + { + "epoch": 0.028957055214723925, + "grad_norm": 1.4544103136663047, + "learning_rate": 1.926530612244898e-05, + "loss": 0.6698, + "step": 236 + }, + { + "epoch": 0.029079754601226995, + "grad_norm": 1.5819104224078135, + "learning_rate": 1.9346938775510207e-05, + "loss": 0.8438, + "step": 237 + }, + { + "epoch": 0.02920245398773006, + "grad_norm": 1.6085620313716213, + "learning_rate": 1.942857142857143e-05, + "loss": 0.7738, + "step": 238 + }, + { + "epoch": 0.02932515337423313, + "grad_norm": 1.5451340164441265, + "learning_rate": 1.9510204081632654e-05, + "loss": 0.6327, + "step": 239 + }, + { + "epoch": 0.029447852760736196, + "grad_norm": 1.3512036452877838, + "learning_rate": 1.9591836734693877e-05, + "loss": 0.6599, + "step": 240 + }, + { + "epoch": 0.029570552147239262, + "grad_norm": 1.2509033057151986, + "learning_rate": 1.9673469387755104e-05, + "loss": 0.6127, + "step": 241 + }, + { + "epoch": 0.029693251533742332, + "grad_norm": 1.284086093045228, + "learning_rate": 1.9755102040816328e-05, + "loss": 0.6566, + "step": 242 + }, + { + "epoch": 0.029815950920245398, + "grad_norm": 1.8269807880614994, + "learning_rate": 1.983673469387755e-05, + "loss": 0.793, + "step": 243 + }, + { + "epoch": 0.029938650306748468, + "grad_norm": 1.4414507706669284, + "learning_rate": 1.9918367346938775e-05, + "loss": 0.8151, + "step": 244 + }, + { + "epoch": 0.030061349693251534, + "grad_norm": 1.4682290475070472, + "learning_rate": 2e-05, + "loss": 0.712, + "step": 245 + }, + { + "epoch": 0.0301840490797546, + "grad_norm": 1.275865453420821, + "learning_rate": 1.9999999210292987e-05, + "loss": 0.6675, + "step": 246 + }, + { + "epoch": 0.03030674846625767, + "grad_norm": 1.2428380128517684, + "learning_rate": 1.999999684117207e-05, + "loss": 0.655, + "step": 247 + }, + { + "epoch": 0.030429447852760735, + "grad_norm": 1.3604611031213611, + "learning_rate": 1.9999992892637624e-05, + "loss": 0.6818, + "step": 248 + }, + { + "epoch": 0.030552147239263805, + "grad_norm": 1.67467604142476, + "learning_rate": 1.999998736469027e-05, + "loss": 0.7227, + "step": 249 + }, + { + "epoch": 0.03067484662576687, + "grad_norm": 1.2139307255596175, + "learning_rate": 1.9999980257330883e-05, + "loss": 0.6665, + "step": 250 + }, + { + "epoch": 0.030797546012269937, + "grad_norm": 1.218886817719367, + "learning_rate": 1.9999971570560586e-05, + "loss": 0.61, + "step": 251 + }, + { + "epoch": 0.030920245398773007, + "grad_norm": 1.7168800777851771, + "learning_rate": 1.999996130438075e-05, + "loss": 0.6211, + "step": 252 + }, + { + "epoch": 0.031042944785276073, + "grad_norm": 2.3998162426355476, + "learning_rate": 1.9999949458793e-05, + "loss": 0.8798, + "step": 253 + }, + { + "epoch": 0.031165644171779142, + "grad_norm": 1.3890005348982215, + "learning_rate": 1.99999360337992e-05, + "loss": 0.6408, + "step": 254 + }, + { + "epoch": 0.03128834355828221, + "grad_norm": 1.4233136669859674, + "learning_rate": 1.9999921029401478e-05, + "loss": 0.672, + "step": 255 + }, + { + "epoch": 0.03141104294478528, + "grad_norm": 1.381992716836869, + "learning_rate": 1.99999044456022e-05, + "loss": 0.6468, + "step": 256 + }, + { + "epoch": 0.03153374233128834, + "grad_norm": 1.3927919242857272, + "learning_rate": 1.9999886282403983e-05, + "loss": 0.6653, + "step": 257 + }, + { + "epoch": 0.03165644171779141, + "grad_norm": 1.3797720057303617, + "learning_rate": 
1.9999866539809698e-05, + "loss": 0.8035, + "step": 258 + }, + { + "epoch": 0.03177914110429448, + "grad_norm": 1.4032908244491782, + "learning_rate": 1.9999845217822465e-05, + "loss": 0.5923, + "step": 259 + }, + { + "epoch": 0.03190184049079755, + "grad_norm": 1.2140180806480096, + "learning_rate": 1.9999822316445652e-05, + "loss": 0.6461, + "step": 260 + }, + { + "epoch": 0.03202453987730061, + "grad_norm": 1.3136112454801085, + "learning_rate": 1.999979783568287e-05, + "loss": 0.8251, + "step": 261 + }, + { + "epoch": 0.03214723926380368, + "grad_norm": 1.1620520659336846, + "learning_rate": 1.999977177553799e-05, + "loss": 0.6557, + "step": 262 + }, + { + "epoch": 0.03226993865030675, + "grad_norm": 1.1969487746165328, + "learning_rate": 1.9999744136015132e-05, + "loss": 0.664, + "step": 263 + }, + { + "epoch": 0.032392638036809814, + "grad_norm": 1.3554486826602308, + "learning_rate": 1.9999714917118655e-05, + "loss": 0.6943, + "step": 264 + }, + { + "epoch": 0.03251533742331288, + "grad_norm": 1.4319484065882893, + "learning_rate": 1.9999684118853177e-05, + "loss": 0.8462, + "step": 265 + }, + { + "epoch": 0.03263803680981595, + "grad_norm": 1.1808022253161143, + "learning_rate": 1.9999651741223557e-05, + "loss": 0.6452, + "step": 266 + }, + { + "epoch": 0.032760736196319015, + "grad_norm": 1.450751780195753, + "learning_rate": 1.9999617784234914e-05, + "loss": 0.6537, + "step": 267 + }, + { + "epoch": 0.032883435582822085, + "grad_norm": 1.147471627129672, + "learning_rate": 1.9999582247892613e-05, + "loss": 0.6513, + "step": 268 + }, + { + "epoch": 0.033006134969325154, + "grad_norm": 1.4479034483777802, + "learning_rate": 1.9999545132202263e-05, + "loss": 0.8336, + "step": 269 + }, + { + "epoch": 0.033128834355828224, + "grad_norm": 1.4246565222629173, + "learning_rate": 1.9999506437169723e-05, + "loss": 0.6634, + "step": 270 + }, + { + "epoch": 0.033251533742331287, + "grad_norm": 1.2659640413536324, + "learning_rate": 1.9999466162801116e-05, + "loss": 0.6747, + "step": 271 + }, + { + "epoch": 0.033374233128834356, + "grad_norm": 1.2441112010119426, + "learning_rate": 1.999942430910279e-05, + "loss": 0.6451, + "step": 272 + }, + { + "epoch": 0.033496932515337426, + "grad_norm": 1.3323503307002789, + "learning_rate": 1.9999380876081362e-05, + "loss": 0.8524, + "step": 273 + }, + { + "epoch": 0.03361963190184049, + "grad_norm": 1.4386584190550227, + "learning_rate": 1.9999335863743694e-05, + "loss": 0.6674, + "step": 274 + }, + { + "epoch": 0.03374233128834356, + "grad_norm": 1.384935207061949, + "learning_rate": 1.9999289272096886e-05, + "loss": 0.6447, + "step": 275 + }, + { + "epoch": 0.03386503067484663, + "grad_norm": 1.6132569288859309, + "learning_rate": 1.999924110114831e-05, + "loss": 0.6426, + "step": 276 + }, + { + "epoch": 0.03398773006134969, + "grad_norm": 1.1532619008319493, + "learning_rate": 1.999919135090556e-05, + "loss": 0.6206, + "step": 277 + }, + { + "epoch": 0.03411042944785276, + "grad_norm": 1.2332844950829565, + "learning_rate": 1.9999140021376505e-05, + "loss": 0.7092, + "step": 278 + }, + { + "epoch": 0.03423312883435583, + "grad_norm": 1.2283091184034824, + "learning_rate": 1.9999087112569246e-05, + "loss": 0.593, + "step": 279 + }, + { + "epoch": 0.0343558282208589, + "grad_norm": 1.419311232661824, + "learning_rate": 1.9999032624492144e-05, + "loss": 0.7602, + "step": 280 + }, + { + "epoch": 0.03447852760736196, + "grad_norm": 1.4161990123656203, + "learning_rate": 1.9998976557153797e-05, + "loss": 0.6774, + "step": 281 + }, + { + "epoch": 
0.03460122699386503, + "grad_norm": 1.2700730575634016, + "learning_rate": 1.999891891056307e-05, + "loss": 0.8217, + "step": 282 + }, + { + "epoch": 0.0347239263803681, + "grad_norm": 1.3453000233387014, + "learning_rate": 1.999885968472906e-05, + "loss": 0.7922, + "step": 283 + }, + { + "epoch": 0.03484662576687116, + "grad_norm": 1.752562015251653, + "learning_rate": 1.9998798879661128e-05, + "loss": 0.6656, + "step": 284 + }, + { + "epoch": 0.03496932515337423, + "grad_norm": 1.295039174638497, + "learning_rate": 1.999873649536887e-05, + "loss": 0.6514, + "step": 285 + }, + { + "epoch": 0.0350920245398773, + "grad_norm": 1.2430656070163204, + "learning_rate": 1.9998672531862147e-05, + "loss": 0.6155, + "step": 286 + }, + { + "epoch": 0.035214723926380365, + "grad_norm": 1.2811411461577427, + "learning_rate": 1.9998606989151057e-05, + "loss": 0.6249, + "step": 287 + }, + { + "epoch": 0.035337423312883434, + "grad_norm": 1.1842251019278185, + "learning_rate": 1.999853986724595e-05, + "loss": 0.674, + "step": 288 + }, + { + "epoch": 0.035460122699386504, + "grad_norm": 1.6844159741430993, + "learning_rate": 1.9998471166157434e-05, + "loss": 0.775, + "step": 289 + }, + { + "epoch": 0.03558282208588957, + "grad_norm": 1.388625156162306, + "learning_rate": 1.9998400885896355e-05, + "loss": 0.667, + "step": 290 + }, + { + "epoch": 0.035705521472392636, + "grad_norm": 1.3746830467321403, + "learning_rate": 1.9998329026473812e-05, + "loss": 0.6329, + "step": 291 + }, + { + "epoch": 0.035828220858895705, + "grad_norm": 1.3552583439623012, + "learning_rate": 1.9998255587901155e-05, + "loss": 0.6248, + "step": 292 + }, + { + "epoch": 0.035950920245398775, + "grad_norm": 1.4162788639484787, + "learning_rate": 1.9998180570189986e-05, + "loss": 0.6897, + "step": 293 + }, + { + "epoch": 0.03607361963190184, + "grad_norm": 1.3947730103722153, + "learning_rate": 1.9998103973352152e-05, + "loss": 0.6483, + "step": 294 + }, + { + "epoch": 0.03619631901840491, + "grad_norm": 1.2727464666894477, + "learning_rate": 1.9998025797399753e-05, + "loss": 0.6268, + "step": 295 + }, + { + "epoch": 0.03631901840490798, + "grad_norm": 2.3793950482211743, + "learning_rate": 1.9997946042345128e-05, + "loss": 0.6491, + "step": 296 + }, + { + "epoch": 0.036441717791411046, + "grad_norm": 1.3906670901928766, + "learning_rate": 1.9997864708200885e-05, + "loss": 0.6834, + "step": 297 + }, + { + "epoch": 0.03656441717791411, + "grad_norm": 1.4757452748164102, + "learning_rate": 1.9997781794979864e-05, + "loss": 0.628, + "step": 298 + }, + { + "epoch": 0.03668711656441718, + "grad_norm": 1.4233091285116848, + "learning_rate": 1.9997697302695157e-05, + "loss": 0.6358, + "step": 299 + }, + { + "epoch": 0.03680981595092025, + "grad_norm": 1.5774722120267088, + "learning_rate": 1.9997611231360117e-05, + "loss": 0.8214, + "step": 300 + }, + { + "epoch": 0.03693251533742331, + "grad_norm": 1.3690083378910478, + "learning_rate": 1.999752358098833e-05, + "loss": 0.6515, + "step": 301 + }, + { + "epoch": 0.03705521472392638, + "grad_norm": 1.52085090564905, + "learning_rate": 1.9997434351593648e-05, + "loss": 0.633, + "step": 302 + }, + { + "epoch": 0.03717791411042945, + "grad_norm": 1.3339270537499353, + "learning_rate": 1.9997343543190154e-05, + "loss": 0.65, + "step": 303 + }, + { + "epoch": 0.03730061349693251, + "grad_norm": 1.4947892888516012, + "learning_rate": 1.99972511557922e-05, + "loss": 0.6898, + "step": 304 + }, + { + "epoch": 0.03742331288343558, + "grad_norm": 1.3000186978029895, + "learning_rate": 
1.9997157189414373e-05, + "loss": 0.6465, + "step": 305 + }, + { + "epoch": 0.03754601226993865, + "grad_norm": 1.0816704192415858, + "learning_rate": 1.9997061644071516e-05, + "loss": 0.6395, + "step": 306 + }, + { + "epoch": 0.03766871165644172, + "grad_norm": 1.1357132750814647, + "learning_rate": 1.999696451977872e-05, + "loss": 0.6245, + "step": 307 + }, + { + "epoch": 0.037791411042944784, + "grad_norm": 1.3265827714198515, + "learning_rate": 1.9996865816551317e-05, + "loss": 0.6532, + "step": 308 + }, + { + "epoch": 0.03791411042944785, + "grad_norm": 1.2054711464110721, + "learning_rate": 1.999676553440491e-05, + "loss": 0.6582, + "step": 309 + }, + { + "epoch": 0.03803680981595092, + "grad_norm": 1.4669549871149483, + "learning_rate": 1.9996663673355326e-05, + "loss": 0.8329, + "step": 310 + }, + { + "epoch": 0.038159509202453985, + "grad_norm": 1.3188472663320523, + "learning_rate": 1.999656023341866e-05, + "loss": 0.8104, + "step": 311 + }, + { + "epoch": 0.038282208588957055, + "grad_norm": 1.5316628830855052, + "learning_rate": 1.9996455214611245e-05, + "loss": 0.6271, + "step": 312 + }, + { + "epoch": 0.038404907975460124, + "grad_norm": 1.4033053220347376, + "learning_rate": 1.9996348616949673e-05, + "loss": 0.6761, + "step": 313 + }, + { + "epoch": 0.03852760736196319, + "grad_norm": 1.2557398023386441, + "learning_rate": 1.9996240440450773e-05, + "loss": 0.6606, + "step": 314 + }, + { + "epoch": 0.03865030674846626, + "grad_norm": 1.1079331559151848, + "learning_rate": 1.9996130685131637e-05, + "loss": 0.6632, + "step": 315 + }, + { + "epoch": 0.038773006134969326, + "grad_norm": 1.6548662164566164, + "learning_rate": 1.9996019351009598e-05, + "loss": 0.805, + "step": 316 + }, + { + "epoch": 0.038895705521472396, + "grad_norm": 1.5536392429504013, + "learning_rate": 1.9995906438102238e-05, + "loss": 0.8007, + "step": 317 + }, + { + "epoch": 0.03901840490797546, + "grad_norm": 1.5083280108510724, + "learning_rate": 1.9995791946427396e-05, + "loss": 0.685, + "step": 318 + }, + { + "epoch": 0.03914110429447853, + "grad_norm": 2.1654215053152357, + "learning_rate": 1.9995675876003145e-05, + "loss": 0.6138, + "step": 319 + }, + { + "epoch": 0.0392638036809816, + "grad_norm": 1.2024669088098232, + "learning_rate": 1.999555822684783e-05, + "loss": 0.6052, + "step": 320 + }, + { + "epoch": 0.03938650306748466, + "grad_norm": 1.3468416326749302, + "learning_rate": 1.9995438998980025e-05, + "loss": 0.6677, + "step": 321 + }, + { + "epoch": 0.03950920245398773, + "grad_norm": 1.457651818051573, + "learning_rate": 1.999531819241856e-05, + "loss": 0.6787, + "step": 322 + }, + { + "epoch": 0.0396319018404908, + "grad_norm": 1.43862386769674, + "learning_rate": 1.999519580718252e-05, + "loss": 0.656, + "step": 323 + }, + { + "epoch": 0.03975460122699386, + "grad_norm": 1.356662354345886, + "learning_rate": 1.9995071843291232e-05, + "loss": 0.6296, + "step": 324 + }, + { + "epoch": 0.03987730061349693, + "grad_norm": 1.2951733799331637, + "learning_rate": 1.9994946300764276e-05, + "loss": 0.6404, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 1.3663082880761546, + "learning_rate": 1.9994819179621477e-05, + "loss": 0.6643, + "step": 326 + }, + { + "epoch": 0.04012269938650307, + "grad_norm": 1.1606691496887684, + "learning_rate": 1.9994690479882918e-05, + "loss": 0.6757, + "step": 327 + }, + { + "epoch": 0.04024539877300613, + "grad_norm": 1.21679254409458, + "learning_rate": 1.9994560201568922e-05, + "loss": 0.5917, + "step": 328 + }, + { + "epoch": 0.0403680981595092, + 
"grad_norm": 1.230208695001609, + "learning_rate": 1.999442834470007e-05, + "loss": 0.6573, + "step": 329 + }, + { + "epoch": 0.04049079754601227, + "grad_norm": 1.182055198459804, + "learning_rate": 1.999429490929718e-05, + "loss": 0.6407, + "step": 330 + }, + { + "epoch": 0.040613496932515335, + "grad_norm": 1.340929277056591, + "learning_rate": 1.9994159895381337e-05, + "loss": 0.6728, + "step": 331 + }, + { + "epoch": 0.040736196319018404, + "grad_norm": 1.3839522042125914, + "learning_rate": 1.9994023302973854e-05, + "loss": 0.6156, + "step": 332 + }, + { + "epoch": 0.040858895705521474, + "grad_norm": 1.1712080975377186, + "learning_rate": 1.9993885132096317e-05, + "loss": 0.6367, + "step": 333 + }, + { + "epoch": 0.040981595092024536, + "grad_norm": 1.875169672964527, + "learning_rate": 1.9993745382770538e-05, + "loss": 0.6545, + "step": 334 + }, + { + "epoch": 0.041104294478527606, + "grad_norm": 2.7624302110753494, + "learning_rate": 1.999360405501859e-05, + "loss": 0.8333, + "step": 335 + }, + { + "epoch": 0.041226993865030676, + "grad_norm": 1.4508775521081894, + "learning_rate": 1.9993461148862806e-05, + "loss": 0.6611, + "step": 336 + }, + { + "epoch": 0.041349693251533745, + "grad_norm": 1.427275264581299, + "learning_rate": 1.9993316664325743e-05, + "loss": 0.8538, + "step": 337 + }, + { + "epoch": 0.04147239263803681, + "grad_norm": 1.400057374464738, + "learning_rate": 1.9993170601430233e-05, + "loss": 0.6563, + "step": 338 + }, + { + "epoch": 0.04159509202453988, + "grad_norm": 1.5063702197161595, + "learning_rate": 1.9993022960199334e-05, + "loss": 0.6379, + "step": 339 + }, + { + "epoch": 0.04171779141104295, + "grad_norm": 1.4616272584350527, + "learning_rate": 1.9992873740656372e-05, + "loss": 0.6436, + "step": 340 + }, + { + "epoch": 0.04184049079754601, + "grad_norm": 1.3901519436905019, + "learning_rate": 1.999272294282491e-05, + "loss": 0.5893, + "step": 341 + }, + { + "epoch": 0.04196319018404908, + "grad_norm": 1.504679749528472, + "learning_rate": 1.999257056672877e-05, + "loss": 0.6656, + "step": 342 + }, + { + "epoch": 0.04208588957055215, + "grad_norm": 1.5594566799674143, + "learning_rate": 1.999241661239202e-05, + "loss": 0.6232, + "step": 343 + }, + { + "epoch": 0.04220858895705522, + "grad_norm": 1.525699902180725, + "learning_rate": 1.9992261079838966e-05, + "loss": 0.6895, + "step": 344 + }, + { + "epoch": 0.04233128834355828, + "grad_norm": 1.4968985196652371, + "learning_rate": 1.9992103969094182e-05, + "loss": 0.649, + "step": 345 + }, + { + "epoch": 0.04245398773006135, + "grad_norm": 1.2914643500769563, + "learning_rate": 1.999194528018248e-05, + "loss": 0.6278, + "step": 346 + }, + { + "epoch": 0.04257668711656442, + "grad_norm": 1.4702956195161083, + "learning_rate": 1.9991785013128922e-05, + "loss": 0.6868, + "step": 347 + }, + { + "epoch": 0.04269938650306748, + "grad_norm": 1.1953900170436507, + "learning_rate": 1.9991623167958827e-05, + "loss": 0.6541, + "step": 348 + }, + { + "epoch": 0.04282208588957055, + "grad_norm": 1.2447002636841686, + "learning_rate": 1.9991459744697748e-05, + "loss": 0.654, + "step": 349 + }, + { + "epoch": 0.04294478527607362, + "grad_norm": 1.3671062418402047, + "learning_rate": 1.99912947433715e-05, + "loss": 0.5956, + "step": 350 + }, + { + "epoch": 0.043067484662576684, + "grad_norm": 2.2285938503855425, + "learning_rate": 1.9991128164006147e-05, + "loss": 0.914, + "step": 351 + }, + { + "epoch": 0.043190184049079754, + "grad_norm": 1.273732619648401, + "learning_rate": 1.9990960006627995e-05, + "loss": 0.6195, 
+ "step": 352 + }, + { + "epoch": 0.04331288343558282, + "grad_norm": 1.3523579446344187, + "learning_rate": 1.99907902712636e-05, + "loss": 0.6293, + "step": 353 + }, + { + "epoch": 0.04343558282208589, + "grad_norm": 1.2355082842106124, + "learning_rate": 1.999061895793978e-05, + "loss": 0.6681, + "step": 354 + }, + { + "epoch": 0.043558282208588955, + "grad_norm": 1.3170562192606745, + "learning_rate": 1.999044606668358e-05, + "loss": 0.6484, + "step": 355 + }, + { + "epoch": 0.043680981595092025, + "grad_norm": 1.138120817993039, + "learning_rate": 1.9990271597522318e-05, + "loss": 0.6951, + "step": 356 + }, + { + "epoch": 0.043803680981595094, + "grad_norm": 1.3147338436321634, + "learning_rate": 1.999009555048354e-05, + "loss": 0.652, + "step": 357 + }, + { + "epoch": 0.04392638036809816, + "grad_norm": 1.2986360919371585, + "learning_rate": 1.9989917925595063e-05, + "loss": 0.6001, + "step": 358 + }, + { + "epoch": 0.04404907975460123, + "grad_norm": 1.4338436424824652, + "learning_rate": 1.998973872288493e-05, + "loss": 0.6524, + "step": 359 + }, + { + "epoch": 0.044171779141104296, + "grad_norm": 1.476800689602959, + "learning_rate": 1.998955794238145e-05, + "loss": 0.6459, + "step": 360 + }, + { + "epoch": 0.04429447852760736, + "grad_norm": 1.3095611698578478, + "learning_rate": 1.9989375584113175e-05, + "loss": 0.6608, + "step": 361 + }, + { + "epoch": 0.04441717791411043, + "grad_norm": 1.7355714185979987, + "learning_rate": 1.9989191648108907e-05, + "loss": 0.8336, + "step": 362 + }, + { + "epoch": 0.0445398773006135, + "grad_norm": 1.1378598616355091, + "learning_rate": 1.9989006134397696e-05, + "loss": 0.6572, + "step": 363 + }, + { + "epoch": 0.04466257668711657, + "grad_norm": 1.7143566071112506, + "learning_rate": 1.998881904300884e-05, + "loss": 0.682, + "step": 364 + }, + { + "epoch": 0.04478527607361963, + "grad_norm": 1.203660881472362, + "learning_rate": 1.9988630373971896e-05, + "loss": 0.6478, + "step": 365 + }, + { + "epoch": 0.0449079754601227, + "grad_norm": 1.2745017844040318, + "learning_rate": 1.9988440127316658e-05, + "loss": 0.6594, + "step": 366 + }, + { + "epoch": 0.04503067484662577, + "grad_norm": 1.3634749686311145, + "learning_rate": 1.9988248303073173e-05, + "loss": 0.7256, + "step": 367 + }, + { + "epoch": 0.04515337423312883, + "grad_norm": 1.234697190565201, + "learning_rate": 1.998805490127174e-05, + "loss": 0.6955, + "step": 368 + }, + { + "epoch": 0.0452760736196319, + "grad_norm": 1.2819637913399191, + "learning_rate": 1.99878599219429e-05, + "loss": 0.8039, + "step": 369 + }, + { + "epoch": 0.04539877300613497, + "grad_norm": 1.1937646982035324, + "learning_rate": 1.9987663365117456e-05, + "loss": 0.8035, + "step": 370 + }, + { + "epoch": 0.045521472392638034, + "grad_norm": 1.2171070881383748, + "learning_rate": 1.998746523082645e-05, + "loss": 0.6563, + "step": 371 + }, + { + "epoch": 0.0456441717791411, + "grad_norm": 1.4027489213750837, + "learning_rate": 1.9987265519101172e-05, + "loss": 0.7063, + "step": 372 + }, + { + "epoch": 0.04576687116564417, + "grad_norm": 1.2294513501038455, + "learning_rate": 1.9987064229973167e-05, + "loss": 0.6997, + "step": 373 + }, + { + "epoch": 0.04588957055214724, + "grad_norm": 1.3642342743594338, + "learning_rate": 1.998686136347423e-05, + "loss": 0.6308, + "step": 374 + }, + { + "epoch": 0.046012269938650305, + "grad_norm": 1.528569021433768, + "learning_rate": 1.99866569196364e-05, + "loss": 0.6206, + "step": 375 + }, + { + "epoch": 0.046134969325153374, + "grad_norm": 1.2212860818692948, + 
"learning_rate": 1.998645089849196e-05, + "loss": 0.6631, + "step": 376 + }, + { + "epoch": 0.046257668711656444, + "grad_norm": 1.4388044275371121, + "learning_rate": 1.998624330007346e-05, + "loss": 0.6505, + "step": 377 + }, + { + "epoch": 0.046380368098159507, + "grad_norm": 1.2021774952585509, + "learning_rate": 1.9986034124413684e-05, + "loss": 0.6241, + "step": 378 + }, + { + "epoch": 0.046503067484662576, + "grad_norm": 1.175122475318069, + "learning_rate": 1.998582337154567e-05, + "loss": 0.6531, + "step": 379 + }, + { + "epoch": 0.046625766871165646, + "grad_norm": 1.1822972817315205, + "learning_rate": 1.9985611041502704e-05, + "loss": 0.6293, + "step": 380 + }, + { + "epoch": 0.04674846625766871, + "grad_norm": 1.295951356242853, + "learning_rate": 1.998539713431832e-05, + "loss": 0.6355, + "step": 381 + }, + { + "epoch": 0.04687116564417178, + "grad_norm": 1.1365251161779046, + "learning_rate": 1.9985181650026305e-05, + "loss": 0.6158, + "step": 382 + }, + { + "epoch": 0.04699386503067485, + "grad_norm": 1.213826142555602, + "learning_rate": 1.9984964588660692e-05, + "loss": 0.6336, + "step": 383 + }, + { + "epoch": 0.04711656441717792, + "grad_norm": 1.21825517460925, + "learning_rate": 1.9984745950255765e-05, + "loss": 0.6568, + "step": 384 + }, + { + "epoch": 0.04723926380368098, + "grad_norm": 1.1927908499739046, + "learning_rate": 1.9984525734846056e-05, + "loss": 0.648, + "step": 385 + }, + { + "epoch": 0.04736196319018405, + "grad_norm": 2.566506022545533, + "learning_rate": 1.9984303942466346e-05, + "loss": 0.6682, + "step": 386 + }, + { + "epoch": 0.04748466257668712, + "grad_norm": 1.1580281649272353, + "learning_rate": 1.9984080573151667e-05, + "loss": 0.6898, + "step": 387 + }, + { + "epoch": 0.04760736196319018, + "grad_norm": 1.8610031601064558, + "learning_rate": 1.998385562693729e-05, + "loss": 0.8734, + "step": 388 + }, + { + "epoch": 0.04773006134969325, + "grad_norm": 1.1778659445155015, + "learning_rate": 1.9983629103858754e-05, + "loss": 0.6386, + "step": 389 + }, + { + "epoch": 0.04785276073619632, + "grad_norm": 1.2768212792214273, + "learning_rate": 1.998340100395183e-05, + "loss": 0.8014, + "step": 390 + }, + { + "epoch": 0.04797546012269939, + "grad_norm": 1.1256688532857433, + "learning_rate": 1.9983171327252547e-05, + "loss": 0.6457, + "step": 391 + }, + { + "epoch": 0.04809815950920245, + "grad_norm": 1.358290308651056, + "learning_rate": 1.998294007379718e-05, + "loss": 0.6851, + "step": 392 + }, + { + "epoch": 0.04822085889570552, + "grad_norm": 1.2487712604482668, + "learning_rate": 1.9982707243622254e-05, + "loss": 0.6674, + "step": 393 + }, + { + "epoch": 0.04834355828220859, + "grad_norm": 1.6657548030152605, + "learning_rate": 1.998247283676454e-05, + "loss": 0.6272, + "step": 394 + }, + { + "epoch": 0.048466257668711654, + "grad_norm": 1.167771355481486, + "learning_rate": 1.9982236853261067e-05, + "loss": 0.649, + "step": 395 + }, + { + "epoch": 0.048588957055214724, + "grad_norm": 1.237938555690847, + "learning_rate": 1.9981999293149094e-05, + "loss": 0.6266, + "step": 396 + }, + { + "epoch": 0.04871165644171779, + "grad_norm": 1.2436270196222692, + "learning_rate": 1.998176015646615e-05, + "loss": 0.6273, + "step": 397 + }, + { + "epoch": 0.048834355828220856, + "grad_norm": 1.4460282003511928, + "learning_rate": 1.998151944325001e-05, + "loss": 0.6469, + "step": 398 + }, + { + "epoch": 0.048957055214723925, + "grad_norm": 1.3035553601054983, + "learning_rate": 1.9981277153538686e-05, + "loss": 0.6649, + "step": 399 + }, + { + "epoch": 
0.049079754601226995, + "grad_norm": 1.1909134962298118, + "learning_rate": 1.9981033287370443e-05, + "loss": 0.6688, + "step": 400 + }, + { + "epoch": 0.049202453987730065, + "grad_norm": 1.304653512439799, + "learning_rate": 1.99807878447838e-05, + "loss": 0.6806, + "step": 401 + }, + { + "epoch": 0.04932515337423313, + "grad_norm": 1.231831742229607, + "learning_rate": 1.9980540825817525e-05, + "loss": 0.656, + "step": 402 + }, + { + "epoch": 0.0494478527607362, + "grad_norm": 1.4481243386651586, + "learning_rate": 1.9980292230510632e-05, + "loss": 0.6902, + "step": 403 + }, + { + "epoch": 0.049570552147239266, + "grad_norm": 1.8287668921511597, + "learning_rate": 1.9980042058902383e-05, + "loss": 0.6544, + "step": 404 + }, + { + "epoch": 0.04969325153374233, + "grad_norm": 2.5441197889021345, + "learning_rate": 1.9979790311032288e-05, + "loss": 0.8586, + "step": 405 + }, + { + "epoch": 0.0498159509202454, + "grad_norm": 1.2381059232822484, + "learning_rate": 1.9979536986940113e-05, + "loss": 0.5974, + "step": 406 + }, + { + "epoch": 0.04993865030674847, + "grad_norm": 1.3239737928273738, + "learning_rate": 1.9979282086665864e-05, + "loss": 0.6803, + "step": 407 + }, + { + "epoch": 0.05006134969325153, + "grad_norm": 1.6316004974376943, + "learning_rate": 1.9979025610249808e-05, + "loss": 0.782, + "step": 408 + }, + { + "epoch": 0.0501840490797546, + "grad_norm": 1.6828466420418489, + "learning_rate": 1.9978767557732445e-05, + "loss": 0.6571, + "step": 409 + }, + { + "epoch": 0.05030674846625767, + "grad_norm": 1.6559782101948965, + "learning_rate": 1.9978507929154534e-05, + "loss": 0.8052, + "step": 410 + }, + { + "epoch": 0.05042944785276074, + "grad_norm": 1.2183748371078396, + "learning_rate": 1.9978246724557083e-05, + "loss": 0.6492, + "step": 411 + }, + { + "epoch": 0.0505521472392638, + "grad_norm": 1.2555773110626374, + "learning_rate": 1.9977983943981348e-05, + "loss": 0.6848, + "step": 412 + }, + { + "epoch": 0.05067484662576687, + "grad_norm": 1.352852534213827, + "learning_rate": 1.997771958746883e-05, + "loss": 0.6079, + "step": 413 + }, + { + "epoch": 0.05079754601226994, + "grad_norm": 1.274935849504946, + "learning_rate": 1.9977453655061283e-05, + "loss": 0.6753, + "step": 414 + }, + { + "epoch": 0.050920245398773004, + "grad_norm": 1.0676468933617471, + "learning_rate": 1.9977186146800707e-05, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 0.05104294478527607, + "grad_norm": 1.4048792594423696, + "learning_rate": 1.997691706272936e-05, + "loss": 0.6533, + "step": 416 + }, + { + "epoch": 0.05116564417177914, + "grad_norm": 1.08683517302787, + "learning_rate": 1.997664640288973e-05, + "loss": 0.6144, + "step": 417 + }, + { + "epoch": 0.051288343558282205, + "grad_norm": 1.6330237681005906, + "learning_rate": 1.9976374167324575e-05, + "loss": 0.8466, + "step": 418 + }, + { + "epoch": 0.051411042944785275, + "grad_norm": 1.3280669197660584, + "learning_rate": 1.997610035607689e-05, + "loss": 0.6583, + "step": 419 + }, + { + "epoch": 0.051533742331288344, + "grad_norm": 1.3880137900306486, + "learning_rate": 1.9975824969189913e-05, + "loss": 0.8325, + "step": 420 + }, + { + "epoch": 0.051656441717791414, + "grad_norm": 1.27117484402782, + "learning_rate": 1.9975548006707153e-05, + "loss": 0.5966, + "step": 421 + }, + { + "epoch": 0.05177914110429448, + "grad_norm": 1.2766319255882428, + "learning_rate": 1.9975269468672342e-05, + "loss": 0.6359, + "step": 422 + }, + { + "epoch": 0.051901840490797546, + "grad_norm": 1.5424138126927232, + "learning_rate": 
1.997498935512948e-05, + "loss": 0.6583, + "step": 423 + }, + { + "epoch": 0.052024539877300616, + "grad_norm": 1.4918625498206006, + "learning_rate": 1.9974707666122806e-05, + "loss": 0.7238, + "step": 424 + }, + { + "epoch": 0.05214723926380368, + "grad_norm": 1.0974590070570394, + "learning_rate": 1.997442440169681e-05, + "loss": 0.7123, + "step": 425 + }, + { + "epoch": 0.05226993865030675, + "grad_norm": 1.4215381581631028, + "learning_rate": 1.9974139561896232e-05, + "loss": 0.6439, + "step": 426 + }, + { + "epoch": 0.05239263803680982, + "grad_norm": 1.2724465660970838, + "learning_rate": 1.9973853146766056e-05, + "loss": 0.6426, + "step": 427 + }, + { + "epoch": 0.05251533742331288, + "grad_norm": 1.280551850284571, + "learning_rate": 1.9973565156351524e-05, + "loss": 0.6909, + "step": 428 + }, + { + "epoch": 0.05263803680981595, + "grad_norm": 1.2971173504482478, + "learning_rate": 1.997327559069812e-05, + "loss": 0.6922, + "step": 429 + }, + { + "epoch": 0.05276073619631902, + "grad_norm": 1.1318580806794394, + "learning_rate": 1.997298444985158e-05, + "loss": 0.6247, + "step": 430 + }, + { + "epoch": 0.05288343558282209, + "grad_norm": 1.9901506740750268, + "learning_rate": 1.997269173385788e-05, + "loss": 0.8605, + "step": 431 + }, + { + "epoch": 0.05300613496932515, + "grad_norm": 1.2445621014658468, + "learning_rate": 1.997239744276326e-05, + "loss": 0.6153, + "step": 432 + }, + { + "epoch": 0.05312883435582822, + "grad_norm": 1.1455006851900622, + "learning_rate": 1.9972101576614203e-05, + "loss": 0.6688, + "step": 433 + }, + { + "epoch": 0.05325153374233129, + "grad_norm": 1.4054180475719273, + "learning_rate": 1.997180413545743e-05, + "loss": 0.8034, + "step": 434 + }, + { + "epoch": 0.05337423312883435, + "grad_norm": 1.0834516272791583, + "learning_rate": 1.9971505119339923e-05, + "loss": 0.6216, + "step": 435 + }, + { + "epoch": 0.05349693251533742, + "grad_norm": 1.2730645872026694, + "learning_rate": 1.9971204528308907e-05, + "loss": 0.5919, + "step": 436 + }, + { + "epoch": 0.05361963190184049, + "grad_norm": 1.4560728867886896, + "learning_rate": 1.997090236241186e-05, + "loss": 0.6696, + "step": 437 + }, + { + "epoch": 0.05374233128834356, + "grad_norm": 1.168537032015925, + "learning_rate": 1.9970598621696507e-05, + "loss": 0.6656, + "step": 438 + }, + { + "epoch": 0.053865030674846624, + "grad_norm": 1.1140888000194717, + "learning_rate": 1.997029330621082e-05, + "loss": 0.629, + "step": 439 + }, + { + "epoch": 0.053987730061349694, + "grad_norm": 1.3593512546493567, + "learning_rate": 1.9969986416003026e-05, + "loss": 0.6549, + "step": 440 + }, + { + "epoch": 0.05411042944785276, + "grad_norm": 1.0372269391808382, + "learning_rate": 1.9969677951121587e-05, + "loss": 0.5923, + "step": 441 + }, + { + "epoch": 0.054233128834355826, + "grad_norm": 1.1591200401387358, + "learning_rate": 1.9969367911615224e-05, + "loss": 0.6858, + "step": 442 + }, + { + "epoch": 0.054355828220858896, + "grad_norm": 1.8596322223392834, + "learning_rate": 1.9969056297532914e-05, + "loss": 0.628, + "step": 443 + }, + { + "epoch": 0.054478527607361965, + "grad_norm": 1.314328326010827, + "learning_rate": 1.996874310892386e-05, + "loss": 0.6439, + "step": 444 + }, + { + "epoch": 0.05460122699386503, + "grad_norm": 1.0299143312386796, + "learning_rate": 1.9968428345837542e-05, + "loss": 0.6357, + "step": 445 + }, + { + "epoch": 0.0547239263803681, + "grad_norm": 1.1831341400566215, + "learning_rate": 1.996811200832366e-05, + "loss": 0.6544, + "step": 446 + }, + { + "epoch": 
0.05484662576687117, + "grad_norm": 1.2283372146121019, + "learning_rate": 1.996779409643219e-05, + "loss": 0.6443, + "step": 447 + }, + { + "epoch": 0.054969325153374236, + "grad_norm": 1.2503216811948719, + "learning_rate": 1.9967474610213334e-05, + "loss": 0.6498, + "step": 448 + }, + { + "epoch": 0.0550920245398773, + "grad_norm": 1.6826164705589652, + "learning_rate": 1.996715354971755e-05, + "loss": 0.7764, + "step": 449 + }, + { + "epoch": 0.05521472392638037, + "grad_norm": 0.9660659469438122, + "learning_rate": 1.996683091499556e-05, + "loss": 0.6585, + "step": 450 + }, + { + "epoch": 0.05533742331288344, + "grad_norm": 1.1694323946476552, + "learning_rate": 1.9966506706098312e-05, + "loss": 0.6754, + "step": 451 + }, + { + "epoch": 0.0554601226993865, + "grad_norm": 1.0770706060714892, + "learning_rate": 1.996618092307701e-05, + "loss": 0.6564, + "step": 452 + }, + { + "epoch": 0.05558282208588957, + "grad_norm": 1.2278524697719522, + "learning_rate": 1.9965853565983115e-05, + "loss": 0.6515, + "step": 453 + }, + { + "epoch": 0.05570552147239264, + "grad_norm": 1.3141094957874235, + "learning_rate": 1.9965524634868324e-05, + "loss": 0.7943, + "step": 454 + }, + { + "epoch": 0.0558282208588957, + "grad_norm": 1.5452144058468267, + "learning_rate": 1.9965194129784597e-05, + "loss": 0.6019, + "step": 455 + }, + { + "epoch": 0.05595092024539877, + "grad_norm": 1.1672677744368223, + "learning_rate": 1.9964862050784127e-05, + "loss": 0.6706, + "step": 456 + }, + { + "epoch": 0.05607361963190184, + "grad_norm": 1.6479640866825616, + "learning_rate": 1.9964528397919363e-05, + "loss": 0.6804, + "step": 457 + }, + { + "epoch": 0.05619631901840491, + "grad_norm": 1.1766520036892167, + "learning_rate": 1.9964193171243006e-05, + "loss": 0.63, + "step": 458 + }, + { + "epoch": 0.056319018404907974, + "grad_norm": 1.2765229416966326, + "learning_rate": 1.9963856370808002e-05, + "loss": 0.6238, + "step": 459 + }, + { + "epoch": 0.05644171779141104, + "grad_norm": 1.3178124984413835, + "learning_rate": 1.9963517996667548e-05, + "loss": 0.7201, + "step": 460 + }, + { + "epoch": 0.05656441717791411, + "grad_norm": 1.3092191048636432, + "learning_rate": 1.996317804887508e-05, + "loss": 0.7498, + "step": 461 + }, + { + "epoch": 0.056687116564417175, + "grad_norm": 1.2892011072023903, + "learning_rate": 1.9962836527484296e-05, + "loss": 0.7736, + "step": 462 + }, + { + "epoch": 0.056809815950920245, + "grad_norm": 1.4423718392952822, + "learning_rate": 1.9962493432549136e-05, + "loss": 0.6649, + "step": 463 + }, + { + "epoch": 0.056932515337423314, + "grad_norm": 1.0756050760091143, + "learning_rate": 1.9962148764123785e-05, + "loss": 0.751, + "step": 464 + }, + { + "epoch": 0.05705521472392638, + "grad_norm": 1.2335617479206877, + "learning_rate": 1.9961802522262685e-05, + "loss": 0.6037, + "step": 465 + }, + { + "epoch": 0.05717791411042945, + "grad_norm": 1.243806044263209, + "learning_rate": 1.9961454707020515e-05, + "loss": 0.6487, + "step": 466 + }, + { + "epoch": 0.057300613496932516, + "grad_norm": 1.1338319284580844, + "learning_rate": 1.996110531845222e-05, + "loss": 0.6638, + "step": 467 + }, + { + "epoch": 0.057423312883435586, + "grad_norm": 1.421074537114945, + "learning_rate": 1.996075435661297e-05, + "loss": 0.6551, + "step": 468 + }, + { + "epoch": 0.05754601226993865, + "grad_norm": 1.2300288356592264, + "learning_rate": 1.996040182155821e-05, + "loss": 0.6366, + "step": 469 + }, + { + "epoch": 0.05766871165644172, + "grad_norm": 1.262305335582742, + "learning_rate": 
1.996004771334361e-05, + "loss": 0.6642, + "step": 470 + }, + { + "epoch": 0.05779141104294479, + "grad_norm": 1.148311680774811, + "learning_rate": 1.9959692032025106e-05, + "loss": 0.5955, + "step": 471 + }, + { + "epoch": 0.05791411042944785, + "grad_norm": 1.8171201308226868, + "learning_rate": 1.9959334777658865e-05, + "loss": 0.8336, + "step": 472 + }, + { + "epoch": 0.05803680981595092, + "grad_norm": 1.3168155310506422, + "learning_rate": 1.9958975950301322e-05, + "loss": 0.6184, + "step": 473 + }, + { + "epoch": 0.05815950920245399, + "grad_norm": 1.1729047112464193, + "learning_rate": 1.9958615550009145e-05, + "loss": 0.6511, + "step": 474 + }, + { + "epoch": 0.05828220858895705, + "grad_norm": 1.331069631070518, + "learning_rate": 1.9958253576839256e-05, + "loss": 0.6517, + "step": 475 + }, + { + "epoch": 0.05840490797546012, + "grad_norm": 1.368626866599508, + "learning_rate": 1.9957890030848828e-05, + "loss": 0.6626, + "step": 476 + }, + { + "epoch": 0.05852760736196319, + "grad_norm": 1.1480190805714992, + "learning_rate": 1.9957524912095278e-05, + "loss": 0.7, + "step": 477 + }, + { + "epoch": 0.05865030674846626, + "grad_norm": 1.2118909431708582, + "learning_rate": 1.9957158220636278e-05, + "loss": 0.6328, + "step": 478 + }, + { + "epoch": 0.05877300613496932, + "grad_norm": 1.0653621539351597, + "learning_rate": 1.9956789956529738e-05, + "loss": 0.6709, + "step": 479 + }, + { + "epoch": 0.05889570552147239, + "grad_norm": 1.2090627793382362, + "learning_rate": 1.9956420119833826e-05, + "loss": 0.6601, + "step": 480 + }, + { + "epoch": 0.05901840490797546, + "grad_norm": 1.0992144857019184, + "learning_rate": 1.995604871060695e-05, + "loss": 0.6128, + "step": 481 + }, + { + "epoch": 0.059141104294478525, + "grad_norm": 1.2280250015711167, + "learning_rate": 1.9955675728907776e-05, + "loss": 0.6555, + "step": 482 + }, + { + "epoch": 0.059263803680981594, + "grad_norm": 1.0636078065039432, + "learning_rate": 1.995530117479521e-05, + "loss": 0.606, + "step": 483 + }, + { + "epoch": 0.059386503067484664, + "grad_norm": 1.2620471887336335, + "learning_rate": 1.995492504832841e-05, + "loss": 0.6683, + "step": 484 + }, + { + "epoch": 0.05950920245398773, + "grad_norm": 1.2638510731212496, + "learning_rate": 1.9954547349566783e-05, + "loss": 0.6685, + "step": 485 + }, + { + "epoch": 0.059631901840490796, + "grad_norm": 1.172472840106068, + "learning_rate": 1.9954168078569985e-05, + "loss": 0.6626, + "step": 486 + }, + { + "epoch": 0.059754601226993866, + "grad_norm": 1.3561258395237186, + "learning_rate": 1.9953787235397913e-05, + "loss": 0.7787, + "step": 487 + }, + { + "epoch": 0.059877300613496935, + "grad_norm": 1.3221196714519419, + "learning_rate": 1.9953404820110725e-05, + "loss": 0.6045, + "step": 488 + }, + { + "epoch": 0.06, + "grad_norm": 1.2063936346945654, + "learning_rate": 1.9953020832768816e-05, + "loss": 0.6142, + "step": 489 + }, + { + "epoch": 0.06012269938650307, + "grad_norm": 1.1833957307430356, + "learning_rate": 1.9952635273432835e-05, + "loss": 0.824, + "step": 490 + }, + { + "epoch": 0.06024539877300614, + "grad_norm": 1.1771082960201091, + "learning_rate": 1.9952248142163672e-05, + "loss": 0.641, + "step": 491 + }, + { + "epoch": 0.0603680981595092, + "grad_norm": 1.20253595220016, + "learning_rate": 1.9951859439022478e-05, + "loss": 0.6743, + "step": 492 + }, + { + "epoch": 0.06049079754601227, + "grad_norm": 1.4387161252356961, + "learning_rate": 1.9951469164070647e-05, + "loss": 0.6895, + "step": 493 + }, + { + "epoch": 0.06061349693251534, + 
"grad_norm": 1.0589403476724697, + "learning_rate": 1.995107731736981e-05, + "loss": 0.6789, + "step": 494 + }, + { + "epoch": 0.06073619631901841, + "grad_norm": 1.0843301136024837, + "learning_rate": 1.9950683898981866e-05, + "loss": 0.6425, + "step": 495 + }, + { + "epoch": 0.06085889570552147, + "grad_norm": 1.076988044431687, + "learning_rate": 1.9950288908968947e-05, + "loss": 0.6194, + "step": 496 + }, + { + "epoch": 0.06098159509202454, + "grad_norm": 1.1485171450575844, + "learning_rate": 1.9949892347393438e-05, + "loss": 0.6582, + "step": 497 + }, + { + "epoch": 0.06110429447852761, + "grad_norm": 1.0107140082246506, + "learning_rate": 1.9949494214317973e-05, + "loss": 0.6242, + "step": 498 + }, + { + "epoch": 0.06122699386503067, + "grad_norm": 1.1060071250220467, + "learning_rate": 1.9949094509805436e-05, + "loss": 0.703, + "step": 499 + }, + { + "epoch": 0.06134969325153374, + "grad_norm": 1.0933104690137005, + "learning_rate": 1.994869323391895e-05, + "loss": 0.6413, + "step": 500 + }, + { + "epoch": 0.06147239263803681, + "grad_norm": 1.031140570936467, + "learning_rate": 1.9948290386721904e-05, + "loss": 0.6549, + "step": 501 + }, + { + "epoch": 0.061595092024539874, + "grad_norm": 1.0833964690558575, + "learning_rate": 1.9947885968277916e-05, + "loss": 0.6334, + "step": 502 + }, + { + "epoch": 0.061717791411042944, + "grad_norm": 1.107894187578626, + "learning_rate": 1.9947479978650867e-05, + "loss": 0.5971, + "step": 503 + }, + { + "epoch": 0.06184049079754601, + "grad_norm": 1.1404009158953377, + "learning_rate": 1.9947072417904868e-05, + "loss": 0.6253, + "step": 504 + }, + { + "epoch": 0.06196319018404908, + "grad_norm": 1.340641694239334, + "learning_rate": 1.9946663286104303e-05, + "loss": 0.6704, + "step": 505 + }, + { + "epoch": 0.062085889570552145, + "grad_norm": 1.5846204092276819, + "learning_rate": 1.994625258331378e-05, + "loss": 0.7737, + "step": 506 + }, + { + "epoch": 0.062208588957055215, + "grad_norm": 1.339895085956748, + "learning_rate": 1.9945840309598173e-05, + "loss": 0.8154, + "step": 507 + }, + { + "epoch": 0.062331288343558285, + "grad_norm": 1.1395678029150993, + "learning_rate": 1.9945426465022597e-05, + "loss": 0.7312, + "step": 508 + }, + { + "epoch": 0.06245398773006135, + "grad_norm": 1.4067318223066811, + "learning_rate": 1.994501104965241e-05, + "loss": 0.6495, + "step": 509 + }, + { + "epoch": 0.06257668711656442, + "grad_norm": 1.4140059797754831, + "learning_rate": 1.994459406355323e-05, + "loss": 0.76, + "step": 510 + }, + { + "epoch": 0.06269938650306749, + "grad_norm": 1.4638814083690002, + "learning_rate": 1.994417550679091e-05, + "loss": 0.6184, + "step": 511 + }, + { + "epoch": 0.06282208588957056, + "grad_norm": 1.4936260408144593, + "learning_rate": 1.994375537943156e-05, + "loss": 0.7587, + "step": 512 + }, + { + "epoch": 0.06294478527607363, + "grad_norm": 1.4838587155390504, + "learning_rate": 1.9943333681541537e-05, + "loss": 0.6484, + "step": 513 + }, + { + "epoch": 0.06306748466257668, + "grad_norm": 1.3985699182555964, + "learning_rate": 1.9942910413187445e-05, + "loss": 0.7684, + "step": 514 + }, + { + "epoch": 0.06319018404907975, + "grad_norm": 1.1513310007125044, + "learning_rate": 1.994248557443613e-05, + "loss": 0.6547, + "step": 515 + }, + { + "epoch": 0.06331288343558282, + "grad_norm": 1.3315785208833892, + "learning_rate": 1.99420591653547e-05, + "loss": 0.6456, + "step": 516 + }, + { + "epoch": 0.06343558282208589, + "grad_norm": 1.0798825997018453, + "learning_rate": 1.9941631186010496e-05, + "loss": 0.675, + 
"step": 517 + }, + { + "epoch": 0.06355828220858896, + "grad_norm": 1.8050683665270766, + "learning_rate": 1.9941201636471115e-05, + "loss": 0.5845, + "step": 518 + }, + { + "epoch": 0.06368098159509203, + "grad_norm": 1.1299923521059203, + "learning_rate": 1.9940770516804402e-05, + "loss": 0.6261, + "step": 519 + }, + { + "epoch": 0.0638036809815951, + "grad_norm": 1.627767373402237, + "learning_rate": 1.9940337827078448e-05, + "loss": 0.651, + "step": 520 + }, + { + "epoch": 0.06392638036809815, + "grad_norm": 1.1137513660109326, + "learning_rate": 1.9939903567361594e-05, + "loss": 0.6738, + "step": 521 + }, + { + "epoch": 0.06404907975460122, + "grad_norm": 1.5137193443395478, + "learning_rate": 1.9939467737722428e-05, + "loss": 0.7745, + "step": 522 + }, + { + "epoch": 0.0641717791411043, + "grad_norm": 1.1685892296989806, + "learning_rate": 1.993903033822978e-05, + "loss": 0.5682, + "step": 523 + }, + { + "epoch": 0.06429447852760736, + "grad_norm": 1.1289490806839753, + "learning_rate": 1.993859136895274e-05, + "loss": 0.6782, + "step": 524 + }, + { + "epoch": 0.06441717791411043, + "grad_norm": 1.0612597401766126, + "learning_rate": 1.9938150829960634e-05, + "loss": 0.6728, + "step": 525 + }, + { + "epoch": 0.0645398773006135, + "grad_norm": 1.3923771213477885, + "learning_rate": 1.9937708721323045e-05, + "loss": 0.6736, + "step": 526 + }, + { + "epoch": 0.06466257668711656, + "grad_norm": 1.1690431759581768, + "learning_rate": 1.99372650431098e-05, + "loss": 0.6741, + "step": 527 + }, + { + "epoch": 0.06478527607361963, + "grad_norm": 1.2766903736325588, + "learning_rate": 1.9936819795390975e-05, + "loss": 0.7903, + "step": 528 + }, + { + "epoch": 0.0649079754601227, + "grad_norm": 1.1048996349417697, + "learning_rate": 1.993637297823689e-05, + "loss": 0.6185, + "step": 529 + }, + { + "epoch": 0.06503067484662577, + "grad_norm": 1.202776628481575, + "learning_rate": 1.993592459171812e-05, + "loss": 0.6269, + "step": 530 + }, + { + "epoch": 0.06515337423312884, + "grad_norm": 1.28338491360886, + "learning_rate": 1.9935474635905477e-05, + "loss": 0.8233, + "step": 531 + }, + { + "epoch": 0.0652760736196319, + "grad_norm": 1.2502666744160271, + "learning_rate": 1.9935023110870037e-05, + "loss": 0.7978, + "step": 532 + }, + { + "epoch": 0.06539877300613497, + "grad_norm": 1.145912508684079, + "learning_rate": 1.993457001668311e-05, + "loss": 0.6486, + "step": 533 + }, + { + "epoch": 0.06552147239263803, + "grad_norm": 1.2245694946286443, + "learning_rate": 1.993411535341625e-05, + "loss": 0.7377, + "step": 534 + }, + { + "epoch": 0.0656441717791411, + "grad_norm": 1.2089673468164421, + "learning_rate": 1.9933659121141283e-05, + "loss": 0.665, + "step": 535 + }, + { + "epoch": 0.06576687116564417, + "grad_norm": 1.147969640615268, + "learning_rate": 1.9933201319930257e-05, + "loss": 0.624, + "step": 536 + }, + { + "epoch": 0.06588957055214724, + "grad_norm": 1.2962075092184449, + "learning_rate": 1.9932741949855478e-05, + "loss": 0.6678, + "step": 537 + }, + { + "epoch": 0.06601226993865031, + "grad_norm": 1.2685614128230656, + "learning_rate": 1.9932281010989504e-05, + "loss": 0.7425, + "step": 538 + }, + { + "epoch": 0.06613496932515338, + "grad_norm": 1.1112518843348458, + "learning_rate": 1.9931818503405132e-05, + "loss": 0.635, + "step": 539 + }, + { + "epoch": 0.06625766871165645, + "grad_norm": 1.2120562590191513, + "learning_rate": 1.993135442717541e-05, + "loss": 0.6519, + "step": 540 + }, + { + "epoch": 0.0663803680981595, + "grad_norm": 1.5280585720623934, + "learning_rate": 
1.993088878237364e-05, + "loss": 0.6831, + "step": 541 + }, + { + "epoch": 0.06650306748466257, + "grad_norm": 1.2658345615496098, + "learning_rate": 1.9930421569073365e-05, + "loss": 0.6026, + "step": 542 + }, + { + "epoch": 0.06662576687116564, + "grad_norm": 2.171318103215517, + "learning_rate": 1.9929952787348376e-05, + "loss": 0.6418, + "step": 543 + }, + { + "epoch": 0.06674846625766871, + "grad_norm": 1.1477092452972932, + "learning_rate": 1.9929482437272712e-05, + "loss": 0.6329, + "step": 544 + }, + { + "epoch": 0.06687116564417178, + "grad_norm": 2.290838808458417, + "learning_rate": 1.9929010518920667e-05, + "loss": 0.6209, + "step": 545 + }, + { + "epoch": 0.06699386503067485, + "grad_norm": 1.1619618213493135, + "learning_rate": 1.9928537032366767e-05, + "loss": 0.6644, + "step": 546 + }, + { + "epoch": 0.06711656441717792, + "grad_norm": 1.0912102041067844, + "learning_rate": 1.99280619776858e-05, + "loss": 0.6415, + "step": 547 + }, + { + "epoch": 0.06723926380368098, + "grad_norm": 0.9561822133211202, + "learning_rate": 1.99275853549528e-05, + "loss": 0.6179, + "step": 548 + }, + { + "epoch": 0.06736196319018405, + "grad_norm": 1.0904077665333223, + "learning_rate": 1.992710716424304e-05, + "loss": 0.6572, + "step": 549 + }, + { + "epoch": 0.06748466257668712, + "grad_norm": 1.0335675119495562, + "learning_rate": 1.9926627405632048e-05, + "loss": 0.6227, + "step": 550 + }, + { + "epoch": 0.06760736196319019, + "grad_norm": 2.9508411646204857, + "learning_rate": 1.9926146079195597e-05, + "loss": 0.6788, + "step": 551 + }, + { + "epoch": 0.06773006134969325, + "grad_norm": 1.0805731307763593, + "learning_rate": 1.9925663185009712e-05, + "loss": 0.631, + "step": 552 + }, + { + "epoch": 0.06785276073619632, + "grad_norm": 1.2982095235643278, + "learning_rate": 1.9925178723150655e-05, + "loss": 0.8284, + "step": 553 + }, + { + "epoch": 0.06797546012269938, + "grad_norm": 1.3676459542928454, + "learning_rate": 1.9924692693694953e-05, + "loss": 0.8256, + "step": 554 + }, + { + "epoch": 0.06809815950920245, + "grad_norm": 1.177758478510272, + "learning_rate": 1.992420509671936e-05, + "loss": 0.6171, + "step": 555 + }, + { + "epoch": 0.06822085889570552, + "grad_norm": 1.2399955043844515, + "learning_rate": 1.9923715932300892e-05, + "loss": 0.7071, + "step": 556 + }, + { + "epoch": 0.06834355828220859, + "grad_norm": 1.1644227517444037, + "learning_rate": 1.992322520051681e-05, + "loss": 0.77, + "step": 557 + }, + { + "epoch": 0.06846625766871166, + "grad_norm": 1.1504621670889676, + "learning_rate": 1.9922732901444618e-05, + "loss": 0.6809, + "step": 558 + }, + { + "epoch": 0.06858895705521473, + "grad_norm": 1.2109943649283539, + "learning_rate": 1.992223903516207e-05, + "loss": 0.6609, + "step": 559 + }, + { + "epoch": 0.0687116564417178, + "grad_norm": 1.1173012355231242, + "learning_rate": 1.992174360174717e-05, + "loss": 0.6402, + "step": 560 + }, + { + "epoch": 0.06883435582822085, + "grad_norm": 1.0716959135601285, + "learning_rate": 1.992124660127817e-05, + "loss": 0.636, + "step": 561 + }, + { + "epoch": 0.06895705521472392, + "grad_norm": 1.2157965528715944, + "learning_rate": 1.992074803383356e-05, + "loss": 0.7484, + "step": 562 + }, + { + "epoch": 0.06907975460122699, + "grad_norm": 1.1589662146561455, + "learning_rate": 1.992024789949209e-05, + "loss": 0.6319, + "step": 563 + }, + { + "epoch": 0.06920245398773006, + "grad_norm": 1.157648256393212, + "learning_rate": 1.991974619833275e-05, + "loss": 0.6185, + "step": 564 + }, + { + "epoch": 0.06932515337423313, + 
"grad_norm": 1.4038336270549359, + "learning_rate": 1.991924293043478e-05, + "loss": 0.6429, + "step": 565 + }, + { + "epoch": 0.0694478527607362, + "grad_norm": 1.3661513607842066, + "learning_rate": 1.9918738095877665e-05, + "loss": 0.6567, + "step": 566 + }, + { + "epoch": 0.06957055214723927, + "grad_norm": 1.200710098660526, + "learning_rate": 1.9918231694741143e-05, + "loss": 0.6454, + "step": 567 + }, + { + "epoch": 0.06969325153374233, + "grad_norm": 1.1171463097716259, + "learning_rate": 1.991772372710519e-05, + "loss": 0.6508, + "step": 568 + }, + { + "epoch": 0.0698159509202454, + "grad_norm": 1.0180697903513145, + "learning_rate": 1.991721419305004e-05, + "loss": 0.6131, + "step": 569 + }, + { + "epoch": 0.06993865030674846, + "grad_norm": 1.1877232703222391, + "learning_rate": 1.991670309265617e-05, + "loss": 0.6687, + "step": 570 + }, + { + "epoch": 0.07006134969325153, + "grad_norm": 1.0551596333932554, + "learning_rate": 1.99161904260043e-05, + "loss": 0.5927, + "step": 571 + }, + { + "epoch": 0.0701840490797546, + "grad_norm": 1.171989130430983, + "learning_rate": 1.9915676193175405e-05, + "loss": 0.6946, + "step": 572 + }, + { + "epoch": 0.07030674846625767, + "grad_norm": 1.18977945490261, + "learning_rate": 1.99151603942507e-05, + "loss": 0.6956, + "step": 573 + }, + { + "epoch": 0.07042944785276073, + "grad_norm": 1.1158789288182853, + "learning_rate": 1.991464302931165e-05, + "loss": 0.6665, + "step": 574 + }, + { + "epoch": 0.0705521472392638, + "grad_norm": 1.8111142030152043, + "learning_rate": 1.9914124098439976e-05, + "loss": 0.8823, + "step": 575 + }, + { + "epoch": 0.07067484662576687, + "grad_norm": 1.0623747640863517, + "learning_rate": 1.991360360171763e-05, + "loss": 0.6035, + "step": 576 + }, + { + "epoch": 0.07079754601226994, + "grad_norm": 1.2353176750291437, + "learning_rate": 1.9913081539226828e-05, + "loss": 0.6355, + "step": 577 + }, + { + "epoch": 0.07092024539877301, + "grad_norm": 1.4067592465689889, + "learning_rate": 1.991255791105002e-05, + "loss": 0.6813, + "step": 578 + }, + { + "epoch": 0.07104294478527608, + "grad_norm": 1.2329889574309405, + "learning_rate": 1.9912032717269908e-05, + "loss": 0.738, + "step": 579 + }, + { + "epoch": 0.07116564417177915, + "grad_norm": 1.4213678999897366, + "learning_rate": 1.9911505957969443e-05, + "loss": 0.6641, + "step": 580 + }, + { + "epoch": 0.0712883435582822, + "grad_norm": 1.306268234182823, + "learning_rate": 1.9910977633231826e-05, + "loss": 0.7904, + "step": 581 + }, + { + "epoch": 0.07141104294478527, + "grad_norm": 1.1885503740834351, + "learning_rate": 1.9910447743140494e-05, + "loss": 0.6747, + "step": 582 + }, + { + "epoch": 0.07153374233128834, + "grad_norm": 1.205109199516197, + "learning_rate": 1.9909916287779147e-05, + "loss": 0.68, + "step": 583 + }, + { + "epoch": 0.07165644171779141, + "grad_norm": 1.1003838029745199, + "learning_rate": 1.9909383267231715e-05, + "loss": 0.602, + "step": 584 + }, + { + "epoch": 0.07177914110429448, + "grad_norm": 1.0629575927437203, + "learning_rate": 1.990884868158239e-05, + "loss": 0.6233, + "step": 585 + }, + { + "epoch": 0.07190184049079755, + "grad_norm": 1.2335387819400487, + "learning_rate": 1.9908312530915603e-05, + "loss": 0.7412, + "step": 586 + }, + { + "epoch": 0.07202453987730062, + "grad_norm": 1.0444279189622971, + "learning_rate": 1.9907774815316037e-05, + "loss": 0.649, + "step": 587 + }, + { + "epoch": 0.07214723926380368, + "grad_norm": 1.00670121745591, + "learning_rate": 1.9907235534868618e-05, + "loss": 0.6623, + "step": 588 + 
}, + { + "epoch": 0.07226993865030674, + "grad_norm": 1.2341190696198903, + "learning_rate": 1.9906694689658516e-05, + "loss": 0.6303, + "step": 589 + }, + { + "epoch": 0.07239263803680981, + "grad_norm": 1.1729710531638449, + "learning_rate": 1.9906152279771162e-05, + "loss": 0.7796, + "step": 590 + }, + { + "epoch": 0.07251533742331288, + "grad_norm": 1.1046211131766077, + "learning_rate": 1.990560830529222e-05, + "loss": 0.7367, + "step": 591 + }, + { + "epoch": 0.07263803680981595, + "grad_norm": 1.122991908854124, + "learning_rate": 1.99050627663076e-05, + "loss": 0.6353, + "step": 592 + }, + { + "epoch": 0.07276073619631902, + "grad_norm": 1.1772061330807748, + "learning_rate": 1.990451566290348e-05, + "loss": 0.6244, + "step": 593 + }, + { + "epoch": 0.07288343558282209, + "grad_norm": 1.2000010390050546, + "learning_rate": 1.990396699516625e-05, + "loss": 0.7492, + "step": 594 + }, + { + "epoch": 0.07300613496932515, + "grad_norm": 1.2548243380645923, + "learning_rate": 1.990341676318259e-05, + "loss": 0.6649, + "step": 595 + }, + { + "epoch": 0.07312883435582822, + "grad_norm": 1.1341362795706718, + "learning_rate": 1.990286496703939e-05, + "loss": 0.6576, + "step": 596 + }, + { + "epoch": 0.07325153374233129, + "grad_norm": 1.0549969861721071, + "learning_rate": 1.9902311606823806e-05, + "loss": 0.6601, + "step": 597 + }, + { + "epoch": 0.07337423312883436, + "grad_norm": 1.0255801816240813, + "learning_rate": 1.990175668262323e-05, + "loss": 0.6106, + "step": 598 + }, + { + "epoch": 0.07349693251533743, + "grad_norm": 1.127670592760748, + "learning_rate": 1.990120019452532e-05, + "loss": 0.628, + "step": 599 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 1.2663679714383393, + "learning_rate": 1.9900642142617958e-05, + "loss": 0.6145, + "step": 600 + }, + { + "epoch": 0.07374233128834355, + "grad_norm": 1.2747395407606212, + "learning_rate": 1.9900082526989287e-05, + "loss": 0.8048, + "step": 601 + }, + { + "epoch": 0.07386503067484662, + "grad_norm": 1.1750512093283174, + "learning_rate": 1.989952134772769e-05, + "loss": 0.7679, + "step": 602 + }, + { + "epoch": 0.07398773006134969, + "grad_norm": 1.3621396517124778, + "learning_rate": 1.989895860492181e-05, + "loss": 0.679, + "step": 603 + }, + { + "epoch": 0.07411042944785276, + "grad_norm": 1.076694073821085, + "learning_rate": 1.9898394298660517e-05, + "loss": 0.6318, + "step": 604 + }, + { + "epoch": 0.07423312883435583, + "grad_norm": 1.115665233227293, + "learning_rate": 1.9897828429032946e-05, + "loss": 0.6735, + "step": 605 + }, + { + "epoch": 0.0743558282208589, + "grad_norm": 1.1127261428137356, + "learning_rate": 1.9897260996128463e-05, + "loss": 0.6444, + "step": 606 + }, + { + "epoch": 0.07447852760736197, + "grad_norm": 1.205645755440054, + "learning_rate": 1.98966920000367e-05, + "loss": 0.6744, + "step": 607 + }, + { + "epoch": 0.07460122699386502, + "grad_norm": 1.2393192883198811, + "learning_rate": 1.9896121440847515e-05, + "loss": 0.7536, + "step": 608 + }, + { + "epoch": 0.0747239263803681, + "grad_norm": 1.0078709327689148, + "learning_rate": 1.9895549318651027e-05, + "loss": 0.5869, + "step": 609 + }, + { + "epoch": 0.07484662576687116, + "grad_norm": 1.1884022009658552, + "learning_rate": 1.98949756335376e-05, + "loss": 0.6219, + "step": 610 + }, + { + "epoch": 0.07496932515337423, + "grad_norm": 1.0518590632441425, + "learning_rate": 1.9894400385597835e-05, + "loss": 0.6163, + "step": 611 + }, + { + "epoch": 0.0750920245398773, + "grad_norm": 1.31979001790646, + "learning_rate": 
1.98938235749226e-05, + "loss": 0.5997, + "step": 612 + }, + { + "epoch": 0.07521472392638037, + "grad_norm": 1.3217640225325638, + "learning_rate": 1.9893245201602984e-05, + "loss": 0.7836, + "step": 613 + }, + { + "epoch": 0.07533742331288344, + "grad_norm": 1.2040974420520925, + "learning_rate": 1.9892665265730344e-05, + "loss": 0.6216, + "step": 614 + }, + { + "epoch": 0.0754601226993865, + "grad_norm": 1.0645911918058615, + "learning_rate": 1.9892083767396274e-05, + "loss": 0.5965, + "step": 615 + }, + { + "epoch": 0.07558282208588957, + "grad_norm": 1.25691879790049, + "learning_rate": 1.9891500706692616e-05, + "loss": 0.6737, + "step": 616 + }, + { + "epoch": 0.07570552147239264, + "grad_norm": 1.0505858827776178, + "learning_rate": 1.9890916083711463e-05, + "loss": 0.7728, + "step": 617 + }, + { + "epoch": 0.0758282208588957, + "grad_norm": 1.2118995844311649, + "learning_rate": 1.9890329898545145e-05, + "loss": 0.6189, + "step": 618 + }, + { + "epoch": 0.07595092024539878, + "grad_norm": 1.1418104555999016, + "learning_rate": 1.988974215128625e-05, + "loss": 0.6637, + "step": 619 + }, + { + "epoch": 0.07607361963190185, + "grad_norm": 1.1507674187756796, + "learning_rate": 1.9889152842027607e-05, + "loss": 0.6346, + "step": 620 + }, + { + "epoch": 0.0761963190184049, + "grad_norm": 1.1843431818006922, + "learning_rate": 1.9888561970862284e-05, + "loss": 0.6078, + "step": 621 + }, + { + "epoch": 0.07631901840490797, + "grad_norm": 1.2975142748400268, + "learning_rate": 1.988796953788362e-05, + "loss": 0.7176, + "step": 622 + }, + { + "epoch": 0.07644171779141104, + "grad_norm": 1.1953083550443355, + "learning_rate": 1.988737554318517e-05, + "loss": 0.742, + "step": 623 + }, + { + "epoch": 0.07656441717791411, + "grad_norm": 1.080491522291197, + "learning_rate": 1.988677998686076e-05, + "loss": 0.6669, + "step": 624 + }, + { + "epoch": 0.07668711656441718, + "grad_norm": 1.0623244659230207, + "learning_rate": 1.9886182869004447e-05, + "loss": 0.6449, + "step": 625 + }, + { + "epoch": 0.07680981595092025, + "grad_norm": 1.2248155219115429, + "learning_rate": 1.9885584189710546e-05, + "loss": 0.7589, + "step": 626 + }, + { + "epoch": 0.07693251533742332, + "grad_norm": 1.1954945032306903, + "learning_rate": 1.9884983949073607e-05, + "loss": 0.6954, + "step": 627 + }, + { + "epoch": 0.07705521472392637, + "grad_norm": 1.1270448266842967, + "learning_rate": 1.9884382147188437e-05, + "loss": 0.6492, + "step": 628 + }, + { + "epoch": 0.07717791411042944, + "grad_norm": 1.2369724709748884, + "learning_rate": 1.9883778784150083e-05, + "loss": 0.7724, + "step": 629 + }, + { + "epoch": 0.07730061349693251, + "grad_norm": 1.0429981052737922, + "learning_rate": 1.9883173860053845e-05, + "loss": 0.6483, + "step": 630 + }, + { + "epoch": 0.07742331288343558, + "grad_norm": 1.3955423741087463, + "learning_rate": 1.988256737499526e-05, + "loss": 0.6679, + "step": 631 + }, + { + "epoch": 0.07754601226993865, + "grad_norm": 1.159425931797785, + "learning_rate": 1.9881959329070123e-05, + "loss": 0.6364, + "step": 632 + }, + { + "epoch": 0.07766871165644172, + "grad_norm": 1.093501612315165, + "learning_rate": 1.9881349722374464e-05, + "loss": 0.6399, + "step": 633 + }, + { + "epoch": 0.07779141104294479, + "grad_norm": 1.2101306375714935, + "learning_rate": 1.9880738555004567e-05, + "loss": 0.6275, + "step": 634 + }, + { + "epoch": 0.07791411042944785, + "grad_norm": 1.1988247588752627, + "learning_rate": 1.9880125827056967e-05, + "loss": 0.6478, + "step": 635 + }, + { + "epoch": 0.07803680981595092, 
+ "grad_norm": 1.1682223160167204, + "learning_rate": 1.987951153862843e-05, + "loss": 0.6661, + "step": 636 + }, + { + "epoch": 0.07815950920245399, + "grad_norm": 1.1068019076268, + "learning_rate": 1.987889568981598e-05, + "loss": 0.6504, + "step": 637 + }, + { + "epoch": 0.07828220858895706, + "grad_norm": 1.290598417158105, + "learning_rate": 1.9878278280716885e-05, + "loss": 0.6466, + "step": 638 + }, + { + "epoch": 0.07840490797546013, + "grad_norm": 1.161783577480427, + "learning_rate": 1.9877659311428667e-05, + "loss": 0.6812, + "step": 639 + }, + { + "epoch": 0.0785276073619632, + "grad_norm": 1.2034110751929346, + "learning_rate": 1.9877038782049074e-05, + "loss": 0.6094, + "step": 640 + }, + { + "epoch": 0.07865030674846626, + "grad_norm": 1.309416569707653, + "learning_rate": 1.9876416692676123e-05, + "loss": 0.6633, + "step": 641 + }, + { + "epoch": 0.07877300613496932, + "grad_norm": 1.0747318868811806, + "learning_rate": 1.987579304340806e-05, + "loss": 0.6428, + "step": 642 + }, + { + "epoch": 0.07889570552147239, + "grad_norm": 1.1855358337956452, + "learning_rate": 1.9875167834343393e-05, + "loss": 0.6719, + "step": 643 + }, + { + "epoch": 0.07901840490797546, + "grad_norm": 1.0413211500755328, + "learning_rate": 1.9874541065580865e-05, + "loss": 0.6944, + "step": 644 + }, + { + "epoch": 0.07914110429447853, + "grad_norm": 1.0615672262402465, + "learning_rate": 1.9873912737219468e-05, + "loss": 0.6033, + "step": 645 + }, + { + "epoch": 0.0792638036809816, + "grad_norm": 1.2533318031622531, + "learning_rate": 1.9873282849358445e-05, + "loss": 0.6228, + "step": 646 + }, + { + "epoch": 0.07938650306748467, + "grad_norm": 1.2307544455077535, + "learning_rate": 1.9872651402097273e-05, + "loss": 0.6551, + "step": 647 + }, + { + "epoch": 0.07950920245398772, + "grad_norm": 1.1744346547511426, + "learning_rate": 1.9872018395535694e-05, + "loss": 0.6375, + "step": 648 + }, + { + "epoch": 0.07963190184049079, + "grad_norm": 1.041941164316595, + "learning_rate": 1.9871383829773676e-05, + "loss": 0.6137, + "step": 649 + }, + { + "epoch": 0.07975460122699386, + "grad_norm": 1.223989026177442, + "learning_rate": 1.987074770491145e-05, + "loss": 0.673, + "step": 650 + }, + { + "epoch": 0.07987730061349693, + "grad_norm": 1.1006314252271012, + "learning_rate": 1.9870110021049482e-05, + "loss": 0.5929, + "step": 651 + }, + { + "epoch": 0.08, + "grad_norm": 1.067013786424502, + "learning_rate": 1.9869470778288493e-05, + "loss": 0.625, + "step": 652 + }, + { + "epoch": 0.08012269938650307, + "grad_norm": 1.037211818498716, + "learning_rate": 1.9868829976729444e-05, + "loss": 0.6395, + "step": 653 + }, + { + "epoch": 0.08024539877300614, + "grad_norm": 1.1294655394315973, + "learning_rate": 1.9868187616473542e-05, + "loss": 0.6707, + "step": 654 + }, + { + "epoch": 0.0803680981595092, + "grad_norm": 1.1118399222802793, + "learning_rate": 1.9867543697622248e-05, + "loss": 0.6089, + "step": 655 + }, + { + "epoch": 0.08049079754601227, + "grad_norm": 1.7767600637587786, + "learning_rate": 1.9866898220277256e-05, + "loss": 0.6581, + "step": 656 + }, + { + "epoch": 0.08061349693251534, + "grad_norm": 0.972801994891119, + "learning_rate": 1.986625118454052e-05, + "loss": 0.6602, + "step": 657 + }, + { + "epoch": 0.0807361963190184, + "grad_norm": 1.141144457588404, + "learning_rate": 1.986560259051423e-05, + "loss": 0.6381, + "step": 658 + }, + { + "epoch": 0.08085889570552147, + "grad_norm": 1.1334283111168266, + "learning_rate": 1.9864952438300826e-05, + "loss": 0.6498, + "step": 659 + }, + { + 
"epoch": 0.08098159509202454, + "grad_norm": 1.255610805902631, + "learning_rate": 1.9864300728002997e-05, + "loss": 0.8051, + "step": 660 + }, + { + "epoch": 0.08110429447852761, + "grad_norm": 1.313334756494041, + "learning_rate": 1.9863647459723672e-05, + "loss": 0.7689, + "step": 661 + }, + { + "epoch": 0.08122699386503067, + "grad_norm": 1.072973513205862, + "learning_rate": 1.9862992633566026e-05, + "loss": 0.7404, + "step": 662 + }, + { + "epoch": 0.08134969325153374, + "grad_norm": 1.1781686458169291, + "learning_rate": 1.986233624963349e-05, + "loss": 0.6718, + "step": 663 + }, + { + "epoch": 0.08147239263803681, + "grad_norm": 1.176993840288416, + "learning_rate": 1.9861678308029735e-05, + "loss": 0.5885, + "step": 664 + }, + { + "epoch": 0.08159509202453988, + "grad_norm": 1.38337056222181, + "learning_rate": 1.986101880885867e-05, + "loss": 0.7993, + "step": 665 + }, + { + "epoch": 0.08171779141104295, + "grad_norm": 1.0809115123598076, + "learning_rate": 1.986035775222446e-05, + "loss": 0.6377, + "step": 666 + }, + { + "epoch": 0.08184049079754602, + "grad_norm": 1.3561604752279333, + "learning_rate": 1.9859695138231517e-05, + "loss": 0.685, + "step": 667 + }, + { + "epoch": 0.08196319018404907, + "grad_norm": 1.2160082913993393, + "learning_rate": 1.985903096698449e-05, + "loss": 0.6536, + "step": 668 + }, + { + "epoch": 0.08208588957055214, + "grad_norm": 1.2345290914508356, + "learning_rate": 1.9858365238588284e-05, + "loss": 0.6748, + "step": 669 + }, + { + "epoch": 0.08220858895705521, + "grad_norm": 1.1234438147389587, + "learning_rate": 1.985769795314804e-05, + "loss": 0.6467, + "step": 670 + }, + { + "epoch": 0.08233128834355828, + "grad_norm": 1.043781018275884, + "learning_rate": 1.9857029110769154e-05, + "loss": 0.6004, + "step": 671 + }, + { + "epoch": 0.08245398773006135, + "grad_norm": 1.1116199671941123, + "learning_rate": 1.985635871155726e-05, + "loss": 0.6177, + "step": 672 + }, + { + "epoch": 0.08257668711656442, + "grad_norm": 1.0752324951488932, + "learning_rate": 1.9855686755618243e-05, + "loss": 0.5864, + "step": 673 + }, + { + "epoch": 0.08269938650306749, + "grad_norm": 1.1124139718327017, + "learning_rate": 1.985501324305824e-05, + "loss": 0.5855, + "step": 674 + }, + { + "epoch": 0.08282208588957055, + "grad_norm": 1.0891580417541138, + "learning_rate": 1.9854338173983615e-05, + "loss": 0.6105, + "step": 675 + }, + { + "epoch": 0.08294478527607362, + "grad_norm": 1.1266507469135232, + "learning_rate": 1.9853661548500995e-05, + "loss": 0.6922, + "step": 676 + }, + { + "epoch": 0.08306748466257668, + "grad_norm": 1.1419427043685593, + "learning_rate": 1.9852983366717248e-05, + "loss": 0.5945, + "step": 677 + }, + { + "epoch": 0.08319018404907975, + "grad_norm": 1.4120367040628983, + "learning_rate": 1.9852303628739483e-05, + "loss": 0.7603, + "step": 678 + }, + { + "epoch": 0.08331288343558282, + "grad_norm": 1.2472550793103345, + "learning_rate": 1.9851622334675065e-05, + "loss": 0.6323, + "step": 679 + }, + { + "epoch": 0.0834355828220859, + "grad_norm": 1.1282915688726083, + "learning_rate": 1.9850939484631598e-05, + "loss": 0.6275, + "step": 680 + }, + { + "epoch": 0.08355828220858896, + "grad_norm": 1.4485516178959807, + "learning_rate": 1.9850255078716922e-05, + "loss": 0.6298, + "step": 681 + }, + { + "epoch": 0.08368098159509202, + "grad_norm": 1.092082761770064, + "learning_rate": 1.9849569117039146e-05, + "loss": 0.6101, + "step": 682 + }, + { + "epoch": 0.08380368098159509, + "grad_norm": 1.0232593349467711, + "learning_rate": 
1.9848881599706605e-05, + "loss": 0.6144, + "step": 683 + }, + { + "epoch": 0.08392638036809816, + "grad_norm": 1.2718341037191174, + "learning_rate": 1.9848192526827885e-05, + "loss": 0.6691, + "step": 684 + }, + { + "epoch": 0.08404907975460123, + "grad_norm": 1.1037945221121939, + "learning_rate": 1.9847501898511824e-05, + "loss": 0.6981, + "step": 685 + }, + { + "epoch": 0.0841717791411043, + "grad_norm": 1.0295784485880957, + "learning_rate": 1.98468097148675e-05, + "loss": 0.6631, + "step": 686 + }, + { + "epoch": 0.08429447852760737, + "grad_norm": 1.8856867774047608, + "learning_rate": 1.9846115976004234e-05, + "loss": 0.5808, + "step": 687 + }, + { + "epoch": 0.08441717791411044, + "grad_norm": 1.0153164649848092, + "learning_rate": 1.98454206820316e-05, + "loss": 0.6743, + "step": 688 + }, + { + "epoch": 0.08453987730061349, + "grad_norm": 1.103788074423574, + "learning_rate": 1.984472383305941e-05, + "loss": 0.5955, + "step": 689 + }, + { + "epoch": 0.08466257668711656, + "grad_norm": 1.2630057503324683, + "learning_rate": 1.9844025429197727e-05, + "loss": 0.736, + "step": 690 + }, + { + "epoch": 0.08478527607361963, + "grad_norm": 1.1176522203953991, + "learning_rate": 1.9843325470556857e-05, + "loss": 0.732, + "step": 691 + }, + { + "epoch": 0.0849079754601227, + "grad_norm": 1.175218884872308, + "learning_rate": 1.9842623957247355e-05, + "loss": 0.6456, + "step": 692 + }, + { + "epoch": 0.08503067484662577, + "grad_norm": 1.0614183850378553, + "learning_rate": 1.9841920889380016e-05, + "loss": 0.6242, + "step": 693 + }, + { + "epoch": 0.08515337423312884, + "grad_norm": 1.2146738337293441, + "learning_rate": 1.984121626706589e-05, + "loss": 0.6675, + "step": 694 + }, + { + "epoch": 0.0852760736196319, + "grad_norm": 1.2809424902376711, + "learning_rate": 1.984051009041626e-05, + "loss": 0.7679, + "step": 695 + }, + { + "epoch": 0.08539877300613496, + "grad_norm": 0.9253723863853289, + "learning_rate": 1.9839802359542655e-05, + "loss": 0.6118, + "step": 696 + }, + { + "epoch": 0.08552147239263803, + "grad_norm": 1.081046526361807, + "learning_rate": 1.9839093074556865e-05, + "loss": 0.614, + "step": 697 + }, + { + "epoch": 0.0856441717791411, + "grad_norm": 1.1768269015439197, + "learning_rate": 1.9838382235570915e-05, + "loss": 0.6677, + "step": 698 + }, + { + "epoch": 0.08576687116564417, + "grad_norm": 1.1756541295238998, + "learning_rate": 1.983766984269707e-05, + "loss": 0.6408, + "step": 699 + }, + { + "epoch": 0.08588957055214724, + "grad_norm": 1.2498133762324315, + "learning_rate": 1.983695589604785e-05, + "loss": 0.6613, + "step": 700 + }, + { + "epoch": 0.08601226993865031, + "grad_norm": 1.0885120089025948, + "learning_rate": 1.9836240395736017e-05, + "loss": 0.6379, + "step": 701 + }, + { + "epoch": 0.08613496932515337, + "grad_norm": 0.9857847523450394, + "learning_rate": 1.9835523341874572e-05, + "loss": 0.5861, + "step": 702 + }, + { + "epoch": 0.08625766871165644, + "grad_norm": 1.0385859746617776, + "learning_rate": 1.983480473457678e-05, + "loss": 0.656, + "step": 703 + }, + { + "epoch": 0.08638036809815951, + "grad_norm": 1.083650392233887, + "learning_rate": 1.983408457395613e-05, + "loss": 0.5998, + "step": 704 + }, + { + "epoch": 0.08650306748466258, + "grad_norm": 1.1095226718737212, + "learning_rate": 1.9833362860126364e-05, + "loss": 0.6817, + "step": 705 + }, + { + "epoch": 0.08662576687116565, + "grad_norm": 1.0033300518223414, + "learning_rate": 1.9832639593201473e-05, + "loss": 0.6193, + "step": 706 + }, + { + "epoch": 0.08674846625766872, + 
"grad_norm": 1.1501270930235732, + "learning_rate": 1.983191477329569e-05, + "loss": 0.6654, + "step": 707 + }, + { + "epoch": 0.08687116564417179, + "grad_norm": 1.354302561208966, + "learning_rate": 1.9831188400523498e-05, + "loss": 0.6565, + "step": 708 + }, + { + "epoch": 0.08699386503067484, + "grad_norm": 1.1062956036487055, + "learning_rate": 1.9830460474999617e-05, + "loss": 0.6708, + "step": 709 + }, + { + "epoch": 0.08711656441717791, + "grad_norm": 1.4400918196670616, + "learning_rate": 1.982973099683902e-05, + "loss": 0.7626, + "step": 710 + }, + { + "epoch": 0.08723926380368098, + "grad_norm": 2.024775960181904, + "learning_rate": 1.9828999966156917e-05, + "loss": 0.6752, + "step": 711 + }, + { + "epoch": 0.08736196319018405, + "grad_norm": 1.0274215788986, + "learning_rate": 1.9828267383068774e-05, + "loss": 0.6096, + "step": 712 + }, + { + "epoch": 0.08748466257668712, + "grad_norm": 1.1852040197831188, + "learning_rate": 1.982753324769029e-05, + "loss": 0.7015, + "step": 713 + }, + { + "epoch": 0.08760736196319019, + "grad_norm": 1.3282385740684952, + "learning_rate": 1.9826797560137416e-05, + "loss": 0.7221, + "step": 714 + }, + { + "epoch": 0.08773006134969324, + "grad_norm": 1.0488444961496666, + "learning_rate": 1.9826060320526355e-05, + "loss": 0.6384, + "step": 715 + }, + { + "epoch": 0.08785276073619631, + "grad_norm": 1.178507840574899, + "learning_rate": 1.982532152897354e-05, + "loss": 0.6275, + "step": 716 + }, + { + "epoch": 0.08797546012269938, + "grad_norm": 1.1665863184010297, + "learning_rate": 1.982458118559566e-05, + "loss": 0.6685, + "step": 717 + }, + { + "epoch": 0.08809815950920245, + "grad_norm": 1.123810155004092, + "learning_rate": 1.9823839290509643e-05, + "loss": 0.6169, + "step": 718 + }, + { + "epoch": 0.08822085889570552, + "grad_norm": 1.0402067213241832, + "learning_rate": 1.982309584383267e-05, + "loss": 0.6324, + "step": 719 + }, + { + "epoch": 0.08834355828220859, + "grad_norm": 1.1518528788630813, + "learning_rate": 1.982235084568216e-05, + "loss": 0.6027, + "step": 720 + }, + { + "epoch": 0.08846625766871166, + "grad_norm": 1.1793523925588412, + "learning_rate": 1.9821604296175774e-05, + "loss": 0.6381, + "step": 721 + }, + { + "epoch": 0.08858895705521472, + "grad_norm": 1.157043626207937, + "learning_rate": 1.9820856195431428e-05, + "loss": 0.6221, + "step": 722 + }, + { + "epoch": 0.08871165644171779, + "grad_norm": 1.124002314415546, + "learning_rate": 1.982010654356728e-05, + "loss": 0.652, + "step": 723 + }, + { + "epoch": 0.08883435582822086, + "grad_norm": 1.4257010611536935, + "learning_rate": 1.9819355340701726e-05, + "loss": 0.7455, + "step": 724 + }, + { + "epoch": 0.08895705521472393, + "grad_norm": 1.2171968172918597, + "learning_rate": 1.9818602586953414e-05, + "loss": 0.6769, + "step": 725 + }, + { + "epoch": 0.089079754601227, + "grad_norm": 1.2565280962327987, + "learning_rate": 1.981784828244124e-05, + "loss": 0.7586, + "step": 726 + }, + { + "epoch": 0.08920245398773007, + "grad_norm": 1.4079162476303497, + "learning_rate": 1.981709242728433e-05, + "loss": 0.6762, + "step": 727 + }, + { + "epoch": 0.08932515337423313, + "grad_norm": 1.2123311798079834, + "learning_rate": 1.9816335021602072e-05, + "loss": 0.723, + "step": 728 + }, + { + "epoch": 0.08944785276073619, + "grad_norm": 1.0604295155172536, + "learning_rate": 1.981557606551409e-05, + "loss": 0.6209, + "step": 729 + }, + { + "epoch": 0.08957055214723926, + "grad_norm": 0.993978214786388, + "learning_rate": 1.9814815559140258e-05, + "loss": 0.6491, + "step": 
730 + }, + { + "epoch": 0.08969325153374233, + "grad_norm": 1.1846228814419693, + "learning_rate": 1.9814053502600683e-05, + "loss": 0.6075, + "step": 731 + }, + { + "epoch": 0.0898159509202454, + "grad_norm": 1.2995392233373702, + "learning_rate": 1.981328989601573e-05, + "loss": 0.7269, + "step": 732 + }, + { + "epoch": 0.08993865030674847, + "grad_norm": 1.0669512669328174, + "learning_rate": 1.9812524739506007e-05, + "loss": 0.6527, + "step": 733 + }, + { + "epoch": 0.09006134969325154, + "grad_norm": 1.2285989920020148, + "learning_rate": 1.981175803319236e-05, + "loss": 0.6199, + "step": 734 + }, + { + "epoch": 0.09018404907975461, + "grad_norm": 1.1944496908176792, + "learning_rate": 1.9810989777195884e-05, + "loss": 0.7439, + "step": 735 + }, + { + "epoch": 0.09030674846625766, + "grad_norm": 1.0542874574532475, + "learning_rate": 1.981021997163792e-05, + "loss": 0.6139, + "step": 736 + }, + { + "epoch": 0.09042944785276073, + "grad_norm": 1.1234412828550608, + "learning_rate": 1.9809448616640052e-05, + "loss": 0.6349, + "step": 737 + }, + { + "epoch": 0.0905521472392638, + "grad_norm": 0.9980226684002773, + "learning_rate": 1.9808675712324108e-05, + "loss": 0.6499, + "step": 738 + }, + { + "epoch": 0.09067484662576687, + "grad_norm": 1.2056342891024319, + "learning_rate": 1.980790125881216e-05, + "loss": 0.6285, + "step": 739 + }, + { + "epoch": 0.09079754601226994, + "grad_norm": 1.0281889690050787, + "learning_rate": 1.9807125256226532e-05, + "loss": 0.6951, + "step": 740 + }, + { + "epoch": 0.09092024539877301, + "grad_norm": 1.1044984518182375, + "learning_rate": 1.9806347704689778e-05, + "loss": 0.6482, + "step": 741 + }, + { + "epoch": 0.09104294478527607, + "grad_norm": 1.192912507938953, + "learning_rate": 1.9805568604324717e-05, + "loss": 0.7449, + "step": 742 + }, + { + "epoch": 0.09116564417177914, + "grad_norm": 1.1232831403650898, + "learning_rate": 1.980478795525439e-05, + "loss": 0.6725, + "step": 743 + }, + { + "epoch": 0.0912883435582822, + "grad_norm": 1.3134219304358026, + "learning_rate": 1.98040057576021e-05, + "loss": 0.6531, + "step": 744 + }, + { + "epoch": 0.09141104294478528, + "grad_norm": 2.7041267610501643, + "learning_rate": 1.9803222011491385e-05, + "loss": 0.6458, + "step": 745 + }, + { + "epoch": 0.09153374233128835, + "grad_norm": 1.0317616750675407, + "learning_rate": 1.980243671704604e-05, + "loss": 0.6196, + "step": 746 + }, + { + "epoch": 0.09165644171779141, + "grad_norm": 1.2317393688697482, + "learning_rate": 1.980164987439008e-05, + "loss": 0.6236, + "step": 747 + }, + { + "epoch": 0.09177914110429448, + "grad_norm": 1.1754414192203217, + "learning_rate": 1.9800861483647792e-05, + "loss": 0.6358, + "step": 748 + }, + { + "epoch": 0.09190184049079754, + "grad_norm": 1.236944512357945, + "learning_rate": 1.9800071544943696e-05, + "loss": 0.7013, + "step": 749 + }, + { + "epoch": 0.09202453987730061, + "grad_norm": 1.2247296671807844, + "learning_rate": 1.979928005840255e-05, + "loss": 0.76, + "step": 750 + }, + { + "epoch": 0.09214723926380368, + "grad_norm": 1.0696129028259742, + "learning_rate": 1.979848702414936e-05, + "loss": 0.5887, + "step": 751 + }, + { + "epoch": 0.09226993865030675, + "grad_norm": 1.1493054824733442, + "learning_rate": 1.9797692442309387e-05, + "loss": 0.6005, + "step": 752 + }, + { + "epoch": 0.09239263803680982, + "grad_norm": 1.2135278482733294, + "learning_rate": 1.9796896313008126e-05, + "loss": 0.7313, + "step": 753 + }, + { + "epoch": 0.09251533742331289, + "grad_norm": 1.0942024376215491, + "learning_rate": 
1.9796098636371316e-05, + "loss": 0.6875, + "step": 754 + }, + { + "epoch": 0.09263803680981596, + "grad_norm": 1.298194450120287, + "learning_rate": 1.9795299412524948e-05, + "loss": 0.6807, + "step": 755 + }, + { + "epoch": 0.09276073619631901, + "grad_norm": 1.1081820211426434, + "learning_rate": 1.9794498641595246e-05, + "loss": 0.7121, + "step": 756 + }, + { + "epoch": 0.09288343558282208, + "grad_norm": 1.1310259664260507, + "learning_rate": 1.979369632370869e-05, + "loss": 0.6762, + "step": 757 + }, + { + "epoch": 0.09300613496932515, + "grad_norm": 1.0849402916330555, + "learning_rate": 1.9792892458991995e-05, + "loss": 0.6807, + "step": 758 + }, + { + "epoch": 0.09312883435582822, + "grad_norm": 1.2089319427907266, + "learning_rate": 1.979208704757213e-05, + "loss": 0.644, + "step": 759 + }, + { + "epoch": 0.09325153374233129, + "grad_norm": 1.1450630088373384, + "learning_rate": 1.9791280089576302e-05, + "loss": 0.6536, + "step": 760 + }, + { + "epoch": 0.09337423312883436, + "grad_norm": 1.2051503470717289, + "learning_rate": 1.9790471585131956e-05, + "loss": 0.6521, + "step": 761 + }, + { + "epoch": 0.09349693251533742, + "grad_norm": 1.0686289884638487, + "learning_rate": 1.9789661534366796e-05, + "loss": 0.6267, + "step": 762 + }, + { + "epoch": 0.09361963190184049, + "grad_norm": 1.6177693298664084, + "learning_rate": 1.9788849937408757e-05, + "loss": 0.7396, + "step": 763 + }, + { + "epoch": 0.09374233128834356, + "grad_norm": 1.2992469495018277, + "learning_rate": 1.978803679438603e-05, + "loss": 0.6526, + "step": 764 + }, + { + "epoch": 0.09386503067484663, + "grad_norm": 1.1978780172832706, + "learning_rate": 1.978722210542704e-05, + "loss": 0.6801, + "step": 765 + }, + { + "epoch": 0.0939877300613497, + "grad_norm": 1.1492289323915885, + "learning_rate": 1.9786405870660458e-05, + "loss": 0.6363, + "step": 766 + }, + { + "epoch": 0.09411042944785276, + "grad_norm": 1.1316358101793982, + "learning_rate": 1.9785588090215205e-05, + "loss": 0.611, + "step": 767 + }, + { + "epoch": 0.09423312883435583, + "grad_norm": 1.224276880554046, + "learning_rate": 1.9784768764220447e-05, + "loss": 0.6769, + "step": 768 + }, + { + "epoch": 0.09435582822085889, + "grad_norm": 1.1988906747298913, + "learning_rate": 1.978394789280558e-05, + "loss": 0.6812, + "step": 769 + }, + { + "epoch": 0.09447852760736196, + "grad_norm": 1.1185941320093085, + "learning_rate": 1.9783125476100254e-05, + "loss": 0.6239, + "step": 770 + }, + { + "epoch": 0.09460122699386503, + "grad_norm": 1.2172054610069902, + "learning_rate": 1.9782301514234368e-05, + "loss": 0.6397, + "step": 771 + }, + { + "epoch": 0.0947239263803681, + "grad_norm": 1.1644196865394292, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.5942, + "step": 772 + }, + { + "epoch": 0.09484662576687117, + "grad_norm": 1.1520279807161358, + "learning_rate": 1.9780648955541706e-05, + "loss": 0.6007, + "step": 773 + }, + { + "epoch": 0.09496932515337424, + "grad_norm": 1.522173278338626, + "learning_rate": 1.9779820358975936e-05, + "loss": 0.7616, + "step": 774 + }, + { + "epoch": 0.0950920245398773, + "grad_norm": 1.0591937353968115, + "learning_rate": 1.977899021777162e-05, + "loss": 0.6487, + "step": 775 + }, + { + "epoch": 0.09521472392638036, + "grad_norm": 1.0832719391359693, + "learning_rate": 1.977815853205987e-05, + "loss": 0.6833, + "step": 776 + }, + { + "epoch": 0.09533742331288343, + "grad_norm": 0.9963947027915574, + "learning_rate": 1.9777325301972046e-05, + "loss": 0.6003, + "step": 777 + }, + { + "epoch": 0.0954601226993865, 
+ "grad_norm": 0.9887013642996605, + "learning_rate": 1.9776490527639746e-05, + "loss": 0.6537, + "step": 778 + }, + { + "epoch": 0.09558282208588957, + "grad_norm": 1.0619244932153724, + "learning_rate": 1.9775654209194816e-05, + "loss": 0.6703, + "step": 779 + }, + { + "epoch": 0.09570552147239264, + "grad_norm": 1.267790127114491, + "learning_rate": 1.977481634676935e-05, + "loss": 0.7486, + "step": 780 + }, + { + "epoch": 0.09582822085889571, + "grad_norm": 1.0102476071879238, + "learning_rate": 1.9773976940495672e-05, + "loss": 0.6154, + "step": 781 + }, + { + "epoch": 0.09595092024539878, + "grad_norm": 1.1496282615003854, + "learning_rate": 1.977313599050637e-05, + "loss": 0.6277, + "step": 782 + }, + { + "epoch": 0.09607361963190184, + "grad_norm": 1.1297527728062633, + "learning_rate": 1.977229349693426e-05, + "loss": 0.6301, + "step": 783 + }, + { + "epoch": 0.0961963190184049, + "grad_norm": 1.1437984346758099, + "learning_rate": 1.97714494599124e-05, + "loss": 0.6352, + "step": 784 + }, + { + "epoch": 0.09631901840490797, + "grad_norm": 1.1082726588783707, + "learning_rate": 1.9770603879574108e-05, + "loss": 0.6438, + "step": 785 + }, + { + "epoch": 0.09644171779141104, + "grad_norm": 1.1428992967407385, + "learning_rate": 1.9769756756052933e-05, + "loss": 0.7292, + "step": 786 + }, + { + "epoch": 0.09656441717791411, + "grad_norm": 0.9390076713400252, + "learning_rate": 1.976890808948267e-05, + "loss": 0.6228, + "step": 787 + }, + { + "epoch": 0.09668711656441718, + "grad_norm": 1.2843113428710051, + "learning_rate": 1.976805787999736e-05, + "loss": 0.7733, + "step": 788 + }, + { + "epoch": 0.09680981595092024, + "grad_norm": 1.4005558570359518, + "learning_rate": 1.9767206127731283e-05, + "loss": 0.6679, + "step": 789 + }, + { + "epoch": 0.09693251533742331, + "grad_norm": 1.0455192541926799, + "learning_rate": 1.9766352832818972e-05, + "loss": 0.5973, + "step": 790 + }, + { + "epoch": 0.09705521472392638, + "grad_norm": 0.9675497571888397, + "learning_rate": 1.976549799539519e-05, + "loss": 0.6121, + "step": 791 + }, + { + "epoch": 0.09717791411042945, + "grad_norm": 1.1403204437439298, + "learning_rate": 1.9764641615594956e-05, + "loss": 0.6518, + "step": 792 + }, + { + "epoch": 0.09730061349693252, + "grad_norm": 1.3891507220395114, + "learning_rate": 1.9763783693553527e-05, + "loss": 0.618, + "step": 793 + }, + { + "epoch": 0.09742331288343559, + "grad_norm": 1.2979902327992203, + "learning_rate": 1.9762924229406405e-05, + "loss": 0.6152, + "step": 794 + }, + { + "epoch": 0.09754601226993866, + "grad_norm": 1.0386638636167143, + "learning_rate": 1.9762063223289334e-05, + "loss": 0.6113, + "step": 795 + }, + { + "epoch": 0.09766871165644171, + "grad_norm": 1.3789293189124603, + "learning_rate": 1.97612006753383e-05, + "loss": 0.7785, + "step": 796 + }, + { + "epoch": 0.09779141104294478, + "grad_norm": 1.1631502210016023, + "learning_rate": 1.9760336585689544e-05, + "loss": 0.7663, + "step": 797 + }, + { + "epoch": 0.09791411042944785, + "grad_norm": 1.1799120635495894, + "learning_rate": 1.975947095447953e-05, + "loss": 0.6547, + "step": 798 + }, + { + "epoch": 0.09803680981595092, + "grad_norm": 1.295995255902854, + "learning_rate": 1.9758603781844986e-05, + "loss": 0.6263, + "step": 799 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 1.205592730969111, + "learning_rate": 1.975773506792287e-05, + "loss": 0.6269, + "step": 800 + }, + { + "epoch": 0.09828220858895706, + "grad_norm": 1.261790354777458, + "learning_rate": 1.9756864812850386e-05, + "loss": 0.6101, + 
"step": 801 + }, + { + "epoch": 0.09840490797546013, + "grad_norm": 1.0779039851245453, + "learning_rate": 1.975599301676499e-05, + "loss": 0.5892, + "step": 802 + }, + { + "epoch": 0.09852760736196318, + "grad_norm": 1.1337320852196797, + "learning_rate": 1.975511967980437e-05, + "loss": 0.6322, + "step": 803 + }, + { + "epoch": 0.09865030674846625, + "grad_norm": 1.1919499988680744, + "learning_rate": 1.975424480210646e-05, + "loss": 0.643, + "step": 804 + }, + { + "epoch": 0.09877300613496932, + "grad_norm": 1.4115201467028355, + "learning_rate": 1.9753368383809445e-05, + "loss": 0.7281, + "step": 805 + }, + { + "epoch": 0.0988957055214724, + "grad_norm": 1.1466052799439244, + "learning_rate": 1.975249042505174e-05, + "loss": 0.6746, + "step": 806 + }, + { + "epoch": 0.09901840490797546, + "grad_norm": 1.0932139144345898, + "learning_rate": 1.9751610925972023e-05, + "loss": 0.6283, + "step": 807 + }, + { + "epoch": 0.09914110429447853, + "grad_norm": 1.1124222485323796, + "learning_rate": 1.9750729886709193e-05, + "loss": 0.6327, + "step": 808 + }, + { + "epoch": 0.09926380368098159, + "grad_norm": 1.3589206017285438, + "learning_rate": 1.9749847307402406e-05, + "loss": 0.7335, + "step": 809 + }, + { + "epoch": 0.09938650306748466, + "grad_norm": 1.1398090423448177, + "learning_rate": 1.974896318819106e-05, + "loss": 0.649, + "step": 810 + }, + { + "epoch": 0.09950920245398773, + "grad_norm": 1.0767558571257312, + "learning_rate": 1.9748077529214786e-05, + "loss": 0.6079, + "step": 811 + }, + { + "epoch": 0.0996319018404908, + "grad_norm": 1.0921143037038858, + "learning_rate": 1.9747190330613475e-05, + "loss": 0.631, + "step": 812 + }, + { + "epoch": 0.09975460122699387, + "grad_norm": 1.138371767639768, + "learning_rate": 1.9746301592527252e-05, + "loss": 0.6582, + "step": 813 + }, + { + "epoch": 0.09987730061349694, + "grad_norm": 1.1256424921147163, + "learning_rate": 1.974541131509648e-05, + "loss": 0.6842, + "step": 814 + }, + { + "epoch": 0.1, + "grad_norm": 1.1292749210878186, + "learning_rate": 1.974451949846177e-05, + "loss": 0.6281, + "step": 815 + }, + { + "epoch": 0.10012269938650306, + "grad_norm": 1.3655541502923092, + "learning_rate": 1.9743626142763988e-05, + "loss": 0.6105, + "step": 816 + }, + { + "epoch": 0.10024539877300613, + "grad_norm": 1.1595204539111807, + "learning_rate": 1.974273124814422e-05, + "loss": 0.6313, + "step": 817 + }, + { + "epoch": 0.1003680981595092, + "grad_norm": 1.1785992508860272, + "learning_rate": 1.9741834814743812e-05, + "loss": 0.6746, + "step": 818 + }, + { + "epoch": 0.10049079754601227, + "grad_norm": 1.1979697538365142, + "learning_rate": 1.9740936842704346e-05, + "loss": 0.6118, + "step": 819 + }, + { + "epoch": 0.10061349693251534, + "grad_norm": 1.3706664908247002, + "learning_rate": 1.974003733216765e-05, + "loss": 0.7105, + "step": 820 + }, + { + "epoch": 0.10073619631901841, + "grad_norm": 1.12399280049398, + "learning_rate": 1.9739136283275794e-05, + "loss": 0.617, + "step": 821 + }, + { + "epoch": 0.10085889570552148, + "grad_norm": 1.4851555295320962, + "learning_rate": 1.9738233696171094e-05, + "loss": 0.6906, + "step": 822 + }, + { + "epoch": 0.10098159509202453, + "grad_norm": 1.0110610542311702, + "learning_rate": 1.9737329570996098e-05, + "loss": 0.632, + "step": 823 + }, + { + "epoch": 0.1011042944785276, + "grad_norm": 1.0639641864340097, + "learning_rate": 1.9736423907893612e-05, + "loss": 0.66, + "step": 824 + }, + { + "epoch": 0.10122699386503067, + "grad_norm": 1.147152767521932, + "learning_rate": 
1.9735516707006676e-05, + "loss": 0.6461, + "step": 825 + }, + { + "epoch": 0.10134969325153374, + "grad_norm": 1.1945699784875752, + "learning_rate": 1.9734607968478574e-05, + "loss": 0.6435, + "step": 826 + }, + { + "epoch": 0.10147239263803681, + "grad_norm": 1.1322239762474284, + "learning_rate": 1.973369769245283e-05, + "loss": 0.6581, + "step": 827 + }, + { + "epoch": 0.10159509202453988, + "grad_norm": 1.0801732355143894, + "learning_rate": 1.9732785879073223e-05, + "loss": 0.6483, + "step": 828 + }, + { + "epoch": 0.10171779141104295, + "grad_norm": 0.9807272387674165, + "learning_rate": 1.9731872528483754e-05, + "loss": 0.6227, + "step": 829 + }, + { + "epoch": 0.10184049079754601, + "grad_norm": 1.0958404828985537, + "learning_rate": 1.973095764082869e-05, + "loss": 0.6771, + "step": 830 + }, + { + "epoch": 0.10196319018404908, + "grad_norm": 1.1973716300413932, + "learning_rate": 1.9730041216252524e-05, + "loss": 0.7835, + "step": 831 + }, + { + "epoch": 0.10208588957055215, + "grad_norm": 1.314977818390018, + "learning_rate": 1.97291232549e-05, + "loss": 0.6201, + "step": 832 + }, + { + "epoch": 0.10220858895705522, + "grad_norm": 1.1213898177531587, + "learning_rate": 1.97282037569161e-05, + "loss": 0.6519, + "step": 833 + }, + { + "epoch": 0.10233128834355829, + "grad_norm": 1.362654088072738, + "learning_rate": 1.972728272244605e-05, + "loss": 0.6338, + "step": 834 + }, + { + "epoch": 0.10245398773006135, + "grad_norm": 1.053790922970196, + "learning_rate": 1.972636015163532e-05, + "loss": 0.6484, + "step": 835 + }, + { + "epoch": 0.10257668711656441, + "grad_norm": 1.1314793238826932, + "learning_rate": 1.9725436044629625e-05, + "loss": 0.6541, + "step": 836 + }, + { + "epoch": 0.10269938650306748, + "grad_norm": 1.1504176085691211, + "learning_rate": 1.9724510401574917e-05, + "loss": 0.6723, + "step": 837 + }, + { + "epoch": 0.10282208588957055, + "grad_norm": 1.1052458579653415, + "learning_rate": 1.9723583222617395e-05, + "loss": 0.6415, + "step": 838 + }, + { + "epoch": 0.10294478527607362, + "grad_norm": 1.1888429492369936, + "learning_rate": 1.9722654507903497e-05, + "loss": 0.6528, + "step": 839 + }, + { + "epoch": 0.10306748466257669, + "grad_norm": 1.148267548448684, + "learning_rate": 1.9721724257579907e-05, + "loss": 0.7142, + "step": 840 + }, + { + "epoch": 0.10319018404907976, + "grad_norm": 0.9846656365832203, + "learning_rate": 1.972079247179355e-05, + "loss": 0.608, + "step": 841 + }, + { + "epoch": 0.10331288343558283, + "grad_norm": 0.9664506809699015, + "learning_rate": 1.9719859150691595e-05, + "loss": 0.6827, + "step": 842 + }, + { + "epoch": 0.10343558282208588, + "grad_norm": 0.9670660609178642, + "learning_rate": 1.9718924294421447e-05, + "loss": 0.6011, + "step": 843 + }, + { + "epoch": 0.10355828220858895, + "grad_norm": 1.0495381490061098, + "learning_rate": 1.971798790313076e-05, + "loss": 0.7045, + "step": 844 + }, + { + "epoch": 0.10368098159509202, + "grad_norm": 1.099479955095761, + "learning_rate": 1.9717049976967437e-05, + "loss": 0.6468, + "step": 845 + }, + { + "epoch": 0.10380368098159509, + "grad_norm": 1.0775403982025966, + "learning_rate": 1.9716110516079604e-05, + "loss": 0.635, + "step": 846 + }, + { + "epoch": 0.10392638036809816, + "grad_norm": 1.0912711940292699, + "learning_rate": 1.9715169520615647e-05, + "loss": 0.6528, + "step": 847 + }, + { + "epoch": 0.10404907975460123, + "grad_norm": 1.0574663935551598, + "learning_rate": 1.971422699072419e-05, + "loss": 0.6084, + "step": 848 + }, + { + "epoch": 0.1041717791411043, + 
"grad_norm": 1.0964007030250746, + "learning_rate": 1.971328292655409e-05, + "loss": 0.6949, + "step": 849 + }, + { + "epoch": 0.10429447852760736, + "grad_norm": 1.0874089782681966, + "learning_rate": 1.971233732825446e-05, + "loss": 0.6316, + "step": 850 + }, + { + "epoch": 0.10441717791411043, + "grad_norm": 1.3465030634496231, + "learning_rate": 1.971139019597465e-05, + "loss": 0.6743, + "step": 851 + }, + { + "epoch": 0.1045398773006135, + "grad_norm": 1.2208897617504697, + "learning_rate": 1.9710441529864248e-05, + "loss": 0.6458, + "step": 852 + }, + { + "epoch": 0.10466257668711657, + "grad_norm": 1.1347042178219668, + "learning_rate": 1.9709491330073087e-05, + "loss": 0.7789, + "step": 853 + }, + { + "epoch": 0.10478527607361963, + "grad_norm": 1.2529454092184362, + "learning_rate": 1.9708539596751245e-05, + "loss": 0.6604, + "step": 854 + }, + { + "epoch": 0.1049079754601227, + "grad_norm": 1.1663513304620514, + "learning_rate": 1.9707586330049037e-05, + "loss": 0.6432, + "step": 855 + }, + { + "epoch": 0.10503067484662576, + "grad_norm": 1.0136336628529166, + "learning_rate": 1.9706631530117027e-05, + "loss": 0.6197, + "step": 856 + }, + { + "epoch": 0.10515337423312883, + "grad_norm": 1.2331313525782857, + "learning_rate": 1.970567519710602e-05, + "loss": 0.6175, + "step": 857 + }, + { + "epoch": 0.1052760736196319, + "grad_norm": 1.144363379858588, + "learning_rate": 1.970471733116705e-05, + "loss": 0.6284, + "step": 858 + }, + { + "epoch": 0.10539877300613497, + "grad_norm": 1.1140615395155984, + "learning_rate": 1.9703757932451414e-05, + "loss": 0.6259, + "step": 859 + }, + { + "epoch": 0.10552147239263804, + "grad_norm": 1.0379391343326747, + "learning_rate": 1.9702797001110642e-05, + "loss": 0.6541, + "step": 860 + }, + { + "epoch": 0.10564417177914111, + "grad_norm": 1.2202435806687795, + "learning_rate": 1.9701834537296493e-05, + "loss": 0.7321, + "step": 861 + }, + { + "epoch": 0.10576687116564418, + "grad_norm": 1.0513831215528902, + "learning_rate": 1.970087054116099e-05, + "loss": 0.6201, + "step": 862 + }, + { + "epoch": 0.10588957055214723, + "grad_norm": 1.1046850946504265, + "learning_rate": 1.9699905012856383e-05, + "loss": 0.5996, + "step": 863 + }, + { + "epoch": 0.1060122699386503, + "grad_norm": 1.2269806633731029, + "learning_rate": 1.9698937952535173e-05, + "loss": 0.6533, + "step": 864 + }, + { + "epoch": 0.10613496932515337, + "grad_norm": 1.217445978105421, + "learning_rate": 1.9697969360350098e-05, + "loss": 0.622, + "step": 865 + }, + { + "epoch": 0.10625766871165644, + "grad_norm": 1.1163911030486295, + "learning_rate": 1.9696999236454135e-05, + "loss": 0.7171, + "step": 866 + }, + { + "epoch": 0.10638036809815951, + "grad_norm": 1.0268047700617313, + "learning_rate": 1.9696027581000508e-05, + "loss": 0.6559, + "step": 867 + }, + { + "epoch": 0.10650306748466258, + "grad_norm": 1.1627773028396127, + "learning_rate": 1.9695054394142684e-05, + "loss": 0.6731, + "step": 868 + }, + { + "epoch": 0.10662576687116565, + "grad_norm": 1.090114879274513, + "learning_rate": 1.969407967603437e-05, + "loss": 0.607, + "step": 869 + }, + { + "epoch": 0.1067484662576687, + "grad_norm": 0.9789767874783912, + "learning_rate": 1.969310342682951e-05, + "loss": 0.6921, + "step": 870 + }, + { + "epoch": 0.10687116564417178, + "grad_norm": 1.1026749847163404, + "learning_rate": 1.96921256466823e-05, + "loss": 0.648, + "step": 871 + }, + { + "epoch": 0.10699386503067485, + "grad_norm": 1.1780818699012696, + "learning_rate": 1.9691146335747165e-05, + "loss": 0.6457, + 
"step": 872 + }, + { + "epoch": 0.10711656441717791, + "grad_norm": 1.1273720312109947, + "learning_rate": 1.9690165494178782e-05, + "loss": 0.6656, + "step": 873 + }, + { + "epoch": 0.10723926380368098, + "grad_norm": 1.0257593858802123, + "learning_rate": 1.968918312213207e-05, + "loss": 0.6557, + "step": 874 + }, + { + "epoch": 0.10736196319018405, + "grad_norm": 1.2366193935339398, + "learning_rate": 1.9688199219762183e-05, + "loss": 0.726, + "step": 875 + }, + { + "epoch": 0.10748466257668712, + "grad_norm": 1.1317999519802568, + "learning_rate": 1.968721378722452e-05, + "loss": 0.5849, + "step": 876 + }, + { + "epoch": 0.10760736196319018, + "grad_norm": 1.0904219335145906, + "learning_rate": 1.968622682467472e-05, + "loss": 0.6531, + "step": 877 + }, + { + "epoch": 0.10773006134969325, + "grad_norm": 1.288261896971006, + "learning_rate": 1.968523833226867e-05, + "loss": 0.6162, + "step": 878 + }, + { + "epoch": 0.10785276073619632, + "grad_norm": 1.0746450467688327, + "learning_rate": 1.9684248310162488e-05, + "loss": 0.675, + "step": 879 + }, + { + "epoch": 0.10797546012269939, + "grad_norm": 1.3086059320170242, + "learning_rate": 1.9683256758512544e-05, + "loss": 0.6907, + "step": 880 + }, + { + "epoch": 0.10809815950920246, + "grad_norm": 1.3146060675540552, + "learning_rate": 1.9682263677475442e-05, + "loss": 0.6361, + "step": 881 + }, + { + "epoch": 0.10822085889570553, + "grad_norm": 1.1058957666739735, + "learning_rate": 1.9681269067208032e-05, + "loss": 0.6743, + "step": 882 + }, + { + "epoch": 0.10834355828220858, + "grad_norm": 1.121266482075299, + "learning_rate": 1.9680272927867404e-05, + "loss": 0.6268, + "step": 883 + }, + { + "epoch": 0.10846625766871165, + "grad_norm": 1.5276610174928178, + "learning_rate": 1.9679275259610894e-05, + "loss": 0.6292, + "step": 884 + }, + { + "epoch": 0.10858895705521472, + "grad_norm": 1.2966072527530241, + "learning_rate": 1.967827606259607e-05, + "loss": 0.7295, + "step": 885 + }, + { + "epoch": 0.10871165644171779, + "grad_norm": 1.1770214631508569, + "learning_rate": 1.9677275336980742e-05, + "loss": 0.6605, + "step": 886 + }, + { + "epoch": 0.10883435582822086, + "grad_norm": 1.0776125979844782, + "learning_rate": 1.9676273082922973e-05, + "loss": 0.6055, + "step": 887 + }, + { + "epoch": 0.10895705521472393, + "grad_norm": 1.2444344938387584, + "learning_rate": 1.9675269300581062e-05, + "loss": 0.6535, + "step": 888 + }, + { + "epoch": 0.109079754601227, + "grad_norm": 1.170819015934504, + "learning_rate": 1.9674263990113544e-05, + "loss": 0.6503, + "step": 889 + }, + { + "epoch": 0.10920245398773006, + "grad_norm": 1.201953704004694, + "learning_rate": 1.96732571516792e-05, + "loss": 0.7038, + "step": 890 + }, + { + "epoch": 0.10932515337423312, + "grad_norm": 1.202532181308002, + "learning_rate": 1.9672248785437053e-05, + "loss": 0.6946, + "step": 891 + }, + { + "epoch": 0.1094478527607362, + "grad_norm": 1.1065675766200744, + "learning_rate": 1.9671238891546358e-05, + "loss": 0.5984, + "step": 892 + }, + { + "epoch": 0.10957055214723926, + "grad_norm": 0.981248929956275, + "learning_rate": 1.967022747016663e-05, + "loss": 0.6779, + "step": 893 + }, + { + "epoch": 0.10969325153374233, + "grad_norm": 1.0759923429058533, + "learning_rate": 1.966921452145761e-05, + "loss": 0.6399, + "step": 894 + }, + { + "epoch": 0.1098159509202454, + "grad_norm": 1.2738888488884925, + "learning_rate": 1.9668200045579283e-05, + "loss": 0.7341, + "step": 895 + }, + { + "epoch": 0.10993865030674847, + "grad_norm": 0.9906867789918543, + 
"learning_rate": 1.9667184042691877e-05, + "loss": 0.6235, + "step": 896 + }, + { + "epoch": 0.11006134969325153, + "grad_norm": 1.2555888134626263, + "learning_rate": 1.9666166512955863e-05, + "loss": 0.615, + "step": 897 + }, + { + "epoch": 0.1101840490797546, + "grad_norm": 1.0999223967522687, + "learning_rate": 1.9665147456531945e-05, + "loss": 0.5852, + "step": 898 + }, + { + "epoch": 0.11030674846625767, + "grad_norm": 1.245709656274543, + "learning_rate": 1.9664126873581086e-05, + "loss": 0.6417, + "step": 899 + }, + { + "epoch": 0.11042944785276074, + "grad_norm": 1.3125997338833397, + "learning_rate": 1.9663104764264468e-05, + "loss": 0.708, + "step": 900 + }, + { + "epoch": 0.1105521472392638, + "grad_norm": 1.1860368921062567, + "learning_rate": 1.9662081128743527e-05, + "loss": 0.6076, + "step": 901 + }, + { + "epoch": 0.11067484662576688, + "grad_norm": 1.3578076522332112, + "learning_rate": 1.966105596717994e-05, + "loss": 0.6283, + "step": 902 + }, + { + "epoch": 0.11079754601226993, + "grad_norm": 1.1276607589237213, + "learning_rate": 1.9660029279735617e-05, + "loss": 0.6991, + "step": 903 + }, + { + "epoch": 0.110920245398773, + "grad_norm": 1.1896751140998203, + "learning_rate": 1.965900106657272e-05, + "loss": 0.6093, + "step": 904 + }, + { + "epoch": 0.11104294478527607, + "grad_norm": 1.179070440836627, + "learning_rate": 1.9657971327853644e-05, + "loss": 0.6561, + "step": 905 + }, + { + "epoch": 0.11116564417177914, + "grad_norm": 1.073040303761454, + "learning_rate": 1.965694006374103e-05, + "loss": 0.6832, + "step": 906 + }, + { + "epoch": 0.11128834355828221, + "grad_norm": 1.1344774542070752, + "learning_rate": 1.9655907274397755e-05, + "loss": 0.5852, + "step": 907 + }, + { + "epoch": 0.11141104294478528, + "grad_norm": 1.5776119196675435, + "learning_rate": 1.9654872959986936e-05, + "loss": 0.6133, + "step": 908 + }, + { + "epoch": 0.11153374233128835, + "grad_norm": 0.9781911888710627, + "learning_rate": 1.9653837120671938e-05, + "loss": 0.6191, + "step": 909 + }, + { + "epoch": 0.1116564417177914, + "grad_norm": 1.2050050377818364, + "learning_rate": 1.9652799756616364e-05, + "loss": 0.5909, + "step": 910 + }, + { + "epoch": 0.11177914110429447, + "grad_norm": 1.2381451367966845, + "learning_rate": 1.9651760867984054e-05, + "loss": 0.6977, + "step": 911 + }, + { + "epoch": 0.11190184049079754, + "grad_norm": 1.010105010970565, + "learning_rate": 1.9650720454939095e-05, + "loss": 0.6491, + "step": 912 + }, + { + "epoch": 0.11202453987730061, + "grad_norm": 0.9995152737195242, + "learning_rate": 1.9649678517645808e-05, + "loss": 0.6121, + "step": 913 + }, + { + "epoch": 0.11214723926380368, + "grad_norm": 1.1551534095257354, + "learning_rate": 1.9648635056268757e-05, + "loss": 0.6404, + "step": 914 + }, + { + "epoch": 0.11226993865030675, + "grad_norm": 1.185471253320875, + "learning_rate": 1.964759007097275e-05, + "loss": 0.6185, + "step": 915 + }, + { + "epoch": 0.11239263803680982, + "grad_norm": 1.1713161580340004, + "learning_rate": 1.9646543561922833e-05, + "loss": 0.7657, + "step": 916 + }, + { + "epoch": 0.11251533742331288, + "grad_norm": 1.2399639551751152, + "learning_rate": 1.9645495529284292e-05, + "loss": 0.6175, + "step": 917 + }, + { + "epoch": 0.11263803680981595, + "grad_norm": 1.467422562806453, + "learning_rate": 1.964444597322266e-05, + "loss": 0.6159, + "step": 918 + }, + { + "epoch": 0.11276073619631902, + "grad_norm": 1.1209008947976733, + "learning_rate": 1.96433948939037e-05, + "loss": 0.737, + "step": 919 + }, + { + "epoch": 
0.11288343558282209, + "grad_norm": 1.028116063915563, + "learning_rate": 1.964234229149342e-05, + "loss": 0.6429, + "step": 920 + }, + { + "epoch": 0.11300613496932516, + "grad_norm": 1.0263906977128174, + "learning_rate": 1.964128816615807e-05, + "loss": 0.6108, + "step": 921 + }, + { + "epoch": 0.11312883435582823, + "grad_norm": 1.053604417490505, + "learning_rate": 1.9640232518064148e-05, + "loss": 0.6229, + "step": 922 + }, + { + "epoch": 0.1132515337423313, + "grad_norm": 1.1014970309738876, + "learning_rate": 1.9639175347378378e-05, + "loss": 0.6605, + "step": 923 + }, + { + "epoch": 0.11337423312883435, + "grad_norm": 1.1950481253530947, + "learning_rate": 1.9638116654267725e-05, + "loss": 0.6565, + "step": 924 + }, + { + "epoch": 0.11349693251533742, + "grad_norm": 1.1459218762238783, + "learning_rate": 1.963705643889941e-05, + "loss": 0.6554, + "step": 925 + }, + { + "epoch": 0.11361963190184049, + "grad_norm": 1.4202780920919589, + "learning_rate": 1.9635994701440882e-05, + "loss": 0.6035, + "step": 926 + }, + { + "epoch": 0.11374233128834356, + "grad_norm": 1.1368445968337855, + "learning_rate": 1.9634931442059833e-05, + "loss": 0.6493, + "step": 927 + }, + { + "epoch": 0.11386503067484663, + "grad_norm": 1.1877881272497162, + "learning_rate": 1.9633866660924195e-05, + "loss": 0.7493, + "step": 928 + }, + { + "epoch": 0.1139877300613497, + "grad_norm": 1.2536649804133237, + "learning_rate": 1.9632800358202142e-05, + "loss": 0.6393, + "step": 929 + }, + { + "epoch": 0.11411042944785275, + "grad_norm": 1.0648896317550627, + "learning_rate": 1.9631732534062088e-05, + "loss": 0.6187, + "step": 930 + }, + { + "epoch": 0.11423312883435582, + "grad_norm": 1.1035033367707554, + "learning_rate": 1.9630663188672686e-05, + "loss": 0.6727, + "step": 931 + }, + { + "epoch": 0.1143558282208589, + "grad_norm": 1.110882277798712, + "learning_rate": 1.962959232220283e-05, + "loss": 0.7469, + "step": 932 + }, + { + "epoch": 0.11447852760736196, + "grad_norm": 1.0819991523287535, + "learning_rate": 1.962851993482165e-05, + "loss": 0.6539, + "step": 933 + }, + { + "epoch": 0.11460122699386503, + "grad_norm": 1.1165942531798978, + "learning_rate": 1.9627446026698526e-05, + "loss": 0.6564, + "step": 934 + }, + { + "epoch": 0.1147239263803681, + "grad_norm": 1.3175153398085866, + "learning_rate": 1.962637059800307e-05, + "loss": 0.612, + "step": 935 + }, + { + "epoch": 0.11484662576687117, + "grad_norm": 1.3935594147829693, + "learning_rate": 1.962529364890514e-05, + "loss": 0.6607, + "step": 936 + }, + { + "epoch": 0.11496932515337423, + "grad_norm": 1.0742542673754611, + "learning_rate": 1.9624215179574826e-05, + "loss": 0.6096, + "step": 937 + }, + { + "epoch": 0.1150920245398773, + "grad_norm": 1.0020650896243173, + "learning_rate": 1.962313519018247e-05, + "loss": 0.6215, + "step": 938 + }, + { + "epoch": 0.11521472392638037, + "grad_norm": 1.0892026735244424, + "learning_rate": 1.9622053680898637e-05, + "loss": 0.6051, + "step": 939 + }, + { + "epoch": 0.11533742331288344, + "grad_norm": 1.2768048006542068, + "learning_rate": 1.9620970651894146e-05, + "loss": 0.6184, + "step": 940 + }, + { + "epoch": 0.1154601226993865, + "grad_norm": 1.106274918573809, + "learning_rate": 1.9619886103340056e-05, + "loss": 0.6601, + "step": 941 + }, + { + "epoch": 0.11558282208588957, + "grad_norm": 1.0569107892035279, + "learning_rate": 1.961880003540766e-05, + "loss": 0.6557, + "step": 942 + }, + { + "epoch": 0.11570552147239264, + "grad_norm": 1.0102282623876953, + "learning_rate": 1.9617712448268494e-05, + 
"loss": 0.6098, + "step": 943 + }, + { + "epoch": 0.1158282208588957, + "grad_norm": 1.0923747485680557, + "learning_rate": 1.9616623342094328e-05, + "loss": 0.5881, + "step": 944 + }, + { + "epoch": 0.11595092024539877, + "grad_norm": 1.3431469747346025, + "learning_rate": 1.9615532717057185e-05, + "loss": 0.692, + "step": 945 + }, + { + "epoch": 0.11607361963190184, + "grad_norm": 0.975751561670281, + "learning_rate": 1.9614440573329313e-05, + "loss": 0.6503, + "step": 946 + }, + { + "epoch": 0.11619631901840491, + "grad_norm": 1.0384478217667892, + "learning_rate": 1.961334691108321e-05, + "loss": 0.6417, + "step": 947 + }, + { + "epoch": 0.11631901840490798, + "grad_norm": 0.9756486338064363, + "learning_rate": 1.9612251730491606e-05, + "loss": 0.5972, + "step": 948 + }, + { + "epoch": 0.11644171779141105, + "grad_norm": 1.035899613995727, + "learning_rate": 1.9611155031727486e-05, + "loss": 0.6539, + "step": 949 + }, + { + "epoch": 0.1165644171779141, + "grad_norm": 1.338736957958188, + "learning_rate": 1.9610056814964053e-05, + "loss": 0.5937, + "step": 950 + }, + { + "epoch": 0.11668711656441717, + "grad_norm": 1.0237412393626093, + "learning_rate": 1.9608957080374764e-05, + "loss": 0.6626, + "step": 951 + }, + { + "epoch": 0.11680981595092024, + "grad_norm": 1.1516889531911891, + "learning_rate": 1.9607855828133314e-05, + "loss": 0.7129, + "step": 952 + }, + { + "epoch": 0.11693251533742331, + "grad_norm": 1.0659588083567075, + "learning_rate": 1.960675305841364e-05, + "loss": 0.661, + "step": 953 + }, + { + "epoch": 0.11705521472392638, + "grad_norm": 1.1652147089253493, + "learning_rate": 1.960564877138991e-05, + "loss": 0.7091, + "step": 954 + }, + { + "epoch": 0.11717791411042945, + "grad_norm": 1.001538697491031, + "learning_rate": 1.9604542967236535e-05, + "loss": 0.728, + "step": 955 + }, + { + "epoch": 0.11730061349693252, + "grad_norm": 1.0675937004445564, + "learning_rate": 1.9603435646128172e-05, + "loss": 0.6123, + "step": 956 + }, + { + "epoch": 0.11742331288343558, + "grad_norm": 1.0936001292125128, + "learning_rate": 1.960232680823971e-05, + "loss": 0.6895, + "step": 957 + }, + { + "epoch": 0.11754601226993865, + "grad_norm": 1.089202529098884, + "learning_rate": 1.9601216453746285e-05, + "loss": 0.6099, + "step": 958 + }, + { + "epoch": 0.11766871165644172, + "grad_norm": 0.9494645649020539, + "learning_rate": 1.960010458282326e-05, + "loss": 0.618, + "step": 959 + }, + { + "epoch": 0.11779141104294479, + "grad_norm": 1.0568813897723588, + "learning_rate": 1.9598991195646252e-05, + "loss": 0.6635, + "step": 960 + }, + { + "epoch": 0.11791411042944785, + "grad_norm": 1.1093115543323233, + "learning_rate": 1.9597876292391106e-05, + "loss": 0.6472, + "step": 961 + }, + { + "epoch": 0.11803680981595092, + "grad_norm": 1.034490692283579, + "learning_rate": 1.9596759873233916e-05, + "loss": 0.6317, + "step": 962 + }, + { + "epoch": 0.118159509202454, + "grad_norm": 1.1906500828034696, + "learning_rate": 1.9595641938351008e-05, + "loss": 0.6128, + "step": 963 + }, + { + "epoch": 0.11828220858895705, + "grad_norm": 1.3748523232632293, + "learning_rate": 1.959452248791895e-05, + "loss": 0.7791, + "step": 964 + }, + { + "epoch": 0.11840490797546012, + "grad_norm": 1.1955072255658916, + "learning_rate": 1.959340152211455e-05, + "loss": 0.6389, + "step": 965 + }, + { + "epoch": 0.11852760736196319, + "grad_norm": 1.0395164006006912, + "learning_rate": 1.9592279041114862e-05, + "loss": 0.6419, + "step": 966 + }, + { + "epoch": 0.11865030674846626, + "grad_norm": 
1.0563678710932716, + "learning_rate": 1.959115504509716e-05, + "loss": 0.6279, + "step": 967 + }, + { + "epoch": 0.11877300613496933, + "grad_norm": 1.8550701615661778, + "learning_rate": 1.9590029534238977e-05, + "loss": 0.6158, + "step": 968 + }, + { + "epoch": 0.1188957055214724, + "grad_norm": 1.1940408601833528, + "learning_rate": 1.958890250871807e-05, + "loss": 0.6431, + "step": 969 + }, + { + "epoch": 0.11901840490797547, + "grad_norm": 1.128739153496371, + "learning_rate": 1.9587773968712458e-05, + "loss": 0.6917, + "step": 970 + }, + { + "epoch": 0.11914110429447852, + "grad_norm": 1.16196023850644, + "learning_rate": 1.9586643914400372e-05, + "loss": 0.7706, + "step": 971 + }, + { + "epoch": 0.11926380368098159, + "grad_norm": 1.064866940240241, + "learning_rate": 1.9585512345960297e-05, + "loss": 0.6612, + "step": 972 + }, + { + "epoch": 0.11938650306748466, + "grad_norm": 1.4456204906366634, + "learning_rate": 1.9584379263570955e-05, + "loss": 0.6956, + "step": 973 + }, + { + "epoch": 0.11950920245398773, + "grad_norm": 1.4933499623112958, + "learning_rate": 1.9583244667411306e-05, + "loss": 0.6131, + "step": 974 + }, + { + "epoch": 0.1196319018404908, + "grad_norm": 1.1719257357797013, + "learning_rate": 1.958210855766055e-05, + "loss": 0.6744, + "step": 975 + }, + { + "epoch": 0.11975460122699387, + "grad_norm": 1.104526091867297, + "learning_rate": 1.958097093449813e-05, + "loss": 0.6078, + "step": 976 + }, + { + "epoch": 0.11987730061349693, + "grad_norm": 1.154633978341845, + "learning_rate": 1.9579831798103716e-05, + "loss": 0.6253, + "step": 977 + }, + { + "epoch": 0.12, + "grad_norm": 1.1723514115272446, + "learning_rate": 1.9578691148657228e-05, + "loss": 0.6362, + "step": 978 + }, + { + "epoch": 0.12012269938650307, + "grad_norm": 1.0636648715835944, + "learning_rate": 1.9577548986338827e-05, + "loss": 0.6379, + "step": 979 + }, + { + "epoch": 0.12024539877300613, + "grad_norm": 1.3011612485776876, + "learning_rate": 1.95764053113289e-05, + "loss": 0.7535, + "step": 980 + }, + { + "epoch": 0.1203680981595092, + "grad_norm": 1.3469838756615986, + "learning_rate": 1.9575260123808088e-05, + "loss": 0.6441, + "step": 981 + }, + { + "epoch": 0.12049079754601227, + "grad_norm": 1.041534063847163, + "learning_rate": 1.9574113423957254e-05, + "loss": 0.6416, + "step": 982 + }, + { + "epoch": 0.12061349693251534, + "grad_norm": 1.2929627890266646, + "learning_rate": 1.9572965211957515e-05, + "loss": 0.6711, + "step": 983 + }, + { + "epoch": 0.1207361963190184, + "grad_norm": 1.1563099771329053, + "learning_rate": 1.957181548799022e-05, + "loss": 0.6623, + "step": 984 + }, + { + "epoch": 0.12085889570552147, + "grad_norm": 1.2053098100809312, + "learning_rate": 1.9570664252236966e-05, + "loss": 0.7197, + "step": 985 + }, + { + "epoch": 0.12098159509202454, + "grad_norm": 1.2168808131587747, + "learning_rate": 1.956951150487957e-05, + "loss": 0.6058, + "step": 986 + }, + { + "epoch": 0.12110429447852761, + "grad_norm": 1.0996076793910945, + "learning_rate": 1.95683572461001e-05, + "loss": 0.634, + "step": 987 + }, + { + "epoch": 0.12122699386503068, + "grad_norm": 1.2355466150578982, + "learning_rate": 1.9567201476080866e-05, + "loss": 0.6483, + "step": 988 + }, + { + "epoch": 0.12134969325153375, + "grad_norm": 1.080440232474657, + "learning_rate": 1.956604419500441e-05, + "loss": 0.6164, + "step": 989 + }, + { + "epoch": 0.12147239263803682, + "grad_norm": 1.1736836907226245, + "learning_rate": 1.956488540305351e-05, + "loss": 0.7236, + "step": 990 + }, + { + "epoch": 
0.12159509202453987, + "grad_norm": 1.2344031841240826, + "learning_rate": 1.9563725100411198e-05, + "loss": 0.636, + "step": 991 + }, + { + "epoch": 0.12171779141104294, + "grad_norm": 1.0494292681302004, + "learning_rate": 1.9562563287260724e-05, + "loss": 0.6591, + "step": 992 + }, + { + "epoch": 0.12184049079754601, + "grad_norm": 1.0600661489725105, + "learning_rate": 1.9561399963785586e-05, + "loss": 0.6476, + "step": 993 + }, + { + "epoch": 0.12196319018404908, + "grad_norm": 1.07105234208492, + "learning_rate": 1.956023513016953e-05, + "loss": 0.6535, + "step": 994 + }, + { + "epoch": 0.12208588957055215, + "grad_norm": 1.062009410364129, + "learning_rate": 1.9559068786596526e-05, + "loss": 0.6085, + "step": 995 + }, + { + "epoch": 0.12220858895705522, + "grad_norm": 1.1019869024290885, + "learning_rate": 1.9557900933250786e-05, + "loss": 0.6251, + "step": 996 + }, + { + "epoch": 0.12233128834355828, + "grad_norm": 1.0069711700117645, + "learning_rate": 1.9556731570316763e-05, + "loss": 0.624, + "step": 997 + }, + { + "epoch": 0.12245398773006134, + "grad_norm": 1.1197186214875723, + "learning_rate": 1.9555560697979147e-05, + "loss": 0.6468, + "step": 998 + }, + { + "epoch": 0.12257668711656441, + "grad_norm": 1.0806290960264686, + "learning_rate": 1.9554388316422873e-05, + "loss": 0.6451, + "step": 999 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 1.0982945340224686, + "learning_rate": 1.9553214425833108e-05, + "loss": 0.6086, + "step": 1000 + }, + { + "epoch": 0.12282208588957055, + "grad_norm": 1.071481608181602, + "learning_rate": 1.955203902639525e-05, + "loss": 0.6249, + "step": 1001 + }, + { + "epoch": 0.12294478527607362, + "grad_norm": 1.0592244208668817, + "learning_rate": 1.955086211829495e-05, + "loss": 0.6293, + "step": 1002 + }, + { + "epoch": 0.12306748466257669, + "grad_norm": 1.0039672127766213, + "learning_rate": 1.9549683701718086e-05, + "loss": 0.6249, + "step": 1003 + }, + { + "epoch": 0.12319018404907975, + "grad_norm": 1.212071484411326, + "learning_rate": 1.9548503776850785e-05, + "loss": 0.6398, + "step": 1004 + }, + { + "epoch": 0.12331288343558282, + "grad_norm": 1.0432133379493285, + "learning_rate": 1.9547322343879397e-05, + "loss": 0.6445, + "step": 1005 + }, + { + "epoch": 0.12343558282208589, + "grad_norm": 1.0087653350147658, + "learning_rate": 1.954613940299053e-05, + "loss": 0.6951, + "step": 1006 + }, + { + "epoch": 0.12355828220858896, + "grad_norm": 1.0230694772905629, + "learning_rate": 1.9544954954371013e-05, + "loss": 0.657, + "step": 1007 + }, + { + "epoch": 0.12368098159509203, + "grad_norm": 1.1740617737922678, + "learning_rate": 1.9543768998207918e-05, + "loss": 0.7161, + "step": 1008 + }, + { + "epoch": 0.1238036809815951, + "grad_norm": 0.9977964982674402, + "learning_rate": 1.954258153468856e-05, + "loss": 0.6459, + "step": 1009 + }, + { + "epoch": 0.12392638036809817, + "grad_norm": 1.0533155310644842, + "learning_rate": 1.954139256400049e-05, + "loss": 0.6604, + "step": 1010 + }, + { + "epoch": 0.12404907975460122, + "grad_norm": 1.0678740554880954, + "learning_rate": 1.954020208633149e-05, + "loss": 0.6351, + "step": 1011 + }, + { + "epoch": 0.12417177914110429, + "grad_norm": 3.283111212666623, + "learning_rate": 1.953901010186959e-05, + "loss": 0.5915, + "step": 1012 + }, + { + "epoch": 0.12429447852760736, + "grad_norm": 1.0586785363144593, + "learning_rate": 1.9537816610803056e-05, + "loss": 0.6102, + "step": 1013 + }, + { + "epoch": 0.12441717791411043, + "grad_norm": 1.2036550677686975, + "learning_rate": 
1.953662161332038e-05, + "loss": 0.6485, + "step": 1014 + }, + { + "epoch": 0.1245398773006135, + "grad_norm": 1.1478908512396526, + "learning_rate": 1.9535425109610317e-05, + "loss": 0.6199, + "step": 1015 + }, + { + "epoch": 0.12466257668711657, + "grad_norm": 1.0275264075098633, + "learning_rate": 1.9534227099861827e-05, + "loss": 0.6382, + "step": 1016 + }, + { + "epoch": 0.12478527607361964, + "grad_norm": 1.4684175348933677, + "learning_rate": 1.9533027584264138e-05, + "loss": 0.5964, + "step": 1017 + }, + { + "epoch": 0.1249079754601227, + "grad_norm": 1.0791403604315883, + "learning_rate": 1.9531826563006696e-05, + "loss": 0.6431, + "step": 1018 + }, + { + "epoch": 0.12503067484662578, + "grad_norm": 1.1240625892027076, + "learning_rate": 1.95306240362792e-05, + "loss": 0.5915, + "step": 1019 + }, + { + "epoch": 0.12515337423312883, + "grad_norm": 1.2076028505960839, + "learning_rate": 1.9529420004271568e-05, + "loss": 0.653, + "step": 1020 + }, + { + "epoch": 0.1252760736196319, + "grad_norm": 1.131722580974081, + "learning_rate": 1.9528214467173974e-05, + "loss": 0.6285, + "step": 1021 + }, + { + "epoch": 0.12539877300613497, + "grad_norm": 1.063420279798949, + "learning_rate": 1.952700742517682e-05, + "loss": 0.5953, + "step": 1022 + }, + { + "epoch": 0.12552147239263803, + "grad_norm": 1.060361192528377, + "learning_rate": 1.952579887847075e-05, + "loss": 0.6238, + "step": 1023 + }, + { + "epoch": 0.1256441717791411, + "grad_norm": 1.1660667355256193, + "learning_rate": 1.9524588827246644e-05, + "loss": 0.6456, + "step": 1024 + }, + { + "epoch": 0.12576687116564417, + "grad_norm": 1.393741665831264, + "learning_rate": 1.952337727169561e-05, + "loss": 0.7281, + "step": 1025 + }, + { + "epoch": 0.12588957055214725, + "grad_norm": 1.2637466993694002, + "learning_rate": 1.9522164212009015e-05, + "loss": 0.6338, + "step": 1026 + }, + { + "epoch": 0.1260122699386503, + "grad_norm": 1.097033420382912, + "learning_rate": 1.9520949648378444e-05, + "loss": 0.6654, + "step": 1027 + }, + { + "epoch": 0.12613496932515336, + "grad_norm": 1.1349854804114399, + "learning_rate": 1.951973358099573e-05, + "loss": 0.7075, + "step": 1028 + }, + { + "epoch": 0.12625766871165645, + "grad_norm": 1.0393933689141763, + "learning_rate": 1.9518516010052943e-05, + "loss": 0.6948, + "step": 1029 + }, + { + "epoch": 0.1263803680981595, + "grad_norm": 0.9893851005664946, + "learning_rate": 1.951729693574238e-05, + "loss": 0.6257, + "step": 1030 + }, + { + "epoch": 0.12650306748466258, + "grad_norm": 1.0686023522744075, + "learning_rate": 1.9516076358256585e-05, + "loss": 0.6268, + "step": 1031 + }, + { + "epoch": 0.12662576687116564, + "grad_norm": 1.3372543749838341, + "learning_rate": 1.9514854277788347e-05, + "loss": 0.6897, + "step": 1032 + }, + { + "epoch": 0.12674846625766872, + "grad_norm": 1.1459833776928405, + "learning_rate": 1.9513630694530673e-05, + "loss": 0.6844, + "step": 1033 + }, + { + "epoch": 0.12687116564417178, + "grad_norm": 0.9227981284288925, + "learning_rate": 1.9512405608676822e-05, + "loss": 0.6122, + "step": 1034 + }, + { + "epoch": 0.12699386503067484, + "grad_norm": 1.105368990074935, + "learning_rate": 1.9511179020420284e-05, + "loss": 0.5777, + "step": 1035 + }, + { + "epoch": 0.12711656441717792, + "grad_norm": 1.0386230182739833, + "learning_rate": 1.9509950929954787e-05, + "loss": 0.6601, + "step": 1036 + }, + { + "epoch": 0.12723926380368097, + "grad_norm": 1.1350967175163749, + "learning_rate": 1.95087213374743e-05, + "loss": 0.6104, + "step": 1037 + }, + { + "epoch": 
0.12736196319018406, + "grad_norm": 1.312502345720438, + "learning_rate": 1.9507490243173027e-05, + "loss": 0.7314, + "step": 1038 + }, + { + "epoch": 0.1274846625766871, + "grad_norm": 1.0102867065343706, + "learning_rate": 1.950625764724541e-05, + "loss": 0.6223, + "step": 1039 + }, + { + "epoch": 0.1276073619631902, + "grad_norm": 1.1816285293880615, + "learning_rate": 1.950502354988612e-05, + "loss": 0.7074, + "step": 1040 + }, + { + "epoch": 0.12773006134969325, + "grad_norm": 1.0987906999336954, + "learning_rate": 1.9503787951290078e-05, + "loss": 0.6156, + "step": 1041 + }, + { + "epoch": 0.1278527607361963, + "grad_norm": 1.143720754144711, + "learning_rate": 1.9502550851652433e-05, + "loss": 0.6243, + "step": 1042 + }, + { + "epoch": 0.1279754601226994, + "grad_norm": 0.9816910053908001, + "learning_rate": 1.9501312251168574e-05, + "loss": 0.6336, + "step": 1043 + }, + { + "epoch": 0.12809815950920245, + "grad_norm": 1.103985793284278, + "learning_rate": 1.9500072150034136e-05, + "loss": 0.6185, + "step": 1044 + }, + { + "epoch": 0.12822085889570553, + "grad_norm": 1.1468359625384104, + "learning_rate": 1.9498830548444972e-05, + "loss": 0.6997, + "step": 1045 + }, + { + "epoch": 0.1283435582822086, + "grad_norm": 1.004236870010181, + "learning_rate": 1.9497587446597185e-05, + "loss": 0.6294, + "step": 1046 + }, + { + "epoch": 0.12846625766871167, + "grad_norm": 1.1211127246017023, + "learning_rate": 1.9496342844687113e-05, + "loss": 0.6544, + "step": 1047 + }, + { + "epoch": 0.12858895705521473, + "grad_norm": 1.1606518135689785, + "learning_rate": 1.9495096742911332e-05, + "loss": 0.6632, + "step": 1048 + }, + { + "epoch": 0.12871165644171778, + "grad_norm": 0.9268540351601084, + "learning_rate": 1.949384914146665e-05, + "loss": 0.5784, + "step": 1049 + }, + { + "epoch": 0.12883435582822086, + "grad_norm": 1.0244990292799203, + "learning_rate": 1.9492600040550114e-05, + "loss": 0.6365, + "step": 1050 + }, + { + "epoch": 0.12895705521472392, + "grad_norm": 1.014978798745757, + "learning_rate": 1.9491349440359014e-05, + "loss": 0.6193, + "step": 1051 + }, + { + "epoch": 0.129079754601227, + "grad_norm": 1.0212789560208713, + "learning_rate": 1.9490097341090868e-05, + "loss": 0.6208, + "step": 1052 + }, + { + "epoch": 0.12920245398773006, + "grad_norm": 1.4198636760649754, + "learning_rate": 1.9488843742943433e-05, + "loss": 0.6929, + "step": 1053 + }, + { + "epoch": 0.12932515337423311, + "grad_norm": 1.0078793071229681, + "learning_rate": 1.948758864611471e-05, + "loss": 0.6528, + "step": 1054 + }, + { + "epoch": 0.1294478527607362, + "grad_norm": 0.9812271816918029, + "learning_rate": 1.948633205080292e-05, + "loss": 0.6505, + "step": 1055 + }, + { + "epoch": 0.12957055214723925, + "grad_norm": 1.132092548221401, + "learning_rate": 1.9485073957206543e-05, + "loss": 0.5958, + "step": 1056 + }, + { + "epoch": 0.12969325153374234, + "grad_norm": 1.1972922004478064, + "learning_rate": 1.9483814365524278e-05, + "loss": 0.6802, + "step": 1057 + }, + { + "epoch": 0.1298159509202454, + "grad_norm": 0.9886642567009184, + "learning_rate": 1.9482553275955067e-05, + "loss": 0.6311, + "step": 1058 + }, + { + "epoch": 0.12993865030674848, + "grad_norm": 0.9862712261915207, + "learning_rate": 1.948129068869809e-05, + "loss": 0.6298, + "step": 1059 + }, + { + "epoch": 0.13006134969325153, + "grad_norm": 1.2102695250262359, + "learning_rate": 1.948002660395276e-05, + "loss": 0.7277, + "step": 1060 + }, + { + "epoch": 0.1301840490797546, + "grad_norm": 1.0644868421500857, + "learning_rate": 
1.947876102191873e-05, + "loss": 0.6262, + "step": 1061 + }, + { + "epoch": 0.13030674846625767, + "grad_norm": 1.0189797006768415, + "learning_rate": 1.9477493942795885e-05, + "loss": 0.6541, + "step": 1062 + }, + { + "epoch": 0.13042944785276073, + "grad_norm": 1.1660374319306466, + "learning_rate": 1.947622536678435e-05, + "loss": 0.689, + "step": 1063 + }, + { + "epoch": 0.1305521472392638, + "grad_norm": 1.2020399881845998, + "learning_rate": 1.9474955294084492e-05, + "loss": 0.6342, + "step": 1064 + }, + { + "epoch": 0.13067484662576687, + "grad_norm": 1.0057086400811346, + "learning_rate": 1.9473683724896898e-05, + "loss": 0.6456, + "step": 1065 + }, + { + "epoch": 0.13079754601226995, + "grad_norm": 1.0370027602756402, + "learning_rate": 1.947241065942241e-05, + "loss": 0.6105, + "step": 1066 + }, + { + "epoch": 0.130920245398773, + "grad_norm": 1.2580812121151403, + "learning_rate": 1.9471136097862093e-05, + "loss": 0.757, + "step": 1067 + }, + { + "epoch": 0.13104294478527606, + "grad_norm": 1.1014032596854029, + "learning_rate": 1.9469860040417253e-05, + "loss": 0.6364, + "step": 1068 + }, + { + "epoch": 0.13116564417177914, + "grad_norm": 1.0908477381830681, + "learning_rate": 1.9468582487289434e-05, + "loss": 0.6302, + "step": 1069 + }, + { + "epoch": 0.1312883435582822, + "grad_norm": 1.2107836747044154, + "learning_rate": 1.9467303438680414e-05, + "loss": 0.6079, + "step": 1070 + }, + { + "epoch": 0.13141104294478528, + "grad_norm": 1.1512592678154019, + "learning_rate": 1.946602289479221e-05, + "loss": 0.7566, + "step": 1071 + }, + { + "epoch": 0.13153374233128834, + "grad_norm": 1.2441737490426301, + "learning_rate": 1.9464740855827067e-05, + "loss": 0.5916, + "step": 1072 + }, + { + "epoch": 0.13165644171779142, + "grad_norm": 1.1808082333399645, + "learning_rate": 1.946345732198748e-05, + "loss": 0.6178, + "step": 1073 + }, + { + "epoch": 0.13177914110429448, + "grad_norm": 0.9944062459217516, + "learning_rate": 1.9462172293476162e-05, + "loss": 0.6667, + "step": 1074 + }, + { + "epoch": 0.13190184049079753, + "grad_norm": 1.1018718934203495, + "learning_rate": 1.946088577049608e-05, + "loss": 0.6772, + "step": 1075 + }, + { + "epoch": 0.13202453987730062, + "grad_norm": 0.9838024146972139, + "learning_rate": 1.945959775325043e-05, + "loss": 0.6255, + "step": 1076 + }, + { + "epoch": 0.13214723926380367, + "grad_norm": 1.6439948180224029, + "learning_rate": 1.9458308241942637e-05, + "loss": 0.6402, + "step": 1077 + }, + { + "epoch": 0.13226993865030676, + "grad_norm": 1.1444163240281735, + "learning_rate": 1.945701723677637e-05, + "loss": 0.6224, + "step": 1078 + }, + { + "epoch": 0.1323926380368098, + "grad_norm": 1.1547790653366596, + "learning_rate": 1.945572473795554e-05, + "loss": 0.7336, + "step": 1079 + }, + { + "epoch": 0.1325153374233129, + "grad_norm": 1.1194448825380023, + "learning_rate": 1.9454430745684276e-05, + "loss": 0.6246, + "step": 1080 + }, + { + "epoch": 0.13263803680981595, + "grad_norm": 0.9792137554023445, + "learning_rate": 1.945313526016696e-05, + "loss": 0.625, + "step": 1081 + }, + { + "epoch": 0.132760736196319, + "grad_norm": 1.0166225636448185, + "learning_rate": 1.94518382816082e-05, + "loss": 0.5932, + "step": 1082 + }, + { + "epoch": 0.1328834355828221, + "grad_norm": 1.1297462390094066, + "learning_rate": 1.9450539810212835e-05, + "loss": 0.7365, + "step": 1083 + }, + { + "epoch": 0.13300613496932515, + "grad_norm": 1.2054017814289737, + "learning_rate": 1.9449239846185962e-05, + "loss": 0.6607, + "step": 1084 + }, + { + "epoch": 
0.13312883435582823, + "grad_norm": 0.937410010680947, + "learning_rate": 1.944793838973289e-05, + "loss": 0.6164, + "step": 1085 + }, + { + "epoch": 0.13325153374233129, + "grad_norm": 1.065225169837227, + "learning_rate": 1.9446635441059173e-05, + "loss": 0.7238, + "step": 1086 + }, + { + "epoch": 0.13337423312883437, + "grad_norm": 1.0234458279727348, + "learning_rate": 1.9445331000370604e-05, + "loss": 0.6512, + "step": 1087 + }, + { + "epoch": 0.13349693251533742, + "grad_norm": 1.1401956129843254, + "learning_rate": 1.9444025067873205e-05, + "loss": 0.6234, + "step": 1088 + }, + { + "epoch": 0.13361963190184048, + "grad_norm": 1.007132678296995, + "learning_rate": 1.944271764377324e-05, + "loss": 0.6212, + "step": 1089 + }, + { + "epoch": 0.13374233128834356, + "grad_norm": 1.1359786005766561, + "learning_rate": 1.94414087282772e-05, + "loss": 0.6841, + "step": 1090 + }, + { + "epoch": 0.13386503067484662, + "grad_norm": 1.0147454887981726, + "learning_rate": 1.9440098321591825e-05, + "loss": 0.67, + "step": 1091 + }, + { + "epoch": 0.1339877300613497, + "grad_norm": 1.016316004691533, + "learning_rate": 1.9438786423924075e-05, + "loss": 0.6059, + "step": 1092 + }, + { + "epoch": 0.13411042944785276, + "grad_norm": 0.987633979585972, + "learning_rate": 1.9437473035481157e-05, + "loss": 0.614, + "step": 1093 + }, + { + "epoch": 0.13423312883435584, + "grad_norm": 1.0273772479114869, + "learning_rate": 1.943615815647051e-05, + "loss": 0.6632, + "step": 1094 + }, + { + "epoch": 0.1343558282208589, + "grad_norm": 1.1509868238832992, + "learning_rate": 1.9434841787099804e-05, + "loss": 0.7113, + "step": 1095 + }, + { + "epoch": 0.13447852760736195, + "grad_norm": 1.0529738226053798, + "learning_rate": 1.943352392757695e-05, + "loss": 0.6995, + "step": 1096 + }, + { + "epoch": 0.13460122699386504, + "grad_norm": 1.1526811017513532, + "learning_rate": 1.9432204578110094e-05, + "loss": 0.6421, + "step": 1097 + }, + { + "epoch": 0.1347239263803681, + "grad_norm": 1.084298920681517, + "learning_rate": 1.9430883738907617e-05, + "loss": 0.6626, + "step": 1098 + }, + { + "epoch": 0.13484662576687118, + "grad_norm": 1.155252094574697, + "learning_rate": 1.9429561410178132e-05, + "loss": 0.6337, + "step": 1099 + }, + { + "epoch": 0.13496932515337423, + "grad_norm": 1.0760839336010641, + "learning_rate": 1.9428237592130487e-05, + "loss": 0.648, + "step": 1100 + }, + { + "epoch": 0.1350920245398773, + "grad_norm": 1.1649072300831542, + "learning_rate": 1.942691228497377e-05, + "loss": 0.7558, + "step": 1101 + }, + { + "epoch": 0.13521472392638037, + "grad_norm": 0.9994254997993712, + "learning_rate": 1.9425585488917305e-05, + "loss": 0.5918, + "step": 1102 + }, + { + "epoch": 0.13533742331288343, + "grad_norm": 1.0559560679637734, + "learning_rate": 1.9424257204170643e-05, + "loss": 0.7468, + "step": 1103 + }, + { + "epoch": 0.1354601226993865, + "grad_norm": 1.083920172433204, + "learning_rate": 1.9422927430943578e-05, + "loss": 0.6564, + "step": 1104 + }, + { + "epoch": 0.13558282208588956, + "grad_norm": 1.2757934294257165, + "learning_rate": 1.9421596169446135e-05, + "loss": 0.6399, + "step": 1105 + }, + { + "epoch": 0.13570552147239265, + "grad_norm": 1.1586019430928884, + "learning_rate": 1.9420263419888577e-05, + "loss": 0.6507, + "step": 1106 + }, + { + "epoch": 0.1358282208588957, + "grad_norm": 1.0789537726826661, + "learning_rate": 1.94189291824814e-05, + "loss": 0.6074, + "step": 1107 + }, + { + "epoch": 0.13595092024539876, + "grad_norm": 1.2072748971470018, + "learning_rate": 
1.941759345743533e-05, + "loss": 0.6724, + "step": 1108 + }, + { + "epoch": 0.13607361963190184, + "grad_norm": 0.9866090215232858, + "learning_rate": 1.9416256244961348e-05, + "loss": 0.6383, + "step": 1109 + }, + { + "epoch": 0.1361963190184049, + "grad_norm": 1.0739766335432683, + "learning_rate": 1.941491754527064e-05, + "loss": 0.6479, + "step": 1110 + }, + { + "epoch": 0.13631901840490798, + "grad_norm": 1.0131994761266538, + "learning_rate": 1.941357735857464e-05, + "loss": 0.5985, + "step": 1111 + }, + { + "epoch": 0.13644171779141104, + "grad_norm": 0.8253400089216009, + "learning_rate": 1.9412235685085034e-05, + "loss": 0.5903, + "step": 1112 + }, + { + "epoch": 0.13656441717791412, + "grad_norm": 1.3378666813399185, + "learning_rate": 1.9410892525013717e-05, + "loss": 0.7525, + "step": 1113 + }, + { + "epoch": 0.13668711656441718, + "grad_norm": 1.5562858797340078, + "learning_rate": 1.9409547878572835e-05, + "loss": 0.6986, + "step": 1114 + }, + { + "epoch": 0.13680981595092023, + "grad_norm": 1.3470052620936612, + "learning_rate": 1.940820174597476e-05, + "loss": 0.596, + "step": 1115 + }, + { + "epoch": 0.13693251533742332, + "grad_norm": 1.142356385674114, + "learning_rate": 1.94068541274321e-05, + "loss": 0.6609, + "step": 1116 + }, + { + "epoch": 0.13705521472392637, + "grad_norm": 1.13868575752736, + "learning_rate": 1.9405505023157704e-05, + "loss": 0.6006, + "step": 1117 + }, + { + "epoch": 0.13717791411042946, + "grad_norm": 1.1191302129057352, + "learning_rate": 1.9404154433364652e-05, + "loss": 0.6804, + "step": 1118 + }, + { + "epoch": 0.1373006134969325, + "grad_norm": 1.2677909465507924, + "learning_rate": 1.9402802358266255e-05, + "loss": 0.6104, + "step": 1119 + }, + { + "epoch": 0.1374233128834356, + "grad_norm": 1.0760061191256245, + "learning_rate": 1.9401448798076064e-05, + "loss": 0.6677, + "step": 1120 + }, + { + "epoch": 0.13754601226993865, + "grad_norm": 1.2860592923323708, + "learning_rate": 1.940009375300786e-05, + "loss": 0.7297, + "step": 1121 + }, + { + "epoch": 0.1376687116564417, + "grad_norm": 1.0479578999436647, + "learning_rate": 1.939873722327566e-05, + "loss": 0.6765, + "step": 1122 + }, + { + "epoch": 0.1377914110429448, + "grad_norm": 1.1566299103556352, + "learning_rate": 1.939737920909372e-05, + "loss": 0.6829, + "step": 1123 + }, + { + "epoch": 0.13791411042944784, + "grad_norm": 1.21287716255947, + "learning_rate": 1.9396019710676527e-05, + "loss": 0.6746, + "step": 1124 + }, + { + "epoch": 0.13803680981595093, + "grad_norm": 1.257936767746031, + "learning_rate": 1.9394658728238797e-05, + "loss": 0.7358, + "step": 1125 + }, + { + "epoch": 0.13815950920245398, + "grad_norm": 1.0403182737661645, + "learning_rate": 1.939329626199549e-05, + "loss": 0.6438, + "step": 1126 + }, + { + "epoch": 0.13828220858895707, + "grad_norm": 1.4120782486489425, + "learning_rate": 1.9391932312161786e-05, + "loss": 0.6272, + "step": 1127 + }, + { + "epoch": 0.13840490797546012, + "grad_norm": 1.2155034546072399, + "learning_rate": 1.9390566878953127e-05, + "loss": 0.6371, + "step": 1128 + }, + { + "epoch": 0.13852760736196318, + "grad_norm": 1.0590630613601846, + "learning_rate": 1.9389199962585156e-05, + "loss": 0.6379, + "step": 1129 + }, + { + "epoch": 0.13865030674846626, + "grad_norm": 0.9399211975816442, + "learning_rate": 1.9387831563273775e-05, + "loss": 0.6225, + "step": 1130 + }, + { + "epoch": 0.13877300613496932, + "grad_norm": 1.1805030196583417, + "learning_rate": 1.9386461681235106e-05, + "loss": 0.6816, + "step": 1131 + }, + { + "epoch": 
0.1388957055214724, + "grad_norm": 1.08388969937118, + "learning_rate": 1.9385090316685508e-05, + "loss": 0.7166, + "step": 1132 + }, + { + "epoch": 0.13901840490797546, + "grad_norm": 1.094188726453223, + "learning_rate": 1.938371746984158e-05, + "loss": 0.6241, + "step": 1133 + }, + { + "epoch": 0.13914110429447854, + "grad_norm": 1.0162800760952921, + "learning_rate": 1.9382343140920157e-05, + "loss": 0.6872, + "step": 1134 + }, + { + "epoch": 0.1392638036809816, + "grad_norm": 1.1412616587849695, + "learning_rate": 1.938096733013829e-05, + "loss": 0.6193, + "step": 1135 + }, + { + "epoch": 0.13938650306748465, + "grad_norm": 1.0751731390982593, + "learning_rate": 1.9379590037713287e-05, + "loss": 0.6254, + "step": 1136 + }, + { + "epoch": 0.13950920245398774, + "grad_norm": 1.187208745068796, + "learning_rate": 1.9378211263862673e-05, + "loss": 0.6193, + "step": 1137 + }, + { + "epoch": 0.1396319018404908, + "grad_norm": 1.3741913471939706, + "learning_rate": 1.937683100880422e-05, + "loss": 0.6779, + "step": 1138 + }, + { + "epoch": 0.13975460122699387, + "grad_norm": 1.154416796921968, + "learning_rate": 1.9375449272755917e-05, + "loss": 0.6557, + "step": 1139 + }, + { + "epoch": 0.13987730061349693, + "grad_norm": 0.9787971353381778, + "learning_rate": 1.9374066055936004e-05, + "loss": 0.6448, + "step": 1140 + }, + { + "epoch": 0.14, + "grad_norm": 1.060305197442144, + "learning_rate": 1.937268135856295e-05, + "loss": 0.6441, + "step": 1141 + }, + { + "epoch": 0.14012269938650307, + "grad_norm": 1.1055100595616152, + "learning_rate": 1.9371295180855454e-05, + "loss": 0.64, + "step": 1142 + }, + { + "epoch": 0.14024539877300612, + "grad_norm": 1.73701393446287, + "learning_rate": 1.9369907523032448e-05, + "loss": 0.6307, + "step": 1143 + }, + { + "epoch": 0.1403680981595092, + "grad_norm": 1.2346553568193583, + "learning_rate": 1.9368518385313108e-05, + "loss": 0.6242, + "step": 1144 + }, + { + "epoch": 0.14049079754601226, + "grad_norm": 1.0262080647716498, + "learning_rate": 1.9367127767916828e-05, + "loss": 0.6594, + "step": 1145 + }, + { + "epoch": 0.14061349693251535, + "grad_norm": 1.472878201147957, + "learning_rate": 1.9365735671063247e-05, + "loss": 0.7484, + "step": 1146 + }, + { + "epoch": 0.1407361963190184, + "grad_norm": 1.1884258559944945, + "learning_rate": 1.936434209497224e-05, + "loss": 0.5833, + "step": 1147 + }, + { + "epoch": 0.14085889570552146, + "grad_norm": 1.2718335837842576, + "learning_rate": 1.93629470398639e-05, + "loss": 0.7288, + "step": 1148 + }, + { + "epoch": 0.14098159509202454, + "grad_norm": 1.5904925134673338, + "learning_rate": 1.9361550505958574e-05, + "loss": 0.6352, + "step": 1149 + }, + { + "epoch": 0.1411042944785276, + "grad_norm": 1.3305966528285422, + "learning_rate": 1.9360152493476828e-05, + "loss": 0.6302, + "step": 1150 + }, + { + "epoch": 0.14122699386503068, + "grad_norm": 1.1655711622996296, + "learning_rate": 1.9358753002639466e-05, + "loss": 0.669, + "step": 1151 + }, + { + "epoch": 0.14134969325153374, + "grad_norm": 1.1806158178732498, + "learning_rate": 1.9357352033667524e-05, + "loss": 0.6546, + "step": 1152 + }, + { + "epoch": 0.14147239263803682, + "grad_norm": 1.0202417356577622, + "learning_rate": 1.9355949586782274e-05, + "loss": 0.7006, + "step": 1153 + }, + { + "epoch": 0.14159509202453988, + "grad_norm": 1.0161767634782424, + "learning_rate": 1.935454566220522e-05, + "loss": 0.6092, + "step": 1154 + }, + { + "epoch": 0.14171779141104293, + "grad_norm": 1.2367579637943298, + "learning_rate": 1.9353140260158108e-05, 
+ "loss": 0.6809, + "step": 1155 + }, + { + "epoch": 0.14184049079754601, + "grad_norm": 1.0393245333083427, + "learning_rate": 1.93517333808629e-05, + "loss": 0.635, + "step": 1156 + }, + { + "epoch": 0.14196319018404907, + "grad_norm": 1.5588620259029509, + "learning_rate": 1.93503250245418e-05, + "loss": 0.6761, + "step": 1157 + }, + { + "epoch": 0.14208588957055215, + "grad_norm": 1.1560438219192202, + "learning_rate": 1.9348915191417247e-05, + "loss": 0.6635, + "step": 1158 + }, + { + "epoch": 0.1422085889570552, + "grad_norm": 1.3543620214162047, + "learning_rate": 1.934750388171192e-05, + "loss": 0.6821, + "step": 1159 + }, + { + "epoch": 0.1423312883435583, + "grad_norm": 1.0002321474731735, + "learning_rate": 1.9346091095648712e-05, + "loss": 0.6673, + "step": 1160 + }, + { + "epoch": 0.14245398773006135, + "grad_norm": 0.9638723482699219, + "learning_rate": 1.9344676833450763e-05, + "loss": 0.6535, + "step": 1161 + }, + { + "epoch": 0.1425766871165644, + "grad_norm": 1.0949772691093835, + "learning_rate": 1.934326109534145e-05, + "loss": 0.6026, + "step": 1162 + }, + { + "epoch": 0.1426993865030675, + "grad_norm": 1.4193236802739742, + "learning_rate": 1.9341843881544372e-05, + "loss": 0.6881, + "step": 1163 + }, + { + "epoch": 0.14282208588957054, + "grad_norm": 1.2170739803013249, + "learning_rate": 1.9340425192283364e-05, + "loss": 0.7128, + "step": 1164 + }, + { + "epoch": 0.14294478527607363, + "grad_norm": 1.1781778159190732, + "learning_rate": 1.93390050277825e-05, + "loss": 0.6282, + "step": 1165 + }, + { + "epoch": 0.14306748466257668, + "grad_norm": 1.1521313847261017, + "learning_rate": 1.933758338826608e-05, + "loss": 0.6305, + "step": 1166 + }, + { + "epoch": 0.14319018404907977, + "grad_norm": 1.0392603089860295, + "learning_rate": 1.933616027395864e-05, + "loss": 0.6701, + "step": 1167 + }, + { + "epoch": 0.14331288343558282, + "grad_norm": 1.0869398453107566, + "learning_rate": 1.933473568508495e-05, + "loss": 0.5997, + "step": 1168 + }, + { + "epoch": 0.14343558282208588, + "grad_norm": 1.1561928984658152, + "learning_rate": 1.9333309621870012e-05, + "loss": 0.631, + "step": 1169 + }, + { + "epoch": 0.14355828220858896, + "grad_norm": 1.1071476405868665, + "learning_rate": 1.9331882084539056e-05, + "loss": 0.631, + "step": 1170 + }, + { + "epoch": 0.14368098159509202, + "grad_norm": 1.107684271039044, + "learning_rate": 1.933045307331755e-05, + "loss": 0.6885, + "step": 1171 + }, + { + "epoch": 0.1438036809815951, + "grad_norm": 1.0500401831450503, + "learning_rate": 1.9329022588431204e-05, + "loss": 0.6307, + "step": 1172 + }, + { + "epoch": 0.14392638036809816, + "grad_norm": 1.186111233492873, + "learning_rate": 1.932759063010594e-05, + "loss": 0.7203, + "step": 1173 + }, + { + "epoch": 0.14404907975460124, + "grad_norm": 1.1993326510355362, + "learning_rate": 1.9326157198567925e-05, + "loss": 0.5972, + "step": 1174 + }, + { + "epoch": 0.1441717791411043, + "grad_norm": 1.1560316545219804, + "learning_rate": 1.932472229404356e-05, + "loss": 0.7221, + "step": 1175 + }, + { + "epoch": 0.14429447852760735, + "grad_norm": 1.2547634331608781, + "learning_rate": 1.9323285916759477e-05, + "loss": 0.645, + "step": 1176 + }, + { + "epoch": 0.14441717791411043, + "grad_norm": 1.0327792461021819, + "learning_rate": 1.932184806694253e-05, + "loss": 0.6547, + "step": 1177 + }, + { + "epoch": 0.1445398773006135, + "grad_norm": 1.0471875050879365, + "learning_rate": 1.932040874481983e-05, + "loss": 0.6395, + "step": 1178 + }, + { + "epoch": 0.14466257668711657, + 
"grad_norm": 1.0972682066519721, + "learning_rate": 1.9318967950618696e-05, + "loss": 0.6549, + "step": 1179 + }, + { + "epoch": 0.14478527607361963, + "grad_norm": 1.5772811864517446, + "learning_rate": 1.9317525684566686e-05, + "loss": 0.6219, + "step": 1180 + }, + { + "epoch": 0.1449079754601227, + "grad_norm": 1.3219522993031223, + "learning_rate": 1.9316081946891604e-05, + "loss": 0.6553, + "step": 1181 + }, + { + "epoch": 0.14503067484662577, + "grad_norm": 1.2302247515421565, + "learning_rate": 1.9314636737821467e-05, + "loss": 0.7144, + "step": 1182 + }, + { + "epoch": 0.14515337423312882, + "grad_norm": 1.1951298990471388, + "learning_rate": 1.9313190057584538e-05, + "loss": 0.6165, + "step": 1183 + }, + { + "epoch": 0.1452760736196319, + "grad_norm": 1.0266999665424787, + "learning_rate": 1.9311741906409302e-05, + "loss": 0.6134, + "step": 1184 + }, + { + "epoch": 0.14539877300613496, + "grad_norm": 1.0819678275085036, + "learning_rate": 1.931029228452449e-05, + "loss": 0.6245, + "step": 1185 + }, + { + "epoch": 0.14552147239263805, + "grad_norm": 1.0499455779620759, + "learning_rate": 1.9308841192159056e-05, + "loss": 0.63, + "step": 1186 + }, + { + "epoch": 0.1456441717791411, + "grad_norm": 1.1481834602326686, + "learning_rate": 1.9307388629542183e-05, + "loss": 0.711, + "step": 1187 + }, + { + "epoch": 0.14576687116564419, + "grad_norm": 1.1172417518416558, + "learning_rate": 1.9305934596903292e-05, + "loss": 0.6134, + "step": 1188 + }, + { + "epoch": 0.14588957055214724, + "grad_norm": 1.1395707466871778, + "learning_rate": 1.9304479094472038e-05, + "loss": 0.6054, + "step": 1189 + }, + { + "epoch": 0.1460122699386503, + "grad_norm": 1.2302967699867216, + "learning_rate": 1.9303022122478303e-05, + "loss": 0.7321, + "step": 1190 + }, + { + "epoch": 0.14613496932515338, + "grad_norm": 1.048045204710245, + "learning_rate": 1.93015636811522e-05, + "loss": 0.6399, + "step": 1191 + }, + { + "epoch": 0.14625766871165644, + "grad_norm": 1.1718376696028647, + "learning_rate": 1.9300103770724084e-05, + "loss": 0.6665, + "step": 1192 + }, + { + "epoch": 0.14638036809815952, + "grad_norm": 1.052565005000713, + "learning_rate": 1.929864239142453e-05, + "loss": 0.638, + "step": 1193 + }, + { + "epoch": 0.14650306748466257, + "grad_norm": 1.1315516890076935, + "learning_rate": 1.9297179543484353e-05, + "loss": 0.6271, + "step": 1194 + }, + { + "epoch": 0.14662576687116563, + "grad_norm": 1.162096115496984, + "learning_rate": 1.9295715227134595e-05, + "loss": 0.7005, + "step": 1195 + }, + { + "epoch": 0.1467484662576687, + "grad_norm": 1.0962469491190359, + "learning_rate": 1.9294249442606533e-05, + "loss": 0.7265, + "step": 1196 + }, + { + "epoch": 0.14687116564417177, + "grad_norm": 1.0352843320376774, + "learning_rate": 1.9292782190131677e-05, + "loss": 0.6346, + "step": 1197 + }, + { + "epoch": 0.14699386503067485, + "grad_norm": 1.0264634367429608, + "learning_rate": 1.9291313469941765e-05, + "loss": 0.6647, + "step": 1198 + }, + { + "epoch": 0.1471165644171779, + "grad_norm": 1.140808149264658, + "learning_rate": 1.9289843282268773e-05, + "loss": 0.6445, + "step": 1199 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 0.9835239812049381, + "learning_rate": 1.9288371627344894e-05, + "loss": 0.6314, + "step": 1200 + }, + { + "epoch": 0.14736196319018405, + "grad_norm": 1.1277862840839994, + "learning_rate": 1.9286898505402575e-05, + "loss": 0.661, + "step": 1201 + }, + { + "epoch": 0.1474846625766871, + "grad_norm": 1.420113847519041, + "learning_rate": 1.9285423916674478e-05, + 
"loss": 0.7082, + "step": 1202 + }, + { + "epoch": 0.1476073619631902, + "grad_norm": 1.0447164481413855, + "learning_rate": 1.92839478613935e-05, + "loss": 0.6576, + "step": 1203 + }, + { + "epoch": 0.14773006134969324, + "grad_norm": 0.9753312118030911, + "learning_rate": 1.928247033979277e-05, + "loss": 0.6284, + "step": 1204 + }, + { + "epoch": 0.14785276073619633, + "grad_norm": 1.1124682834726376, + "learning_rate": 1.9280991352105656e-05, + "loss": 0.657, + "step": 1205 + }, + { + "epoch": 0.14797546012269938, + "grad_norm": 0.9597049589677273, + "learning_rate": 1.927951089856575e-05, + "loss": 0.6089, + "step": 1206 + }, + { + "epoch": 0.14809815950920246, + "grad_norm": 0.9566749991595269, + "learning_rate": 1.927802897940687e-05, + "loss": 0.6264, + "step": 1207 + }, + { + "epoch": 0.14822085889570552, + "grad_norm": 1.180142158587175, + "learning_rate": 1.927654559486308e-05, + "loss": 0.6458, + "step": 1208 + }, + { + "epoch": 0.14834355828220858, + "grad_norm": 1.1725330813918322, + "learning_rate": 1.9275060745168662e-05, + "loss": 0.6333, + "step": 1209 + }, + { + "epoch": 0.14846625766871166, + "grad_norm": 1.22123254645114, + "learning_rate": 1.9273574430558143e-05, + "loss": 0.7546, + "step": 1210 + }, + { + "epoch": 0.14858895705521472, + "grad_norm": 1.0123560781536645, + "learning_rate": 1.927208665126627e-05, + "loss": 0.658, + "step": 1211 + }, + { + "epoch": 0.1487116564417178, + "grad_norm": 1.213443131996196, + "learning_rate": 1.927059740752802e-05, + "loss": 0.6285, + "step": 1212 + }, + { + "epoch": 0.14883435582822085, + "grad_norm": 1.0036038583648563, + "learning_rate": 1.9269106699578614e-05, + "loss": 0.6309, + "step": 1213 + }, + { + "epoch": 0.14895705521472394, + "grad_norm": 1.0471876224213876, + "learning_rate": 1.926761452765349e-05, + "loss": 0.6092, + "step": 1214 + }, + { + "epoch": 0.149079754601227, + "grad_norm": 1.1155562407322142, + "learning_rate": 1.9266120891988326e-05, + "loss": 0.6902, + "step": 1215 + }, + { + "epoch": 0.14920245398773005, + "grad_norm": 1.1556443025037277, + "learning_rate": 1.9264625792819027e-05, + "loss": 0.6383, + "step": 1216 + }, + { + "epoch": 0.14932515337423313, + "grad_norm": 1.3657821848713174, + "learning_rate": 1.9263129230381735e-05, + "loss": 0.6587, + "step": 1217 + }, + { + "epoch": 0.1494478527607362, + "grad_norm": 1.105652718335007, + "learning_rate": 1.926163120491282e-05, + "loss": 0.6661, + "step": 1218 + }, + { + "epoch": 0.14957055214723927, + "grad_norm": 0.9752483821434931, + "learning_rate": 1.926013171664888e-05, + "loss": 0.6378, + "step": 1219 + }, + { + "epoch": 0.14969325153374233, + "grad_norm": 1.0290851174662548, + "learning_rate": 1.925863076582674e-05, + "loss": 0.749, + "step": 1220 + }, + { + "epoch": 0.1498159509202454, + "grad_norm": 0.974495093495109, + "learning_rate": 1.9257128352683478e-05, + "loss": 0.6096, + "step": 1221 + }, + { + "epoch": 0.14993865030674847, + "grad_norm": 1.100898596410231, + "learning_rate": 1.925562447745637e-05, + "loss": 0.6168, + "step": 1222 + }, + { + "epoch": 0.15006134969325152, + "grad_norm": 1.0795208509225862, + "learning_rate": 1.9254119140382952e-05, + "loss": 0.6806, + "step": 1223 + }, + { + "epoch": 0.1501840490797546, + "grad_norm": 1.1158539596937072, + "learning_rate": 1.9252612341700974e-05, + "loss": 0.5885, + "step": 1224 + }, + { + "epoch": 0.15030674846625766, + "grad_norm": 1.0879575436495534, + "learning_rate": 1.9251104081648423e-05, + "loss": 0.6679, + "step": 1225 + }, + { + "epoch": 0.15042944785276074, + "grad_norm": 
0.9904545193640225, + "learning_rate": 1.9249594360463514e-05, + "loss": 0.6721, + "step": 1226 + }, + { + "epoch": 0.1505521472392638, + "grad_norm": 1.077580220195873, + "learning_rate": 1.9248083178384695e-05, + "loss": 0.652, + "step": 1227 + }, + { + "epoch": 0.15067484662576688, + "grad_norm": 1.0755562542916612, + "learning_rate": 1.9246570535650647e-05, + "loss": 0.6497, + "step": 1228 + }, + { + "epoch": 0.15079754601226994, + "grad_norm": 1.0725552689132105, + "learning_rate": 1.9245056432500277e-05, + "loss": 0.6562, + "step": 1229 + }, + { + "epoch": 0.150920245398773, + "grad_norm": 1.4944753699575168, + "learning_rate": 1.9243540869172724e-05, + "loss": 0.6494, + "step": 1230 + }, + { + "epoch": 0.15104294478527608, + "grad_norm": 1.1070603532114147, + "learning_rate": 1.9242023845907362e-05, + "loss": 0.6575, + "step": 1231 + }, + { + "epoch": 0.15116564417177913, + "grad_norm": 0.9020834989241775, + "learning_rate": 1.9240505362943782e-05, + "loss": 0.6083, + "step": 1232 + }, + { + "epoch": 0.15128834355828222, + "grad_norm": 1.1215749770637213, + "learning_rate": 1.9238985420521828e-05, + "loss": 0.6241, + "step": 1233 + }, + { + "epoch": 0.15141104294478527, + "grad_norm": 1.235120656669961, + "learning_rate": 1.9237464018881552e-05, + "loss": 0.6729, + "step": 1234 + }, + { + "epoch": 0.15153374233128836, + "grad_norm": 0.9700911892925566, + "learning_rate": 1.9235941158263253e-05, + "loss": 0.6235, + "step": 1235 + }, + { + "epoch": 0.1516564417177914, + "grad_norm": 1.0077058137974826, + "learning_rate": 1.923441683890745e-05, + "loss": 0.66, + "step": 1236 + }, + { + "epoch": 0.15177914110429447, + "grad_norm": 1.061135785273971, + "learning_rate": 1.9232891061054896e-05, + "loss": 0.6357, + "step": 1237 + }, + { + "epoch": 0.15190184049079755, + "grad_norm": 1.2534011087986974, + "learning_rate": 1.9231363824946572e-05, + "loss": 0.5916, + "step": 1238 + }, + { + "epoch": 0.1520245398773006, + "grad_norm": 0.9762325030252198, + "learning_rate": 1.9229835130823703e-05, + "loss": 0.6431, + "step": 1239 + }, + { + "epoch": 0.1521472392638037, + "grad_norm": 0.9838893278848946, + "learning_rate": 1.922830497892772e-05, + "loss": 0.6358, + "step": 1240 + }, + { + "epoch": 0.15226993865030675, + "grad_norm": 1.1025267763748257, + "learning_rate": 1.9226773369500305e-05, + "loss": 0.7166, + "step": 1241 + }, + { + "epoch": 0.1523926380368098, + "grad_norm": 0.9688955292976925, + "learning_rate": 1.9225240302783358e-05, + "loss": 0.6148, + "step": 1242 + }, + { + "epoch": 0.15251533742331289, + "grad_norm": 1.1209628853797744, + "learning_rate": 1.922370577901902e-05, + "loss": 0.5942, + "step": 1243 + }, + { + "epoch": 0.15263803680981594, + "grad_norm": 1.057600611938339, + "learning_rate": 1.9222169798449647e-05, + "loss": 0.6344, + "step": 1244 + }, + { + "epoch": 0.15276073619631902, + "grad_norm": 1.1162719583272593, + "learning_rate": 1.9220632361317843e-05, + "loss": 0.6176, + "step": 1245 + }, + { + "epoch": 0.15288343558282208, + "grad_norm": 0.9947901794137003, + "learning_rate": 1.9219093467866425e-05, + "loss": 0.6569, + "step": 1246 + }, + { + "epoch": 0.15300613496932516, + "grad_norm": 1.537397535525037, + "learning_rate": 1.9217553118338453e-05, + "loss": 0.654, + "step": 1247 + }, + { + "epoch": 0.15312883435582822, + "grad_norm": 0.8954538002785088, + "learning_rate": 1.921601131297721e-05, + "loss": 0.6016, + "step": 1248 + }, + { + "epoch": 0.15325153374233128, + "grad_norm": 1.1178148386425555, + "learning_rate": 1.9214468052026213e-05, + "loss": 
0.6841, + "step": 1249 + }, + { + "epoch": 0.15337423312883436, + "grad_norm": 1.0573990958741348, + "learning_rate": 1.9212923335729206e-05, + "loss": 0.67, + "step": 1250 + }, + { + "epoch": 0.15349693251533741, + "grad_norm": 1.0647987533960044, + "learning_rate": 1.921137716433016e-05, + "loss": 0.5928, + "step": 1251 + }, + { + "epoch": 0.1536196319018405, + "grad_norm": 1.1364970565038606, + "learning_rate": 1.9209829538073285e-05, + "loss": 0.7272, + "step": 1252 + }, + { + "epoch": 0.15374233128834355, + "grad_norm": 1.0114525749373078, + "learning_rate": 1.9208280457203014e-05, + "loss": 0.6057, + "step": 1253 + }, + { + "epoch": 0.15386503067484664, + "grad_norm": 1.2527907460139247, + "learning_rate": 1.9206729921964006e-05, + "loss": 0.6432, + "step": 1254 + }, + { + "epoch": 0.1539877300613497, + "grad_norm": 1.2360336786831136, + "learning_rate": 1.920517793260116e-05, + "loss": 0.65, + "step": 1255 + }, + { + "epoch": 0.15411042944785275, + "grad_norm": 1.015118294306023, + "learning_rate": 1.92036244893596e-05, + "loss": 0.5985, + "step": 1256 + }, + { + "epoch": 0.15423312883435583, + "grad_norm": 1.0419345321913067, + "learning_rate": 1.9202069592484675e-05, + "loss": 0.5741, + "step": 1257 + }, + { + "epoch": 0.1543558282208589, + "grad_norm": 1.12554222806204, + "learning_rate": 1.920051324222197e-05, + "loss": 0.613, + "step": 1258 + }, + { + "epoch": 0.15447852760736197, + "grad_norm": 0.9975230322968243, + "learning_rate": 1.9198955438817294e-05, + "loss": 0.6232, + "step": 1259 + }, + { + "epoch": 0.15460122699386503, + "grad_norm": 1.111221534657519, + "learning_rate": 1.9197396182516694e-05, + "loss": 0.6656, + "step": 1260 + }, + { + "epoch": 0.1547239263803681, + "grad_norm": 0.996616285954016, + "learning_rate": 1.9195835473566437e-05, + "loss": 0.6361, + "step": 1261 + }, + { + "epoch": 0.15484662576687117, + "grad_norm": 0.957896300437858, + "learning_rate": 1.9194273312213027e-05, + "loss": 0.5803, + "step": 1262 + }, + { + "epoch": 0.15496932515337422, + "grad_norm": 0.994783962080064, + "learning_rate": 1.919270969870319e-05, + "loss": 0.5845, + "step": 1263 + }, + { + "epoch": 0.1550920245398773, + "grad_norm": 1.0295334553740672, + "learning_rate": 1.919114463328389e-05, + "loss": 0.6976, + "step": 1264 + }, + { + "epoch": 0.15521472392638036, + "grad_norm": 1.418008946067881, + "learning_rate": 1.918957811620231e-05, + "loss": 0.6667, + "step": 1265 + }, + { + "epoch": 0.15533742331288344, + "grad_norm": 0.9780036249695209, + "learning_rate": 1.918801014770587e-05, + "loss": 0.6449, + "step": 1266 + }, + { + "epoch": 0.1554601226993865, + "grad_norm": 1.0089399038895142, + "learning_rate": 1.918644072804222e-05, + "loss": 0.6491, + "step": 1267 + }, + { + "epoch": 0.15558282208588958, + "grad_norm": 0.9823714224733027, + "learning_rate": 1.9184869857459233e-05, + "loss": 0.6061, + "step": 1268 + }, + { + "epoch": 0.15570552147239264, + "grad_norm": 1.3474333845584638, + "learning_rate": 1.9183297536205013e-05, + "loss": 0.6602, + "step": 1269 + }, + { + "epoch": 0.1558282208588957, + "grad_norm": 1.0007825587834922, + "learning_rate": 1.9181723764527902e-05, + "loss": 0.6354, + "step": 1270 + }, + { + "epoch": 0.15595092024539878, + "grad_norm": 0.9991475779260988, + "learning_rate": 1.9180148542676456e-05, + "loss": 0.693, + "step": 1271 + }, + { + "epoch": 0.15607361963190183, + "grad_norm": 1.1379521790362888, + "learning_rate": 1.9178571870899473e-05, + "loss": 0.608, + "step": 1272 + }, + { + "epoch": 0.15619631901840492, + "grad_norm": 
1.0583562570488896, + "learning_rate": 1.917699374944597e-05, + "loss": 0.6132, + "step": 1273 + }, + { + "epoch": 0.15631901840490797, + "grad_norm": 1.0296956559585861, + "learning_rate": 1.9175414178565205e-05, + "loss": 0.6573, + "step": 1274 + }, + { + "epoch": 0.15644171779141106, + "grad_norm": 1.3287215000840424, + "learning_rate": 1.917383315850665e-05, + "loss": 0.5924, + "step": 1275 + }, + { + "epoch": 0.1565644171779141, + "grad_norm": 1.0701990144876365, + "learning_rate": 1.9172250689520018e-05, + "loss": 0.6594, + "step": 1276 + }, + { + "epoch": 0.15668711656441717, + "grad_norm": 0.9530851678588385, + "learning_rate": 1.9170666771855242e-05, + "loss": 0.6629, + "step": 1277 + }, + { + "epoch": 0.15680981595092025, + "grad_norm": 1.1153194923499403, + "learning_rate": 1.9169081405762494e-05, + "loss": 0.6541, + "step": 1278 + }, + { + "epoch": 0.1569325153374233, + "grad_norm": 1.0510204683709645, + "learning_rate": 1.9167494591492163e-05, + "loss": 0.6193, + "step": 1279 + }, + { + "epoch": 0.1570552147239264, + "grad_norm": 1.2976511368691974, + "learning_rate": 1.9165906329294875e-05, + "loss": 0.6385, + "step": 1280 + }, + { + "epoch": 0.15717791411042945, + "grad_norm": 1.0256407506929774, + "learning_rate": 1.9164316619421485e-05, + "loss": 0.6155, + "step": 1281 + }, + { + "epoch": 0.15730061349693253, + "grad_norm": 1.1124970138640253, + "learning_rate": 1.9162725462123074e-05, + "loss": 0.62, + "step": 1282 + }, + { + "epoch": 0.15742331288343558, + "grad_norm": 1.0954075297257626, + "learning_rate": 1.9161132857650945e-05, + "loss": 0.6245, + "step": 1283 + }, + { + "epoch": 0.15754601226993864, + "grad_norm": 1.0331112590145344, + "learning_rate": 1.9159538806256645e-05, + "loss": 0.6264, + "step": 1284 + }, + { + "epoch": 0.15766871165644172, + "grad_norm": 1.2672864529171215, + "learning_rate": 1.9157943308191934e-05, + "loss": 0.7175, + "step": 1285 + }, + { + "epoch": 0.15779141104294478, + "grad_norm": 1.091559997139023, + "learning_rate": 1.915634636370881e-05, + "loss": 0.5737, + "step": 1286 + }, + { + "epoch": 0.15791411042944786, + "grad_norm": 1.148318017737345, + "learning_rate": 1.9154747973059496e-05, + "loss": 0.6577, + "step": 1287 + }, + { + "epoch": 0.15803680981595092, + "grad_norm": 1.05441969456612, + "learning_rate": 1.915314813649645e-05, + "loss": 0.6466, + "step": 1288 + }, + { + "epoch": 0.15815950920245397, + "grad_norm": 1.0204437923716771, + "learning_rate": 1.9151546854272335e-05, + "loss": 0.6663, + "step": 1289 + }, + { + "epoch": 0.15828220858895706, + "grad_norm": 1.1589464747998894, + "learning_rate": 1.914994412664008e-05, + "loss": 0.6456, + "step": 1290 + }, + { + "epoch": 0.1584049079754601, + "grad_norm": 1.0230166655584045, + "learning_rate": 1.9148339953852812e-05, + "loss": 0.6432, + "step": 1291 + }, + { + "epoch": 0.1585276073619632, + "grad_norm": 1.2011698595109113, + "learning_rate": 1.9146734336163898e-05, + "loss": 0.6557, + "step": 1292 + }, + { + "epoch": 0.15865030674846625, + "grad_norm": 1.1032269441129967, + "learning_rate": 1.9145127273826932e-05, + "loss": 0.6084, + "step": 1293 + }, + { + "epoch": 0.15877300613496934, + "grad_norm": 1.6194634367718581, + "learning_rate": 1.9143518767095734e-05, + "loss": 0.6596, + "step": 1294 + }, + { + "epoch": 0.1588957055214724, + "grad_norm": 1.0578355067333396, + "learning_rate": 1.9141908816224356e-05, + "loss": 0.6457, + "step": 1295 + }, + { + "epoch": 0.15901840490797545, + "grad_norm": 1.1288296672966234, + "learning_rate": 1.9140297421467076e-05, + "loss": 
0.6661, + "step": 1296 + }, + { + "epoch": 0.15914110429447853, + "grad_norm": 0.9653016459823468, + "learning_rate": 1.91386845830784e-05, + "loss": 0.6102, + "step": 1297 + }, + { + "epoch": 0.15926380368098159, + "grad_norm": 1.022176389561916, + "learning_rate": 1.9137070301313053e-05, + "loss": 0.5971, + "step": 1298 + }, + { + "epoch": 0.15938650306748467, + "grad_norm": 0.9869379216584135, + "learning_rate": 1.913545457642601e-05, + "loss": 0.6351, + "step": 1299 + }, + { + "epoch": 0.15950920245398773, + "grad_norm": 0.9627933764336752, + "learning_rate": 1.9133837408672456e-05, + "loss": 0.6232, + "step": 1300 + }, + { + "epoch": 0.1596319018404908, + "grad_norm": 1.1708264274819007, + "learning_rate": 1.9132218798307806e-05, + "loss": 0.7125, + "step": 1301 + }, + { + "epoch": 0.15975460122699386, + "grad_norm": 1.0314625431670834, + "learning_rate": 1.9130598745587708e-05, + "loss": 0.6646, + "step": 1302 + }, + { + "epoch": 0.15987730061349692, + "grad_norm": 1.5319335366405242, + "learning_rate": 1.912897725076804e-05, + "loss": 0.6729, + "step": 1303 + }, + { + "epoch": 0.16, + "grad_norm": 1.023675846496323, + "learning_rate": 1.9127354314104892e-05, + "loss": 0.6656, + "step": 1304 + }, + { + "epoch": 0.16012269938650306, + "grad_norm": 1.1468962468995165, + "learning_rate": 1.9125729935854606e-05, + "loss": 0.6115, + "step": 1305 + }, + { + "epoch": 0.16024539877300614, + "grad_norm": 1.045671186962096, + "learning_rate": 1.9124104116273727e-05, + "loss": 0.6257, + "step": 1306 + }, + { + "epoch": 0.1603680981595092, + "grad_norm": 1.1470132854481747, + "learning_rate": 1.9122476855619045e-05, + "loss": 0.5995, + "step": 1307 + }, + { + "epoch": 0.16049079754601228, + "grad_norm": 1.147986965250599, + "learning_rate": 1.9120848154147572e-05, + "loss": 0.6377, + "step": 1308 + }, + { + "epoch": 0.16061349693251534, + "grad_norm": 0.9291063863074406, + "learning_rate": 1.9119218012116546e-05, + "loss": 0.6025, + "step": 1309 + }, + { + "epoch": 0.1607361963190184, + "grad_norm": 1.564562561859563, + "learning_rate": 1.9117586429783433e-05, + "loss": 0.5957, + "step": 1310 + }, + { + "epoch": 0.16085889570552148, + "grad_norm": 1.4569159310236484, + "learning_rate": 1.911595340740593e-05, + "loss": 0.7343, + "step": 1311 + }, + { + "epoch": 0.16098159509202453, + "grad_norm": 1.0755700352158686, + "learning_rate": 1.9114318945241955e-05, + "loss": 0.6461, + "step": 1312 + }, + { + "epoch": 0.16110429447852762, + "grad_norm": 1.1297426916916862, + "learning_rate": 1.9112683043549665e-05, + "loss": 0.5527, + "step": 1313 + }, + { + "epoch": 0.16122699386503067, + "grad_norm": 1.0443605161710514, + "learning_rate": 1.9111045702587426e-05, + "loss": 0.6433, + "step": 1314 + }, + { + "epoch": 0.16134969325153375, + "grad_norm": 1.1079308276105948, + "learning_rate": 1.910940692261385e-05, + "loss": 0.7044, + "step": 1315 + }, + { + "epoch": 0.1614723926380368, + "grad_norm": 1.0749879334582868, + "learning_rate": 1.9107766703887764e-05, + "loss": 0.719, + "step": 1316 + }, + { + "epoch": 0.16159509202453987, + "grad_norm": 1.1165224875597484, + "learning_rate": 1.910612504666823e-05, + "loss": 0.7138, + "step": 1317 + }, + { + "epoch": 0.16171779141104295, + "grad_norm": 1.0754059453439093, + "learning_rate": 1.910448195121453e-05, + "loss": 0.6991, + "step": 1318 + }, + { + "epoch": 0.161840490797546, + "grad_norm": 1.2097051556205833, + "learning_rate": 1.910283741778618e-05, + "loss": 0.5981, + "step": 1319 + }, + { + "epoch": 0.1619631901840491, + "grad_norm": 
1.1580129208834788, + "learning_rate": 1.9101191446642917e-05, + "loss": 0.6609, + "step": 1320 + }, + { + "epoch": 0.16208588957055214, + "grad_norm": 1.0296080274454709, + "learning_rate": 1.909954403804471e-05, + "loss": 0.6034, + "step": 1321 + }, + { + "epoch": 0.16220858895705523, + "grad_norm": 1.0279431118623075, + "learning_rate": 1.9097895192251754e-05, + "loss": 0.6155, + "step": 1322 + }, + { + "epoch": 0.16233128834355828, + "grad_norm": 1.3226652290220782, + "learning_rate": 1.9096244909524465e-05, + "loss": 0.7202, + "step": 1323 + }, + { + "epoch": 0.16245398773006134, + "grad_norm": 1.0559677050495309, + "learning_rate": 1.9094593190123497e-05, + "loss": 0.5837, + "step": 1324 + }, + { + "epoch": 0.16257668711656442, + "grad_norm": 1.2685192987144978, + "learning_rate": 1.909294003430972e-05, + "loss": 0.6603, + "step": 1325 + }, + { + "epoch": 0.16269938650306748, + "grad_norm": 1.0745417210001176, + "learning_rate": 1.9091285442344238e-05, + "loss": 0.6926, + "step": 1326 + }, + { + "epoch": 0.16282208588957056, + "grad_norm": 0.987486934883395, + "learning_rate": 1.9089629414488383e-05, + "loss": 0.6142, + "step": 1327 + }, + { + "epoch": 0.16294478527607362, + "grad_norm": 1.0028351082191747, + "learning_rate": 1.9087971951003705e-05, + "loss": 0.6399, + "step": 1328 + }, + { + "epoch": 0.1630674846625767, + "grad_norm": 1.1228558177168648, + "learning_rate": 1.9086313052151987e-05, + "loss": 0.672, + "step": 1329 + }, + { + "epoch": 0.16319018404907976, + "grad_norm": 1.0255894774940648, + "learning_rate": 1.9084652718195237e-05, + "loss": 0.6073, + "step": 1330 + }, + { + "epoch": 0.1633128834355828, + "grad_norm": 1.1121528816660813, + "learning_rate": 1.9082990949395696e-05, + "loss": 0.6459, + "step": 1331 + }, + { + "epoch": 0.1634355828220859, + "grad_norm": 1.1307966781086376, + "learning_rate": 1.9081327746015822e-05, + "loss": 0.6562, + "step": 1332 + }, + { + "epoch": 0.16355828220858895, + "grad_norm": 3.3047544938293725, + "learning_rate": 1.9079663108318304e-05, + "loss": 0.6671, + "step": 1333 + }, + { + "epoch": 0.16368098159509203, + "grad_norm": 0.9099883109783001, + "learning_rate": 1.9077997036566054e-05, + "loss": 0.6096, + "step": 1334 + }, + { + "epoch": 0.1638036809815951, + "grad_norm": 1.0269214013363734, + "learning_rate": 1.907632953102222e-05, + "loss": 0.624, + "step": 1335 + }, + { + "epoch": 0.16392638036809815, + "grad_norm": 1.0058833525534048, + "learning_rate": 1.9074660591950164e-05, + "loss": 0.6911, + "step": 1336 + }, + { + "epoch": 0.16404907975460123, + "grad_norm": 1.1918855983198338, + "learning_rate": 1.9072990219613486e-05, + "loss": 0.6529, + "step": 1337 + }, + { + "epoch": 0.16417177914110428, + "grad_norm": 1.7850964285307447, + "learning_rate": 1.907131841427601e-05, + "loss": 0.729, + "step": 1338 + }, + { + "epoch": 0.16429447852760737, + "grad_norm": 0.9912011637423788, + "learning_rate": 1.906964517620177e-05, + "loss": 0.6279, + "step": 1339 + }, + { + "epoch": 0.16441717791411042, + "grad_norm": 0.9912490782960592, + "learning_rate": 1.906797050565505e-05, + "loss": 0.6343, + "step": 1340 + }, + { + "epoch": 0.1645398773006135, + "grad_norm": 1.128466066828524, + "learning_rate": 1.9066294402900347e-05, + "loss": 0.6541, + "step": 1341 + }, + { + "epoch": 0.16466257668711656, + "grad_norm": 0.9631487880753643, + "learning_rate": 1.906461686820239e-05, + "loss": 0.6447, + "step": 1342 + }, + { + "epoch": 0.16478527607361962, + "grad_norm": 1.0288771621974293, + "learning_rate": 1.9062937901826128e-05, + "loss": 
0.6743, + "step": 1343 + }, + { + "epoch": 0.1649079754601227, + "grad_norm": 1.2066738184440888, + "learning_rate": 1.9061257504036736e-05, + "loss": 0.6196, + "step": 1344 + }, + { + "epoch": 0.16503067484662576, + "grad_norm": 1.0891076470904124, + "learning_rate": 1.9059575675099622e-05, + "loss": 0.6424, + "step": 1345 + }, + { + "epoch": 0.16515337423312884, + "grad_norm": 1.7655654283151963, + "learning_rate": 1.905789241528042e-05, + "loss": 0.5571, + "step": 1346 + }, + { + "epoch": 0.1652760736196319, + "grad_norm": 1.019119228751647, + "learning_rate": 1.9056207724844983e-05, + "loss": 0.6618, + "step": 1347 + }, + { + "epoch": 0.16539877300613498, + "grad_norm": 1.1825880686000454, + "learning_rate": 1.9054521604059394e-05, + "loss": 0.6723, + "step": 1348 + }, + { + "epoch": 0.16552147239263804, + "grad_norm": 1.0304179546882906, + "learning_rate": 1.9052834053189957e-05, + "loss": 0.6164, + "step": 1349 + }, + { + "epoch": 0.1656441717791411, + "grad_norm": 1.1093414869033615, + "learning_rate": 1.9051145072503216e-05, + "loss": 0.6624, + "step": 1350 + }, + { + "epoch": 0.16576687116564418, + "grad_norm": 1.288722872135433, + "learning_rate": 1.904945466226592e-05, + "loss": 0.6506, + "step": 1351 + }, + { + "epoch": 0.16588957055214723, + "grad_norm": 1.355751745617132, + "learning_rate": 1.904776282274506e-05, + "loss": 0.7186, + "step": 1352 + }, + { + "epoch": 0.16601226993865031, + "grad_norm": 1.146863950283371, + "learning_rate": 1.9046069554207846e-05, + "loss": 0.7126, + "step": 1353 + }, + { + "epoch": 0.16613496932515337, + "grad_norm": 1.1742404121883396, + "learning_rate": 1.904437485692172e-05, + "loss": 0.654, + "step": 1354 + }, + { + "epoch": 0.16625766871165645, + "grad_norm": 1.1138735530750976, + "learning_rate": 1.9042678731154337e-05, + "loss": 0.6159, + "step": 1355 + }, + { + "epoch": 0.1663803680981595, + "grad_norm": 1.1016578586596224, + "learning_rate": 1.9040981177173594e-05, + "loss": 0.6546, + "step": 1356 + }, + { + "epoch": 0.16650306748466256, + "grad_norm": 0.9799572344862445, + "learning_rate": 1.90392821952476e-05, + "loss": 0.649, + "step": 1357 + }, + { + "epoch": 0.16662576687116565, + "grad_norm": 0.9972217834393227, + "learning_rate": 1.9037581785644695e-05, + "loss": 0.6547, + "step": 1358 + }, + { + "epoch": 0.1667484662576687, + "grad_norm": 1.1527181909462005, + "learning_rate": 1.9035879948633442e-05, + "loss": 0.7012, + "step": 1359 + }, + { + "epoch": 0.1668711656441718, + "grad_norm": 1.0155691351870004, + "learning_rate": 1.9034176684482638e-05, + "loss": 0.6612, + "step": 1360 + }, + { + "epoch": 0.16699386503067484, + "grad_norm": 1.1984181590715677, + "learning_rate": 1.903247199346129e-05, + "loss": 0.6869, + "step": 1361 + }, + { + "epoch": 0.16711656441717793, + "grad_norm": 1.1031854127568985, + "learning_rate": 1.903076587583865e-05, + "loss": 0.6406, + "step": 1362 + }, + { + "epoch": 0.16723926380368098, + "grad_norm": 0.9473616367837105, + "learning_rate": 1.902905833188417e-05, + "loss": 0.6129, + "step": 1363 + }, + { + "epoch": 0.16736196319018404, + "grad_norm": 1.1043418523285062, + "learning_rate": 1.902734936186756e-05, + "loss": 0.5963, + "step": 1364 + }, + { + "epoch": 0.16748466257668712, + "grad_norm": 1.1575463618626172, + "learning_rate": 1.9025638966058722e-05, + "loss": 0.6999, + "step": 1365 + }, + { + "epoch": 0.16760736196319018, + "grad_norm": 0.9817410362551493, + "learning_rate": 1.9023927144727807e-05, + "loss": 0.6363, + "step": 1366 + }, + { + "epoch": 0.16773006134969326, + "grad_norm": 
1.0624780336550101, + "learning_rate": 1.9022213898145176e-05, + "loss": 0.6285, + "step": 1367 + }, + { + "epoch": 0.16785276073619632, + "grad_norm": 1.0330731187961077, + "learning_rate": 1.9020499226581428e-05, + "loss": 0.6065, + "step": 1368 + }, + { + "epoch": 0.1679754601226994, + "grad_norm": 0.9115286094206208, + "learning_rate": 1.9018783130307378e-05, + "loss": 0.6169, + "step": 1369 + }, + { + "epoch": 0.16809815950920245, + "grad_norm": 1.0725665450497837, + "learning_rate": 1.901706560959407e-05, + "loss": 0.637, + "step": 1370 + }, + { + "epoch": 0.1682208588957055, + "grad_norm": 0.9166347239458408, + "learning_rate": 1.9015346664712764e-05, + "loss": 0.5791, + "step": 1371 + }, + { + "epoch": 0.1683435582822086, + "grad_norm": 1.0981534530598722, + "learning_rate": 1.9013626295934963e-05, + "loss": 0.6121, + "step": 1372 + }, + { + "epoch": 0.16846625766871165, + "grad_norm": 1.1634592576320224, + "learning_rate": 1.901190450353238e-05, + "loss": 0.7132, + "step": 1373 + }, + { + "epoch": 0.16858895705521473, + "grad_norm": 1.0220017685427876, + "learning_rate": 1.9010181287776958e-05, + "loss": 0.7188, + "step": 1374 + }, + { + "epoch": 0.1687116564417178, + "grad_norm": 1.1190886015316441, + "learning_rate": 1.900845664894086e-05, + "loss": 0.6217, + "step": 1375 + }, + { + "epoch": 0.16883435582822087, + "grad_norm": 1.2314055604539678, + "learning_rate": 1.900673058729649e-05, + "loss": 0.6316, + "step": 1376 + }, + { + "epoch": 0.16895705521472393, + "grad_norm": 1.1814466840469573, + "learning_rate": 1.9005003103116446e-05, + "loss": 0.6962, + "step": 1377 + }, + { + "epoch": 0.16907975460122698, + "grad_norm": 1.03644840354074, + "learning_rate": 1.9003274196673583e-05, + "loss": 0.6352, + "step": 1378 + }, + { + "epoch": 0.16920245398773007, + "grad_norm": 0.9930877142443433, + "learning_rate": 1.900154386824096e-05, + "loss": 0.6259, + "step": 1379 + }, + { + "epoch": 0.16932515337423312, + "grad_norm": 0.9797881984659039, + "learning_rate": 1.8999812118091877e-05, + "loss": 0.6315, + "step": 1380 + }, + { + "epoch": 0.1694478527607362, + "grad_norm": 1.2796740649238265, + "learning_rate": 1.899807894649984e-05, + "loss": 0.6171, + "step": 1381 + }, + { + "epoch": 0.16957055214723926, + "grad_norm": 0.9936654539846095, + "learning_rate": 1.8996344353738587e-05, + "loss": 0.6343, + "step": 1382 + }, + { + "epoch": 0.16969325153374232, + "grad_norm": 1.0079949744099528, + "learning_rate": 1.8994608340082093e-05, + "loss": 0.6083, + "step": 1383 + }, + { + "epoch": 0.1698159509202454, + "grad_norm": 1.1948254211921936, + "learning_rate": 1.8992870905804535e-05, + "loss": 0.6273, + "step": 1384 + }, + { + "epoch": 0.16993865030674846, + "grad_norm": 1.0350967910827102, + "learning_rate": 1.8991132051180332e-05, + "loss": 0.6598, + "step": 1385 + }, + { + "epoch": 0.17006134969325154, + "grad_norm": 1.0972642449925587, + "learning_rate": 1.8989391776484122e-05, + "loss": 0.6492, + "step": 1386 + }, + { + "epoch": 0.1701840490797546, + "grad_norm": 1.5956583281002066, + "learning_rate": 1.8987650081990757e-05, + "loss": 0.718, + "step": 1387 + }, + { + "epoch": 0.17030674846625768, + "grad_norm": 1.0493777892692113, + "learning_rate": 1.8985906967975337e-05, + "loss": 0.5954, + "step": 1388 + }, + { + "epoch": 0.17042944785276073, + "grad_norm": 0.9939056991994644, + "learning_rate": 1.898416243471316e-05, + "loss": 0.6242, + "step": 1389 + }, + { + "epoch": 0.1705521472392638, + "grad_norm": 1.0831093604091313, + "learning_rate": 1.898241648247977e-05, + "loss": 
0.6986, + "step": 1390 + }, + { + "epoch": 0.17067484662576687, + "grad_norm": 1.00554914543931, + "learning_rate": 1.8980669111550917e-05, + "loss": 0.6879, + "step": 1391 + }, + { + "epoch": 0.17079754601226993, + "grad_norm": 0.9180078070727061, + "learning_rate": 1.8978920322202582e-05, + "loss": 0.6279, + "step": 1392 + }, + { + "epoch": 0.170920245398773, + "grad_norm": 0.9536011645955076, + "learning_rate": 1.8977170114710978e-05, + "loss": 0.6395, + "step": 1393 + }, + { + "epoch": 0.17104294478527607, + "grad_norm": 1.099380420879858, + "learning_rate": 1.8975418489352534e-05, + "loss": 0.7186, + "step": 1394 + }, + { + "epoch": 0.17116564417177915, + "grad_norm": 1.0772188862496384, + "learning_rate": 1.8973665446403902e-05, + "loss": 0.6128, + "step": 1395 + }, + { + "epoch": 0.1712883435582822, + "grad_norm": 1.0493187837233606, + "learning_rate": 1.897191098614196e-05, + "loss": 0.642, + "step": 1396 + }, + { + "epoch": 0.17141104294478526, + "grad_norm": 1.2323790721721564, + "learning_rate": 1.8970155108843812e-05, + "loss": 0.6462, + "step": 1397 + }, + { + "epoch": 0.17153374233128835, + "grad_norm": 0.9637215346996354, + "learning_rate": 1.896839781478678e-05, + "loss": 0.638, + "step": 1398 + }, + { + "epoch": 0.1716564417177914, + "grad_norm": 1.0178595069938714, + "learning_rate": 1.8966639104248416e-05, + "loss": 0.6193, + "step": 1399 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 0.9890202623736719, + "learning_rate": 1.8964878977506496e-05, + "loss": 0.6452, + "step": 1400 + }, + { + "epoch": 0.17190184049079754, + "grad_norm": 1.0295358894119873, + "learning_rate": 1.896311743483901e-05, + "loss": 0.624, + "step": 1401 + }, + { + "epoch": 0.17202453987730063, + "grad_norm": 1.1059064474094498, + "learning_rate": 1.8961354476524187e-05, + "loss": 0.663, + "step": 1402 + }, + { + "epoch": 0.17214723926380368, + "grad_norm": 1.1456655637825888, + "learning_rate": 1.8959590102840465e-05, + "loss": 0.6778, + "step": 1403 + }, + { + "epoch": 0.17226993865030674, + "grad_norm": 1.2959164795225695, + "learning_rate": 1.895782431406651e-05, + "loss": 0.6726, + "step": 1404 + }, + { + "epoch": 0.17239263803680982, + "grad_norm": 1.1718218697580496, + "learning_rate": 1.895605711048122e-05, + "loss": 0.7312, + "step": 1405 + }, + { + "epoch": 0.17251533742331288, + "grad_norm": 1.1555104765762754, + "learning_rate": 1.8954288492363708e-05, + "loss": 0.6008, + "step": 1406 + }, + { + "epoch": 0.17263803680981596, + "grad_norm": 1.235533437387532, + "learning_rate": 1.89525184599933e-05, + "loss": 0.6168, + "step": 1407 + }, + { + "epoch": 0.17276073619631901, + "grad_norm": 1.079952873808466, + "learning_rate": 1.8950747013649577e-05, + "loss": 0.6219, + "step": 1408 + }, + { + "epoch": 0.1728834355828221, + "grad_norm": 1.023092044552521, + "learning_rate": 1.894897415361231e-05, + "loss": 0.624, + "step": 1409 + }, + { + "epoch": 0.17300613496932515, + "grad_norm": 1.0326089422128235, + "learning_rate": 1.8947199880161515e-05, + "loss": 0.6264, + "step": 1410 + }, + { + "epoch": 0.1731288343558282, + "grad_norm": 1.4016599388977633, + "learning_rate": 1.8945424193577415e-05, + "loss": 0.7095, + "step": 1411 + }, + { + "epoch": 0.1732515337423313, + "grad_norm": 1.107474471825156, + "learning_rate": 1.8943647094140473e-05, + "loss": 0.6638, + "step": 1412 + }, + { + "epoch": 0.17337423312883435, + "grad_norm": 1.1144447394306207, + "learning_rate": 1.8941868582131357e-05, + "loss": 0.7106, + "step": 1413 + }, + { + "epoch": 0.17349693251533743, + "grad_norm": 
0.9302117074112858, + "learning_rate": 1.8940088657830977e-05, + "loss": 0.5965, + "step": 1414 + }, + { + "epoch": 0.1736196319018405, + "grad_norm": 1.0119596803276896, + "learning_rate": 1.8938307321520453e-05, + "loss": 0.6223, + "step": 1415 + }, + { + "epoch": 0.17374233128834357, + "grad_norm": 1.0540068443666697, + "learning_rate": 1.893652457348113e-05, + "loss": 0.7161, + "step": 1416 + }, + { + "epoch": 0.17386503067484663, + "grad_norm": 0.961007057955622, + "learning_rate": 1.8934740413994576e-05, + "loss": 0.6356, + "step": 1417 + }, + { + "epoch": 0.17398773006134968, + "grad_norm": 1.2305145800761808, + "learning_rate": 1.893295484334259e-05, + "loss": 0.682, + "step": 1418 + }, + { + "epoch": 0.17411042944785277, + "grad_norm": 1.2555600664280966, + "learning_rate": 1.8931167861807186e-05, + "loss": 0.6131, + "step": 1419 + }, + { + "epoch": 0.17423312883435582, + "grad_norm": 1.0608847265539552, + "learning_rate": 1.89293794696706e-05, + "loss": 0.6777, + "step": 1420 + }, + { + "epoch": 0.1743558282208589, + "grad_norm": 1.1111853574280284, + "learning_rate": 1.8927589667215294e-05, + "loss": 0.6036, + "step": 1421 + }, + { + "epoch": 0.17447852760736196, + "grad_norm": 1.07094888589105, + "learning_rate": 1.892579845472395e-05, + "loss": 0.6982, + "step": 1422 + }, + { + "epoch": 0.17460122699386504, + "grad_norm": 0.918287040100538, + "learning_rate": 1.892400583247948e-05, + "loss": 0.6484, + "step": 1423 + }, + { + "epoch": 0.1747239263803681, + "grad_norm": 1.0146523367170484, + "learning_rate": 1.8922211800765006e-05, + "loss": 0.5668, + "step": 1424 + }, + { + "epoch": 0.17484662576687116, + "grad_norm": 1.114731620912284, + "learning_rate": 1.8920416359863885e-05, + "loss": 0.5834, + "step": 1425 + }, + { + "epoch": 0.17496932515337424, + "grad_norm": 1.1205592649112053, + "learning_rate": 1.8918619510059694e-05, + "loss": 0.6171, + "step": 1426 + }, + { + "epoch": 0.1750920245398773, + "grad_norm": 1.0500271303670539, + "learning_rate": 1.891682125163622e-05, + "loss": 0.6404, + "step": 1427 + }, + { + "epoch": 0.17521472392638038, + "grad_norm": 1.0113250533560598, + "learning_rate": 1.8915021584877492e-05, + "loss": 0.6581, + "step": 1428 + }, + { + "epoch": 0.17533742331288343, + "grad_norm": 1.0664341088242846, + "learning_rate": 1.8913220510067745e-05, + "loss": 0.5644, + "step": 1429 + }, + { + "epoch": 0.1754601226993865, + "grad_norm": 1.0205763012642173, + "learning_rate": 1.8911418027491453e-05, + "loss": 0.6164, + "step": 1430 + }, + { + "epoch": 0.17558282208588957, + "grad_norm": 1.0969012760711379, + "learning_rate": 1.8909614137433292e-05, + "loss": 0.5737, + "step": 1431 + }, + { + "epoch": 0.17570552147239263, + "grad_norm": 1.0524974404845895, + "learning_rate": 1.8907808840178178e-05, + "loss": 0.6202, + "step": 1432 + }, + { + "epoch": 0.1758282208588957, + "grad_norm": 1.0314688266708447, + "learning_rate": 1.8906002136011235e-05, + "loss": 0.6487, + "step": 1433 + }, + { + "epoch": 0.17595092024539877, + "grad_norm": 0.8998038129811143, + "learning_rate": 1.8904194025217822e-05, + "loss": 0.6341, + "step": 1434 + }, + { + "epoch": 0.17607361963190185, + "grad_norm": 1.3187476702795675, + "learning_rate": 1.8902384508083518e-05, + "loss": 0.6437, + "step": 1435 + }, + { + "epoch": 0.1761963190184049, + "grad_norm": 1.7193042541855272, + "learning_rate": 1.8900573584894112e-05, + "loss": 0.7135, + "step": 1436 + }, + { + "epoch": 0.17631901840490796, + "grad_norm": 1.0515158444002584, + "learning_rate": 1.889876125593563e-05, + "loss": 
0.6199, + "step": 1437 + }, + { + "epoch": 0.17644171779141105, + "grad_norm": 1.0272540080633432, + "learning_rate": 1.889694752149431e-05, + "loss": 0.6592, + "step": 1438 + }, + { + "epoch": 0.1765644171779141, + "grad_norm": 1.2656438911558343, + "learning_rate": 1.8895132381856617e-05, + "loss": 0.6639, + "step": 1439 + }, + { + "epoch": 0.17668711656441718, + "grad_norm": 1.3109724585566682, + "learning_rate": 1.8893315837309235e-05, + "loss": 0.6664, + "step": 1440 + }, + { + "epoch": 0.17680981595092024, + "grad_norm": 0.977440915230305, + "learning_rate": 1.889149788813908e-05, + "loss": 0.6086, + "step": 1441 + }, + { + "epoch": 0.17693251533742332, + "grad_norm": 0.9629847275099348, + "learning_rate": 1.888967853463327e-05, + "loss": 0.6024, + "step": 1442 + }, + { + "epoch": 0.17705521472392638, + "grad_norm": 1.1060788950873015, + "learning_rate": 1.8887857777079164e-05, + "loss": 0.6423, + "step": 1443 + }, + { + "epoch": 0.17717791411042944, + "grad_norm": 0.9850148911233516, + "learning_rate": 1.8886035615764334e-05, + "loss": 0.6135, + "step": 1444 + }, + { + "epoch": 0.17730061349693252, + "grad_norm": 1.0966027083179832, + "learning_rate": 1.8884212050976568e-05, + "loss": 0.6729, + "step": 1445 + }, + { + "epoch": 0.17742331288343557, + "grad_norm": 0.936730334725744, + "learning_rate": 1.888238708300389e-05, + "loss": 0.6248, + "step": 1446 + }, + { + "epoch": 0.17754601226993866, + "grad_norm": 1.0345442717094677, + "learning_rate": 1.8880560712134535e-05, + "loss": 0.6517, + "step": 1447 + }, + { + "epoch": 0.1776687116564417, + "grad_norm": 0.9860363651195594, + "learning_rate": 1.8878732938656962e-05, + "loss": 0.6008, + "step": 1448 + }, + { + "epoch": 0.1777914110429448, + "grad_norm": 1.0636684087290753, + "learning_rate": 1.8876903762859858e-05, + "loss": 0.6416, + "step": 1449 + }, + { + "epoch": 0.17791411042944785, + "grad_norm": 1.0363159083139795, + "learning_rate": 1.8875073185032116e-05, + "loss": 0.6331, + "step": 1450 + }, + { + "epoch": 0.1780368098159509, + "grad_norm": 1.1389555410510566, + "learning_rate": 1.8873241205462864e-05, + "loss": 0.6101, + "step": 1451 + }, + { + "epoch": 0.178159509202454, + "grad_norm": 1.0240497193511597, + "learning_rate": 1.8871407824441453e-05, + "loss": 0.6251, + "step": 1452 + }, + { + "epoch": 0.17828220858895705, + "grad_norm": 1.1572586061169585, + "learning_rate": 1.886957304225744e-05, + "loss": 0.6744, + "step": 1453 + }, + { + "epoch": 0.17840490797546013, + "grad_norm": 1.0751190316555173, + "learning_rate": 1.886773685920062e-05, + "loss": 0.615, + "step": 1454 + }, + { + "epoch": 0.1785276073619632, + "grad_norm": 1.2124276814120438, + "learning_rate": 1.8865899275561003e-05, + "loss": 0.6199, + "step": 1455 + }, + { + "epoch": 0.17865030674846627, + "grad_norm": 1.1094482748391048, + "learning_rate": 1.886406029162881e-05, + "loss": 0.5838, + "step": 1456 + }, + { + "epoch": 0.17877300613496933, + "grad_norm": 1.161046046987915, + "learning_rate": 1.8862219907694505e-05, + "loss": 0.6559, + "step": 1457 + }, + { + "epoch": 0.17889570552147238, + "grad_norm": 1.0692519188735758, + "learning_rate": 1.8860378124048754e-05, + "loss": 0.6063, + "step": 1458 + }, + { + "epoch": 0.17901840490797546, + "grad_norm": 1.0741135918997478, + "learning_rate": 1.8858534940982456e-05, + "loss": 0.6534, + "step": 1459 + }, + { + "epoch": 0.17914110429447852, + "grad_norm": 1.1041807533170904, + "learning_rate": 1.885669035878672e-05, + "loss": 0.6774, + "step": 1460 + }, + { + "epoch": 0.1792638036809816, + "grad_norm": 
1.1643287609012205, + "learning_rate": 1.885484437775288e-05, + "loss": 0.5928, + "step": 1461 + }, + { + "epoch": 0.17938650306748466, + "grad_norm": 1.1041635198029052, + "learning_rate": 1.8852996998172502e-05, + "loss": 0.6599, + "step": 1462 + }, + { + "epoch": 0.17950920245398774, + "grad_norm": 1.003584301615833, + "learning_rate": 1.8851148220337357e-05, + "loss": 0.626, + "step": 1463 + }, + { + "epoch": 0.1796319018404908, + "grad_norm": 1.3329540591366644, + "learning_rate": 1.8849298044539446e-05, + "loss": 0.6502, + "step": 1464 + }, + { + "epoch": 0.17975460122699385, + "grad_norm": 1.0912036912515746, + "learning_rate": 1.8847446471070985e-05, + "loss": 0.6128, + "step": 1465 + }, + { + "epoch": 0.17987730061349694, + "grad_norm": 0.9771634633290931, + "learning_rate": 1.8845593500224416e-05, + "loss": 0.607, + "step": 1466 + }, + { + "epoch": 0.18, + "grad_norm": 1.0302020008734454, + "learning_rate": 1.8843739132292405e-05, + "loss": 0.6135, + "step": 1467 + }, + { + "epoch": 0.18012269938650308, + "grad_norm": 1.1011140800541541, + "learning_rate": 1.8841883367567827e-05, + "loss": 0.6399, + "step": 1468 + }, + { + "epoch": 0.18024539877300613, + "grad_norm": 0.9634129083589413, + "learning_rate": 1.8840026206343786e-05, + "loss": 0.662, + "step": 1469 + }, + { + "epoch": 0.18036809815950922, + "grad_norm": 0.9590291138157901, + "learning_rate": 1.8838167648913606e-05, + "loss": 0.632, + "step": 1470 + }, + { + "epoch": 0.18049079754601227, + "grad_norm": 1.2831026567058144, + "learning_rate": 1.883630769557083e-05, + "loss": 0.5811, + "step": 1471 + }, + { + "epoch": 0.18061349693251533, + "grad_norm": 1.1757817706734852, + "learning_rate": 1.8834446346609216e-05, + "loss": 0.6257, + "step": 1472 + }, + { + "epoch": 0.1807361963190184, + "grad_norm": 0.946440735696095, + "learning_rate": 1.8832583602322754e-05, + "loss": 0.5863, + "step": 1473 + }, + { + "epoch": 0.18085889570552147, + "grad_norm": 1.0235851918900456, + "learning_rate": 1.883071946300565e-05, + "loss": 0.6633, + "step": 1474 + }, + { + "epoch": 0.18098159509202455, + "grad_norm": 1.0061878344552746, + "learning_rate": 1.882885392895232e-05, + "loss": 0.5869, + "step": 1475 + }, + { + "epoch": 0.1811042944785276, + "grad_norm": 0.9905688044286721, + "learning_rate": 1.882698700045742e-05, + "loss": 0.5429, + "step": 1476 + }, + { + "epoch": 0.18122699386503066, + "grad_norm": 0.9250011207226774, + "learning_rate": 1.882511867781581e-05, + "loss": 0.5993, + "step": 1477 + }, + { + "epoch": 0.18134969325153374, + "grad_norm": 1.1488176687932208, + "learning_rate": 1.8823248961322572e-05, + "loss": 0.5948, + "step": 1478 + }, + { + "epoch": 0.1814723926380368, + "grad_norm": 0.9583145979661041, + "learning_rate": 1.882137785127302e-05, + "loss": 0.6419, + "step": 1479 + }, + { + "epoch": 0.18159509202453988, + "grad_norm": 0.9336388198353237, + "learning_rate": 1.881950534796267e-05, + "loss": 0.604, + "step": 1480 + }, + { + "epoch": 0.18171779141104294, + "grad_norm": 1.1990192432734557, + "learning_rate": 1.8817631451687277e-05, + "loss": 0.6889, + "step": 1481 + }, + { + "epoch": 0.18184049079754602, + "grad_norm": 1.051949626818864, + "learning_rate": 1.8815756162742805e-05, + "loss": 0.5843, + "step": 1482 + }, + { + "epoch": 0.18196319018404908, + "grad_norm": 1.101033756424987, + "learning_rate": 1.881387948142543e-05, + "loss": 0.6622, + "step": 1483 + }, + { + "epoch": 0.18208588957055213, + "grad_norm": 1.1372205653725238, + "learning_rate": 1.881200140803157e-05, + "loss": 0.6773, + "step": 1484 
+ }, + { + "epoch": 0.18220858895705522, + "grad_norm": 1.0710532369602361, + "learning_rate": 1.8810121942857848e-05, + "loss": 0.6349, + "step": 1485 + }, + { + "epoch": 0.18233128834355827, + "grad_norm": 1.0925330597613856, + "learning_rate": 1.8808241086201106e-05, + "loss": 0.6564, + "step": 1486 + }, + { + "epoch": 0.18245398773006136, + "grad_norm": 1.02041468094604, + "learning_rate": 1.8806358838358403e-05, + "loss": 0.653, + "step": 1487 + }, + { + "epoch": 0.1825766871165644, + "grad_norm": 1.0837286869839238, + "learning_rate": 1.880447519962704e-05, + "loss": 0.6834, + "step": 1488 + }, + { + "epoch": 0.1826993865030675, + "grad_norm": 1.0503527078226211, + "learning_rate": 1.8802590170304507e-05, + "loss": 0.6338, + "step": 1489 + }, + { + "epoch": 0.18282208588957055, + "grad_norm": 1.0693636131931863, + "learning_rate": 1.8800703750688536e-05, + "loss": 0.6544, + "step": 1490 + }, + { + "epoch": 0.1829447852760736, + "grad_norm": 1.096445194790518, + "learning_rate": 1.879881594107707e-05, + "loss": 0.6096, + "step": 1491 + }, + { + "epoch": 0.1830674846625767, + "grad_norm": 0.9735197356772355, + "learning_rate": 1.8796926741768267e-05, + "loss": 0.6536, + "step": 1492 + }, + { + "epoch": 0.18319018404907975, + "grad_norm": 1.0315847991774707, + "learning_rate": 1.8795036153060518e-05, + "loss": 0.6256, + "step": 1493 + }, + { + "epoch": 0.18331288343558283, + "grad_norm": 1.025272140074344, + "learning_rate": 1.879314417525242e-05, + "loss": 0.6394, + "step": 1494 + }, + { + "epoch": 0.18343558282208589, + "grad_norm": 1.116924223553374, + "learning_rate": 1.8791250808642792e-05, + "loss": 0.6383, + "step": 1495 + }, + { + "epoch": 0.18355828220858897, + "grad_norm": 0.8831469336479917, + "learning_rate": 1.878935605353068e-05, + "loss": 0.6839, + "step": 1496 + }, + { + "epoch": 0.18368098159509202, + "grad_norm": 0.9545046491397473, + "learning_rate": 1.8787459910215344e-05, + "loss": 0.6548, + "step": 1497 + }, + { + "epoch": 0.18380368098159508, + "grad_norm": 1.0818533736760019, + "learning_rate": 1.878556237899626e-05, + "loss": 0.6511, + "step": 1498 + }, + { + "epoch": 0.18392638036809816, + "grad_norm": 1.2364420569315087, + "learning_rate": 1.8783663460173135e-05, + "loss": 0.6514, + "step": 1499 + }, + { + "epoch": 0.18404907975460122, + "grad_norm": 1.3077021854412954, + "learning_rate": 1.8781763154045873e-05, + "loss": 0.7129, + "step": 1500 + }, + { + "epoch": 0.1841717791411043, + "grad_norm": 1.0524175194037801, + "learning_rate": 1.8779861460914624e-05, + "loss": 0.5887, + "step": 1501 + }, + { + "epoch": 0.18429447852760736, + "grad_norm": 0.9479509300083745, + "learning_rate": 1.8777958381079737e-05, + "loss": 0.5901, + "step": 1502 + }, + { + "epoch": 0.18441717791411044, + "grad_norm": 1.1933205069097008, + "learning_rate": 1.877605391484179e-05, + "loss": 0.6747, + "step": 1503 + }, + { + "epoch": 0.1845398773006135, + "grad_norm": 1.033089559317479, + "learning_rate": 1.8774148062501577e-05, + "loss": 0.6091, + "step": 1504 + }, + { + "epoch": 0.18466257668711655, + "grad_norm": 1.0168259710667547, + "learning_rate": 1.877224082436011e-05, + "loss": 0.647, + "step": 1505 + }, + { + "epoch": 0.18478527607361964, + "grad_norm": 1.255920814699413, + "learning_rate": 1.877033220071862e-05, + "loss": 0.6601, + "step": 1506 + }, + { + "epoch": 0.1849079754601227, + "grad_norm": 0.9332148784467393, + "learning_rate": 1.8768422191878558e-05, + "loss": 0.6463, + "step": 1507 + }, + { + "epoch": 0.18503067484662578, + "grad_norm": 1.074922286113234, + 
"learning_rate": 1.8766510798141597e-05, + "loss": 0.6922, + "step": 1508 + }, + { + "epoch": 0.18515337423312883, + "grad_norm": 1.0421297227665425, + "learning_rate": 1.8764598019809618e-05, + "loss": 0.6134, + "step": 1509 + }, + { + "epoch": 0.18527607361963191, + "grad_norm": 1.051681540468799, + "learning_rate": 1.8762683857184738e-05, + "loss": 0.6522, + "step": 1510 + }, + { + "epoch": 0.18539877300613497, + "grad_norm": 1.0354938536038987, + "learning_rate": 1.8760768310569273e-05, + "loss": 0.6382, + "step": 1511 + }, + { + "epoch": 0.18552147239263803, + "grad_norm": 1.0201586958762507, + "learning_rate": 1.8758851380265772e-05, + "loss": 0.5993, + "step": 1512 + }, + { + "epoch": 0.1856441717791411, + "grad_norm": 1.012989298245683, + "learning_rate": 1.8756933066576997e-05, + "loss": 0.694, + "step": 1513 + }, + { + "epoch": 0.18576687116564417, + "grad_norm": 1.0828529861620757, + "learning_rate": 1.8755013369805926e-05, + "loss": 0.6674, + "step": 1514 + }, + { + "epoch": 0.18588957055214725, + "grad_norm": 1.0604930428636383, + "learning_rate": 1.8753092290255765e-05, + "loss": 0.6887, + "step": 1515 + }, + { + "epoch": 0.1860122699386503, + "grad_norm": 0.9742372735913432, + "learning_rate": 1.8751169828229927e-05, + "loss": 0.626, + "step": 1516 + }, + { + "epoch": 0.1861349693251534, + "grad_norm": 1.1641226530493802, + "learning_rate": 1.8749245984032053e-05, + "loss": 0.6549, + "step": 1517 + }, + { + "epoch": 0.18625766871165644, + "grad_norm": 1.123148608854607, + "learning_rate": 1.874732075796599e-05, + "loss": 0.6386, + "step": 1518 + }, + { + "epoch": 0.1863803680981595, + "grad_norm": 1.0561539920851617, + "learning_rate": 1.8745394150335818e-05, + "loss": 0.6318, + "step": 1519 + }, + { + "epoch": 0.18650306748466258, + "grad_norm": 0.975282543988113, + "learning_rate": 1.8743466161445823e-05, + "loss": 0.6208, + "step": 1520 + }, + { + "epoch": 0.18662576687116564, + "grad_norm": 1.06958374669767, + "learning_rate": 1.8741536791600518e-05, + "loss": 0.6322, + "step": 1521 + }, + { + "epoch": 0.18674846625766872, + "grad_norm": 1.0767307024222237, + "learning_rate": 1.873960604110463e-05, + "loss": 0.5957, + "step": 1522 + }, + { + "epoch": 0.18687116564417178, + "grad_norm": 1.0260159578737276, + "learning_rate": 1.87376739102631e-05, + "loss": 0.6126, + "step": 1523 + }, + { + "epoch": 0.18699386503067483, + "grad_norm": 1.1132480394666775, + "learning_rate": 1.87357403993811e-05, + "loss": 0.7015, + "step": 1524 + }, + { + "epoch": 0.18711656441717792, + "grad_norm": 1.000821841868233, + "learning_rate": 1.8733805508764e-05, + "loss": 0.6244, + "step": 1525 + }, + { + "epoch": 0.18723926380368097, + "grad_norm": 1.1559397573221473, + "learning_rate": 1.8731869238717413e-05, + "loss": 0.7262, + "step": 1526 + }, + { + "epoch": 0.18736196319018406, + "grad_norm": 1.126391912163197, + "learning_rate": 1.8729931589547145e-05, + "loss": 0.6685, + "step": 1527 + }, + { + "epoch": 0.1874846625766871, + "grad_norm": 1.0523368409008784, + "learning_rate": 1.8727992561559235e-05, + "loss": 0.6259, + "step": 1528 + }, + { + "epoch": 0.1876073619631902, + "grad_norm": 0.9537954932389616, + "learning_rate": 1.8726052155059937e-05, + "loss": 0.6224, + "step": 1529 + }, + { + "epoch": 0.18773006134969325, + "grad_norm": 1.0139922648163335, + "learning_rate": 1.872411037035572e-05, + "loss": 0.6289, + "step": 1530 + }, + { + "epoch": 0.1878527607361963, + "grad_norm": 0.9831976837512058, + "learning_rate": 1.872216720775327e-05, + "loss": 0.6358, + "step": 1531 + }, + { + 
"epoch": 0.1879754601226994, + "grad_norm": 1.1939994313494722, + "learning_rate": 1.8720222667559498e-05, + "loss": 0.6083, + "step": 1532 + }, + { + "epoch": 0.18809815950920244, + "grad_norm": 1.0010314765023216, + "learning_rate": 1.8718276750081523e-05, + "loss": 0.67, + "step": 1533 + }, + { + "epoch": 0.18822085889570553, + "grad_norm": 1.0167220164736959, + "learning_rate": 1.871632945562669e-05, + "loss": 0.6784, + "step": 1534 + }, + { + "epoch": 0.18834355828220858, + "grad_norm": 1.0005529027450908, + "learning_rate": 1.8714380784502553e-05, + "loss": 0.6263, + "step": 1535 + }, + { + "epoch": 0.18846625766871167, + "grad_norm": 1.0461865449904615, + "learning_rate": 1.8712430737016892e-05, + "loss": 0.664, + "step": 1536 + }, + { + "epoch": 0.18858895705521472, + "grad_norm": 1.0732624103788075, + "learning_rate": 1.8710479313477697e-05, + "loss": 0.6836, + "step": 1537 + }, + { + "epoch": 0.18871165644171778, + "grad_norm": 1.143038474768521, + "learning_rate": 1.870852651419318e-05, + "loss": 0.7036, + "step": 1538 + }, + { + "epoch": 0.18883435582822086, + "grad_norm": 1.0835140337401377, + "learning_rate": 1.870657233947177e-05, + "loss": 0.6727, + "step": 1539 + }, + { + "epoch": 0.18895705521472392, + "grad_norm": 1.038496613336541, + "learning_rate": 1.870461678962211e-05, + "loss": 0.652, + "step": 1540 + }, + { + "epoch": 0.189079754601227, + "grad_norm": 1.0725218972654542, + "learning_rate": 1.870265986495306e-05, + "loss": 0.6427, + "step": 1541 + }, + { + "epoch": 0.18920245398773006, + "grad_norm": 0.990500713997063, + "learning_rate": 1.8700701565773704e-05, + "loss": 0.6341, + "step": 1542 + }, + { + "epoch": 0.18932515337423314, + "grad_norm": 0.9105144570666309, + "learning_rate": 1.8698741892393333e-05, + "loss": 0.5742, + "step": 1543 + }, + { + "epoch": 0.1894478527607362, + "grad_norm": 1.1103425983888464, + "learning_rate": 1.869678084512147e-05, + "loss": 0.6535, + "step": 1544 + }, + { + "epoch": 0.18957055214723925, + "grad_norm": 1.1396476012195433, + "learning_rate": 1.869481842426784e-05, + "loss": 0.6404, + "step": 1545 + }, + { + "epoch": 0.18969325153374234, + "grad_norm": 1.0439717091784455, + "learning_rate": 1.869285463014239e-05, + "loss": 0.6781, + "step": 1546 + }, + { + "epoch": 0.1898159509202454, + "grad_norm": 1.1301416225469323, + "learning_rate": 1.8690889463055285e-05, + "loss": 0.6058, + "step": 1547 + }, + { + "epoch": 0.18993865030674847, + "grad_norm": 0.9972889564856924, + "learning_rate": 1.8688922923316904e-05, + "loss": 0.6141, + "step": 1548 + }, + { + "epoch": 0.19006134969325153, + "grad_norm": 1.0023836495776979, + "learning_rate": 1.868695501123785e-05, + "loss": 0.6812, + "step": 1549 + }, + { + "epoch": 0.1901840490797546, + "grad_norm": 1.0834435870539325, + "learning_rate": 1.8684985727128936e-05, + "loss": 0.6392, + "step": 1550 + }, + { + "epoch": 0.19030674846625767, + "grad_norm": 1.0382335838054444, + "learning_rate": 1.868301507130119e-05, + "loss": 0.68, + "step": 1551 + }, + { + "epoch": 0.19042944785276072, + "grad_norm": 0.9893799573723961, + "learning_rate": 1.868104304406586e-05, + "loss": 0.6756, + "step": 1552 + }, + { + "epoch": 0.1905521472392638, + "grad_norm": 0.8815232088941037, + "learning_rate": 1.867906964573442e-05, + "loss": 0.589, + "step": 1553 + }, + { + "epoch": 0.19067484662576686, + "grad_norm": 1.038565231861038, + "learning_rate": 1.867709487661854e-05, + "loss": 0.5937, + "step": 1554 + }, + { + "epoch": 0.19079754601226995, + "grad_norm": 0.9952982483314674, + "learning_rate": 
1.8675118737030123e-05, + "loss": 0.6709, + "step": 1555 + }, + { + "epoch": 0.190920245398773, + "grad_norm": 1.0912191517220797, + "learning_rate": 1.8673141227281282e-05, + "loss": 0.6449, + "step": 1556 + }, + { + "epoch": 0.1910429447852761, + "grad_norm": 0.9760208263285446, + "learning_rate": 1.867116234768435e-05, + "loss": 0.6086, + "step": 1557 + }, + { + "epoch": 0.19116564417177914, + "grad_norm": 0.9574168290246773, + "learning_rate": 1.866918209855187e-05, + "loss": 0.6278, + "step": 1558 + }, + { + "epoch": 0.1912883435582822, + "grad_norm": 0.8874245462734494, + "learning_rate": 1.866720048019661e-05, + "loss": 0.6088, + "step": 1559 + }, + { + "epoch": 0.19141104294478528, + "grad_norm": 1.0671011972267872, + "learning_rate": 1.866521749293155e-05, + "loss": 0.6025, + "step": 1560 + }, + { + "epoch": 0.19153374233128834, + "grad_norm": 1.0577994887786972, + "learning_rate": 1.866323313706988e-05, + "loss": 0.6423, + "step": 1561 + }, + { + "epoch": 0.19165644171779142, + "grad_norm": 1.3580153374380615, + "learning_rate": 1.866124741292502e-05, + "loss": 0.7148, + "step": 1562 + }, + { + "epoch": 0.19177914110429448, + "grad_norm": 1.106575548650998, + "learning_rate": 1.8659260320810588e-05, + "loss": 0.6001, + "step": 1563 + }, + { + "epoch": 0.19190184049079756, + "grad_norm": 0.9413217508520478, + "learning_rate": 1.8657271861040432e-05, + "loss": 0.6818, + "step": 1564 + }, + { + "epoch": 0.19202453987730062, + "grad_norm": 1.1433235719890205, + "learning_rate": 1.8655282033928618e-05, + "loss": 0.6414, + "step": 1565 + }, + { + "epoch": 0.19214723926380367, + "grad_norm": 1.0126796627795682, + "learning_rate": 1.8653290839789412e-05, + "loss": 0.6176, + "step": 1566 + }, + { + "epoch": 0.19226993865030675, + "grad_norm": 1.0550576820815691, + "learning_rate": 1.8651298278937318e-05, + "loss": 0.6275, + "step": 1567 + }, + { + "epoch": 0.1923926380368098, + "grad_norm": 1.0273170209358196, + "learning_rate": 1.864930435168703e-05, + "loss": 0.6233, + "step": 1568 + }, + { + "epoch": 0.1925153374233129, + "grad_norm": 0.8837614151587271, + "learning_rate": 1.8647309058353487e-05, + "loss": 0.6068, + "step": 1569 + }, + { + "epoch": 0.19263803680981595, + "grad_norm": 1.1428572368344616, + "learning_rate": 1.8645312399251818e-05, + "loss": 0.6119, + "step": 1570 + }, + { + "epoch": 0.192760736196319, + "grad_norm": 1.007600950913667, + "learning_rate": 1.8643314374697377e-05, + "loss": 0.6404, + "step": 1571 + }, + { + "epoch": 0.1928834355828221, + "grad_norm": 0.9522386128451171, + "learning_rate": 1.864131498500574e-05, + "loss": 0.6563, + "step": 1572 + }, + { + "epoch": 0.19300613496932514, + "grad_norm": 1.1140246227396804, + "learning_rate": 1.8639314230492693e-05, + "loss": 0.6457, + "step": 1573 + }, + { + "epoch": 0.19312883435582823, + "grad_norm": 1.170890895955532, + "learning_rate": 1.8637312111474237e-05, + "loss": 0.7065, + "step": 1574 + }, + { + "epoch": 0.19325153374233128, + "grad_norm": 1.070068310258234, + "learning_rate": 1.8635308628266586e-05, + "loss": 0.6969, + "step": 1575 + }, + { + "epoch": 0.19337423312883437, + "grad_norm": 1.0770045388436975, + "learning_rate": 1.863330378118618e-05, + "loss": 0.6425, + "step": 1576 + }, + { + "epoch": 0.19349693251533742, + "grad_norm": 1.0072408492453058, + "learning_rate": 1.863129757054966e-05, + "loss": 0.6216, + "step": 1577 + }, + { + "epoch": 0.19361963190184048, + "grad_norm": 1.0116778343365282, + "learning_rate": 1.86292899966739e-05, + "loss": 0.5951, + "step": 1578 + }, + { + "epoch": 
0.19374233128834356, + "grad_norm": 1.194948688224886, + "learning_rate": 1.8627281059875965e-05, + "loss": 0.5881, + "step": 1579 + }, + { + "epoch": 0.19386503067484662, + "grad_norm": 1.1933823935009829, + "learning_rate": 1.8625270760473164e-05, + "loss": 0.6896, + "step": 1580 + }, + { + "epoch": 0.1939877300613497, + "grad_norm": 1.1060937556640673, + "learning_rate": 1.8623259098782994e-05, + "loss": 0.6452, + "step": 1581 + }, + { + "epoch": 0.19411042944785276, + "grad_norm": 0.9162247282742254, + "learning_rate": 1.8621246075123182e-05, + "loss": 0.6605, + "step": 1582 + }, + { + "epoch": 0.19423312883435584, + "grad_norm": 0.9752092437657589, + "learning_rate": 1.8619231689811673e-05, + "loss": 0.6186, + "step": 1583 + }, + { + "epoch": 0.1943558282208589, + "grad_norm": 0.9876628604464341, + "learning_rate": 1.8617215943166623e-05, + "loss": 0.607, + "step": 1584 + }, + { + "epoch": 0.19447852760736195, + "grad_norm": 1.0518972048776674, + "learning_rate": 1.8615198835506393e-05, + "loss": 0.6651, + "step": 1585 + }, + { + "epoch": 0.19460122699386503, + "grad_norm": 1.032640941264659, + "learning_rate": 1.8613180367149577e-05, + "loss": 0.6124, + "step": 1586 + }, + { + "epoch": 0.1947239263803681, + "grad_norm": 1.1402369373868273, + "learning_rate": 1.861116053841497e-05, + "loss": 0.6842, + "step": 1587 + }, + { + "epoch": 0.19484662576687117, + "grad_norm": 1.0297246551767312, + "learning_rate": 1.8609139349621588e-05, + "loss": 0.6659, + "step": 1588 + }, + { + "epoch": 0.19496932515337423, + "grad_norm": 0.9878711786634051, + "learning_rate": 1.8607116801088658e-05, + "loss": 0.6358, + "step": 1589 + }, + { + "epoch": 0.1950920245398773, + "grad_norm": 0.9633681068610048, + "learning_rate": 1.8605092893135626e-05, + "loss": 0.5943, + "step": 1590 + }, + { + "epoch": 0.19521472392638037, + "grad_norm": 1.3768322934854542, + "learning_rate": 1.8603067626082154e-05, + "loss": 0.6424, + "step": 1591 + }, + { + "epoch": 0.19533742331288342, + "grad_norm": 0.9642252298958731, + "learning_rate": 1.860104100024811e-05, + "loss": 0.6178, + "step": 1592 + }, + { + "epoch": 0.1954601226993865, + "grad_norm": 1.1342296661415683, + "learning_rate": 1.8599013015953586e-05, + "loss": 0.6763, + "step": 1593 + }, + { + "epoch": 0.19558282208588956, + "grad_norm": 1.1212847953928997, + "learning_rate": 1.8596983673518877e-05, + "loss": 0.6898, + "step": 1594 + }, + { + "epoch": 0.19570552147239265, + "grad_norm": 1.1117785246349632, + "learning_rate": 1.8594952973264512e-05, + "loss": 0.7103, + "step": 1595 + }, + { + "epoch": 0.1958282208588957, + "grad_norm": 0.9429144956056362, + "learning_rate": 1.859292091551122e-05, + "loss": 0.6174, + "step": 1596 + }, + { + "epoch": 0.19595092024539879, + "grad_norm": 1.323074957916979, + "learning_rate": 1.8590887500579934e-05, + "loss": 0.6316, + "step": 1597 + }, + { + "epoch": 0.19607361963190184, + "grad_norm": 0.8884965521705429, + "learning_rate": 1.8588852728791833e-05, + "loss": 0.6448, + "step": 1598 + }, + { + "epoch": 0.1961963190184049, + "grad_norm": 0.8622622682688045, + "learning_rate": 1.8586816600468277e-05, + "loss": 0.6382, + "step": 1599 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 1.0428209850735164, + "learning_rate": 1.8584779115930866e-05, + "loss": 0.6464, + "step": 1600 + }, + { + "epoch": 0.19644171779141104, + "grad_norm": 0.959979768794765, + "learning_rate": 1.85827402755014e-05, + "loss": 0.6187, + "step": 1601 + }, + { + "epoch": 0.19656441717791412, + "grad_norm": 1.1152792909376648, + "learning_rate": 
1.858070007950189e-05, + "loss": 0.6666, + "step": 1602 + }, + { + "epoch": 0.19668711656441717, + "grad_norm": 1.141371264497128, + "learning_rate": 1.8578658528254575e-05, + "loss": 0.6789, + "step": 1603 + }, + { + "epoch": 0.19680981595092026, + "grad_norm": 1.014510525596514, + "learning_rate": 1.8576615622081893e-05, + "loss": 0.6287, + "step": 1604 + }, + { + "epoch": 0.19693251533742331, + "grad_norm": 1.0192784193258035, + "learning_rate": 1.857457136130651e-05, + "loss": 0.6197, + "step": 1605 + }, + { + "epoch": 0.19705521472392637, + "grad_norm": 1.0125661798487298, + "learning_rate": 1.85725257462513e-05, + "loss": 0.5851, + "step": 1606 + }, + { + "epoch": 0.19717791411042945, + "grad_norm": 0.9667185229650244, + "learning_rate": 1.8570478777239348e-05, + "loss": 0.615, + "step": 1607 + }, + { + "epoch": 0.1973006134969325, + "grad_norm": 1.0208453207374797, + "learning_rate": 1.8568430454593952e-05, + "loss": 0.6568, + "step": 1608 + }, + { + "epoch": 0.1974233128834356, + "grad_norm": 0.9691444990696324, + "learning_rate": 1.856638077863863e-05, + "loss": 0.6197, + "step": 1609 + }, + { + "epoch": 0.19754601226993865, + "grad_norm": 0.8592938057767897, + "learning_rate": 1.856432974969711e-05, + "loss": 0.6483, + "step": 1610 + }, + { + "epoch": 0.19766871165644173, + "grad_norm": 1.121423757204261, + "learning_rate": 1.8562277368093336e-05, + "loss": 0.6238, + "step": 1611 + }, + { + "epoch": 0.1977914110429448, + "grad_norm": 0.8823183883996647, + "learning_rate": 1.8560223634151465e-05, + "loss": 0.629, + "step": 1612 + }, + { + "epoch": 0.19791411042944784, + "grad_norm": 1.2626720035180448, + "learning_rate": 1.8558168548195858e-05, + "loss": 0.6589, + "step": 1613 + }, + { + "epoch": 0.19803680981595093, + "grad_norm": 1.0555255418587943, + "learning_rate": 1.855611211055111e-05, + "loss": 0.6365, + "step": 1614 + }, + { + "epoch": 0.19815950920245398, + "grad_norm": 0.9012459228612415, + "learning_rate": 1.855405432154201e-05, + "loss": 0.6271, + "step": 1615 + }, + { + "epoch": 0.19828220858895707, + "grad_norm": 1.2597206813766175, + "learning_rate": 1.8551995181493574e-05, + "loss": 0.6505, + "step": 1616 + }, + { + "epoch": 0.19840490797546012, + "grad_norm": 0.966632603238906, + "learning_rate": 1.854993469073102e-05, + "loss": 0.653, + "step": 1617 + }, + { + "epoch": 0.19852760736196318, + "grad_norm": 0.9418151921685511, + "learning_rate": 1.8547872849579788e-05, + "loss": 0.6014, + "step": 1618 + }, + { + "epoch": 0.19865030674846626, + "grad_norm": 1.0164425691610122, + "learning_rate": 1.8545809658365522e-05, + "loss": 0.5942, + "step": 1619 + }, + { + "epoch": 0.19877300613496932, + "grad_norm": 1.0489796266860762, + "learning_rate": 1.8543745117414094e-05, + "loss": 0.6383, + "step": 1620 + }, + { + "epoch": 0.1988957055214724, + "grad_norm": 0.9768111893921024, + "learning_rate": 1.8541679227051578e-05, + "loss": 0.6364, + "step": 1621 + }, + { + "epoch": 0.19901840490797545, + "grad_norm": 1.049533064177869, + "learning_rate": 1.853961198760426e-05, + "loss": 0.5873, + "step": 1622 + }, + { + "epoch": 0.19914110429447854, + "grad_norm": 1.1596543904287824, + "learning_rate": 1.8537543399398645e-05, + "loss": 0.6955, + "step": 1623 + }, + { + "epoch": 0.1992638036809816, + "grad_norm": 1.0705273235961346, + "learning_rate": 1.8535473462761447e-05, + "loss": 0.623, + "step": 1624 + }, + { + "epoch": 0.19938650306748465, + "grad_norm": 1.0041363374541443, + "learning_rate": 1.8533402178019596e-05, + "loss": 0.7092, + "step": 1625 + }, + { + "epoch": 
0.19950920245398773, + "grad_norm": 0.9077418022397252, + "learning_rate": 1.8531329545500235e-05, + "loss": 0.5696, + "step": 1626 + }, + { + "epoch": 0.1996319018404908, + "grad_norm": 0.9470417274042309, + "learning_rate": 1.8529255565530718e-05, + "loss": 0.6396, + "step": 1627 + }, + { + "epoch": 0.19975460122699387, + "grad_norm": 1.0596124008540808, + "learning_rate": 1.8527180238438612e-05, + "loss": 0.6346, + "step": 1628 + }, + { + "epoch": 0.19987730061349693, + "grad_norm": 1.0171100731613283, + "learning_rate": 1.8525103564551696e-05, + "loss": 0.6913, + "step": 1629 + }, + { + "epoch": 0.2, + "grad_norm": 0.9479994911176929, + "learning_rate": 1.8523025544197964e-05, + "loss": 0.5985, + "step": 1630 + }, + { + "epoch": 0.20012269938650307, + "grad_norm": 1.6038258454325482, + "learning_rate": 1.8520946177705622e-05, + "loss": 0.6311, + "step": 1631 + }, + { + "epoch": 0.20024539877300612, + "grad_norm": 0.8723746725294234, + "learning_rate": 1.8518865465403087e-05, + "loss": 0.6403, + "step": 1632 + }, + { + "epoch": 0.2003680981595092, + "grad_norm": 1.0812797875002143, + "learning_rate": 1.851678340761899e-05, + "loss": 0.6475, + "step": 1633 + }, + { + "epoch": 0.20049079754601226, + "grad_norm": 0.9520039217339146, + "learning_rate": 1.8514700004682172e-05, + "loss": 0.5704, + "step": 1634 + }, + { + "epoch": 0.20061349693251534, + "grad_norm": 1.0038770532126633, + "learning_rate": 1.8512615256921692e-05, + "loss": 0.6947, + "step": 1635 + }, + { + "epoch": 0.2007361963190184, + "grad_norm": 0.9103231214949049, + "learning_rate": 1.8510529164666814e-05, + "loss": 0.6638, + "step": 1636 + }, + { + "epoch": 0.20085889570552148, + "grad_norm": 0.9691241168660084, + "learning_rate": 1.850844172824702e-05, + "loss": 0.6347, + "step": 1637 + }, + { + "epoch": 0.20098159509202454, + "grad_norm": 0.996482051379336, + "learning_rate": 1.850635294799201e-05, + "loss": 0.6494, + "step": 1638 + }, + { + "epoch": 0.2011042944785276, + "grad_norm": 1.3100350961187295, + "learning_rate": 1.8504262824231675e-05, + "loss": 0.7243, + "step": 1639 + }, + { + "epoch": 0.20122699386503068, + "grad_norm": 1.1091039821491355, + "learning_rate": 1.8502171357296144e-05, + "loss": 0.6852, + "step": 1640 + }, + { + "epoch": 0.20134969325153373, + "grad_norm": 1.0132722009134834, + "learning_rate": 1.850007854751574e-05, + "loss": 0.5877, + "step": 1641 + }, + { + "epoch": 0.20147239263803682, + "grad_norm": 1.2533451400945086, + "learning_rate": 1.8497984395221004e-05, + "loss": 0.6804, + "step": 1642 + }, + { + "epoch": 0.20159509202453987, + "grad_norm": 0.8946805730294942, + "learning_rate": 1.8495888900742694e-05, + "loss": 0.6426, + "step": 1643 + }, + { + "epoch": 0.20171779141104296, + "grad_norm": 1.0963764735450094, + "learning_rate": 1.849379206441177e-05, + "loss": 0.6798, + "step": 1644 + }, + { + "epoch": 0.201840490797546, + "grad_norm": 1.025716562514288, + "learning_rate": 1.8491693886559413e-05, + "loss": 0.6884, + "step": 1645 + }, + { + "epoch": 0.20196319018404907, + "grad_norm": 1.029781780109026, + "learning_rate": 1.8489594367517015e-05, + "loss": 0.6506, + "step": 1646 + }, + { + "epoch": 0.20208588957055215, + "grad_norm": 1.0907127396442622, + "learning_rate": 1.8487493507616166e-05, + "loss": 0.6549, + "step": 1647 + }, + { + "epoch": 0.2022085889570552, + "grad_norm": 1.2628242000527075, + "learning_rate": 1.848539130718869e-05, + "loss": 0.6715, + "step": 1648 + }, + { + "epoch": 0.2023312883435583, + "grad_norm": 0.9848106039751562, + "learning_rate": 
1.8483287766566607e-05, + "loss": 0.6065, + "step": 1649 + }, + { + "epoch": 0.20245398773006135, + "grad_norm": 1.0768677840362837, + "learning_rate": 1.848118288608215e-05, + "loss": 0.6, + "step": 1650 + }, + { + "epoch": 0.20257668711656443, + "grad_norm": 1.026394048715683, + "learning_rate": 1.847907666606778e-05, + "loss": 0.6735, + "step": 1651 + }, + { + "epoch": 0.20269938650306749, + "grad_norm": 1.0397477506052872, + "learning_rate": 1.847696910685613e-05, + "loss": 0.6052, + "step": 1652 + }, + { + "epoch": 0.20282208588957054, + "grad_norm": 0.8556275071003198, + "learning_rate": 1.84748602087801e-05, + "loss": 0.6058, + "step": 1653 + }, + { + "epoch": 0.20294478527607362, + "grad_norm": 0.9514304827946805, + "learning_rate": 1.8472749972172756e-05, + "loss": 0.5957, + "step": 1654 + }, + { + "epoch": 0.20306748466257668, + "grad_norm": 1.0344916504143742, + "learning_rate": 1.8470638397367397e-05, + "loss": 0.6461, + "step": 1655 + }, + { + "epoch": 0.20319018404907976, + "grad_norm": 0.9916337330947441, + "learning_rate": 1.8468525484697527e-05, + "loss": 0.6055, + "step": 1656 + }, + { + "epoch": 0.20331288343558282, + "grad_norm": 0.9039380629777853, + "learning_rate": 1.8466411234496857e-05, + "loss": 0.6242, + "step": 1657 + }, + { + "epoch": 0.2034355828220859, + "grad_norm": 1.0396190841546122, + "learning_rate": 1.8464295647099323e-05, + "loss": 0.6531, + "step": 1658 + }, + { + "epoch": 0.20355828220858896, + "grad_norm": 1.0626338880366404, + "learning_rate": 1.8462178722839063e-05, + "loss": 0.6567, + "step": 1659 + }, + { + "epoch": 0.20368098159509201, + "grad_norm": 1.0442575671978351, + "learning_rate": 1.846006046205042e-05, + "loss": 0.6911, + "step": 1660 + }, + { + "epoch": 0.2038036809815951, + "grad_norm": 0.8246755956432948, + "learning_rate": 1.845794086506796e-05, + "loss": 0.5982, + "step": 1661 + }, + { + "epoch": 0.20392638036809815, + "grad_norm": 1.0040800701224115, + "learning_rate": 1.8455819932226456e-05, + "loss": 0.6206, + "step": 1662 + }, + { + "epoch": 0.20404907975460124, + "grad_norm": 1.0577108317978672, + "learning_rate": 1.8453697663860888e-05, + "loss": 0.6835, + "step": 1663 + }, + { + "epoch": 0.2041717791411043, + "grad_norm": 1.0876913188051918, + "learning_rate": 1.8451574060306452e-05, + "loss": 0.5874, + "step": 1664 + }, + { + "epoch": 0.20429447852760735, + "grad_norm": 0.9725806105384617, + "learning_rate": 1.8449449121898552e-05, + "loss": 0.6697, + "step": 1665 + }, + { + "epoch": 0.20441717791411043, + "grad_norm": 1.1270270486270033, + "learning_rate": 1.8447322848972807e-05, + "loss": 0.6283, + "step": 1666 + }, + { + "epoch": 0.2045398773006135, + "grad_norm": 1.1181722748489196, + "learning_rate": 1.844519524186504e-05, + "loss": 0.6684, + "step": 1667 + }, + { + "epoch": 0.20466257668711657, + "grad_norm": 0.97264217463598, + "learning_rate": 1.8443066300911287e-05, + "loss": 0.6239, + "step": 1668 + }, + { + "epoch": 0.20478527607361963, + "grad_norm": 1.075325803607145, + "learning_rate": 1.84409360264478e-05, + "loss": 0.7057, + "step": 1669 + }, + { + "epoch": 0.2049079754601227, + "grad_norm": 0.9598054874797466, + "learning_rate": 1.8438804418811038e-05, + "loss": 0.625, + "step": 1670 + }, + { + "epoch": 0.20503067484662577, + "grad_norm": 1.1839758294507126, + "learning_rate": 1.8436671478337666e-05, + "loss": 0.6523, + "step": 1671 + }, + { + "epoch": 0.20515337423312882, + "grad_norm": 0.9166150926843205, + "learning_rate": 1.8434537205364563e-05, + "loss": 0.602, + "step": 1672 + }, + { + "epoch": 
0.2052760736196319, + "grad_norm": 0.9017490898619649, + "learning_rate": 1.8432401600228823e-05, + "loss": 0.6049, + "step": 1673 + }, + { + "epoch": 0.20539877300613496, + "grad_norm": 0.9469456855906945, + "learning_rate": 1.8430264663267744e-05, + "loss": 0.6418, + "step": 1674 + }, + { + "epoch": 0.20552147239263804, + "grad_norm": 1.0421687186197122, + "learning_rate": 1.842812639481884e-05, + "loss": 0.5874, + "step": 1675 + }, + { + "epoch": 0.2056441717791411, + "grad_norm": 1.0665288925304952, + "learning_rate": 1.8425986795219828e-05, + "loss": 0.658, + "step": 1676 + }, + { + "epoch": 0.20576687116564418, + "grad_norm": 1.0099062450601108, + "learning_rate": 1.842384586480864e-05, + "loss": 0.6263, + "step": 1677 + }, + { + "epoch": 0.20588957055214724, + "grad_norm": 0.8727364560543106, + "learning_rate": 1.842170360392342e-05, + "loss": 0.5897, + "step": 1678 + }, + { + "epoch": 0.2060122699386503, + "grad_norm": 1.1781820986793357, + "learning_rate": 1.841956001290252e-05, + "loss": 0.6899, + "step": 1679 + }, + { + "epoch": 0.20613496932515338, + "grad_norm": 0.9635279535070304, + "learning_rate": 1.84174150920845e-05, + "loss": 0.5958, + "step": 1680 + }, + { + "epoch": 0.20625766871165643, + "grad_norm": 1.2844714950520808, + "learning_rate": 1.8415268841808132e-05, + "loss": 0.5528, + "step": 1681 + }, + { + "epoch": 0.20638036809815952, + "grad_norm": 0.9745368992033845, + "learning_rate": 1.8413121262412395e-05, + "loss": 0.6634, + "step": 1682 + }, + { + "epoch": 0.20650306748466257, + "grad_norm": 1.1759123318578155, + "learning_rate": 1.8410972354236484e-05, + "loss": 0.5863, + "step": 1683 + }, + { + "epoch": 0.20662576687116566, + "grad_norm": 1.1908803031516144, + "learning_rate": 1.8408822117619804e-05, + "loss": 0.633, + "step": 1684 + }, + { + "epoch": 0.2067484662576687, + "grad_norm": 1.0651492110791232, + "learning_rate": 1.8406670552901958e-05, + "loss": 0.6587, + "step": 1685 + }, + { + "epoch": 0.20687116564417177, + "grad_norm": 1.1501753746990018, + "learning_rate": 1.8404517660422773e-05, + "loss": 0.7071, + "step": 1686 + }, + { + "epoch": 0.20699386503067485, + "grad_norm": 0.9082554854387923, + "learning_rate": 1.8402363440522277e-05, + "loss": 0.6513, + "step": 1687 + }, + { + "epoch": 0.2071165644171779, + "grad_norm": 1.0070366131647424, + "learning_rate": 1.8400207893540714e-05, + "loss": 0.5878, + "step": 1688 + }, + { + "epoch": 0.207239263803681, + "grad_norm": 0.9669145383875095, + "learning_rate": 1.839805101981853e-05, + "loss": 0.6109, + "step": 1689 + }, + { + "epoch": 0.20736196319018405, + "grad_norm": 1.101850088370926, + "learning_rate": 1.839589281969639e-05, + "loss": 0.614, + "step": 1690 + }, + { + "epoch": 0.20748466257668713, + "grad_norm": 1.0425945187519257, + "learning_rate": 1.8393733293515158e-05, + "loss": 0.6639, + "step": 1691 + }, + { + "epoch": 0.20760736196319018, + "grad_norm": 0.9767058319161392, + "learning_rate": 1.8391572441615916e-05, + "loss": 0.5873, + "step": 1692 + }, + { + "epoch": 0.20773006134969324, + "grad_norm": 1.0349083616419232, + "learning_rate": 1.838941026433995e-05, + "loss": 0.6497, + "step": 1693 + }, + { + "epoch": 0.20785276073619632, + "grad_norm": 0.9559522361387264, + "learning_rate": 1.8387246762028758e-05, + "loss": 0.6708, + "step": 1694 + }, + { + "epoch": 0.20797546012269938, + "grad_norm": 1.0787657612391477, + "learning_rate": 1.8385081935024044e-05, + "loss": 0.635, + "step": 1695 + }, + { + "epoch": 0.20809815950920246, + "grad_norm": 1.038917105686666, + "learning_rate": 
1.8382915783667728e-05, + "loss": 0.6307, + "step": 1696 + }, + { + "epoch": 0.20822085889570552, + "grad_norm": 1.2416864147552686, + "learning_rate": 1.838074830830193e-05, + "loss": 0.6861, + "step": 1697 + }, + { + "epoch": 0.2083435582822086, + "grad_norm": 0.980526475837455, + "learning_rate": 1.837857950926899e-05, + "loss": 0.619, + "step": 1698 + }, + { + "epoch": 0.20846625766871166, + "grad_norm": 1.1285070448772323, + "learning_rate": 1.837640938691145e-05, + "loss": 0.6469, + "step": 1699 + }, + { + "epoch": 0.2085889570552147, + "grad_norm": 1.2799020334800728, + "learning_rate": 1.837423794157206e-05, + "loss": 0.6588, + "step": 1700 + }, + { + "epoch": 0.2087116564417178, + "grad_norm": 0.9103052035835869, + "learning_rate": 1.837206517359378e-05, + "loss": 0.6889, + "step": 1701 + }, + { + "epoch": 0.20883435582822085, + "grad_norm": 1.0072155425924163, + "learning_rate": 1.836989108331978e-05, + "loss": 0.6541, + "step": 1702 + }, + { + "epoch": 0.20895705521472394, + "grad_norm": 1.0268437780328943, + "learning_rate": 1.836771567109344e-05, + "loss": 0.701, + "step": 1703 + }, + { + "epoch": 0.209079754601227, + "grad_norm": 0.9501173805909887, + "learning_rate": 1.8365538937258354e-05, + "loss": 0.6483, + "step": 1704 + }, + { + "epoch": 0.20920245398773007, + "grad_norm": 1.0268655422868147, + "learning_rate": 1.836336088215831e-05, + "loss": 0.6534, + "step": 1705 + }, + { + "epoch": 0.20932515337423313, + "grad_norm": 0.9268886890149227, + "learning_rate": 1.836118150613732e-05, + "loss": 0.629, + "step": 1706 + }, + { + "epoch": 0.2094478527607362, + "grad_norm": 1.1547088450157856, + "learning_rate": 1.8359000809539584e-05, + "loss": 0.6557, + "step": 1707 + }, + { + "epoch": 0.20957055214723927, + "grad_norm": 0.9773388850529429, + "learning_rate": 1.8356818792709542e-05, + "loss": 0.6287, + "step": 1708 + }, + { + "epoch": 0.20969325153374233, + "grad_norm": 1.086019607791305, + "learning_rate": 1.835463545599181e-05, + "loss": 0.6741, + "step": 1709 + }, + { + "epoch": 0.2098159509202454, + "grad_norm": 1.2046550735197812, + "learning_rate": 1.835245079973124e-05, + "loss": 0.6464, + "step": 1710 + }, + { + "epoch": 0.20993865030674846, + "grad_norm": 1.037387720308135, + "learning_rate": 1.8350264824272868e-05, + "loss": 0.6138, + "step": 1711 + }, + { + "epoch": 0.21006134969325152, + "grad_norm": 1.0046929325090392, + "learning_rate": 1.8348077529961957e-05, + "loss": 0.6675, + "step": 1712 + }, + { + "epoch": 0.2101840490797546, + "grad_norm": 0.9134827735937282, + "learning_rate": 1.8345888917143972e-05, + "loss": 0.6028, + "step": 1713 + }, + { + "epoch": 0.21030674846625766, + "grad_norm": 0.920307024925667, + "learning_rate": 1.8343698986164583e-05, + "loss": 0.5901, + "step": 1714 + }, + { + "epoch": 0.21042944785276074, + "grad_norm": 0.975168441048423, + "learning_rate": 1.834150773736967e-05, + "loss": 0.6449, + "step": 1715 + }, + { + "epoch": 0.2105521472392638, + "grad_norm": 1.0976360302551424, + "learning_rate": 1.8339315171105324e-05, + "loss": 0.6232, + "step": 1716 + }, + { + "epoch": 0.21067484662576688, + "grad_norm": 1.0430483312449077, + "learning_rate": 1.833712128771784e-05, + "loss": 0.6381, + "step": 1717 + }, + { + "epoch": 0.21079754601226994, + "grad_norm": 1.024505940570779, + "learning_rate": 1.8334926087553727e-05, + "loss": 0.5944, + "step": 1718 + }, + { + "epoch": 0.210920245398773, + "grad_norm": 0.9719421956966301, + "learning_rate": 1.833272957095969e-05, + "loss": 0.6283, + "step": 1719 + }, + { + "epoch": 
0.21104294478527608, + "grad_norm": 1.098923040661208, + "learning_rate": 1.8330531738282656e-05, + "loss": 0.6164, + "step": 1720 + }, + { + "epoch": 0.21116564417177913, + "grad_norm": 1.1612356159067192, + "learning_rate": 1.8328332589869756e-05, + "loss": 0.6914, + "step": 1721 + }, + { + "epoch": 0.21128834355828222, + "grad_norm": 0.9360482175601192, + "learning_rate": 1.832613212606832e-05, + "loss": 0.5983, + "step": 1722 + }, + { + "epoch": 0.21141104294478527, + "grad_norm": 1.0312810289016383, + "learning_rate": 1.83239303472259e-05, + "loss": 0.6534, + "step": 1723 + }, + { + "epoch": 0.21153374233128835, + "grad_norm": 0.928888834774644, + "learning_rate": 1.832172725369024e-05, + "loss": 0.6296, + "step": 1724 + }, + { + "epoch": 0.2116564417177914, + "grad_norm": 1.0277695177280641, + "learning_rate": 1.8319522845809306e-05, + "loss": 0.5767, + "step": 1725 + }, + { + "epoch": 0.21177914110429447, + "grad_norm": 1.0576485503454092, + "learning_rate": 1.8317317123931262e-05, + "loss": 0.619, + "step": 1726 + }, + { + "epoch": 0.21190184049079755, + "grad_norm": 1.0205300643949713, + "learning_rate": 1.8315110088404484e-05, + "loss": 0.6202, + "step": 1727 + }, + { + "epoch": 0.2120245398773006, + "grad_norm": 1.1373583323099294, + "learning_rate": 1.831290173957755e-05, + "loss": 0.6206, + "step": 1728 + }, + { + "epoch": 0.2121472392638037, + "grad_norm": 1.060112388432389, + "learning_rate": 1.8310692077799257e-05, + "loss": 0.6625, + "step": 1729 + }, + { + "epoch": 0.21226993865030674, + "grad_norm": 0.9260261473742116, + "learning_rate": 1.8308481103418597e-05, + "loss": 0.6097, + "step": 1730 + }, + { + "epoch": 0.21239263803680983, + "grad_norm": 1.1016600402493402, + "learning_rate": 1.8306268816784778e-05, + "loss": 0.6461, + "step": 1731 + }, + { + "epoch": 0.21251533742331288, + "grad_norm": 1.2106750060361382, + "learning_rate": 1.8304055218247207e-05, + "loss": 0.6216, + "step": 1732 + }, + { + "epoch": 0.21263803680981594, + "grad_norm": 1.0801027064150028, + "learning_rate": 1.8301840308155507e-05, + "loss": 0.6909, + "step": 1733 + }, + { + "epoch": 0.21276073619631902, + "grad_norm": 0.9567332480332673, + "learning_rate": 1.82996240868595e-05, + "loss": 0.6974, + "step": 1734 + }, + { + "epoch": 0.21288343558282208, + "grad_norm": 1.0386231076133017, + "learning_rate": 1.8297406554709228e-05, + "loss": 0.6732, + "step": 1735 + }, + { + "epoch": 0.21300613496932516, + "grad_norm": 1.3375242826277525, + "learning_rate": 1.8295187712054922e-05, + "loss": 0.6778, + "step": 1736 + }, + { + "epoch": 0.21312883435582822, + "grad_norm": 1.122971074739515, + "learning_rate": 1.8292967559247028e-05, + "loss": 0.6324, + "step": 1737 + }, + { + "epoch": 0.2132515337423313, + "grad_norm": 1.1032995083521855, + "learning_rate": 1.8290746096636207e-05, + "loss": 0.6563, + "step": 1738 + }, + { + "epoch": 0.21337423312883436, + "grad_norm": 0.9906636447514543, + "learning_rate": 1.8288523324573314e-05, + "loss": 0.6674, + "step": 1739 + }, + { + "epoch": 0.2134969325153374, + "grad_norm": 1.0157881002180082, + "learning_rate": 1.8286299243409424e-05, + "loss": 0.694, + "step": 1740 + }, + { + "epoch": 0.2136196319018405, + "grad_norm": 1.1442118893252637, + "learning_rate": 1.8284073853495808e-05, + "loss": 0.6646, + "step": 1741 + }, + { + "epoch": 0.21374233128834355, + "grad_norm": 1.0052706180899849, + "learning_rate": 1.8281847155183944e-05, + "loss": 0.5817, + "step": 1742 + }, + { + "epoch": 0.21386503067484663, + "grad_norm": 1.1571585153773765, + "learning_rate": 
1.8279619148825517e-05, + "loss": 0.6558, + "step": 1743 + }, + { + "epoch": 0.2139877300613497, + "grad_norm": 1.0853942970394135, + "learning_rate": 1.8277389834772432e-05, + "loss": 0.5963, + "step": 1744 + }, + { + "epoch": 0.21411042944785277, + "grad_norm": 0.9624377230193629, + "learning_rate": 1.8275159213376783e-05, + "loss": 0.6059, + "step": 1745 + }, + { + "epoch": 0.21423312883435583, + "grad_norm": 1.0109579734954284, + "learning_rate": 1.827292728499088e-05, + "loss": 0.6061, + "step": 1746 + }, + { + "epoch": 0.21435582822085888, + "grad_norm": 1.0089969161834222, + "learning_rate": 1.8270694049967237e-05, + "loss": 0.5903, + "step": 1747 + }, + { + "epoch": 0.21447852760736197, + "grad_norm": 0.9239429833937186, + "learning_rate": 1.826845950865857e-05, + "loss": 0.6509, + "step": 1748 + }, + { + "epoch": 0.21460122699386502, + "grad_norm": 1.140074654628719, + "learning_rate": 1.826622366141781e-05, + "loss": 0.7156, + "step": 1749 + }, + { + "epoch": 0.2147239263803681, + "grad_norm": 1.0726562138111884, + "learning_rate": 1.826398650859809e-05, + "loss": 0.6336, + "step": 1750 + }, + { + "epoch": 0.21484662576687116, + "grad_norm": 1.2560117998275366, + "learning_rate": 1.8261748050552745e-05, + "loss": 0.6173, + "step": 1751 + }, + { + "epoch": 0.21496932515337425, + "grad_norm": 1.118701697009423, + "learning_rate": 1.825950828763533e-05, + "loss": 0.6871, + "step": 1752 + }, + { + "epoch": 0.2150920245398773, + "grad_norm": 1.1584101545486796, + "learning_rate": 1.8257267220199583e-05, + "loss": 0.6134, + "step": 1753 + }, + { + "epoch": 0.21521472392638036, + "grad_norm": 1.1041437536965844, + "learning_rate": 1.8255024848599466e-05, + "loss": 0.6129, + "step": 1754 + }, + { + "epoch": 0.21533742331288344, + "grad_norm": 1.086412275442266, + "learning_rate": 1.8252781173189148e-05, + "loss": 0.6596, + "step": 1755 + }, + { + "epoch": 0.2154601226993865, + "grad_norm": 1.0222915261736893, + "learning_rate": 1.825053619432299e-05, + "loss": 0.6743, + "step": 1756 + }, + { + "epoch": 0.21558282208588958, + "grad_norm": 1.2552420574215346, + "learning_rate": 1.8248289912355575e-05, + "loss": 0.6611, + "step": 1757 + }, + { + "epoch": 0.21570552147239264, + "grad_norm": 1.091642185179408, + "learning_rate": 1.8246042327641678e-05, + "loss": 0.6502, + "step": 1758 + }, + { + "epoch": 0.2158282208588957, + "grad_norm": 1.3311786762588242, + "learning_rate": 1.8243793440536287e-05, + "loss": 0.6361, + "step": 1759 + }, + { + "epoch": 0.21595092024539878, + "grad_norm": 0.9095005170889414, + "learning_rate": 1.82415432513946e-05, + "loss": 0.56, + "step": 1760 + }, + { + "epoch": 0.21607361963190183, + "grad_norm": 1.2154860657148299, + "learning_rate": 1.8239291760572007e-05, + "loss": 0.6486, + "step": 1761 + }, + { + "epoch": 0.21619631901840491, + "grad_norm": 1.0391668882141396, + "learning_rate": 1.823703896842411e-05, + "loss": 0.6787, + "step": 1762 + }, + { + "epoch": 0.21631901840490797, + "grad_norm": 0.9533821438738602, + "learning_rate": 1.823478487530673e-05, + "loss": 0.5839, + "step": 1763 + }, + { + "epoch": 0.21644171779141105, + "grad_norm": 1.1574543716241812, + "learning_rate": 1.8232529481575874e-05, + "loss": 0.6166, + "step": 1764 + }, + { + "epoch": 0.2165644171779141, + "grad_norm": 1.1313900903195584, + "learning_rate": 1.823027278758776e-05, + "loss": 0.6562, + "step": 1765 + }, + { + "epoch": 0.21668711656441716, + "grad_norm": 0.939163750894956, + "learning_rate": 1.8228014793698817e-05, + "loss": 0.6302, + "step": 1766 + }, + { + "epoch": 
0.21680981595092025, + "grad_norm": 0.9601918746117332, + "learning_rate": 1.8225755500265676e-05, + "loss": 0.5715, + "step": 1767 + }, + { + "epoch": 0.2169325153374233, + "grad_norm": 1.1288263384727553, + "learning_rate": 1.822349490764517e-05, + "loss": 0.6268, + "step": 1768 + }, + { + "epoch": 0.2170552147239264, + "grad_norm": 1.0299475540639633, + "learning_rate": 1.8221233016194343e-05, + "loss": 0.6539, + "step": 1769 + }, + { + "epoch": 0.21717791411042944, + "grad_norm": 1.0972244653418668, + "learning_rate": 1.821896982627044e-05, + "loss": 0.608, + "step": 1770 + }, + { + "epoch": 0.21730061349693253, + "grad_norm": 1.1055497917629578, + "learning_rate": 1.8216705338230913e-05, + "loss": 0.6555, + "step": 1771 + }, + { + "epoch": 0.21742331288343558, + "grad_norm": 1.0393677451122405, + "learning_rate": 1.8214439552433417e-05, + "loss": 0.6108, + "step": 1772 + }, + { + "epoch": 0.21754601226993864, + "grad_norm": 1.1724653200395894, + "learning_rate": 1.8212172469235817e-05, + "loss": 0.6284, + "step": 1773 + }, + { + "epoch": 0.21766871165644172, + "grad_norm": 0.8670416788970554, + "learning_rate": 1.8209904088996174e-05, + "loss": 0.6073, + "step": 1774 + }, + { + "epoch": 0.21779141104294478, + "grad_norm": 1.370831118016265, + "learning_rate": 1.8207634412072765e-05, + "loss": 0.6262, + "step": 1775 + }, + { + "epoch": 0.21791411042944786, + "grad_norm": 1.003200180822066, + "learning_rate": 1.820536343882406e-05, + "loss": 0.5999, + "step": 1776 + }, + { + "epoch": 0.21803680981595092, + "grad_norm": 1.0640345086757295, + "learning_rate": 1.8203091169608743e-05, + "loss": 0.6623, + "step": 1777 + }, + { + "epoch": 0.218159509202454, + "grad_norm": 0.9529822993429099, + "learning_rate": 1.8200817604785702e-05, + "loss": 0.6255, + "step": 1778 + }, + { + "epoch": 0.21828220858895706, + "grad_norm": 1.0582654755838254, + "learning_rate": 1.819854274471402e-05, + "loss": 0.5965, + "step": 1779 + }, + { + "epoch": 0.2184049079754601, + "grad_norm": 0.9638837076754149, + "learning_rate": 1.8196266589753e-05, + "loss": 0.6522, + "step": 1780 + }, + { + "epoch": 0.2185276073619632, + "grad_norm": 1.0281326213442865, + "learning_rate": 1.8193989140262133e-05, + "loss": 0.6272, + "step": 1781 + }, + { + "epoch": 0.21865030674846625, + "grad_norm": 0.9549365280432991, + "learning_rate": 1.8191710396601126e-05, + "loss": 0.6282, + "step": 1782 + }, + { + "epoch": 0.21877300613496933, + "grad_norm": 1.1900425419785858, + "learning_rate": 1.818943035912989e-05, + "loss": 0.659, + "step": 1783 + }, + { + "epoch": 0.2188957055214724, + "grad_norm": 1.100062412957141, + "learning_rate": 1.818714902820853e-05, + "loss": 0.6813, + "step": 1784 + }, + { + "epoch": 0.21901840490797547, + "grad_norm": 0.9378392874753789, + "learning_rate": 1.818486640419737e-05, + "loss": 0.6692, + "step": 1785 + }, + { + "epoch": 0.21914110429447853, + "grad_norm": 0.9466585827170177, + "learning_rate": 1.818258248745693e-05, + "loss": 0.641, + "step": 1786 + }, + { + "epoch": 0.21926380368098158, + "grad_norm": 1.0844464934532343, + "learning_rate": 1.818029727834793e-05, + "loss": 0.6217, + "step": 1787 + }, + { + "epoch": 0.21938650306748467, + "grad_norm": 1.2031154543200602, + "learning_rate": 1.8178010777231302e-05, + "loss": 0.6763, + "step": 1788 + }, + { + "epoch": 0.21950920245398772, + "grad_norm": 0.9808317364233782, + "learning_rate": 1.817572298446818e-05, + "loss": 0.6276, + "step": 1789 + }, + { + "epoch": 0.2196319018404908, + "grad_norm": 0.8773483635975353, + "learning_rate": 
1.81734339004199e-05, + "loss": 0.5998, + "step": 1790 + }, + { + "epoch": 0.21975460122699386, + "grad_norm": 1.039092749279893, + "learning_rate": 1.8171143525448006e-05, + "loss": 0.6347, + "step": 1791 + }, + { + "epoch": 0.21987730061349695, + "grad_norm": 1.0303847034039166, + "learning_rate": 1.816885185991424e-05, + "loss": 0.5823, + "step": 1792 + }, + { + "epoch": 0.22, + "grad_norm": 0.9911721156613591, + "learning_rate": 1.8166558904180547e-05, + "loss": 0.5545, + "step": 1793 + }, + { + "epoch": 0.22012269938650306, + "grad_norm": 1.0031029223334786, + "learning_rate": 1.816426465860909e-05, + "loss": 0.5855, + "step": 1794 + }, + { + "epoch": 0.22024539877300614, + "grad_norm": 1.039342656994947, + "learning_rate": 1.816196912356222e-05, + "loss": 0.6817, + "step": 1795 + }, + { + "epoch": 0.2203680981595092, + "grad_norm": 0.9906611454065227, + "learning_rate": 1.8159672299402493e-05, + "loss": 0.6194, + "step": 1796 + }, + { + "epoch": 0.22049079754601228, + "grad_norm": 0.9556316505009305, + "learning_rate": 1.815737418649268e-05, + "loss": 0.5999, + "step": 1797 + }, + { + "epoch": 0.22061349693251533, + "grad_norm": 1.050128128846568, + "learning_rate": 1.8155074785195738e-05, + "loss": 0.6623, + "step": 1798 + }, + { + "epoch": 0.22073619631901842, + "grad_norm": 2.164985522936892, + "learning_rate": 1.815277409587485e-05, + "loss": 0.64, + "step": 1799 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 1.0377304416969186, + "learning_rate": 1.8150472118893382e-05, + "loss": 0.6267, + "step": 1800 + }, + { + "epoch": 0.22098159509202453, + "grad_norm": 1.083397556488492, + "learning_rate": 1.8148168854614915e-05, + "loss": 0.5938, + "step": 1801 + }, + { + "epoch": 0.2211042944785276, + "grad_norm": 1.0558789837156413, + "learning_rate": 1.8145864303403224e-05, + "loss": 0.5908, + "step": 1802 + }, + { + "epoch": 0.22122699386503067, + "grad_norm": 0.971218604864722, + "learning_rate": 1.81435584656223e-05, + "loss": 0.6553, + "step": 1803 + }, + { + "epoch": 0.22134969325153375, + "grad_norm": 0.9065411182124162, + "learning_rate": 1.814125134163633e-05, + "loss": 0.5985, + "step": 1804 + }, + { + "epoch": 0.2214723926380368, + "grad_norm": 1.1409588944487654, + "learning_rate": 1.8138942931809702e-05, + "loss": 0.6654, + "step": 1805 + }, + { + "epoch": 0.22159509202453986, + "grad_norm": 0.9918033219288129, + "learning_rate": 1.813663323650701e-05, + "loss": 0.6348, + "step": 1806 + }, + { + "epoch": 0.22171779141104295, + "grad_norm": 0.9485379416213425, + "learning_rate": 1.8134322256093046e-05, + "loss": 0.6456, + "step": 1807 + }, + { + "epoch": 0.221840490797546, + "grad_norm": 1.2423124627995406, + "learning_rate": 1.8132009990932816e-05, + "loss": 0.6138, + "step": 1808 + }, + { + "epoch": 0.2219631901840491, + "grad_norm": 0.8727276135521267, + "learning_rate": 1.812969644139152e-05, + "loss": 0.5806, + "step": 1809 + }, + { + "epoch": 0.22208588957055214, + "grad_norm": 0.9485092404965277, + "learning_rate": 1.8127381607834563e-05, + "loss": 0.6551, + "step": 1810 + }, + { + "epoch": 0.22220858895705523, + "grad_norm": 1.278368403418647, + "learning_rate": 1.8125065490627556e-05, + "loss": 0.6049, + "step": 1811 + }, + { + "epoch": 0.22233128834355828, + "grad_norm": 1.0060381798652298, + "learning_rate": 1.8122748090136303e-05, + "loss": 0.6389, + "step": 1812 + }, + { + "epoch": 0.22245398773006134, + "grad_norm": 0.9804114214220485, + "learning_rate": 1.8120429406726827e-05, + "loss": 0.6242, + "step": 1813 + }, + { + "epoch": 0.22257668711656442, + 
"grad_norm": 1.0239407541528052, + "learning_rate": 1.8118109440765334e-05, + "loss": 0.6555, + "step": 1814 + }, + { + "epoch": 0.22269938650306748, + "grad_norm": 1.2522577425471721, + "learning_rate": 1.8115788192618247e-05, + "loss": 0.7072, + "step": 1815 + }, + { + "epoch": 0.22282208588957056, + "grad_norm": 0.993914911144947, + "learning_rate": 1.811346566265219e-05, + "loss": 0.6738, + "step": 1816 + }, + { + "epoch": 0.22294478527607361, + "grad_norm": 1.0200707205076973, + "learning_rate": 1.8111141851233986e-05, + "loss": 0.6239, + "step": 1817 + }, + { + "epoch": 0.2230674846625767, + "grad_norm": 1.0070564358919485, + "learning_rate": 1.8108816758730656e-05, + "loss": 0.6065, + "step": 1818 + }, + { + "epoch": 0.22319018404907975, + "grad_norm": 0.9904889200001891, + "learning_rate": 1.810649038550943e-05, + "loss": 0.6521, + "step": 1819 + }, + { + "epoch": 0.2233128834355828, + "grad_norm": 1.1387364725598086, + "learning_rate": 1.8104162731937746e-05, + "loss": 0.6606, + "step": 1820 + }, + { + "epoch": 0.2234355828220859, + "grad_norm": 1.169092143015251, + "learning_rate": 1.8101833798383227e-05, + "loss": 0.6662, + "step": 1821 + }, + { + "epoch": 0.22355828220858895, + "grad_norm": 1.0458752384719292, + "learning_rate": 1.8099503585213715e-05, + "loss": 0.6173, + "step": 1822 + }, + { + "epoch": 0.22368098159509203, + "grad_norm": 1.1213687419235494, + "learning_rate": 1.8097172092797244e-05, + "loss": 0.6471, + "step": 1823 + }, + { + "epoch": 0.2238036809815951, + "grad_norm": 0.9605778259087605, + "learning_rate": 1.809483932150205e-05, + "loss": 0.616, + "step": 1824 + }, + { + "epoch": 0.22392638036809817, + "grad_norm": 1.0047659600671377, + "learning_rate": 1.8092505271696582e-05, + "loss": 0.6113, + "step": 1825 + }, + { + "epoch": 0.22404907975460123, + "grad_norm": 1.0499726974858483, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.6578, + "step": 1826 + }, + { + "epoch": 0.22417177914110428, + "grad_norm": 1.0259929995383963, + "learning_rate": 1.808783333802958e-05, + "loss": 0.653, + "step": 1827 + }, + { + "epoch": 0.22429447852760737, + "grad_norm": 1.197277166306749, + "learning_rate": 1.8085495454905944e-05, + "loss": 0.7076, + "step": 1828 + }, + { + "epoch": 0.22441717791411042, + "grad_norm": 0.9328014364767648, + "learning_rate": 1.8083156294747807e-05, + "loss": 0.5974, + "step": 1829 + }, + { + "epoch": 0.2245398773006135, + "grad_norm": 1.0651149113672957, + "learning_rate": 1.808081585792463e-05, + "loss": 0.6054, + "step": 1830 + }, + { + "epoch": 0.22466257668711656, + "grad_norm": 1.095850843720693, + "learning_rate": 1.807847414480606e-05, + "loss": 0.6729, + "step": 1831 + }, + { + "epoch": 0.22478527607361964, + "grad_norm": 0.9746770171168175, + "learning_rate": 1.807613115576195e-05, + "loss": 0.6275, + "step": 1832 + }, + { + "epoch": 0.2249079754601227, + "grad_norm": 1.176637829958022, + "learning_rate": 1.8073786891162356e-05, + "loss": 0.6093, + "step": 1833 + }, + { + "epoch": 0.22503067484662576, + "grad_norm": 0.9569194029295127, + "learning_rate": 1.8071441351377534e-05, + "loss": 0.5494, + "step": 1834 + }, + { + "epoch": 0.22515337423312884, + "grad_norm": 0.9661811615691452, + "learning_rate": 1.8069094536777938e-05, + "loss": 0.5924, + "step": 1835 + }, + { + "epoch": 0.2252760736196319, + "grad_norm": 1.0080196109086588, + "learning_rate": 1.8066746447734237e-05, + "loss": 0.6384, + "step": 1836 + }, + { + "epoch": 0.22539877300613498, + "grad_norm": 1.131540586503095, + "learning_rate": 1.806439708461728e-05, + 
"loss": 0.5494, + "step": 1837 + }, + { + "epoch": 0.22552147239263803, + "grad_norm": 1.180612799726002, + "learning_rate": 1.806204644779814e-05, + "loss": 0.5944, + "step": 1838 + }, + { + "epoch": 0.22564417177914112, + "grad_norm": 0.9900624661919424, + "learning_rate": 1.805969453764807e-05, + "loss": 0.5403, + "step": 1839 + }, + { + "epoch": 0.22576687116564417, + "grad_norm": 0.8992921177438409, + "learning_rate": 1.805734135453854e-05, + "loss": 0.6154, + "step": 1840 + }, + { + "epoch": 0.22588957055214723, + "grad_norm": 1.0675314047249398, + "learning_rate": 1.8054986898841217e-05, + "loss": 0.6679, + "step": 1841 + }, + { + "epoch": 0.2260122699386503, + "grad_norm": 1.0133704153563394, + "learning_rate": 1.8052631170927958e-05, + "loss": 0.6311, + "step": 1842 + }, + { + "epoch": 0.22613496932515337, + "grad_norm": 0.9676237415452759, + "learning_rate": 1.8050274171170835e-05, + "loss": 0.5926, + "step": 1843 + }, + { + "epoch": 0.22625766871165645, + "grad_norm": 1.0613633094888912, + "learning_rate": 1.804791589994212e-05, + "loss": 0.6229, + "step": 1844 + }, + { + "epoch": 0.2263803680981595, + "grad_norm": 1.0373289187477777, + "learning_rate": 1.8045556357614273e-05, + "loss": 0.6312, + "step": 1845 + }, + { + "epoch": 0.2265030674846626, + "grad_norm": 1.2380036717637726, + "learning_rate": 1.8043195544559972e-05, + "loss": 0.659, + "step": 1846 + }, + { + "epoch": 0.22662576687116565, + "grad_norm": 1.054805606428618, + "learning_rate": 1.804083346115208e-05, + "loss": 0.6479, + "step": 1847 + }, + { + "epoch": 0.2267484662576687, + "grad_norm": 1.2881258121031616, + "learning_rate": 1.8038470107763673e-05, + "loss": 0.6825, + "step": 1848 + }, + { + "epoch": 0.22687116564417178, + "grad_norm": 1.087865809053787, + "learning_rate": 1.8036105484768023e-05, + "loss": 0.6603, + "step": 1849 + }, + { + "epoch": 0.22699386503067484, + "grad_norm": 1.1050690830985757, + "learning_rate": 1.8033739592538598e-05, + "loss": 0.6109, + "step": 1850 + }, + { + "epoch": 0.22711656441717792, + "grad_norm": 1.0846258545672485, + "learning_rate": 1.803137243144907e-05, + "loss": 0.6617, + "step": 1851 + }, + { + "epoch": 0.22723926380368098, + "grad_norm": 1.1727485425591961, + "learning_rate": 1.8029004001873312e-05, + "loss": 0.6197, + "step": 1852 + }, + { + "epoch": 0.22736196319018404, + "grad_norm": 1.039728553064627, + "learning_rate": 1.80266343041854e-05, + "loss": 0.6333, + "step": 1853 + }, + { + "epoch": 0.22748466257668712, + "grad_norm": 1.0148178868632438, + "learning_rate": 1.8024263338759607e-05, + "loss": 0.6009, + "step": 1854 + }, + { + "epoch": 0.22760736196319017, + "grad_norm": 1.118616909152692, + "learning_rate": 1.8021891105970405e-05, + "loss": 0.6746, + "step": 1855 + }, + { + "epoch": 0.22773006134969326, + "grad_norm": 1.2533647773079262, + "learning_rate": 1.8019517606192467e-05, + "loss": 0.688, + "step": 1856 + }, + { + "epoch": 0.2278527607361963, + "grad_norm": 1.2312660472146748, + "learning_rate": 1.8017142839800667e-05, + "loss": 0.6529, + "step": 1857 + }, + { + "epoch": 0.2279754601226994, + "grad_norm": 1.1279015431741362, + "learning_rate": 1.8014766807170082e-05, + "loss": 0.66, + "step": 1858 + }, + { + "epoch": 0.22809815950920245, + "grad_norm": 1.0631697680086767, + "learning_rate": 1.8012389508675984e-05, + "loss": 0.5998, + "step": 1859 + }, + { + "epoch": 0.2282208588957055, + "grad_norm": 1.0291017036009753, + "learning_rate": 1.8010010944693846e-05, + "loss": 0.6174, + "step": 1860 + }, + { + "epoch": 0.2283435582822086, + 
"grad_norm": 0.9940878080326012, + "learning_rate": 1.800763111559935e-05, + "loss": 0.6416, + "step": 1861 + }, + { + "epoch": 0.22846625766871165, + "grad_norm": 1.0287008724687503, + "learning_rate": 1.800525002176835e-05, + "loss": 0.6348, + "step": 1862 + }, + { + "epoch": 0.22858895705521473, + "grad_norm": 0.9653057929467409, + "learning_rate": 1.8002867663576937e-05, + "loss": 0.6124, + "step": 1863 + }, + { + "epoch": 0.2287116564417178, + "grad_norm": 1.102244161910205, + "learning_rate": 1.800048404140138e-05, + "loss": 0.6313, + "step": 1864 + }, + { + "epoch": 0.22883435582822087, + "grad_norm": 1.009402916709175, + "learning_rate": 1.7998099155618147e-05, + "loss": 0.6892, + "step": 1865 + }, + { + "epoch": 0.22895705521472393, + "grad_norm": 0.9107049793217096, + "learning_rate": 1.7995713006603913e-05, + "loss": 0.6576, + "step": 1866 + }, + { + "epoch": 0.22907975460122698, + "grad_norm": 1.0124238603245261, + "learning_rate": 1.7993325594735552e-05, + "loss": 0.6118, + "step": 1867 + }, + { + "epoch": 0.22920245398773006, + "grad_norm": 0.9545120176938704, + "learning_rate": 1.7990936920390133e-05, + "loss": 0.6168, + "step": 1868 + }, + { + "epoch": 0.22932515337423312, + "grad_norm": 0.917692052918553, + "learning_rate": 1.7988546983944925e-05, + "loss": 0.6044, + "step": 1869 + }, + { + "epoch": 0.2294478527607362, + "grad_norm": 0.9892055273682788, + "learning_rate": 1.7986155785777402e-05, + "loss": 0.6633, + "step": 1870 + }, + { + "epoch": 0.22957055214723926, + "grad_norm": 0.8797039025837357, + "learning_rate": 1.798376332626523e-05, + "loss": 0.6685, + "step": 1871 + }, + { + "epoch": 0.22969325153374234, + "grad_norm": 0.9804304708861717, + "learning_rate": 1.7981369605786277e-05, + "loss": 0.627, + "step": 1872 + }, + { + "epoch": 0.2298159509202454, + "grad_norm": 1.01592756172574, + "learning_rate": 1.7978974624718614e-05, + "loss": 0.593, + "step": 1873 + }, + { + "epoch": 0.22993865030674845, + "grad_norm": 1.0338678833817585, + "learning_rate": 1.7976578383440502e-05, + "loss": 0.6522, + "step": 1874 + }, + { + "epoch": 0.23006134969325154, + "grad_norm": 0.9161263507242781, + "learning_rate": 1.7974180882330413e-05, + "loss": 0.6001, + "step": 1875 + }, + { + "epoch": 0.2301840490797546, + "grad_norm": 1.0361208071613934, + "learning_rate": 1.797178212176701e-05, + "loss": 0.5952, + "step": 1876 + }, + { + "epoch": 0.23030674846625768, + "grad_norm": 1.0270780135822428, + "learning_rate": 1.7969382102129153e-05, + "loss": 0.6788, + "step": 1877 + }, + { + "epoch": 0.23042944785276073, + "grad_norm": 1.0266093212151557, + "learning_rate": 1.7966980823795904e-05, + "loss": 0.6109, + "step": 1878 + }, + { + "epoch": 0.23055214723926382, + "grad_norm": 0.9830156416233545, + "learning_rate": 1.796457828714653e-05, + "loss": 0.6104, + "step": 1879 + }, + { + "epoch": 0.23067484662576687, + "grad_norm": 1.1019369423729652, + "learning_rate": 1.7962174492560492e-05, + "loss": 0.6196, + "step": 1880 + }, + { + "epoch": 0.23079754601226993, + "grad_norm": 1.107238635336944, + "learning_rate": 1.795976944041744e-05, + "loss": 0.6215, + "step": 1881 + }, + { + "epoch": 0.230920245398773, + "grad_norm": 0.8480001986125165, + "learning_rate": 1.795736313109724e-05, + "loss": 0.6404, + "step": 1882 + }, + { + "epoch": 0.23104294478527607, + "grad_norm": 1.206275554205202, + "learning_rate": 1.7954955564979944e-05, + "loss": 0.6925, + "step": 1883 + }, + { + "epoch": 0.23116564417177915, + "grad_norm": 1.0632516003595127, + "learning_rate": 1.7952546742445802e-05, + 
"loss": 0.6328, + "step": 1884 + }, + { + "epoch": 0.2312883435582822, + "grad_norm": 1.1021720266357353, + "learning_rate": 1.7950136663875274e-05, + "loss": 0.6508, + "step": 1885 + }, + { + "epoch": 0.2314110429447853, + "grad_norm": 0.9780710739552569, + "learning_rate": 1.794772532964901e-05, + "loss": 0.6269, + "step": 1886 + }, + { + "epoch": 0.23153374233128834, + "grad_norm": 0.98442048173137, + "learning_rate": 1.7945312740147857e-05, + "loss": 0.6513, + "step": 1887 + }, + { + "epoch": 0.2316564417177914, + "grad_norm": 1.1265748803847158, + "learning_rate": 1.794289889575286e-05, + "loss": 0.6208, + "step": 1888 + }, + { + "epoch": 0.23177914110429448, + "grad_norm": 0.9613376023478815, + "learning_rate": 1.7940483796845275e-05, + "loss": 0.6339, + "step": 1889 + }, + { + "epoch": 0.23190184049079754, + "grad_norm": 0.9710357744062104, + "learning_rate": 1.7938067443806538e-05, + "loss": 0.6075, + "step": 1890 + }, + { + "epoch": 0.23202453987730062, + "grad_norm": 0.933673190380908, + "learning_rate": 1.793564983701829e-05, + "loss": 0.5805, + "step": 1891 + }, + { + "epoch": 0.23214723926380368, + "grad_norm": 0.9488073239291511, + "learning_rate": 1.7933230976862375e-05, + "loss": 0.6348, + "step": 1892 + }, + { + "epoch": 0.23226993865030676, + "grad_norm": 1.0098478413744405, + "learning_rate": 1.7930810863720833e-05, + "loss": 0.6332, + "step": 1893 + }, + { + "epoch": 0.23239263803680982, + "grad_norm": 1.1623357881277, + "learning_rate": 1.7928389497975897e-05, + "loss": 0.5903, + "step": 1894 + }, + { + "epoch": 0.23251533742331287, + "grad_norm": 1.1149116190407276, + "learning_rate": 1.7925966880009998e-05, + "loss": 0.6342, + "step": 1895 + }, + { + "epoch": 0.23263803680981596, + "grad_norm": 1.1909123187330264, + "learning_rate": 1.7923543010205774e-05, + "loss": 0.7126, + "step": 1896 + }, + { + "epoch": 0.232760736196319, + "grad_norm": 0.8966208258825502, + "learning_rate": 1.7921117888946052e-05, + "loss": 0.6243, + "step": 1897 + }, + { + "epoch": 0.2328834355828221, + "grad_norm": 1.0508609528508073, + "learning_rate": 1.7918691516613855e-05, + "loss": 0.6633, + "step": 1898 + }, + { + "epoch": 0.23300613496932515, + "grad_norm": 0.9719540316600598, + "learning_rate": 1.7916263893592412e-05, + "loss": 0.6166, + "step": 1899 + }, + { + "epoch": 0.2331288343558282, + "grad_norm": 1.014671477246682, + "learning_rate": 1.791383502026515e-05, + "loss": 0.645, + "step": 1900 + }, + { + "epoch": 0.2332515337423313, + "grad_norm": 0.8907012401874824, + "learning_rate": 1.7911404897015674e-05, + "loss": 0.6666, + "step": 1901 + }, + { + "epoch": 0.23337423312883435, + "grad_norm": 1.0619723625820903, + "learning_rate": 1.7908973524227814e-05, + "loss": 0.6078, + "step": 1902 + }, + { + "epoch": 0.23349693251533743, + "grad_norm": 1.0539161032492093, + "learning_rate": 1.7906540902285582e-05, + "loss": 0.6276, + "step": 1903 + }, + { + "epoch": 0.23361963190184049, + "grad_norm": 0.9080308899132005, + "learning_rate": 1.7904107031573186e-05, + "loss": 0.5953, + "step": 1904 + }, + { + "epoch": 0.23374233128834357, + "grad_norm": 0.916496801570089, + "learning_rate": 1.790167191247504e-05, + "loss": 0.6101, + "step": 1905 + }, + { + "epoch": 0.23386503067484662, + "grad_norm": 0.945841075614056, + "learning_rate": 1.7899235545375745e-05, + "loss": 0.6661, + "step": 1906 + }, + { + "epoch": 0.23398773006134968, + "grad_norm": 0.9412819639499502, + "learning_rate": 1.7896797930660107e-05, + "loss": 0.6516, + "step": 1907 + }, + { + "epoch": 0.23411042944785276, + 
"grad_norm": 1.3044905074996083, + "learning_rate": 1.7894359068713126e-05, + "loss": 0.6589, + "step": 1908 + }, + { + "epoch": 0.23423312883435582, + "grad_norm": 1.0469646263889414, + "learning_rate": 1.789191895992e-05, + "loss": 0.5874, + "step": 1909 + }, + { + "epoch": 0.2343558282208589, + "grad_norm": 1.740692421178088, + "learning_rate": 1.7889477604666124e-05, + "loss": 0.629, + "step": 1910 + }, + { + "epoch": 0.23447852760736196, + "grad_norm": 1.0429916404461872, + "learning_rate": 1.7887035003337082e-05, + "loss": 0.6501, + "step": 1911 + }, + { + "epoch": 0.23460122699386504, + "grad_norm": 1.000812362735024, + "learning_rate": 1.788459115631867e-05, + "loss": 0.6531, + "step": 1912 + }, + { + "epoch": 0.2347239263803681, + "grad_norm": 0.9299512366363624, + "learning_rate": 1.788214606399687e-05, + "loss": 0.6384, + "step": 1913 + }, + { + "epoch": 0.23484662576687115, + "grad_norm": 0.9808683317827175, + "learning_rate": 1.7879699726757865e-05, + "loss": 0.6633, + "step": 1914 + }, + { + "epoch": 0.23496932515337424, + "grad_norm": 1.0909188085927706, + "learning_rate": 1.787725214498803e-05, + "loss": 0.6615, + "step": 1915 + }, + { + "epoch": 0.2350920245398773, + "grad_norm": 0.9260807740452512, + "learning_rate": 1.787480331907394e-05, + "loss": 0.5924, + "step": 1916 + }, + { + "epoch": 0.23521472392638038, + "grad_norm": 1.0989903852871943, + "learning_rate": 1.7872353249402366e-05, + "loss": 0.6592, + "step": 1917 + }, + { + "epoch": 0.23533742331288343, + "grad_norm": 0.9412956109745889, + "learning_rate": 1.7869901936360275e-05, + "loss": 0.6454, + "step": 1918 + }, + { + "epoch": 0.23546012269938651, + "grad_norm": 1.121945407572883, + "learning_rate": 1.7867449380334834e-05, + "loss": 0.6956, + "step": 1919 + }, + { + "epoch": 0.23558282208588957, + "grad_norm": 0.9914885405357696, + "learning_rate": 1.78649955817134e-05, + "loss": 0.6324, + "step": 1920 + }, + { + "epoch": 0.23570552147239263, + "grad_norm": 1.0466371969940071, + "learning_rate": 1.786254054088353e-05, + "loss": 0.6406, + "step": 1921 + }, + { + "epoch": 0.2358282208588957, + "grad_norm": 1.0864512690960633, + "learning_rate": 1.7860084258232978e-05, + "loss": 0.6762, + "step": 1922 + }, + { + "epoch": 0.23595092024539877, + "grad_norm": 1.053540282133026, + "learning_rate": 1.7857626734149694e-05, + "loss": 0.6194, + "step": 1923 + }, + { + "epoch": 0.23607361963190185, + "grad_norm": 1.0953075791258873, + "learning_rate": 1.7855167969021817e-05, + "loss": 0.6603, + "step": 1924 + }, + { + "epoch": 0.2361963190184049, + "grad_norm": 1.36362214088143, + "learning_rate": 1.785270796323769e-05, + "loss": 0.6105, + "step": 1925 + }, + { + "epoch": 0.236319018404908, + "grad_norm": 1.1098225610571872, + "learning_rate": 1.7850246717185856e-05, + "loss": 0.6796, + "step": 1926 + }, + { + "epoch": 0.23644171779141104, + "grad_norm": 0.9386917664251772, + "learning_rate": 1.784778423125504e-05, + "loss": 0.6481, + "step": 1927 + }, + { + "epoch": 0.2365644171779141, + "grad_norm": 1.1102163533379144, + "learning_rate": 1.7845320505834176e-05, + "loss": 0.6651, + "step": 1928 + }, + { + "epoch": 0.23668711656441718, + "grad_norm": 1.0269948621843006, + "learning_rate": 1.7842855541312382e-05, + "loss": 0.6305, + "step": 1929 + }, + { + "epoch": 0.23680981595092024, + "grad_norm": 0.9334841279029817, + "learning_rate": 1.784038933807898e-05, + "loss": 0.6095, + "step": 1930 + }, + { + "epoch": 0.23693251533742332, + "grad_norm": 1.0760283541317073, + "learning_rate": 1.783792189652349e-05, + "loss": 
0.6742, + "step": 1931 + }, + { + "epoch": 0.23705521472392638, + "grad_norm": 0.8580685207878876, + "learning_rate": 1.7835453217035622e-05, + "loss": 0.6278, + "step": 1932 + }, + { + "epoch": 0.23717791411042946, + "grad_norm": 0.8806284922230058, + "learning_rate": 1.783298330000528e-05, + "loss": 0.6198, + "step": 1933 + }, + { + "epoch": 0.23730061349693252, + "grad_norm": 1.0060287500818799, + "learning_rate": 1.7830512145822564e-05, + "loss": 0.6409, + "step": 1934 + }, + { + "epoch": 0.23742331288343557, + "grad_norm": 0.9960590331254614, + "learning_rate": 1.7828039754877778e-05, + "loss": 0.6357, + "step": 1935 + }, + { + "epoch": 0.23754601226993866, + "grad_norm": 1.057445841656457, + "learning_rate": 1.782556612756141e-05, + "loss": 0.6588, + "step": 1936 + }, + { + "epoch": 0.2376687116564417, + "grad_norm": 0.9418011141715303, + "learning_rate": 1.782309126426415e-05, + "loss": 0.6254, + "step": 1937 + }, + { + "epoch": 0.2377914110429448, + "grad_norm": 1.0510025268677048, + "learning_rate": 1.782061516537688e-05, + "loss": 0.6465, + "step": 1938 + }, + { + "epoch": 0.23791411042944785, + "grad_norm": 0.9341437877130986, + "learning_rate": 1.781813783129068e-05, + "loss": 0.5834, + "step": 1939 + }, + { + "epoch": 0.23803680981595093, + "grad_norm": 0.9765994182027354, + "learning_rate": 1.7815659262396825e-05, + "loss": 0.6254, + "step": 1940 + }, + { + "epoch": 0.238159509202454, + "grad_norm": 0.9978039521133529, + "learning_rate": 1.7813179459086782e-05, + "loss": 0.632, + "step": 1941 + }, + { + "epoch": 0.23828220858895705, + "grad_norm": 0.9263392645917258, + "learning_rate": 1.7810698421752212e-05, + "loss": 0.6152, + "step": 1942 + }, + { + "epoch": 0.23840490797546013, + "grad_norm": 1.0687558459606896, + "learning_rate": 1.7808216150784977e-05, + "loss": 0.636, + "step": 1943 + }, + { + "epoch": 0.23852760736196318, + "grad_norm": 0.9034219904098624, + "learning_rate": 1.780573264657713e-05, + "loss": 0.6175, + "step": 1944 + }, + { + "epoch": 0.23865030674846627, + "grad_norm": 0.9275050665518233, + "learning_rate": 1.780324790952092e-05, + "loss": 0.6458, + "step": 1945 + }, + { + "epoch": 0.23877300613496932, + "grad_norm": 1.02915104737862, + "learning_rate": 1.780076194000879e-05, + "loss": 0.6666, + "step": 1946 + }, + { + "epoch": 0.23889570552147238, + "grad_norm": 0.93705323812174, + "learning_rate": 1.779827473843337e-05, + "loss": 0.5906, + "step": 1947 + }, + { + "epoch": 0.23901840490797546, + "grad_norm": 1.0034260508848334, + "learning_rate": 1.77957863051875e-05, + "loss": 0.7077, + "step": 1948 + }, + { + "epoch": 0.23914110429447852, + "grad_norm": 0.9449761322908586, + "learning_rate": 1.7793296640664205e-05, + "loss": 0.6589, + "step": 1949 + }, + { + "epoch": 0.2392638036809816, + "grad_norm": 0.9585734026385104, + "learning_rate": 1.7790805745256703e-05, + "loss": 0.6307, + "step": 1950 + }, + { + "epoch": 0.23938650306748466, + "grad_norm": 1.5815661254723687, + "learning_rate": 1.778831361935842e-05, + "loss": 0.6201, + "step": 1951 + }, + { + "epoch": 0.23950920245398774, + "grad_norm": 1.0065990903779147, + "learning_rate": 1.7785820263362953e-05, + "loss": 0.6519, + "step": 1952 + }, + { + "epoch": 0.2396319018404908, + "grad_norm": 0.9101172209532505, + "learning_rate": 1.7783325677664113e-05, + "loss": 0.613, + "step": 1953 + }, + { + "epoch": 0.23975460122699385, + "grad_norm": 1.0299380372051752, + "learning_rate": 1.7780829862655894e-05, + "loss": 0.6231, + "step": 1954 + }, + { + "epoch": 0.23987730061349694, + "grad_norm": 
0.9235894983846478, + "learning_rate": 1.7778332818732492e-05, + "loss": 0.6168, + "step": 1955 + }, + { + "epoch": 0.24, + "grad_norm": 1.0811348650850634, + "learning_rate": 1.7775834546288292e-05, + "loss": 0.6997, + "step": 1956 + }, + { + "epoch": 0.24012269938650307, + "grad_norm": 1.0404259712753636, + "learning_rate": 1.7773335045717875e-05, + "loss": 0.6707, + "step": 1957 + }, + { + "epoch": 0.24024539877300613, + "grad_norm": 1.198891394601472, + "learning_rate": 1.7770834317416018e-05, + "loss": 0.5924, + "step": 1958 + }, + { + "epoch": 0.2403680981595092, + "grad_norm": 1.0278129387998753, + "learning_rate": 1.776833236177769e-05, + "loss": 0.5726, + "step": 1959 + }, + { + "epoch": 0.24049079754601227, + "grad_norm": 1.0020237196409514, + "learning_rate": 1.7765829179198048e-05, + "loss": 0.6088, + "step": 1960 + }, + { + "epoch": 0.24061349693251532, + "grad_norm": 0.9978373320328515, + "learning_rate": 1.776332477007245e-05, + "loss": 0.6118, + "step": 1961 + }, + { + "epoch": 0.2407361963190184, + "grad_norm": 0.9106382487201005, + "learning_rate": 1.776081913479645e-05, + "loss": 0.6118, + "step": 1962 + }, + { + "epoch": 0.24085889570552146, + "grad_norm": 0.9627368528801565, + "learning_rate": 1.7758312273765787e-05, + "loss": 0.6233, + "step": 1963 + }, + { + "epoch": 0.24098159509202455, + "grad_norm": 1.0310381958161794, + "learning_rate": 1.7755804187376398e-05, + "loss": 0.6553, + "step": 1964 + }, + { + "epoch": 0.2411042944785276, + "grad_norm": 0.9866142670141139, + "learning_rate": 1.7753294876024417e-05, + "loss": 0.6792, + "step": 1965 + }, + { + "epoch": 0.2412269938650307, + "grad_norm": 1.040618463426992, + "learning_rate": 1.775078434010617e-05, + "loss": 0.6349, + "step": 1966 + }, + { + "epoch": 0.24134969325153374, + "grad_norm": 0.9607050933543506, + "learning_rate": 1.7748272580018168e-05, + "loss": 0.6418, + "step": 1967 + }, + { + "epoch": 0.2414723926380368, + "grad_norm": 0.9533774624258411, + "learning_rate": 1.7745759596157123e-05, + "loss": 0.6412, + "step": 1968 + }, + { + "epoch": 0.24159509202453988, + "grad_norm": 1.197724417433668, + "learning_rate": 1.7743245388919944e-05, + "loss": 0.5944, + "step": 1969 + }, + { + "epoch": 0.24171779141104294, + "grad_norm": 1.0011700855589951, + "learning_rate": 1.7740729958703725e-05, + "loss": 0.6263, + "step": 1970 + }, + { + "epoch": 0.24184049079754602, + "grad_norm": 0.9821892025393243, + "learning_rate": 1.7738213305905757e-05, + "loss": 0.6381, + "step": 1971 + }, + { + "epoch": 0.24196319018404908, + "grad_norm": 0.998659325979004, + "learning_rate": 1.7735695430923523e-05, + "loss": 0.6999, + "step": 1972 + }, + { + "epoch": 0.24208588957055216, + "grad_norm": 1.0760948134055286, + "learning_rate": 1.7733176334154704e-05, + "loss": 0.6741, + "step": 1973 + }, + { + "epoch": 0.24220858895705522, + "grad_norm": 1.0245283009476605, + "learning_rate": 1.773065601599716e-05, + "loss": 0.6036, + "step": 1974 + }, + { + "epoch": 0.24233128834355827, + "grad_norm": 1.162036343903141, + "learning_rate": 1.7728134476848965e-05, + "loss": 0.6377, + "step": 1975 + }, + { + "epoch": 0.24245398773006135, + "grad_norm": 1.1847763642141778, + "learning_rate": 1.772561171710837e-05, + "loss": 0.6984, + "step": 1976 + }, + { + "epoch": 0.2425766871165644, + "grad_norm": 1.0612666485805826, + "learning_rate": 1.7723087737173822e-05, + "loss": 0.6627, + "step": 1977 + }, + { + "epoch": 0.2426993865030675, + "grad_norm": 1.0255353638132834, + "learning_rate": 1.772056253744396e-05, + "loss": 0.6223, + "step": 
1978 + }, + { + "epoch": 0.24282208588957055, + "grad_norm": 1.1353002022206375, + "learning_rate": 1.771803611831762e-05, + "loss": 0.6808, + "step": 1979 + }, + { + "epoch": 0.24294478527607363, + "grad_norm": 1.082967080927986, + "learning_rate": 1.7715508480193832e-05, + "loss": 0.6686, + "step": 1980 + }, + { + "epoch": 0.2430674846625767, + "grad_norm": 0.9753872789343543, + "learning_rate": 1.771297962347181e-05, + "loss": 0.5931, + "step": 1981 + }, + { + "epoch": 0.24319018404907974, + "grad_norm": 1.0228134771036446, + "learning_rate": 1.7710449548550964e-05, + "loss": 0.6043, + "step": 1982 + }, + { + "epoch": 0.24331288343558283, + "grad_norm": 1.0098819719129988, + "learning_rate": 1.77079182558309e-05, + "loss": 0.6629, + "step": 1983 + }, + { + "epoch": 0.24343558282208588, + "grad_norm": 1.023280098072051, + "learning_rate": 1.7705385745711412e-05, + "loss": 0.618, + "step": 1984 + }, + { + "epoch": 0.24355828220858897, + "grad_norm": 1.0616697575631506, + "learning_rate": 1.7702852018592493e-05, + "loss": 0.6403, + "step": 1985 + }, + { + "epoch": 0.24368098159509202, + "grad_norm": 0.9323590923935421, + "learning_rate": 1.7700317074874318e-05, + "loss": 0.6401, + "step": 1986 + }, + { + "epoch": 0.2438036809815951, + "grad_norm": 1.1302988988609224, + "learning_rate": 1.7697780914957262e-05, + "loss": 0.6289, + "step": 1987 + }, + { + "epoch": 0.24392638036809816, + "grad_norm": 1.0891032233274716, + "learning_rate": 1.7695243539241893e-05, + "loss": 0.5984, + "step": 1988 + }, + { + "epoch": 0.24404907975460122, + "grad_norm": 0.9515850369357293, + "learning_rate": 1.769270494812896e-05, + "loss": 0.5673, + "step": 1989 + }, + { + "epoch": 0.2441717791411043, + "grad_norm": 0.9954562358310626, + "learning_rate": 1.769016514201942e-05, + "loss": 0.5854, + "step": 1990 + }, + { + "epoch": 0.24429447852760736, + "grad_norm": 0.9896852003920571, + "learning_rate": 1.7687624121314406e-05, + "loss": 0.601, + "step": 1991 + }, + { + "epoch": 0.24441717791411044, + "grad_norm": 1.5105058123984012, + "learning_rate": 1.7685081886415252e-05, + "loss": 0.6367, + "step": 1992 + }, + { + "epoch": 0.2445398773006135, + "grad_norm": 1.0094809362549224, + "learning_rate": 1.7682538437723486e-05, + "loss": 0.5742, + "step": 1993 + }, + { + "epoch": 0.24466257668711655, + "grad_norm": 1.0191403383837874, + "learning_rate": 1.7679993775640824e-05, + "loss": 0.6117, + "step": 1994 + }, + { + "epoch": 0.24478527607361963, + "grad_norm": 1.2438550485610014, + "learning_rate": 1.7677447900569166e-05, + "loss": 0.6214, + "step": 1995 + }, + { + "epoch": 0.2449079754601227, + "grad_norm": 1.0603125318823396, + "learning_rate": 1.767490081291062e-05, + "loss": 0.636, + "step": 1996 + }, + { + "epoch": 0.24503067484662577, + "grad_norm": 1.008969496922192, + "learning_rate": 1.7672352513067476e-05, + "loss": 0.6541, + "step": 1997 + }, + { + "epoch": 0.24515337423312883, + "grad_norm": 1.0770928445659378, + "learning_rate": 1.7669803001442205e-05, + "loss": 0.6225, + "step": 1998 + }, + { + "epoch": 0.2452760736196319, + "grad_norm": 1.0473889231388402, + "learning_rate": 1.7667252278437492e-05, + "loss": 0.6207, + "step": 1999 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 1.3095294856309077, + "learning_rate": 1.7664700344456198e-05, + "loss": 0.6467, + "step": 2000 + }, + { + "epoch": 0.24552147239263802, + "grad_norm": 1.0335597762742732, + "learning_rate": 1.766214719990138e-05, + "loss": 0.6177, + "step": 2001 + }, + { + "epoch": 0.2456441717791411, + "grad_norm": 
1.0099223532595627, + "learning_rate": 1.7659592845176286e-05, + "loss": 0.6482, + "step": 2002 + }, + { + "epoch": 0.24576687116564416, + "grad_norm": 1.0293311831386138, + "learning_rate": 1.7657037280684352e-05, + "loss": 0.6123, + "step": 2003 + }, + { + "epoch": 0.24588957055214725, + "grad_norm": 1.0336363878799657, + "learning_rate": 1.7654480506829205e-05, + "loss": 0.6208, + "step": 2004 + }, + { + "epoch": 0.2460122699386503, + "grad_norm": 0.9966450502213797, + "learning_rate": 1.765192252401467e-05, + "loss": 0.6, + "step": 2005 + }, + { + "epoch": 0.24613496932515339, + "grad_norm": 0.866029511649878, + "learning_rate": 1.764936333264476e-05, + "loss": 0.6032, + "step": 2006 + }, + { + "epoch": 0.24625766871165644, + "grad_norm": 0.8729967661609772, + "learning_rate": 1.7646802933123672e-05, + "loss": 0.6064, + "step": 2007 + }, + { + "epoch": 0.2463803680981595, + "grad_norm": 0.9327648379471765, + "learning_rate": 1.76442413258558e-05, + "loss": 0.5933, + "step": 2008 + }, + { + "epoch": 0.24650306748466258, + "grad_norm": 1.268336499460367, + "learning_rate": 1.7641678511245733e-05, + "loss": 0.6127, + "step": 2009 + }, + { + "epoch": 0.24662576687116564, + "grad_norm": 0.965250098263086, + "learning_rate": 1.7639114489698238e-05, + "loss": 0.6128, + "step": 2010 + }, + { + "epoch": 0.24674846625766872, + "grad_norm": 1.3031973009479896, + "learning_rate": 1.7636549261618286e-05, + "loss": 0.579, + "step": 2011 + }, + { + "epoch": 0.24687116564417177, + "grad_norm": 1.1559471407415143, + "learning_rate": 1.763398282741103e-05, + "loss": 0.6633, + "step": 2012 + }, + { + "epoch": 0.24699386503067486, + "grad_norm": 1.1184486588678724, + "learning_rate": 1.7631415187481818e-05, + "loss": 0.6235, + "step": 2013 + }, + { + "epoch": 0.24711656441717791, + "grad_norm": 0.9875889406629887, + "learning_rate": 1.7628846342236187e-05, + "loss": 0.6528, + "step": 2014 + }, + { + "epoch": 0.24723926380368097, + "grad_norm": 1.010990126189862, + "learning_rate": 1.762627629207986e-05, + "loss": 0.6189, + "step": 2015 + }, + { + "epoch": 0.24736196319018405, + "grad_norm": 0.9611026029132517, + "learning_rate": 1.7623705037418757e-05, + "loss": 0.6352, + "step": 2016 + }, + { + "epoch": 0.2474846625766871, + "grad_norm": 0.9823112695114076, + "learning_rate": 1.7621132578658985e-05, + "loss": 0.655, + "step": 2017 + }, + { + "epoch": 0.2476073619631902, + "grad_norm": 1.0274217267084487, + "learning_rate": 1.761855891620684e-05, + "loss": 0.6318, + "step": 2018 + }, + { + "epoch": 0.24773006134969325, + "grad_norm": 1.042994864704137, + "learning_rate": 1.7615984050468817e-05, + "loss": 0.6422, + "step": 2019 + }, + { + "epoch": 0.24785276073619633, + "grad_norm": 1.027371443464929, + "learning_rate": 1.7613407981851586e-05, + "loss": 0.6326, + "step": 2020 + }, + { + "epoch": 0.2479754601226994, + "grad_norm": 0.882153990790989, + "learning_rate": 1.7610830710762022e-05, + "loss": 0.6536, + "step": 2021 + }, + { + "epoch": 0.24809815950920244, + "grad_norm": 0.8697647980496372, + "learning_rate": 1.7608252237607176e-05, + "loss": 0.6267, + "step": 2022 + }, + { + "epoch": 0.24822085889570553, + "grad_norm": 1.0366173631326474, + "learning_rate": 1.7605672562794298e-05, + "loss": 0.643, + "step": 2023 + }, + { + "epoch": 0.24834355828220858, + "grad_norm": 1.1690090616296898, + "learning_rate": 1.7603091686730827e-05, + "loss": 0.6522, + "step": 2024 + }, + { + "epoch": 0.24846625766871167, + "grad_norm": 0.9671609415268294, + "learning_rate": 1.760050960982439e-05, + "loss": 0.6649, + 
"step": 2025 + }, + { + "epoch": 0.24858895705521472, + "grad_norm": 1.086639690659751, + "learning_rate": 1.7597926332482798e-05, + "loss": 0.6077, + "step": 2026 + }, + { + "epoch": 0.2487116564417178, + "grad_norm": 1.0095128707819794, + "learning_rate": 1.7595341855114065e-05, + "loss": 0.6371, + "step": 2027 + }, + { + "epoch": 0.24883435582822086, + "grad_norm": 1.00750060330102, + "learning_rate": 1.7592756178126382e-05, + "loss": 0.6311, + "step": 2028 + }, + { + "epoch": 0.24895705521472392, + "grad_norm": 0.9416667244333662, + "learning_rate": 1.7590169301928138e-05, + "loss": 0.6227, + "step": 2029 + }, + { + "epoch": 0.249079754601227, + "grad_norm": 1.103972733276433, + "learning_rate": 1.758758122692791e-05, + "loss": 0.5782, + "step": 2030 + }, + { + "epoch": 0.24920245398773005, + "grad_norm": 0.9718505893740281, + "learning_rate": 1.7584991953534456e-05, + "loss": 0.601, + "step": 2031 + }, + { + "epoch": 0.24932515337423314, + "grad_norm": 1.06859339867249, + "learning_rate": 1.7582401482156733e-05, + "loss": 0.6143, + "step": 2032 + }, + { + "epoch": 0.2494478527607362, + "grad_norm": 1.0179928790882742, + "learning_rate": 1.7579809813203883e-05, + "loss": 0.5854, + "step": 2033 + }, + { + "epoch": 0.24957055214723928, + "grad_norm": 0.9422144692208075, + "learning_rate": 1.7577216947085237e-05, + "loss": 0.6315, + "step": 2034 + }, + { + "epoch": 0.24969325153374233, + "grad_norm": 1.003015348796459, + "learning_rate": 1.757462288421032e-05, + "loss": 0.6335, + "step": 2035 + }, + { + "epoch": 0.2498159509202454, + "grad_norm": 0.9601592217474948, + "learning_rate": 1.757202762498883e-05, + "loss": 0.6613, + "step": 2036 + }, + { + "epoch": 0.24993865030674847, + "grad_norm": 0.9827672755453004, + "learning_rate": 1.7569431169830685e-05, + "loss": 0.6663, + "step": 2037 + }, + { + "epoch": 0.25006134969325156, + "grad_norm": 0.8737170578907918, + "learning_rate": 1.7566833519145958e-05, + "loss": 0.5543, + "step": 2038 + }, + { + "epoch": 0.2501840490797546, + "grad_norm": 1.0716756996735028, + "learning_rate": 1.7564234673344928e-05, + "loss": 0.6072, + "step": 2039 + }, + { + "epoch": 0.25030674846625767, + "grad_norm": 0.9369000371653234, + "learning_rate": 1.7561634632838062e-05, + "loss": 0.5875, + "step": 2040 + }, + { + "epoch": 0.2504294478527607, + "grad_norm": 0.9806541849598203, + "learning_rate": 1.7559033398036018e-05, + "loss": 0.5932, + "step": 2041 + }, + { + "epoch": 0.2505521472392638, + "grad_norm": 0.9047597158606379, + "learning_rate": 1.7556430969349633e-05, + "loss": 0.641, + "step": 2042 + }, + { + "epoch": 0.2506748466257669, + "grad_norm": 0.9067110962983275, + "learning_rate": 1.7553827347189937e-05, + "loss": 0.632, + "step": 2043 + }, + { + "epoch": 0.25079754601226995, + "grad_norm": 0.9391449148644188, + "learning_rate": 1.755122253196816e-05, + "loss": 0.6157, + "step": 2044 + }, + { + "epoch": 0.250920245398773, + "grad_norm": 0.8614128017992638, + "learning_rate": 1.7548616524095697e-05, + "loss": 0.6369, + "step": 2045 + }, + { + "epoch": 0.25104294478527606, + "grad_norm": 1.0249245988620013, + "learning_rate": 1.7546009323984152e-05, + "loss": 0.6358, + "step": 2046 + }, + { + "epoch": 0.25116564417177917, + "grad_norm": 1.0688737202995287, + "learning_rate": 1.754340093204531e-05, + "loss": 0.6402, + "step": 2047 + }, + { + "epoch": 0.2512883435582822, + "grad_norm": 0.8912679489890596, + "learning_rate": 1.7540791348691144e-05, + "loss": 0.6461, + "step": 2048 + }, + { + "epoch": 0.2514110429447853, + "grad_norm": 
1.0465865504052732, + "learning_rate": 1.7538180574333806e-05, + "loss": 0.669, + "step": 2049 + }, + { + "epoch": 0.25153374233128833, + "grad_norm": 0.9291097298483739, + "learning_rate": 1.753556860938566e-05, + "loss": 0.6077, + "step": 2050 + }, + { + "epoch": 0.2516564417177914, + "grad_norm": 1.0726952816900617, + "learning_rate": 1.7532955454259236e-05, + "loss": 0.6745, + "step": 2051 + }, + { + "epoch": 0.2517791411042945, + "grad_norm": 1.230890438601564, + "learning_rate": 1.753034110936726e-05, + "loss": 0.6754, + "step": 2052 + }, + { + "epoch": 0.25190184049079756, + "grad_norm": 0.991282729573984, + "learning_rate": 1.7527725575122642e-05, + "loss": 0.6333, + "step": 2053 + }, + { + "epoch": 0.2520245398773006, + "grad_norm": 1.6169348169909497, + "learning_rate": 1.752510885193849e-05, + "loss": 0.6029, + "step": 2054 + }, + { + "epoch": 0.25214723926380367, + "grad_norm": 1.337509380647035, + "learning_rate": 1.7522490940228086e-05, + "loss": 0.6249, + "step": 2055 + }, + { + "epoch": 0.2522699386503067, + "grad_norm": 0.9339778477756681, + "learning_rate": 1.7519871840404913e-05, + "loss": 0.5917, + "step": 2056 + }, + { + "epoch": 0.25239263803680984, + "grad_norm": 1.215659004484134, + "learning_rate": 1.751725155288263e-05, + "loss": 0.6357, + "step": 2057 + }, + { + "epoch": 0.2525153374233129, + "grad_norm": 1.960616062507808, + "learning_rate": 1.7514630078075095e-05, + "loss": 0.625, + "step": 2058 + }, + { + "epoch": 0.25263803680981595, + "grad_norm": 1.0373232518288955, + "learning_rate": 1.7512007416396342e-05, + "loss": 0.6045, + "step": 2059 + }, + { + "epoch": 0.252760736196319, + "grad_norm": 0.9428976204727595, + "learning_rate": 1.7509383568260597e-05, + "loss": 0.6075, + "step": 2060 + }, + { + "epoch": 0.25288343558282206, + "grad_norm": 1.0396263672920172, + "learning_rate": 1.7506758534082278e-05, + "loss": 0.6209, + "step": 2061 + }, + { + "epoch": 0.25300613496932517, + "grad_norm": 1.0893575946326854, + "learning_rate": 1.7504132314275984e-05, + "loss": 0.6277, + "step": 2062 + }, + { + "epoch": 0.2531288343558282, + "grad_norm": 0.9685026128259626, + "learning_rate": 1.750150490925651e-05, + "loss": 0.6359, + "step": 2063 + }, + { + "epoch": 0.2532515337423313, + "grad_norm": 0.9653798737517985, + "learning_rate": 1.749887631943882e-05, + "loss": 0.6405, + "step": 2064 + }, + { + "epoch": 0.25337423312883434, + "grad_norm": 0.997494797965767, + "learning_rate": 1.749624654523809e-05, + "loss": 0.6672, + "step": 2065 + }, + { + "epoch": 0.25349693251533745, + "grad_norm": 1.0018774444615137, + "learning_rate": 1.749361558706966e-05, + "loss": 0.6424, + "step": 2066 + }, + { + "epoch": 0.2536196319018405, + "grad_norm": 1.0822625741096297, + "learning_rate": 1.7490983445349076e-05, + "loss": 0.6368, + "step": 2067 + }, + { + "epoch": 0.25374233128834356, + "grad_norm": 1.5005412509331035, + "learning_rate": 1.7488350120492056e-05, + "loss": 0.6204, + "step": 2068 + }, + { + "epoch": 0.2538650306748466, + "grad_norm": 1.146559384434662, + "learning_rate": 1.7485715612914513e-05, + "loss": 0.6514, + "step": 2069 + }, + { + "epoch": 0.25398773006134967, + "grad_norm": 0.8933364881544169, + "learning_rate": 1.7483079923032543e-05, + "loss": 0.5952, + "step": 2070 + }, + { + "epoch": 0.2541104294478528, + "grad_norm": 0.9591037744946813, + "learning_rate": 1.7480443051262438e-05, + "loss": 0.6332, + "step": 2071 + }, + { + "epoch": 0.25423312883435584, + "grad_norm": 1.0060082879505263, + "learning_rate": 1.7477804998020658e-05, + "loss": 0.6759, + 
"step": 2072 + }, + { + "epoch": 0.2543558282208589, + "grad_norm": 1.0038979830011652, + "learning_rate": 1.7475165763723868e-05, + "loss": 0.6291, + "step": 2073 + }, + { + "epoch": 0.25447852760736195, + "grad_norm": 0.9705323178057882, + "learning_rate": 1.747252534878891e-05, + "loss": 0.692, + "step": 2074 + }, + { + "epoch": 0.254601226993865, + "grad_norm": 0.8852346066286289, + "learning_rate": 1.7469883753632817e-05, + "loss": 0.5779, + "step": 2075 + }, + { + "epoch": 0.2547239263803681, + "grad_norm": 0.9749997712161544, + "learning_rate": 1.7467240978672805e-05, + "loss": 0.6526, + "step": 2076 + }, + { + "epoch": 0.25484662576687117, + "grad_norm": 0.9039031966909283, + "learning_rate": 1.746459702432628e-05, + "loss": 0.66, + "step": 2077 + }, + { + "epoch": 0.2549693251533742, + "grad_norm": 1.066948171330366, + "learning_rate": 1.7461951891010822e-05, + "loss": 0.6154, + "step": 2078 + }, + { + "epoch": 0.2550920245398773, + "grad_norm": 1.1103458390388639, + "learning_rate": 1.745930557914422e-05, + "loss": 0.6388, + "step": 2079 + }, + { + "epoch": 0.2552147239263804, + "grad_norm": 1.0168761874387486, + "learning_rate": 1.745665808914443e-05, + "loss": 0.6205, + "step": 2080 + }, + { + "epoch": 0.25533742331288345, + "grad_norm": 0.9560441254779063, + "learning_rate": 1.74540094214296e-05, + "loss": 0.6143, + "step": 2081 + }, + { + "epoch": 0.2554601226993865, + "grad_norm": 0.921558894916353, + "learning_rate": 1.7451359576418062e-05, + "loss": 0.6297, + "step": 2082 + }, + { + "epoch": 0.25558282208588956, + "grad_norm": 0.9856959618271539, + "learning_rate": 1.7448708554528343e-05, + "loss": 0.6128, + "step": 2083 + }, + { + "epoch": 0.2557055214723926, + "grad_norm": 0.8810277189355732, + "learning_rate": 1.7446056356179145e-05, + "loss": 0.6275, + "step": 2084 + }, + { + "epoch": 0.2558282208588957, + "grad_norm": 0.8876969172985326, + "learning_rate": 1.744340298178936e-05, + "loss": 0.6387, + "step": 2085 + }, + { + "epoch": 0.2559509202453988, + "grad_norm": 1.0230981028985056, + "learning_rate": 1.7440748431778066e-05, + "loss": 0.6087, + "step": 2086 + }, + { + "epoch": 0.25607361963190184, + "grad_norm": 0.8632757802949692, + "learning_rate": 1.7438092706564527e-05, + "loss": 0.6158, + "step": 2087 + }, + { + "epoch": 0.2561963190184049, + "grad_norm": 0.9334000638618783, + "learning_rate": 1.743543580656819e-05, + "loss": 0.6471, + "step": 2088 + }, + { + "epoch": 0.25631901840490795, + "grad_norm": 0.9332949344492215, + "learning_rate": 1.743277773220869e-05, + "loss": 0.6043, + "step": 2089 + }, + { + "epoch": 0.25644171779141106, + "grad_norm": 1.0858332779021909, + "learning_rate": 1.743011848390585e-05, + "loss": 0.6105, + "step": 2090 + }, + { + "epoch": 0.2565644171779141, + "grad_norm": 1.33311103749468, + "learning_rate": 1.7427458062079675e-05, + "loss": 0.6241, + "step": 2091 + }, + { + "epoch": 0.2566871165644172, + "grad_norm": 1.0471866210316807, + "learning_rate": 1.7424796467150355e-05, + "loss": 0.6607, + "step": 2092 + }, + { + "epoch": 0.25680981595092023, + "grad_norm": 1.0098125302103245, + "learning_rate": 1.742213369953826e-05, + "loss": 0.6423, + "step": 2093 + }, + { + "epoch": 0.25693251533742334, + "grad_norm": 0.9780040082797848, + "learning_rate": 1.741946975966396e-05, + "loss": 0.6187, + "step": 2094 + }, + { + "epoch": 0.2570552147239264, + "grad_norm": 0.9388499708691019, + "learning_rate": 1.7416804647948194e-05, + "loss": 0.5866, + "step": 2095 + }, + { + "epoch": 0.25717791411042945, + "grad_norm": 0.9186042503169607, + 
"learning_rate": 1.74141383648119e-05, + "loss": 0.5518, + "step": 2096 + }, + { + "epoch": 0.2573006134969325, + "grad_norm": 0.8792526177619637, + "learning_rate": 1.7411470910676193e-05, + "loss": 0.6182, + "step": 2097 + }, + { + "epoch": 0.25742331288343556, + "grad_norm": 1.0369085967397718, + "learning_rate": 1.740880228596237e-05, + "loss": 0.6119, + "step": 2098 + }, + { + "epoch": 0.2575460122699387, + "grad_norm": 1.0954644309580868, + "learning_rate": 1.7406132491091922e-05, + "loss": 0.621, + "step": 2099 + }, + { + "epoch": 0.25766871165644173, + "grad_norm": 1.0065116947215809, + "learning_rate": 1.740346152648652e-05, + "loss": 0.598, + "step": 2100 + }, + { + "epoch": 0.2577914110429448, + "grad_norm": 1.0935575357039602, + "learning_rate": 1.740078939256802e-05, + "loss": 0.6484, + "step": 2101 + }, + { + "epoch": 0.25791411042944784, + "grad_norm": 0.9624040626810101, + "learning_rate": 1.7398116089758454e-05, + "loss": 0.5764, + "step": 2102 + }, + { + "epoch": 0.2580368098159509, + "grad_norm": 1.051615489809678, + "learning_rate": 1.739544161848006e-05, + "loss": 0.5962, + "step": 2103 + }, + { + "epoch": 0.258159509202454, + "grad_norm": 0.9871361545259499, + "learning_rate": 1.7392765979155242e-05, + "loss": 0.6407, + "step": 2104 + }, + { + "epoch": 0.25828220858895706, + "grad_norm": 0.8485346156148076, + "learning_rate": 1.7390089172206594e-05, + "loss": 0.5769, + "step": 2105 + }, + { + "epoch": 0.2584049079754601, + "grad_norm": 0.9741451049961863, + "learning_rate": 1.7387411198056892e-05, + "loss": 0.6351, + "step": 2106 + }, + { + "epoch": 0.2585276073619632, + "grad_norm": 0.9742670081417576, + "learning_rate": 1.7384732057129107e-05, + "loss": 0.5455, + "step": 2107 + }, + { + "epoch": 0.25865030674846623, + "grad_norm": 0.8732787427209652, + "learning_rate": 1.7382051749846376e-05, + "loss": 0.6337, + "step": 2108 + }, + { + "epoch": 0.25877300613496934, + "grad_norm": 0.9230460806856718, + "learning_rate": 1.737937027663204e-05, + "loss": 0.6083, + "step": 2109 + }, + { + "epoch": 0.2588957055214724, + "grad_norm": 1.065385625828287, + "learning_rate": 1.7376687637909607e-05, + "loss": 0.673, + "step": 2110 + }, + { + "epoch": 0.25901840490797545, + "grad_norm": 0.9533922143777535, + "learning_rate": 1.737400383410278e-05, + "loss": 0.5923, + "step": 2111 + }, + { + "epoch": 0.2591411042944785, + "grad_norm": 0.9748150382142758, + "learning_rate": 1.7371318865635448e-05, + "loss": 0.6166, + "step": 2112 + }, + { + "epoch": 0.2592638036809816, + "grad_norm": 1.0136079532991074, + "learning_rate": 1.736863273293167e-05, + "loss": 0.6471, + "step": 2113 + }, + { + "epoch": 0.2593865030674847, + "grad_norm": 0.9830602799940438, + "learning_rate": 1.73659454364157e-05, + "loss": 0.5599, + "step": 2114 + }, + { + "epoch": 0.25950920245398773, + "grad_norm": 1.1005549131763674, + "learning_rate": 1.7363256976511972e-05, + "loss": 0.6603, + "step": 2115 + }, + { + "epoch": 0.2596319018404908, + "grad_norm": 0.8971573254980898, + "learning_rate": 1.7360567353645113e-05, + "loss": 0.5892, + "step": 2116 + }, + { + "epoch": 0.25975460122699384, + "grad_norm": 1.2760799989945468, + "learning_rate": 1.735787656823992e-05, + "loss": 0.6138, + "step": 2117 + }, + { + "epoch": 0.25987730061349695, + "grad_norm": 1.0014253905297663, + "learning_rate": 1.735518462072138e-05, + "loss": 0.6596, + "step": 2118 + }, + { + "epoch": 0.26, + "grad_norm": 0.9082590918800902, + "learning_rate": 1.7352491511514658e-05, + "loss": 0.621, + "step": 2119 + }, + { + "epoch": 
0.26012269938650306, + "grad_norm": 1.0285838681090653, + "learning_rate": 1.7349797241045115e-05, + "loss": 0.6791, + "step": 2120 + }, + { + "epoch": 0.2602453987730061, + "grad_norm": 0.9339715183485526, + "learning_rate": 1.7347101809738288e-05, + "loss": 0.6839, + "step": 2121 + }, + { + "epoch": 0.2603680981595092, + "grad_norm": 0.8833019462536696, + "learning_rate": 1.734440521801989e-05, + "loss": 0.6764, + "step": 2122 + }, + { + "epoch": 0.2604907975460123, + "grad_norm": 2.312003252284938, + "learning_rate": 1.734170746631583e-05, + "loss": 0.6112, + "step": 2123 + }, + { + "epoch": 0.26061349693251534, + "grad_norm": 0.9481676416630268, + "learning_rate": 1.7339008555052196e-05, + "loss": 0.6269, + "step": 2124 + }, + { + "epoch": 0.2607361963190184, + "grad_norm": 0.8791416720453158, + "learning_rate": 1.733630848465525e-05, + "loss": 0.6766, + "step": 2125 + }, + { + "epoch": 0.26085889570552145, + "grad_norm": 1.0963384887740952, + "learning_rate": 1.7333607255551455e-05, + "loss": 0.566, + "step": 2126 + }, + { + "epoch": 0.26098159509202457, + "grad_norm": 0.860595057581385, + "learning_rate": 1.733090486816744e-05, + "loss": 0.6144, + "step": 2127 + }, + { + "epoch": 0.2611042944785276, + "grad_norm": 0.9489467675703371, + "learning_rate": 1.7328201322930028e-05, + "loss": 0.6278, + "step": 2128 + }, + { + "epoch": 0.2612269938650307, + "grad_norm": 1.035750017875434, + "learning_rate": 1.7325496620266216e-05, + "loss": 0.6353, + "step": 2129 + }, + { + "epoch": 0.26134969325153373, + "grad_norm": 0.945613596037389, + "learning_rate": 1.732279076060319e-05, + "loss": 0.6193, + "step": 2130 + }, + { + "epoch": 0.2614723926380368, + "grad_norm": 0.9352609135548264, + "learning_rate": 1.7320083744368323e-05, + "loss": 0.5914, + "step": 2131 + }, + { + "epoch": 0.2615950920245399, + "grad_norm": 0.9474108290960324, + "learning_rate": 1.7317375571989158e-05, + "loss": 0.6422, + "step": 2132 + }, + { + "epoch": 0.26171779141104295, + "grad_norm": 1.0586746627115613, + "learning_rate": 1.7314666243893433e-05, + "loss": 0.6283, + "step": 2133 + }, + { + "epoch": 0.261840490797546, + "grad_norm": 0.9584374258214698, + "learning_rate": 1.7311955760509057e-05, + "loss": 0.6526, + "step": 2134 + }, + { + "epoch": 0.26196319018404907, + "grad_norm": 4.262885401504637, + "learning_rate": 1.730924412226413e-05, + "loss": 0.604, + "step": 2135 + }, + { + "epoch": 0.2620858895705521, + "grad_norm": 0.9233913891391117, + "learning_rate": 1.7306531329586933e-05, + "loss": 0.6074, + "step": 2136 + }, + { + "epoch": 0.26220858895705523, + "grad_norm": 1.0094466475423747, + "learning_rate": 1.7303817382905928e-05, + "loss": 0.6976, + "step": 2137 + }, + { + "epoch": 0.2623312883435583, + "grad_norm": 1.1879239649868645, + "learning_rate": 1.730110228264976e-05, + "loss": 0.5918, + "step": 2138 + }, + { + "epoch": 0.26245398773006134, + "grad_norm": 1.109520941904826, + "learning_rate": 1.7298386029247253e-05, + "loss": 0.6586, + "step": 2139 + }, + { + "epoch": 0.2625766871165644, + "grad_norm": 1.080061145589442, + "learning_rate": 1.729566862312742e-05, + "loss": 0.6336, + "step": 2140 + }, + { + "epoch": 0.2626993865030675, + "grad_norm": 1.1013157092762207, + "learning_rate": 1.729295006471945e-05, + "loss": 0.6256, + "step": 2141 + }, + { + "epoch": 0.26282208588957057, + "grad_norm": 1.0126212584769483, + "learning_rate": 1.729023035445272e-05, + "loss": 0.5875, + "step": 2142 + }, + { + "epoch": 0.2629447852760736, + "grad_norm": 1.2648837546460037, + "learning_rate": 
1.7287509492756777e-05, + "loss": 0.6302, + "step": 2143 + }, + { + "epoch": 0.2630674846625767, + "grad_norm": 0.9985341741464758, + "learning_rate": 1.728478748006136e-05, + "loss": 0.6258, + "step": 2144 + }, + { + "epoch": 0.26319018404907973, + "grad_norm": 0.9524461635900783, + "learning_rate": 1.7282064316796387e-05, + "loss": 0.6111, + "step": 2145 + }, + { + "epoch": 0.26331288343558285, + "grad_norm": 1.2168472045656018, + "learning_rate": 1.7279340003391962e-05, + "loss": 0.6398, + "step": 2146 + }, + { + "epoch": 0.2634355828220859, + "grad_norm": 1.0535155934802587, + "learning_rate": 1.7276614540278368e-05, + "loss": 0.5914, + "step": 2147 + }, + { + "epoch": 0.26355828220858896, + "grad_norm": 1.1842718876507259, + "learning_rate": 1.7273887927886062e-05, + "loss": 0.625, + "step": 2148 + }, + { + "epoch": 0.263680981595092, + "grad_norm": 0.9937230219239602, + "learning_rate": 1.7271160166645695e-05, + "loss": 0.5925, + "step": 2149 + }, + { + "epoch": 0.26380368098159507, + "grad_norm": 1.0333642054798227, + "learning_rate": 1.726843125698809e-05, + "loss": 0.6264, + "step": 2150 + }, + { + "epoch": 0.2639263803680982, + "grad_norm": 1.0508239190668338, + "learning_rate": 1.7265701199344257e-05, + "loss": 0.6682, + "step": 2151 + }, + { + "epoch": 0.26404907975460123, + "grad_norm": 0.9409593480148793, + "learning_rate": 1.726296999414538e-05, + "loss": 0.6216, + "step": 2152 + }, + { + "epoch": 0.2641717791411043, + "grad_norm": 1.1401080542094464, + "learning_rate": 1.726023764182284e-05, + "loss": 0.5993, + "step": 2153 + }, + { + "epoch": 0.26429447852760735, + "grad_norm": 0.9849347460634668, + "learning_rate": 1.7257504142808176e-05, + "loss": 0.6268, + "step": 2154 + }, + { + "epoch": 0.2644171779141104, + "grad_norm": 0.93746249642253, + "learning_rate": 1.7254769497533128e-05, + "loss": 0.6397, + "step": 2155 + }, + { + "epoch": 0.2645398773006135, + "grad_norm": 1.1237085086301286, + "learning_rate": 1.725203370642961e-05, + "loss": 0.6682, + "step": 2156 + }, + { + "epoch": 0.26466257668711657, + "grad_norm": 1.121001241589207, + "learning_rate": 1.724929676992971e-05, + "loss": 0.5777, + "step": 2157 + }, + { + "epoch": 0.2647852760736196, + "grad_norm": 0.9283710409268928, + "learning_rate": 1.7246558688465713e-05, + "loss": 0.5751, + "step": 2158 + }, + { + "epoch": 0.2649079754601227, + "grad_norm": 1.035005991361154, + "learning_rate": 1.724381946247007e-05, + "loss": 0.6334, + "step": 2159 + }, + { + "epoch": 0.2650306748466258, + "grad_norm": 0.8900431476729576, + "learning_rate": 1.724107909237542e-05, + "loss": 0.5805, + "step": 2160 + }, + { + "epoch": 0.26515337423312885, + "grad_norm": 0.9631276525329231, + "learning_rate": 1.7238337578614578e-05, + "loss": 0.6315, + "step": 2161 + }, + { + "epoch": 0.2652760736196319, + "grad_norm": 0.9184471792121498, + "learning_rate": 1.7235594921620546e-05, + "loss": 0.58, + "step": 2162 + }, + { + "epoch": 0.26539877300613496, + "grad_norm": 1.0341958916438536, + "learning_rate": 1.72328511218265e-05, + "loss": 0.5895, + "step": 2163 + }, + { + "epoch": 0.265521472392638, + "grad_norm": 0.9269807603454098, + "learning_rate": 1.72301061796658e-05, + "loss": 0.6198, + "step": 2164 + }, + { + "epoch": 0.2656441717791411, + "grad_norm": 0.8703662975801418, + "learning_rate": 1.7227360095571992e-05, + "loss": 0.5996, + "step": 2165 + }, + { + "epoch": 0.2657668711656442, + "grad_norm": 1.0387580107502037, + "learning_rate": 1.722461286997879e-05, + "loss": 0.6203, + "step": 2166 + }, + { + "epoch": 
0.26588957055214724, + "grad_norm": 0.9362724831276003, + "learning_rate": 1.7221864503320093e-05, + "loss": 0.6518, + "step": 2167 + }, + { + "epoch": 0.2660122699386503, + "grad_norm": 0.9432116597973323, + "learning_rate": 1.721911499602999e-05, + "loss": 0.6019, + "step": 2168 + }, + { + "epoch": 0.26613496932515335, + "grad_norm": 1.0861696754488845, + "learning_rate": 1.7216364348542733e-05, + "loss": 0.6306, + "step": 2169 + }, + { + "epoch": 0.26625766871165646, + "grad_norm": 0.9902899714321599, + "learning_rate": 1.721361256129277e-05, + "loss": 0.6205, + "step": 2170 + }, + { + "epoch": 0.2663803680981595, + "grad_norm": 1.0292069529531593, + "learning_rate": 1.721085963471472e-05, + "loss": 0.614, + "step": 2171 + }, + { + "epoch": 0.26650306748466257, + "grad_norm": 1.022584421204869, + "learning_rate": 1.7208105569243382e-05, + "loss": 0.6318, + "step": 2172 + }, + { + "epoch": 0.2666257668711656, + "grad_norm": 1.0664203770969212, + "learning_rate": 1.720535036531374e-05, + "loss": 0.6394, + "step": 2173 + }, + { + "epoch": 0.26674846625766874, + "grad_norm": 0.9874547461237091, + "learning_rate": 1.720259402336095e-05, + "loss": 0.6304, + "step": 2174 + }, + { + "epoch": 0.2668711656441718, + "grad_norm": 0.9435743907096582, + "learning_rate": 1.719983654382036e-05, + "loss": 0.6511, + "step": 2175 + }, + { + "epoch": 0.26699386503067485, + "grad_norm": 0.9813508952042189, + "learning_rate": 1.719707792712748e-05, + "loss": 0.5951, + "step": 2176 + }, + { + "epoch": 0.2671165644171779, + "grad_norm": 1.0805245843774482, + "learning_rate": 1.719431817371802e-05, + "loss": 0.6199, + "step": 2177 + }, + { + "epoch": 0.26723926380368096, + "grad_norm": 1.0144932921991345, + "learning_rate": 1.7191557284027856e-05, + "loss": 0.6332, + "step": 2178 + }, + { + "epoch": 0.26736196319018407, + "grad_norm": 0.9908340335301729, + "learning_rate": 1.7188795258493043e-05, + "loss": 0.6158, + "step": 2179 + }, + { + "epoch": 0.2674846625766871, + "grad_norm": 1.0187680741591227, + "learning_rate": 1.7186032097549822e-05, + "loss": 0.6395, + "step": 2180 + }, + { + "epoch": 0.2676073619631902, + "grad_norm": 1.4122405518850094, + "learning_rate": 1.7183267801634612e-05, + "loss": 0.6471, + "step": 2181 + }, + { + "epoch": 0.26773006134969324, + "grad_norm": 1.0152446357135152, + "learning_rate": 1.7180502371184008e-05, + "loss": 0.5885, + "step": 2182 + }, + { + "epoch": 0.2678527607361963, + "grad_norm": 0.9341689023164051, + "learning_rate": 1.717773580663479e-05, + "loss": 0.6393, + "step": 2183 + }, + { + "epoch": 0.2679754601226994, + "grad_norm": 1.2489922424597206, + "learning_rate": 1.7174968108423903e-05, + "loss": 0.5672, + "step": 2184 + }, + { + "epoch": 0.26809815950920246, + "grad_norm": 0.9821565224740914, + "learning_rate": 1.717219927698849e-05, + "loss": 0.6333, + "step": 2185 + }, + { + "epoch": 0.2682208588957055, + "grad_norm": 1.1700105161677588, + "learning_rate": 1.7169429312765863e-05, + "loss": 0.5867, + "step": 2186 + }, + { + "epoch": 0.26834355828220857, + "grad_norm": 0.8608032572193922, + "learning_rate": 1.7166658216193516e-05, + "loss": 0.5667, + "step": 2187 + }, + { + "epoch": 0.2684662576687117, + "grad_norm": 0.9417172424914083, + "learning_rate": 1.716388598770911e-05, + "loss": 0.6369, + "step": 2188 + }, + { + "epoch": 0.26858895705521474, + "grad_norm": 0.9627171234759996, + "learning_rate": 1.7161112627750503e-05, + "loss": 0.6352, + "step": 2189 + }, + { + "epoch": 0.2687116564417178, + "grad_norm": 0.9137584056280551, + "learning_rate": 
1.7158338136755724e-05, + "loss": 0.6221, + "step": 2190 + }, + { + "epoch": 0.26883435582822085, + "grad_norm": 1.0898727934557502, + "learning_rate": 1.7155562515162977e-05, + "loss": 0.6624, + "step": 2191 + }, + { + "epoch": 0.2689570552147239, + "grad_norm": 0.9965621511994296, + "learning_rate": 1.7152785763410648e-05, + "loss": 0.609, + "step": 2192 + }, + { + "epoch": 0.269079754601227, + "grad_norm": 1.0065821382819744, + "learning_rate": 1.7150007881937297e-05, + "loss": 0.6595, + "step": 2193 + }, + { + "epoch": 0.2692024539877301, + "grad_norm": 0.9437294999529302, + "learning_rate": 1.7147228871181674e-05, + "loss": 0.6276, + "step": 2194 + }, + { + "epoch": 0.26932515337423313, + "grad_norm": 1.0074586538439823, + "learning_rate": 1.7144448731582698e-05, + "loss": 0.5969, + "step": 2195 + }, + { + "epoch": 0.2694478527607362, + "grad_norm": 1.0492234451127793, + "learning_rate": 1.7141667463579467e-05, + "loss": 0.6315, + "step": 2196 + }, + { + "epoch": 0.26957055214723924, + "grad_norm": 1.0088286502036916, + "learning_rate": 1.7138885067611256e-05, + "loss": 0.6122, + "step": 2197 + }, + { + "epoch": 0.26969325153374235, + "grad_norm": 0.9331243333114052, + "learning_rate": 1.7136101544117526e-05, + "loss": 0.6419, + "step": 2198 + }, + { + "epoch": 0.2698159509202454, + "grad_norm": 1.0589900037010131, + "learning_rate": 1.7133316893537903e-05, + "loss": 0.6557, + "step": 2199 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 1.1403964191988716, + "learning_rate": 1.7130531116312202e-05, + "loss": 0.6396, + "step": 2200 + }, + { + "epoch": 0.2700613496932515, + "grad_norm": 1.1700896014755615, + "learning_rate": 1.712774421288042e-05, + "loss": 0.6226, + "step": 2201 + }, + { + "epoch": 0.2701840490797546, + "grad_norm": 1.0600903802295543, + "learning_rate": 1.7124956183682712e-05, + "loss": 0.5988, + "step": 2202 + }, + { + "epoch": 0.2703067484662577, + "grad_norm": 1.0613760502179481, + "learning_rate": 1.712216702915943e-05, + "loss": 0.6364, + "step": 2203 + }, + { + "epoch": 0.27042944785276074, + "grad_norm": 1.05252380127527, + "learning_rate": 1.71193767497511e-05, + "loss": 0.6603, + "step": 2204 + }, + { + "epoch": 0.2705521472392638, + "grad_norm": 1.0172569296137337, + "learning_rate": 1.7116585345898413e-05, + "loss": 0.666, + "step": 2205 + }, + { + "epoch": 0.27067484662576685, + "grad_norm": 1.0422907241316959, + "learning_rate": 1.711379281804226e-05, + "loss": 0.647, + "step": 2206 + }, + { + "epoch": 0.27079754601226996, + "grad_norm": 1.207588397931923, + "learning_rate": 1.7110999166623683e-05, + "loss": 0.6726, + "step": 2207 + }, + { + "epoch": 0.270920245398773, + "grad_norm": 1.0068958815798141, + "learning_rate": 1.7108204392083926e-05, + "loss": 0.6248, + "step": 2208 + }, + { + "epoch": 0.2710429447852761, + "grad_norm": 1.0419014723976623, + "learning_rate": 1.7105408494864392e-05, + "loss": 0.603, + "step": 2209 + }, + { + "epoch": 0.27116564417177913, + "grad_norm": 0.9970796194111148, + "learning_rate": 1.7102611475406676e-05, + "loss": 0.6266, + "step": 2210 + }, + { + "epoch": 0.2712883435582822, + "grad_norm": 0.9106043526861909, + "learning_rate": 1.7099813334152537e-05, + "loss": 0.6107, + "step": 2211 + }, + { + "epoch": 0.2714110429447853, + "grad_norm": 0.9399740156507913, + "learning_rate": 1.709701407154392e-05, + "loss": 0.6431, + "step": 2212 + }, + { + "epoch": 0.27153374233128835, + "grad_norm": 1.1684438474490442, + "learning_rate": 1.7094213688022947e-05, + "loss": 0.5833, + "step": 2213 + }, + { + "epoch": 
0.2716564417177914, + "grad_norm": 1.0041964402242929, + "learning_rate": 1.709141218403191e-05, + "loss": 0.5826, + "step": 2214 + }, + { + "epoch": 0.27177914110429446, + "grad_norm": 0.8865326916155561, + "learning_rate": 1.7088609560013284e-05, + "loss": 0.6579, + "step": 2215 + }, + { + "epoch": 0.2719018404907975, + "grad_norm": 0.8658867814349032, + "learning_rate": 1.7085805816409723e-05, + "loss": 0.5861, + "step": 2216 + }, + { + "epoch": 0.27202453987730063, + "grad_norm": 1.0528067712661495, + "learning_rate": 1.708300095366405e-05, + "loss": 0.6869, + "step": 2217 + }, + { + "epoch": 0.2721472392638037, + "grad_norm": 1.0295185489312972, + "learning_rate": 1.7080194972219267e-05, + "loss": 0.6474, + "step": 2218 + }, + { + "epoch": 0.27226993865030674, + "grad_norm": 0.9258746290724336, + "learning_rate": 1.707738787251856e-05, + "loss": 0.6591, + "step": 2219 + }, + { + "epoch": 0.2723926380368098, + "grad_norm": 0.9959003101390137, + "learning_rate": 1.7074579655005282e-05, + "loss": 0.6495, + "step": 2220 + }, + { + "epoch": 0.2725153374233129, + "grad_norm": 0.9845228392107637, + "learning_rate": 1.7071770320122973e-05, + "loss": 0.5889, + "step": 2221 + }, + { + "epoch": 0.27263803680981596, + "grad_norm": 0.9177291752650041, + "learning_rate": 1.7068959868315334e-05, + "loss": 0.6163, + "step": 2222 + }, + { + "epoch": 0.272760736196319, + "grad_norm": 1.0683541297125665, + "learning_rate": 1.706614830002626e-05, + "loss": 0.623, + "step": 2223 + }, + { + "epoch": 0.2728834355828221, + "grad_norm": 1.025734746601832, + "learning_rate": 1.7063335615699812e-05, + "loss": 0.574, + "step": 2224 + }, + { + "epoch": 0.27300613496932513, + "grad_norm": 1.0132017402918647, + "learning_rate": 1.7060521815780225e-05, + "loss": 0.6337, + "step": 2225 + }, + { + "epoch": 0.27312883435582824, + "grad_norm": 1.04353600464115, + "learning_rate": 1.705770690071192e-05, + "loss": 0.6172, + "step": 2226 + }, + { + "epoch": 0.2732515337423313, + "grad_norm": 0.8880904014210279, + "learning_rate": 1.705489087093948e-05, + "loss": 0.5767, + "step": 2227 + }, + { + "epoch": 0.27337423312883435, + "grad_norm": 0.981430526038488, + "learning_rate": 1.705207372690769e-05, + "loss": 0.635, + "step": 2228 + }, + { + "epoch": 0.2734969325153374, + "grad_norm": 0.8611042505570975, + "learning_rate": 1.7049255469061476e-05, + "loss": 0.5633, + "step": 2229 + }, + { + "epoch": 0.27361963190184047, + "grad_norm": 1.1338256120234804, + "learning_rate": 1.704643609784596e-05, + "loss": 0.644, + "step": 2230 + }, + { + "epoch": 0.2737423312883436, + "grad_norm": 0.9376402174573664, + "learning_rate": 1.7043615613706448e-05, + "loss": 0.6338, + "step": 2231 + }, + { + "epoch": 0.27386503067484663, + "grad_norm": 1.035565437628234, + "learning_rate": 1.7040794017088403e-05, + "loss": 0.6133, + "step": 2232 + }, + { + "epoch": 0.2739877300613497, + "grad_norm": 0.9389058141310025, + "learning_rate": 1.7037971308437472e-05, + "loss": 0.6146, + "step": 2233 + }, + { + "epoch": 0.27411042944785274, + "grad_norm": 1.1972730646222332, + "learning_rate": 1.703514748819948e-05, + "loss": 0.6434, + "step": 2234 + }, + { + "epoch": 0.27423312883435585, + "grad_norm": 1.0910723528763226, + "learning_rate": 1.7032322556820428e-05, + "loss": 0.6257, + "step": 2235 + }, + { + "epoch": 0.2743558282208589, + "grad_norm": 0.9766939791832195, + "learning_rate": 1.702949651474648e-05, + "loss": 0.6545, + "step": 2236 + }, + { + "epoch": 0.27447852760736197, + "grad_norm": 1.004421976378525, + "learning_rate": 
1.7026669362423995e-05, + "loss": 0.639, + "step": 2237 + }, + { + "epoch": 0.274601226993865, + "grad_norm": 1.0556765876063106, + "learning_rate": 1.7023841100299496e-05, + "loss": 0.6573, + "step": 2238 + }, + { + "epoch": 0.2747239263803681, + "grad_norm": 1.044093890509969, + "learning_rate": 1.7021011728819676e-05, + "loss": 0.5724, + "step": 2239 + }, + { + "epoch": 0.2748466257668712, + "grad_norm": 1.0750920946443314, + "learning_rate": 1.7018181248431416e-05, + "loss": 0.6223, + "step": 2240 + }, + { + "epoch": 0.27496932515337424, + "grad_norm": 1.0235844478711198, + "learning_rate": 1.7015349659581764e-05, + "loss": 0.5343, + "step": 2241 + }, + { + "epoch": 0.2750920245398773, + "grad_norm": 0.87501789496964, + "learning_rate": 1.7012516962717943e-05, + "loss": 0.5588, + "step": 2242 + }, + { + "epoch": 0.27521472392638036, + "grad_norm": 0.9083780015439595, + "learning_rate": 1.700968315828736e-05, + "loss": 0.5938, + "step": 2243 + }, + { + "epoch": 0.2753374233128834, + "grad_norm": 1.0067604283268905, + "learning_rate": 1.7006848246737583e-05, + "loss": 0.6312, + "step": 2244 + }, + { + "epoch": 0.2754601226993865, + "grad_norm": 1.1057678435455103, + "learning_rate": 1.700401222851636e-05, + "loss": 0.6281, + "step": 2245 + }, + { + "epoch": 0.2755828220858896, + "grad_norm": 1.0394438384497495, + "learning_rate": 1.7001175104071625e-05, + "loss": 0.722, + "step": 2246 + }, + { + "epoch": 0.27570552147239263, + "grad_norm": 1.169326299906738, + "learning_rate": 1.6998336873851472e-05, + "loss": 0.656, + "step": 2247 + }, + { + "epoch": 0.2758282208588957, + "grad_norm": 1.0804468932425502, + "learning_rate": 1.699549753830418e-05, + "loss": 0.6212, + "step": 2248 + }, + { + "epoch": 0.27595092024539875, + "grad_norm": 1.1073352172166686, + "learning_rate": 1.6992657097878184e-05, + "loss": 0.6861, + "step": 2249 + }, + { + "epoch": 0.27607361963190186, + "grad_norm": 1.1590149482596388, + "learning_rate": 1.698981555302212e-05, + "loss": 0.6338, + "step": 2250 + }, + { + "epoch": 0.2761963190184049, + "grad_norm": 1.012777358483554, + "learning_rate": 1.6986972904184783e-05, + "loss": 0.6204, + "step": 2251 + }, + { + "epoch": 0.27631901840490797, + "grad_norm": 1.1918774576902844, + "learning_rate": 1.6984129151815147e-05, + "loss": 0.6074, + "step": 2252 + }, + { + "epoch": 0.276441717791411, + "grad_norm": 1.0908678034023411, + "learning_rate": 1.6981284296362346e-05, + "loss": 0.615, + "step": 2253 + }, + { + "epoch": 0.27656441717791413, + "grad_norm": 1.0700609625160529, + "learning_rate": 1.6978438338275717e-05, + "loss": 0.6373, + "step": 2254 + }, + { + "epoch": 0.2766871165644172, + "grad_norm": 1.0223818713735628, + "learning_rate": 1.6975591278004747e-05, + "loss": 0.642, + "step": 2255 + }, + { + "epoch": 0.27680981595092025, + "grad_norm": 1.0002084140271392, + "learning_rate": 1.69727431159991e-05, + "loss": 0.5998, + "step": 2256 + }, + { + "epoch": 0.2769325153374233, + "grad_norm": 1.2132916028952208, + "learning_rate": 1.696989385270863e-05, + "loss": 0.6528, + "step": 2257 + }, + { + "epoch": 0.27705521472392636, + "grad_norm": 0.937085273433488, + "learning_rate": 1.6967043488583342e-05, + "loss": 0.5886, + "step": 2258 + }, + { + "epoch": 0.27717791411042947, + "grad_norm": 1.2584479637550852, + "learning_rate": 1.6964192024073436e-05, + "loss": 0.6441, + "step": 2259 + }, + { + "epoch": 0.2773006134969325, + "grad_norm": 1.165508752403678, + "learning_rate": 1.696133945962927e-05, + "loss": 0.6426, + "step": 2260 + }, + { + "epoch": 
0.2774233128834356, + "grad_norm": 1.0558175801754819, + "learning_rate": 1.695848579570138e-05, + "loss": 0.608, + "step": 2261 + }, + { + "epoch": 0.27754601226993864, + "grad_norm": 0.99542722309566, + "learning_rate": 1.695563103274049e-05, + "loss": 0.6329, + "step": 2262 + }, + { + "epoch": 0.2776687116564417, + "grad_norm": 0.8954094319414886, + "learning_rate": 1.6952775171197467e-05, + "loss": 0.6564, + "step": 2263 + }, + { + "epoch": 0.2777914110429448, + "grad_norm": 0.8579455954975058, + "learning_rate": 1.6949918211523386e-05, + "loss": 0.6267, + "step": 2264 + }, + { + "epoch": 0.27791411042944786, + "grad_norm": 1.0807401756434718, + "learning_rate": 1.6947060154169473e-05, + "loss": 0.6816, + "step": 2265 + }, + { + "epoch": 0.2780368098159509, + "grad_norm": 1.0384690760517146, + "learning_rate": 1.694420099958713e-05, + "loss": 0.5818, + "step": 2266 + }, + { + "epoch": 0.27815950920245397, + "grad_norm": 0.9709139115310091, + "learning_rate": 1.6941340748227942e-05, + "loss": 0.5694, + "step": 2267 + }, + { + "epoch": 0.2782822085889571, + "grad_norm": 0.9873599029951828, + "learning_rate": 1.693847940054366e-05, + "loss": 0.6484, + "step": 2268 + }, + { + "epoch": 0.27840490797546014, + "grad_norm": 1.0950102475965948, + "learning_rate": 1.6935616956986207e-05, + "loss": 0.6629, + "step": 2269 + }, + { + "epoch": 0.2785276073619632, + "grad_norm": 0.9430684270356928, + "learning_rate": 1.6932753418007683e-05, + "loss": 0.5695, + "step": 2270 + }, + { + "epoch": 0.27865030674846625, + "grad_norm": 1.2074622853989116, + "learning_rate": 1.6929888784060356e-05, + "loss": 0.6313, + "step": 2271 + }, + { + "epoch": 0.2787730061349693, + "grad_norm": 1.032329500206823, + "learning_rate": 1.6927023055596673e-05, + "loss": 0.6026, + "step": 2272 + }, + { + "epoch": 0.2788957055214724, + "grad_norm": 1.0178468271938834, + "learning_rate": 1.6924156233069253e-05, + "loss": 0.6359, + "step": 2273 + }, + { + "epoch": 0.27901840490797547, + "grad_norm": 0.8767515992969243, + "learning_rate": 1.6921288316930883e-05, + "loss": 0.6141, + "step": 2274 + }, + { + "epoch": 0.2791411042944785, + "grad_norm": 1.0325306788584625, + "learning_rate": 1.691841930763453e-05, + "loss": 0.6214, + "step": 2275 + }, + { + "epoch": 0.2792638036809816, + "grad_norm": 0.9251735785112147, + "learning_rate": 1.691554920563332e-05, + "loss": 0.5841, + "step": 2276 + }, + { + "epoch": 0.27938650306748464, + "grad_norm": 0.9134906346955144, + "learning_rate": 1.6912678011380567e-05, + "loss": 0.6108, + "step": 2277 + }, + { + "epoch": 0.27950920245398775, + "grad_norm": 1.0193281710563429, + "learning_rate": 1.6909805725329755e-05, + "loss": 0.6027, + "step": 2278 + }, + { + "epoch": 0.2796319018404908, + "grad_norm": 0.8982792122222624, + "learning_rate": 1.690693234793453e-05, + "loss": 0.6064, + "step": 2279 + }, + { + "epoch": 0.27975460122699386, + "grad_norm": 0.8890035093942315, + "learning_rate": 1.690405787964873e-05, + "loss": 0.5948, + "step": 2280 + }, + { + "epoch": 0.2798773006134969, + "grad_norm": 1.1166446586767873, + "learning_rate": 1.6901182320926334e-05, + "loss": 0.6492, + "step": 2281 + }, + { + "epoch": 0.28, + "grad_norm": 1.00908656465514, + "learning_rate": 1.6898305672221523e-05, + "loss": 0.641, + "step": 2282 + }, + { + "epoch": 0.2801226993865031, + "grad_norm": 1.0400952123238012, + "learning_rate": 1.6895427933988636e-05, + "loss": 0.6134, + "step": 2283 + }, + { + "epoch": 0.28024539877300614, + "grad_norm": 0.9121783854500178, + "learning_rate": 1.689254910668219e-05, + 
"loss": 0.5857, + "step": 2284 + }, + { + "epoch": 0.2803680981595092, + "grad_norm": 0.9059333260707986, + "learning_rate": 1.688966919075687e-05, + "loss": 0.6279, + "step": 2285 + }, + { + "epoch": 0.28049079754601225, + "grad_norm": 0.9772051920371276, + "learning_rate": 1.6886788186667528e-05, + "loss": 0.5769, + "step": 2286 + }, + { + "epoch": 0.28061349693251536, + "grad_norm": 1.072278557863421, + "learning_rate": 1.6883906094869204e-05, + "loss": 0.6651, + "step": 2287 + }, + { + "epoch": 0.2807361963190184, + "grad_norm": 0.9862150715406383, + "learning_rate": 1.6881022915817088e-05, + "loss": 0.5919, + "step": 2288 + }, + { + "epoch": 0.28085889570552147, + "grad_norm": 1.119358802230759, + "learning_rate": 1.6878138649966566e-05, + "loss": 0.6099, + "step": 2289 + }, + { + "epoch": 0.2809815950920245, + "grad_norm": 1.1225480411247208, + "learning_rate": 1.687525329777317e-05, + "loss": 0.6334, + "step": 2290 + }, + { + "epoch": 0.2811042944785276, + "grad_norm": 0.9619821933866356, + "learning_rate": 1.687236685969263e-05, + "loss": 0.6327, + "step": 2291 + }, + { + "epoch": 0.2812269938650307, + "grad_norm": 0.9507129283430523, + "learning_rate": 1.686947933618082e-05, + "loss": 0.5903, + "step": 2292 + }, + { + "epoch": 0.28134969325153375, + "grad_norm": 0.9956080254678927, + "learning_rate": 1.6866590727693816e-05, + "loss": 0.6372, + "step": 2293 + }, + { + "epoch": 0.2814723926380368, + "grad_norm": 1.0551185753417014, + "learning_rate": 1.686370103468783e-05, + "loss": 0.6589, + "step": 2294 + }, + { + "epoch": 0.28159509202453986, + "grad_norm": 0.9045777464100606, + "learning_rate": 1.686081025761928e-05, + "loss": 0.6093, + "step": 2295 + }, + { + "epoch": 0.2817177914110429, + "grad_norm": 1.289540532033655, + "learning_rate": 1.685791839694473e-05, + "loss": 0.6193, + "step": 2296 + }, + { + "epoch": 0.28184049079754603, + "grad_norm": 1.0280557681046296, + "learning_rate": 1.6855025453120935e-05, + "loss": 0.5991, + "step": 2297 + }, + { + "epoch": 0.2819631901840491, + "grad_norm": 1.7603925247010612, + "learning_rate": 1.6852131426604795e-05, + "loss": 0.6147, + "step": 2298 + }, + { + "epoch": 0.28208588957055214, + "grad_norm": 0.8970895855318529, + "learning_rate": 1.684923631785341e-05, + "loss": 0.6041, + "step": 2299 + }, + { + "epoch": 0.2822085889570552, + "grad_norm": 0.9949073381792065, + "learning_rate": 1.684634012732403e-05, + "loss": 0.6245, + "step": 2300 + }, + { + "epoch": 0.2823312883435583, + "grad_norm": 0.9045765485460757, + "learning_rate": 1.6843442855474085e-05, + "loss": 0.6079, + "step": 2301 + }, + { + "epoch": 0.28245398773006136, + "grad_norm": 0.836776389861249, + "learning_rate": 1.684054450276118e-05, + "loss": 0.6632, + "step": 2302 + }, + { + "epoch": 0.2825766871165644, + "grad_norm": 1.041163355442967, + "learning_rate": 1.6837645069643075e-05, + "loss": 0.6644, + "step": 2303 + }, + { + "epoch": 0.2826993865030675, + "grad_norm": 0.9445787316227672, + "learning_rate": 1.6834744556577716e-05, + "loss": 0.5739, + "step": 2304 + }, + { + "epoch": 0.28282208588957053, + "grad_norm": 1.0236859419821702, + "learning_rate": 1.6831842964023212e-05, + "loss": 0.6488, + "step": 2305 + }, + { + "epoch": 0.28294478527607364, + "grad_norm": 0.9706073396672111, + "learning_rate": 1.682894029243785e-05, + "loss": 0.6005, + "step": 2306 + }, + { + "epoch": 0.2830674846625767, + "grad_norm": 1.8565240912567127, + "learning_rate": 1.6826036542280078e-05, + "loss": 0.6159, + "step": 2307 + }, + { + "epoch": 0.28319018404907975, + "grad_norm": 
1.2330479775404448, + "learning_rate": 1.682313171400852e-05, + "loss": 0.6821, + "step": 2308 + }, + { + "epoch": 0.2833128834355828, + "grad_norm": 1.0631335045063057, + "learning_rate": 1.682022580808196e-05, + "loss": 0.6398, + "step": 2309 + }, + { + "epoch": 0.28343558282208586, + "grad_norm": 1.1531018974904912, + "learning_rate": 1.6817318824959375e-05, + "loss": 0.5775, + "step": 2310 + }, + { + "epoch": 0.283558282208589, + "grad_norm": 0.9366186150149427, + "learning_rate": 1.681441076509989e-05, + "loss": 0.619, + "step": 2311 + }, + { + "epoch": 0.28368098159509203, + "grad_norm": 0.9285057354983007, + "learning_rate": 1.6811501628962807e-05, + "loss": 0.6492, + "step": 2312 + }, + { + "epoch": 0.2838036809815951, + "grad_norm": 1.035199463730559, + "learning_rate": 1.6808591417007604e-05, + "loss": 0.6314, + "step": 2313 + }, + { + "epoch": 0.28392638036809814, + "grad_norm": 0.9680916671434564, + "learning_rate": 1.680568012969392e-05, + "loss": 0.6266, + "step": 2314 + }, + { + "epoch": 0.28404907975460125, + "grad_norm": 1.0019425573872514, + "learning_rate": 1.680276776748157e-05, + "loss": 0.649, + "step": 2315 + }, + { + "epoch": 0.2841717791411043, + "grad_norm": 0.9855461166526143, + "learning_rate": 1.6799854330830533e-05, + "loss": 0.6541, + "step": 2316 + }, + { + "epoch": 0.28429447852760736, + "grad_norm": 1.1205877186460271, + "learning_rate": 1.6796939820200965e-05, + "loss": 0.6435, + "step": 2317 + }, + { + "epoch": 0.2844171779141104, + "grad_norm": 1.1363965384393804, + "learning_rate": 1.6794024236053186e-05, + "loss": 0.5847, + "step": 2318 + }, + { + "epoch": 0.2845398773006135, + "grad_norm": 0.8989518012482073, + "learning_rate": 1.6791107578847688e-05, + "loss": 0.6461, + "step": 2319 + }, + { + "epoch": 0.2846625766871166, + "grad_norm": 0.9853891458345618, + "learning_rate": 1.6788189849045135e-05, + "loss": 0.6412, + "step": 2320 + }, + { + "epoch": 0.28478527607361964, + "grad_norm": 0.9153756407607591, + "learning_rate": 1.6785271047106352e-05, + "loss": 0.5658, + "step": 2321 + }, + { + "epoch": 0.2849079754601227, + "grad_norm": 1.1232645695804802, + "learning_rate": 1.678235117349234e-05, + "loss": 0.575, + "step": 2322 + }, + { + "epoch": 0.28503067484662575, + "grad_norm": 0.893176012634098, + "learning_rate": 1.6779430228664272e-05, + "loss": 0.6275, + "step": 2323 + }, + { + "epoch": 0.2851533742331288, + "grad_norm": 0.9169584157296783, + "learning_rate": 1.6776508213083484e-05, + "loss": 0.5823, + "step": 2324 + }, + { + "epoch": 0.2852760736196319, + "grad_norm": 1.0233474946665835, + "learning_rate": 1.6773585127211478e-05, + "loss": 0.6594, + "step": 2325 + }, + { + "epoch": 0.285398773006135, + "grad_norm": 0.9921843043807398, + "learning_rate": 1.6770660971509937e-05, + "loss": 0.6541, + "step": 2326 + }, + { + "epoch": 0.28552147239263803, + "grad_norm": 1.0557876868193685, + "learning_rate": 1.6767735746440705e-05, + "loss": 0.5941, + "step": 2327 + }, + { + "epoch": 0.2856441717791411, + "grad_norm": 0.9669272848091304, + "learning_rate": 1.6764809452465794e-05, + "loss": 0.6394, + "step": 2328 + }, + { + "epoch": 0.2857668711656442, + "grad_norm": 0.9102791909723412, + "learning_rate": 1.676188209004739e-05, + "loss": 0.6513, + "step": 2329 + }, + { + "epoch": 0.28588957055214725, + "grad_norm": 0.8106655597734524, + "learning_rate": 1.6758953659647838e-05, + "loss": 0.5782, + "step": 2330 + }, + { + "epoch": 0.2860122699386503, + "grad_norm": 0.8869125723520189, + "learning_rate": 1.6756024161729665e-05, + "loss": 0.5835, + 
"step": 2331 + }, + { + "epoch": 0.28613496932515337, + "grad_norm": 0.9715169550512996, + "learning_rate": 1.675309359675556e-05, + "loss": 0.6044, + "step": 2332 + }, + { + "epoch": 0.2862576687116564, + "grad_norm": 0.9942570730008155, + "learning_rate": 1.6750161965188376e-05, + "loss": 0.63, + "step": 2333 + }, + { + "epoch": 0.28638036809815953, + "grad_norm": 1.006028641894878, + "learning_rate": 1.6747229267491145e-05, + "loss": 0.6145, + "step": 2334 + }, + { + "epoch": 0.2865030674846626, + "grad_norm": 1.1216455239882634, + "learning_rate": 1.6744295504127055e-05, + "loss": 0.668, + "step": 2335 + }, + { + "epoch": 0.28662576687116564, + "grad_norm": 1.0969554929357397, + "learning_rate": 1.6741360675559475e-05, + "loss": 0.6565, + "step": 2336 + }, + { + "epoch": 0.2867484662576687, + "grad_norm": 0.9506772503027621, + "learning_rate": 1.6738424782251933e-05, + "loss": 0.604, + "step": 2337 + }, + { + "epoch": 0.28687116564417176, + "grad_norm": 0.9894754446814802, + "learning_rate": 1.6735487824668127e-05, + "loss": 0.5669, + "step": 2338 + }, + { + "epoch": 0.28699386503067487, + "grad_norm": 1.0026616082180688, + "learning_rate": 1.6732549803271922e-05, + "loss": 0.5968, + "step": 2339 + }, + { + "epoch": 0.2871165644171779, + "grad_norm": 0.88257289520922, + "learning_rate": 1.6729610718527357e-05, + "loss": 0.6643, + "step": 2340 + }, + { + "epoch": 0.287239263803681, + "grad_norm": 0.8969093254370463, + "learning_rate": 1.672667057089864e-05, + "loss": 0.6483, + "step": 2341 + }, + { + "epoch": 0.28736196319018403, + "grad_norm": 0.9153859679563439, + "learning_rate": 1.672372936085013e-05, + "loss": 0.5942, + "step": 2342 + }, + { + "epoch": 0.2874846625766871, + "grad_norm": 0.9831706905636102, + "learning_rate": 1.672078708884638e-05, + "loss": 0.5736, + "step": 2343 + }, + { + "epoch": 0.2876073619631902, + "grad_norm": 0.8166742113923048, + "learning_rate": 1.6717843755352082e-05, + "loss": 0.6668, + "step": 2344 + }, + { + "epoch": 0.28773006134969326, + "grad_norm": 1.0699395096223725, + "learning_rate": 1.6714899360832118e-05, + "loss": 0.6023, + "step": 2345 + }, + { + "epoch": 0.2878527607361963, + "grad_norm": 0.9956913990527162, + "learning_rate": 1.671195390575153e-05, + "loss": 0.621, + "step": 2346 + }, + { + "epoch": 0.28797546012269937, + "grad_norm": 1.1157666002029865, + "learning_rate": 1.6709007390575526e-05, + "loss": 0.6621, + "step": 2347 + }, + { + "epoch": 0.2880981595092025, + "grad_norm": 0.8917084659504801, + "learning_rate": 1.6706059815769483e-05, + "loss": 0.5822, + "step": 2348 + }, + { + "epoch": 0.28822085889570553, + "grad_norm": 1.0069092641224773, + "learning_rate": 1.670311118179895e-05, + "loss": 0.5906, + "step": 2349 + }, + { + "epoch": 0.2883435582822086, + "grad_norm": 0.8920450390268759, + "learning_rate": 1.6700161489129624e-05, + "loss": 0.689, + "step": 2350 + }, + { + "epoch": 0.28846625766871165, + "grad_norm": 0.9961189111065201, + "learning_rate": 1.6697210738227402e-05, + "loss": 0.6093, + "step": 2351 + }, + { + "epoch": 0.2885889570552147, + "grad_norm": 1.0224472835209177, + "learning_rate": 1.6694258929558316e-05, + "loss": 0.6708, + "step": 2352 + }, + { + "epoch": 0.2887116564417178, + "grad_norm": 1.0365713417448605, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.5714, + "step": 2353 + }, + { + "epoch": 0.28883435582822087, + "grad_norm": 0.990710218822204, + "learning_rate": 1.6688352140784587e-05, + "loss": 0.6041, + "step": 2354 + }, + { + "epoch": 0.2889570552147239, + "grad_norm": 0.9351814774062019, 
+ "learning_rate": 1.668539716161287e-05, + "loss": 0.5613, + "step": 2355 + }, + { + "epoch": 0.289079754601227, + "grad_norm": 1.022948832697759, + "learning_rate": 1.6682441126540143e-05, + "loss": 0.6534, + "step": 2356 + }, + { + "epoch": 0.28920245398773003, + "grad_norm": 1.0711997615256486, + "learning_rate": 1.6679484036033295e-05, + "loss": 0.6515, + "step": 2357 + }, + { + "epoch": 0.28932515337423315, + "grad_norm": 0.9180692186321467, + "learning_rate": 1.6676525890559365e-05, + "loss": 0.6048, + "step": 2358 + }, + { + "epoch": 0.2894478527607362, + "grad_norm": 0.9234283039792345, + "learning_rate": 1.6673566690585568e-05, + "loss": 0.6648, + "step": 2359 + }, + { + "epoch": 0.28957055214723926, + "grad_norm": 1.01808853419519, + "learning_rate": 1.667060643657929e-05, + "loss": 0.5739, + "step": 2360 + }, + { + "epoch": 0.2896932515337423, + "grad_norm": 1.0515562699084455, + "learning_rate": 1.6667645129008074e-05, + "loss": 0.621, + "step": 2361 + }, + { + "epoch": 0.2898159509202454, + "grad_norm": 0.9262599481508835, + "learning_rate": 1.666468276833963e-05, + "loss": 0.629, + "step": 2362 + }, + { + "epoch": 0.2899386503067485, + "grad_norm": 1.1477585050839547, + "learning_rate": 1.6661719355041838e-05, + "loss": 0.6528, + "step": 2363 + }, + { + "epoch": 0.29006134969325154, + "grad_norm": 0.9292819028694629, + "learning_rate": 1.665875488958275e-05, + "loss": 0.546, + "step": 2364 + }, + { + "epoch": 0.2901840490797546, + "grad_norm": 1.0940595898908023, + "learning_rate": 1.6655789372430572e-05, + "loss": 0.6095, + "step": 2365 + }, + { + "epoch": 0.29030674846625765, + "grad_norm": 0.9895425070500005, + "learning_rate": 1.6652822804053683e-05, + "loss": 0.6155, + "step": 2366 + }, + { + "epoch": 0.29042944785276076, + "grad_norm": 1.0634396424778938, + "learning_rate": 1.664985518492063e-05, + "loss": 0.6356, + "step": 2367 + }, + { + "epoch": 0.2905521472392638, + "grad_norm": 1.0115209029025096, + "learning_rate": 1.6646886515500118e-05, + "loss": 0.6011, + "step": 2368 + }, + { + "epoch": 0.29067484662576687, + "grad_norm": 0.9156819366232362, + "learning_rate": 1.6643916796261025e-05, + "loss": 0.6116, + "step": 2369 + }, + { + "epoch": 0.2907975460122699, + "grad_norm": 1.0920381611599574, + "learning_rate": 1.6640946027672395e-05, + "loss": 0.5733, + "step": 2370 + }, + { + "epoch": 0.290920245398773, + "grad_norm": 1.0339409483842394, + "learning_rate": 1.6637974210203428e-05, + "loss": 0.6003, + "step": 2371 + }, + { + "epoch": 0.2910429447852761, + "grad_norm": 0.8848349440936696, + "learning_rate": 1.6635001344323506e-05, + "loss": 0.6091, + "step": 2372 + }, + { + "epoch": 0.29116564417177915, + "grad_norm": 0.871070841054293, + "learning_rate": 1.663202743050216e-05, + "loss": 0.6432, + "step": 2373 + }, + { + "epoch": 0.2912883435582822, + "grad_norm": 0.9485450894836815, + "learning_rate": 1.6629052469209105e-05, + "loss": 0.6313, + "step": 2374 + }, + { + "epoch": 0.29141104294478526, + "grad_norm": 0.9177030879094101, + "learning_rate": 1.66260764609142e-05, + "loss": 0.5988, + "step": 2375 + }, + { + "epoch": 0.29153374233128837, + "grad_norm": 0.9978160848316652, + "learning_rate": 1.662309940608748e-05, + "loss": 0.6817, + "step": 2376 + }, + { + "epoch": 0.2916564417177914, + "grad_norm": 1.1884553227024628, + "learning_rate": 1.6620121305199152e-05, + "loss": 0.6217, + "step": 2377 + }, + { + "epoch": 0.2917791411042945, + "grad_norm": 0.9724302525068927, + "learning_rate": 1.6617142158719577e-05, + "loss": 0.6176, + "step": 2378 + }, + { + 
"epoch": 0.29190184049079754, + "grad_norm": 0.9786195588645211, + "learning_rate": 1.6614161967119286e-05, + "loss": 0.6802, + "step": 2379 + }, + { + "epoch": 0.2920245398773006, + "grad_norm": 0.9862466378678586, + "learning_rate": 1.6611180730868975e-05, + "loss": 0.551, + "step": 2380 + }, + { + "epoch": 0.2921472392638037, + "grad_norm": 0.9505015331000901, + "learning_rate": 1.660819845043951e-05, + "loss": 0.6315, + "step": 2381 + }, + { + "epoch": 0.29226993865030676, + "grad_norm": 0.916490305507325, + "learning_rate": 1.660521512630191e-05, + "loss": 0.609, + "step": 2382 + }, + { + "epoch": 0.2923926380368098, + "grad_norm": 0.9073309933870276, + "learning_rate": 1.660223075892736e-05, + "loss": 0.6108, + "step": 2383 + }, + { + "epoch": 0.29251533742331287, + "grad_norm": 0.9361991012365652, + "learning_rate": 1.659924534878723e-05, + "loss": 0.6432, + "step": 2384 + }, + { + "epoch": 0.2926380368098159, + "grad_norm": 0.9870446412403976, + "learning_rate": 1.6596258896353027e-05, + "loss": 0.5383, + "step": 2385 + }, + { + "epoch": 0.29276073619631904, + "grad_norm": 1.0880096251701183, + "learning_rate": 1.6593271402096443e-05, + "loss": 0.6181, + "step": 2386 + }, + { + "epoch": 0.2928834355828221, + "grad_norm": 1.0661243504033413, + "learning_rate": 1.659028286648932e-05, + "loss": 0.6096, + "step": 2387 + }, + { + "epoch": 0.29300613496932515, + "grad_norm": 0.9676073821160147, + "learning_rate": 1.6587293290003678e-05, + "loss": 0.6369, + "step": 2388 + }, + { + "epoch": 0.2931288343558282, + "grad_norm": 0.9746144811151071, + "learning_rate": 1.6584302673111694e-05, + "loss": 0.6499, + "step": 2389 + }, + { + "epoch": 0.29325153374233126, + "grad_norm": 0.9072102606681242, + "learning_rate": 1.658131101628571e-05, + "loss": 0.5791, + "step": 2390 + }, + { + "epoch": 0.29337423312883437, + "grad_norm": 0.9027105702798143, + "learning_rate": 1.657831831999823e-05, + "loss": 0.6037, + "step": 2391 + }, + { + "epoch": 0.2934969325153374, + "grad_norm": 0.9310203533119153, + "learning_rate": 1.6575324584721927e-05, + "loss": 0.5926, + "step": 2392 + }, + { + "epoch": 0.2936196319018405, + "grad_norm": 0.8638722698285288, + "learning_rate": 1.6572329810929635e-05, + "loss": 0.613, + "step": 2393 + }, + { + "epoch": 0.29374233128834354, + "grad_norm": 0.8154452089029959, + "learning_rate": 1.656933399909435e-05, + "loss": 0.6149, + "step": 2394 + }, + { + "epoch": 0.29386503067484665, + "grad_norm": 0.9099505376531759, + "learning_rate": 1.656633714968924e-05, + "loss": 0.6464, + "step": 2395 + }, + { + "epoch": 0.2939877300613497, + "grad_norm": 0.9992942433130241, + "learning_rate": 1.656333926318763e-05, + "loss": 0.6215, + "step": 2396 + }, + { + "epoch": 0.29411042944785276, + "grad_norm": 0.9997646662882828, + "learning_rate": 1.6560340340063007e-05, + "loss": 0.6731, + "step": 2397 + }, + { + "epoch": 0.2942331288343558, + "grad_norm": 0.9357363692291952, + "learning_rate": 1.655734038078903e-05, + "loss": 0.6048, + "step": 2398 + }, + { + "epoch": 0.2943558282208589, + "grad_norm": 0.9674664838097906, + "learning_rate": 1.655433938583951e-05, + "loss": 0.6146, + "step": 2399 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 0.8829077534175648, + "learning_rate": 1.6551337355688437e-05, + "loss": 0.6071, + "step": 2400 + }, + { + "epoch": 0.29460122699386504, + "grad_norm": 1.271087662471138, + "learning_rate": 1.654833429080995e-05, + "loss": 0.6184, + "step": 2401 + }, + { + "epoch": 0.2947239263803681, + "grad_norm": 0.9664464466890657, + "learning_rate": 
1.6545330191678356e-05, + "loss": 0.6842, + "step": 2402 + }, + { + "epoch": 0.29484662576687115, + "grad_norm": 1.0417303813691818, + "learning_rate": 1.6542325058768133e-05, + "loss": 0.6037, + "step": 2403 + }, + { + "epoch": 0.2949693251533742, + "grad_norm": 1.0110516486152716, + "learning_rate": 1.653931889255391e-05, + "loss": 0.6196, + "step": 2404 + }, + { + "epoch": 0.2950920245398773, + "grad_norm": 1.132191823189473, + "learning_rate": 1.653631169351049e-05, + "loss": 0.6222, + "step": 2405 + }, + { + "epoch": 0.2952147239263804, + "grad_norm": 0.9634899892771747, + "learning_rate": 1.653330346211283e-05, + "loss": 0.6184, + "step": 2406 + }, + { + "epoch": 0.29533742331288343, + "grad_norm": 0.994775163574159, + "learning_rate": 1.6530294198836058e-05, + "loss": 0.6431, + "step": 2407 + }, + { + "epoch": 0.2954601226993865, + "grad_norm": 0.975434794505996, + "learning_rate": 1.6527283904155457e-05, + "loss": 0.649, + "step": 2408 + }, + { + "epoch": 0.2955828220858896, + "grad_norm": 0.9865225480551283, + "learning_rate": 1.652427257854648e-05, + "loss": 0.6415, + "step": 2409 + }, + { + "epoch": 0.29570552147239265, + "grad_norm": 0.9727263700565968, + "learning_rate": 1.6521260222484738e-05, + "loss": 0.6193, + "step": 2410 + }, + { + "epoch": 0.2958282208588957, + "grad_norm": 0.8681548107245399, + "learning_rate": 1.651824683644601e-05, + "loss": 0.6462, + "step": 2411 + }, + { + "epoch": 0.29595092024539876, + "grad_norm": 0.9661876562456925, + "learning_rate": 1.651523242090623e-05, + "loss": 0.6284, + "step": 2412 + }, + { + "epoch": 0.2960736196319018, + "grad_norm": 1.0256741117860162, + "learning_rate": 1.6512216976341507e-05, + "loss": 0.6539, + "step": 2413 + }, + { + "epoch": 0.29619631901840493, + "grad_norm": 1.1086291877807384, + "learning_rate": 1.6509200503228092e-05, + "loss": 0.626, + "step": 2414 + }, + { + "epoch": 0.296319018404908, + "grad_norm": 1.2613355859306432, + "learning_rate": 1.650618300204242e-05, + "loss": 0.6257, + "step": 2415 + }, + { + "epoch": 0.29644171779141104, + "grad_norm": 0.9627188732470942, + "learning_rate": 1.6503164473261082e-05, + "loss": 0.6227, + "step": 2416 + }, + { + "epoch": 0.2965644171779141, + "grad_norm": 0.9238681842376625, + "learning_rate": 1.650014491736082e-05, + "loss": 0.6179, + "step": 2417 + }, + { + "epoch": 0.29668711656441715, + "grad_norm": 1.0178637870511875, + "learning_rate": 1.6497124334818552e-05, + "loss": 0.6319, + "step": 2418 + }, + { + "epoch": 0.29680981595092026, + "grad_norm": 0.9022878250163657, + "learning_rate": 1.6494102726111354e-05, + "loss": 0.625, + "step": 2419 + }, + { + "epoch": 0.2969325153374233, + "grad_norm": 0.8119247413834335, + "learning_rate": 1.6491080091716457e-05, + "loss": 0.5774, + "step": 2420 + }, + { + "epoch": 0.2970552147239264, + "grad_norm": 1.0615419878744294, + "learning_rate": 1.648805643211127e-05, + "loss": 0.664, + "step": 2421 + }, + { + "epoch": 0.29717791411042943, + "grad_norm": 0.9046153676075616, + "learning_rate": 1.6485031747773344e-05, + "loss": 0.5791, + "step": 2422 + }, + { + "epoch": 0.29730061349693254, + "grad_norm": 0.9336024201802401, + "learning_rate": 1.6482006039180406e-05, + "loss": 0.6771, + "step": 2423 + }, + { + "epoch": 0.2974233128834356, + "grad_norm": 0.9692475410676245, + "learning_rate": 1.6478979306810345e-05, + "loss": 0.6061, + "step": 2424 + }, + { + "epoch": 0.29754601226993865, + "grad_norm": 0.9664892453935189, + "learning_rate": 1.64759515511412e-05, + "loss": 0.5876, + "step": 2425 + }, + { + "epoch": 
0.2976687116564417, + "grad_norm": 0.9180769815858418, + "learning_rate": 1.6472922772651182e-05, + "loss": 0.5783, + "step": 2426 + }, + { + "epoch": 0.29779141104294476, + "grad_norm": 0.9659309522809532, + "learning_rate": 1.6469892971818662e-05, + "loss": 0.6193, + "step": 2427 + }, + { + "epoch": 0.2979141104294479, + "grad_norm": 1.0138830300420734, + "learning_rate": 1.646686214912217e-05, + "loss": 0.597, + "step": 2428 + }, + { + "epoch": 0.29803680981595093, + "grad_norm": 1.2172830754499886, + "learning_rate": 1.6463830305040395e-05, + "loss": 0.6487, + "step": 2429 + }, + { + "epoch": 0.298159509202454, + "grad_norm": 0.8264221061538717, + "learning_rate": 1.6460797440052195e-05, + "loss": 0.6006, + "step": 2430 + }, + { + "epoch": 0.29828220858895704, + "grad_norm": 1.0031291193108434, + "learning_rate": 1.6457763554636587e-05, + "loss": 0.5983, + "step": 2431 + }, + { + "epoch": 0.2984049079754601, + "grad_norm": 1.0716693019618981, + "learning_rate": 1.6454728649272743e-05, + "loss": 0.5911, + "step": 2432 + }, + { + "epoch": 0.2985276073619632, + "grad_norm": 1.1162611523585046, + "learning_rate": 1.645169272444e-05, + "loss": 0.5407, + "step": 2433 + }, + { + "epoch": 0.29865030674846627, + "grad_norm": 0.9204018168426461, + "learning_rate": 1.6448655780617857e-05, + "loss": 0.6185, + "step": 2434 + }, + { + "epoch": 0.2987730061349693, + "grad_norm": 0.9422567359093673, + "learning_rate": 1.6445617818285974e-05, + "loss": 0.6692, + "step": 2435 + }, + { + "epoch": 0.2988957055214724, + "grad_norm": 0.9334815599656039, + "learning_rate": 1.6442578837924172e-05, + "loss": 0.5496, + "step": 2436 + }, + { + "epoch": 0.29901840490797543, + "grad_norm": 0.9757195134873486, + "learning_rate": 1.6439538840012428e-05, + "loss": 0.701, + "step": 2437 + }, + { + "epoch": 0.29914110429447854, + "grad_norm": 0.89326751477535, + "learning_rate": 1.6436497825030886e-05, + "loss": 0.6334, + "step": 2438 + }, + { + "epoch": 0.2992638036809816, + "grad_norm": 1.060326041995712, + "learning_rate": 1.6433455793459853e-05, + "loss": 0.6366, + "step": 2439 + }, + { + "epoch": 0.29938650306748466, + "grad_norm": 0.8371162299633925, + "learning_rate": 1.643041274577978e-05, + "loss": 0.5958, + "step": 2440 + }, + { + "epoch": 0.2995092024539877, + "grad_norm": 0.8382818530600696, + "learning_rate": 1.6427368682471302e-05, + "loss": 0.6267, + "step": 2441 + }, + { + "epoch": 0.2996319018404908, + "grad_norm": 1.039283730678276, + "learning_rate": 1.6424323604015194e-05, + "loss": 0.6079, + "step": 2442 + }, + { + "epoch": 0.2997546012269939, + "grad_norm": 0.9112953445595332, + "learning_rate": 1.6421277510892403e-05, + "loss": 0.6115, + "step": 2443 + }, + { + "epoch": 0.29987730061349693, + "grad_norm": 0.9842494522647882, + "learning_rate": 1.6418230403584034e-05, + "loss": 0.6547, + "step": 2444 + }, + { + "epoch": 0.3, + "grad_norm": 1.0324816089406381, + "learning_rate": 1.6415182282571356e-05, + "loss": 0.6069, + "step": 2445 + }, + { + "epoch": 0.30012269938650304, + "grad_norm": 0.8353314245607436, + "learning_rate": 1.6412133148335786e-05, + "loss": 0.6032, + "step": 2446 + }, + { + "epoch": 0.30024539877300616, + "grad_norm": 0.8487955819393807, + "learning_rate": 1.640908300135891e-05, + "loss": 0.6279, + "step": 2447 + }, + { + "epoch": 0.3003680981595092, + "grad_norm": 1.0016917484240186, + "learning_rate": 1.6406031842122474e-05, + "loss": 0.6209, + "step": 2448 + }, + { + "epoch": 0.30049079754601227, + "grad_norm": 0.8462652715311509, + "learning_rate": 1.6402979671108386e-05, + 
"loss": 0.6622, + "step": 2449 + }, + { + "epoch": 0.3006134969325153, + "grad_norm": 1.0683597007590642, + "learning_rate": 1.6399926488798702e-05, + "loss": 0.635, + "step": 2450 + }, + { + "epoch": 0.3007361963190184, + "grad_norm": 0.8957792303230198, + "learning_rate": 1.639687229567565e-05, + "loss": 0.6207, + "step": 2451 + }, + { + "epoch": 0.3008588957055215, + "grad_norm": 1.5427716205695141, + "learning_rate": 1.639381709222162e-05, + "loss": 0.6126, + "step": 2452 + }, + { + "epoch": 0.30098159509202455, + "grad_norm": 0.9871313607586887, + "learning_rate": 1.639076087891914e-05, + "loss": 0.6147, + "step": 2453 + }, + { + "epoch": 0.3011042944785276, + "grad_norm": 0.9349059933337732, + "learning_rate": 1.6387703656250926e-05, + "loss": 0.6583, + "step": 2454 + }, + { + "epoch": 0.30122699386503066, + "grad_norm": 0.8754174791115169, + "learning_rate": 1.6384645424699835e-05, + "loss": 0.5952, + "step": 2455 + }, + { + "epoch": 0.30134969325153377, + "grad_norm": 0.9675916340697353, + "learning_rate": 1.638158618474889e-05, + "loss": 0.6428, + "step": 2456 + }, + { + "epoch": 0.3014723926380368, + "grad_norm": 1.0196493922245078, + "learning_rate": 1.637852593688127e-05, + "loss": 0.6424, + "step": 2457 + }, + { + "epoch": 0.3015950920245399, + "grad_norm": 0.9616366918111499, + "learning_rate": 1.6375464681580315e-05, + "loss": 0.6397, + "step": 2458 + }, + { + "epoch": 0.30171779141104293, + "grad_norm": 1.0411970053425856, + "learning_rate": 1.6372402419329523e-05, + "loss": 0.5932, + "step": 2459 + }, + { + "epoch": 0.301840490797546, + "grad_norm": 0.8610057103649918, + "learning_rate": 1.6369339150612557e-05, + "loss": 0.6111, + "step": 2460 + }, + { + "epoch": 0.3019631901840491, + "grad_norm": 0.9955129455278842, + "learning_rate": 1.636627487591323e-05, + "loss": 0.5892, + "step": 2461 + }, + { + "epoch": 0.30208588957055216, + "grad_norm": 0.8665474282450525, + "learning_rate": 1.6363209595715518e-05, + "loss": 0.5949, + "step": 2462 + }, + { + "epoch": 0.3022085889570552, + "grad_norm": 0.9065119452006595, + "learning_rate": 1.6360143310503553e-05, + "loss": 0.5722, + "step": 2463 + }, + { + "epoch": 0.30233128834355827, + "grad_norm": 1.4642498801424366, + "learning_rate": 1.6357076020761632e-05, + "loss": 0.6304, + "step": 2464 + }, + { + "epoch": 0.3024539877300613, + "grad_norm": 1.0540238452695074, + "learning_rate": 1.6354007726974205e-05, + "loss": 0.6635, + "step": 2465 + }, + { + "epoch": 0.30257668711656444, + "grad_norm": 0.9958592513629329, + "learning_rate": 1.6350938429625885e-05, + "loss": 0.6656, + "step": 2466 + }, + { + "epoch": 0.3026993865030675, + "grad_norm": 1.0101345362452525, + "learning_rate": 1.6347868129201444e-05, + "loss": 0.6157, + "step": 2467 + }, + { + "epoch": 0.30282208588957055, + "grad_norm": 0.9681888106850371, + "learning_rate": 1.63447968261858e-05, + "loss": 0.6753, + "step": 2468 + }, + { + "epoch": 0.3029447852760736, + "grad_norm": 1.032437379755511, + "learning_rate": 1.634172452106405e-05, + "loss": 0.6411, + "step": 2469 + }, + { + "epoch": 0.3030674846625767, + "grad_norm": 1.0157362035623358, + "learning_rate": 1.6338651214321426e-05, + "loss": 0.6547, + "step": 2470 + }, + { + "epoch": 0.30319018404907977, + "grad_norm": 1.0495465942469255, + "learning_rate": 1.633557690644334e-05, + "loss": 0.6493, + "step": 2471 + }, + { + "epoch": 0.3033128834355828, + "grad_norm": 1.0230547189857166, + "learning_rate": 1.6332501597915353e-05, + "loss": 0.6202, + "step": 2472 + }, + { + "epoch": 0.3034355828220859, + "grad_norm": 
0.8758245130774645, + "learning_rate": 1.6329425289223176e-05, + "loss": 0.6149, + "step": 2473 + }, + { + "epoch": 0.30355828220858894, + "grad_norm": 1.029573982089221, + "learning_rate": 1.632634798085269e-05, + "loss": 0.5651, + "step": 2474 + }, + { + "epoch": 0.30368098159509205, + "grad_norm": 0.8980862988163587, + "learning_rate": 1.632326967328993e-05, + "loss": 0.6518, + "step": 2475 + }, + { + "epoch": 0.3038036809815951, + "grad_norm": 1.039867971438969, + "learning_rate": 1.6320190367021086e-05, + "loss": 0.614, + "step": 2476 + }, + { + "epoch": 0.30392638036809816, + "grad_norm": 0.9037628693696435, + "learning_rate": 1.631711006253251e-05, + "loss": 0.5809, + "step": 2477 + }, + { + "epoch": 0.3040490797546012, + "grad_norm": 0.8849463025346948, + "learning_rate": 1.631402876031071e-05, + "loss": 0.6446, + "step": 2478 + }, + { + "epoch": 0.30417177914110427, + "grad_norm": 0.974953623649749, + "learning_rate": 1.631094646084235e-05, + "loss": 0.6098, + "step": 2479 + }, + { + "epoch": 0.3042944785276074, + "grad_norm": 1.050167559931487, + "learning_rate": 1.630786316461425e-05, + "loss": 0.6488, + "step": 2480 + }, + { + "epoch": 0.30441717791411044, + "grad_norm": 0.9630335297357068, + "learning_rate": 1.6304778872113396e-05, + "loss": 0.6251, + "step": 2481 + }, + { + "epoch": 0.3045398773006135, + "grad_norm": 0.9082391166100225, + "learning_rate": 1.630169358382692e-05, + "loss": 0.6107, + "step": 2482 + }, + { + "epoch": 0.30466257668711655, + "grad_norm": 0.9274103988137776, + "learning_rate": 1.6298607300242117e-05, + "loss": 0.6121, + "step": 2483 + }, + { + "epoch": 0.3047852760736196, + "grad_norm": 1.0091568334525582, + "learning_rate": 1.6295520021846445e-05, + "loss": 0.6665, + "step": 2484 + }, + { + "epoch": 0.3049079754601227, + "grad_norm": 1.0069239923419064, + "learning_rate": 1.6292431749127507e-05, + "loss": 0.6203, + "step": 2485 + }, + { + "epoch": 0.30503067484662577, + "grad_norm": 1.1360876016999994, + "learning_rate": 1.6289342482573073e-05, + "loss": 0.6422, + "step": 2486 + }, + { + "epoch": 0.3051533742331288, + "grad_norm": 0.9885542641236945, + "learning_rate": 1.6286252222671063e-05, + "loss": 0.5831, + "step": 2487 + }, + { + "epoch": 0.3052760736196319, + "grad_norm": 0.9997976963483876, + "learning_rate": 1.6283160969909558e-05, + "loss": 0.6256, + "step": 2488 + }, + { + "epoch": 0.305398773006135, + "grad_norm": 0.9624897333811907, + "learning_rate": 1.6280068724776795e-05, + "loss": 0.6075, + "step": 2489 + }, + { + "epoch": 0.30552147239263805, + "grad_norm": 0.8349419608578875, + "learning_rate": 1.627697548776117e-05, + "loss": 0.6035, + "step": 2490 + }, + { + "epoch": 0.3056441717791411, + "grad_norm": 0.768196634669225, + "learning_rate": 1.6273881259351227e-05, + "loss": 0.6397, + "step": 2491 + }, + { + "epoch": 0.30576687116564416, + "grad_norm": 1.2668893002955446, + "learning_rate": 1.6270786040035678e-05, + "loss": 0.6225, + "step": 2492 + }, + { + "epoch": 0.3058895705521472, + "grad_norm": 0.9305939758851497, + "learning_rate": 1.626768983030339e-05, + "loss": 0.5864, + "step": 2493 + }, + { + "epoch": 0.3060122699386503, + "grad_norm": 0.8976557165470086, + "learning_rate": 1.6264592630643373e-05, + "loss": 0.623, + "step": 2494 + }, + { + "epoch": 0.3061349693251534, + "grad_norm": 0.9206350520510475, + "learning_rate": 1.6261494441544805e-05, + "loss": 0.6076, + "step": 2495 + }, + { + "epoch": 0.30625766871165644, + "grad_norm": 0.9160042030724965, + "learning_rate": 1.625839526349702e-05, + "loss": 0.5872, + 
"step": 2496 + }, + { + "epoch": 0.3063803680981595, + "grad_norm": 1.008714785795401, + "learning_rate": 1.6255295096989514e-05, + "loss": 0.6271, + "step": 2497 + }, + { + "epoch": 0.30650306748466255, + "grad_norm": 0.8989712470965558, + "learning_rate": 1.625219394251192e-05, + "loss": 0.5909, + "step": 2498 + }, + { + "epoch": 0.30662576687116566, + "grad_norm": 1.1336378958319877, + "learning_rate": 1.624909180055405e-05, + "loss": 0.6212, + "step": 2499 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 0.9579438438182059, + "learning_rate": 1.624598867160585e-05, + "loss": 0.5688, + "step": 2500 + }, + { + "epoch": 0.3068711656441718, + "grad_norm": 0.8284784064264479, + "learning_rate": 1.6242884556157438e-05, + "loss": 0.5787, + "step": 2501 + }, + { + "epoch": 0.30699386503067483, + "grad_norm": 0.967679316809881, + "learning_rate": 1.623977945469908e-05, + "loss": 0.6226, + "step": 2502 + }, + { + "epoch": 0.30711656441717794, + "grad_norm": 1.015517777442808, + "learning_rate": 1.6236673367721203e-05, + "loss": 0.6311, + "step": 2503 + }, + { + "epoch": 0.307239263803681, + "grad_norm": 0.9666065027406306, + "learning_rate": 1.6233566295714384e-05, + "loss": 0.6324, + "step": 2504 + }, + { + "epoch": 0.30736196319018405, + "grad_norm": 0.9558801727137424, + "learning_rate": 1.623045823916936e-05, + "loss": 0.682, + "step": 2505 + }, + { + "epoch": 0.3074846625766871, + "grad_norm": 0.9747998733260917, + "learning_rate": 1.622734919857702e-05, + "loss": 0.6515, + "step": 2506 + }, + { + "epoch": 0.30760736196319016, + "grad_norm": 0.9789152722924609, + "learning_rate": 1.6224239174428414e-05, + "loss": 0.6077, + "step": 2507 + }, + { + "epoch": 0.3077300613496933, + "grad_norm": 0.9803534952079209, + "learning_rate": 1.6221128167214742e-05, + "loss": 0.557, + "step": 2508 + }, + { + "epoch": 0.30785276073619633, + "grad_norm": 0.9055547892724363, + "learning_rate": 1.6218016177427358e-05, + "loss": 0.6484, + "step": 2509 + }, + { + "epoch": 0.3079754601226994, + "grad_norm": 0.966822500301666, + "learning_rate": 1.6214903205557774e-05, + "loss": 0.5942, + "step": 2510 + }, + { + "epoch": 0.30809815950920244, + "grad_norm": 0.992749088157836, + "learning_rate": 1.6211789252097662e-05, + "loss": 0.5913, + "step": 2511 + }, + { + "epoch": 0.3082208588957055, + "grad_norm": 0.8597822686208092, + "learning_rate": 1.620867431753884e-05, + "loss": 0.624, + "step": 2512 + }, + { + "epoch": 0.3083435582822086, + "grad_norm": 1.0353044877952393, + "learning_rate": 1.6205558402373286e-05, + "loss": 0.6164, + "step": 2513 + }, + { + "epoch": 0.30846625766871166, + "grad_norm": 0.9855460191678732, + "learning_rate": 1.6202441507093135e-05, + "loss": 0.6526, + "step": 2514 + }, + { + "epoch": 0.3085889570552147, + "grad_norm": 1.0098925724807364, + "learning_rate": 1.619932363219067e-05, + "loss": 0.6316, + "step": 2515 + }, + { + "epoch": 0.3087116564417178, + "grad_norm": 1.1547841889579413, + "learning_rate": 1.6196204778158334e-05, + "loss": 0.6037, + "step": 2516 + }, + { + "epoch": 0.3088343558282209, + "grad_norm": 0.9467219930803837, + "learning_rate": 1.6193084945488722e-05, + "loss": 0.6268, + "step": 2517 + }, + { + "epoch": 0.30895705521472394, + "grad_norm": 0.9808377469895716, + "learning_rate": 1.6189964134674587e-05, + "loss": 0.6198, + "step": 2518 + }, + { + "epoch": 0.309079754601227, + "grad_norm": 0.9227526267363221, + "learning_rate": 1.618684234620883e-05, + "loss": 0.5911, + "step": 2519 + }, + { + "epoch": 0.30920245398773005, + "grad_norm": 1.1022394611698465, + 
"learning_rate": 1.6183719580584515e-05, + "loss": 0.6179, + "step": 2520 + }, + { + "epoch": 0.3093251533742331, + "grad_norm": 0.9408530858408983, + "learning_rate": 1.618059583829486e-05, + "loss": 0.6292, + "step": 2521 + }, + { + "epoch": 0.3094478527607362, + "grad_norm": 1.1918860746391693, + "learning_rate": 1.617747111983322e-05, + "loss": 0.6475, + "step": 2522 + }, + { + "epoch": 0.3095705521472393, + "grad_norm": 1.0323232902937636, + "learning_rate": 1.617434542569313e-05, + "loss": 0.5863, + "step": 2523 + }, + { + "epoch": 0.30969325153374233, + "grad_norm": 0.8947877343207633, + "learning_rate": 1.617121875636826e-05, + "loss": 0.645, + "step": 2524 + }, + { + "epoch": 0.3098159509202454, + "grad_norm": 0.9354958612879443, + "learning_rate": 1.6168091112352443e-05, + "loss": 0.5976, + "step": 2525 + }, + { + "epoch": 0.30993865030674844, + "grad_norm": 0.988724638491029, + "learning_rate": 1.6164962494139663e-05, + "loss": 0.6002, + "step": 2526 + }, + { + "epoch": 0.31006134969325155, + "grad_norm": 0.9545342270874885, + "learning_rate": 1.6161832902224058e-05, + "loss": 0.6373, + "step": 2527 + }, + { + "epoch": 0.3101840490797546, + "grad_norm": 0.936879397133014, + "learning_rate": 1.615870233709992e-05, + "loss": 0.6148, + "step": 2528 + }, + { + "epoch": 0.31030674846625766, + "grad_norm": 1.0743004934468212, + "learning_rate": 1.615557079926169e-05, + "loss": 0.5798, + "step": 2529 + }, + { + "epoch": 0.3104294478527607, + "grad_norm": 0.9022586486594238, + "learning_rate": 1.6152438289203982e-05, + "loss": 0.5923, + "step": 2530 + }, + { + "epoch": 0.3105521472392638, + "grad_norm": 0.9277706024891608, + "learning_rate": 1.6149304807421535e-05, + "loss": 0.6276, + "step": 2531 + }, + { + "epoch": 0.3106748466257669, + "grad_norm": 0.9551797350027179, + "learning_rate": 1.6146170354409264e-05, + "loss": 0.6137, + "step": 2532 + }, + { + "epoch": 0.31079754601226994, + "grad_norm": 0.8369663781671028, + "learning_rate": 1.614303493066222e-05, + "loss": 0.5818, + "step": 2533 + }, + { + "epoch": 0.310920245398773, + "grad_norm": 0.9377690833406829, + "learning_rate": 1.6139898536675622e-05, + "loss": 0.6245, + "step": 2534 + }, + { + "epoch": 0.31104294478527605, + "grad_norm": 1.0619449375396213, + "learning_rate": 1.6136761172944837e-05, + "loss": 0.6495, + "step": 2535 + }, + { + "epoch": 0.31116564417177917, + "grad_norm": 0.956878502716611, + "learning_rate": 1.6133622839965383e-05, + "loss": 0.6359, + "step": 2536 + }, + { + "epoch": 0.3112883435582822, + "grad_norm": 0.8504451167143928, + "learning_rate": 1.6130483538232932e-05, + "loss": 0.597, + "step": 2537 + }, + { + "epoch": 0.3114110429447853, + "grad_norm": 1.1320500606313124, + "learning_rate": 1.6127343268243315e-05, + "loss": 0.6455, + "step": 2538 + }, + { + "epoch": 0.31153374233128833, + "grad_norm": 1.028901902920431, + "learning_rate": 1.61242020304925e-05, + "loss": 0.6636, + "step": 2539 + }, + { + "epoch": 0.3116564417177914, + "grad_norm": 0.9313785247288648, + "learning_rate": 1.612105982547663e-05, + "loss": 0.6063, + "step": 2540 + }, + { + "epoch": 0.3117791411042945, + "grad_norm": 0.9211199355148706, + "learning_rate": 1.6117916653691982e-05, + "loss": 0.6225, + "step": 2541 + }, + { + "epoch": 0.31190184049079756, + "grad_norm": 1.1704097527803174, + "learning_rate": 1.6114772515634997e-05, + "loss": 0.5968, + "step": 2542 + }, + { + "epoch": 0.3120245398773006, + "grad_norm": 1.0122731697632796, + "learning_rate": 1.6111627411802263e-05, + "loss": 0.6352, + "step": 2543 + }, + { + 
"epoch": 0.31214723926380367, + "grad_norm": 0.9840016755158049, + "learning_rate": 1.6108481342690518e-05, + "loss": 0.6463, + "step": 2544 + }, + { + "epoch": 0.3122699386503067, + "grad_norm": 0.8703420856073659, + "learning_rate": 1.6105334308796665e-05, + "loss": 0.5936, + "step": 2545 + }, + { + "epoch": 0.31239263803680983, + "grad_norm": 0.9083330253352233, + "learning_rate": 1.6102186310617744e-05, + "loss": 0.6131, + "step": 2546 + }, + { + "epoch": 0.3125153374233129, + "grad_norm": 1.0373834850998864, + "learning_rate": 1.6099037348650956e-05, + "loss": 0.6224, + "step": 2547 + }, + { + "epoch": 0.31263803680981594, + "grad_norm": 1.0160714322251465, + "learning_rate": 1.6095887423393658e-05, + "loss": 0.6542, + "step": 2548 + }, + { + "epoch": 0.312760736196319, + "grad_norm": 0.9014496950965066, + "learning_rate": 1.6092736535343343e-05, + "loss": 0.6137, + "step": 2549 + }, + { + "epoch": 0.3128834355828221, + "grad_norm": 1.1012820438272966, + "learning_rate": 1.6089584684997674e-05, + "loss": 0.6481, + "step": 2550 + }, + { + "epoch": 0.31300613496932517, + "grad_norm": 1.0069057929729055, + "learning_rate": 1.608643187285446e-05, + "loss": 0.6177, + "step": 2551 + }, + { + "epoch": 0.3131288343558282, + "grad_norm": 1.0059161086895838, + "learning_rate": 1.608327809941165e-05, + "loss": 0.6603, + "step": 2552 + }, + { + "epoch": 0.3132515337423313, + "grad_norm": 0.9362424919635093, + "learning_rate": 1.608012336516737e-05, + "loss": 0.6597, + "step": 2553 + }, + { + "epoch": 0.31337423312883433, + "grad_norm": 1.0054426156854117, + "learning_rate": 1.6076967670619872e-05, + "loss": 0.6361, + "step": 2554 + }, + { + "epoch": 0.31349693251533745, + "grad_norm": 0.987905392311116, + "learning_rate": 1.607381101626758e-05, + "loss": 0.648, + "step": 2555 + }, + { + "epoch": 0.3136196319018405, + "grad_norm": 1.0245608395454733, + "learning_rate": 1.6070653402609054e-05, + "loss": 0.6365, + "step": 2556 + }, + { + "epoch": 0.31374233128834356, + "grad_norm": 0.9479284192605977, + "learning_rate": 1.6067494830143014e-05, + "loss": 0.6448, + "step": 2557 + }, + { + "epoch": 0.3138650306748466, + "grad_norm": 1.0197997875572178, + "learning_rate": 1.6064335299368322e-05, + "loss": 0.5794, + "step": 2558 + }, + { + "epoch": 0.31398773006134967, + "grad_norm": 1.0003110558288353, + "learning_rate": 1.6061174810784013e-05, + "loss": 0.6591, + "step": 2559 + }, + { + "epoch": 0.3141104294478528, + "grad_norm": 0.9513747797002657, + "learning_rate": 1.6058013364889247e-05, + "loss": 0.6303, + "step": 2560 + }, + { + "epoch": 0.31423312883435583, + "grad_norm": 1.01070085982261, + "learning_rate": 1.6054850962183354e-05, + "loss": 0.6768, + "step": 2561 + }, + { + "epoch": 0.3143558282208589, + "grad_norm": 1.1728629912071613, + "learning_rate": 1.6051687603165808e-05, + "loss": 0.6147, + "step": 2562 + }, + { + "epoch": 0.31447852760736195, + "grad_norm": 0.9139175368496651, + "learning_rate": 1.604852328833623e-05, + "loss": 0.6145, + "step": 2563 + }, + { + "epoch": 0.31460122699386506, + "grad_norm": 1.0800120557319621, + "learning_rate": 1.60453580181944e-05, + "loss": 0.6195, + "step": 2564 + }, + { + "epoch": 0.3147239263803681, + "grad_norm": 0.8464923804736425, + "learning_rate": 1.6042191793240242e-05, + "loss": 0.6005, + "step": 2565 + }, + { + "epoch": 0.31484662576687117, + "grad_norm": 0.9822768334228931, + "learning_rate": 1.6039024613973833e-05, + "loss": 0.6126, + "step": 2566 + }, + { + "epoch": 0.3149693251533742, + "grad_norm": 1.0256652767887418, + 
"learning_rate": 1.603585648089541e-05, + "loss": 0.5288, + "step": 2567 + }, + { + "epoch": 0.3150920245398773, + "grad_norm": 0.9864605524503848, + "learning_rate": 1.6032687394505345e-05, + "loss": 0.5872, + "step": 2568 + }, + { + "epoch": 0.3152147239263804, + "grad_norm": 0.9746873006087581, + "learning_rate": 1.6029517355304168e-05, + "loss": 0.6069, + "step": 2569 + }, + { + "epoch": 0.31533742331288345, + "grad_norm": 0.8534504155598558, + "learning_rate": 1.6026346363792565e-05, + "loss": 0.6141, + "step": 2570 + }, + { + "epoch": 0.3154601226993865, + "grad_norm": 1.6844625269203666, + "learning_rate": 1.602317442047136e-05, + "loss": 0.603, + "step": 2571 + }, + { + "epoch": 0.31558282208588956, + "grad_norm": 0.9208223566865525, + "learning_rate": 1.6020001525841542e-05, + "loss": 0.6129, + "step": 2572 + }, + { + "epoch": 0.3157055214723926, + "grad_norm": 1.0736608547464972, + "learning_rate": 1.6016827680404236e-05, + "loss": 0.6347, + "step": 2573 + }, + { + "epoch": 0.3158282208588957, + "grad_norm": 0.926032545144869, + "learning_rate": 1.6013652884660723e-05, + "loss": 0.6307, + "step": 2574 + }, + { + "epoch": 0.3159509202453988, + "grad_norm": 1.0347941276379564, + "learning_rate": 1.6010477139112438e-05, + "loss": 0.6311, + "step": 2575 + }, + { + "epoch": 0.31607361963190184, + "grad_norm": 0.9099071502794024, + "learning_rate": 1.6007300444260963e-05, + "loss": 0.6175, + "step": 2576 + }, + { + "epoch": 0.3161963190184049, + "grad_norm": 1.1147347016984415, + "learning_rate": 1.6004122800608025e-05, + "loss": 0.6062, + "step": 2577 + }, + { + "epoch": 0.31631901840490795, + "grad_norm": 0.9391800495767408, + "learning_rate": 1.6000944208655516e-05, + "loss": 0.6352, + "step": 2578 + }, + { + "epoch": 0.31644171779141106, + "grad_norm": 0.8831656662956909, + "learning_rate": 1.5997764668905455e-05, + "loss": 0.6237, + "step": 2579 + }, + { + "epoch": 0.3165644171779141, + "grad_norm": 0.983824656719974, + "learning_rate": 1.5994584181860028e-05, + "loss": 0.6497, + "step": 2580 + }, + { + "epoch": 0.31668711656441717, + "grad_norm": 0.9321826838354845, + "learning_rate": 1.5991402748021568e-05, + "loss": 0.6243, + "step": 2581 + }, + { + "epoch": 0.3168098159509202, + "grad_norm": 1.0044979558900116, + "learning_rate": 1.598822036789255e-05, + "loss": 0.6065, + "step": 2582 + }, + { + "epoch": 0.31693251533742334, + "grad_norm": 0.9324249045445165, + "learning_rate": 1.598503704197561e-05, + "loss": 0.6234, + "step": 2583 + }, + { + "epoch": 0.3170552147239264, + "grad_norm": 0.8321231674540374, + "learning_rate": 1.5981852770773523e-05, + "loss": 0.5944, + "step": 2584 + }, + { + "epoch": 0.31717791411042945, + "grad_norm": 1.085354511491543, + "learning_rate": 1.5978667554789216e-05, + "loss": 0.5966, + "step": 2585 + }, + { + "epoch": 0.3173006134969325, + "grad_norm": 0.8727192549453628, + "learning_rate": 1.597548139452577e-05, + "loss": 0.5929, + "step": 2586 + }, + { + "epoch": 0.31742331288343556, + "grad_norm": 0.8866618729503196, + "learning_rate": 1.597229429048641e-05, + "loss": 0.5974, + "step": 2587 + }, + { + "epoch": 0.31754601226993867, + "grad_norm": 0.8990353936376343, + "learning_rate": 1.596910624317451e-05, + "loss": 0.6043, + "step": 2588 + }, + { + "epoch": 0.3176687116564417, + "grad_norm": 1.0412205566397752, + "learning_rate": 1.5965917253093596e-05, + "loss": 0.6294, + "step": 2589 + }, + { + "epoch": 0.3177914110429448, + "grad_norm": 0.9276083130513758, + "learning_rate": 1.596272732074734e-05, + "loss": 0.6331, + "step": 2590 + }, + { 
+ "epoch": 0.31791411042944784, + "grad_norm": 1.1534887296404452, + "learning_rate": 1.5959536446639572e-05, + "loss": 0.6398, + "step": 2591 + }, + { + "epoch": 0.3180368098159509, + "grad_norm": 1.00273922071555, + "learning_rate": 1.5956344631274254e-05, + "loss": 0.6591, + "step": 2592 + }, + { + "epoch": 0.318159509202454, + "grad_norm": 1.1335156694034583, + "learning_rate": 1.595315187515551e-05, + "loss": 0.6285, + "step": 2593 + }, + { + "epoch": 0.31828220858895706, + "grad_norm": 1.0284514434185925, + "learning_rate": 1.5949958178787605e-05, + "loss": 0.6654, + "step": 2594 + }, + { + "epoch": 0.3184049079754601, + "grad_norm": 0.9856121096872653, + "learning_rate": 1.5946763542674958e-05, + "loss": 0.5653, + "step": 2595 + }, + { + "epoch": 0.31852760736196317, + "grad_norm": 0.9739004184929292, + "learning_rate": 1.5943567967322136e-05, + "loss": 0.6162, + "step": 2596 + }, + { + "epoch": 0.3186503067484663, + "grad_norm": 1.0667045330894809, + "learning_rate": 1.5940371453233853e-05, + "loss": 0.584, + "step": 2597 + }, + { + "epoch": 0.31877300613496934, + "grad_norm": 0.8465337152803052, + "learning_rate": 1.5937174000914968e-05, + "loss": 0.5961, + "step": 2598 + }, + { + "epoch": 0.3188957055214724, + "grad_norm": 0.9613071432528427, + "learning_rate": 1.5933975610870494e-05, + "loss": 0.6208, + "step": 2599 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 1.1409534903148704, + "learning_rate": 1.5930776283605585e-05, + "loss": 0.6389, + "step": 2600 + }, + { + "epoch": 0.3191411042944785, + "grad_norm": 1.0765173065486566, + "learning_rate": 1.592757601962555e-05, + "loss": 0.6347, + "step": 2601 + }, + { + "epoch": 0.3192638036809816, + "grad_norm": 0.9943196942223894, + "learning_rate": 1.5924374819435843e-05, + "loss": 0.6359, + "step": 2602 + }, + { + "epoch": 0.3193865030674847, + "grad_norm": 1.106895609908029, + "learning_rate": 1.592117268354207e-05, + "loss": 0.6244, + "step": 2603 + }, + { + "epoch": 0.31950920245398773, + "grad_norm": 1.1731974824099922, + "learning_rate": 1.5917969612449972e-05, + "loss": 0.5796, + "step": 2604 + }, + { + "epoch": 0.3196319018404908, + "grad_norm": 0.9405720307198829, + "learning_rate": 1.5914765606665454e-05, + "loss": 0.6119, + "step": 2605 + }, + { + "epoch": 0.31975460122699384, + "grad_norm": 1.007829163458493, + "learning_rate": 1.5911560666694557e-05, + "loss": 0.5864, + "step": 2606 + }, + { + "epoch": 0.31987730061349695, + "grad_norm": 0.8863860901113892, + "learning_rate": 1.5908354793043473e-05, + "loss": 0.6137, + "step": 2607 + }, + { + "epoch": 0.32, + "grad_norm": 1.171805512967773, + "learning_rate": 1.5905147986218546e-05, + "loss": 0.6033, + "step": 2608 + }, + { + "epoch": 0.32012269938650306, + "grad_norm": 0.8794986677956823, + "learning_rate": 1.5901940246726268e-05, + "loss": 0.6226, + "step": 2609 + }, + { + "epoch": 0.3202453987730061, + "grad_norm": 0.853167288514517, + "learning_rate": 1.5898731575073262e-05, + "loss": 0.595, + "step": 2610 + }, + { + "epoch": 0.32036809815950923, + "grad_norm": 0.9710586307122447, + "learning_rate": 1.5895521971766316e-05, + "loss": 0.6762, + "step": 2611 + }, + { + "epoch": 0.3204907975460123, + "grad_norm": 0.893637290313566, + "learning_rate": 1.589231143731236e-05, + "loss": 0.6823, + "step": 2612 + }, + { + "epoch": 0.32061349693251534, + "grad_norm": 1.009526996119122, + "learning_rate": 1.588909997221847e-05, + "loss": 0.6443, + "step": 2613 + }, + { + "epoch": 0.3207361963190184, + "grad_norm": 1.021871472748373, + "learning_rate": 
1.588588757699187e-05, + "loss": 0.5442, + "step": 2614 + }, + { + "epoch": 0.32085889570552145, + "grad_norm": 0.8697747797041242, + "learning_rate": 1.5882674252139928e-05, + "loss": 0.579, + "step": 2615 + }, + { + "epoch": 0.32098159509202456, + "grad_norm": 0.9675425270896706, + "learning_rate": 1.5879459998170158e-05, + "loss": 0.5858, + "step": 2616 + }, + { + "epoch": 0.3211042944785276, + "grad_norm": 0.9247925836092767, + "learning_rate": 1.5876244815590233e-05, + "loss": 0.5841, + "step": 2617 + }, + { + "epoch": 0.3212269938650307, + "grad_norm": 0.8948370404793782, + "learning_rate": 1.587302870490796e-05, + "loss": 0.5746, + "step": 2618 + }, + { + "epoch": 0.32134969325153373, + "grad_norm": 0.9544181628856261, + "learning_rate": 1.586981166663129e-05, + "loss": 0.6205, + "step": 2619 + }, + { + "epoch": 0.3214723926380368, + "grad_norm": 0.8989720721791886, + "learning_rate": 1.5866593701268334e-05, + "loss": 0.6655, + "step": 2620 + }, + { + "epoch": 0.3215950920245399, + "grad_norm": 1.0027869779039829, + "learning_rate": 1.5863374809327337e-05, + "loss": 0.5897, + "step": 2621 + }, + { + "epoch": 0.32171779141104295, + "grad_norm": 0.9700670367548647, + "learning_rate": 1.5860154991316697e-05, + "loss": 0.6289, + "step": 2622 + }, + { + "epoch": 0.321840490797546, + "grad_norm": 0.8784780332623384, + "learning_rate": 1.5856934247744957e-05, + "loss": 0.5726, + "step": 2623 + }, + { + "epoch": 0.32196319018404906, + "grad_norm": 0.9125475110905977, + "learning_rate": 1.5853712579120807e-05, + "loss": 0.5993, + "step": 2624 + }, + { + "epoch": 0.3220858895705521, + "grad_norm": 0.8888071780337924, + "learning_rate": 1.5850489985953076e-05, + "loss": 0.5825, + "step": 2625 + }, + { + "epoch": 0.32220858895705523, + "grad_norm": 0.8035445995047664, + "learning_rate": 1.5847266468750754e-05, + "loss": 0.5956, + "step": 2626 + }, + { + "epoch": 0.3223312883435583, + "grad_norm": 1.012354310745852, + "learning_rate": 1.584404202802296e-05, + "loss": 0.6357, + "step": 2627 + }, + { + "epoch": 0.32245398773006134, + "grad_norm": 0.9531920621165504, + "learning_rate": 1.584081666427897e-05, + "loss": 0.5658, + "step": 2628 + }, + { + "epoch": 0.3225766871165644, + "grad_norm": 0.8918044879678948, + "learning_rate": 1.5837590378028207e-05, + "loss": 0.658, + "step": 2629 + }, + { + "epoch": 0.3226993865030675, + "grad_norm": 0.8290861306595241, + "learning_rate": 1.5834363169780227e-05, + "loss": 0.6165, + "step": 2630 + }, + { + "epoch": 0.32282208588957056, + "grad_norm": 1.0048313478270956, + "learning_rate": 1.5831135040044744e-05, + "loss": 0.5814, + "step": 2631 + }, + { + "epoch": 0.3229447852760736, + "grad_norm": 1.0760210704625182, + "learning_rate": 1.582790598933161e-05, + "loss": 0.6395, + "step": 2632 + }, + { + "epoch": 0.3230674846625767, + "grad_norm": 1.0903530807687491, + "learning_rate": 1.582467601815083e-05, + "loss": 0.573, + "step": 2633 + }, + { + "epoch": 0.32319018404907973, + "grad_norm": 0.9756712673336652, + "learning_rate": 1.582144512701255e-05, + "loss": 0.5708, + "step": 2634 + }, + { + "epoch": 0.32331288343558284, + "grad_norm": 0.9661379969162642, + "learning_rate": 1.5818213316427056e-05, + "loss": 0.6708, + "step": 2635 + }, + { + "epoch": 0.3234355828220859, + "grad_norm": 1.107760017713347, + "learning_rate": 1.5814980586904795e-05, + "loss": 0.6242, + "step": 2636 + }, + { + "epoch": 0.32355828220858895, + "grad_norm": 1.0140925719690252, + "learning_rate": 1.581174693895634e-05, + "loss": 0.641, + "step": 2637 + }, + { + "epoch": 
0.323680981595092, + "grad_norm": 0.9040241916241227, + "learning_rate": 1.5808512373092418e-05, + "loss": 0.6096, + "step": 2638 + }, + { + "epoch": 0.32380368098159507, + "grad_norm": 0.9063100151808825, + "learning_rate": 1.5805276889823903e-05, + "loss": 0.6347, + "step": 2639 + }, + { + "epoch": 0.3239263803680982, + "grad_norm": 0.986889008839103, + "learning_rate": 1.5802040489661817e-05, + "loss": 0.6301, + "step": 2640 + }, + { + "epoch": 0.32404907975460123, + "grad_norm": 0.9080239348820295, + "learning_rate": 1.5798803173117312e-05, + "loss": 0.5481, + "step": 2641 + }, + { + "epoch": 0.3241717791411043, + "grad_norm": 0.9683155559288861, + "learning_rate": 1.57955649407017e-05, + "loss": 0.6425, + "step": 2642 + }, + { + "epoch": 0.32429447852760734, + "grad_norm": 0.8919430011428511, + "learning_rate": 1.5792325792926434e-05, + "loss": 0.6387, + "step": 2643 + }, + { + "epoch": 0.32441717791411046, + "grad_norm": 1.0001876507230403, + "learning_rate": 1.5789085730303103e-05, + "loss": 0.5692, + "step": 2644 + }, + { + "epoch": 0.3245398773006135, + "grad_norm": 1.0259669121278574, + "learning_rate": 1.578584475334345e-05, + "loss": 0.6214, + "step": 2645 + }, + { + "epoch": 0.32466257668711657, + "grad_norm": 1.0009293585359036, + "learning_rate": 1.5782602862559362e-05, + "loss": 0.6158, + "step": 2646 + }, + { + "epoch": 0.3247852760736196, + "grad_norm": 1.0014466277407161, + "learning_rate": 1.5779360058462865e-05, + "loss": 0.5579, + "step": 2647 + }, + { + "epoch": 0.3249079754601227, + "grad_norm": 1.3256834538409932, + "learning_rate": 1.5776116341566134e-05, + "loss": 0.644, + "step": 2648 + }, + { + "epoch": 0.3250306748466258, + "grad_norm": 1.511219566923623, + "learning_rate": 1.5772871712381485e-05, + "loss": 0.6469, + "step": 2649 + }, + { + "epoch": 0.32515337423312884, + "grad_norm": 0.9737545261850229, + "learning_rate": 1.5769626171421376e-05, + "loss": 0.6557, + "step": 2650 + }, + { + "epoch": 0.3252760736196319, + "grad_norm": 0.9655647099607545, + "learning_rate": 1.5766379719198418e-05, + "loss": 0.6321, + "step": 2651 + }, + { + "epoch": 0.32539877300613496, + "grad_norm": 0.8690064738382293, + "learning_rate": 1.576313235622536e-05, + "loss": 0.6644, + "step": 2652 + }, + { + "epoch": 0.325521472392638, + "grad_norm": 0.9519945562977881, + "learning_rate": 1.5759884083015088e-05, + "loss": 0.5989, + "step": 2653 + }, + { + "epoch": 0.3256441717791411, + "grad_norm": 0.9915980397653497, + "learning_rate": 1.575663490008065e-05, + "loss": 0.6356, + "step": 2654 + }, + { + "epoch": 0.3257668711656442, + "grad_norm": 0.8500418239265859, + "learning_rate": 1.5753384807935214e-05, + "loss": 0.6171, + "step": 2655 + }, + { + "epoch": 0.32588957055214723, + "grad_norm": 1.1031180233616256, + "learning_rate": 1.5750133807092112e-05, + "loss": 0.6674, + "step": 2656 + }, + { + "epoch": 0.3260122699386503, + "grad_norm": 0.9734396562759465, + "learning_rate": 1.5746881898064813e-05, + "loss": 0.6259, + "step": 2657 + }, + { + "epoch": 0.3261349693251534, + "grad_norm": 0.9217227363154327, + "learning_rate": 1.5743629081366922e-05, + "loss": 0.6049, + "step": 2658 + }, + { + "epoch": 0.32625766871165646, + "grad_norm": 0.9870076600094788, + "learning_rate": 1.5740375357512198e-05, + "loss": 0.5758, + "step": 2659 + }, + { + "epoch": 0.3263803680981595, + "grad_norm": 0.9887322801546405, + "learning_rate": 1.5737120727014535e-05, + "loss": 0.6359, + "step": 2660 + }, + { + "epoch": 0.32650306748466257, + "grad_norm": 1.074541509370923, + "learning_rate": 
1.5733865190387978e-05, + "loss": 0.6436, + "step": 2661 + }, + { + "epoch": 0.3266257668711656, + "grad_norm": 0.9641676551742601, + "learning_rate": 1.5730608748146706e-05, + "loss": 0.6689, + "step": 2662 + }, + { + "epoch": 0.32674846625766873, + "grad_norm": 1.0459466949123877, + "learning_rate": 1.5727351400805054e-05, + "loss": 0.6363, + "step": 2663 + }, + { + "epoch": 0.3268711656441718, + "grad_norm": 0.799733745965209, + "learning_rate": 1.5724093148877484e-05, + "loss": 0.5335, + "step": 2664 + }, + { + "epoch": 0.32699386503067485, + "grad_norm": 0.8699922682855523, + "learning_rate": 1.572083399287861e-05, + "loss": 0.5885, + "step": 2665 + }, + { + "epoch": 0.3271165644171779, + "grad_norm": 1.0591652725112926, + "learning_rate": 1.5717573933323195e-05, + "loss": 0.6199, + "step": 2666 + }, + { + "epoch": 0.32723926380368096, + "grad_norm": 1.0441065921382129, + "learning_rate": 1.571431297072613e-05, + "loss": 0.6296, + "step": 2667 + }, + { + "epoch": 0.32736196319018407, + "grad_norm": 0.8944277859772995, + "learning_rate": 1.5711051105602456e-05, + "loss": 0.5819, + "step": 2668 + }, + { + "epoch": 0.3274846625766871, + "grad_norm": 0.9285317162234872, + "learning_rate": 1.5707788338467362e-05, + "loss": 0.5924, + "step": 2669 + }, + { + "epoch": 0.3276073619631902, + "grad_norm": 1.028233419222845, + "learning_rate": 1.570452466983617e-05, + "loss": 0.6054, + "step": 2670 + }, + { + "epoch": 0.32773006134969324, + "grad_norm": 0.908553945489978, + "learning_rate": 1.5701260100224346e-05, + "loss": 0.613, + "step": 2671 + }, + { + "epoch": 0.3278527607361963, + "grad_norm": 1.0592635034427078, + "learning_rate": 1.569799463014751e-05, + "loss": 0.6597, + "step": 2672 + }, + { + "epoch": 0.3279754601226994, + "grad_norm": 0.9005460703393297, + "learning_rate": 1.5694728260121403e-05, + "loss": 0.6529, + "step": 2673 + }, + { + "epoch": 0.32809815950920246, + "grad_norm": 0.9885609832771377, + "learning_rate": 1.5691460990661926e-05, + "loss": 0.5922, + "step": 2674 + }, + { + "epoch": 0.3282208588957055, + "grad_norm": 0.9101963796949689, + "learning_rate": 1.5688192822285116e-05, + "loss": 0.61, + "step": 2675 + }, + { + "epoch": 0.32834355828220857, + "grad_norm": 0.8937939623725277, + "learning_rate": 1.568492375550715e-05, + "loss": 0.6089, + "step": 2676 + }, + { + "epoch": 0.3284662576687117, + "grad_norm": 0.9366230567020932, + "learning_rate": 1.5681653790844357e-05, + "loss": 0.5934, + "step": 2677 + }, + { + "epoch": 0.32858895705521474, + "grad_norm": 0.8662050546631664, + "learning_rate": 1.567838292881319e-05, + "loss": 0.5258, + "step": 2678 + }, + { + "epoch": 0.3287116564417178, + "grad_norm": 0.9952007134403672, + "learning_rate": 1.567511116993026e-05, + "loss": 0.6152, + "step": 2679 + }, + { + "epoch": 0.32883435582822085, + "grad_norm": 1.069322777311046, + "learning_rate": 1.567183851471231e-05, + "loss": 0.6396, + "step": 2680 + }, + { + "epoch": 0.3289570552147239, + "grad_norm": 0.9210264358582969, + "learning_rate": 1.5668564963676224e-05, + "loss": 0.6068, + "step": 2681 + }, + { + "epoch": 0.329079754601227, + "grad_norm": 0.9059016319428805, + "learning_rate": 1.566529051733904e-05, + "loss": 0.5996, + "step": 2682 + }, + { + "epoch": 0.32920245398773007, + "grad_norm": 0.872468392816864, + "learning_rate": 1.566201517621792e-05, + "loss": 0.6321, + "step": 2683 + }, + { + "epoch": 0.3293251533742331, + "grad_norm": 1.0318233166039286, + "learning_rate": 1.5658738940830185e-05, + "loss": 0.5952, + "step": 2684 + }, + { + "epoch": 
0.3294478527607362, + "grad_norm": 0.781924634277276, + "learning_rate": 1.565546181169328e-05, + "loss": 0.6064, + "step": 2685 + }, + { + "epoch": 0.32957055214723924, + "grad_norm": 0.875310755632365, + "learning_rate": 1.5652183789324805e-05, + "loss": 0.6018, + "step": 2686 + }, + { + "epoch": 0.32969325153374235, + "grad_norm": 0.9912048649462477, + "learning_rate": 1.564890487424249e-05, + "loss": 0.6333, + "step": 2687 + }, + { + "epoch": 0.3298159509202454, + "grad_norm": 0.8685771447597378, + "learning_rate": 1.564562506696422e-05, + "loss": 0.5945, + "step": 2688 + }, + { + "epoch": 0.32993865030674846, + "grad_norm": 0.9538928317525901, + "learning_rate": 1.5642344368008e-05, + "loss": 0.619, + "step": 2689 + }, + { + "epoch": 0.3300613496932515, + "grad_norm": 0.9895646552261627, + "learning_rate": 1.5639062777892e-05, + "loss": 0.6301, + "step": 2690 + }, + { + "epoch": 0.3301840490797546, + "grad_norm": 0.8796428610413447, + "learning_rate": 1.563578029713451e-05, + "loss": 0.5744, + "step": 2691 + }, + { + "epoch": 0.3303067484662577, + "grad_norm": 0.9131921520216798, + "learning_rate": 1.5632496926253976e-05, + "loss": 0.5979, + "step": 2692 + }, + { + "epoch": 0.33042944785276074, + "grad_norm": 0.82991254353997, + "learning_rate": 1.562921266576898e-05, + "loss": 0.6142, + "step": 2693 + }, + { + "epoch": 0.3305521472392638, + "grad_norm": 0.9460855758614418, + "learning_rate": 1.5625927516198235e-05, + "loss": 0.6047, + "step": 2694 + }, + { + "epoch": 0.33067484662576685, + "grad_norm": 0.9896789225486798, + "learning_rate": 1.5622641478060602e-05, + "loss": 0.6348, + "step": 2695 + }, + { + "epoch": 0.33079754601226996, + "grad_norm": 0.9629005409496051, + "learning_rate": 1.5619354551875093e-05, + "loss": 0.6373, + "step": 2696 + }, + { + "epoch": 0.330920245398773, + "grad_norm": 1.0162428797421734, + "learning_rate": 1.561606673816084e-05, + "loss": 0.5957, + "step": 2697 + }, + { + "epoch": 0.33104294478527607, + "grad_norm": 0.8874582700773124, + "learning_rate": 1.5612778037437125e-05, + "loss": 0.5853, + "step": 2698 + }, + { + "epoch": 0.3311656441717791, + "grad_norm": 0.9505714205097309, + "learning_rate": 1.560948845022338e-05, + "loss": 0.6039, + "step": 2699 + }, + { + "epoch": 0.3312883435582822, + "grad_norm": 0.9716264165982008, + "learning_rate": 1.5606197977039154e-05, + "loss": 0.593, + "step": 2700 + }, + { + "epoch": 0.3314110429447853, + "grad_norm": 0.9131296181000809, + "learning_rate": 1.5602906618404154e-05, + "loss": 0.522, + "step": 2701 + }, + { + "epoch": 0.33153374233128835, + "grad_norm": 0.9000144172368045, + "learning_rate": 1.5599614374838226e-05, + "loss": 0.658, + "step": 2702 + }, + { + "epoch": 0.3316564417177914, + "grad_norm": 1.0192898377777309, + "learning_rate": 1.5596321246861345e-05, + "loss": 0.5978, + "step": 2703 + }, + { + "epoch": 0.33177914110429446, + "grad_norm": 0.895952732681463, + "learning_rate": 1.559302723499364e-05, + "loss": 0.583, + "step": 2704 + }, + { + "epoch": 0.3319018404907976, + "grad_norm": 0.8406595901781906, + "learning_rate": 1.5589732339755362e-05, + "loss": 0.6067, + "step": 2705 + }, + { + "epoch": 0.33202453987730063, + "grad_norm": 0.9710226879964949, + "learning_rate": 1.5586436561666923e-05, + "loss": 0.564, + "step": 2706 + }, + { + "epoch": 0.3321472392638037, + "grad_norm": 0.9760844708783919, + "learning_rate": 1.5583139901248853e-05, + "loss": 0.5607, + "step": 2707 + }, + { + "epoch": 0.33226993865030674, + "grad_norm": 0.9414973857066065, + "learning_rate": 
1.5579842359021834e-05, + "loss": 0.5746, + "step": 2708 + }, + { + "epoch": 0.3323926380368098, + "grad_norm": 0.9199805681849873, + "learning_rate": 1.5576543935506684e-05, + "loss": 0.5883, + "step": 2709 + }, + { + "epoch": 0.3325153374233129, + "grad_norm": 1.0006776648635627, + "learning_rate": 1.5573244631224364e-05, + "loss": 0.6138, + "step": 2710 + }, + { + "epoch": 0.33263803680981596, + "grad_norm": 0.8648034012327637, + "learning_rate": 1.556994444669597e-05, + "loss": 0.6122, + "step": 2711 + }, + { + "epoch": 0.332760736196319, + "grad_norm": 0.975456018858573, + "learning_rate": 1.5566643382442733e-05, + "loss": 0.5713, + "step": 2712 + }, + { + "epoch": 0.3328834355828221, + "grad_norm": 0.9600744035162289, + "learning_rate": 1.5563341438986036e-05, + "loss": 0.5905, + "step": 2713 + }, + { + "epoch": 0.33300613496932513, + "grad_norm": 0.9743745716800717, + "learning_rate": 1.5560038616847384e-05, + "loss": 0.5846, + "step": 2714 + }, + { + "epoch": 0.33312883435582824, + "grad_norm": 0.8998782335522147, + "learning_rate": 1.5556734916548432e-05, + "loss": 0.5674, + "step": 2715 + }, + { + "epoch": 0.3332515337423313, + "grad_norm": 0.9080398054327613, + "learning_rate": 1.5553430338610973e-05, + "loss": 0.6604, + "step": 2716 + }, + { + "epoch": 0.33337423312883435, + "grad_norm": 0.8928279272694689, + "learning_rate": 1.5550124883556938e-05, + "loss": 0.609, + "step": 2717 + }, + { + "epoch": 0.3334969325153374, + "grad_norm": 1.0562168423209322, + "learning_rate": 1.5546818551908387e-05, + "loss": 0.6154, + "step": 2718 + }, + { + "epoch": 0.33361963190184046, + "grad_norm": 0.9542112125841891, + "learning_rate": 1.554351134418754e-05, + "loss": 0.6748, + "step": 2719 + }, + { + "epoch": 0.3337423312883436, + "grad_norm": 1.1054890788483744, + "learning_rate": 1.5540203260916728e-05, + "loss": 0.6178, + "step": 2720 + }, + { + "epoch": 0.33386503067484663, + "grad_norm": 1.1998174065762024, + "learning_rate": 1.5536894302618445e-05, + "loss": 0.6838, + "step": 2721 + }, + { + "epoch": 0.3339877300613497, + "grad_norm": 0.809782490913109, + "learning_rate": 1.5533584469815307e-05, + "loss": 0.6163, + "step": 2722 + }, + { + "epoch": 0.33411042944785274, + "grad_norm": 1.0343876880864062, + "learning_rate": 1.5530273763030076e-05, + "loss": 0.5894, + "step": 2723 + }, + { + "epoch": 0.33423312883435585, + "grad_norm": 0.9514233788984554, + "learning_rate": 1.5526962182785646e-05, + "loss": 0.6381, + "step": 2724 + }, + { + "epoch": 0.3343558282208589, + "grad_norm": 0.9571648121551901, + "learning_rate": 1.552364972960506e-05, + "loss": 0.6361, + "step": 2725 + }, + { + "epoch": 0.33447852760736196, + "grad_norm": 0.8391834242813303, + "learning_rate": 1.5520336404011483e-05, + "loss": 0.5863, + "step": 2726 + }, + { + "epoch": 0.334601226993865, + "grad_norm": 1.0467894645025242, + "learning_rate": 1.5517022206528233e-05, + "loss": 0.6212, + "step": 2727 + }, + { + "epoch": 0.3347239263803681, + "grad_norm": 0.8892059432198708, + "learning_rate": 1.5513707137678754e-05, + "loss": 0.5742, + "step": 2728 + }, + { + "epoch": 0.3348466257668712, + "grad_norm": 1.1151467031317666, + "learning_rate": 1.5510391197986635e-05, + "loss": 0.6315, + "step": 2729 + }, + { + "epoch": 0.33496932515337424, + "grad_norm": 1.0485835934547274, + "learning_rate": 1.5507074387975603e-05, + "loss": 0.6118, + "step": 2730 + }, + { + "epoch": 0.3350920245398773, + "grad_norm": 1.1696261907872443, + "learning_rate": 1.550375670816951e-05, + "loss": 0.6414, + "step": 2731 + }, + { + "epoch": 
0.33521472392638035, + "grad_norm": 0.9056147522351831, + "learning_rate": 1.550043815909237e-05, + "loss": 0.5843, + "step": 2732 + }, + { + "epoch": 0.3353374233128834, + "grad_norm": 0.8399729463468536, + "learning_rate": 1.5497118741268304e-05, + "loss": 0.5345, + "step": 2733 + }, + { + "epoch": 0.3354601226993865, + "grad_norm": 1.0227706232686993, + "learning_rate": 1.5493798455221593e-05, + "loss": 0.6649, + "step": 2734 + }, + { + "epoch": 0.3355828220858896, + "grad_norm": 0.8787138811457158, + "learning_rate": 1.5490477301476648e-05, + "loss": 0.5987, + "step": 2735 + }, + { + "epoch": 0.33570552147239263, + "grad_norm": 0.9369199995725602, + "learning_rate": 1.548715528055802e-05, + "loss": 0.5932, + "step": 2736 + }, + { + "epoch": 0.3358282208588957, + "grad_norm": 0.9616050358094897, + "learning_rate": 1.5483832392990382e-05, + "loss": 0.617, + "step": 2737 + }, + { + "epoch": 0.3359509202453988, + "grad_norm": 0.8983534881897681, + "learning_rate": 1.548050863929857e-05, + "loss": 0.575, + "step": 2738 + }, + { + "epoch": 0.33607361963190185, + "grad_norm": 0.8621284765399084, + "learning_rate": 1.547718402000753e-05, + "loss": 0.6109, + "step": 2739 + }, + { + "epoch": 0.3361963190184049, + "grad_norm": 0.9585860975753836, + "learning_rate": 1.5473858535642365e-05, + "loss": 0.6139, + "step": 2740 + }, + { + "epoch": 0.33631901840490797, + "grad_norm": 1.2772820157884053, + "learning_rate": 1.5470532186728303e-05, + "loss": 0.6156, + "step": 2741 + }, + { + "epoch": 0.336441717791411, + "grad_norm": 1.009275243958201, + "learning_rate": 1.5467204973790714e-05, + "loss": 0.5729, + "step": 2742 + }, + { + "epoch": 0.33656441717791413, + "grad_norm": 1.0066717215551106, + "learning_rate": 1.54638768973551e-05, + "loss": 0.62, + "step": 2743 + }, + { + "epoch": 0.3366871165644172, + "grad_norm": 0.8809123220144159, + "learning_rate": 1.5460547957947105e-05, + "loss": 0.6283, + "step": 2744 + }, + { + "epoch": 0.33680981595092024, + "grad_norm": 0.9919530927449643, + "learning_rate": 1.5457218156092503e-05, + "loss": 0.5986, + "step": 2745 + }, + { + "epoch": 0.3369325153374233, + "grad_norm": 0.920051093510598, + "learning_rate": 1.545388749231721e-05, + "loss": 0.5749, + "step": 2746 + }, + { + "epoch": 0.33705521472392636, + "grad_norm": 1.0488943388621863, + "learning_rate": 1.5450555967147282e-05, + "loss": 0.6607, + "step": 2747 + }, + { + "epoch": 0.33717791411042947, + "grad_norm": 0.9203864186635078, + "learning_rate": 1.544722358110889e-05, + "loss": 0.6443, + "step": 2748 + }, + { + "epoch": 0.3373006134969325, + "grad_norm": 0.8816002719969505, + "learning_rate": 1.5443890334728368e-05, + "loss": 0.6354, + "step": 2749 + }, + { + "epoch": 0.3374233128834356, + "grad_norm": 0.891353266738574, + "learning_rate": 1.5440556228532168e-05, + "loss": 0.6083, + "step": 2750 + }, + { + "epoch": 0.33754601226993863, + "grad_norm": 0.8493109436796621, + "learning_rate": 1.5437221263046887e-05, + "loss": 0.5275, + "step": 2751 + }, + { + "epoch": 0.33766871165644174, + "grad_norm": 0.9132493362984373, + "learning_rate": 1.543388543879925e-05, + "loss": 0.5541, + "step": 2752 + }, + { + "epoch": 0.3377914110429448, + "grad_norm": 0.947883620355436, + "learning_rate": 1.5430548756316127e-05, + "loss": 0.6218, + "step": 2753 + }, + { + "epoch": 0.33791411042944786, + "grad_norm": 1.0767719321712481, + "learning_rate": 1.5427211216124512e-05, + "loss": 0.6466, + "step": 2754 + }, + { + "epoch": 0.3380368098159509, + "grad_norm": 0.8934139054630652, + "learning_rate": 
1.5423872818751544e-05, + "loss": 0.6405, + "step": 2755 + }, + { + "epoch": 0.33815950920245397, + "grad_norm": 0.9176040349028818, + "learning_rate": 1.5420533564724495e-05, + "loss": 0.6197, + "step": 2756 + }, + { + "epoch": 0.3382822085889571, + "grad_norm": 0.89580290131185, + "learning_rate": 1.541719345457077e-05, + "loss": 0.5679, + "step": 2757 + }, + { + "epoch": 0.33840490797546013, + "grad_norm": 0.9425212528537966, + "learning_rate": 1.5413852488817913e-05, + "loss": 0.6625, + "step": 2758 + }, + { + "epoch": 0.3385276073619632, + "grad_norm": 1.1265908581656285, + "learning_rate": 1.5410510667993596e-05, + "loss": 0.5779, + "step": 2759 + }, + { + "epoch": 0.33865030674846625, + "grad_norm": 1.0705368198781093, + "learning_rate": 1.5407167992625636e-05, + "loss": 0.6258, + "step": 2760 + }, + { + "epoch": 0.3387730061349693, + "grad_norm": 0.8952006259255355, + "learning_rate": 1.540382446324198e-05, + "loss": 0.6714, + "step": 2761 + }, + { + "epoch": 0.3388957055214724, + "grad_norm": 1.0781495728080206, + "learning_rate": 1.5400480080370702e-05, + "loss": 0.6469, + "step": 2762 + }, + { + "epoch": 0.33901840490797547, + "grad_norm": 0.8162493042968713, + "learning_rate": 1.539713484454003e-05, + "loss": 0.6081, + "step": 2763 + }, + { + "epoch": 0.3391411042944785, + "grad_norm": 0.9158254160693599, + "learning_rate": 1.5393788756278306e-05, + "loss": 0.5792, + "step": 2764 + }, + { + "epoch": 0.3392638036809816, + "grad_norm": 1.024947235951406, + "learning_rate": 1.5390441816114022e-05, + "loss": 0.5873, + "step": 2765 + }, + { + "epoch": 0.33938650306748464, + "grad_norm": 0.9797749003576292, + "learning_rate": 1.5387094024575794e-05, + "loss": 0.6144, + "step": 2766 + }, + { + "epoch": 0.33950920245398775, + "grad_norm": 0.8869965893991986, + "learning_rate": 1.5383745382192375e-05, + "loss": 0.5455, + "step": 2767 + }, + { + "epoch": 0.3396319018404908, + "grad_norm": 0.8360868101796571, + "learning_rate": 1.538039588949266e-05, + "loss": 0.6234, + "step": 2768 + }, + { + "epoch": 0.33975460122699386, + "grad_norm": 1.655644301641495, + "learning_rate": 1.5377045547005673e-05, + "loss": 0.619, + "step": 2769 + }, + { + "epoch": 0.3398773006134969, + "grad_norm": 1.0821117313629882, + "learning_rate": 1.5373694355260565e-05, + "loss": 0.6058, + "step": 2770 + }, + { + "epoch": 0.34, + "grad_norm": 0.966267620757068, + "learning_rate": 1.5370342314786638e-05, + "loss": 0.5806, + "step": 2771 + }, + { + "epoch": 0.3401226993865031, + "grad_norm": 0.9167196473901436, + "learning_rate": 1.5366989426113308e-05, + "loss": 0.5898, + "step": 2772 + }, + { + "epoch": 0.34024539877300614, + "grad_norm": 0.8403764759891474, + "learning_rate": 1.5363635689770136e-05, + "loss": 0.652, + "step": 2773 + }, + { + "epoch": 0.3403680981595092, + "grad_norm": 0.9676226486298151, + "learning_rate": 1.5360281106286823e-05, + "loss": 0.6205, + "step": 2774 + }, + { + "epoch": 0.34049079754601225, + "grad_norm": 0.9526925982081641, + "learning_rate": 1.5356925676193192e-05, + "loss": 0.5918, + "step": 2775 + }, + { + "epoch": 0.34061349693251536, + "grad_norm": 0.9377733149472236, + "learning_rate": 1.5353569400019204e-05, + "loss": 0.5737, + "step": 2776 + }, + { + "epoch": 0.3407361963190184, + "grad_norm": 1.0086864669958957, + "learning_rate": 1.5350212278294952e-05, + "loss": 0.578, + "step": 2777 + }, + { + "epoch": 0.34085889570552147, + "grad_norm": 1.023318984454164, + "learning_rate": 1.5346854311550673e-05, + "loss": 0.6887, + "step": 2778 + }, + { + "epoch": 0.3409815950920245, 
+ "grad_norm": 0.9824472507265841, + "learning_rate": 1.534349550031672e-05, + "loss": 0.5832, + "step": 2779 + }, + { + "epoch": 0.3411042944785276, + "grad_norm": 0.907305376696462, + "learning_rate": 1.534013584512359e-05, + "loss": 0.6295, + "step": 2780 + }, + { + "epoch": 0.3412269938650307, + "grad_norm": 0.7821589193803996, + "learning_rate": 1.5336775346501917e-05, + "loss": 0.5777, + "step": 2781 + }, + { + "epoch": 0.34134969325153375, + "grad_norm": 0.8895416303235486, + "learning_rate": 1.5333414004982457e-05, + "loss": 0.609, + "step": 2782 + }, + { + "epoch": 0.3414723926380368, + "grad_norm": 0.8862612241105888, + "learning_rate": 1.5330051821096107e-05, + "loss": 0.5878, + "step": 2783 + }, + { + "epoch": 0.34159509202453986, + "grad_norm": 1.0471367516902137, + "learning_rate": 1.5326688795373892e-05, + "loss": 0.6449, + "step": 2784 + }, + { + "epoch": 0.34171779141104297, + "grad_norm": 1.2981722896493155, + "learning_rate": 1.5323324928346984e-05, + "loss": 0.6063, + "step": 2785 + }, + { + "epoch": 0.341840490797546, + "grad_norm": 1.1684096739257457, + "learning_rate": 1.531996022054666e-05, + "loss": 0.5755, + "step": 2786 + }, + { + "epoch": 0.3419631901840491, + "grad_norm": 0.8869334609878797, + "learning_rate": 1.5316594672504362e-05, + "loss": 0.5832, + "step": 2787 + }, + { + "epoch": 0.34208588957055214, + "grad_norm": 0.9134806787307488, + "learning_rate": 1.531322828475164e-05, + "loss": 0.563, + "step": 2788 + }, + { + "epoch": 0.3422085889570552, + "grad_norm": 0.9279375251695218, + "learning_rate": 1.530986105782019e-05, + "loss": 0.5874, + "step": 2789 + }, + { + "epoch": 0.3423312883435583, + "grad_norm": 1.021065670588168, + "learning_rate": 1.5306492992241836e-05, + "loss": 0.5956, + "step": 2790 + }, + { + "epoch": 0.34245398773006136, + "grad_norm": 0.9330869807725701, + "learning_rate": 1.5303124088548535e-05, + "loss": 0.5609, + "step": 2791 + }, + { + "epoch": 0.3425766871165644, + "grad_norm": 0.9407370676192598, + "learning_rate": 1.5299754347272375e-05, + "loss": 0.625, + "step": 2792 + }, + { + "epoch": 0.34269938650306747, + "grad_norm": 0.9354006577558313, + "learning_rate": 1.529638376894558e-05, + "loss": 0.6077, + "step": 2793 + }, + { + "epoch": 0.3428220858895705, + "grad_norm": 0.9081020393066701, + "learning_rate": 1.52930123541005e-05, + "loss": 0.6161, + "step": 2794 + }, + { + "epoch": 0.34294478527607364, + "grad_norm": 0.9153600415243248, + "learning_rate": 1.5289640103269626e-05, + "loss": 0.6377, + "step": 2795 + }, + { + "epoch": 0.3430674846625767, + "grad_norm": 1.0235226426102655, + "learning_rate": 1.5286267016985575e-05, + "loss": 0.6168, + "step": 2796 + }, + { + "epoch": 0.34319018404907975, + "grad_norm": 0.9734793413068273, + "learning_rate": 1.528289309578109e-05, + "loss": 0.6138, + "step": 2797 + }, + { + "epoch": 0.3433128834355828, + "grad_norm": 1.048751262525629, + "learning_rate": 1.5279518340189063e-05, + "loss": 0.6385, + "step": 2798 + }, + { + "epoch": 0.3434355828220859, + "grad_norm": 1.204320316038668, + "learning_rate": 1.52761427507425e-05, + "loss": 0.6651, + "step": 2799 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 0.943460592143929, + "learning_rate": 1.527276632797455e-05, + "loss": 0.6271, + "step": 2800 + }, + { + "epoch": 0.343680981595092, + "grad_norm": 0.9629416211983017, + "learning_rate": 1.5269389072418488e-05, + "loss": 0.6323, + "step": 2801 + }, + { + "epoch": 0.3438036809815951, + "grad_norm": 0.9726518872610258, + "learning_rate": 1.526601098460773e-05, + "loss": 0.6639, 
+ "step": 2802 + }, + { + "epoch": 0.34392638036809814, + "grad_norm": 1.0912010537852734, + "learning_rate": 1.5262632065075803e-05, + "loss": 0.6878, + "step": 2803 + }, + { + "epoch": 0.34404907975460125, + "grad_norm": 0.8743431151967944, + "learning_rate": 1.5259252314356387e-05, + "loss": 0.6015, + "step": 2804 + }, + { + "epoch": 0.3441717791411043, + "grad_norm": 0.9026902423392366, + "learning_rate": 1.5255871732983284e-05, + "loss": 0.5585, + "step": 2805 + }, + { + "epoch": 0.34429447852760736, + "grad_norm": 1.0047762850459228, + "learning_rate": 1.5252490321490424e-05, + "loss": 0.5523, + "step": 2806 + }, + { + "epoch": 0.3444171779141104, + "grad_norm": 0.9287246639052362, + "learning_rate": 1.5249108080411876e-05, + "loss": 0.5821, + "step": 2807 + }, + { + "epoch": 0.3445398773006135, + "grad_norm": 1.0001531451447985, + "learning_rate": 1.5245725010281834e-05, + "loss": 0.5852, + "step": 2808 + }, + { + "epoch": 0.3446625766871166, + "grad_norm": 0.8759391246399599, + "learning_rate": 1.5242341111634625e-05, + "loss": 0.5961, + "step": 2809 + }, + { + "epoch": 0.34478527607361964, + "grad_norm": 0.899085942161092, + "learning_rate": 1.5238956385004703e-05, + "loss": 0.5652, + "step": 2810 + }, + { + "epoch": 0.3449079754601227, + "grad_norm": 1.051903004969216, + "learning_rate": 1.5235570830926665e-05, + "loss": 0.6293, + "step": 2811 + }, + { + "epoch": 0.34503067484662575, + "grad_norm": 1.0536019623175592, + "learning_rate": 1.523218444993522e-05, + "loss": 0.6049, + "step": 2812 + }, + { + "epoch": 0.3451533742331288, + "grad_norm": 1.0006660747944052, + "learning_rate": 1.5228797242565229e-05, + "loss": 0.597, + "step": 2813 + }, + { + "epoch": 0.3452760736196319, + "grad_norm": 0.9171518259748632, + "learning_rate": 1.5225409209351663e-05, + "loss": 0.6379, + "step": 2814 + }, + { + "epoch": 0.345398773006135, + "grad_norm": 0.8084808831976328, + "learning_rate": 1.5222020350829636e-05, + "loss": 0.5392, + "step": 2815 + }, + { + "epoch": 0.34552147239263803, + "grad_norm": 1.0557753220056612, + "learning_rate": 1.5218630667534391e-05, + "loss": 0.5932, + "step": 2816 + }, + { + "epoch": 0.3456441717791411, + "grad_norm": 0.8774194767304317, + "learning_rate": 1.5215240160001294e-05, + "loss": 0.6129, + "step": 2817 + }, + { + "epoch": 0.3457668711656442, + "grad_norm": 0.9444485675108316, + "learning_rate": 1.5211848828765852e-05, + "loss": 0.6503, + "step": 2818 + }, + { + "epoch": 0.34588957055214725, + "grad_norm": 1.0903927320404996, + "learning_rate": 1.5208456674363693e-05, + "loss": 0.608, + "step": 2819 + }, + { + "epoch": 0.3460122699386503, + "grad_norm": 1.045759760302803, + "learning_rate": 1.5205063697330582e-05, + "loss": 0.6337, + "step": 2820 + }, + { + "epoch": 0.34613496932515336, + "grad_norm": 1.096220590588135, + "learning_rate": 1.520166989820241e-05, + "loss": 0.5516, + "step": 2821 + }, + { + "epoch": 0.3462576687116564, + "grad_norm": 1.0601835485699476, + "learning_rate": 1.5198275277515195e-05, + "loss": 0.6104, + "step": 2822 + }, + { + "epoch": 0.34638036809815953, + "grad_norm": 0.984241175681994, + "learning_rate": 1.519487983580509e-05, + "loss": 0.6172, + "step": 2823 + }, + { + "epoch": 0.3465030674846626, + "grad_norm": 1.0147129247282243, + "learning_rate": 1.519148357360838e-05, + "loss": 0.5851, + "step": 2824 + }, + { + "epoch": 0.34662576687116564, + "grad_norm": 0.9361091026863501, + "learning_rate": 1.5188086491461467e-05, + "loss": 0.5735, + "step": 2825 + }, + { + "epoch": 0.3467484662576687, + "grad_norm": 
0.945514401474194, + "learning_rate": 1.5184688589900898e-05, + "loss": 0.6215, + "step": 2826 + }, + { + "epoch": 0.34687116564417175, + "grad_norm": 0.9972946628717423, + "learning_rate": 1.5181289869463338e-05, + "loss": 0.6014, + "step": 2827 + }, + { + "epoch": 0.34699386503067486, + "grad_norm": 0.9607899006865112, + "learning_rate": 1.5177890330685588e-05, + "loss": 0.5988, + "step": 2828 + }, + { + "epoch": 0.3471165644171779, + "grad_norm": 1.0997538709180215, + "learning_rate": 1.5174489974104574e-05, + "loss": 0.6146, + "step": 2829 + }, + { + "epoch": 0.347239263803681, + "grad_norm": 0.7808751161427581, + "learning_rate": 1.5171088800257354e-05, + "loss": 0.5528, + "step": 2830 + }, + { + "epoch": 0.34736196319018403, + "grad_norm": 0.924060752323268, + "learning_rate": 1.5167686809681117e-05, + "loss": 0.6058, + "step": 2831 + }, + { + "epoch": 0.34748466257668714, + "grad_norm": 1.0214699917341659, + "learning_rate": 1.5164284002913174e-05, + "loss": 0.6588, + "step": 2832 + }, + { + "epoch": 0.3476073619631902, + "grad_norm": 0.8577447859856664, + "learning_rate": 1.5160880380490972e-05, + "loss": 0.5753, + "step": 2833 + }, + { + "epoch": 0.34773006134969325, + "grad_norm": 0.9577761909509362, + "learning_rate": 1.5157475942952085e-05, + "loss": 0.6486, + "step": 2834 + }, + { + "epoch": 0.3478527607361963, + "grad_norm": 0.9175584773537947, + "learning_rate": 1.5154070690834211e-05, + "loss": 0.6117, + "step": 2835 + }, + { + "epoch": 0.34797546012269936, + "grad_norm": 0.9004015137962281, + "learning_rate": 1.5150664624675183e-05, + "loss": 0.6866, + "step": 2836 + }, + { + "epoch": 0.3480981595092025, + "grad_norm": 0.936821289638196, + "learning_rate": 1.5147257745012956e-05, + "loss": 0.6585, + "step": 2837 + }, + { + "epoch": 0.34822085889570553, + "grad_norm": 0.972587281572083, + "learning_rate": 1.514385005238562e-05, + "loss": 0.6122, + "step": 2838 + }, + { + "epoch": 0.3483435582822086, + "grad_norm": 0.8829912824908248, + "learning_rate": 1.514044154733139e-05, + "loss": 0.5742, + "step": 2839 + }, + { + "epoch": 0.34846625766871164, + "grad_norm": 1.1543370489059446, + "learning_rate": 1.5137032230388613e-05, + "loss": 0.654, + "step": 2840 + }, + { + "epoch": 0.3485889570552147, + "grad_norm": 1.0635325330521923, + "learning_rate": 1.5133622102095755e-05, + "loss": 0.5895, + "step": 2841 + }, + { + "epoch": 0.3487116564417178, + "grad_norm": 1.0228204158107348, + "learning_rate": 1.5130211162991424e-05, + "loss": 0.5642, + "step": 2842 + }, + { + "epoch": 0.34883435582822087, + "grad_norm": 0.9481477684461003, + "learning_rate": 1.5126799413614346e-05, + "loss": 0.6407, + "step": 2843 + }, + { + "epoch": 0.3489570552147239, + "grad_norm": 0.944700310103083, + "learning_rate": 1.5123386854503373e-05, + "loss": 0.5805, + "step": 2844 + }, + { + "epoch": 0.349079754601227, + "grad_norm": 0.8923019866009297, + "learning_rate": 1.5119973486197497e-05, + "loss": 0.548, + "step": 2845 + }, + { + "epoch": 0.3492024539877301, + "grad_norm": 1.0316489636057218, + "learning_rate": 1.5116559309235825e-05, + "loss": 0.5973, + "step": 2846 + }, + { + "epoch": 0.34932515337423314, + "grad_norm": 1.1238509412365787, + "learning_rate": 1.5113144324157596e-05, + "loss": 0.6496, + "step": 2847 + }, + { + "epoch": 0.3494478527607362, + "grad_norm": 0.9772701575910127, + "learning_rate": 1.5109728531502179e-05, + "loss": 0.6356, + "step": 2848 + }, + { + "epoch": 0.34957055214723926, + "grad_norm": 1.2162572490939203, + "learning_rate": 1.510631193180907e-05, + "loss": 0.606, + 
"step": 2849 + }, + { + "epoch": 0.3496932515337423, + "grad_norm": 1.2737535166791238, + "learning_rate": 1.5102894525617892e-05, + "loss": 0.6179, + "step": 2850 + }, + { + "epoch": 0.3498159509202454, + "grad_norm": 0.8776820620043813, + "learning_rate": 1.5099476313468396e-05, + "loss": 0.5896, + "step": 2851 + }, + { + "epoch": 0.3499386503067485, + "grad_norm": 0.8743148325116475, + "learning_rate": 1.5096057295900455e-05, + "loss": 0.5776, + "step": 2852 + }, + { + "epoch": 0.35006134969325153, + "grad_norm": 1.0244538888099353, + "learning_rate": 1.5092637473454077e-05, + "loss": 0.626, + "step": 2853 + }, + { + "epoch": 0.3501840490797546, + "grad_norm": 0.9211285236840381, + "learning_rate": 1.508921684666939e-05, + "loss": 0.5983, + "step": 2854 + }, + { + "epoch": 0.35030674846625764, + "grad_norm": 0.8958424947873158, + "learning_rate": 1.5085795416086655e-05, + "loss": 0.5963, + "step": 2855 + }, + { + "epoch": 0.35042944785276076, + "grad_norm": 0.9951629462281869, + "learning_rate": 1.5082373182246256e-05, + "loss": 0.6235, + "step": 2856 + }, + { + "epoch": 0.3505521472392638, + "grad_norm": 1.0653676474117522, + "learning_rate": 1.507895014568871e-05, + "loss": 0.602, + "step": 2857 + }, + { + "epoch": 0.35067484662576687, + "grad_norm": 0.9564273515104564, + "learning_rate": 1.5075526306954653e-05, + "loss": 0.5931, + "step": 2858 + }, + { + "epoch": 0.3507975460122699, + "grad_norm": 0.925172234378432, + "learning_rate": 1.507210166658485e-05, + "loss": 0.575, + "step": 2859 + }, + { + "epoch": 0.350920245398773, + "grad_norm": 0.9427774590712413, + "learning_rate": 1.5068676225120196e-05, + "loss": 0.5934, + "step": 2860 + }, + { + "epoch": 0.3510429447852761, + "grad_norm": 0.8914746180486628, + "learning_rate": 1.5065249983101706e-05, + "loss": 0.5445, + "step": 2861 + }, + { + "epoch": 0.35116564417177915, + "grad_norm": 0.8526059503488724, + "learning_rate": 1.506182294107053e-05, + "loss": 0.6421, + "step": 2862 + }, + { + "epoch": 0.3512883435582822, + "grad_norm": 0.8847088554439498, + "learning_rate": 1.5058395099567935e-05, + "loss": 0.5733, + "step": 2863 + }, + { + "epoch": 0.35141104294478526, + "grad_norm": 0.8791809569489152, + "learning_rate": 1.5054966459135323e-05, + "loss": 0.5874, + "step": 2864 + }, + { + "epoch": 0.35153374233128837, + "grad_norm": 0.8738132292114087, + "learning_rate": 1.5051537020314218e-05, + "loss": 0.6619, + "step": 2865 + }, + { + "epoch": 0.3516564417177914, + "grad_norm": 0.8856412274640997, + "learning_rate": 1.504810678364627e-05, + "loss": 0.5783, + "step": 2866 + }, + { + "epoch": 0.3517791411042945, + "grad_norm": 1.0097388022849185, + "learning_rate": 1.5044675749673254e-05, + "loss": 0.6292, + "step": 2867 + }, + { + "epoch": 0.35190184049079754, + "grad_norm": 0.9268203696609292, + "learning_rate": 1.5041243918937071e-05, + "loss": 0.5662, + "step": 2868 + }, + { + "epoch": 0.3520245398773006, + "grad_norm": 0.8955059215884368, + "learning_rate": 1.5037811291979752e-05, + "loss": 0.566, + "step": 2869 + }, + { + "epoch": 0.3521472392638037, + "grad_norm": 0.9006463014450762, + "learning_rate": 1.5034377869343453e-05, + "loss": 0.616, + "step": 2870 + }, + { + "epoch": 0.35226993865030676, + "grad_norm": 0.8902305447139258, + "learning_rate": 1.5030943651570447e-05, + "loss": 0.6891, + "step": 2871 + }, + { + "epoch": 0.3523926380368098, + "grad_norm": 0.8747616445569123, + "learning_rate": 1.5027508639203148e-05, + "loss": 0.5906, + "step": 2872 + }, + { + "epoch": 0.35251533742331287, + "grad_norm": 
0.9588966915175048, + "learning_rate": 1.5024072832784078e-05, + "loss": 0.6452, + "step": 2873 + }, + { + "epoch": 0.3526380368098159, + "grad_norm": 0.8762979346737437, + "learning_rate": 1.5020636232855896e-05, + "loss": 0.5997, + "step": 2874 + }, + { + "epoch": 0.35276073619631904, + "grad_norm": 0.8577121841750953, + "learning_rate": 1.5017198839961388e-05, + "loss": 0.616, + "step": 2875 + }, + { + "epoch": 0.3528834355828221, + "grad_norm": 0.9179077083129625, + "learning_rate": 1.5013760654643456e-05, + "loss": 0.6193, + "step": 2876 + }, + { + "epoch": 0.35300613496932515, + "grad_norm": 0.9650618458616914, + "learning_rate": 1.501032167744513e-05, + "loss": 0.6008, + "step": 2877 + }, + { + "epoch": 0.3531288343558282, + "grad_norm": 0.8541756386632966, + "learning_rate": 1.5006881908909571e-05, + "loss": 0.5947, + "step": 2878 + }, + { + "epoch": 0.3532515337423313, + "grad_norm": 1.0147839789911972, + "learning_rate": 1.5003441349580059e-05, + "loss": 0.6511, + "step": 2879 + }, + { + "epoch": 0.35337423312883437, + "grad_norm": 0.9542518773921206, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.5784, + "step": 2880 + }, + { + "epoch": 0.3534969325153374, + "grad_norm": 0.9109144568639301, + "learning_rate": 1.499655786071293e-05, + "loss": 0.5789, + "step": 2881 + }, + { + "epoch": 0.3536196319018405, + "grad_norm": 0.9799920091192821, + "learning_rate": 1.49931149322625e-05, + "loss": 0.6174, + "step": 2882 + }, + { + "epoch": 0.35374233128834354, + "grad_norm": 0.8818490274646231, + "learning_rate": 1.4989671215192492e-05, + "loss": 0.6011, + "step": 2883 + }, + { + "epoch": 0.35386503067484665, + "grad_norm": 0.954254102347903, + "learning_rate": 1.4986226710046813e-05, + "loss": 0.6084, + "step": 2884 + }, + { + "epoch": 0.3539877300613497, + "grad_norm": 0.9109921455783808, + "learning_rate": 1.4982781417369496e-05, + "loss": 0.6505, + "step": 2885 + }, + { + "epoch": 0.35411042944785276, + "grad_norm": 0.957483936533579, + "learning_rate": 1.497933533770469e-05, + "loss": 0.589, + "step": 2886 + }, + { + "epoch": 0.3542331288343558, + "grad_norm": 0.8683690329947671, + "learning_rate": 1.4975888471596673e-05, + "loss": 0.6356, + "step": 2887 + }, + { + "epoch": 0.35435582822085887, + "grad_norm": 0.994133294458263, + "learning_rate": 1.4972440819589853e-05, + "loss": 0.6121, + "step": 2888 + }, + { + "epoch": 0.354478527607362, + "grad_norm": 0.8633562664132279, + "learning_rate": 1.4968992382228754e-05, + "loss": 0.6283, + "step": 2889 + }, + { + "epoch": 0.35460122699386504, + "grad_norm": 1.0526470180968988, + "learning_rate": 1.4965543160058028e-05, + "loss": 0.617, + "step": 2890 + }, + { + "epoch": 0.3547239263803681, + "grad_norm": 0.8775355375075391, + "learning_rate": 1.4962093153622445e-05, + "loss": 0.6599, + "step": 2891 + }, + { + "epoch": 0.35484662576687115, + "grad_norm": 0.8279967319813246, + "learning_rate": 1.4958642363466912e-05, + "loss": 0.6205, + "step": 2892 + }, + { + "epoch": 0.35496932515337426, + "grad_norm": 1.0851407239972846, + "learning_rate": 1.495519079013645e-05, + "loss": 0.6319, + "step": 2893 + }, + { + "epoch": 0.3550920245398773, + "grad_norm": 1.0570730136221709, + "learning_rate": 1.4951738434176197e-05, + "loss": 0.6464, + "step": 2894 + }, + { + "epoch": 0.35521472392638037, + "grad_norm": 1.135968988361215, + "learning_rate": 1.4948285296131435e-05, + "loss": 0.6261, + "step": 2895 + }, + { + "epoch": 0.3553374233128834, + "grad_norm": 0.9868286932392512, + "learning_rate": 1.4944831376547549e-05, + "loss": 0.6067, + 
"step": 2896 + }, + { + "epoch": 0.3554601226993865, + "grad_norm": 1.037764622458221, + "learning_rate": 1.4941376675970058e-05, + "loss": 0.6046, + "step": 2897 + }, + { + "epoch": 0.3555828220858896, + "grad_norm": 0.8632062292107531, + "learning_rate": 1.4937921194944605e-05, + "loss": 0.6, + "step": 2898 + }, + { + "epoch": 0.35570552147239265, + "grad_norm": 0.9421948952220689, + "learning_rate": 1.4934464934016948e-05, + "loss": 0.5966, + "step": 2899 + }, + { + "epoch": 0.3558282208588957, + "grad_norm": 0.9361870879201384, + "learning_rate": 1.4931007893732981e-05, + "loss": 0.5965, + "step": 2900 + }, + { + "epoch": 0.35595092024539876, + "grad_norm": 0.903163819833144, + "learning_rate": 1.4927550074638708e-05, + "loss": 0.5985, + "step": 2901 + }, + { + "epoch": 0.3560736196319018, + "grad_norm": 0.8391313205398224, + "learning_rate": 1.4924091477280262e-05, + "loss": 0.6048, + "step": 2902 + }, + { + "epoch": 0.3561963190184049, + "grad_norm": 0.8858131507483412, + "learning_rate": 1.4920632102203902e-05, + "loss": 0.6118, + "step": 2903 + }, + { + "epoch": 0.356319018404908, + "grad_norm": 0.985510691831022, + "learning_rate": 1.4917171949956004e-05, + "loss": 0.6054, + "step": 2904 + }, + { + "epoch": 0.35644171779141104, + "grad_norm": 0.9889029008366262, + "learning_rate": 1.4913711021083071e-05, + "loss": 0.6034, + "step": 2905 + }, + { + "epoch": 0.3565644171779141, + "grad_norm": 0.969712550730888, + "learning_rate": 1.4910249316131728e-05, + "loss": 0.5924, + "step": 2906 + }, + { + "epoch": 0.35668711656441715, + "grad_norm": 0.9646572062899482, + "learning_rate": 1.4906786835648719e-05, + "loss": 0.6149, + "step": 2907 + }, + { + "epoch": 0.35680981595092026, + "grad_norm": 0.9246902053938476, + "learning_rate": 1.490332358018091e-05, + "loss": 0.5988, + "step": 2908 + }, + { + "epoch": 0.3569325153374233, + "grad_norm": 0.9962849205759721, + "learning_rate": 1.48998595502753e-05, + "loss": 0.5655, + "step": 2909 + }, + { + "epoch": 0.3570552147239264, + "grad_norm": 0.9812309620969606, + "learning_rate": 1.4896394746478995e-05, + "loss": 0.5922, + "step": 2910 + }, + { + "epoch": 0.35717791411042943, + "grad_norm": 0.9354978776421746, + "learning_rate": 1.4892929169339237e-05, + "loss": 0.5947, + "step": 2911 + }, + { + "epoch": 0.35730061349693254, + "grad_norm": 0.9203384760482909, + "learning_rate": 1.4889462819403381e-05, + "loss": 0.6169, + "step": 2912 + }, + { + "epoch": 0.3574233128834356, + "grad_norm": 1.008594115234822, + "learning_rate": 1.4885995697218908e-05, + "loss": 0.6185, + "step": 2913 + }, + { + "epoch": 0.35754601226993865, + "grad_norm": 0.95948553484938, + "learning_rate": 1.4882527803333422e-05, + "loss": 0.6407, + "step": 2914 + }, + { + "epoch": 0.3576687116564417, + "grad_norm": 0.8343189284919252, + "learning_rate": 1.4879059138294647e-05, + "loss": 0.6032, + "step": 2915 + }, + { + "epoch": 0.35779141104294476, + "grad_norm": 1.3197137094588585, + "learning_rate": 1.4875589702650425e-05, + "loss": 0.6068, + "step": 2916 + }, + { + "epoch": 0.3579141104294479, + "grad_norm": 0.9692014391851115, + "learning_rate": 1.4872119496948724e-05, + "loss": 0.5454, + "step": 2917 + }, + { + "epoch": 0.35803680981595093, + "grad_norm": 0.893845936450848, + "learning_rate": 1.4868648521737638e-05, + "loss": 0.6003, + "step": 2918 + }, + { + "epoch": 0.358159509202454, + "grad_norm": 0.9600598402768772, + "learning_rate": 1.4865176777565374e-05, + "loss": 0.6282, + "step": 2919 + }, + { + "epoch": 0.35828220858895704, + "grad_norm": 
1.0799497716759694, + "learning_rate": 1.4861704264980264e-05, + "loss": 0.62, + "step": 2920 + }, + { + "epoch": 0.3584049079754601, + "grad_norm": 0.9174308369144375, + "learning_rate": 1.4858230984530759e-05, + "loss": 0.6372, + "step": 2921 + }, + { + "epoch": 0.3585276073619632, + "grad_norm": 1.0234328696824424, + "learning_rate": 1.485475693676544e-05, + "loss": 0.6287, + "step": 2922 + }, + { + "epoch": 0.35865030674846626, + "grad_norm": 0.9054217695434273, + "learning_rate": 1.4851282122232999e-05, + "loss": 0.6219, + "step": 2923 + }, + { + "epoch": 0.3587730061349693, + "grad_norm": 1.0241490376403624, + "learning_rate": 1.4847806541482253e-05, + "loss": 0.5702, + "step": 2924 + }, + { + "epoch": 0.3588957055214724, + "grad_norm": 1.030296938644599, + "learning_rate": 1.4844330195062145e-05, + "loss": 0.602, + "step": 2925 + }, + { + "epoch": 0.3590184049079755, + "grad_norm": 0.794869758906967, + "learning_rate": 1.4840853083521726e-05, + "loss": 0.6043, + "step": 2926 + }, + { + "epoch": 0.35914110429447854, + "grad_norm": 0.979326536780529, + "learning_rate": 1.4837375207410182e-05, + "loss": 0.6308, + "step": 2927 + }, + { + "epoch": 0.3592638036809816, + "grad_norm": 1.0211458571320755, + "learning_rate": 1.4833896567276812e-05, + "loss": 0.5732, + "step": 2928 + }, + { + "epoch": 0.35938650306748465, + "grad_norm": 0.9294473455506953, + "learning_rate": 1.4830417163671036e-05, + "loss": 0.6178, + "step": 2929 + }, + { + "epoch": 0.3595092024539877, + "grad_norm": 0.9597049910084645, + "learning_rate": 1.4826936997142399e-05, + "loss": 0.6281, + "step": 2930 + }, + { + "epoch": 0.3596319018404908, + "grad_norm": 0.8471781143889012, + "learning_rate": 1.4823456068240558e-05, + "loss": 0.6148, + "step": 2931 + }, + { + "epoch": 0.3597546012269939, + "grad_norm": 0.9559451440752239, + "learning_rate": 1.4819974377515301e-05, + "loss": 0.6494, + "step": 2932 + }, + { + "epoch": 0.35987730061349693, + "grad_norm": 0.8423736134258397, + "learning_rate": 1.4816491925516528e-05, + "loss": 0.5608, + "step": 2933 + }, + { + "epoch": 0.36, + "grad_norm": 0.975667819940145, + "learning_rate": 1.4813008712794263e-05, + "loss": 0.5867, + "step": 2934 + }, + { + "epoch": 0.36012269938650304, + "grad_norm": 0.9003654655094538, + "learning_rate": 1.4809524739898651e-05, + "loss": 0.5489, + "step": 2935 + }, + { + "epoch": 0.36024539877300615, + "grad_norm": 0.9529501062871729, + "learning_rate": 1.4806040007379955e-05, + "loss": 0.6321, + "step": 2936 + }, + { + "epoch": 0.3603680981595092, + "grad_norm": 0.8283836366722507, + "learning_rate": 1.4802554515788559e-05, + "loss": 0.6089, + "step": 2937 + }, + { + "epoch": 0.36049079754601226, + "grad_norm": 0.909616118160054, + "learning_rate": 1.4799068265674961e-05, + "loss": 0.6327, + "step": 2938 + }, + { + "epoch": 0.3606134969325153, + "grad_norm": 0.9955299761166897, + "learning_rate": 1.4795581257589793e-05, + "loss": 0.6115, + "step": 2939 + }, + { + "epoch": 0.36073619631901843, + "grad_norm": 1.048193347968661, + "learning_rate": 1.4792093492083792e-05, + "loss": 0.6629, + "step": 2940 + }, + { + "epoch": 0.3608588957055215, + "grad_norm": 0.9382576745960024, + "learning_rate": 1.4788604969707823e-05, + "loss": 0.6379, + "step": 2941 + }, + { + "epoch": 0.36098159509202454, + "grad_norm": 0.831944304256972, + "learning_rate": 1.4785115691012866e-05, + "loss": 0.5921, + "step": 2942 + }, + { + "epoch": 0.3611042944785276, + "grad_norm": 0.9405761898783264, + "learning_rate": 1.4781625656550024e-05, + "loss": 0.5947, + "step": 2943 + 
}, + { + "epoch": 0.36122699386503065, + "grad_norm": 0.8343732125832914, + "learning_rate": 1.477813486687052e-05, + "loss": 0.5718, + "step": 2944 + }, + { + "epoch": 0.36134969325153377, + "grad_norm": 1.1671512531418549, + "learning_rate": 1.4774643322525691e-05, + "loss": 0.62, + "step": 2945 + }, + { + "epoch": 0.3614723926380368, + "grad_norm": 0.8864318219002085, + "learning_rate": 1.4771151024067e-05, + "loss": 0.6157, + "step": 2946 + }, + { + "epoch": 0.3615950920245399, + "grad_norm": 0.907359026212404, + "learning_rate": 1.4767657972046022e-05, + "loss": 0.569, + "step": 2947 + }, + { + "epoch": 0.36171779141104293, + "grad_norm": 1.1033408869801318, + "learning_rate": 1.4764164167014451e-05, + "loss": 0.6297, + "step": 2948 + }, + { + "epoch": 0.361840490797546, + "grad_norm": 1.0036255151542652, + "learning_rate": 1.4760669609524114e-05, + "loss": 0.5958, + "step": 2949 + }, + { + "epoch": 0.3619631901840491, + "grad_norm": 0.9296500745868302, + "learning_rate": 1.4757174300126935e-05, + "loss": 0.6019, + "step": 2950 + }, + { + "epoch": 0.36208588957055216, + "grad_norm": 0.9688955057326802, + "learning_rate": 1.4753678239374977e-05, + "loss": 0.6464, + "step": 2951 + }, + { + "epoch": 0.3622085889570552, + "grad_norm": 0.8919620035998569, + "learning_rate": 1.4750181427820407e-05, + "loss": 0.6115, + "step": 2952 + }, + { + "epoch": 0.36233128834355827, + "grad_norm": 1.2790639934197816, + "learning_rate": 1.474668386601552e-05, + "loss": 0.6042, + "step": 2953 + }, + { + "epoch": 0.3624539877300613, + "grad_norm": 0.8958868550905498, + "learning_rate": 1.4743185554512725e-05, + "loss": 0.584, + "step": 2954 + }, + { + "epoch": 0.36257668711656443, + "grad_norm": 0.9380963594462993, + "learning_rate": 1.473968649386455e-05, + "loss": 0.5687, + "step": 2955 + }, + { + "epoch": 0.3626993865030675, + "grad_norm": 1.0004534061546388, + "learning_rate": 1.4736186684623639e-05, + "loss": 0.5969, + "step": 2956 + }, + { + "epoch": 0.36282208588957054, + "grad_norm": 0.8863758818471793, + "learning_rate": 1.473268612734276e-05, + "loss": 0.592, + "step": 2957 + }, + { + "epoch": 0.3629447852760736, + "grad_norm": 0.8787952846067341, + "learning_rate": 1.4729184822574794e-05, + "loss": 0.5664, + "step": 2958 + }, + { + "epoch": 0.3630674846625767, + "grad_norm": 1.0671689385892993, + "learning_rate": 1.4725682770872742e-05, + "loss": 0.6114, + "step": 2959 + }, + { + "epoch": 0.36319018404907977, + "grad_norm": 1.134168403472036, + "learning_rate": 1.4722179972789725e-05, + "loss": 0.5846, + "step": 2960 + }, + { + "epoch": 0.3633128834355828, + "grad_norm": 0.9693287915715982, + "learning_rate": 1.471867642887898e-05, + "loss": 0.5806, + "step": 2961 + }, + { + "epoch": 0.3634355828220859, + "grad_norm": 0.9788317328054642, + "learning_rate": 1.4715172139693859e-05, + "loss": 0.5828, + "step": 2962 + }, + { + "epoch": 0.36355828220858893, + "grad_norm": 0.9423240632966737, + "learning_rate": 1.4711667105787835e-05, + "loss": 0.5874, + "step": 2963 + }, + { + "epoch": 0.36368098159509205, + "grad_norm": 0.8870200778180205, + "learning_rate": 1.4708161327714499e-05, + "loss": 0.5725, + "step": 2964 + }, + { + "epoch": 0.3638036809815951, + "grad_norm": 0.9768260863574915, + "learning_rate": 1.4704654806027558e-05, + "loss": 0.5612, + "step": 2965 + }, + { + "epoch": 0.36392638036809816, + "grad_norm": 0.9966197813582827, + "learning_rate": 1.4701147541280836e-05, + "loss": 0.6713, + "step": 2966 + }, + { + "epoch": 0.3640490797546012, + "grad_norm": 0.8944524686818571, + 
"learning_rate": 1.469763953402828e-05, + "loss": 0.6505, + "step": 2967 + }, + { + "epoch": 0.36417177914110427, + "grad_norm": 1.2379223392893524, + "learning_rate": 1.4694130784823941e-05, + "loss": 0.5898, + "step": 2968 + }, + { + "epoch": 0.3642944785276074, + "grad_norm": 1.0004139580033475, + "learning_rate": 1.4690621294222002e-05, + "loss": 0.5764, + "step": 2969 + }, + { + "epoch": 0.36441717791411044, + "grad_norm": 0.9935223488449707, + "learning_rate": 1.4687111062776758e-05, + "loss": 0.6255, + "step": 2970 + }, + { + "epoch": 0.3645398773006135, + "grad_norm": 0.9743523117553543, + "learning_rate": 1.4683600091042616e-05, + "loss": 0.5887, + "step": 2971 + }, + { + "epoch": 0.36466257668711655, + "grad_norm": 0.9998807322699447, + "learning_rate": 1.4680088379574104e-05, + "loss": 0.6562, + "step": 2972 + }, + { + "epoch": 0.36478527607361966, + "grad_norm": 0.9604707812400028, + "learning_rate": 1.4676575928925869e-05, + "loss": 0.6018, + "step": 2973 + }, + { + "epoch": 0.3649079754601227, + "grad_norm": 1.1810511959556689, + "learning_rate": 1.4673062739652673e-05, + "loss": 0.6044, + "step": 2974 + }, + { + "epoch": 0.36503067484662577, + "grad_norm": 0.9013449055703805, + "learning_rate": 1.466954881230939e-05, + "loss": 0.6112, + "step": 2975 + }, + { + "epoch": 0.3651533742331288, + "grad_norm": 1.3121623671405638, + "learning_rate": 1.4666034147451019e-05, + "loss": 0.5615, + "step": 2976 + }, + { + "epoch": 0.3652760736196319, + "grad_norm": 1.0851117267974384, + "learning_rate": 1.4662518745632666e-05, + "loss": 0.6359, + "step": 2977 + }, + { + "epoch": 0.365398773006135, + "grad_norm": 0.8334055376606315, + "learning_rate": 1.4659002607409565e-05, + "loss": 0.6477, + "step": 2978 + }, + { + "epoch": 0.36552147239263805, + "grad_norm": 0.8310839004391128, + "learning_rate": 1.4655485733337051e-05, + "loss": 0.5967, + "step": 2979 + }, + { + "epoch": 0.3656441717791411, + "grad_norm": 1.0233438090350022, + "learning_rate": 1.4651968123970592e-05, + "loss": 0.6126, + "step": 2980 + }, + { + "epoch": 0.36576687116564416, + "grad_norm": 1.0334076000273038, + "learning_rate": 1.4648449779865763e-05, + "loss": 0.6271, + "step": 2981 + }, + { + "epoch": 0.3658895705521472, + "grad_norm": 0.9522748048296571, + "learning_rate": 1.4644930701578252e-05, + "loss": 0.6173, + "step": 2982 + }, + { + "epoch": 0.3660122699386503, + "grad_norm": 1.057210522956901, + "learning_rate": 1.4641410889663874e-05, + "loss": 0.6142, + "step": 2983 + }, + { + "epoch": 0.3661349693251534, + "grad_norm": 0.883768858437231, + "learning_rate": 1.4637890344678547e-05, + "loss": 0.6207, + "step": 2984 + }, + { + "epoch": 0.36625766871165644, + "grad_norm": 0.9775394341144641, + "learning_rate": 1.4634369067178312e-05, + "loss": 0.6015, + "step": 2985 + }, + { + "epoch": 0.3663803680981595, + "grad_norm": 0.8932871644486975, + "learning_rate": 1.4630847057719326e-05, + "loss": 0.6278, + "step": 2986 + }, + { + "epoch": 0.3665030674846626, + "grad_norm": 0.9931994831952909, + "learning_rate": 1.4627324316857858e-05, + "loss": 0.5913, + "step": 2987 + }, + { + "epoch": 0.36662576687116566, + "grad_norm": 1.010568994364846, + "learning_rate": 1.4623800845150298e-05, + "loss": 0.6321, + "step": 2988 + }, + { + "epoch": 0.3667484662576687, + "grad_norm": 0.9689713130895837, + "learning_rate": 1.4620276643153144e-05, + "loss": 0.6175, + "step": 2989 + }, + { + "epoch": 0.36687116564417177, + "grad_norm": 0.9030355614828921, + "learning_rate": 1.4616751711423016e-05, + "loss": 0.6069, + "step": 2990 + 
}, + { + "epoch": 0.3669938650306748, + "grad_norm": 0.8742091416991785, + "learning_rate": 1.4613226050516647e-05, + "loss": 0.5762, + "step": 2991 + }, + { + "epoch": 0.36711656441717794, + "grad_norm": 0.8846714924933575, + "learning_rate": 1.4609699660990884e-05, + "loss": 0.6024, + "step": 2992 + }, + { + "epoch": 0.367239263803681, + "grad_norm": 1.081670565225486, + "learning_rate": 1.4606172543402685e-05, + "loss": 0.6204, + "step": 2993 + }, + { + "epoch": 0.36736196319018405, + "grad_norm": 1.0122318752491162, + "learning_rate": 1.4602644698309139e-05, + "loss": 0.5955, + "step": 2994 + }, + { + "epoch": 0.3674846625766871, + "grad_norm": 1.0344394381906297, + "learning_rate": 1.4599116126267431e-05, + "loss": 0.5973, + "step": 2995 + }, + { + "epoch": 0.36760736196319016, + "grad_norm": 0.9503358015549178, + "learning_rate": 1.4595586827834869e-05, + "loss": 0.6922, + "step": 2996 + }, + { + "epoch": 0.36773006134969327, + "grad_norm": 0.8781702514128202, + "learning_rate": 1.4592056803568876e-05, + "loss": 0.5738, + "step": 2997 + }, + { + "epoch": 0.3678527607361963, + "grad_norm": 0.8640430810013853, + "learning_rate": 1.4588526054026992e-05, + "loss": 0.6282, + "step": 2998 + }, + { + "epoch": 0.3679754601226994, + "grad_norm": 0.9078633802848762, + "learning_rate": 1.4584994579766865e-05, + "loss": 0.599, + "step": 2999 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.9070771245573545, + "learning_rate": 1.4581462381346261e-05, + "loss": 0.6174, + "step": 3000 + }, + { + "epoch": 0.3682208588957055, + "grad_norm": 0.9020010718532157, + "learning_rate": 1.4577929459323061e-05, + "loss": 0.5698, + "step": 3001 + }, + { + "epoch": 0.3683435582822086, + "grad_norm": 1.1252285051296118, + "learning_rate": 1.457439581425526e-05, + "loss": 0.6434, + "step": 3002 + }, + { + "epoch": 0.36846625766871166, + "grad_norm": 0.8644192001728949, + "learning_rate": 1.4570861446700968e-05, + "loss": 0.6112, + "step": 3003 + }, + { + "epoch": 0.3685889570552147, + "grad_norm": 0.9474231429300036, + "learning_rate": 1.4567326357218408e-05, + "loss": 0.5364, + "step": 3004 + }, + { + "epoch": 0.36871165644171777, + "grad_norm": 0.9738247923323781, + "learning_rate": 1.4563790546365914e-05, + "loss": 0.619, + "step": 3005 + }, + { + "epoch": 0.3688343558282209, + "grad_norm": 0.8499372244145481, + "learning_rate": 1.456025401470194e-05, + "loss": 0.5824, + "step": 3006 + }, + { + "epoch": 0.36895705521472394, + "grad_norm": 0.9784724543364458, + "learning_rate": 1.4556716762785047e-05, + "loss": 0.6159, + "step": 3007 + }, + { + "epoch": 0.369079754601227, + "grad_norm": 0.9913144508253315, + "learning_rate": 1.455317879117392e-05, + "loss": 0.6267, + "step": 3008 + }, + { + "epoch": 0.36920245398773005, + "grad_norm": 0.9562882580273971, + "learning_rate": 1.4549640100427345e-05, + "loss": 0.5937, + "step": 3009 + }, + { + "epoch": 0.3693251533742331, + "grad_norm": 0.8489968812653199, + "learning_rate": 1.454610069110423e-05, + "loss": 0.5587, + "step": 3010 + }, + { + "epoch": 0.3694478527607362, + "grad_norm": 0.9590799295086633, + "learning_rate": 1.4542560563763594e-05, + "loss": 0.5478, + "step": 3011 + }, + { + "epoch": 0.3695705521472393, + "grad_norm": 0.965805767076632, + "learning_rate": 1.453901971896457e-05, + "loss": 0.6699, + "step": 3012 + }, + { + "epoch": 0.36969325153374233, + "grad_norm": 0.9286148052391603, + "learning_rate": 1.4535478157266405e-05, + "loss": 0.6618, + "step": 3013 + }, + { + "epoch": 0.3698159509202454, + "grad_norm": 0.8340053037844873, + 
"learning_rate": 1.4531935879228457e-05, + "loss": 0.5773, + "step": 3014 + }, + { + "epoch": 0.36993865030674844, + "grad_norm": 0.9201751382623384, + "learning_rate": 1.45283928854102e-05, + "loss": 0.6469, + "step": 3015 + }, + { + "epoch": 0.37006134969325155, + "grad_norm": 0.9245213548369573, + "learning_rate": 1.4524849176371219e-05, + "loss": 0.5788, + "step": 3016 + }, + { + "epoch": 0.3701840490797546, + "grad_norm": 1.1288686191481696, + "learning_rate": 1.4521304752671209e-05, + "loss": 0.6407, + "step": 3017 + }, + { + "epoch": 0.37030674846625766, + "grad_norm": 0.8941880880151194, + "learning_rate": 1.4517759614869981e-05, + "loss": 0.5886, + "step": 3018 + }, + { + "epoch": 0.3704294478527607, + "grad_norm": 1.0493512012992947, + "learning_rate": 1.4514213763527467e-05, + "loss": 0.6141, + "step": 3019 + }, + { + "epoch": 0.37055214723926383, + "grad_norm": 1.0792321210494333, + "learning_rate": 1.4510667199203697e-05, + "loss": 0.6087, + "step": 3020 + }, + { + "epoch": 0.3706748466257669, + "grad_norm": 0.8517111659198661, + "learning_rate": 1.4507119922458816e-05, + "loss": 0.6179, + "step": 3021 + }, + { + "epoch": 0.37079754601226994, + "grad_norm": 0.9096191448849075, + "learning_rate": 1.4503571933853096e-05, + "loss": 0.5403, + "step": 3022 + }, + { + "epoch": 0.370920245398773, + "grad_norm": 0.8654840842183334, + "learning_rate": 1.4500023233946907e-05, + "loss": 0.5708, + "step": 3023 + }, + { + "epoch": 0.37104294478527605, + "grad_norm": 0.975532337513345, + "learning_rate": 1.4496473823300736e-05, + "loss": 0.6009, + "step": 3024 + }, + { + "epoch": 0.37116564417177916, + "grad_norm": 0.9775240661140716, + "learning_rate": 1.4492923702475183e-05, + "loss": 0.6078, + "step": 3025 + }, + { + "epoch": 0.3712883435582822, + "grad_norm": 0.9562271033943345, + "learning_rate": 1.4489372872030955e-05, + "loss": 0.5849, + "step": 3026 + }, + { + "epoch": 0.3714110429447853, + "grad_norm": 1.026565826663137, + "learning_rate": 1.4485821332528879e-05, + "loss": 0.6204, + "step": 3027 + }, + { + "epoch": 0.37153374233128833, + "grad_norm": 0.9819770016292679, + "learning_rate": 1.4482269084529889e-05, + "loss": 0.6577, + "step": 3028 + }, + { + "epoch": 0.3716564417177914, + "grad_norm": 0.9524076654896295, + "learning_rate": 1.4478716128595032e-05, + "loss": 0.6435, + "step": 3029 + }, + { + "epoch": 0.3717791411042945, + "grad_norm": 1.1300183084497177, + "learning_rate": 1.4475162465285463e-05, + "loss": 0.5711, + "step": 3030 + }, + { + "epoch": 0.37190184049079755, + "grad_norm": 0.9289700338980283, + "learning_rate": 1.4471608095162462e-05, + "loss": 0.5987, + "step": 3031 + }, + { + "epoch": 0.3720245398773006, + "grad_norm": 0.8958141560852718, + "learning_rate": 1.4468053018787402e-05, + "loss": 0.5767, + "step": 3032 + }, + { + "epoch": 0.37214723926380366, + "grad_norm": 0.991212665919733, + "learning_rate": 1.4464497236721779e-05, + "loss": 0.6018, + "step": 3033 + }, + { + "epoch": 0.3722699386503068, + "grad_norm": 0.8709140148885695, + "learning_rate": 1.4460940749527202e-05, + "loss": 0.5929, + "step": 3034 + }, + { + "epoch": 0.37239263803680983, + "grad_norm": 0.8826584070562774, + "learning_rate": 1.4457383557765385e-05, + "loss": 0.6506, + "step": 3035 + }, + { + "epoch": 0.3725153374233129, + "grad_norm": 1.071727002993312, + "learning_rate": 1.4453825661998155e-05, + "loss": 0.603, + "step": 3036 + }, + { + "epoch": 0.37263803680981594, + "grad_norm": 0.9076654630823409, + "learning_rate": 1.4450267062787451e-05, + "loss": 0.631, + "step": 3037 + }, 
+ { + "epoch": 0.372760736196319, + "grad_norm": 1.0415717217018874, + "learning_rate": 1.4446707760695324e-05, + "loss": 0.6122, + "step": 3038 + }, + { + "epoch": 0.3728834355828221, + "grad_norm": 0.922226023086247, + "learning_rate": 1.4443147756283937e-05, + "loss": 0.5807, + "step": 3039 + }, + { + "epoch": 0.37300613496932516, + "grad_norm": 0.8856817581310974, + "learning_rate": 1.443958705011556e-05, + "loss": 0.6291, + "step": 3040 + }, + { + "epoch": 0.3731288343558282, + "grad_norm": 1.035273541267309, + "learning_rate": 1.4436025642752575e-05, + "loss": 0.5898, + "step": 3041 + }, + { + "epoch": 0.3732515337423313, + "grad_norm": 0.9320868295413596, + "learning_rate": 1.4432463534757477e-05, + "loss": 0.6356, + "step": 3042 + }, + { + "epoch": 0.37337423312883433, + "grad_norm": 0.8416875858045287, + "learning_rate": 1.442890072669287e-05, + "loss": 0.5993, + "step": 3043 + }, + { + "epoch": 0.37349693251533744, + "grad_norm": 0.90355537445922, + "learning_rate": 1.442533721912147e-05, + "loss": 0.5856, + "step": 3044 + }, + { + "epoch": 0.3736196319018405, + "grad_norm": 0.8974758736560028, + "learning_rate": 1.4421773012606104e-05, + "loss": 0.6408, + "step": 3045 + }, + { + "epoch": 0.37374233128834355, + "grad_norm": 0.9696144143512615, + "learning_rate": 1.4418208107709701e-05, + "loss": 0.5784, + "step": 3046 + }, + { + "epoch": 0.3738650306748466, + "grad_norm": 0.9642114555273298, + "learning_rate": 1.4414642504995315e-05, + "loss": 0.6345, + "step": 3047 + }, + { + "epoch": 0.37398773006134967, + "grad_norm": 0.9024593893083727, + "learning_rate": 1.4411076205026096e-05, + "loss": 0.5991, + "step": 3048 + }, + { + "epoch": 0.3741104294478528, + "grad_norm": 0.9003851770818934, + "learning_rate": 1.4407509208365313e-05, + "loss": 0.6012, + "step": 3049 + }, + { + "epoch": 0.37423312883435583, + "grad_norm": 1.1839787009092033, + "learning_rate": 1.4403941515576344e-05, + "loss": 0.6037, + "step": 3050 + }, + { + "epoch": 0.3743558282208589, + "grad_norm": 0.9372400409376912, + "learning_rate": 1.4400373127222673e-05, + "loss": 0.6044, + "step": 3051 + }, + { + "epoch": 0.37447852760736194, + "grad_norm": 1.1538375886849426, + "learning_rate": 1.4396804043867897e-05, + "loss": 0.5999, + "step": 3052 + }, + { + "epoch": 0.37460122699386506, + "grad_norm": 0.9663455414721047, + "learning_rate": 1.4393234266075725e-05, + "loss": 0.6124, + "step": 3053 + }, + { + "epoch": 0.3747239263803681, + "grad_norm": 0.909258186702368, + "learning_rate": 1.4389663794409968e-05, + "loss": 0.6242, + "step": 3054 + }, + { + "epoch": 0.37484662576687117, + "grad_norm": 0.9723437242056371, + "learning_rate": 1.4386092629434551e-05, + "loss": 0.5717, + "step": 3055 + }, + { + "epoch": 0.3749693251533742, + "grad_norm": 0.8840188234379496, + "learning_rate": 1.4382520771713517e-05, + "loss": 0.5935, + "step": 3056 + }, + { + "epoch": 0.3750920245398773, + "grad_norm": 0.9301713066974383, + "learning_rate": 1.4378948221811e-05, + "loss": 0.5998, + "step": 3057 + }, + { + "epoch": 0.3752147239263804, + "grad_norm": 1.0353038482812449, + "learning_rate": 1.4375374980291259e-05, + "loss": 0.6073, + "step": 3058 + }, + { + "epoch": 0.37533742331288344, + "grad_norm": 0.9994885881054579, + "learning_rate": 1.4371801047718654e-05, + "loss": 0.6232, + "step": 3059 + }, + { + "epoch": 0.3754601226993865, + "grad_norm": 0.8427952606197436, + "learning_rate": 1.4368226424657661e-05, + "loss": 0.5374, + "step": 3060 + }, + { + "epoch": 0.37558282208588956, + "grad_norm": 1.0070581583267526, + 
"learning_rate": 1.4364651111672857e-05, + "loss": 0.6558, + "step": 3061 + }, + { + "epoch": 0.3757055214723926, + "grad_norm": 0.9383634865810841, + "learning_rate": 1.436107510932893e-05, + "loss": 0.5837, + "step": 3062 + }, + { + "epoch": 0.3758282208588957, + "grad_norm": 1.0298031061950443, + "learning_rate": 1.4357498418190687e-05, + "loss": 0.5948, + "step": 3063 + }, + { + "epoch": 0.3759509202453988, + "grad_norm": 0.9714864469210767, + "learning_rate": 1.4353921038823029e-05, + "loss": 0.5836, + "step": 3064 + }, + { + "epoch": 0.37607361963190183, + "grad_norm": 0.998666933986552, + "learning_rate": 1.4350342971790979e-05, + "loss": 0.6001, + "step": 3065 + }, + { + "epoch": 0.3761963190184049, + "grad_norm": 1.6568965354907754, + "learning_rate": 1.4346764217659652e-05, + "loss": 0.5996, + "step": 3066 + }, + { + "epoch": 0.376319018404908, + "grad_norm": 0.8374231592144281, + "learning_rate": 1.4343184776994288e-05, + "loss": 0.5525, + "step": 3067 + }, + { + "epoch": 0.37644171779141106, + "grad_norm": 0.9317054908129788, + "learning_rate": 1.4339604650360227e-05, + "loss": 0.6087, + "step": 3068 + }, + { + "epoch": 0.3765644171779141, + "grad_norm": 0.9098763250339911, + "learning_rate": 1.4336023838322919e-05, + "loss": 0.5612, + "step": 3069 + }, + { + "epoch": 0.37668711656441717, + "grad_norm": 1.2757451345056159, + "learning_rate": 1.4332442341447926e-05, + "loss": 0.6494, + "step": 3070 + }, + { + "epoch": 0.3768098159509202, + "grad_norm": 1.0033277873130777, + "learning_rate": 1.4328860160300909e-05, + "loss": 0.6246, + "step": 3071 + }, + { + "epoch": 0.37693251533742334, + "grad_norm": 0.8611351544790965, + "learning_rate": 1.4325277295447645e-05, + "loss": 0.6189, + "step": 3072 + }, + { + "epoch": 0.3770552147239264, + "grad_norm": 1.5650545985055897, + "learning_rate": 1.4321693747454019e-05, + "loss": 0.6314, + "step": 3073 + }, + { + "epoch": 0.37717791411042945, + "grad_norm": 1.1147029838989602, + "learning_rate": 1.431810951688602e-05, + "loss": 0.6217, + "step": 3074 + }, + { + "epoch": 0.3773006134969325, + "grad_norm": 1.0271845199883072, + "learning_rate": 1.4314524604309748e-05, + "loss": 0.6415, + "step": 3075 + }, + { + "epoch": 0.37742331288343556, + "grad_norm": 1.0380211040427256, + "learning_rate": 1.4310939010291405e-05, + "loss": 0.583, + "step": 3076 + }, + { + "epoch": 0.37754601226993867, + "grad_norm": 1.0129193658240567, + "learning_rate": 1.4307352735397307e-05, + "loss": 0.6115, + "step": 3077 + }, + { + "epoch": 0.3776687116564417, + "grad_norm": 0.8663938764335097, + "learning_rate": 1.4303765780193875e-05, + "loss": 0.5507, + "step": 3078 + }, + { + "epoch": 0.3777914110429448, + "grad_norm": 0.8882633347657701, + "learning_rate": 1.4300178145247638e-05, + "loss": 0.555, + "step": 3079 + }, + { + "epoch": 0.37791411042944784, + "grad_norm": 1.0588609085752287, + "learning_rate": 1.4296589831125234e-05, + "loss": 0.6093, + "step": 3080 + }, + { + "epoch": 0.37803680981595095, + "grad_norm": 0.8572088614482251, + "learning_rate": 1.4293000838393405e-05, + "loss": 0.6601, + "step": 3081 + }, + { + "epoch": 0.378159509202454, + "grad_norm": 0.9737893005518601, + "learning_rate": 1.4289411167618998e-05, + "loss": 0.6479, + "step": 3082 + }, + { + "epoch": 0.37828220858895706, + "grad_norm": 1.0012140446500941, + "learning_rate": 1.4285820819368976e-05, + "loss": 0.6262, + "step": 3083 + }, + { + "epoch": 0.3784049079754601, + "grad_norm": 0.926367198285528, + "learning_rate": 1.4282229794210404e-05, + "loss": 0.5964, + "step": 3084 + }, 
+ { + "epoch": 0.37852760736196317, + "grad_norm": 0.9289539889235696, + "learning_rate": 1.4278638092710446e-05, + "loss": 0.6237, + "step": 3085 + }, + { + "epoch": 0.3786503067484663, + "grad_norm": 0.906506447376457, + "learning_rate": 1.4275045715436387e-05, + "loss": 0.5847, + "step": 3086 + }, + { + "epoch": 0.37877300613496934, + "grad_norm": 0.8999871821354465, + "learning_rate": 1.4271452662955614e-05, + "loss": 0.5901, + "step": 3087 + }, + { + "epoch": 0.3788957055214724, + "grad_norm": 0.862860936604246, + "learning_rate": 1.4267858935835612e-05, + "loss": 0.646, + "step": 3088 + }, + { + "epoch": 0.37901840490797545, + "grad_norm": 0.9514762702659371, + "learning_rate": 1.4264264534643979e-05, + "loss": 0.6008, + "step": 3089 + }, + { + "epoch": 0.3791411042944785, + "grad_norm": 0.9856859564130733, + "learning_rate": 1.4260669459948429e-05, + "loss": 0.5865, + "step": 3090 + }, + { + "epoch": 0.3792638036809816, + "grad_norm": 0.9339249997003819, + "learning_rate": 1.4257073712316764e-05, + "loss": 0.6056, + "step": 3091 + }, + { + "epoch": 0.37938650306748467, + "grad_norm": 0.9055676732185496, + "learning_rate": 1.4253477292316907e-05, + "loss": 0.6327, + "step": 3092 + }, + { + "epoch": 0.3795092024539877, + "grad_norm": 1.009867609670732, + "learning_rate": 1.4249880200516879e-05, + "loss": 0.5623, + "step": 3093 + }, + { + "epoch": 0.3796319018404908, + "grad_norm": 0.9041892987076856, + "learning_rate": 1.4246282437484808e-05, + "loss": 0.6638, + "step": 3094 + }, + { + "epoch": 0.37975460122699384, + "grad_norm": 0.8729962112410354, + "learning_rate": 1.4242684003788934e-05, + "loss": 0.6094, + "step": 3095 + }, + { + "epoch": 0.37987730061349695, + "grad_norm": 1.0205069117152206, + "learning_rate": 1.4239084899997599e-05, + "loss": 0.6652, + "step": 3096 + }, + { + "epoch": 0.38, + "grad_norm": 0.9002080906506071, + "learning_rate": 1.4235485126679244e-05, + "loss": 0.6823, + "step": 3097 + }, + { + "epoch": 0.38012269938650306, + "grad_norm": 0.9853808888803891, + "learning_rate": 1.4231884684402428e-05, + "loss": 0.6178, + "step": 3098 + }, + { + "epoch": 0.3802453987730061, + "grad_norm": 1.0556572887320643, + "learning_rate": 1.4228283573735808e-05, + "loss": 0.6106, + "step": 3099 + }, + { + "epoch": 0.3803680981595092, + "grad_norm": 0.8967748978318509, + "learning_rate": 1.4224681795248149e-05, + "loss": 0.5785, + "step": 3100 + }, + { + "epoch": 0.3804907975460123, + "grad_norm": 1.2506998919085788, + "learning_rate": 1.422107934950832e-05, + "loss": 0.5576, + "step": 3101 + }, + { + "epoch": 0.38061349693251534, + "grad_norm": 1.1360127117729786, + "learning_rate": 1.42174762370853e-05, + "loss": 0.6188, + "step": 3102 + }, + { + "epoch": 0.3807361963190184, + "grad_norm": 0.8243863592178167, + "learning_rate": 1.4213872458548162e-05, + "loss": 0.544, + "step": 3103 + }, + { + "epoch": 0.38085889570552145, + "grad_norm": 0.9583184347739044, + "learning_rate": 1.4210268014466097e-05, + "loss": 0.5774, + "step": 3104 + }, + { + "epoch": 0.38098159509202456, + "grad_norm": 0.8680608488059408, + "learning_rate": 1.42066629054084e-05, + "loss": 0.6311, + "step": 3105 + }, + { + "epoch": 0.3811042944785276, + "grad_norm": 0.9063329709034952, + "learning_rate": 1.4203057131944457e-05, + "loss": 0.6088, + "step": 3106 + }, + { + "epoch": 0.38122699386503067, + "grad_norm": 0.9413261307934838, + "learning_rate": 1.4199450694643777e-05, + "loss": 0.6149, + "step": 3107 + }, + { + "epoch": 0.38134969325153373, + "grad_norm": 1.0510626388088344, + "learning_rate": 
1.4195843594075965e-05, + "loss": 0.6435, + "step": 3108 + }, + { + "epoch": 0.3814723926380368, + "grad_norm": 0.8657371656147433, + "learning_rate": 1.4192235830810725e-05, + "loss": 0.6095, + "step": 3109 + }, + { + "epoch": 0.3815950920245399, + "grad_norm": 0.9247568775781015, + "learning_rate": 1.418862740541788e-05, + "loss": 0.6088, + "step": 3110 + }, + { + "epoch": 0.38171779141104295, + "grad_norm": 0.9520251809301307, + "learning_rate": 1.4185018318467347e-05, + "loss": 0.6519, + "step": 3111 + }, + { + "epoch": 0.381840490797546, + "grad_norm": 0.878292422286099, + "learning_rate": 1.4181408570529146e-05, + "loss": 0.5702, + "step": 3112 + }, + { + "epoch": 0.38196319018404906, + "grad_norm": 0.9841243173766097, + "learning_rate": 1.4177798162173412e-05, + "loss": 0.5839, + "step": 3113 + }, + { + "epoch": 0.3820858895705522, + "grad_norm": 0.8598492198485105, + "learning_rate": 1.4174187093970377e-05, + "loss": 0.5939, + "step": 3114 + }, + { + "epoch": 0.38220858895705523, + "grad_norm": 0.906807924182022, + "learning_rate": 1.4170575366490376e-05, + "loss": 0.5344, + "step": 3115 + }, + { + "epoch": 0.3823312883435583, + "grad_norm": 0.9058379664105998, + "learning_rate": 1.4166962980303849e-05, + "loss": 0.5611, + "step": 3116 + }, + { + "epoch": 0.38245398773006134, + "grad_norm": 0.9314089376464578, + "learning_rate": 1.4163349935981345e-05, + "loss": 0.6278, + "step": 3117 + }, + { + "epoch": 0.3825766871165644, + "grad_norm": 0.9175774543997426, + "learning_rate": 1.415973623409351e-05, + "loss": 0.6087, + "step": 3118 + }, + { + "epoch": 0.3826993865030675, + "grad_norm": 0.9687709007558923, + "learning_rate": 1.4156121875211101e-05, + "loss": 0.5985, + "step": 3119 + }, + { + "epoch": 0.38282208588957056, + "grad_norm": 0.9480316511639293, + "learning_rate": 1.415250685990497e-05, + "loss": 0.5987, + "step": 3120 + }, + { + "epoch": 0.3829447852760736, + "grad_norm": 1.0727537958942053, + "learning_rate": 1.4148891188746081e-05, + "loss": 0.6006, + "step": 3121 + }, + { + "epoch": 0.3830674846625767, + "grad_norm": 0.8777358899010018, + "learning_rate": 1.4145274862305497e-05, + "loss": 0.5738, + "step": 3122 + }, + { + "epoch": 0.38319018404907973, + "grad_norm": 1.0053248402285178, + "learning_rate": 1.414165788115439e-05, + "loss": 0.6391, + "step": 3123 + }, + { + "epoch": 0.38331288343558284, + "grad_norm": 1.0085453099857717, + "learning_rate": 1.4138040245864022e-05, + "loss": 0.6236, + "step": 3124 + }, + { + "epoch": 0.3834355828220859, + "grad_norm": 0.9149602720804147, + "learning_rate": 1.4134421957005775e-05, + "loss": 0.6266, + "step": 3125 + }, + { + "epoch": 0.38355828220858895, + "grad_norm": 1.010683041639983, + "learning_rate": 1.4130803015151126e-05, + "loss": 0.65, + "step": 3126 + }, + { + "epoch": 0.383680981595092, + "grad_norm": 1.037330495529105, + "learning_rate": 1.4127183420871653e-05, + "loss": 0.6009, + "step": 3127 + }, + { + "epoch": 0.3838036809815951, + "grad_norm": 1.0124303230089589, + "learning_rate": 1.4123563174739036e-05, + "loss": 0.5985, + "step": 3128 + }, + { + "epoch": 0.3839263803680982, + "grad_norm": 0.9850465381350494, + "learning_rate": 1.4119942277325072e-05, + "loss": 0.6336, + "step": 3129 + }, + { + "epoch": 0.38404907975460123, + "grad_norm": 0.8839523287200252, + "learning_rate": 1.4116320729201642e-05, + "loss": 0.6214, + "step": 3130 + }, + { + "epoch": 0.3841717791411043, + "grad_norm": 0.9726976176835187, + "learning_rate": 1.411269853094074e-05, + "loss": 0.6216, + "step": 3131 + }, + { + "epoch": 
0.38429447852760734, + "grad_norm": 0.8919278258481754, + "learning_rate": 1.4109075683114464e-05, + "loss": 0.5652, + "step": 3132 + }, + { + "epoch": 0.38441717791411045, + "grad_norm": 1.2839490013264494, + "learning_rate": 1.4105452186295013e-05, + "loss": 0.5808, + "step": 3133 + }, + { + "epoch": 0.3845398773006135, + "grad_norm": 0.9733185829767975, + "learning_rate": 1.4101828041054682e-05, + "loss": 0.6219, + "step": 3134 + }, + { + "epoch": 0.38466257668711656, + "grad_norm": 1.0370603959276175, + "learning_rate": 1.4098203247965876e-05, + "loss": 0.579, + "step": 3135 + }, + { + "epoch": 0.3847852760736196, + "grad_norm": 1.184864275529741, + "learning_rate": 1.4094577807601101e-05, + "loss": 0.607, + "step": 3136 + }, + { + "epoch": 0.3849079754601227, + "grad_norm": 1.0959387420850375, + "learning_rate": 1.4090951720532965e-05, + "loss": 0.6107, + "step": 3137 + }, + { + "epoch": 0.3850306748466258, + "grad_norm": 0.9269667998404957, + "learning_rate": 1.408732498733417e-05, + "loss": 0.6405, + "step": 3138 + }, + { + "epoch": 0.38515337423312884, + "grad_norm": 0.9996611235799133, + "learning_rate": 1.4083697608577535e-05, + "loss": 0.5853, + "step": 3139 + }, + { + "epoch": 0.3852760736196319, + "grad_norm": 1.0224051401050858, + "learning_rate": 1.4080069584835971e-05, + "loss": 0.6093, + "step": 3140 + }, + { + "epoch": 0.38539877300613495, + "grad_norm": 1.0247774417775282, + "learning_rate": 1.4076440916682492e-05, + "loss": 0.6397, + "step": 3141 + }, + { + "epoch": 0.385521472392638, + "grad_norm": 1.0211551983677225, + "learning_rate": 1.4072811604690218e-05, + "loss": 0.5884, + "step": 3142 + }, + { + "epoch": 0.3856441717791411, + "grad_norm": 1.0185031456098537, + "learning_rate": 1.4069181649432363e-05, + "loss": 0.5798, + "step": 3143 + }, + { + "epoch": 0.3857668711656442, + "grad_norm": 0.9679833621816796, + "learning_rate": 1.406555105148225e-05, + "loss": 0.6038, + "step": 3144 + }, + { + "epoch": 0.38588957055214723, + "grad_norm": 0.8410200421161498, + "learning_rate": 1.4061919811413305e-05, + "loss": 0.5558, + "step": 3145 + }, + { + "epoch": 0.3860122699386503, + "grad_norm": 1.0409853754975735, + "learning_rate": 1.4058287929799042e-05, + "loss": 0.5609, + "step": 3146 + }, + { + "epoch": 0.3861349693251534, + "grad_norm": 0.8782240516523989, + "learning_rate": 1.4054655407213091e-05, + "loss": 0.5901, + "step": 3147 + }, + { + "epoch": 0.38625766871165645, + "grad_norm": 0.8241576397675157, + "learning_rate": 1.405102224422918e-05, + "loss": 0.5912, + "step": 3148 + }, + { + "epoch": 0.3863803680981595, + "grad_norm": 0.8631523414440742, + "learning_rate": 1.404738844142113e-05, + "loss": 0.632, + "step": 3149 + }, + { + "epoch": 0.38650306748466257, + "grad_norm": 1.0579514187282302, + "learning_rate": 1.4043753999362872e-05, + "loss": 0.6219, + "step": 3150 + }, + { + "epoch": 0.3866257668711656, + "grad_norm": 1.0251728683452792, + "learning_rate": 1.4040118918628433e-05, + "loss": 0.6157, + "step": 3151 + }, + { + "epoch": 0.38674846625766873, + "grad_norm": 0.9012948733500157, + "learning_rate": 1.4036483199791949e-05, + "loss": 0.61, + "step": 3152 + }, + { + "epoch": 0.3868711656441718, + "grad_norm": 0.9503718375370808, + "learning_rate": 1.4032846843427641e-05, + "loss": 0.5763, + "step": 3153 + }, + { + "epoch": 0.38699386503067484, + "grad_norm": 0.9625960251643233, + "learning_rate": 1.4029209850109848e-05, + "loss": 0.6281, + "step": 3154 + }, + { + "epoch": 0.3871165644171779, + "grad_norm": 0.9201160902412165, + "learning_rate": 
1.4025572220412998e-05, + "loss": 0.635, + "step": 3155 + }, + { + "epoch": 0.38723926380368096, + "grad_norm": 0.9836674373129481, + "learning_rate": 1.4021933954911625e-05, + "loss": 0.5965, + "step": 3156 + }, + { + "epoch": 0.38736196319018407, + "grad_norm": 0.8630403024287154, + "learning_rate": 1.4018295054180361e-05, + "loss": 0.5908, + "step": 3157 + }, + { + "epoch": 0.3874846625766871, + "grad_norm": 0.8265341401985004, + "learning_rate": 1.4014655518793936e-05, + "loss": 0.6125, + "step": 3158 + }, + { + "epoch": 0.3876073619631902, + "grad_norm": 0.8656431562398589, + "learning_rate": 1.4011015349327188e-05, + "loss": 0.6351, + "step": 3159 + }, + { + "epoch": 0.38773006134969323, + "grad_norm": 0.9195544396656515, + "learning_rate": 1.400737454635505e-05, + "loss": 0.5855, + "step": 3160 + }, + { + "epoch": 0.38785276073619634, + "grad_norm": 1.566711894238346, + "learning_rate": 1.4003733110452556e-05, + "loss": 0.5545, + "step": 3161 + }, + { + "epoch": 0.3879754601226994, + "grad_norm": 0.908382074191266, + "learning_rate": 1.4000091042194835e-05, + "loss": 0.626, + "step": 3162 + }, + { + "epoch": 0.38809815950920246, + "grad_norm": 0.9812945405970339, + "learning_rate": 1.3996448342157123e-05, + "loss": 0.5909, + "step": 3163 + }, + { + "epoch": 0.3882208588957055, + "grad_norm": 0.9232919056969543, + "learning_rate": 1.3992805010914754e-05, + "loss": 0.5902, + "step": 3164 + }, + { + "epoch": 0.38834355828220857, + "grad_norm": 1.1242441342703315, + "learning_rate": 1.398916104904316e-05, + "loss": 0.6151, + "step": 3165 + }, + { + "epoch": 0.3884662576687117, + "grad_norm": 1.004333540228228, + "learning_rate": 1.3985516457117878e-05, + "loss": 0.6148, + "step": 3166 + }, + { + "epoch": 0.38858895705521473, + "grad_norm": 0.9603691235246389, + "learning_rate": 1.3981871235714532e-05, + "loss": 0.6122, + "step": 3167 + }, + { + "epoch": 0.3887116564417178, + "grad_norm": 0.9188681136396226, + "learning_rate": 1.3978225385408857e-05, + "loss": 0.6369, + "step": 3168 + }, + { + "epoch": 0.38883435582822085, + "grad_norm": 0.9105900724172656, + "learning_rate": 1.3974578906776683e-05, + "loss": 0.5999, + "step": 3169 + }, + { + "epoch": 0.3889570552147239, + "grad_norm": 0.8681463559565223, + "learning_rate": 1.3970931800393943e-05, + "loss": 0.5486, + "step": 3170 + }, + { + "epoch": 0.389079754601227, + "grad_norm": 0.9901117186513686, + "learning_rate": 1.3967284066836662e-05, + "loss": 0.5989, + "step": 3171 + }, + { + "epoch": 0.38920245398773007, + "grad_norm": 1.101545991027699, + "learning_rate": 1.3963635706680969e-05, + "loss": 0.5854, + "step": 3172 + }, + { + "epoch": 0.3893251533742331, + "grad_norm": 1.0433708243002984, + "learning_rate": 1.3959986720503093e-05, + "loss": 0.5812, + "step": 3173 + }, + { + "epoch": 0.3894478527607362, + "grad_norm": 0.9619862045117554, + "learning_rate": 1.3956337108879362e-05, + "loss": 0.6139, + "step": 3174 + }, + { + "epoch": 0.3895705521472393, + "grad_norm": 0.9754034797787783, + "learning_rate": 1.3952686872386195e-05, + "loss": 0.6171, + "step": 3175 + }, + { + "epoch": 0.38969325153374235, + "grad_norm": 0.9397422950794064, + "learning_rate": 1.394903601160012e-05, + "loss": 0.6104, + "step": 3176 + }, + { + "epoch": 0.3898159509202454, + "grad_norm": 1.0662768287080098, + "learning_rate": 1.3945384527097756e-05, + "loss": 0.6077, + "step": 3177 + }, + { + "epoch": 0.38993865030674846, + "grad_norm": 1.1397207971136423, + "learning_rate": 1.3941732419455826e-05, + "loss": 0.5563, + "step": 3178 + }, + { + "epoch": 
0.3900613496932515, + "grad_norm": 0.933061150290282, + "learning_rate": 1.3938079689251147e-05, + "loss": 0.6179, + "step": 3179 + }, + { + "epoch": 0.3901840490797546, + "grad_norm": 1.0076148067310278, + "learning_rate": 1.3934426337060638e-05, + "loss": 0.5664, + "step": 3180 + }, + { + "epoch": 0.3903067484662577, + "grad_norm": 0.9184524033726456, + "learning_rate": 1.3930772363461313e-05, + "loss": 0.6155, + "step": 3181 + }, + { + "epoch": 0.39042944785276074, + "grad_norm": 1.0046565229132025, + "learning_rate": 1.3927117769030287e-05, + "loss": 0.5855, + "step": 3182 + }, + { + "epoch": 0.3905521472392638, + "grad_norm": 0.9849530006935155, + "learning_rate": 1.3923462554344774e-05, + "loss": 0.5713, + "step": 3183 + }, + { + "epoch": 0.39067484662576685, + "grad_norm": 0.9535148423802783, + "learning_rate": 1.3919806719982076e-05, + "loss": 0.6343, + "step": 3184 + }, + { + "epoch": 0.39079754601226996, + "grad_norm": 1.1032260823365059, + "learning_rate": 1.391615026651961e-05, + "loss": 0.539, + "step": 3185 + }, + { + "epoch": 0.390920245398773, + "grad_norm": 0.9854463723482949, + "learning_rate": 1.3912493194534876e-05, + "loss": 0.541, + "step": 3186 + }, + { + "epoch": 0.39104294478527607, + "grad_norm": 0.9803574306347479, + "learning_rate": 1.3908835504605478e-05, + "loss": 0.6066, + "step": 3187 + }, + { + "epoch": 0.3911656441717791, + "grad_norm": 0.8771707353621303, + "learning_rate": 1.3905177197309116e-05, + "loss": 0.5772, + "step": 3188 + }, + { + "epoch": 0.3912883435582822, + "grad_norm": 1.0984791124665776, + "learning_rate": 1.390151827322359e-05, + "loss": 0.5742, + "step": 3189 + }, + { + "epoch": 0.3914110429447853, + "grad_norm": 0.8408929983803347, + "learning_rate": 1.3897858732926794e-05, + "loss": 0.5693, + "step": 3190 + }, + { + "epoch": 0.39153374233128835, + "grad_norm": 0.8879882851479364, + "learning_rate": 1.3894198576996722e-05, + "loss": 0.6222, + "step": 3191 + }, + { + "epoch": 0.3916564417177914, + "grad_norm": 1.0301108778156354, + "learning_rate": 1.3890537806011467e-05, + "loss": 0.6061, + "step": 3192 + }, + { + "epoch": 0.39177914110429446, + "grad_norm": 1.093969803244121, + "learning_rate": 1.3886876420549209e-05, + "loss": 0.6183, + "step": 3193 + }, + { + "epoch": 0.39190184049079757, + "grad_norm": 0.8935557527892626, + "learning_rate": 1.3883214421188239e-05, + "loss": 0.5681, + "step": 3194 + }, + { + "epoch": 0.3920245398773006, + "grad_norm": 1.072203589680446, + "learning_rate": 1.3879551808506932e-05, + "loss": 0.5486, + "step": 3195 + }, + { + "epoch": 0.3921472392638037, + "grad_norm": 0.920678450545745, + "learning_rate": 1.3875888583083772e-05, + "loss": 0.5759, + "step": 3196 + }, + { + "epoch": 0.39226993865030674, + "grad_norm": 0.9254522846956921, + "learning_rate": 1.3872224745497334e-05, + "loss": 0.5728, + "step": 3197 + }, + { + "epoch": 0.3923926380368098, + "grad_norm": 1.0459166400981972, + "learning_rate": 1.3868560296326286e-05, + "loss": 0.613, + "step": 3198 + }, + { + "epoch": 0.3925153374233129, + "grad_norm": 1.118345072005281, + "learning_rate": 1.3864895236149399e-05, + "loss": 0.5725, + "step": 3199 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 0.9290008975159388, + "learning_rate": 1.3861229565545532e-05, + "loss": 0.5786, + "step": 3200 + }, + { + "epoch": 0.392760736196319, + "grad_norm": 1.0555516441945203, + "learning_rate": 1.3857563285093653e-05, + "loss": 0.6576, + "step": 3201 + }, + { + "epoch": 0.39288343558282207, + "grad_norm": 1.0871789554152935, + "learning_rate": 
1.3853896395372818e-05, + "loss": 0.572, + "step": 3202 + }, + { + "epoch": 0.3930061349693251, + "grad_norm": 1.0812712235611965, + "learning_rate": 1.3850228896962178e-05, + "loss": 0.5998, + "step": 3203 + }, + { + "epoch": 0.39312883435582824, + "grad_norm": 0.8216365618078975, + "learning_rate": 1.3846560790440988e-05, + "loss": 0.6091, + "step": 3204 + }, + { + "epoch": 0.3932515337423313, + "grad_norm": 0.9445764035896962, + "learning_rate": 1.384289207638859e-05, + "loss": 0.5712, + "step": 3205 + }, + { + "epoch": 0.39337423312883435, + "grad_norm": 1.0217380606528634, + "learning_rate": 1.3839222755384428e-05, + "loss": 0.6486, + "step": 3206 + }, + { + "epoch": 0.3934969325153374, + "grad_norm": 0.9445676205381746, + "learning_rate": 1.3835552828008034e-05, + "loss": 0.598, + "step": 3207 + }, + { + "epoch": 0.3936196319018405, + "grad_norm": 0.8594560496991586, + "learning_rate": 1.3831882294839048e-05, + "loss": 0.5934, + "step": 3208 + }, + { + "epoch": 0.39374233128834357, + "grad_norm": 1.0850182697234454, + "learning_rate": 1.3828211156457196e-05, + "loss": 0.5833, + "step": 3209 + }, + { + "epoch": 0.39386503067484663, + "grad_norm": 0.8776195694383082, + "learning_rate": 1.3824539413442304e-05, + "loss": 0.6051, + "step": 3210 + }, + { + "epoch": 0.3939877300613497, + "grad_norm": 0.9264710858961777, + "learning_rate": 1.3820867066374291e-05, + "loss": 0.6257, + "step": 3211 + }, + { + "epoch": 0.39411042944785274, + "grad_norm": 1.0443802688797856, + "learning_rate": 1.3817194115833174e-05, + "loss": 0.6353, + "step": 3212 + }, + { + "epoch": 0.39423312883435585, + "grad_norm": 0.8715151178515667, + "learning_rate": 1.381352056239906e-05, + "loss": 0.5876, + "step": 3213 + }, + { + "epoch": 0.3943558282208589, + "grad_norm": 1.160692965586111, + "learning_rate": 1.3809846406652163e-05, + "loss": 0.6056, + "step": 3214 + }, + { + "epoch": 0.39447852760736196, + "grad_norm": 0.9435393821446939, + "learning_rate": 1.3806171649172782e-05, + "loss": 0.6145, + "step": 3215 + }, + { + "epoch": 0.394601226993865, + "grad_norm": 0.876662017276439, + "learning_rate": 1.3802496290541308e-05, + "loss": 0.5873, + "step": 3216 + }, + { + "epoch": 0.3947239263803681, + "grad_norm": 1.1265238776061948, + "learning_rate": 1.3798820331338234e-05, + "loss": 0.6125, + "step": 3217 + }, + { + "epoch": 0.3948466257668712, + "grad_norm": 0.9765224502831998, + "learning_rate": 1.3795143772144151e-05, + "loss": 0.6204, + "step": 3218 + }, + { + "epoch": 0.39496932515337424, + "grad_norm": 1.178689394017331, + "learning_rate": 1.3791466613539735e-05, + "loss": 0.6262, + "step": 3219 + }, + { + "epoch": 0.3950920245398773, + "grad_norm": 0.9821079976494098, + "learning_rate": 1.3787788856105762e-05, + "loss": 0.5695, + "step": 3220 + }, + { + "epoch": 0.39521472392638035, + "grad_norm": 1.010952768292068, + "learning_rate": 1.3784110500423104e-05, + "loss": 0.5878, + "step": 3221 + }, + { + "epoch": 0.39533742331288346, + "grad_norm": 0.905410533100459, + "learning_rate": 1.3780431547072725e-05, + "loss": 0.5692, + "step": 3222 + }, + { + "epoch": 0.3954601226993865, + "grad_norm": 0.9104098989881163, + "learning_rate": 1.3776751996635684e-05, + "loss": 0.5541, + "step": 3223 + }, + { + "epoch": 0.3955828220858896, + "grad_norm": 0.8834015390478616, + "learning_rate": 1.3773071849693137e-05, + "loss": 0.5647, + "step": 3224 + }, + { + "epoch": 0.39570552147239263, + "grad_norm": 0.9029608415286003, + "learning_rate": 1.3769391106826326e-05, + "loss": 0.6303, + "step": 3225 + }, + { + "epoch": 
0.3958282208588957, + "grad_norm": 1.0111930402643032, + "learning_rate": 1.3765709768616598e-05, + "loss": 0.6035, + "step": 3226 + }, + { + "epoch": 0.3959509202453988, + "grad_norm": 1.0266393197647912, + "learning_rate": 1.3762027835645384e-05, + "loss": 0.6203, + "step": 3227 + }, + { + "epoch": 0.39607361963190185, + "grad_norm": 0.9459941667943533, + "learning_rate": 1.3758345308494217e-05, + "loss": 0.5834, + "step": 3228 + }, + { + "epoch": 0.3961963190184049, + "grad_norm": 1.063968086508675, + "learning_rate": 1.375466218774472e-05, + "loss": 0.6857, + "step": 3229 + }, + { + "epoch": 0.39631901840490796, + "grad_norm": 0.9391609165439068, + "learning_rate": 1.3750978473978611e-05, + "loss": 0.6095, + "step": 3230 + }, + { + "epoch": 0.396441717791411, + "grad_norm": 0.9177658186414296, + "learning_rate": 1.3747294167777696e-05, + "loss": 0.5973, + "step": 3231 + }, + { + "epoch": 0.39656441717791413, + "grad_norm": 0.866885589226769, + "learning_rate": 1.3743609269723888e-05, + "loss": 0.6088, + "step": 3232 + }, + { + "epoch": 0.3966871165644172, + "grad_norm": 1.0277177924854803, + "learning_rate": 1.3739923780399176e-05, + "loss": 0.6144, + "step": 3233 + }, + { + "epoch": 0.39680981595092024, + "grad_norm": 0.9332648455989153, + "learning_rate": 1.3736237700385657e-05, + "loss": 0.5732, + "step": 3234 + }, + { + "epoch": 0.3969325153374233, + "grad_norm": 0.9401854117784831, + "learning_rate": 1.3732551030265514e-05, + "loss": 0.5758, + "step": 3235 + }, + { + "epoch": 0.39705521472392635, + "grad_norm": 0.9707605679009972, + "learning_rate": 1.3728863770621028e-05, + "loss": 0.5636, + "step": 3236 + }, + { + "epoch": 0.39717791411042946, + "grad_norm": 0.9658921345645878, + "learning_rate": 1.3725175922034566e-05, + "loss": 0.5896, + "step": 3237 + }, + { + "epoch": 0.3973006134969325, + "grad_norm": 0.8862508728569416, + "learning_rate": 1.3721487485088591e-05, + "loss": 0.5933, + "step": 3238 + }, + { + "epoch": 0.3974233128834356, + "grad_norm": 1.018558717886841, + "learning_rate": 1.3717798460365663e-05, + "loss": 0.6018, + "step": 3239 + }, + { + "epoch": 0.39754601226993863, + "grad_norm": 0.9077229916529668, + "learning_rate": 1.371410884844843e-05, + "loss": 0.5946, + "step": 3240 + }, + { + "epoch": 0.39766871165644174, + "grad_norm": 0.9082209325864163, + "learning_rate": 1.3710418649919635e-05, + "loss": 0.5995, + "step": 3241 + }, + { + "epoch": 0.3977914110429448, + "grad_norm": 0.9194289029976705, + "learning_rate": 1.3706727865362113e-05, + "loss": 0.632, + "step": 3242 + }, + { + "epoch": 0.39791411042944785, + "grad_norm": 0.8594196415204928, + "learning_rate": 1.3703036495358792e-05, + "loss": 0.6221, + "step": 3243 + }, + { + "epoch": 0.3980368098159509, + "grad_norm": 0.9822902717676724, + "learning_rate": 1.3699344540492694e-05, + "loss": 0.6051, + "step": 3244 + }, + { + "epoch": 0.39815950920245397, + "grad_norm": 0.8768869049944166, + "learning_rate": 1.3695652001346928e-05, + "loss": 0.6279, + "step": 3245 + }, + { + "epoch": 0.3982822085889571, + "grad_norm": 0.7932545798978994, + "learning_rate": 1.3691958878504703e-05, + "loss": 0.6057, + "step": 3246 + }, + { + "epoch": 0.39840490797546013, + "grad_norm": 0.9671912128140674, + "learning_rate": 1.3688265172549314e-05, + "loss": 0.6153, + "step": 3247 + }, + { + "epoch": 0.3985276073619632, + "grad_norm": 1.07134603392009, + "learning_rate": 1.3684570884064146e-05, + "loss": 0.6028, + "step": 3248 + }, + { + "epoch": 0.39865030674846624, + "grad_norm": 0.9735756151479151, + "learning_rate": 
1.3680876013632683e-05, + "loss": 0.6186, + "step": 3249 + }, + { + "epoch": 0.3987730061349693, + "grad_norm": 0.8476476898092998, + "learning_rate": 1.3677180561838501e-05, + "loss": 0.564, + "step": 3250 + }, + { + "epoch": 0.3988957055214724, + "grad_norm": 0.8616142052138256, + "learning_rate": 1.3673484529265262e-05, + "loss": 0.5292, + "step": 3251 + }, + { + "epoch": 0.39901840490797547, + "grad_norm": 0.8862351869489633, + "learning_rate": 1.3669787916496722e-05, + "loss": 0.6043, + "step": 3252 + }, + { + "epoch": 0.3991411042944785, + "grad_norm": 0.9073998457118768, + "learning_rate": 1.3666090724116732e-05, + "loss": 0.5754, + "step": 3253 + }, + { + "epoch": 0.3992638036809816, + "grad_norm": 1.0119863245464298, + "learning_rate": 1.366239295270923e-05, + "loss": 0.5897, + "step": 3254 + }, + { + "epoch": 0.3993865030674847, + "grad_norm": 0.8363711830500721, + "learning_rate": 1.3658694602858247e-05, + "loss": 0.6548, + "step": 3255 + }, + { + "epoch": 0.39950920245398774, + "grad_norm": 0.9564324778282365, + "learning_rate": 1.3654995675147904e-05, + "loss": 0.6017, + "step": 3256 + }, + { + "epoch": 0.3996319018404908, + "grad_norm": 0.8693756057673525, + "learning_rate": 1.3651296170162417e-05, + "loss": 0.598, + "step": 3257 + }, + { + "epoch": 0.39975460122699386, + "grad_norm": 0.9090092178512655, + "learning_rate": 1.3647596088486093e-05, + "loss": 0.594, + "step": 3258 + }, + { + "epoch": 0.3998773006134969, + "grad_norm": 1.1118069853969426, + "learning_rate": 1.3643895430703324e-05, + "loss": 0.6265, + "step": 3259 + }, + { + "epoch": 0.4, + "grad_norm": 1.040779085323522, + "learning_rate": 1.36401941973986e-05, + "loss": 0.5873, + "step": 3260 + }, + { + "epoch": 0.4001226993865031, + "grad_norm": 0.9107400476990144, + "learning_rate": 1.3636492389156498e-05, + "loss": 0.5992, + "step": 3261 + }, + { + "epoch": 0.40024539877300613, + "grad_norm": 1.145306772819547, + "learning_rate": 1.3632790006561685e-05, + "loss": 0.5583, + "step": 3262 + }, + { + "epoch": 0.4003680981595092, + "grad_norm": 0.8945088060757667, + "learning_rate": 1.3629087050198924e-05, + "loss": 0.6263, + "step": 3263 + }, + { + "epoch": 0.40049079754601224, + "grad_norm": 0.8912533575234998, + "learning_rate": 1.3625383520653062e-05, + "loss": 0.5971, + "step": 3264 + }, + { + "epoch": 0.40061349693251536, + "grad_norm": 0.9691619724104504, + "learning_rate": 1.362167941850904e-05, + "loss": 0.6146, + "step": 3265 + }, + { + "epoch": 0.4007361963190184, + "grad_norm": 0.8067304869279993, + "learning_rate": 1.3617974744351891e-05, + "loss": 0.6008, + "step": 3266 + }, + { + "epoch": 0.40085889570552147, + "grad_norm": 0.890809966422039, + "learning_rate": 1.3614269498766736e-05, + "loss": 0.6061, + "step": 3267 + }, + { + "epoch": 0.4009815950920245, + "grad_norm": 1.131764645508836, + "learning_rate": 1.3610563682338787e-05, + "loss": 0.6318, + "step": 3268 + }, + { + "epoch": 0.40110429447852763, + "grad_norm": 1.1524602354231557, + "learning_rate": 1.3606857295653341e-05, + "loss": 0.6175, + "step": 3269 + }, + { + "epoch": 0.4012269938650307, + "grad_norm": 0.8606297585457717, + "learning_rate": 1.3603150339295797e-05, + "loss": 0.645, + "step": 3270 + }, + { + "epoch": 0.40134969325153375, + "grad_norm": 0.9857083920822216, + "learning_rate": 1.3599442813851633e-05, + "loss": 0.6031, + "step": 3271 + }, + { + "epoch": 0.4014723926380368, + "grad_norm": 0.8778827590963257, + "learning_rate": 1.3595734719906421e-05, + "loss": 0.6383, + "step": 3272 + }, + { + "epoch": 0.40159509202453986, + 
"grad_norm": 0.8961641055704295, + "learning_rate": 1.3592026058045824e-05, + "loss": 0.6316, + "step": 3273 + }, + { + "epoch": 0.40171779141104297, + "grad_norm": 0.77964290924858, + "learning_rate": 1.3588316828855592e-05, + "loss": 0.6417, + "step": 3274 + }, + { + "epoch": 0.401840490797546, + "grad_norm": 0.9475893451691916, + "learning_rate": 1.3584607032921566e-05, + "loss": 0.605, + "step": 3275 + }, + { + "epoch": 0.4019631901840491, + "grad_norm": 0.9086864735299953, + "learning_rate": 1.3580896670829677e-05, + "loss": 0.5469, + "step": 3276 + }, + { + "epoch": 0.40208588957055214, + "grad_norm": 0.798675844317414, + "learning_rate": 1.3577185743165945e-05, + "loss": 0.6152, + "step": 3277 + }, + { + "epoch": 0.4022085889570552, + "grad_norm": 0.9864052880043942, + "learning_rate": 1.357347425051648e-05, + "loss": 0.5566, + "step": 3278 + }, + { + "epoch": 0.4023312883435583, + "grad_norm": 0.8623293348730018, + "learning_rate": 1.3569762193467475e-05, + "loss": 0.6312, + "step": 3279 + }, + { + "epoch": 0.40245398773006136, + "grad_norm": 0.875406263977855, + "learning_rate": 1.3566049572605222e-05, + "loss": 0.5709, + "step": 3280 + }, + { + "epoch": 0.4025766871165644, + "grad_norm": 0.9169526129025055, + "learning_rate": 1.3562336388516097e-05, + "loss": 0.5982, + "step": 3281 + }, + { + "epoch": 0.40269938650306747, + "grad_norm": 0.9860524534834928, + "learning_rate": 1.3558622641786565e-05, + "loss": 0.5665, + "step": 3282 + }, + { + "epoch": 0.4028220858895705, + "grad_norm": 0.9330357207862284, + "learning_rate": 1.355490833300318e-05, + "loss": 0.5923, + "step": 3283 + }, + { + "epoch": 0.40294478527607364, + "grad_norm": 1.002687007472518, + "learning_rate": 1.3551193462752587e-05, + "loss": 0.6127, + "step": 3284 + }, + { + "epoch": 0.4030674846625767, + "grad_norm": 0.8447284160688772, + "learning_rate": 1.3547478031621517e-05, + "loss": 0.6101, + "step": 3285 + }, + { + "epoch": 0.40319018404907975, + "grad_norm": 1.1727887809184099, + "learning_rate": 1.3543762040196792e-05, + "loss": 0.5843, + "step": 3286 + }, + { + "epoch": 0.4033128834355828, + "grad_norm": 0.9562092797584354, + "learning_rate": 1.3540045489065318e-05, + "loss": 0.6523, + "step": 3287 + }, + { + "epoch": 0.4034355828220859, + "grad_norm": 0.8827901997480813, + "learning_rate": 1.3536328378814094e-05, + "loss": 0.617, + "step": 3288 + }, + { + "epoch": 0.40355828220858897, + "grad_norm": 0.8638307153772565, + "learning_rate": 1.3532610710030203e-05, + "loss": 0.594, + "step": 3289 + }, + { + "epoch": 0.403680981595092, + "grad_norm": 0.8420517156686665, + "learning_rate": 1.3528892483300821e-05, + "loss": 0.5837, + "step": 3290 + }, + { + "epoch": 0.4038036809815951, + "grad_norm": 0.8881212313881202, + "learning_rate": 1.352517369921321e-05, + "loss": 0.6423, + "step": 3291 + }, + { + "epoch": 0.40392638036809814, + "grad_norm": 0.9353158318500677, + "learning_rate": 1.352145435835472e-05, + "loss": 0.6018, + "step": 3292 + }, + { + "epoch": 0.40404907975460125, + "grad_norm": 0.9112379118533352, + "learning_rate": 1.3517734461312789e-05, + "loss": 0.6232, + "step": 3293 + }, + { + "epoch": 0.4041717791411043, + "grad_norm": 0.8603195229509158, + "learning_rate": 1.351401400867494e-05, + "loss": 0.6121, + "step": 3294 + }, + { + "epoch": 0.40429447852760736, + "grad_norm": 1.0613875937619928, + "learning_rate": 1.3510293001028792e-05, + "loss": 0.5782, + "step": 3295 + }, + { + "epoch": 0.4044171779141104, + "grad_norm": 0.9107183299290164, + "learning_rate": 1.350657143896204e-05, + "loss": 
0.5811, + "step": 3296 + }, + { + "epoch": 0.40453987730061347, + "grad_norm": 0.9296038970684879, + "learning_rate": 1.3502849323062477e-05, + "loss": 0.5962, + "step": 3297 + }, + { + "epoch": 0.4046625766871166, + "grad_norm": 1.02550750977795, + "learning_rate": 1.3499126653917977e-05, + "loss": 0.6042, + "step": 3298 + }, + { + "epoch": 0.40478527607361964, + "grad_norm": 0.9631102787755097, + "learning_rate": 1.3495403432116507e-05, + "loss": 0.5762, + "step": 3299 + }, + { + "epoch": 0.4049079754601227, + "grad_norm": 0.9457089324981315, + "learning_rate": 1.3491679658246114e-05, + "loss": 0.5664, + "step": 3300 + }, + { + "epoch": 0.40503067484662575, + "grad_norm": 0.8888201698135534, + "learning_rate": 1.3487955332894938e-05, + "loss": 0.6296, + "step": 3301 + }, + { + "epoch": 0.40515337423312886, + "grad_norm": 0.838998716484552, + "learning_rate": 1.3484230456651202e-05, + "loss": 0.6535, + "step": 3302 + }, + { + "epoch": 0.4052760736196319, + "grad_norm": 1.0138089957828627, + "learning_rate": 1.3480505030103222e-05, + "loss": 0.5902, + "step": 3303 + }, + { + "epoch": 0.40539877300613497, + "grad_norm": 1.0583005770705798, + "learning_rate": 1.3476779053839393e-05, + "loss": 0.6241, + "step": 3304 + }, + { + "epoch": 0.405521472392638, + "grad_norm": 0.9888526211922297, + "learning_rate": 1.3473052528448203e-05, + "loss": 0.5904, + "step": 3305 + }, + { + "epoch": 0.4056441717791411, + "grad_norm": 0.9596393097397337, + "learning_rate": 1.3469325454518227e-05, + "loss": 0.6213, + "step": 3306 + }, + { + "epoch": 0.4057668711656442, + "grad_norm": 0.833365950805298, + "learning_rate": 1.3465597832638122e-05, + "loss": 0.524, + "step": 3307 + }, + { + "epoch": 0.40588957055214725, + "grad_norm": 0.9581210845752354, + "learning_rate": 1.3461869663396629e-05, + "loss": 0.5681, + "step": 3308 + }, + { + "epoch": 0.4060122699386503, + "grad_norm": 0.9051024089353102, + "learning_rate": 1.3458140947382589e-05, + "loss": 0.5979, + "step": 3309 + }, + { + "epoch": 0.40613496932515336, + "grad_norm": 1.2430003641006118, + "learning_rate": 1.3454411685184913e-05, + "loss": 0.5677, + "step": 3310 + }, + { + "epoch": 0.4062576687116564, + "grad_norm": 0.8517591092061436, + "learning_rate": 1.345068187739261e-05, + "loss": 0.5912, + "step": 3311 + }, + { + "epoch": 0.40638036809815953, + "grad_norm": 0.9139937816387718, + "learning_rate": 1.3446951524594773e-05, + "loss": 0.5685, + "step": 3312 + }, + { + "epoch": 0.4065030674846626, + "grad_norm": 1.0784998574827886, + "learning_rate": 1.3443220627380575e-05, + "loss": 0.6547, + "step": 3313 + }, + { + "epoch": 0.40662576687116564, + "grad_norm": 0.9324506360099462, + "learning_rate": 1.3439489186339283e-05, + "loss": 0.601, + "step": 3314 + }, + { + "epoch": 0.4067484662576687, + "grad_norm": 0.9589579020281833, + "learning_rate": 1.3435757202060242e-05, + "loss": 0.5954, + "step": 3315 + }, + { + "epoch": 0.4068711656441718, + "grad_norm": 0.8358556455443779, + "learning_rate": 1.343202467513289e-05, + "loss": 0.5812, + "step": 3316 + }, + { + "epoch": 0.40699386503067486, + "grad_norm": 0.8761400554308534, + "learning_rate": 1.3428291606146747e-05, + "loss": 0.629, + "step": 3317 + }, + { + "epoch": 0.4071165644171779, + "grad_norm": 0.7894547259274713, + "learning_rate": 1.3424557995691415e-05, + "loss": 0.5574, + "step": 3318 + }, + { + "epoch": 0.407239263803681, + "grad_norm": 0.965099491693241, + "learning_rate": 1.3420823844356589e-05, + "loss": 0.6509, + "step": 3319 + }, + { + "epoch": 0.40736196319018403, + "grad_norm": 
1.003070913345677, + "learning_rate": 1.3417089152732049e-05, + "loss": 0.6153, + "step": 3320 + }, + { + "epoch": 0.40748466257668714, + "grad_norm": 0.8966693461782526, + "learning_rate": 1.3413353921407652e-05, + "loss": 0.5863, + "step": 3321 + }, + { + "epoch": 0.4076073619631902, + "grad_norm": 1.0823843216153766, + "learning_rate": 1.3409618150973349e-05, + "loss": 0.5984, + "step": 3322 + }, + { + "epoch": 0.40773006134969325, + "grad_norm": 0.9248184552952229, + "learning_rate": 1.340588184201917e-05, + "loss": 0.5916, + "step": 3323 + }, + { + "epoch": 0.4078527607361963, + "grad_norm": 1.0754476304315985, + "learning_rate": 1.3402144995135237e-05, + "loss": 0.6129, + "step": 3324 + }, + { + "epoch": 0.40797546012269936, + "grad_norm": 0.9666402997943422, + "learning_rate": 1.3398407610911752e-05, + "loss": 0.5508, + "step": 3325 + }, + { + "epoch": 0.4080981595092025, + "grad_norm": 1.1097852909086756, + "learning_rate": 1.3394669689939002e-05, + "loss": 0.6478, + "step": 3326 + }, + { + "epoch": 0.40822085889570553, + "grad_norm": 0.8808826676191376, + "learning_rate": 1.3390931232807359e-05, + "loss": 0.5289, + "step": 3327 + }, + { + "epoch": 0.4083435582822086, + "grad_norm": 0.8764564674227414, + "learning_rate": 1.3387192240107277e-05, + "loss": 0.5707, + "step": 3328 + }, + { + "epoch": 0.40846625766871164, + "grad_norm": 0.9550327915238651, + "learning_rate": 1.3383452712429304e-05, + "loss": 0.6024, + "step": 3329 + }, + { + "epoch": 0.4085889570552147, + "grad_norm": 0.8866814480167486, + "learning_rate": 1.3379712650364061e-05, + "loss": 0.6172, + "step": 3330 + }, + { + "epoch": 0.4087116564417178, + "grad_norm": 0.8523304457432236, + "learning_rate": 1.337597205450226e-05, + "loss": 0.6138, + "step": 3331 + }, + { + "epoch": 0.40883435582822086, + "grad_norm": 0.8680623060446337, + "learning_rate": 1.3372230925434699e-05, + "loss": 0.5905, + "step": 3332 + }, + { + "epoch": 0.4089570552147239, + "grad_norm": 0.9055093842442573, + "learning_rate": 1.3368489263752254e-05, + "loss": 0.6355, + "step": 3333 + }, + { + "epoch": 0.409079754601227, + "grad_norm": 0.9928104268871206, + "learning_rate": 1.3364747070045889e-05, + "loss": 0.6109, + "step": 3334 + }, + { + "epoch": 0.4092024539877301, + "grad_norm": 0.7750386688733788, + "learning_rate": 1.3361004344906652e-05, + "loss": 0.55, + "step": 3335 + }, + { + "epoch": 0.40932515337423314, + "grad_norm": 0.8938105247098711, + "learning_rate": 1.3357261088925673e-05, + "loss": 0.5747, + "step": 3336 + }, + { + "epoch": 0.4094478527607362, + "grad_norm": 1.0011873843889731, + "learning_rate": 1.3353517302694168e-05, + "loss": 0.6714, + "step": 3337 + }, + { + "epoch": 0.40957055214723925, + "grad_norm": 0.9874299219578178, + "learning_rate": 1.3349772986803438e-05, + "loss": 0.6017, + "step": 3338 + }, + { + "epoch": 0.4096932515337423, + "grad_norm": 1.1686556744315015, + "learning_rate": 1.334602814184486e-05, + "loss": 0.6364, + "step": 3339 + }, + { + "epoch": 0.4098159509202454, + "grad_norm": 0.7853493225373308, + "learning_rate": 1.3342282768409904e-05, + "loss": 0.5939, + "step": 3340 + }, + { + "epoch": 0.4099386503067485, + "grad_norm": 0.8736754474017544, + "learning_rate": 1.333853686709012e-05, + "loss": 0.6041, + "step": 3341 + }, + { + "epoch": 0.41006134969325153, + "grad_norm": 0.9769166594074257, + "learning_rate": 1.3334790438477141e-05, + "loss": 0.6031, + "step": 3342 + }, + { + "epoch": 0.4101840490797546, + "grad_norm": 0.8097700357503284, + "learning_rate": 1.333104348316268e-05, + "loss": 0.5675, 
+ "step": 3343 + }, + { + "epoch": 0.41030674846625764, + "grad_norm": 0.789316316258005, + "learning_rate": 1.3327296001738536e-05, + "loss": 0.5976, + "step": 3344 + }, + { + "epoch": 0.41042944785276075, + "grad_norm": 0.884002975196575, + "learning_rate": 1.3323547994796597e-05, + "loss": 0.5885, + "step": 3345 + }, + { + "epoch": 0.4105521472392638, + "grad_norm": 0.9945309078788193, + "learning_rate": 1.3319799462928825e-05, + "loss": 0.6175, + "step": 3346 + }, + { + "epoch": 0.41067484662576687, + "grad_norm": 0.9301129668607161, + "learning_rate": 1.331605040672727e-05, + "loss": 0.6105, + "step": 3347 + }, + { + "epoch": 0.4107975460122699, + "grad_norm": 0.8602804582948316, + "learning_rate": 1.3312300826784062e-05, + "loss": 0.5654, + "step": 3348 + }, + { + "epoch": 0.41092024539877303, + "grad_norm": 0.964666998667026, + "learning_rate": 1.3308550723691411e-05, + "loss": 0.6498, + "step": 3349 + }, + { + "epoch": 0.4110429447852761, + "grad_norm": 0.8316237275767168, + "learning_rate": 1.330480009804162e-05, + "loss": 0.5859, + "step": 3350 + }, + { + "epoch": 0.41116564417177914, + "grad_norm": 0.8135784251988473, + "learning_rate": 1.3301048950427062e-05, + "loss": 0.612, + "step": 3351 + }, + { + "epoch": 0.4112883435582822, + "grad_norm": 1.0009472363153693, + "learning_rate": 1.3297297281440204e-05, + "loss": 0.6308, + "step": 3352 + }, + { + "epoch": 0.41141104294478525, + "grad_norm": 1.0084379276906703, + "learning_rate": 1.3293545091673587e-05, + "loss": 0.6387, + "step": 3353 + }, + { + "epoch": 0.41153374233128837, + "grad_norm": 0.9393470107336301, + "learning_rate": 1.3289792381719839e-05, + "loss": 0.5767, + "step": 3354 + }, + { + "epoch": 0.4116564417177914, + "grad_norm": 0.9876544440722705, + "learning_rate": 1.3286039152171667e-05, + "loss": 0.5604, + "step": 3355 + }, + { + "epoch": 0.4117791411042945, + "grad_norm": 0.8364980797430343, + "learning_rate": 1.3282285403621864e-05, + "loss": 0.5778, + "step": 3356 + }, + { + "epoch": 0.41190184049079753, + "grad_norm": 0.9380223482854465, + "learning_rate": 1.3278531136663299e-05, + "loss": 0.6041, + "step": 3357 + }, + { + "epoch": 0.4120245398773006, + "grad_norm": 1.0385680686983774, + "learning_rate": 1.3274776351888924e-05, + "loss": 0.5779, + "step": 3358 + }, + { + "epoch": 0.4121472392638037, + "grad_norm": 1.1247615153155264, + "learning_rate": 1.3271021049891778e-05, + "loss": 0.5919, + "step": 3359 + }, + { + "epoch": 0.41226993865030676, + "grad_norm": 0.8754571753119711, + "learning_rate": 1.3267265231264982e-05, + "loss": 0.5831, + "step": 3360 + }, + { + "epoch": 0.4123926380368098, + "grad_norm": 0.969478964443864, + "learning_rate": 1.3263508896601732e-05, + "loss": 0.5809, + "step": 3361 + }, + { + "epoch": 0.41251533742331287, + "grad_norm": 0.9119095725723196, + "learning_rate": 1.3259752046495304e-05, + "loss": 0.6012, + "step": 3362 + }, + { + "epoch": 0.412638036809816, + "grad_norm": 0.946214780560667, + "learning_rate": 1.3255994681539069e-05, + "loss": 0.5486, + "step": 3363 + }, + { + "epoch": 0.41276073619631903, + "grad_norm": 0.9328886710467607, + "learning_rate": 1.3252236802326464e-05, + "loss": 0.5447, + "step": 3364 + }, + { + "epoch": 0.4128834355828221, + "grad_norm": 0.8886046772432127, + "learning_rate": 1.3248478409451017e-05, + "loss": 0.6009, + "step": 3365 + }, + { + "epoch": 0.41300613496932514, + "grad_norm": 0.9388442457198046, + "learning_rate": 1.3244719503506334e-05, + "loss": 0.5997, + "step": 3366 + }, + { + "epoch": 0.4131288343558282, + "grad_norm": 
2.2813929995384465, + "learning_rate": 1.3240960085086099e-05, + "loss": 0.5955, + "step": 3367 + }, + { + "epoch": 0.4132515337423313, + "grad_norm": 0.8373670626849907, + "learning_rate": 1.3237200154784083e-05, + "loss": 0.6194, + "step": 3368 + }, + { + "epoch": 0.41337423312883437, + "grad_norm": 0.9312404587499108, + "learning_rate": 1.323343971319413e-05, + "loss": 0.6258, + "step": 3369 + }, + { + "epoch": 0.4134969325153374, + "grad_norm": 1.0503500924150952, + "learning_rate": 1.3229678760910174e-05, + "loss": 0.6012, + "step": 3370 + }, + { + "epoch": 0.4136196319018405, + "grad_norm": 0.9620994520898156, + "learning_rate": 1.3225917298526225e-05, + "loss": 0.626, + "step": 3371 + }, + { + "epoch": 0.41374233128834353, + "grad_norm": 0.9239996322717962, + "learning_rate": 1.3222155326636373e-05, + "loss": 0.5763, + "step": 3372 + }, + { + "epoch": 0.41386503067484665, + "grad_norm": 0.8659130037068411, + "learning_rate": 1.3218392845834789e-05, + "loss": 0.6319, + "step": 3373 + }, + { + "epoch": 0.4139877300613497, + "grad_norm": 0.8853671319213593, + "learning_rate": 1.3214629856715721e-05, + "loss": 0.6302, + "step": 3374 + }, + { + "epoch": 0.41411042944785276, + "grad_norm": 0.8889093717817345, + "learning_rate": 1.3210866359873506e-05, + "loss": 0.5162, + "step": 3375 + }, + { + "epoch": 0.4142331288343558, + "grad_norm": 0.8649459028385181, + "learning_rate": 1.3207102355902553e-05, + "loss": 0.6432, + "step": 3376 + }, + { + "epoch": 0.41435582822085887, + "grad_norm": 0.9049464090005938, + "learning_rate": 1.3203337845397358e-05, + "loss": 0.596, + "step": 3377 + }, + { + "epoch": 0.414478527607362, + "grad_norm": 0.9455049739886291, + "learning_rate": 1.3199572828952485e-05, + "loss": 0.6198, + "step": 3378 + }, + { + "epoch": 0.41460122699386504, + "grad_norm": 0.9641815358838114, + "learning_rate": 1.319580730716259e-05, + "loss": 0.535, + "step": 3379 + }, + { + "epoch": 0.4147239263803681, + "grad_norm": 1.1042779750991865, + "learning_rate": 1.3192041280622409e-05, + "loss": 0.6101, + "step": 3380 + }, + { + "epoch": 0.41484662576687115, + "grad_norm": 0.9286480386148812, + "learning_rate": 1.318827474992675e-05, + "loss": 0.6071, + "step": 3381 + }, + { + "epoch": 0.41496932515337426, + "grad_norm": 0.9257656359932002, + "learning_rate": 1.3184507715670505e-05, + "loss": 0.5984, + "step": 3382 + }, + { + "epoch": 0.4150920245398773, + "grad_norm": 1.0334030758750758, + "learning_rate": 1.3180740178448641e-05, + "loss": 0.5831, + "step": 3383 + }, + { + "epoch": 0.41521472392638037, + "grad_norm": 0.8271523481055952, + "learning_rate": 1.3176972138856215e-05, + "loss": 0.5829, + "step": 3384 + }, + { + "epoch": 0.4153374233128834, + "grad_norm": 1.0768818292888769, + "learning_rate": 1.3173203597488348e-05, + "loss": 0.559, + "step": 3385 + }, + { + "epoch": 0.4154601226993865, + "grad_norm": 0.9270347375271655, + "learning_rate": 1.3169434554940259e-05, + "loss": 0.6444, + "step": 3386 + }, + { + "epoch": 0.4155828220858896, + "grad_norm": 1.0632257123067306, + "learning_rate": 1.3165665011807227e-05, + "loss": 0.5412, + "step": 3387 + }, + { + "epoch": 0.41570552147239265, + "grad_norm": 0.9470484494815554, + "learning_rate": 1.3161894968684623e-05, + "loss": 0.6617, + "step": 3388 + }, + { + "epoch": 0.4158282208588957, + "grad_norm": 0.9040963384846393, + "learning_rate": 1.3158124426167891e-05, + "loss": 0.6219, + "step": 3389 + }, + { + "epoch": 0.41595092024539876, + "grad_norm": 0.9619703460733903, + "learning_rate": 1.3154353384852559e-05, + "loss": 
0.5786, + "step": 3390 + }, + { + "epoch": 0.4160736196319018, + "grad_norm": 0.860691017205544, + "learning_rate": 1.3150581845334225e-05, + "loss": 0.5512, + "step": 3391 + }, + { + "epoch": 0.4161963190184049, + "grad_norm": 0.9965545809252212, + "learning_rate": 1.3146809808208574e-05, + "loss": 0.5755, + "step": 3392 + }, + { + "epoch": 0.416319018404908, + "grad_norm": 1.0108576022343156, + "learning_rate": 1.3143037274071368e-05, + "loss": 0.6384, + "step": 3393 + }, + { + "epoch": 0.41644171779141104, + "grad_norm": 0.8447560033549634, + "learning_rate": 1.3139264243518448e-05, + "loss": 0.6148, + "step": 3394 + }, + { + "epoch": 0.4165644171779141, + "grad_norm": 0.9008334610428297, + "learning_rate": 1.3135490717145726e-05, + "loss": 0.5997, + "step": 3395 + }, + { + "epoch": 0.4166871165644172, + "grad_norm": 0.8671728764101476, + "learning_rate": 1.3131716695549208e-05, + "loss": 0.5663, + "step": 3396 + }, + { + "epoch": 0.41680981595092026, + "grad_norm": 0.8771900407458072, + "learning_rate": 1.3127942179324957e-05, + "loss": 0.6139, + "step": 3397 + }, + { + "epoch": 0.4169325153374233, + "grad_norm": 0.9476709631655517, + "learning_rate": 1.312416716906913e-05, + "loss": 0.5893, + "step": 3398 + }, + { + "epoch": 0.41705521472392637, + "grad_norm": 0.9319659277193006, + "learning_rate": 1.3120391665377961e-05, + "loss": 0.5831, + "step": 3399 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 0.8689265143500567, + "learning_rate": 1.3116615668847749e-05, + "loss": 0.6263, + "step": 3400 + }, + { + "epoch": 0.41730061349693254, + "grad_norm": 0.9909919006257241, + "learning_rate": 1.3112839180074892e-05, + "loss": 0.5794, + "step": 3401 + }, + { + "epoch": 0.4174233128834356, + "grad_norm": 0.9427140848465336, + "learning_rate": 1.3109062199655844e-05, + "loss": 0.6036, + "step": 3402 + }, + { + "epoch": 0.41754601226993865, + "grad_norm": 0.9294172956739307, + "learning_rate": 1.3105284728187153e-05, + "loss": 0.6154, + "step": 3403 + }, + { + "epoch": 0.4176687116564417, + "grad_norm": 0.9386568305775982, + "learning_rate": 1.3101506766265435e-05, + "loss": 0.6168, + "step": 3404 + }, + { + "epoch": 0.41779141104294476, + "grad_norm": 0.8920247844611586, + "learning_rate": 1.3097728314487385e-05, + "loss": 0.5998, + "step": 3405 + }, + { + "epoch": 0.41791411042944787, + "grad_norm": 0.8734552635150168, + "learning_rate": 1.3093949373449783e-05, + "loss": 0.5633, + "step": 3406 + }, + { + "epoch": 0.4180368098159509, + "grad_norm": 0.8917993103221236, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.5603, + "step": 3407 + }, + { + "epoch": 0.418159509202454, + "grad_norm": 0.9628877955627891, + "learning_rate": 1.3086390025983393e-05, + "loss": 0.6164, + "step": 3408 + }, + { + "epoch": 0.41828220858895704, + "grad_norm": 0.9280461708854775, + "learning_rate": 1.3082609620748538e-05, + "loss": 0.5994, + "step": 3409 + }, + { + "epoch": 0.41840490797546015, + "grad_norm": 0.9805450318818629, + "learning_rate": 1.3078828728641994e-05, + "loss": 0.5805, + "step": 3410 + }, + { + "epoch": 0.4185276073619632, + "grad_norm": 0.9441745734008368, + "learning_rate": 1.3075047350260925e-05, + "loss": 0.5656, + "step": 3411 + }, + { + "epoch": 0.41865030674846626, + "grad_norm": 0.8722143754558999, + "learning_rate": 1.3071265486202562e-05, + "loss": 0.62, + "step": 3412 + }, + { + "epoch": 0.4187730061349693, + "grad_norm": 0.7665757972229554, + "learning_rate": 1.306748313706422e-05, + "loss": 0.5756, + "step": 3413 + }, + { + "epoch": 0.4188957055214724, + "grad_norm": 
1.0107177613424836, + "learning_rate": 1.3063700303443287e-05, + "loss": 0.5872, + "step": 3414 + }, + { + "epoch": 0.4190184049079755, + "grad_norm": 0.9052919707153484, + "learning_rate": 1.305991698593723e-05, + "loss": 0.6068, + "step": 3415 + }, + { + "epoch": 0.41914110429447854, + "grad_norm": 0.9264581688304666, + "learning_rate": 1.3056133185143596e-05, + "loss": 0.5546, + "step": 3416 + }, + { + "epoch": 0.4192638036809816, + "grad_norm": 0.8747373382102902, + "learning_rate": 1.3052348901659998e-05, + "loss": 0.559, + "step": 3417 + }, + { + "epoch": 0.41938650306748465, + "grad_norm": 0.8775403913708522, + "learning_rate": 1.304856413608413e-05, + "loss": 0.5966, + "step": 3418 + }, + { + "epoch": 0.4195092024539877, + "grad_norm": 0.9850989790521277, + "learning_rate": 1.3044778889013764e-05, + "loss": 0.6152, + "step": 3419 + }, + { + "epoch": 0.4196319018404908, + "grad_norm": 1.0975974956207315, + "learning_rate": 1.3040993161046749e-05, + "loss": 0.6429, + "step": 3420 + }, + { + "epoch": 0.4197546012269939, + "grad_norm": 0.9619649838931853, + "learning_rate": 1.3037206952781012e-05, + "loss": 0.62, + "step": 3421 + }, + { + "epoch": 0.41987730061349693, + "grad_norm": 0.9207411764101747, + "learning_rate": 1.3033420264814547e-05, + "loss": 0.5802, + "step": 3422 + }, + { + "epoch": 0.42, + "grad_norm": 0.9394355717600839, + "learning_rate": 1.3029633097745427e-05, + "loss": 0.5874, + "step": 3423 + }, + { + "epoch": 0.42012269938650304, + "grad_norm": 0.9048277216974991, + "learning_rate": 1.3025845452171808e-05, + "loss": 0.5665, + "step": 3424 + }, + { + "epoch": 0.42024539877300615, + "grad_norm": 2.3832665572936333, + "learning_rate": 1.3022057328691915e-05, + "loss": 0.6365, + "step": 3425 + }, + { + "epoch": 0.4203680981595092, + "grad_norm": 0.9056975234059261, + "learning_rate": 1.3018268727904046e-05, + "loss": 0.6306, + "step": 3426 + }, + { + "epoch": 0.42049079754601226, + "grad_norm": 0.8078732571761624, + "learning_rate": 1.3014479650406581e-05, + "loss": 0.6465, + "step": 3427 + }, + { + "epoch": 0.4206134969325153, + "grad_norm": 0.891103532908138, + "learning_rate": 1.3010690096797971e-05, + "loss": 0.5751, + "step": 3428 + }, + { + "epoch": 0.42073619631901843, + "grad_norm": 1.0085482034222253, + "learning_rate": 1.3006900067676742e-05, + "loss": 0.63, + "step": 3429 + }, + { + "epoch": 0.4208588957055215, + "grad_norm": 0.9172734525689731, + "learning_rate": 1.3003109563641499e-05, + "loss": 0.6401, + "step": 3430 + }, + { + "epoch": 0.42098159509202454, + "grad_norm": 0.9631790554581752, + "learning_rate": 1.2999318585290919e-05, + "loss": 0.5946, + "step": 3431 + }, + { + "epoch": 0.4211042944785276, + "grad_norm": 0.8986625998830464, + "learning_rate": 1.2995527133223753e-05, + "loss": 0.5806, + "step": 3432 + }, + { + "epoch": 0.42122699386503065, + "grad_norm": 0.996209317739725, + "learning_rate": 1.2991735208038829e-05, + "loss": 0.6108, + "step": 3433 + }, + { + "epoch": 0.42134969325153376, + "grad_norm": 0.9380121190552612, + "learning_rate": 1.298794281033505e-05, + "loss": 0.6105, + "step": 3434 + }, + { + "epoch": 0.4214723926380368, + "grad_norm": 1.0320970213396081, + "learning_rate": 1.298414994071139e-05, + "loss": 0.6079, + "step": 3435 + }, + { + "epoch": 0.4215950920245399, + "grad_norm": 0.9252658315893755, + "learning_rate": 1.2980356599766908e-05, + "loss": 0.5714, + "step": 3436 + }, + { + "epoch": 0.42171779141104293, + "grad_norm": 0.950677958214144, + "learning_rate": 1.297656278810072e-05, + "loss": 0.6415, + "step": 3437 + 
}, + { + "epoch": 0.421840490797546, + "grad_norm": 0.9735045008037665, + "learning_rate": 1.2972768506312028e-05, + "loss": 0.5532, + "step": 3438 + }, + { + "epoch": 0.4219631901840491, + "grad_norm": 0.8729773506956704, + "learning_rate": 1.2968973755000109e-05, + "loss": 0.5283, + "step": 3439 + }, + { + "epoch": 0.42208588957055215, + "grad_norm": 0.9053676336504252, + "learning_rate": 1.2965178534764311e-05, + "loss": 0.5884, + "step": 3440 + }, + { + "epoch": 0.4222085889570552, + "grad_norm": 1.0892544050453496, + "learning_rate": 1.2961382846204056e-05, + "loss": 0.6055, + "step": 3441 + }, + { + "epoch": 0.42233128834355826, + "grad_norm": 0.9227861352729766, + "learning_rate": 1.2957586689918838e-05, + "loss": 0.5654, + "step": 3442 + }, + { + "epoch": 0.4224539877300614, + "grad_norm": 1.1285984788989758, + "learning_rate": 1.295379006650823e-05, + "loss": 0.609, + "step": 3443 + }, + { + "epoch": 0.42257668711656443, + "grad_norm": 0.9334276035970486, + "learning_rate": 1.2949992976571874e-05, + "loss": 0.5773, + "step": 3444 + }, + { + "epoch": 0.4226993865030675, + "grad_norm": 0.9703540551845605, + "learning_rate": 1.294619542070949e-05, + "loss": 0.5441, + "step": 3445 + }, + { + "epoch": 0.42282208588957054, + "grad_norm": 0.9460356927475005, + "learning_rate": 1.2942397399520867e-05, + "loss": 0.5852, + "step": 3446 + }, + { + "epoch": 0.4229447852760736, + "grad_norm": 0.8445098298340178, + "learning_rate": 1.2938598913605872e-05, + "loss": 0.5927, + "step": 3447 + }, + { + "epoch": 0.4230674846625767, + "grad_norm": 0.9404969488882003, + "learning_rate": 1.2934799963564441e-05, + "loss": 0.555, + "step": 3448 + }, + { + "epoch": 0.42319018404907977, + "grad_norm": 0.9378240027205291, + "learning_rate": 1.293100054999659e-05, + "loss": 0.6197, + "step": 3449 + }, + { + "epoch": 0.4233128834355828, + "grad_norm": 1.2484448728208273, + "learning_rate": 1.2927200673502399e-05, + "loss": 0.5767, + "step": 3450 + }, + { + "epoch": 0.4234355828220859, + "grad_norm": 1.0442444456643274, + "learning_rate": 1.2923400334682025e-05, + "loss": 0.6087, + "step": 3451 + }, + { + "epoch": 0.42355828220858893, + "grad_norm": 0.9648419940900166, + "learning_rate": 1.2919599534135703e-05, + "loss": 0.6478, + "step": 3452 + }, + { + "epoch": 0.42368098159509204, + "grad_norm": 0.9951721846821444, + "learning_rate": 1.2915798272463733e-05, + "loss": 0.6084, + "step": 3453 + }, + { + "epoch": 0.4238036809815951, + "grad_norm": 0.969177699465104, + "learning_rate": 1.2911996550266498e-05, + "loss": 0.5679, + "step": 3454 + }, + { + "epoch": 0.42392638036809815, + "grad_norm": 0.8154068944894052, + "learning_rate": 1.2908194368144437e-05, + "loss": 0.6028, + "step": 3455 + }, + { + "epoch": 0.4240490797546012, + "grad_norm": 0.9158091075866284, + "learning_rate": 1.290439172669808e-05, + "loss": 0.6078, + "step": 3456 + }, + { + "epoch": 0.4241717791411043, + "grad_norm": 0.8561025154669258, + "learning_rate": 1.2900588626528023e-05, + "loss": 0.6375, + "step": 3457 + }, + { + "epoch": 0.4242944785276074, + "grad_norm": 0.9035333622311619, + "learning_rate": 1.2896785068234925e-05, + "loss": 0.5855, + "step": 3458 + }, + { + "epoch": 0.42441717791411043, + "grad_norm": 0.882094875936706, + "learning_rate": 1.2892981052419532e-05, + "loss": 0.6008, + "step": 3459 + }, + { + "epoch": 0.4245398773006135, + "grad_norm": 1.0246835109764891, + "learning_rate": 1.288917657968265e-05, + "loss": 0.604, + "step": 3460 + }, + { + "epoch": 0.42466257668711654, + "grad_norm": 1.0590819522177302, + 
"learning_rate": 1.2885371650625164e-05, + "loss": 0.5746, + "step": 3461 + }, + { + "epoch": 0.42478527607361966, + "grad_norm": 0.8600509348369193, + "learning_rate": 1.2881566265848035e-05, + "loss": 0.5765, + "step": 3462 + }, + { + "epoch": 0.4249079754601227, + "grad_norm": 1.0585486344011998, + "learning_rate": 1.2877760425952287e-05, + "loss": 0.5935, + "step": 3463 + }, + { + "epoch": 0.42503067484662577, + "grad_norm": 0.9476668336657437, + "learning_rate": 1.2873954131539024e-05, + "loss": 0.6252, + "step": 3464 + }, + { + "epoch": 0.4251533742331288, + "grad_norm": 1.0087663927000834, + "learning_rate": 1.287014738320941e-05, + "loss": 0.5835, + "step": 3465 + }, + { + "epoch": 0.4252760736196319, + "grad_norm": 0.8823272364163952, + "learning_rate": 1.2866340181564694e-05, + "loss": 0.5779, + "step": 3466 + }, + { + "epoch": 0.425398773006135, + "grad_norm": 0.8158450747496844, + "learning_rate": 1.2862532527206186e-05, + "loss": 0.5245, + "step": 3467 + }, + { + "epoch": 0.42552147239263804, + "grad_norm": 0.8957471408165345, + "learning_rate": 1.2858724420735278e-05, + "loss": 0.6313, + "step": 3468 + }, + { + "epoch": 0.4256441717791411, + "grad_norm": 1.0017110807306968, + "learning_rate": 1.2854915862753424e-05, + "loss": 0.5869, + "step": 3469 + }, + { + "epoch": 0.42576687116564416, + "grad_norm": 0.9891292631759666, + "learning_rate": 1.285110685386215e-05, + "loss": 0.6357, + "step": 3470 + }, + { + "epoch": 0.4258895705521472, + "grad_norm": 0.8346310055550162, + "learning_rate": 1.2847297394663064e-05, + "loss": 0.5559, + "step": 3471 + }, + { + "epoch": 0.4260122699386503, + "grad_norm": 0.8707773251884883, + "learning_rate": 1.2843487485757831e-05, + "loss": 0.5825, + "step": 3472 + }, + { + "epoch": 0.4261349693251534, + "grad_norm": 0.9475382796219843, + "learning_rate": 1.2839677127748195e-05, + "loss": 0.5883, + "step": 3473 + }, + { + "epoch": 0.42625766871165643, + "grad_norm": 0.9799004362461402, + "learning_rate": 1.2835866321235972e-05, + "loss": 0.5994, + "step": 3474 + }, + { + "epoch": 0.4263803680981595, + "grad_norm": 0.8768836376150718, + "learning_rate": 1.283205506682304e-05, + "loss": 0.5969, + "step": 3475 + }, + { + "epoch": 0.4265030674846626, + "grad_norm": 0.8779001421719338, + "learning_rate": 1.2828243365111364e-05, + "loss": 0.5829, + "step": 3476 + }, + { + "epoch": 0.42662576687116566, + "grad_norm": 1.2354818389971047, + "learning_rate": 1.282443121670296e-05, + "loss": 0.6319, + "step": 3477 + }, + { + "epoch": 0.4267484662576687, + "grad_norm": 0.8637072072869981, + "learning_rate": 1.2820618622199925e-05, + "loss": 0.6202, + "step": 3478 + }, + { + "epoch": 0.42687116564417177, + "grad_norm": 0.7971579992025735, + "learning_rate": 1.281680558220443e-05, + "loss": 0.5502, + "step": 3479 + }, + { + "epoch": 0.4269938650306748, + "grad_norm": 0.955733413070169, + "learning_rate": 1.2812992097318711e-05, + "loss": 0.6192, + "step": 3480 + }, + { + "epoch": 0.42711656441717794, + "grad_norm": 0.9457946897577758, + "learning_rate": 1.2809178168145074e-05, + "loss": 0.5655, + "step": 3481 + }, + { + "epoch": 0.427239263803681, + "grad_norm": 0.9274999939268076, + "learning_rate": 1.2805363795285895e-05, + "loss": 0.5795, + "step": 3482 + }, + { + "epoch": 0.42736196319018405, + "grad_norm": 0.9682347484655177, + "learning_rate": 1.2801548979343621e-05, + "loss": 0.5759, + "step": 3483 + }, + { + "epoch": 0.4274846625766871, + "grad_norm": 1.0859109536079365, + "learning_rate": 1.2797733720920773e-05, + "loss": 0.616, + "step": 3484 + }, + 
{ + "epoch": 0.42760736196319016, + "grad_norm": 0.8658748002586144, + "learning_rate": 1.2793918020619937e-05, + "loss": 0.6198, + "step": 3485 + }, + { + "epoch": 0.42773006134969327, + "grad_norm": 0.9396365442193038, + "learning_rate": 1.2790101879043768e-05, + "loss": 0.5729, + "step": 3486 + }, + { + "epoch": 0.4278527607361963, + "grad_norm": 0.9058901476826047, + "learning_rate": 1.2786285296794995e-05, + "loss": 0.5967, + "step": 3487 + }, + { + "epoch": 0.4279754601226994, + "grad_norm": 0.8634585986286226, + "learning_rate": 1.2782468274476416e-05, + "loss": 0.6415, + "step": 3488 + }, + { + "epoch": 0.42809815950920244, + "grad_norm": 0.8550779375937541, + "learning_rate": 1.277865081269089e-05, + "loss": 0.5837, + "step": 3489 + }, + { + "epoch": 0.42822085889570555, + "grad_norm": 1.2224124604021616, + "learning_rate": 1.2774832912041356e-05, + "loss": 0.5722, + "step": 3490 + }, + { + "epoch": 0.4283435582822086, + "grad_norm": 0.8570202631101448, + "learning_rate": 1.2771014573130822e-05, + "loss": 0.5729, + "step": 3491 + }, + { + "epoch": 0.42846625766871166, + "grad_norm": 0.8622507626705593, + "learning_rate": 1.2767195796562359e-05, + "loss": 0.6442, + "step": 3492 + }, + { + "epoch": 0.4285889570552147, + "grad_norm": 0.9416077343207127, + "learning_rate": 1.276337658293911e-05, + "loss": 0.5955, + "step": 3493 + }, + { + "epoch": 0.42871165644171777, + "grad_norm": 0.9247119511150913, + "learning_rate": 1.2759556932864285e-05, + "loss": 0.5987, + "step": 3494 + }, + { + "epoch": 0.4288343558282209, + "grad_norm": 0.9750415262557892, + "learning_rate": 1.2755736846941167e-05, + "loss": 0.6081, + "step": 3495 + }, + { + "epoch": 0.42895705521472394, + "grad_norm": 1.0810067387175748, + "learning_rate": 1.2751916325773108e-05, + "loss": 0.6166, + "step": 3496 + }, + { + "epoch": 0.429079754601227, + "grad_norm": 0.8154482929601559, + "learning_rate": 1.2748095369963524e-05, + "loss": 0.5756, + "step": 3497 + }, + { + "epoch": 0.42920245398773005, + "grad_norm": 0.9186528963092264, + "learning_rate": 1.27442739801159e-05, + "loss": 0.572, + "step": 3498 + }, + { + "epoch": 0.4293251533742331, + "grad_norm": 0.9380237839850428, + "learning_rate": 1.2740452156833796e-05, + "loss": 0.5544, + "step": 3499 + }, + { + "epoch": 0.4294478527607362, + "grad_norm": 0.9851147404370556, + "learning_rate": 1.2736629900720832e-05, + "loss": 0.6073, + "step": 3500 + }, + { + "epoch": 0.42957055214723927, + "grad_norm": 0.8272478296302821, + "learning_rate": 1.2732807212380703e-05, + "loss": 0.5948, + "step": 3501 + }, + { + "epoch": 0.4296932515337423, + "grad_norm": 0.8708696048020639, + "learning_rate": 1.2728984092417167e-05, + "loss": 0.5974, + "step": 3502 + }, + { + "epoch": 0.4298159509202454, + "grad_norm": 1.0237929786916269, + "learning_rate": 1.2725160541434053e-05, + "loss": 0.6174, + "step": 3503 + }, + { + "epoch": 0.4299386503067485, + "grad_norm": 0.8791006319047753, + "learning_rate": 1.2721336560035267e-05, + "loss": 0.5558, + "step": 3504 + }, + { + "epoch": 0.43006134969325155, + "grad_norm": 0.842170230169754, + "learning_rate": 1.2717512148824764e-05, + "loss": 0.5781, + "step": 3505 + }, + { + "epoch": 0.4301840490797546, + "grad_norm": 0.8676866047844832, + "learning_rate": 1.2713687308406581e-05, + "loss": 0.5814, + "step": 3506 + }, + { + "epoch": 0.43030674846625766, + "grad_norm": 1.010283309116112, + "learning_rate": 1.2709862039384817e-05, + "loss": 0.5532, + "step": 3507 + }, + { + "epoch": 0.4304294478527607, + "grad_norm": 0.9102871176481608, + 
"learning_rate": 1.2706036342363641e-05, + "loss": 0.5603, + "step": 3508 + }, + { + "epoch": 0.4305521472392638, + "grad_norm": 1.03467619769474, + "learning_rate": 1.2702210217947289e-05, + "loss": 0.6632, + "step": 3509 + }, + { + "epoch": 0.4306748466257669, + "grad_norm": 0.9533327954098982, + "learning_rate": 1.2698383666740064e-05, + "loss": 0.6423, + "step": 3510 + }, + { + "epoch": 0.43079754601226994, + "grad_norm": 0.9246837273523786, + "learning_rate": 1.2694556689346336e-05, + "loss": 0.6339, + "step": 3511 + }, + { + "epoch": 0.430920245398773, + "grad_norm": 1.28268800533486, + "learning_rate": 1.2690729286370546e-05, + "loss": 0.6151, + "step": 3512 + }, + { + "epoch": 0.43104294478527605, + "grad_norm": 0.8846687784081174, + "learning_rate": 1.2686901458417197e-05, + "loss": 0.566, + "step": 3513 + }, + { + "epoch": 0.43116564417177916, + "grad_norm": 0.9996164006572549, + "learning_rate": 1.2683073206090863e-05, + "loss": 0.6232, + "step": 3514 + }, + { + "epoch": 0.4312883435582822, + "grad_norm": 0.8224602616639994, + "learning_rate": 1.2679244529996182e-05, + "loss": 0.5925, + "step": 3515 + }, + { + "epoch": 0.4314110429447853, + "grad_norm": 1.0564205800830915, + "learning_rate": 1.2675415430737863e-05, + "loss": 0.6059, + "step": 3516 + }, + { + "epoch": 0.43153374233128833, + "grad_norm": 0.9689534802396516, + "learning_rate": 1.2671585908920676e-05, + "loss": 0.5822, + "step": 3517 + }, + { + "epoch": 0.4316564417177914, + "grad_norm": 0.901467623395095, + "learning_rate": 1.2667755965149464e-05, + "loss": 0.5294, + "step": 3518 + }, + { + "epoch": 0.4317791411042945, + "grad_norm": 1.0237829260754165, + "learning_rate": 1.2663925600029132e-05, + "loss": 0.597, + "step": 3519 + }, + { + "epoch": 0.43190184049079755, + "grad_norm": 0.8890411064576237, + "learning_rate": 1.2660094814164653e-05, + "loss": 0.5739, + "step": 3520 + }, + { + "epoch": 0.4320245398773006, + "grad_norm": 0.9838065992781472, + "learning_rate": 1.2656263608161067e-05, + "loss": 0.5882, + "step": 3521 + }, + { + "epoch": 0.43214723926380366, + "grad_norm": 0.9049476931798881, + "learning_rate": 1.265243198262348e-05, + "loss": 0.617, + "step": 3522 + }, + { + "epoch": 0.4322699386503068, + "grad_norm": 0.9211691989897957, + "learning_rate": 1.2648599938157067e-05, + "loss": 0.5744, + "step": 3523 + }, + { + "epoch": 0.43239263803680983, + "grad_norm": 0.8823273587689773, + "learning_rate": 1.2644767475367063e-05, + "loss": 0.6232, + "step": 3524 + }, + { + "epoch": 0.4325153374233129, + "grad_norm": 1.048323516842906, + "learning_rate": 1.2640934594858773e-05, + "loss": 0.6546, + "step": 3525 + }, + { + "epoch": 0.43263803680981594, + "grad_norm": 0.9782252109303103, + "learning_rate": 1.263710129723757e-05, + "loss": 0.5791, + "step": 3526 + }, + { + "epoch": 0.432760736196319, + "grad_norm": 0.9327660323576947, + "learning_rate": 1.263326758310889e-05, + "loss": 0.6417, + "step": 3527 + }, + { + "epoch": 0.4328834355828221, + "grad_norm": 1.0272582515500464, + "learning_rate": 1.2629433453078232e-05, + "loss": 0.5457, + "step": 3528 + }, + { + "epoch": 0.43300613496932516, + "grad_norm": 1.1180724390970334, + "learning_rate": 1.2625598907751163e-05, + "loss": 0.5967, + "step": 3529 + }, + { + "epoch": 0.4331288343558282, + "grad_norm": 0.9570227190701461, + "learning_rate": 1.262176394773332e-05, + "loss": 0.5454, + "step": 3530 + }, + { + "epoch": 0.4332515337423313, + "grad_norm": 0.9625884050606415, + "learning_rate": 1.2617928573630405e-05, + "loss": 0.6111, + "step": 3531 + }, + { + 
"epoch": 0.43337423312883433, + "grad_norm": 0.8774413983549147, + "learning_rate": 1.2614092786048176e-05, + "loss": 0.6234, + "step": 3532 + }, + { + "epoch": 0.43349693251533744, + "grad_norm": 0.8644855499787828, + "learning_rate": 1.2610256585592464e-05, + "loss": 0.609, + "step": 3533 + }, + { + "epoch": 0.4336196319018405, + "grad_norm": 1.0175518144151245, + "learning_rate": 1.2606419972869167e-05, + "loss": 0.5816, + "step": 3534 + }, + { + "epoch": 0.43374233128834355, + "grad_norm": 0.9181308586814827, + "learning_rate": 1.2602582948484243e-05, + "loss": 0.5835, + "step": 3535 + }, + { + "epoch": 0.4338650306748466, + "grad_norm": 0.9875965591123446, + "learning_rate": 1.2598745513043716e-05, + "loss": 0.5931, + "step": 3536 + }, + { + "epoch": 0.4339877300613497, + "grad_norm": 0.9838867918414264, + "learning_rate": 1.259490766715368e-05, + "loss": 0.5952, + "step": 3537 + }, + { + "epoch": 0.4341104294478528, + "grad_norm": 1.0087679476988103, + "learning_rate": 1.2591069411420285e-05, + "loss": 0.5528, + "step": 3538 + }, + { + "epoch": 0.43423312883435583, + "grad_norm": 0.8628716222308052, + "learning_rate": 1.258723074644975e-05, + "loss": 0.5662, + "step": 3539 + }, + { + "epoch": 0.4343558282208589, + "grad_norm": 1.0444349726204922, + "learning_rate": 1.2583391672848361e-05, + "loss": 0.5643, + "step": 3540 + }, + { + "epoch": 0.43447852760736194, + "grad_norm": 0.8840591987430511, + "learning_rate": 1.2579552191222471e-05, + "loss": 0.5847, + "step": 3541 + }, + { + "epoch": 0.43460122699386505, + "grad_norm": 1.007347475055702, + "learning_rate": 1.2575712302178489e-05, + "loss": 0.598, + "step": 3542 + }, + { + "epoch": 0.4347239263803681, + "grad_norm": 0.987247871981317, + "learning_rate": 1.257187200632289e-05, + "loss": 0.5877, + "step": 3543 + }, + { + "epoch": 0.43484662576687116, + "grad_norm": 0.9728440729208054, + "learning_rate": 1.2568031304262216e-05, + "loss": 0.5929, + "step": 3544 + }, + { + "epoch": 0.4349693251533742, + "grad_norm": 0.9551943233573444, + "learning_rate": 1.256419019660308e-05, + "loss": 0.6148, + "step": 3545 + }, + { + "epoch": 0.4350920245398773, + "grad_norm": 0.9244583990368258, + "learning_rate": 1.2560348683952146e-05, + "loss": 0.5874, + "step": 3546 + }, + { + "epoch": 0.4352147239263804, + "grad_norm": 0.9165280746822032, + "learning_rate": 1.255650676691615e-05, + "loss": 0.5523, + "step": 3547 + }, + { + "epoch": 0.43533742331288344, + "grad_norm": 0.7815689431293029, + "learning_rate": 1.2552664446101888e-05, + "loss": 0.4978, + "step": 3548 + }, + { + "epoch": 0.4354601226993865, + "grad_norm": 0.865633339980005, + "learning_rate": 1.2548821722116223e-05, + "loss": 0.5493, + "step": 3549 + }, + { + "epoch": 0.43558282208588955, + "grad_norm": 0.8901602258548836, + "learning_rate": 1.2544978595566078e-05, + "loss": 0.6031, + "step": 3550 + }, + { + "epoch": 0.43570552147239267, + "grad_norm": 1.035482170689342, + "learning_rate": 1.2541135067058443e-05, + "loss": 0.5884, + "step": 3551 + }, + { + "epoch": 0.4358282208588957, + "grad_norm": 0.8441464106631431, + "learning_rate": 1.2537291137200372e-05, + "loss": 0.6015, + "step": 3552 + }, + { + "epoch": 0.4359509202453988, + "grad_norm": 0.919170558549573, + "learning_rate": 1.253344680659898e-05, + "loss": 0.5551, + "step": 3553 + }, + { + "epoch": 0.43607361963190183, + "grad_norm": 0.9301737944025408, + "learning_rate": 1.2529602075861443e-05, + "loss": 0.5467, + "step": 3554 + }, + { + "epoch": 0.4361963190184049, + "grad_norm": 0.964308703774005, + "learning_rate": 
1.2525756945595006e-05, + "loss": 0.6078, + "step": 3555 + }, + { + "epoch": 0.436319018404908, + "grad_norm": 0.962975293419299, + "learning_rate": 1.2521911416406975e-05, + "loss": 0.5911, + "step": 3556 + }, + { + "epoch": 0.43644171779141105, + "grad_norm": 0.7916139345036537, + "learning_rate": 1.2518065488904717e-05, + "loss": 0.5846, + "step": 3557 + }, + { + "epoch": 0.4365644171779141, + "grad_norm": 0.9043362903617312, + "learning_rate": 1.2514219163695663e-05, + "loss": 0.5892, + "step": 3558 + }, + { + "epoch": 0.43668711656441717, + "grad_norm": 0.9444879789819071, + "learning_rate": 1.2510372441387308e-05, + "loss": 0.5992, + "step": 3559 + }, + { + "epoch": 0.4368098159509202, + "grad_norm": 0.9558307339872385, + "learning_rate": 1.2506525322587207e-05, + "loss": 0.5907, + "step": 3560 + }, + { + "epoch": 0.43693251533742333, + "grad_norm": 0.8668909797045753, + "learning_rate": 1.2502677807902978e-05, + "loss": 0.6036, + "step": 3561 + }, + { + "epoch": 0.4370552147239264, + "grad_norm": 0.9373122470573307, + "learning_rate": 1.2498829897942308e-05, + "loss": 0.6195, + "step": 3562 + }, + { + "epoch": 0.43717791411042944, + "grad_norm": 0.8516466389579708, + "learning_rate": 1.249498159331294e-05, + "loss": 0.5705, + "step": 3563 + }, + { + "epoch": 0.4373006134969325, + "grad_norm": 1.013395477447893, + "learning_rate": 1.2491132894622676e-05, + "loss": 0.594, + "step": 3564 + }, + { + "epoch": 0.43742331288343556, + "grad_norm": 1.2129445846952362, + "learning_rate": 1.2487283802479389e-05, + "loss": 0.6204, + "step": 3565 + }, + { + "epoch": 0.43754601226993867, + "grad_norm": 0.8945814087384606, + "learning_rate": 1.2483434317491008e-05, + "loss": 0.6629, + "step": 3566 + }, + { + "epoch": 0.4376687116564417, + "grad_norm": 0.8414626504702082, + "learning_rate": 1.2479584440265531e-05, + "loss": 0.6182, + "step": 3567 + }, + { + "epoch": 0.4377914110429448, + "grad_norm": 1.2017008376368121, + "learning_rate": 1.2475734171411004e-05, + "loss": 0.5987, + "step": 3568 + }, + { + "epoch": 0.43791411042944783, + "grad_norm": 0.8943016032206559, + "learning_rate": 1.2471883511535552e-05, + "loss": 0.5897, + "step": 3569 + }, + { + "epoch": 0.43803680981595094, + "grad_norm": 0.9511490072486437, + "learning_rate": 1.246803246124735e-05, + "loss": 0.5765, + "step": 3570 + }, + { + "epoch": 0.438159509202454, + "grad_norm": 0.946249110105841, + "learning_rate": 1.2464181021154636e-05, + "loss": 0.5681, + "step": 3571 + }, + { + "epoch": 0.43828220858895706, + "grad_norm": 0.9988291930141285, + "learning_rate": 1.2460329191865716e-05, + "loss": 0.5753, + "step": 3572 + }, + { + "epoch": 0.4384049079754601, + "grad_norm": 0.9867074358193808, + "learning_rate": 1.2456476973988954e-05, + "loss": 0.6123, + "step": 3573 + }, + { + "epoch": 0.43852760736196317, + "grad_norm": 0.7804737516351337, + "learning_rate": 1.2452624368132771e-05, + "loss": 0.5668, + "step": 3574 + }, + { + "epoch": 0.4386503067484663, + "grad_norm": 0.8730633506513754, + "learning_rate": 1.2448771374905655e-05, + "loss": 0.5612, + "step": 3575 + }, + { + "epoch": 0.43877300613496933, + "grad_norm": 0.8963795397890196, + "learning_rate": 1.2444917994916153e-05, + "loss": 0.6257, + "step": 3576 + }, + { + "epoch": 0.4388957055214724, + "grad_norm": 0.9270814536927682, + "learning_rate": 1.2441064228772874e-05, + "loss": 0.5946, + "step": 3577 + }, + { + "epoch": 0.43901840490797545, + "grad_norm": 0.7673364583971505, + "learning_rate": 1.2437210077084485e-05, + "loss": 0.5775, + "step": 3578 + }, + { + "epoch": 
0.4391411042944785, + "grad_norm": 0.8737528486552436, + "learning_rate": 1.2433355540459716e-05, + "loss": 0.6021, + "step": 3579 + }, + { + "epoch": 0.4392638036809816, + "grad_norm": 0.8510433027362603, + "learning_rate": 1.2429500619507362e-05, + "loss": 0.6438, + "step": 3580 + }, + { + "epoch": 0.43938650306748467, + "grad_norm": 0.9917042643929208, + "learning_rate": 1.242564531483627e-05, + "loss": 0.6056, + "step": 3581 + }, + { + "epoch": 0.4395092024539877, + "grad_norm": 1.150367704555366, + "learning_rate": 1.2421789627055357e-05, + "loss": 0.5756, + "step": 3582 + }, + { + "epoch": 0.4396319018404908, + "grad_norm": 0.9734441788931395, + "learning_rate": 1.241793355677359e-05, + "loss": 0.6209, + "step": 3583 + }, + { + "epoch": 0.4397546012269939, + "grad_norm": 0.9800902510496183, + "learning_rate": 1.2414077104600004e-05, + "loss": 0.5878, + "step": 3584 + }, + { + "epoch": 0.43987730061349695, + "grad_norm": 0.8593151946995065, + "learning_rate": 1.2410220271143693e-05, + "loss": 0.5955, + "step": 3585 + }, + { + "epoch": 0.44, + "grad_norm": 1.0647104900102893, + "learning_rate": 1.2406363057013817e-05, + "loss": 0.5579, + "step": 3586 + }, + { + "epoch": 0.44012269938650306, + "grad_norm": 0.9404029120272712, + "learning_rate": 1.240250546281958e-05, + "loss": 0.5818, + "step": 3587 + }, + { + "epoch": 0.4402453987730061, + "grad_norm": 0.8322364808081433, + "learning_rate": 1.2398647489170262e-05, + "loss": 0.5926, + "step": 3588 + }, + { + "epoch": 0.4403680981595092, + "grad_norm": 1.1999344465384933, + "learning_rate": 1.2394789136675192e-05, + "loss": 0.6162, + "step": 3589 + }, + { + "epoch": 0.4404907975460123, + "grad_norm": 0.9317737994798876, + "learning_rate": 1.2390930405943766e-05, + "loss": 0.5694, + "step": 3590 + }, + { + "epoch": 0.44061349693251534, + "grad_norm": 0.8356008300062585, + "learning_rate": 1.238707129758544e-05, + "loss": 0.564, + "step": 3591 + }, + { + "epoch": 0.4407361963190184, + "grad_norm": 0.7851176318369959, + "learning_rate": 1.2383211812209724e-05, + "loss": 0.6241, + "step": 3592 + }, + { + "epoch": 0.44085889570552145, + "grad_norm": 0.9380639407571633, + "learning_rate": 1.2379351950426188e-05, + "loss": 0.5574, + "step": 3593 + }, + { + "epoch": 0.44098159509202456, + "grad_norm": 1.0087192777087581, + "learning_rate": 1.2375491712844472e-05, + "loss": 0.5795, + "step": 3594 + }, + { + "epoch": 0.4411042944785276, + "grad_norm": 0.9798806700474557, + "learning_rate": 1.237163110007426e-05, + "loss": 0.5978, + "step": 3595 + }, + { + "epoch": 0.44122699386503067, + "grad_norm": 0.9562017664040481, + "learning_rate": 1.2367770112725303e-05, + "loss": 0.6059, + "step": 3596 + }, + { + "epoch": 0.4413496932515337, + "grad_norm": 0.9660731027406079, + "learning_rate": 1.2363908751407414e-05, + "loss": 0.6116, + "step": 3597 + }, + { + "epoch": 0.44147239263803684, + "grad_norm": 1.0001634897081928, + "learning_rate": 1.2360047016730465e-05, + "loss": 0.611, + "step": 3598 + }, + { + "epoch": 0.4415950920245399, + "grad_norm": 0.8675558537579449, + "learning_rate": 1.2356184909304373e-05, + "loss": 0.5562, + "step": 3599 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 0.9639685271070445, + "learning_rate": 1.2352322429739134e-05, + "loss": 0.5986, + "step": 3600 + }, + { + "epoch": 0.441840490797546, + "grad_norm": 0.9317272712517459, + "learning_rate": 1.234845957864479e-05, + "loss": 0.5669, + "step": 3601 + }, + { + "epoch": 0.44196319018404906, + "grad_norm": 1.0460529428549203, + "learning_rate": 
1.2344596356631446e-05, + "loss": 0.6262, + "step": 3602 + }, + { + "epoch": 0.44208588957055217, + "grad_norm": 0.900389914625597, + "learning_rate": 1.2340732764309265e-05, + "loss": 0.5925, + "step": 3603 + }, + { + "epoch": 0.4422085889570552, + "grad_norm": 1.0849203771705331, + "learning_rate": 1.2336868802288467e-05, + "loss": 0.621, + "step": 3604 + }, + { + "epoch": 0.4423312883435583, + "grad_norm": 0.8678032482511632, + "learning_rate": 1.233300447117933e-05, + "loss": 0.5816, + "step": 3605 + }, + { + "epoch": 0.44245398773006134, + "grad_norm": 1.061727054407106, + "learning_rate": 1.2329139771592195e-05, + "loss": 0.5771, + "step": 3606 + }, + { + "epoch": 0.4425766871165644, + "grad_norm": 0.9437318779048302, + "learning_rate": 1.2325274704137462e-05, + "loss": 0.5631, + "step": 3607 + }, + { + "epoch": 0.4426993865030675, + "grad_norm": 0.9606477164327561, + "learning_rate": 1.2321409269425575e-05, + "loss": 0.6218, + "step": 3608 + }, + { + "epoch": 0.44282208588957056, + "grad_norm": 0.9332989277767079, + "learning_rate": 1.2317543468067052e-05, + "loss": 0.6058, + "step": 3609 + }, + { + "epoch": 0.4429447852760736, + "grad_norm": 1.0660460083550007, + "learning_rate": 1.2313677300672463e-05, + "loss": 0.5648, + "step": 3610 + }, + { + "epoch": 0.44306748466257667, + "grad_norm": 0.9839005970030339, + "learning_rate": 1.2309810767852435e-05, + "loss": 0.5532, + "step": 3611 + }, + { + "epoch": 0.4431901840490797, + "grad_norm": 0.9543086923713346, + "learning_rate": 1.2305943870217653e-05, + "loss": 0.5752, + "step": 3612 + }, + { + "epoch": 0.44331288343558284, + "grad_norm": 1.095335036937746, + "learning_rate": 1.2302076608378863e-05, + "loss": 0.5425, + "step": 3613 + }, + { + "epoch": 0.4434355828220859, + "grad_norm": 0.9504766975281279, + "learning_rate": 1.2298208982946862e-05, + "loss": 0.5778, + "step": 3614 + }, + { + "epoch": 0.44355828220858895, + "grad_norm": 1.022039738768194, + "learning_rate": 1.2294340994532511e-05, + "loss": 0.5878, + "step": 3615 + }, + { + "epoch": 0.443680981595092, + "grad_norm": 0.917767453110414, + "learning_rate": 1.229047264374673e-05, + "loss": 0.5645, + "step": 3616 + }, + { + "epoch": 0.4438036809815951, + "grad_norm": 0.992414633245474, + "learning_rate": 1.228660393120048e-05, + "loss": 0.6178, + "step": 3617 + }, + { + "epoch": 0.4439263803680982, + "grad_norm": 1.0185169477452525, + "learning_rate": 1.2282734857504802e-05, + "loss": 0.5465, + "step": 3618 + }, + { + "epoch": 0.44404907975460123, + "grad_norm": 0.9879334242409809, + "learning_rate": 1.2278865423270774e-05, + "loss": 0.545, + "step": 3619 + }, + { + "epoch": 0.4441717791411043, + "grad_norm": 0.9033718415250265, + "learning_rate": 1.2274995629109545e-05, + "loss": 0.5495, + "step": 3620 + }, + { + "epoch": 0.44429447852760734, + "grad_norm": 0.9643178541718174, + "learning_rate": 1.2271125475632315e-05, + "loss": 0.6095, + "step": 3621 + }, + { + "epoch": 0.44441717791411045, + "grad_norm": 0.8785858676584027, + "learning_rate": 1.2267254963450344e-05, + "loss": 0.6213, + "step": 3622 + }, + { + "epoch": 0.4445398773006135, + "grad_norm": 1.1386143545351668, + "learning_rate": 1.2263384093174939e-05, + "loss": 0.6236, + "step": 3623 + }, + { + "epoch": 0.44466257668711656, + "grad_norm": 1.038848083028382, + "learning_rate": 1.2259512865417478e-05, + "loss": 0.6054, + "step": 3624 + }, + { + "epoch": 0.4447852760736196, + "grad_norm": 1.0114139039122194, + "learning_rate": 1.2255641280789385e-05, + "loss": 0.5555, + "step": 3625 + }, + { + "epoch": 
0.4449079754601227, + "grad_norm": 0.8875473771727187, + "learning_rate": 1.2251769339902143e-05, + "loss": 0.5812, + "step": 3626 + }, + { + "epoch": 0.4450306748466258, + "grad_norm": 0.8571980974052255, + "learning_rate": 1.2247897043367294e-05, + "loss": 0.5966, + "step": 3627 + }, + { + "epoch": 0.44515337423312884, + "grad_norm": 0.8688389114801727, + "learning_rate": 1.2244024391796432e-05, + "loss": 0.5979, + "step": 3628 + }, + { + "epoch": 0.4452760736196319, + "grad_norm": 0.8099940391416596, + "learning_rate": 1.224015138580121e-05, + "loss": 0.5967, + "step": 3629 + }, + { + "epoch": 0.44539877300613495, + "grad_norm": 1.0587955405297402, + "learning_rate": 1.2236278025993334e-05, + "loss": 0.5924, + "step": 3630 + }, + { + "epoch": 0.44552147239263806, + "grad_norm": 0.8421105242014529, + "learning_rate": 1.2232404312984569e-05, + "loss": 0.6395, + "step": 3631 + }, + { + "epoch": 0.4456441717791411, + "grad_norm": 0.8454551709546102, + "learning_rate": 1.2228530247386737e-05, + "loss": 0.6334, + "step": 3632 + }, + { + "epoch": 0.4457668711656442, + "grad_norm": 0.9509782483480439, + "learning_rate": 1.2224655829811709e-05, + "loss": 0.6133, + "step": 3633 + }, + { + "epoch": 0.44588957055214723, + "grad_norm": 1.1014272885436176, + "learning_rate": 1.2220781060871421e-05, + "loss": 0.5605, + "step": 3634 + }, + { + "epoch": 0.4460122699386503, + "grad_norm": 0.9518503370794954, + "learning_rate": 1.2216905941177854e-05, + "loss": 0.6391, + "step": 3635 + }, + { + "epoch": 0.4461349693251534, + "grad_norm": 0.9695525114262185, + "learning_rate": 1.2213030471343054e-05, + "loss": 0.6497, + "step": 3636 + }, + { + "epoch": 0.44625766871165645, + "grad_norm": 0.86294399678968, + "learning_rate": 1.2209154651979117e-05, + "loss": 0.5806, + "step": 3637 + }, + { + "epoch": 0.4463803680981595, + "grad_norm": 0.86756454411281, + "learning_rate": 1.22052784836982e-05, + "loss": 0.5454, + "step": 3638 + }, + { + "epoch": 0.44650306748466256, + "grad_norm": 0.8478918914247974, + "learning_rate": 1.2201401967112501e-05, + "loss": 0.5646, + "step": 3639 + }, + { + "epoch": 0.4466257668711656, + "grad_norm": 0.9013656430519247, + "learning_rate": 1.2197525102834284e-05, + "loss": 0.6138, + "step": 3640 + }, + { + "epoch": 0.44674846625766873, + "grad_norm": 0.8465938396130203, + "learning_rate": 1.2193647891475873e-05, + "loss": 0.5734, + "step": 3641 + }, + { + "epoch": 0.4468711656441718, + "grad_norm": 0.8918286364809727, + "learning_rate": 1.2189770333649635e-05, + "loss": 0.5884, + "step": 3642 + }, + { + "epoch": 0.44699386503067484, + "grad_norm": 0.8758196371766438, + "learning_rate": 1.2185892429968001e-05, + "loss": 0.5563, + "step": 3643 + }, + { + "epoch": 0.4471165644171779, + "grad_norm": 1.0764751639703252, + "learning_rate": 1.218201418104345e-05, + "loss": 0.5809, + "step": 3644 + }, + { + "epoch": 0.447239263803681, + "grad_norm": 0.8545772498444744, + "learning_rate": 1.2178135587488515e-05, + "loss": 0.5792, + "step": 3645 + }, + { + "epoch": 0.44736196319018406, + "grad_norm": 0.982553227586846, + "learning_rate": 1.2174256649915793e-05, + "loss": 0.6156, + "step": 3646 + }, + { + "epoch": 0.4474846625766871, + "grad_norm": 0.852869174827509, + "learning_rate": 1.2170377368937926e-05, + "loss": 0.6383, + "step": 3647 + }, + { + "epoch": 0.4476073619631902, + "grad_norm": 0.8465889879443366, + "learning_rate": 1.2166497745167611e-05, + "loss": 0.5944, + "step": 3648 + }, + { + "epoch": 0.44773006134969323, + "grad_norm": 0.9792156373079445, + "learning_rate": 
1.2162617779217603e-05, + "loss": 0.559, + "step": 3649 + }, + { + "epoch": 0.44785276073619634, + "grad_norm": 0.9171252022950123, + "learning_rate": 1.215873747170071e-05, + "loss": 0.5884, + "step": 3650 + }, + { + "epoch": 0.4479754601226994, + "grad_norm": 1.1043441096058493, + "learning_rate": 1.215485682322979e-05, + "loss": 0.6008, + "step": 3651 + }, + { + "epoch": 0.44809815950920245, + "grad_norm": 0.8280907800858185, + "learning_rate": 1.2150975834417761e-05, + "loss": 0.5475, + "step": 3652 + }, + { + "epoch": 0.4482208588957055, + "grad_norm": 0.8883118071783567, + "learning_rate": 1.2147094505877593e-05, + "loss": 0.524, + "step": 3653 + }, + { + "epoch": 0.44834355828220857, + "grad_norm": 0.9286779880386369, + "learning_rate": 1.2143212838222304e-05, + "loss": 0.5934, + "step": 3654 + }, + { + "epoch": 0.4484662576687117, + "grad_norm": 0.9172618659393269, + "learning_rate": 1.2139330832064975e-05, + "loss": 0.6062, + "step": 3655 + }, + { + "epoch": 0.44858895705521473, + "grad_norm": 0.9147867471415773, + "learning_rate": 1.2135448488018734e-05, + "loss": 0.5646, + "step": 3656 + }, + { + "epoch": 0.4487116564417178, + "grad_norm": 0.9443309633240163, + "learning_rate": 1.213156580669676e-05, + "loss": 0.5872, + "step": 3657 + }, + { + "epoch": 0.44883435582822084, + "grad_norm": 0.9421614283340239, + "learning_rate": 1.2127682788712296e-05, + "loss": 0.564, + "step": 3658 + }, + { + "epoch": 0.4489570552147239, + "grad_norm": 0.9129708901453906, + "learning_rate": 1.2123799434678624e-05, + "loss": 0.5457, + "step": 3659 + }, + { + "epoch": 0.449079754601227, + "grad_norm": 0.9181824260370373, + "learning_rate": 1.2119915745209092e-05, + "loss": 0.5814, + "step": 3660 + }, + { + "epoch": 0.44920245398773007, + "grad_norm": 0.9945444932876988, + "learning_rate": 1.2116031720917094e-05, + "loss": 0.646, + "step": 3661 + }, + { + "epoch": 0.4493251533742331, + "grad_norm": 0.9084433757922569, + "learning_rate": 1.2112147362416076e-05, + "loss": 0.6259, + "step": 3662 + }, + { + "epoch": 0.4494478527607362, + "grad_norm": 0.9646627242934306, + "learning_rate": 1.210826267031954e-05, + "loss": 0.5987, + "step": 3663 + }, + { + "epoch": 0.4495705521472393, + "grad_norm": 0.9738041136767135, + "learning_rate": 1.2104377645241042e-05, + "loss": 0.5448, + "step": 3664 + }, + { + "epoch": 0.44969325153374234, + "grad_norm": 0.9357689529896404, + "learning_rate": 1.2100492287794186e-05, + "loss": 0.5581, + "step": 3665 + }, + { + "epoch": 0.4498159509202454, + "grad_norm": 0.903509871380233, + "learning_rate": 1.209660659859263e-05, + "loss": 0.585, + "step": 3666 + }, + { + "epoch": 0.44993865030674846, + "grad_norm": 0.9133914461545127, + "learning_rate": 1.2092720578250088e-05, + "loss": 0.6264, + "step": 3667 + }, + { + "epoch": 0.4500613496932515, + "grad_norm": 0.9738774490882507, + "learning_rate": 1.2088834227380321e-05, + "loss": 0.6098, + "step": 3668 + }, + { + "epoch": 0.4501840490797546, + "grad_norm": 0.9770983787716808, + "learning_rate": 1.2084947546597148e-05, + "loss": 0.5444, + "step": 3669 + }, + { + "epoch": 0.4503067484662577, + "grad_norm": 0.9538139097784304, + "learning_rate": 1.2081060536514432e-05, + "loss": 0.5587, + "step": 3670 + }, + { + "epoch": 0.45042944785276073, + "grad_norm": 0.8980684512568353, + "learning_rate": 1.2077173197746097e-05, + "loss": 0.551, + "step": 3671 + }, + { + "epoch": 0.4505521472392638, + "grad_norm": 0.8760587241377842, + "learning_rate": 1.2073285530906112e-05, + "loss": 0.6275, + "step": 3672 + }, + { + "epoch": 
0.45067484662576685, + "grad_norm": 0.9811250746508464, + "learning_rate": 1.2069397536608503e-05, + "loss": 0.5765, + "step": 3673 + }, + { + "epoch": 0.45079754601226996, + "grad_norm": 1.0176394673761822, + "learning_rate": 1.2065509215467345e-05, + "loss": 0.5844, + "step": 3674 + }, + { + "epoch": 0.450920245398773, + "grad_norm": 1.0262550655234108, + "learning_rate": 1.206162056809676e-05, + "loss": 0.5916, + "step": 3675 + }, + { + "epoch": 0.45104294478527607, + "grad_norm": 0.9188449273314545, + "learning_rate": 1.2057731595110933e-05, + "loss": 0.5816, + "step": 3676 + }, + { + "epoch": 0.4511656441717791, + "grad_norm": 1.0730314609767793, + "learning_rate": 1.2053842297124094e-05, + "loss": 0.5656, + "step": 3677 + }, + { + "epoch": 0.45128834355828223, + "grad_norm": 0.965487699902638, + "learning_rate": 1.2049952674750519e-05, + "loss": 0.5824, + "step": 3678 + }, + { + "epoch": 0.4514110429447853, + "grad_norm": 0.8715773938926656, + "learning_rate": 1.204606272860454e-05, + "loss": 0.561, + "step": 3679 + }, + { + "epoch": 0.45153374233128835, + "grad_norm": 0.9280501332388834, + "learning_rate": 1.2042172459300546e-05, + "loss": 0.6477, + "step": 3680 + }, + { + "epoch": 0.4516564417177914, + "grad_norm": 0.8554587152522213, + "learning_rate": 1.2038281867452968e-05, + "loss": 0.6205, + "step": 3681 + }, + { + "epoch": 0.45177914110429446, + "grad_norm": 1.1176517927045497, + "learning_rate": 1.2034390953676291e-05, + "loss": 0.6049, + "step": 3682 + }, + { + "epoch": 0.45190184049079757, + "grad_norm": 0.9320169846599496, + "learning_rate": 1.2030499718585054e-05, + "loss": 0.6295, + "step": 3683 + }, + { + "epoch": 0.4520245398773006, + "grad_norm": 0.8447569382450422, + "learning_rate": 1.2026608162793845e-05, + "loss": 0.5969, + "step": 3684 + }, + { + "epoch": 0.4521472392638037, + "grad_norm": 0.9537371368707911, + "learning_rate": 1.2022716286917298e-05, + "loss": 0.541, + "step": 3685 + }, + { + "epoch": 0.45226993865030674, + "grad_norm": 0.8660195449567284, + "learning_rate": 1.2018824091570103e-05, + "loss": 0.606, + "step": 3686 + }, + { + "epoch": 0.4523926380368098, + "grad_norm": 0.8712092933849288, + "learning_rate": 1.2014931577367002e-05, + "loss": 0.601, + "step": 3687 + }, + { + "epoch": 0.4525153374233129, + "grad_norm": 1.025206015548181, + "learning_rate": 1.2011038744922778e-05, + "loss": 0.6101, + "step": 3688 + }, + { + "epoch": 0.45263803680981596, + "grad_norm": 0.9674671476404796, + "learning_rate": 1.2007145594852274e-05, + "loss": 0.5709, + "step": 3689 + }, + { + "epoch": 0.452760736196319, + "grad_norm": 0.9136180733892025, + "learning_rate": 1.2003252127770378e-05, + "loss": 0.5659, + "step": 3690 + }, + { + "epoch": 0.45288343558282207, + "grad_norm": 0.916172117895324, + "learning_rate": 1.1999358344292031e-05, + "loss": 0.6046, + "step": 3691 + }, + { + "epoch": 0.4530061349693252, + "grad_norm": 0.9485183728538772, + "learning_rate": 1.1995464245032222e-05, + "loss": 0.5518, + "step": 3692 + }, + { + "epoch": 0.45312883435582824, + "grad_norm": 0.9337188414962395, + "learning_rate": 1.1991569830605994e-05, + "loss": 0.6134, + "step": 3693 + }, + { + "epoch": 0.4532515337423313, + "grad_norm": 0.9548246047549365, + "learning_rate": 1.1987675101628428e-05, + "loss": 0.5825, + "step": 3694 + }, + { + "epoch": 0.45337423312883435, + "grad_norm": 0.8967308719842554, + "learning_rate": 1.198378005871467e-05, + "loss": 0.5952, + "step": 3695 + }, + { + "epoch": 0.4534969325153374, + "grad_norm": 0.910896627818554, + "learning_rate": 
1.1979884702479909e-05, + "loss": 0.5952, + "step": 3696 + }, + { + "epoch": 0.4536196319018405, + "grad_norm": 1.081001950635114, + "learning_rate": 1.197598903353938e-05, + "loss": 0.6195, + "step": 3697 + }, + { + "epoch": 0.45374233128834357, + "grad_norm": 0.9181438226616644, + "learning_rate": 1.197209305250837e-05, + "loss": 0.6119, + "step": 3698 + }, + { + "epoch": 0.4538650306748466, + "grad_norm": 1.0176892406068723, + "learning_rate": 1.1968196760002215e-05, + "loss": 0.592, + "step": 3699 + }, + { + "epoch": 0.4539877300613497, + "grad_norm": 0.9512280699773084, + "learning_rate": 1.1964300156636304e-05, + "loss": 0.5627, + "step": 3700 + }, + { + "epoch": 0.45411042944785274, + "grad_norm": 0.8454363498275624, + "learning_rate": 1.1960403243026072e-05, + "loss": 0.5543, + "step": 3701 + }, + { + "epoch": 0.45423312883435585, + "grad_norm": 0.8460789795292218, + "learning_rate": 1.1956506019787e-05, + "loss": 0.5899, + "step": 3702 + }, + { + "epoch": 0.4543558282208589, + "grad_norm": 0.879860717921475, + "learning_rate": 1.1952608487534622e-05, + "loss": 0.5864, + "step": 3703 + }, + { + "epoch": 0.45447852760736196, + "grad_norm": 0.9804429431835878, + "learning_rate": 1.1948710646884522e-05, + "loss": 0.5687, + "step": 3704 + }, + { + "epoch": 0.454601226993865, + "grad_norm": 0.9176195413788283, + "learning_rate": 1.1944812498452329e-05, + "loss": 0.6123, + "step": 3705 + }, + { + "epoch": 0.45472392638036807, + "grad_norm": 0.9986690091517327, + "learning_rate": 1.1940914042853719e-05, + "loss": 0.5858, + "step": 3706 + }, + { + "epoch": 0.4548466257668712, + "grad_norm": 0.8414929520198099, + "learning_rate": 1.1937015280704425e-05, + "loss": 0.5938, + "step": 3707 + }, + { + "epoch": 0.45496932515337424, + "grad_norm": 0.8600934530442891, + "learning_rate": 1.1933116212620221e-05, + "loss": 0.6018, + "step": 3708 + }, + { + "epoch": 0.4550920245398773, + "grad_norm": 0.8829177415773634, + "learning_rate": 1.1929216839216928e-05, + "loss": 0.5705, + "step": 3709 + }, + { + "epoch": 0.45521472392638035, + "grad_norm": 0.842994566598097, + "learning_rate": 1.192531716111042e-05, + "loss": 0.6162, + "step": 3710 + }, + { + "epoch": 0.45533742331288346, + "grad_norm": 0.8597526819918724, + "learning_rate": 1.192141717891662e-05, + "loss": 0.5741, + "step": 3711 + }, + { + "epoch": 0.4554601226993865, + "grad_norm": 0.9582607453643659, + "learning_rate": 1.1917516893251498e-05, + "loss": 0.5777, + "step": 3712 + }, + { + "epoch": 0.45558282208588957, + "grad_norm": 1.0382013643701151, + "learning_rate": 1.1913616304731064e-05, + "loss": 0.5856, + "step": 3713 + }, + { + "epoch": 0.4557055214723926, + "grad_norm": 0.8725115046205562, + "learning_rate": 1.190971541397139e-05, + "loss": 0.6184, + "step": 3714 + }, + { + "epoch": 0.4558282208588957, + "grad_norm": 1.021478139886246, + "learning_rate": 1.1905814221588581e-05, + "loss": 0.5712, + "step": 3715 + }, + { + "epoch": 0.4559509202453988, + "grad_norm": 0.9768177965116286, + "learning_rate": 1.1901912728198802e-05, + "loss": 0.5907, + "step": 3716 + }, + { + "epoch": 0.45607361963190185, + "grad_norm": 0.9633698291822471, + "learning_rate": 1.1898010934418261e-05, + "loss": 0.6274, + "step": 3717 + }, + { + "epoch": 0.4561963190184049, + "grad_norm": 0.9086534708114722, + "learning_rate": 1.1894108840863207e-05, + "loss": 0.5774, + "step": 3718 + }, + { + "epoch": 0.45631901840490796, + "grad_norm": 0.9181512895381228, + "learning_rate": 1.1890206448149945e-05, + "loss": 0.5496, + "step": 3719 + }, + { + "epoch": 
0.456441717791411, + "grad_norm": 0.9320312849284779, + "learning_rate": 1.1886303756894828e-05, + "loss": 0.5635, + "step": 3720 + }, + { + "epoch": 0.45656441717791413, + "grad_norm": 0.9154294515472196, + "learning_rate": 1.1882400767714246e-05, + "loss": 0.6786, + "step": 3721 + }, + { + "epoch": 0.4566871165644172, + "grad_norm": 0.9656343060817534, + "learning_rate": 1.1878497481224649e-05, + "loss": 0.5603, + "step": 3722 + }, + { + "epoch": 0.45680981595092024, + "grad_norm": 1.0083744746370087, + "learning_rate": 1.187459389804252e-05, + "loss": 0.6339, + "step": 3723 + }, + { + "epoch": 0.4569325153374233, + "grad_norm": 0.9189858473519342, + "learning_rate": 1.1870690018784405e-05, + "loss": 0.6024, + "step": 3724 + }, + { + "epoch": 0.4570552147239264, + "grad_norm": 0.993924367736248, + "learning_rate": 1.1866785844066884e-05, + "loss": 0.5906, + "step": 3725 + }, + { + "epoch": 0.45717791411042946, + "grad_norm": 1.05429559437262, + "learning_rate": 1.1862881374506586e-05, + "loss": 0.6145, + "step": 3726 + }, + { + "epoch": 0.4573006134969325, + "grad_norm": 0.9663390379471323, + "learning_rate": 1.1858976610720195e-05, + "loss": 0.6466, + "step": 3727 + }, + { + "epoch": 0.4574233128834356, + "grad_norm": 0.90651662218955, + "learning_rate": 1.1855071553324427e-05, + "loss": 0.6368, + "step": 3728 + }, + { + "epoch": 0.45754601226993863, + "grad_norm": 0.8686548747803555, + "learning_rate": 1.1851166202936053e-05, + "loss": 0.551, + "step": 3729 + }, + { + "epoch": 0.45766871165644174, + "grad_norm": 0.9296201724770923, + "learning_rate": 1.1847260560171895e-05, + "loss": 0.5701, + "step": 3730 + }, + { + "epoch": 0.4577914110429448, + "grad_norm": 0.9962040447077993, + "learning_rate": 1.1843354625648813e-05, + "loss": 0.604, + "step": 3731 + }, + { + "epoch": 0.45791411042944785, + "grad_norm": 0.8560893437494889, + "learning_rate": 1.1839448399983713e-05, + "loss": 0.6624, + "step": 3732 + }, + { + "epoch": 0.4580368098159509, + "grad_norm": 0.8900885664401443, + "learning_rate": 1.1835541883793553e-05, + "loss": 0.5485, + "step": 3733 + }, + { + "epoch": 0.45815950920245396, + "grad_norm": 0.8762784019553184, + "learning_rate": 1.1831635077695337e-05, + "loss": 0.6125, + "step": 3734 + }, + { + "epoch": 0.4582822085889571, + "grad_norm": 0.8430011963461175, + "learning_rate": 1.18277279823061e-05, + "loss": 0.6428, + "step": 3735 + }, + { + "epoch": 0.45840490797546013, + "grad_norm": 0.9820051046646765, + "learning_rate": 1.1823820598242948e-05, + "loss": 0.576, + "step": 3736 + }, + { + "epoch": 0.4585276073619632, + "grad_norm": 1.0541121108825784, + "learning_rate": 1.181991292612301e-05, + "loss": 0.5974, + "step": 3737 + }, + { + "epoch": 0.45865030674846624, + "grad_norm": 1.0419978804647696, + "learning_rate": 1.181600496656347e-05, + "loss": 0.5818, + "step": 3738 + }, + { + "epoch": 0.45877300613496935, + "grad_norm": 0.9289029960282538, + "learning_rate": 1.1812096720181558e-05, + "loss": 0.5697, + "step": 3739 + }, + { + "epoch": 0.4588957055214724, + "grad_norm": 0.9931880464034615, + "learning_rate": 1.1808188187594549e-05, + "loss": 0.5798, + "step": 3740 + }, + { + "epoch": 0.45901840490797546, + "grad_norm": 0.8110061809799244, + "learning_rate": 1.1804279369419761e-05, + "loss": 0.5497, + "step": 3741 + }, + { + "epoch": 0.4591411042944785, + "grad_norm": 0.9622733366524411, + "learning_rate": 1.1800370266274559e-05, + "loss": 0.6117, + "step": 3742 + }, + { + "epoch": 0.4592638036809816, + "grad_norm": 1.0384870921578473, + "learning_rate": 
1.179646087877635e-05, + "loss": 0.6219, + "step": 3743 + }, + { + "epoch": 0.4593865030674847, + "grad_norm": 0.8289914612160516, + "learning_rate": 1.179255120754259e-05, + "loss": 0.5656, + "step": 3744 + }, + { + "epoch": 0.45950920245398774, + "grad_norm": 0.9602008436469969, + "learning_rate": 1.1788641253190779e-05, + "loss": 0.641, + "step": 3745 + }, + { + "epoch": 0.4596319018404908, + "grad_norm": 0.9349909505846029, + "learning_rate": 1.1784731016338458e-05, + "loss": 0.611, + "step": 3746 + }, + { + "epoch": 0.45975460122699385, + "grad_norm": 1.027928268927123, + "learning_rate": 1.1780820497603215e-05, + "loss": 0.5994, + "step": 3747 + }, + { + "epoch": 0.4598773006134969, + "grad_norm": 1.3944694343225836, + "learning_rate": 1.177690969760269e-05, + "loss": 0.5426, + "step": 3748 + }, + { + "epoch": 0.46, + "grad_norm": 0.9612352534292622, + "learning_rate": 1.177299861695455e-05, + "loss": 0.5876, + "step": 3749 + }, + { + "epoch": 0.4601226993865031, + "grad_norm": 0.8480833876863497, + "learning_rate": 1.176908725627652e-05, + "loss": 0.6182, + "step": 3750 + }, + { + "epoch": 0.46024539877300613, + "grad_norm": 0.8782974532589257, + "learning_rate": 1.1765175616186368e-05, + "loss": 0.5672, + "step": 3751 + }, + { + "epoch": 0.4603680981595092, + "grad_norm": 1.0073779245003347, + "learning_rate": 1.1761263697301906e-05, + "loss": 0.5884, + "step": 3752 + }, + { + "epoch": 0.46049079754601224, + "grad_norm": 1.181626754425284, + "learning_rate": 1.1757351500240982e-05, + "loss": 0.5977, + "step": 3753 + }, + { + "epoch": 0.46061349693251535, + "grad_norm": 0.8529760207265654, + "learning_rate": 1.1753439025621496e-05, + "loss": 0.6136, + "step": 3754 + }, + { + "epoch": 0.4607361963190184, + "grad_norm": 0.9354802653496334, + "learning_rate": 1.1749526274061394e-05, + "loss": 0.6076, + "step": 3755 + }, + { + "epoch": 0.46085889570552147, + "grad_norm": 1.072222591460691, + "learning_rate": 1.1745613246178653e-05, + "loss": 0.6108, + "step": 3756 + }, + { + "epoch": 0.4609815950920245, + "grad_norm": 0.8955599723109778, + "learning_rate": 1.1741699942591313e-05, + "loss": 0.6024, + "step": 3757 + }, + { + "epoch": 0.46110429447852763, + "grad_norm": 0.9247311094902223, + "learning_rate": 1.1737786363917438e-05, + "loss": 0.6059, + "step": 3758 + }, + { + "epoch": 0.4612269938650307, + "grad_norm": 1.139130131260964, + "learning_rate": 1.1733872510775146e-05, + "loss": 0.6231, + "step": 3759 + }, + { + "epoch": 0.46134969325153374, + "grad_norm": 0.8529638632889777, + "learning_rate": 1.1729958383782598e-05, + "loss": 0.6113, + "step": 3760 + }, + { + "epoch": 0.4614723926380368, + "grad_norm": 0.9373361061025909, + "learning_rate": 1.1726043983557996e-05, + "loss": 0.6196, + "step": 3761 + }, + { + "epoch": 0.46159509202453985, + "grad_norm": 0.8597317157534974, + "learning_rate": 1.1722129310719585e-05, + "loss": 0.5679, + "step": 3762 + }, + { + "epoch": 0.46171779141104297, + "grad_norm": 0.8713346436778994, + "learning_rate": 1.1718214365885657e-05, + "loss": 0.5693, + "step": 3763 + }, + { + "epoch": 0.461840490797546, + "grad_norm": 0.9028847592208915, + "learning_rate": 1.1714299149674538e-05, + "loss": 0.5883, + "step": 3764 + }, + { + "epoch": 0.4619631901840491, + "grad_norm": 0.9085205376351863, + "learning_rate": 1.1710383662704608e-05, + "loss": 0.5646, + "step": 3765 + }, + { + "epoch": 0.46208588957055213, + "grad_norm": 0.81870816748455, + "learning_rate": 1.1706467905594285e-05, + "loss": 0.6151, + "step": 3766 + }, + { + "epoch": 0.4622085889570552, + 
"grad_norm": 1.0474675449284354, + "learning_rate": 1.1702551878962025e-05, + "loss": 0.6034, + "step": 3767 + }, + { + "epoch": 0.4623312883435583, + "grad_norm": 0.8785338346465108, + "learning_rate": 1.1698635583426334e-05, + "loss": 0.5499, + "step": 3768 + }, + { + "epoch": 0.46245398773006136, + "grad_norm": 0.9457349721930322, + "learning_rate": 1.1694719019605754e-05, + "loss": 0.6252, + "step": 3769 + }, + { + "epoch": 0.4625766871165644, + "grad_norm": 0.8715578002719956, + "learning_rate": 1.1690802188118878e-05, + "loss": 0.6094, + "step": 3770 + }, + { + "epoch": 0.46269938650306747, + "grad_norm": 0.9011852275922615, + "learning_rate": 1.1686885089584328e-05, + "loss": 0.5348, + "step": 3771 + }, + { + "epoch": 0.4628220858895706, + "grad_norm": 0.8861061477001378, + "learning_rate": 1.1682967724620784e-05, + "loss": 0.5693, + "step": 3772 + }, + { + "epoch": 0.46294478527607363, + "grad_norm": 0.8631382416629706, + "learning_rate": 1.1679050093846956e-05, + "loss": 0.5526, + "step": 3773 + }, + { + "epoch": 0.4630674846625767, + "grad_norm": 1.026098325796776, + "learning_rate": 1.16751321978816e-05, + "loss": 0.5725, + "step": 3774 + }, + { + "epoch": 0.46319018404907975, + "grad_norm": 0.9684234280028262, + "learning_rate": 1.1671214037343515e-05, + "loss": 0.597, + "step": 3775 + }, + { + "epoch": 0.4633128834355828, + "grad_norm": 0.8796374304811357, + "learning_rate": 1.166729561285154e-05, + "loss": 0.5282, + "step": 3776 + }, + { + "epoch": 0.4634355828220859, + "grad_norm": 0.8330344912744719, + "learning_rate": 1.1663376925024556e-05, + "loss": 0.5808, + "step": 3777 + }, + { + "epoch": 0.46355828220858897, + "grad_norm": 0.9476729356182071, + "learning_rate": 1.165945797448149e-05, + "loss": 0.632, + "step": 3778 + }, + { + "epoch": 0.463680981595092, + "grad_norm": 0.9840524566781953, + "learning_rate": 1.16555387618413e-05, + "loss": 0.5656, + "step": 3779 + }, + { + "epoch": 0.4638036809815951, + "grad_norm": 0.8442871599361491, + "learning_rate": 1.1651619287723e-05, + "loss": 0.6046, + "step": 3780 + }, + { + "epoch": 0.46392638036809813, + "grad_norm": 0.9006746358044507, + "learning_rate": 1.1647699552745628e-05, + "loss": 0.5904, + "step": 3781 + }, + { + "epoch": 0.46404907975460125, + "grad_norm": 0.8766176864407221, + "learning_rate": 1.1643779557528278e-05, + "loss": 0.6543, + "step": 3782 + }, + { + "epoch": 0.4641717791411043, + "grad_norm": 0.9278639695759705, + "learning_rate": 1.1639859302690081e-05, + "loss": 0.5816, + "step": 3783 + }, + { + "epoch": 0.46429447852760736, + "grad_norm": 0.8751334564685198, + "learning_rate": 1.1635938788850205e-05, + "loss": 0.6089, + "step": 3784 + }, + { + "epoch": 0.4644171779141104, + "grad_norm": 1.1057664189281629, + "learning_rate": 1.1632018016627859e-05, + "loss": 0.6296, + "step": 3785 + }, + { + "epoch": 0.4645398773006135, + "grad_norm": 0.8957369310020011, + "learning_rate": 1.16280969866423e-05, + "loss": 0.6099, + "step": 3786 + }, + { + "epoch": 0.4646625766871166, + "grad_norm": 0.9285769385658053, + "learning_rate": 1.1624175699512821e-05, + "loss": 0.5445, + "step": 3787 + }, + { + "epoch": 0.46478527607361964, + "grad_norm": 0.9244224465077997, + "learning_rate": 1.1620254155858752e-05, + "loss": 0.6086, + "step": 3788 + }, + { + "epoch": 0.4649079754601227, + "grad_norm": 0.9952375906859499, + "learning_rate": 1.161633235629947e-05, + "loss": 0.6319, + "step": 3789 + }, + { + "epoch": 0.46503067484662575, + "grad_norm": 0.9668320134034357, + "learning_rate": 1.1612410301454384e-05, + "loss": 
0.6106, + "step": 3790 + }, + { + "epoch": 0.46515337423312886, + "grad_norm": 0.902316257793982, + "learning_rate": 1.1608487991942956e-05, + "loss": 0.5712, + "step": 3791 + }, + { + "epoch": 0.4652760736196319, + "grad_norm": 1.123969292199706, + "learning_rate": 1.1604565428384675e-05, + "loss": 0.6084, + "step": 3792 + }, + { + "epoch": 0.46539877300613497, + "grad_norm": 1.0556230654096954, + "learning_rate": 1.1600642611399083e-05, + "loss": 0.5921, + "step": 3793 + }, + { + "epoch": 0.465521472392638, + "grad_norm": 0.9734869051423081, + "learning_rate": 1.159671954160575e-05, + "loss": 0.5684, + "step": 3794 + }, + { + "epoch": 0.4656441717791411, + "grad_norm": 0.8154954103098911, + "learning_rate": 1.1592796219624292e-05, + "loss": 0.5771, + "step": 3795 + }, + { + "epoch": 0.4657668711656442, + "grad_norm": 1.0170101253252324, + "learning_rate": 1.1588872646074365e-05, + "loss": 0.5848, + "step": 3796 + }, + { + "epoch": 0.46588957055214725, + "grad_norm": 0.9229835477722635, + "learning_rate": 1.1584948821575666e-05, + "loss": 0.5461, + "step": 3797 + }, + { + "epoch": 0.4660122699386503, + "grad_norm": 0.8362983225320242, + "learning_rate": 1.1581024746747925e-05, + "loss": 0.5661, + "step": 3798 + }, + { + "epoch": 0.46613496932515336, + "grad_norm": 1.2105254835531607, + "learning_rate": 1.1577100422210916e-05, + "loss": 0.603, + "step": 3799 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 0.9276232351773044, + "learning_rate": 1.1573175848584455e-05, + "loss": 0.5698, + "step": 3800 + }, + { + "epoch": 0.4663803680981595, + "grad_norm": 0.9003034366543442, + "learning_rate": 1.1569251026488393e-05, + "loss": 0.6275, + "step": 3801 + }, + { + "epoch": 0.4665030674846626, + "grad_norm": 0.932308552382425, + "learning_rate": 1.1565325956542624e-05, + "loss": 0.6186, + "step": 3802 + }, + { + "epoch": 0.46662576687116564, + "grad_norm": 0.9228521943030055, + "learning_rate": 1.1561400639367075e-05, + "loss": 0.5906, + "step": 3803 + }, + { + "epoch": 0.4667484662576687, + "grad_norm": 1.0070844778702037, + "learning_rate": 1.155747507558172e-05, + "loss": 0.5949, + "step": 3804 + }, + { + "epoch": 0.4668711656441718, + "grad_norm": 1.303563757905625, + "learning_rate": 1.1553549265806567e-05, + "loss": 0.648, + "step": 3805 + }, + { + "epoch": 0.46699386503067486, + "grad_norm": 1.0280582067533561, + "learning_rate": 1.1549623210661663e-05, + "loss": 0.5876, + "step": 3806 + }, + { + "epoch": 0.4671165644171779, + "grad_norm": 0.9370465960480227, + "learning_rate": 1.1545696910767095e-05, + "loss": 0.5602, + "step": 3807 + }, + { + "epoch": 0.46723926380368097, + "grad_norm": 0.8790040969854525, + "learning_rate": 1.1541770366742987e-05, + "loss": 0.5959, + "step": 3808 + }, + { + "epoch": 0.467361963190184, + "grad_norm": 0.8958019833902566, + "learning_rate": 1.1537843579209509e-05, + "loss": 0.5437, + "step": 3809 + }, + { + "epoch": 0.46748466257668714, + "grad_norm": 0.9581352392565538, + "learning_rate": 1.1533916548786856e-05, + "loss": 0.5704, + "step": 3810 + }, + { + "epoch": 0.4676073619631902, + "grad_norm": 1.138095619987388, + "learning_rate": 1.1529989276095271e-05, + "loss": 0.6078, + "step": 3811 + }, + { + "epoch": 0.46773006134969325, + "grad_norm": 0.9831006684445776, + "learning_rate": 1.1526061761755033e-05, + "loss": 0.5889, + "step": 3812 + }, + { + "epoch": 0.4678527607361963, + "grad_norm": 0.8970026208227317, + "learning_rate": 1.1522134006386461e-05, + "loss": 0.5751, + "step": 3813 + }, + { + "epoch": 0.46797546012269936, + "grad_norm": 
0.8416800673332153, + "learning_rate": 1.1518206010609907e-05, + "loss": 0.5639, + "step": 3814 + }, + { + "epoch": 0.46809815950920247, + "grad_norm": 1.0057767835626852, + "learning_rate": 1.1514277775045768e-05, + "loss": 0.6534, + "step": 3815 + }, + { + "epoch": 0.4682208588957055, + "grad_norm": 0.8901199528837996, + "learning_rate": 1.1510349300314472e-05, + "loss": 0.6064, + "step": 3816 + }, + { + "epoch": 0.4683435582822086, + "grad_norm": 0.8048019077637463, + "learning_rate": 1.150642058703649e-05, + "loss": 0.6065, + "step": 3817 + }, + { + "epoch": 0.46846625766871164, + "grad_norm": 0.8503299016580239, + "learning_rate": 1.1502491635832327e-05, + "loss": 0.5908, + "step": 3818 + }, + { + "epoch": 0.46858895705521475, + "grad_norm": 0.942125440603584, + "learning_rate": 1.1498562447322524e-05, + "loss": 0.5532, + "step": 3819 + }, + { + "epoch": 0.4687116564417178, + "grad_norm": 0.9128629842071281, + "learning_rate": 1.1494633022127669e-05, + "loss": 0.5281, + "step": 3820 + }, + { + "epoch": 0.46883435582822086, + "grad_norm": 0.9269233164284786, + "learning_rate": 1.1490703360868373e-05, + "loss": 0.6021, + "step": 3821 + }, + { + "epoch": 0.4689570552147239, + "grad_norm": 0.9060195037152415, + "learning_rate": 1.1486773464165301e-05, + "loss": 0.6075, + "step": 3822 + }, + { + "epoch": 0.469079754601227, + "grad_norm": 0.8206800385732056, + "learning_rate": 1.1482843332639143e-05, + "loss": 0.5675, + "step": 3823 + }, + { + "epoch": 0.4692024539877301, + "grad_norm": 0.8810883255559463, + "learning_rate": 1.1478912966910627e-05, + "loss": 0.5552, + "step": 3824 + }, + { + "epoch": 0.46932515337423314, + "grad_norm": 0.8822107800669117, + "learning_rate": 1.1474982367600524e-05, + "loss": 0.5489, + "step": 3825 + }, + { + "epoch": 0.4694478527607362, + "grad_norm": 0.9905951968075035, + "learning_rate": 1.1471051535329636e-05, + "loss": 0.5956, + "step": 3826 + }, + { + "epoch": 0.46957055214723925, + "grad_norm": 1.0269259137883269, + "learning_rate": 1.1467120470718805e-05, + "loss": 0.604, + "step": 3827 + }, + { + "epoch": 0.4696932515337423, + "grad_norm": 0.92304242998688, + "learning_rate": 1.1463189174388909e-05, + "loss": 0.6286, + "step": 3828 + }, + { + "epoch": 0.4698159509202454, + "grad_norm": 0.839261742755189, + "learning_rate": 1.1459257646960861e-05, + "loss": 0.5631, + "step": 3829 + }, + { + "epoch": 0.4699386503067485, + "grad_norm": 0.9780777312938477, + "learning_rate": 1.1455325889055616e-05, + "loss": 0.634, + "step": 3830 + }, + { + "epoch": 0.47006134969325153, + "grad_norm": 0.8855690712951748, + "learning_rate": 1.1451393901294155e-05, + "loss": 0.5574, + "step": 3831 + }, + { + "epoch": 0.4701840490797546, + "grad_norm": 1.0117752731229352, + "learning_rate": 1.1447461684297505e-05, + "loss": 0.5941, + "step": 3832 + }, + { + "epoch": 0.4703067484662577, + "grad_norm": 0.8993669427674515, + "learning_rate": 1.1443529238686726e-05, + "loss": 0.5574, + "step": 3833 + }, + { + "epoch": 0.47042944785276075, + "grad_norm": 0.9552768082227138, + "learning_rate": 1.1439596565082915e-05, + "loss": 0.5775, + "step": 3834 + }, + { + "epoch": 0.4705521472392638, + "grad_norm": 0.9927983267173566, + "learning_rate": 1.1435663664107204e-05, + "loss": 0.6008, + "step": 3835 + }, + { + "epoch": 0.47067484662576686, + "grad_norm": 1.0672190862171025, + "learning_rate": 1.1431730536380759e-05, + "loss": 0.5689, + "step": 3836 + }, + { + "epoch": 0.4707975460122699, + "grad_norm": 0.8559433666075349, + "learning_rate": 1.1427797182524785e-05, + "loss": 0.5637, 
+ "step": 3837 + }, + { + "epoch": 0.47092024539877303, + "grad_norm": 0.8692206615761348, + "learning_rate": 1.1423863603160521e-05, + "loss": 0.5663, + "step": 3838 + }, + { + "epoch": 0.4710429447852761, + "grad_norm": 0.8421521297092119, + "learning_rate": 1.1419929798909241e-05, + "loss": 0.6118, + "step": 3839 + }, + { + "epoch": 0.47116564417177914, + "grad_norm": 0.9537875585037464, + "learning_rate": 1.141599577039226e-05, + "loss": 0.5244, + "step": 3840 + }, + { + "epoch": 0.4712883435582822, + "grad_norm": 0.8303737800661576, + "learning_rate": 1.1412061518230916e-05, + "loss": 0.5892, + "step": 3841 + }, + { + "epoch": 0.47141104294478525, + "grad_norm": 0.912236905139121, + "learning_rate": 1.1408127043046598e-05, + "loss": 0.5538, + "step": 3842 + }, + { + "epoch": 0.47153374233128836, + "grad_norm": 0.8592501920089997, + "learning_rate": 1.1404192345460717e-05, + "loss": 0.6202, + "step": 3843 + }, + { + "epoch": 0.4716564417177914, + "grad_norm": 1.0565607975713687, + "learning_rate": 1.1400257426094727e-05, + "loss": 0.5929, + "step": 3844 + }, + { + "epoch": 0.4717791411042945, + "grad_norm": 0.9415603760999204, + "learning_rate": 1.1396322285570119e-05, + "loss": 0.6333, + "step": 3845 + }, + { + "epoch": 0.47190184049079753, + "grad_norm": 0.8761227045110271, + "learning_rate": 1.1392386924508404e-05, + "loss": 0.6083, + "step": 3846 + }, + { + "epoch": 0.4720245398773006, + "grad_norm": 0.8986901654688602, + "learning_rate": 1.1388451343531151e-05, + "loss": 0.5919, + "step": 3847 + }, + { + "epoch": 0.4721472392638037, + "grad_norm": 0.919045235462357, + "learning_rate": 1.1384515543259943e-05, + "loss": 0.5997, + "step": 3848 + }, + { + "epoch": 0.47226993865030675, + "grad_norm": 0.8744292508102263, + "learning_rate": 1.1380579524316406e-05, + "loss": 0.6309, + "step": 3849 + }, + { + "epoch": 0.4723926380368098, + "grad_norm": 0.9138146841783154, + "learning_rate": 1.1376643287322202e-05, + "loss": 0.5723, + "step": 3850 + }, + { + "epoch": 0.47251533742331286, + "grad_norm": 0.9305874950127377, + "learning_rate": 1.1372706832899027e-05, + "loss": 0.5836, + "step": 3851 + }, + { + "epoch": 0.472638036809816, + "grad_norm": 0.9295978965358885, + "learning_rate": 1.136877016166861e-05, + "loss": 0.615, + "step": 3852 + }, + { + "epoch": 0.47276073619631903, + "grad_norm": 0.9013758826034735, + "learning_rate": 1.1364833274252714e-05, + "loss": 0.5406, + "step": 3853 + }, + { + "epoch": 0.4728834355828221, + "grad_norm": 1.148405143031077, + "learning_rate": 1.1360896171273134e-05, + "loss": 0.5907, + "step": 3854 + }, + { + "epoch": 0.47300613496932514, + "grad_norm": 0.8772891159788907, + "learning_rate": 1.1356958853351705e-05, + "loss": 0.5851, + "step": 3855 + }, + { + "epoch": 0.4731288343558282, + "grad_norm": 0.9822379091886584, + "learning_rate": 1.1353021321110289e-05, + "loss": 0.5881, + "step": 3856 + }, + { + "epoch": 0.4732515337423313, + "grad_norm": 0.8669993139485731, + "learning_rate": 1.134908357517079e-05, + "loss": 0.5666, + "step": 3857 + }, + { + "epoch": 0.47337423312883437, + "grad_norm": 0.8897036842449091, + "learning_rate": 1.134514561615514e-05, + "loss": 0.5845, + "step": 3858 + }, + { + "epoch": 0.4734969325153374, + "grad_norm": 0.8547174099655235, + "learning_rate": 1.1341207444685302e-05, + "loss": 0.6101, + "step": 3859 + }, + { + "epoch": 0.4736196319018405, + "grad_norm": 1.0256105095099788, + "learning_rate": 1.1337269061383278e-05, + "loss": 0.5622, + "step": 3860 + }, + { + "epoch": 0.47374233128834353, + "grad_norm": 
1.2770828143078101, + "learning_rate": 1.1333330466871102e-05, + "loss": 0.5797, + "step": 3861 + }, + { + "epoch": 0.47386503067484664, + "grad_norm": 0.9682714534816785, + "learning_rate": 1.132939166177084e-05, + "loss": 0.5472, + "step": 3862 + }, + { + "epoch": 0.4739877300613497, + "grad_norm": 0.9178072155808281, + "learning_rate": 1.1325452646704597e-05, + "loss": 0.6101, + "step": 3863 + }, + { + "epoch": 0.47411042944785275, + "grad_norm": 0.9351662004992531, + "learning_rate": 1.1321513422294505e-05, + "loss": 0.5156, + "step": 3864 + }, + { + "epoch": 0.4742331288343558, + "grad_norm": 0.9690043130582571, + "learning_rate": 1.1317573989162727e-05, + "loss": 0.5712, + "step": 3865 + }, + { + "epoch": 0.4743558282208589, + "grad_norm": 1.0293199877010444, + "learning_rate": 1.1313634347931466e-05, + "loss": 0.5847, + "step": 3866 + }, + { + "epoch": 0.474478527607362, + "grad_norm": 0.8894809363647301, + "learning_rate": 1.1309694499222953e-05, + "loss": 0.5838, + "step": 3867 + }, + { + "epoch": 0.47460122699386503, + "grad_norm": 0.8233325536956599, + "learning_rate": 1.1305754443659453e-05, + "loss": 0.5843, + "step": 3868 + }, + { + "epoch": 0.4747239263803681, + "grad_norm": 0.9215206317625063, + "learning_rate": 1.1301814181863265e-05, + "loss": 0.5923, + "step": 3869 + }, + { + "epoch": 0.47484662576687114, + "grad_norm": 0.8706542095266578, + "learning_rate": 1.129787371445672e-05, + "loss": 0.5821, + "step": 3870 + }, + { + "epoch": 0.47496932515337426, + "grad_norm": 0.8433916069515894, + "learning_rate": 1.1293933042062178e-05, + "loss": 0.5918, + "step": 3871 + }, + { + "epoch": 0.4750920245398773, + "grad_norm": 1.0004827292351322, + "learning_rate": 1.1289992165302036e-05, + "loss": 0.6294, + "step": 3872 + }, + { + "epoch": 0.47521472392638037, + "grad_norm": 0.9090599821775814, + "learning_rate": 1.1286051084798721e-05, + "loss": 0.6005, + "step": 3873 + }, + { + "epoch": 0.4753374233128834, + "grad_norm": 1.0369231672341683, + "learning_rate": 1.1282109801174691e-05, + "loss": 0.6028, + "step": 3874 + }, + { + "epoch": 0.4754601226993865, + "grad_norm": 1.1356577154597325, + "learning_rate": 1.1278168315052445e-05, + "loss": 0.5675, + "step": 3875 + }, + { + "epoch": 0.4755828220858896, + "grad_norm": 0.8976599792804338, + "learning_rate": 1.12742266270545e-05, + "loss": 0.5786, + "step": 3876 + }, + { + "epoch": 0.47570552147239265, + "grad_norm": 1.0106687391982048, + "learning_rate": 1.1270284737803418e-05, + "loss": 0.5931, + "step": 3877 + }, + { + "epoch": 0.4758282208588957, + "grad_norm": 0.9392872119386569, + "learning_rate": 1.126634264792178e-05, + "loss": 0.5802, + "step": 3878 + }, + { + "epoch": 0.47595092024539876, + "grad_norm": 0.9295169235192206, + "learning_rate": 1.1262400358032208e-05, + "loss": 0.6074, + "step": 3879 + }, + { + "epoch": 0.47607361963190187, + "grad_norm": 1.066733099118751, + "learning_rate": 1.1258457868757352e-05, + "loss": 0.6414, + "step": 3880 + }, + { + "epoch": 0.4761963190184049, + "grad_norm": 1.0740823555200902, + "learning_rate": 1.1254515180719893e-05, + "loss": 0.6548, + "step": 3881 + }, + { + "epoch": 0.476319018404908, + "grad_norm": 0.8828137842549582, + "learning_rate": 1.1250572294542548e-05, + "loss": 0.5966, + "step": 3882 + }, + { + "epoch": 0.47644171779141103, + "grad_norm": 1.0530879630240322, + "learning_rate": 1.1246629210848062e-05, + "loss": 0.5914, + "step": 3883 + }, + { + "epoch": 0.4765644171779141, + "grad_norm": 0.820766917471255, + "learning_rate": 1.1242685930259207e-05, + "loss": 0.6054, 
+ "step": 3884 + }, + { + "epoch": 0.4766871165644172, + "grad_norm": 0.8532950350533363, + "learning_rate": 1.1238742453398794e-05, + "loss": 0.6182, + "step": 3885 + }, + { + "epoch": 0.47680981595092026, + "grad_norm": 0.855118641045439, + "learning_rate": 1.123479878088966e-05, + "loss": 0.5426, + "step": 3886 + }, + { + "epoch": 0.4769325153374233, + "grad_norm": 0.8612732159537987, + "learning_rate": 1.1230854913354674e-05, + "loss": 0.583, + "step": 3887 + }, + { + "epoch": 0.47705521472392637, + "grad_norm": 0.8804900407332584, + "learning_rate": 1.1226910851416737e-05, + "loss": 0.5746, + "step": 3888 + }, + { + "epoch": 0.4771779141104294, + "grad_norm": 1.011954593294728, + "learning_rate": 1.122296659569878e-05, + "loss": 0.6087, + "step": 3889 + }, + { + "epoch": 0.47730061349693254, + "grad_norm": 0.9253069824631854, + "learning_rate": 1.1219022146823762e-05, + "loss": 0.5741, + "step": 3890 + }, + { + "epoch": 0.4774233128834356, + "grad_norm": 0.8398786607639158, + "learning_rate": 1.1215077505414676e-05, + "loss": 0.563, + "step": 3891 + }, + { + "epoch": 0.47754601226993865, + "grad_norm": 0.9430648643121433, + "learning_rate": 1.1211132672094542e-05, + "loss": 0.5613, + "step": 3892 + }, + { + "epoch": 0.4776687116564417, + "grad_norm": 0.811845233940565, + "learning_rate": 1.1207187647486418e-05, + "loss": 0.5715, + "step": 3893 + }, + { + "epoch": 0.47779141104294476, + "grad_norm": 1.0237161744225798, + "learning_rate": 1.120324243221338e-05, + "loss": 0.5787, + "step": 3894 + }, + { + "epoch": 0.47791411042944787, + "grad_norm": 0.8688147007323018, + "learning_rate": 1.1199297026898547e-05, + "loss": 0.6367, + "step": 3895 + }, + { + "epoch": 0.4780368098159509, + "grad_norm": 0.8891831516801001, + "learning_rate": 1.119535143216506e-05, + "loss": 0.6222, + "step": 3896 + }, + { + "epoch": 0.478159509202454, + "grad_norm": 0.8958185612372436, + "learning_rate": 1.1191405648636089e-05, + "loss": 0.5421, + "step": 3897 + }, + { + "epoch": 0.47828220858895704, + "grad_norm": 0.8421918364586818, + "learning_rate": 1.1187459676934842e-05, + "loss": 0.5725, + "step": 3898 + }, + { + "epoch": 0.47840490797546015, + "grad_norm": 0.9100771010921397, + "learning_rate": 1.1183513517684546e-05, + "loss": 0.5739, + "step": 3899 + }, + { + "epoch": 0.4785276073619632, + "grad_norm": 1.0995429459134252, + "learning_rate": 1.1179567171508463e-05, + "loss": 0.6198, + "step": 3900 + }, + { + "epoch": 0.47865030674846626, + "grad_norm": 0.9310067722038952, + "learning_rate": 1.1175620639029887e-05, + "loss": 0.5914, + "step": 3901 + }, + { + "epoch": 0.4787730061349693, + "grad_norm": 0.8725192031791343, + "learning_rate": 1.1171673920872134e-05, + "loss": 0.5897, + "step": 3902 + }, + { + "epoch": 0.47889570552147237, + "grad_norm": 0.8618528400931126, + "learning_rate": 1.1167727017658562e-05, + "loss": 0.5692, + "step": 3903 + }, + { + "epoch": 0.4790184049079755, + "grad_norm": 0.9019579765516432, + "learning_rate": 1.1163779930012548e-05, + "loss": 0.6164, + "step": 3904 + }, + { + "epoch": 0.47914110429447854, + "grad_norm": 0.7989640619960965, + "learning_rate": 1.1159832658557498e-05, + "loss": 0.5728, + "step": 3905 + }, + { + "epoch": 0.4792638036809816, + "grad_norm": 0.9341298606929248, + "learning_rate": 1.1155885203916851e-05, + "loss": 0.5479, + "step": 3906 + }, + { + "epoch": 0.47938650306748465, + "grad_norm": 0.9965229923136492, + "learning_rate": 1.1151937566714075e-05, + "loss": 0.6065, + "step": 3907 + }, + { + "epoch": 0.4795092024539877, + "grad_norm": 
0.9328535144228729, + "learning_rate": 1.1147989747572662e-05, + "loss": 0.6064, + "step": 3908 + }, + { + "epoch": 0.4796319018404908, + "grad_norm": 0.9186599133367862, + "learning_rate": 1.1144041747116139e-05, + "loss": 0.5721, + "step": 3909 + }, + { + "epoch": 0.47975460122699387, + "grad_norm": 1.095193457771151, + "learning_rate": 1.1140093565968055e-05, + "loss": 0.5874, + "step": 3910 + }, + { + "epoch": 0.4798773006134969, + "grad_norm": 0.9107341538194088, + "learning_rate": 1.1136145204751995e-05, + "loss": 0.562, + "step": 3911 + }, + { + "epoch": 0.48, + "grad_norm": 1.0900977171326813, + "learning_rate": 1.1132196664091568e-05, + "loss": 0.5707, + "step": 3912 + }, + { + "epoch": 0.4801226993865031, + "grad_norm": 0.949225526535955, + "learning_rate": 1.1128247944610412e-05, + "loss": 0.5983, + "step": 3913 + }, + { + "epoch": 0.48024539877300615, + "grad_norm": 0.8790256773858917, + "learning_rate": 1.1124299046932192e-05, + "loss": 0.5673, + "step": 3914 + }, + { + "epoch": 0.4803680981595092, + "grad_norm": 0.9104050647346356, + "learning_rate": 1.1120349971680605e-05, + "loss": 0.6054, + "step": 3915 + }, + { + "epoch": 0.48049079754601226, + "grad_norm": 0.9674410233368753, + "learning_rate": 1.1116400719479366e-05, + "loss": 0.6169, + "step": 3916 + }, + { + "epoch": 0.4806134969325153, + "grad_norm": 0.9065634516121608, + "learning_rate": 1.1112451290952238e-05, + "loss": 0.6589, + "step": 3917 + }, + { + "epoch": 0.4807361963190184, + "grad_norm": 0.9112582348145261, + "learning_rate": 1.110850168672299e-05, + "loss": 0.6139, + "step": 3918 + }, + { + "epoch": 0.4808588957055215, + "grad_norm": 1.311924549822126, + "learning_rate": 1.110455190741543e-05, + "loss": 0.5991, + "step": 3919 + }, + { + "epoch": 0.48098159509202454, + "grad_norm": 0.9902194984310444, + "learning_rate": 1.1100601953653393e-05, + "loss": 0.5676, + "step": 3920 + }, + { + "epoch": 0.4811042944785276, + "grad_norm": 0.9182653947934922, + "learning_rate": 1.1096651826060741e-05, + "loss": 0.6046, + "step": 3921 + }, + { + "epoch": 0.48122699386503065, + "grad_norm": 0.7882565745033847, + "learning_rate": 1.1092701525261357e-05, + "loss": 0.5484, + "step": 3922 + }, + { + "epoch": 0.48134969325153376, + "grad_norm": 1.0058503384343234, + "learning_rate": 1.1088751051879166e-05, + "loss": 0.635, + "step": 3923 + }, + { + "epoch": 0.4814723926380368, + "grad_norm": 0.917193028976448, + "learning_rate": 1.1084800406538102e-05, + "loss": 0.5831, + "step": 3924 + }, + { + "epoch": 0.4815950920245399, + "grad_norm": 0.9207816939851068, + "learning_rate": 1.1080849589862142e-05, + "loss": 0.5909, + "step": 3925 + }, + { + "epoch": 0.48171779141104293, + "grad_norm": 0.9017822626166784, + "learning_rate": 1.107689860247528e-05, + "loss": 0.5772, + "step": 3926 + }, + { + "epoch": 0.48184049079754604, + "grad_norm": 0.9183517731231851, + "learning_rate": 1.1072947445001545e-05, + "loss": 0.5714, + "step": 3927 + }, + { + "epoch": 0.4819631901840491, + "grad_norm": 1.1038681092795886, + "learning_rate": 1.1068996118064986e-05, + "loss": 0.5857, + "step": 3928 + }, + { + "epoch": 0.48208588957055215, + "grad_norm": 0.8407835456218995, + "learning_rate": 1.1065044622289675e-05, + "loss": 0.5446, + "step": 3929 + }, + { + "epoch": 0.4822085889570552, + "grad_norm": 0.8374648919046686, + "learning_rate": 1.1061092958299727e-05, + "loss": 0.5923, + "step": 3930 + }, + { + "epoch": 0.48233128834355826, + "grad_norm": 0.928850520397355, + "learning_rate": 1.1057141126719266e-05, + "loss": 0.6085, + "step": 3931 
+ }, + { + "epoch": 0.4824539877300614, + "grad_norm": 0.8622791844871385, + "learning_rate": 1.1053189128172454e-05, + "loss": 0.5714, + "step": 3932 + }, + { + "epoch": 0.48257668711656443, + "grad_norm": 0.888588721890906, + "learning_rate": 1.1049236963283474e-05, + "loss": 0.5973, + "step": 3933 + }, + { + "epoch": 0.4826993865030675, + "grad_norm": 0.9260192587329154, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.6246, + "step": 3934 + }, + { + "epoch": 0.48282208588957054, + "grad_norm": 0.8923497459734298, + "learning_rate": 1.1041332136975874e-05, + "loss": 0.5801, + "step": 3935 + }, + { + "epoch": 0.4829447852760736, + "grad_norm": 0.8217052529430623, + "learning_rate": 1.1037379476805757e-05, + "loss": 0.6405, + "step": 3936 + }, + { + "epoch": 0.4830674846625767, + "grad_norm": 0.8899650885546319, + "learning_rate": 1.1033426652790469e-05, + "loss": 0.5263, + "step": 3937 + }, + { + "epoch": 0.48319018404907976, + "grad_norm": 0.8592035849829623, + "learning_rate": 1.1029473665554327e-05, + "loss": 0.5553, + "step": 3938 + }, + { + "epoch": 0.4833128834355828, + "grad_norm": 0.8760511533945242, + "learning_rate": 1.1025520515721668e-05, + "loss": 0.5791, + "step": 3939 + }, + { + "epoch": 0.4834355828220859, + "grad_norm": 1.265626204371407, + "learning_rate": 1.1021567203916861e-05, + "loss": 0.6298, + "step": 3940 + }, + { + "epoch": 0.48355828220858893, + "grad_norm": 0.9000949398485553, + "learning_rate": 1.1017613730764295e-05, + "loss": 0.5855, + "step": 3941 + }, + { + "epoch": 0.48368098159509204, + "grad_norm": 0.8819037881424008, + "learning_rate": 1.1013660096888392e-05, + "loss": 0.5514, + "step": 3942 + }, + { + "epoch": 0.4838036809815951, + "grad_norm": 1.0508035208358246, + "learning_rate": 1.1009706302913587e-05, + "loss": 0.5658, + "step": 3943 + }, + { + "epoch": 0.48392638036809815, + "grad_norm": 0.9843582217410337, + "learning_rate": 1.1005752349464353e-05, + "loss": 0.5685, + "step": 3944 + }, + { + "epoch": 0.4840490797546012, + "grad_norm": 0.908183216687965, + "learning_rate": 1.1001798237165185e-05, + "loss": 0.5763, + "step": 3945 + }, + { + "epoch": 0.4841717791411043, + "grad_norm": 0.9058659171000769, + "learning_rate": 1.0997843966640595e-05, + "loss": 0.619, + "step": 3946 + }, + { + "epoch": 0.4842944785276074, + "grad_norm": 1.048713701913548, + "learning_rate": 1.0993889538515135e-05, + "loss": 0.5771, + "step": 3947 + }, + { + "epoch": 0.48441717791411043, + "grad_norm": 0.8780057962219862, + "learning_rate": 1.0989934953413361e-05, + "loss": 0.6505, + "step": 3948 + }, + { + "epoch": 0.4845398773006135, + "grad_norm": 1.0466496787422803, + "learning_rate": 1.0985980211959875e-05, + "loss": 0.6218, + "step": 3949 + }, + { + "epoch": 0.48466257668711654, + "grad_norm": 0.8242409951571382, + "learning_rate": 1.0982025314779287e-05, + "loss": 0.5727, + "step": 3950 + }, + { + "epoch": 0.48478527607361965, + "grad_norm": 0.9830929020947211, + "learning_rate": 1.0978070262496248e-05, + "loss": 0.5567, + "step": 3951 + }, + { + "epoch": 0.4849079754601227, + "grad_norm": 0.998545051418398, + "learning_rate": 1.0974115055735417e-05, + "loss": 0.6471, + "step": 3952 + }, + { + "epoch": 0.48503067484662576, + "grad_norm": 0.8677086955228384, + "learning_rate": 1.0970159695121488e-05, + "loss": 0.5709, + "step": 3953 + }, + { + "epoch": 0.4851533742331288, + "grad_norm": 0.9652790238283316, + "learning_rate": 1.0966204181279174e-05, + "loss": 0.6026, + "step": 3954 + }, + { + "epoch": 0.4852760736196319, + "grad_norm": 0.8654119765150505, + 
"learning_rate": 1.0962248514833218e-05, + "loss": 0.5662, + "step": 3955 + }, + { + "epoch": 0.485398773006135, + "grad_norm": 0.8753819219365029, + "learning_rate": 1.0958292696408381e-05, + "loss": 0.5661, + "step": 3956 + }, + { + "epoch": 0.48552147239263804, + "grad_norm": 0.8761952170610189, + "learning_rate": 1.0954336726629453e-05, + "loss": 0.5643, + "step": 3957 + }, + { + "epoch": 0.4856441717791411, + "grad_norm": 0.8461303571192447, + "learning_rate": 1.0950380606121242e-05, + "loss": 0.5714, + "step": 3958 + }, + { + "epoch": 0.48576687116564415, + "grad_norm": 0.9664584431618993, + "learning_rate": 1.0946424335508585e-05, + "loss": 0.5367, + "step": 3959 + }, + { + "epoch": 0.48588957055214727, + "grad_norm": 0.9315324501917926, + "learning_rate": 1.0942467915416342e-05, + "loss": 0.5189, + "step": 3960 + }, + { + "epoch": 0.4860122699386503, + "grad_norm": 0.8543155485765072, + "learning_rate": 1.0938511346469392e-05, + "loss": 0.6237, + "step": 3961 + }, + { + "epoch": 0.4861349693251534, + "grad_norm": 1.079388675590392, + "learning_rate": 1.0934554629292645e-05, + "loss": 0.5695, + "step": 3962 + }, + { + "epoch": 0.48625766871165643, + "grad_norm": 0.8504920040390457, + "learning_rate": 1.0930597764511028e-05, + "loss": 0.5742, + "step": 3963 + }, + { + "epoch": 0.4863803680981595, + "grad_norm": 0.8982917922647862, + "learning_rate": 1.0926640752749496e-05, + "loss": 0.5549, + "step": 3964 + }, + { + "epoch": 0.4865030674846626, + "grad_norm": 1.015046587046694, + "learning_rate": 1.092268359463302e-05, + "loss": 0.5682, + "step": 3965 + }, + { + "epoch": 0.48662576687116565, + "grad_norm": 0.8847430581495225, + "learning_rate": 1.0918726290786606e-05, + "loss": 0.5708, + "step": 3966 + }, + { + "epoch": 0.4867484662576687, + "grad_norm": 0.8787057376814037, + "learning_rate": 1.0914768841835272e-05, + "loss": 0.535, + "step": 3967 + }, + { + "epoch": 0.48687116564417177, + "grad_norm": 0.8755821166580213, + "learning_rate": 1.0910811248404064e-05, + "loss": 0.6152, + "step": 3968 + }, + { + "epoch": 0.4869938650306748, + "grad_norm": 0.9405711875351856, + "learning_rate": 1.0906853511118051e-05, + "loss": 0.6172, + "step": 3969 + }, + { + "epoch": 0.48711656441717793, + "grad_norm": 0.817600430876492, + "learning_rate": 1.090289563060232e-05, + "loss": 0.5436, + "step": 3970 + }, + { + "epoch": 0.487239263803681, + "grad_norm": 0.8296461519312676, + "learning_rate": 1.0898937607481985e-05, + "loss": 0.5863, + "step": 3971 + }, + { + "epoch": 0.48736196319018404, + "grad_norm": 0.8746422736970981, + "learning_rate": 1.0894979442382187e-05, + "loss": 0.5981, + "step": 3972 + }, + { + "epoch": 0.4874846625766871, + "grad_norm": 0.8374244085595219, + "learning_rate": 1.089102113592808e-05, + "loss": 0.5408, + "step": 3973 + }, + { + "epoch": 0.4876073619631902, + "grad_norm": 1.0073807960855914, + "learning_rate": 1.0887062688744843e-05, + "loss": 0.5997, + "step": 3974 + }, + { + "epoch": 0.48773006134969327, + "grad_norm": 0.98253230310621, + "learning_rate": 1.088310410145768e-05, + "loss": 0.6097, + "step": 3975 + }, + { + "epoch": 0.4878527607361963, + "grad_norm": 1.0126182119024751, + "learning_rate": 1.0879145374691818e-05, + "loss": 0.6157, + "step": 3976 + }, + { + "epoch": 0.4879754601226994, + "grad_norm": 0.9044630604618383, + "learning_rate": 1.0875186509072502e-05, + "loss": 0.5672, + "step": 3977 + }, + { + "epoch": 0.48809815950920243, + "grad_norm": 0.8961585517979324, + "learning_rate": 1.0871227505225001e-05, + "loss": 0.5828, + "step": 3978 + }, + { 
+ "epoch": 0.48822085889570555, + "grad_norm": 0.8809301336288059, + "learning_rate": 1.0867268363774606e-05, + "loss": 0.5965, + "step": 3979 + }, + { + "epoch": 0.4883435582822086, + "grad_norm": 0.9356958331696638, + "learning_rate": 1.086330908534663e-05, + "loss": 0.5788, + "step": 3980 + }, + { + "epoch": 0.48846625766871166, + "grad_norm": 0.9462227063567598, + "learning_rate": 1.0859349670566404e-05, + "loss": 0.5965, + "step": 3981 + }, + { + "epoch": 0.4885889570552147, + "grad_norm": 0.9215888345374578, + "learning_rate": 1.0855390120059284e-05, + "loss": 0.5764, + "step": 3982 + }, + { + "epoch": 0.48871165644171777, + "grad_norm": 1.0169467094948188, + "learning_rate": 1.0851430434450652e-05, + "loss": 0.5715, + "step": 3983 + }, + { + "epoch": 0.4888343558282209, + "grad_norm": 0.9587450909769529, + "learning_rate": 1.08474706143659e-05, + "loss": 0.5838, + "step": 3984 + }, + { + "epoch": 0.48895705521472393, + "grad_norm": 0.934800694399993, + "learning_rate": 1.0843510660430447e-05, + "loss": 0.604, + "step": 3985 + }, + { + "epoch": 0.489079754601227, + "grad_norm": 1.0504698592237363, + "learning_rate": 1.0839550573269744e-05, + "loss": 0.6219, + "step": 3986 + }, + { + "epoch": 0.48920245398773005, + "grad_norm": 0.985745585732801, + "learning_rate": 1.0835590353509244e-05, + "loss": 0.5912, + "step": 3987 + }, + { + "epoch": 0.4893251533742331, + "grad_norm": 0.9126345540536909, + "learning_rate": 1.083163000177443e-05, + "loss": 0.5453, + "step": 3988 + }, + { + "epoch": 0.4894478527607362, + "grad_norm": 0.8228875681025074, + "learning_rate": 1.0827669518690806e-05, + "loss": 0.5884, + "step": 3989 + }, + { + "epoch": 0.48957055214723927, + "grad_norm": 0.9619237850204946, + "learning_rate": 1.0823708904883898e-05, + "loss": 0.601, + "step": 3990 + }, + { + "epoch": 0.4896932515337423, + "grad_norm": 1.001275858392725, + "learning_rate": 1.081974816097925e-05, + "loss": 0.5532, + "step": 3991 + }, + { + "epoch": 0.4898159509202454, + "grad_norm": 0.8303401687064357, + "learning_rate": 1.0815787287602428e-05, + "loss": 0.5729, + "step": 3992 + }, + { + "epoch": 0.4899386503067485, + "grad_norm": 0.9699330760602971, + "learning_rate": 1.0811826285379018e-05, + "loss": 0.6001, + "step": 3993 + }, + { + "epoch": 0.49006134969325155, + "grad_norm": 0.8845322310615932, + "learning_rate": 1.0807865154934625e-05, + "loss": 0.5626, + "step": 3994 + }, + { + "epoch": 0.4901840490797546, + "grad_norm": 0.9624800616244106, + "learning_rate": 1.0803903896894877e-05, + "loss": 0.5578, + "step": 3995 + }, + { + "epoch": 0.49030674846625766, + "grad_norm": 0.8725883448859877, + "learning_rate": 1.0799942511885417e-05, + "loss": 0.5704, + "step": 3996 + }, + { + "epoch": 0.4904294478527607, + "grad_norm": 0.9555664580437743, + "learning_rate": 1.0795981000531917e-05, + "loss": 0.5588, + "step": 3997 + }, + { + "epoch": 0.4905521472392638, + "grad_norm": 0.8549257632916468, + "learning_rate": 1.079201936346006e-05, + "loss": 0.6149, + "step": 3998 + }, + { + "epoch": 0.4906748466257669, + "grad_norm": 0.7957790309044372, + "learning_rate": 1.0788057601295553e-05, + "loss": 0.565, + "step": 3999 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.8361788783677527, + "learning_rate": 1.0784095714664124e-05, + "loss": 0.6134, + "step": 4000 + }, + { + "epoch": 0.490920245398773, + "grad_norm": 0.9775599807123397, + "learning_rate": 1.0780133704191517e-05, + "loss": 0.5677, + "step": 4001 + }, + { + "epoch": 0.49104294478527605, + "grad_norm": 0.9955331585679262, + "learning_rate": 
1.07761715705035e-05, + "loss": 0.5503, + "step": 4002 + }, + { + "epoch": 0.49116564417177916, + "grad_norm": 0.9415301128103238, + "learning_rate": 1.0772209314225857e-05, + "loss": 0.5997, + "step": 4003 + }, + { + "epoch": 0.4912883435582822, + "grad_norm": 0.8562008560030794, + "learning_rate": 1.0768246935984387e-05, + "loss": 0.6322, + "step": 4004 + }, + { + "epoch": 0.49141104294478527, + "grad_norm": 1.0313821119034303, + "learning_rate": 1.0764284436404924e-05, + "loss": 0.6396, + "step": 4005 + }, + { + "epoch": 0.4915337423312883, + "grad_norm": 0.9044289905883862, + "learning_rate": 1.0760321816113302e-05, + "loss": 0.6197, + "step": 4006 + }, + { + "epoch": 0.49165644171779144, + "grad_norm": 0.9302527607853867, + "learning_rate": 1.0756359075735385e-05, + "loss": 0.5774, + "step": 4007 + }, + { + "epoch": 0.4917791411042945, + "grad_norm": 0.9490051538801544, + "learning_rate": 1.0752396215897056e-05, + "loss": 0.5951, + "step": 4008 + }, + { + "epoch": 0.49190184049079755, + "grad_norm": 1.1956186879842157, + "learning_rate": 1.0748433237224213e-05, + "loss": 0.5984, + "step": 4009 + }, + { + "epoch": 0.4920245398773006, + "grad_norm": 0.9553288351289959, + "learning_rate": 1.0744470140342775e-05, + "loss": 0.5528, + "step": 4010 + }, + { + "epoch": 0.49214723926380366, + "grad_norm": 0.8983893876400526, + "learning_rate": 1.0740506925878677e-05, + "loss": 0.5994, + "step": 4011 + }, + { + "epoch": 0.49226993865030677, + "grad_norm": 0.861539218422224, + "learning_rate": 1.0736543594457876e-05, + "loss": 0.6058, + "step": 4012 + }, + { + "epoch": 0.4923926380368098, + "grad_norm": 0.9196280262276703, + "learning_rate": 1.0732580146706347e-05, + "loss": 0.5577, + "step": 4013 + }, + { + "epoch": 0.4925153374233129, + "grad_norm": 0.8190868249769032, + "learning_rate": 1.0728616583250082e-05, + "loss": 0.577, + "step": 4014 + }, + { + "epoch": 0.49263803680981594, + "grad_norm": 0.9659245366441422, + "learning_rate": 1.0724652904715091e-05, + "loss": 0.5563, + "step": 4015 + }, + { + "epoch": 0.492760736196319, + "grad_norm": 0.936476656193469, + "learning_rate": 1.0720689111727407e-05, + "loss": 0.5605, + "step": 4016 + }, + { + "epoch": 0.4928834355828221, + "grad_norm": 0.9434519417457714, + "learning_rate": 1.0716725204913072e-05, + "loss": 0.5721, + "step": 4017 + }, + { + "epoch": 0.49300613496932516, + "grad_norm": 1.0005483990486062, + "learning_rate": 1.0712761184898153e-05, + "loss": 0.5438, + "step": 4018 + }, + { + "epoch": 0.4931288343558282, + "grad_norm": 0.8199766255264919, + "learning_rate": 1.070879705230873e-05, + "loss": 0.5784, + "step": 4019 + }, + { + "epoch": 0.49325153374233127, + "grad_norm": 0.8338711137390801, + "learning_rate": 1.0704832807770909e-05, + "loss": 0.5573, + "step": 4020 + }, + { + "epoch": 0.4933742331288344, + "grad_norm": 0.9052189099410871, + "learning_rate": 1.0700868451910804e-05, + "loss": 0.6177, + "step": 4021 + }, + { + "epoch": 0.49349693251533744, + "grad_norm": 0.9135499522357553, + "learning_rate": 1.0696903985354555e-05, + "loss": 0.5775, + "step": 4022 + }, + { + "epoch": 0.4936196319018405, + "grad_norm": 0.9130961548066439, + "learning_rate": 1.0692939408728309e-05, + "loss": 0.5594, + "step": 4023 + }, + { + "epoch": 0.49374233128834355, + "grad_norm": 0.8557474142894992, + "learning_rate": 1.0688974722658244e-05, + "loss": 0.5678, + "step": 4024 + }, + { + "epoch": 0.4938650306748466, + "grad_norm": 0.9537009275704784, + "learning_rate": 1.0685009927770542e-05, + "loss": 0.5854, + "step": 4025 + }, + { + "epoch": 
0.4939877300613497, + "grad_norm": 0.9238302962890808, + "learning_rate": 1.068104502469141e-05, + "loss": 0.6013, + "step": 4026 + }, + { + "epoch": 0.4941104294478528, + "grad_norm": 0.8922534260688605, + "learning_rate": 1.0677080014047076e-05, + "loss": 0.564, + "step": 4027 + }, + { + "epoch": 0.49423312883435583, + "grad_norm": 0.8641893299846002, + "learning_rate": 1.0673114896463772e-05, + "loss": 0.5806, + "step": 4028 + }, + { + "epoch": 0.4943558282208589, + "grad_norm": 0.8605945264935502, + "learning_rate": 1.0669149672567758e-05, + "loss": 0.5899, + "step": 4029 + }, + { + "epoch": 0.49447852760736194, + "grad_norm": 0.8935878430142736, + "learning_rate": 1.0665184342985306e-05, + "loss": 0.61, + "step": 4030 + }, + { + "epoch": 0.49460122699386505, + "grad_norm": 0.9519338692593231, + "learning_rate": 1.0661218908342705e-05, + "loss": 0.5579, + "step": 4031 + }, + { + "epoch": 0.4947239263803681, + "grad_norm": 0.9911055300008873, + "learning_rate": 1.0657253369266263e-05, + "loss": 0.5931, + "step": 4032 + }, + { + "epoch": 0.49484662576687116, + "grad_norm": 0.8641029363391343, + "learning_rate": 1.06532877263823e-05, + "loss": 0.5631, + "step": 4033 + }, + { + "epoch": 0.4949693251533742, + "grad_norm": 0.8665497357961034, + "learning_rate": 1.0649321980317158e-05, + "loss": 0.5603, + "step": 4034 + }, + { + "epoch": 0.4950920245398773, + "grad_norm": 0.8378768455799227, + "learning_rate": 1.064535613169719e-05, + "loss": 0.5609, + "step": 4035 + }, + { + "epoch": 0.4952147239263804, + "grad_norm": 0.9342625904343552, + "learning_rate": 1.0641390181148772e-05, + "loss": 0.5542, + "step": 4036 + }, + { + "epoch": 0.49533742331288344, + "grad_norm": 1.1571935163962008, + "learning_rate": 1.0637424129298288e-05, + "loss": 0.5619, + "step": 4037 + }, + { + "epoch": 0.4954601226993865, + "grad_norm": 0.8351935835072869, + "learning_rate": 1.0633457976772143e-05, + "loss": 0.5607, + "step": 4038 + }, + { + "epoch": 0.49558282208588955, + "grad_norm": 0.9300115813372859, + "learning_rate": 1.0629491724196759e-05, + "loss": 0.6044, + "step": 4039 + }, + { + "epoch": 0.49570552147239266, + "grad_norm": 0.8840577320954753, + "learning_rate": 1.0625525372198564e-05, + "loss": 0.5413, + "step": 4040 + }, + { + "epoch": 0.4958282208588957, + "grad_norm": 0.8741711886032835, + "learning_rate": 1.0621558921404016e-05, + "loss": 0.585, + "step": 4041 + }, + { + "epoch": 0.4959509202453988, + "grad_norm": 0.7470403145878659, + "learning_rate": 1.061759237243958e-05, + "loss": 0.5549, + "step": 4042 + }, + { + "epoch": 0.49607361963190183, + "grad_norm": 0.8534258564818081, + "learning_rate": 1.0613625725931738e-05, + "loss": 0.6179, + "step": 4043 + }, + { + "epoch": 0.4961963190184049, + "grad_norm": 0.8322149726248599, + "learning_rate": 1.060965898250699e-05, + "loss": 0.611, + "step": 4044 + }, + { + "epoch": 0.496319018404908, + "grad_norm": 0.871372494135632, + "learning_rate": 1.0605692142791846e-05, + "loss": 0.6184, + "step": 4045 + }, + { + "epoch": 0.49644171779141105, + "grad_norm": 0.9830372611638843, + "learning_rate": 1.0601725207412835e-05, + "loss": 0.5863, + "step": 4046 + }, + { + "epoch": 0.4965644171779141, + "grad_norm": 0.8291605934905063, + "learning_rate": 1.05977581769965e-05, + "loss": 0.617, + "step": 4047 + }, + { + "epoch": 0.49668711656441716, + "grad_norm": 1.0176960274507483, + "learning_rate": 1.0593791052169402e-05, + "loss": 0.636, + "step": 4048 + }, + { + "epoch": 0.4968098159509202, + "grad_norm": 0.8492779435253303, + "learning_rate": 
1.0589823833558111e-05, + "loss": 0.5961, + "step": 4049 + }, + { + "epoch": 0.49693251533742333, + "grad_norm": 1.0021320569727579, + "learning_rate": 1.0585856521789215e-05, + "loss": 0.5719, + "step": 4050 + }, + { + "epoch": 0.4970552147239264, + "grad_norm": 1.000199172384964, + "learning_rate": 1.058188911748932e-05, + "loss": 0.5346, + "step": 4051 + }, + { + "epoch": 0.49717791411042944, + "grad_norm": 0.8457294757065806, + "learning_rate": 1.0577921621285041e-05, + "loss": 0.5593, + "step": 4052 + }, + { + "epoch": 0.4973006134969325, + "grad_norm": 0.9538743759180351, + "learning_rate": 1.0573954033803006e-05, + "loss": 0.5638, + "step": 4053 + }, + { + "epoch": 0.4974233128834356, + "grad_norm": 0.9131027188669268, + "learning_rate": 1.0569986355669872e-05, + "loss": 0.5595, + "step": 4054 + }, + { + "epoch": 0.49754601226993866, + "grad_norm": 0.8632809479083291, + "learning_rate": 1.056601858751229e-05, + "loss": 0.6086, + "step": 4055 + }, + { + "epoch": 0.4976687116564417, + "grad_norm": 0.8731150228446795, + "learning_rate": 1.0562050729956941e-05, + "loss": 0.5446, + "step": 4056 + }, + { + "epoch": 0.4977914110429448, + "grad_norm": 0.8878932827861601, + "learning_rate": 1.055808278363051e-05, + "loss": 0.6109, + "step": 4057 + }, + { + "epoch": 0.49791411042944783, + "grad_norm": 0.9026829888087737, + "learning_rate": 1.05541147491597e-05, + "loss": 0.6479, + "step": 4058 + }, + { + "epoch": 0.49803680981595094, + "grad_norm": 0.9039399871345406, + "learning_rate": 1.0550146627171231e-05, + "loss": 0.5602, + "step": 4059 + }, + { + "epoch": 0.498159509202454, + "grad_norm": 0.9790015327607489, + "learning_rate": 1.0546178418291833e-05, + "loss": 0.5757, + "step": 4060 + }, + { + "epoch": 0.49828220858895705, + "grad_norm": 0.9743110564498271, + "learning_rate": 1.0542210123148246e-05, + "loss": 0.6069, + "step": 4061 + }, + { + "epoch": 0.4984049079754601, + "grad_norm": 0.9125978456523475, + "learning_rate": 1.0538241742367233e-05, + "loss": 0.5615, + "step": 4062 + }, + { + "epoch": 0.49852760736196317, + "grad_norm": 1.0949909959556996, + "learning_rate": 1.0534273276575564e-05, + "loss": 0.5721, + "step": 4063 + }, + { + "epoch": 0.4986503067484663, + "grad_norm": 0.9191401486407043, + "learning_rate": 1.0530304726400025e-05, + "loss": 0.5638, + "step": 4064 + }, + { + "epoch": 0.49877300613496933, + "grad_norm": 0.8704910416385632, + "learning_rate": 1.0526336092467414e-05, + "loss": 0.5702, + "step": 4065 + }, + { + "epoch": 0.4988957055214724, + "grad_norm": 0.9226983825542691, + "learning_rate": 1.0522367375404539e-05, + "loss": 0.5778, + "step": 4066 + }, + { + "epoch": 0.49901840490797544, + "grad_norm": 0.8577332993915414, + "learning_rate": 1.051839857583823e-05, + "loss": 0.5764, + "step": 4067 + }, + { + "epoch": 0.49914110429447855, + "grad_norm": 0.9700641321899501, + "learning_rate": 1.0514429694395323e-05, + "loss": 0.5571, + "step": 4068 + }, + { + "epoch": 0.4992638036809816, + "grad_norm": 0.9527482158813381, + "learning_rate": 1.0510460731702667e-05, + "loss": 0.6081, + "step": 4069 + }, + { + "epoch": 0.49938650306748467, + "grad_norm": 0.8469931831591367, + "learning_rate": 1.0506491688387128e-05, + "loss": 0.5628, + "step": 4070 + }, + { + "epoch": 0.4995092024539877, + "grad_norm": 0.9926535762787527, + "learning_rate": 1.0502522565075581e-05, + "loss": 0.6269, + "step": 4071 + }, + { + "epoch": 0.4996319018404908, + "grad_norm": 0.9054526170254596, + "learning_rate": 1.0498553362394916e-05, + "loss": 0.5273, + "step": 4072 + }, + { + "epoch": 
0.4997546012269939, + "grad_norm": 0.970335519380831, + "learning_rate": 1.0494584080972032e-05, + "loss": 0.5432, + "step": 4073 + }, + { + "epoch": 0.49987730061349694, + "grad_norm": 0.9160584466654129, + "learning_rate": 1.0490614721433846e-05, + "loss": 0.5711, + "step": 4074 + }, + { + "epoch": 0.5, + "grad_norm": 0.8392179012192376, + "learning_rate": 1.0486645284407282e-05, + "loss": 0.6029, + "step": 4075 + }, + { + "epoch": 0.5001226993865031, + "grad_norm": 0.9497832168939825, + "learning_rate": 1.0482675770519279e-05, + "loss": 0.5535, + "step": 4076 + }, + { + "epoch": 0.5002453987730061, + "grad_norm": 0.9971908342314618, + "learning_rate": 1.0478706180396788e-05, + "loss": 0.6157, + "step": 4077 + }, + { + "epoch": 0.5003680981595092, + "grad_norm": 0.9860868225327746, + "learning_rate": 1.0474736514666772e-05, + "loss": 0.5722, + "step": 4078 + }, + { + "epoch": 0.5004907975460122, + "grad_norm": 0.9519048109175037, + "learning_rate": 1.0470766773956205e-05, + "loss": 0.6107, + "step": 4079 + }, + { + "epoch": 0.5006134969325153, + "grad_norm": 0.8946163793558849, + "learning_rate": 1.0466796958892071e-05, + "loss": 0.5282, + "step": 4080 + }, + { + "epoch": 0.5007361963190184, + "grad_norm": 0.8120078802821484, + "learning_rate": 1.046282707010137e-05, + "loss": 0.583, + "step": 4081 + }, + { + "epoch": 0.5008588957055214, + "grad_norm": 1.0250955412719729, + "learning_rate": 1.0458857108211113e-05, + "loss": 0.5711, + "step": 4082 + }, + { + "epoch": 0.5009815950920246, + "grad_norm": 0.8815430823899145, + "learning_rate": 1.0454887073848323e-05, + "loss": 0.5834, + "step": 4083 + }, + { + "epoch": 0.5011042944785276, + "grad_norm": 0.9203676133756349, + "learning_rate": 1.0450916967640028e-05, + "loss": 0.592, + "step": 4084 + }, + { + "epoch": 0.5012269938650307, + "grad_norm": 0.972294355594408, + "learning_rate": 1.0446946790213275e-05, + "loss": 0.5824, + "step": 4085 + }, + { + "epoch": 0.5013496932515338, + "grad_norm": 0.9394112873347435, + "learning_rate": 1.044297654219512e-05, + "loss": 0.5743, + "step": 4086 + }, + { + "epoch": 0.5014723926380368, + "grad_norm": 0.8740600114160548, + "learning_rate": 1.0439006224212629e-05, + "loss": 0.5389, + "step": 4087 + }, + { + "epoch": 0.5015950920245399, + "grad_norm": 0.9577702987739091, + "learning_rate": 1.0435035836892879e-05, + "loss": 0.5778, + "step": 4088 + }, + { + "epoch": 0.5017177914110429, + "grad_norm": 0.8478594687094451, + "learning_rate": 1.0431065380862959e-05, + "loss": 0.5973, + "step": 4089 + }, + { + "epoch": 0.501840490797546, + "grad_norm": 0.8636980420285405, + "learning_rate": 1.0427094856749966e-05, + "loss": 0.5598, + "step": 4090 + }, + { + "epoch": 0.5019631901840491, + "grad_norm": 0.936826616447931, + "learning_rate": 1.0423124265181012e-05, + "loss": 0.5055, + "step": 4091 + }, + { + "epoch": 0.5020858895705521, + "grad_norm": 0.9040490686268186, + "learning_rate": 1.0419153606783219e-05, + "loss": 0.5447, + "step": 4092 + }, + { + "epoch": 0.5022085889570552, + "grad_norm": 0.8134718003676346, + "learning_rate": 1.0415182882183719e-05, + "loss": 0.5719, + "step": 4093 + }, + { + "epoch": 0.5023312883435583, + "grad_norm": 0.8584018920208903, + "learning_rate": 1.0411212092009647e-05, + "loss": 0.5499, + "step": 4094 + }, + { + "epoch": 0.5024539877300613, + "grad_norm": 0.9758164952599727, + "learning_rate": 1.0407241236888164e-05, + "loss": 0.5877, + "step": 4095 + }, + { + "epoch": 0.5025766871165644, + "grad_norm": 0.9384201033856219, + "learning_rate": 1.0403270317446428e-05, + 
"loss": 0.6271, + "step": 4096 + }, + { + "epoch": 0.5026993865030674, + "grad_norm": 0.9003687545475135, + "learning_rate": 1.0399299334311615e-05, + "loss": 0.6229, + "step": 4097 + }, + { + "epoch": 0.5028220858895706, + "grad_norm": 0.8272082529069057, + "learning_rate": 1.0395328288110902e-05, + "loss": 0.6012, + "step": 4098 + }, + { + "epoch": 0.5029447852760737, + "grad_norm": 0.7373552899244734, + "learning_rate": 1.0391357179471486e-05, + "loss": 0.6085, + "step": 4099 + }, + { + "epoch": 0.5030674846625767, + "grad_norm": 0.8587077637997008, + "learning_rate": 1.0387386009020569e-05, + "loss": 0.5527, + "step": 4100 + }, + { + "epoch": 0.5031901840490798, + "grad_norm": 0.9126058589941857, + "learning_rate": 1.038341477738536e-05, + "loss": 0.5727, + "step": 4101 + }, + { + "epoch": 0.5033128834355828, + "grad_norm": 0.9500435835765016, + "learning_rate": 1.0379443485193083e-05, + "loss": 0.5657, + "step": 4102 + }, + { + "epoch": 0.5034355828220859, + "grad_norm": 0.8488648252168539, + "learning_rate": 1.0375472133070969e-05, + "loss": 0.5432, + "step": 4103 + }, + { + "epoch": 0.503558282208589, + "grad_norm": 0.8361889568848454, + "learning_rate": 1.037150072164626e-05, + "loss": 0.5774, + "step": 4104 + }, + { + "epoch": 0.503680981595092, + "grad_norm": 0.9432583033384131, + "learning_rate": 1.0367529251546208e-05, + "loss": 0.6197, + "step": 4105 + }, + { + "epoch": 0.5038036809815951, + "grad_norm": 0.8839326646740705, + "learning_rate": 1.036355772339807e-05, + "loss": 0.5798, + "step": 4106 + }, + { + "epoch": 0.5039263803680981, + "grad_norm": 0.9205735725886734, + "learning_rate": 1.0359586137829115e-05, + "loss": 0.4772, + "step": 4107 + }, + { + "epoch": 0.5040490797546012, + "grad_norm": 0.8758828897060641, + "learning_rate": 1.0355614495466621e-05, + "loss": 0.5963, + "step": 4108 + }, + { + "epoch": 0.5041717791411043, + "grad_norm": 1.1654498965171594, + "learning_rate": 1.0351642796937873e-05, + "loss": 0.618, + "step": 4109 + }, + { + "epoch": 0.5042944785276073, + "grad_norm": 0.9602476610843593, + "learning_rate": 1.034767104287017e-05, + "loss": 0.5552, + "step": 4110 + }, + { + "epoch": 0.5044171779141104, + "grad_norm": 0.9602111120837373, + "learning_rate": 1.0343699233890814e-05, + "loss": 0.6015, + "step": 4111 + }, + { + "epoch": 0.5045398773006134, + "grad_norm": 1.375060925423841, + "learning_rate": 1.0339727370627119e-05, + "loss": 0.5761, + "step": 4112 + }, + { + "epoch": 0.5046625766871166, + "grad_norm": 0.9816581398509787, + "learning_rate": 1.0335755453706407e-05, + "loss": 0.5897, + "step": 4113 + }, + { + "epoch": 0.5047852760736197, + "grad_norm": 1.0262311244811602, + "learning_rate": 1.0331783483756008e-05, + "loss": 0.5863, + "step": 4114 + }, + { + "epoch": 0.5049079754601227, + "grad_norm": 0.8718115799702253, + "learning_rate": 1.032781146140326e-05, + "loss": 0.5955, + "step": 4115 + }, + { + "epoch": 0.5050306748466258, + "grad_norm": 0.846621521289147, + "learning_rate": 1.0323839387275508e-05, + "loss": 0.5581, + "step": 4116 + }, + { + "epoch": 0.5051533742331288, + "grad_norm": 0.9032876360008729, + "learning_rate": 1.031986726200011e-05, + "loss": 0.5666, + "step": 4117 + }, + { + "epoch": 0.5052760736196319, + "grad_norm": 0.9726905679934282, + "learning_rate": 1.031589508620443e-05, + "loss": 0.5637, + "step": 4118 + }, + { + "epoch": 0.505398773006135, + "grad_norm": 0.8975178379008764, + "learning_rate": 1.0311922860515835e-05, + "loss": 0.5362, + "step": 4119 + }, + { + "epoch": 0.505521472392638, + "grad_norm": 
0.8498315006597545, + "learning_rate": 1.0307950585561705e-05, + "loss": 0.5804, + "step": 4120 + }, + { + "epoch": 0.5056441717791411, + "grad_norm": 0.8987144078631134, + "learning_rate": 1.030397826196943e-05, + "loss": 0.5524, + "step": 4121 + }, + { + "epoch": 0.5057668711656441, + "grad_norm": 0.9181005521911697, + "learning_rate": 1.0300005890366397e-05, + "loss": 0.5357, + "step": 4122 + }, + { + "epoch": 0.5058895705521472, + "grad_norm": 0.950854679736885, + "learning_rate": 1.0296033471380016e-05, + "loss": 0.6174, + "step": 4123 + }, + { + "epoch": 0.5060122699386503, + "grad_norm": 1.0799484092197635, + "learning_rate": 1.0292061005637695e-05, + "loss": 0.6079, + "step": 4124 + }, + { + "epoch": 0.5061349693251533, + "grad_norm": 0.890466788896417, + "learning_rate": 1.0288088493766846e-05, + "loss": 0.5144, + "step": 4125 + }, + { + "epoch": 0.5062576687116565, + "grad_norm": 0.8204727377545145, + "learning_rate": 1.02841159363949e-05, + "loss": 0.5417, + "step": 4126 + }, + { + "epoch": 0.5063803680981596, + "grad_norm": 0.881931628932646, + "learning_rate": 1.028014333414928e-05, + "loss": 0.6017, + "step": 4127 + }, + { + "epoch": 0.5065030674846626, + "grad_norm": 0.9524849907398186, + "learning_rate": 1.0276170687657431e-05, + "loss": 0.5512, + "step": 4128 + }, + { + "epoch": 0.5066257668711657, + "grad_norm": 0.9004757706318356, + "learning_rate": 1.0272197997546799e-05, + "loss": 0.5643, + "step": 4129 + }, + { + "epoch": 0.5067484662576687, + "grad_norm": 0.8674625945727186, + "learning_rate": 1.0268225264444829e-05, + "loss": 0.5593, + "step": 4130 + }, + { + "epoch": 0.5068711656441718, + "grad_norm": 0.9161548810422387, + "learning_rate": 1.0264252488978986e-05, + "loss": 0.5744, + "step": 4131 + }, + { + "epoch": 0.5069938650306749, + "grad_norm": 0.8433838753313854, + "learning_rate": 1.0260279671776735e-05, + "loss": 0.5649, + "step": 4132 + }, + { + "epoch": 0.5071165644171779, + "grad_norm": 0.9459920524608816, + "learning_rate": 1.0256306813465545e-05, + "loss": 0.5726, + "step": 4133 + }, + { + "epoch": 0.507239263803681, + "grad_norm": 0.959352950968489, + "learning_rate": 1.0252333914672899e-05, + "loss": 0.5419, + "step": 4134 + }, + { + "epoch": 0.507361963190184, + "grad_norm": 0.9489077088412305, + "learning_rate": 1.0248360976026279e-05, + "loss": 0.5536, + "step": 4135 + }, + { + "epoch": 0.5074846625766871, + "grad_norm": 0.9631757726710853, + "learning_rate": 1.0244387998153179e-05, + "loss": 0.5843, + "step": 4136 + }, + { + "epoch": 0.5076073619631902, + "grad_norm": 1.3765874859282168, + "learning_rate": 1.0240414981681097e-05, + "loss": 0.621, + "step": 4137 + }, + { + "epoch": 0.5077300613496932, + "grad_norm": 1.0133540121351912, + "learning_rate": 1.0236441927237534e-05, + "loss": 0.6437, + "step": 4138 + }, + { + "epoch": 0.5078527607361963, + "grad_norm": 0.7986276324887323, + "learning_rate": 1.0232468835450002e-05, + "loss": 0.5808, + "step": 4139 + }, + { + "epoch": 0.5079754601226993, + "grad_norm": 0.9551453162962029, + "learning_rate": 1.0228495706946015e-05, + "loss": 0.5535, + "step": 4140 + }, + { + "epoch": 0.5080981595092025, + "grad_norm": 0.943675815917841, + "learning_rate": 1.0224522542353097e-05, + "loss": 0.6032, + "step": 4141 + }, + { + "epoch": 0.5082208588957056, + "grad_norm": 0.7952563902427068, + "learning_rate": 1.0220549342298774e-05, + "loss": 0.5229, + "step": 4142 + }, + { + "epoch": 0.5083435582822086, + "grad_norm": 0.885948731225533, + "learning_rate": 1.0216576107410578e-05, + "loss": 0.5533, + "step": 
4143 + }, + { + "epoch": 0.5084662576687117, + "grad_norm": 0.9178304288618164, + "learning_rate": 1.0212602838316046e-05, + "loss": 0.5186, + "step": 4144 + }, + { + "epoch": 0.5085889570552147, + "grad_norm": 0.9484431203000457, + "learning_rate": 1.0208629535642726e-05, + "loss": 0.5475, + "step": 4145 + }, + { + "epoch": 0.5087116564417178, + "grad_norm": 0.8974189134140688, + "learning_rate": 1.0204656200018164e-05, + "loss": 0.5222, + "step": 4146 + }, + { + "epoch": 0.5088343558282209, + "grad_norm": 0.9531531498676123, + "learning_rate": 1.0200682832069913e-05, + "loss": 0.5788, + "step": 4147 + }, + { + "epoch": 0.5089570552147239, + "grad_norm": 0.9185161286472543, + "learning_rate": 1.0196709432425535e-05, + "loss": 0.6149, + "step": 4148 + }, + { + "epoch": 0.509079754601227, + "grad_norm": 0.9737677382930128, + "learning_rate": 1.0192736001712596e-05, + "loss": 0.5853, + "step": 4149 + }, + { + "epoch": 0.50920245398773, + "grad_norm": 0.8634512238369613, + "learning_rate": 1.0188762540558657e-05, + "loss": 0.5571, + "step": 4150 + }, + { + "epoch": 0.5093251533742331, + "grad_norm": 0.9206870954841713, + "learning_rate": 1.01847890495913e-05, + "loss": 0.5943, + "step": 4151 + }, + { + "epoch": 0.5094478527607362, + "grad_norm": 0.953924278580398, + "learning_rate": 1.0180815529438101e-05, + "loss": 0.6011, + "step": 4152 + }, + { + "epoch": 0.5095705521472392, + "grad_norm": 0.8993662240910112, + "learning_rate": 1.0176841980726643e-05, + "loss": 0.6077, + "step": 4153 + }, + { + "epoch": 0.5096932515337423, + "grad_norm": 0.8468230522360041, + "learning_rate": 1.0172868404084518e-05, + "loss": 0.5642, + "step": 4154 + }, + { + "epoch": 0.5098159509202455, + "grad_norm": 0.9107944712533801, + "learning_rate": 1.0168894800139311e-05, + "loss": 0.5677, + "step": 4155 + }, + { + "epoch": 0.5099386503067485, + "grad_norm": 0.8035603224704897, + "learning_rate": 1.0164921169518624e-05, + "loss": 0.5795, + "step": 4156 + }, + { + "epoch": 0.5100613496932516, + "grad_norm": 0.913709096999965, + "learning_rate": 1.0160947512850057e-05, + "loss": 0.5743, + "step": 4157 + }, + { + "epoch": 0.5101840490797546, + "grad_norm": 0.9396828884978855, + "learning_rate": 1.0156973830761215e-05, + "loss": 0.6161, + "step": 4158 + }, + { + "epoch": 0.5103067484662577, + "grad_norm": 0.9858343300238426, + "learning_rate": 1.0153000123879704e-05, + "loss": 0.5711, + "step": 4159 + }, + { + "epoch": 0.5104294478527608, + "grad_norm": 0.8989979054635242, + "learning_rate": 1.0149026392833137e-05, + "loss": 0.5754, + "step": 4160 + }, + { + "epoch": 0.5105521472392638, + "grad_norm": 0.8730159573904434, + "learning_rate": 1.0145052638249135e-05, + "loss": 0.6309, + "step": 4161 + }, + { + "epoch": 0.5106748466257669, + "grad_norm": 0.9157113687805856, + "learning_rate": 1.0141078860755316e-05, + "loss": 0.6008, + "step": 4162 + }, + { + "epoch": 0.5107975460122699, + "grad_norm": 0.8437409280908825, + "learning_rate": 1.0137105060979301e-05, + "loss": 0.5848, + "step": 4163 + }, + { + "epoch": 0.510920245398773, + "grad_norm": 0.8630496307225166, + "learning_rate": 1.0133131239548721e-05, + "loss": 0.5123, + "step": 4164 + }, + { + "epoch": 0.5110429447852761, + "grad_norm": 0.7858461125617217, + "learning_rate": 1.0129157397091208e-05, + "loss": 0.5409, + "step": 4165 + }, + { + "epoch": 0.5111656441717791, + "grad_norm": 0.8356070262583233, + "learning_rate": 1.0125183534234392e-05, + "loss": 0.6175, + "step": 4166 + }, + { + "epoch": 0.5112883435582822, + "grad_norm": 0.8908955008643374, + 
"learning_rate": 1.0121209651605916e-05, + "loss": 0.544, + "step": 4167 + }, + { + "epoch": 0.5114110429447852, + "grad_norm": 1.3142876080892552, + "learning_rate": 1.0117235749833419e-05, + "loss": 0.5694, + "step": 4168 + }, + { + "epoch": 0.5115337423312883, + "grad_norm": 0.8194372905156164, + "learning_rate": 1.0113261829544541e-05, + "loss": 0.5977, + "step": 4169 + }, + { + "epoch": 0.5116564417177915, + "grad_norm": 0.9657929718261294, + "learning_rate": 1.010928789136693e-05, + "loss": 0.6084, + "step": 4170 + }, + { + "epoch": 0.5117791411042945, + "grad_norm": 0.871442912185482, + "learning_rate": 1.0105313935928235e-05, + "loss": 0.5511, + "step": 4171 + }, + { + "epoch": 0.5119018404907976, + "grad_norm": 0.9439920427037076, + "learning_rate": 1.0101339963856112e-05, + "loss": 0.5975, + "step": 4172 + }, + { + "epoch": 0.5120245398773006, + "grad_norm": 0.8619483283041264, + "learning_rate": 1.009736597577821e-05, + "loss": 0.5494, + "step": 4173 + }, + { + "epoch": 0.5121472392638037, + "grad_norm": 1.024842924153004, + "learning_rate": 1.0093391972322193e-05, + "loss": 0.5663, + "step": 4174 + }, + { + "epoch": 0.5122699386503068, + "grad_norm": 0.9659459678553617, + "learning_rate": 1.0089417954115715e-05, + "loss": 0.5657, + "step": 4175 + }, + { + "epoch": 0.5123926380368098, + "grad_norm": 0.9898966841613388, + "learning_rate": 1.0085443921786438e-05, + "loss": 0.6116, + "step": 4176 + }, + { + "epoch": 0.5125153374233129, + "grad_norm": 0.8438290000678658, + "learning_rate": 1.008146987596203e-05, + "loss": 0.5786, + "step": 4177 + }, + { + "epoch": 0.5126380368098159, + "grad_norm": 0.8651272180439827, + "learning_rate": 1.0077495817270155e-05, + "loss": 0.6269, + "step": 4178 + }, + { + "epoch": 0.512760736196319, + "grad_norm": 0.9401026174130735, + "learning_rate": 1.0073521746338482e-05, + "loss": 0.5483, + "step": 4179 + }, + { + "epoch": 0.5128834355828221, + "grad_norm": 0.8908319467764654, + "learning_rate": 1.0069547663794682e-05, + "loss": 0.5912, + "step": 4180 + }, + { + "epoch": 0.5130061349693251, + "grad_norm": 0.911343633081141, + "learning_rate": 1.0065573570266424e-05, + "loss": 0.5335, + "step": 4181 + }, + { + "epoch": 0.5131288343558282, + "grad_norm": 1.0813298653788905, + "learning_rate": 1.0061599466381388e-05, + "loss": 0.603, + "step": 4182 + }, + { + "epoch": 0.5132515337423312, + "grad_norm": 0.8740614392847137, + "learning_rate": 1.0057625352767244e-05, + "loss": 0.5967, + "step": 4183 + }, + { + "epoch": 0.5133742331288343, + "grad_norm": 0.8117803707317258, + "learning_rate": 1.0053651230051671e-05, + "loss": 0.5929, + "step": 4184 + }, + { + "epoch": 0.5134969325153375, + "grad_norm": 0.8865623791289746, + "learning_rate": 1.0049677098862347e-05, + "loss": 0.5712, + "step": 4185 + }, + { + "epoch": 0.5136196319018405, + "grad_norm": 0.8458823433930199, + "learning_rate": 1.0045702959826953e-05, + "loss": 0.5569, + "step": 4186 + }, + { + "epoch": 0.5137423312883436, + "grad_norm": 1.0196251300442996, + "learning_rate": 1.0041728813573168e-05, + "loss": 0.5666, + "step": 4187 + }, + { + "epoch": 0.5138650306748467, + "grad_norm": 0.8037460573478293, + "learning_rate": 1.0037754660728677e-05, + "loss": 0.5345, + "step": 4188 + }, + { + "epoch": 0.5139877300613497, + "grad_norm": 0.8836269125525082, + "learning_rate": 1.0033780501921164e-05, + "loss": 0.5742, + "step": 4189 + }, + { + "epoch": 0.5141104294478528, + "grad_norm": 0.8810333536384602, + "learning_rate": 1.002980633777831e-05, + "loss": 0.5833, + "step": 4190 + }, + { + 
"epoch": 0.5142331288343558, + "grad_norm": 0.9459807505518173, + "learning_rate": 1.0025832168927798e-05, + "loss": 0.5717, + "step": 4191 + }, + { + "epoch": 0.5143558282208589, + "grad_norm": 0.8791708984356781, + "learning_rate": 1.002185799599732e-05, + "loss": 0.5425, + "step": 4192 + }, + { + "epoch": 0.514478527607362, + "grad_norm": 0.9251187956472261, + "learning_rate": 1.0017883819614558e-05, + "loss": 0.5567, + "step": 4193 + }, + { + "epoch": 0.514601226993865, + "grad_norm": 0.9799162692845843, + "learning_rate": 1.00139096404072e-05, + "loss": 0.5755, + "step": 4194 + }, + { + "epoch": 0.5147239263803681, + "grad_norm": 0.8152602039730893, + "learning_rate": 1.0009935459002935e-05, + "loss": 0.5777, + "step": 4195 + }, + { + "epoch": 0.5148466257668711, + "grad_norm": 0.9494071555725125, + "learning_rate": 1.000596127602945e-05, + "loss": 0.5687, + "step": 4196 + }, + { + "epoch": 0.5149693251533742, + "grad_norm": 0.8865659553119689, + "learning_rate": 1.0001987092114431e-05, + "loss": 0.6094, + "step": 4197 + }, + { + "epoch": 0.5150920245398773, + "grad_norm": 0.9313659190578726, + "learning_rate": 9.998012907885569e-06, + "loss": 0.5623, + "step": 4198 + }, + { + "epoch": 0.5152147239263803, + "grad_norm": 0.9124445727605991, + "learning_rate": 9.994038723970551e-06, + "loss": 0.5574, + "step": 4199 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 0.8334769819250005, + "learning_rate": 9.990064540997066e-06, + "loss": 0.5895, + "step": 4200 + }, + { + "epoch": 0.5154601226993865, + "grad_norm": 0.8066220875149127, + "learning_rate": 9.9860903595928e-06, + "loss": 0.6374, + "step": 4201 + }, + { + "epoch": 0.5155828220858896, + "grad_norm": 1.1313223456103894, + "learning_rate": 9.982116180385444e-06, + "loss": 0.5985, + "step": 4202 + }, + { + "epoch": 0.5157055214723927, + "grad_norm": 0.9155723244182704, + "learning_rate": 9.978142004002681e-06, + "loss": 0.5808, + "step": 4203 + }, + { + "epoch": 0.5158282208588957, + "grad_norm": 0.9930493769990258, + "learning_rate": 9.974167831072204e-06, + "loss": 0.5645, + "step": 4204 + }, + { + "epoch": 0.5159509202453988, + "grad_norm": 0.9777389531442477, + "learning_rate": 9.970193662221694e-06, + "loss": 0.6001, + "step": 4205 + }, + { + "epoch": 0.5160736196319018, + "grad_norm": 0.8728165397178955, + "learning_rate": 9.966219498078839e-06, + "loss": 0.6115, + "step": 4206 + }, + { + "epoch": 0.5161963190184049, + "grad_norm": 0.9153407854527196, + "learning_rate": 9.962245339271324e-06, + "loss": 0.5836, + "step": 4207 + }, + { + "epoch": 0.516319018404908, + "grad_norm": 0.8634473575082949, + "learning_rate": 9.958271186426834e-06, + "loss": 0.6258, + "step": 4208 + }, + { + "epoch": 0.516441717791411, + "grad_norm": 0.8492199128949459, + "learning_rate": 9.95429704017305e-06, + "loss": 0.5987, + "step": 4209 + }, + { + "epoch": 0.5165644171779141, + "grad_norm": 1.0898965089307278, + "learning_rate": 9.950322901137655e-06, + "loss": 0.5996, + "step": 4210 + }, + { + "epoch": 0.5166871165644171, + "grad_norm": 0.8357260162211451, + "learning_rate": 9.946348769948332e-06, + "loss": 0.5908, + "step": 4211 + }, + { + "epoch": 0.5168098159509202, + "grad_norm": 0.8845607537585085, + "learning_rate": 9.94237464723276e-06, + "loss": 0.5939, + "step": 4212 + }, + { + "epoch": 0.5169325153374233, + "grad_norm": 0.7783230538116037, + "learning_rate": 9.938400533618615e-06, + "loss": 0.5679, + "step": 4213 + }, + { + "epoch": 0.5170552147239263, + "grad_norm": 0.8273105631381191, + "learning_rate": 9.934426429733577e-06, + 
"loss": 0.5811, + "step": 4214 + }, + { + "epoch": 0.5171779141104295, + "grad_norm": 0.9387735423298091, + "learning_rate": 9.93045233620532e-06, + "loss": 0.5917, + "step": 4215 + }, + { + "epoch": 0.5173006134969325, + "grad_norm": 0.8120489544704073, + "learning_rate": 9.92647825366152e-06, + "loss": 0.551, + "step": 4216 + }, + { + "epoch": 0.5174233128834356, + "grad_norm": 0.8927059189525182, + "learning_rate": 9.922504182729848e-06, + "loss": 0.5358, + "step": 4217 + }, + { + "epoch": 0.5175460122699387, + "grad_norm": 0.9506770538690666, + "learning_rate": 9.918530124037972e-06, + "loss": 0.623, + "step": 4218 + }, + { + "epoch": 0.5176687116564417, + "grad_norm": 0.9064348564957836, + "learning_rate": 9.914556078213567e-06, + "loss": 0.5846, + "step": 4219 + }, + { + "epoch": 0.5177914110429448, + "grad_norm": 0.9071947459517549, + "learning_rate": 9.910582045884292e-06, + "loss": 0.5799, + "step": 4220 + }, + { + "epoch": 0.5179141104294479, + "grad_norm": 0.8423058675704651, + "learning_rate": 9.906608027677812e-06, + "loss": 0.5876, + "step": 4221 + }, + { + "epoch": 0.5180368098159509, + "grad_norm": 0.9310343320881173, + "learning_rate": 9.902634024221795e-06, + "loss": 0.5975, + "step": 4222 + }, + { + "epoch": 0.518159509202454, + "grad_norm": 0.8921186442574828, + "learning_rate": 9.898660036143893e-06, + "loss": 0.5858, + "step": 4223 + }, + { + "epoch": 0.518282208588957, + "grad_norm": 0.8713501563633081, + "learning_rate": 9.89468606407177e-06, + "loss": 0.5586, + "step": 4224 + }, + { + "epoch": 0.5184049079754601, + "grad_norm": 0.7783477386952803, + "learning_rate": 9.890712108633076e-06, + "loss": 0.5683, + "step": 4225 + }, + { + "epoch": 0.5185276073619632, + "grad_norm": 0.9031700732697568, + "learning_rate": 9.886738170455464e-06, + "loss": 0.6175, + "step": 4226 + }, + { + "epoch": 0.5186503067484662, + "grad_norm": 0.8621450536930743, + "learning_rate": 9.882764250166584e-06, + "loss": 0.5883, + "step": 4227 + }, + { + "epoch": 0.5187730061349694, + "grad_norm": 1.0202868429145433, + "learning_rate": 9.878790348394086e-06, + "loss": 0.5643, + "step": 4228 + }, + { + "epoch": 0.5188957055214724, + "grad_norm": 0.8820772444848289, + "learning_rate": 9.874816465765607e-06, + "loss": 0.6236, + "step": 4229 + }, + { + "epoch": 0.5190184049079755, + "grad_norm": 0.8420058021588513, + "learning_rate": 9.870842602908794e-06, + "loss": 0.5628, + "step": 4230 + }, + { + "epoch": 0.5191411042944786, + "grad_norm": 0.8404250626199268, + "learning_rate": 9.86686876045128e-06, + "loss": 0.6395, + "step": 4231 + }, + { + "epoch": 0.5192638036809816, + "grad_norm": 0.9714772549937599, + "learning_rate": 9.862894939020702e-06, + "loss": 0.5613, + "step": 4232 + }, + { + "epoch": 0.5193865030674847, + "grad_norm": 0.8432926702507303, + "learning_rate": 9.858921139244689e-06, + "loss": 0.5823, + "step": 4233 + }, + { + "epoch": 0.5195092024539877, + "grad_norm": 1.1835878268749767, + "learning_rate": 9.854947361750868e-06, + "loss": 0.5895, + "step": 4234 + }, + { + "epoch": 0.5196319018404908, + "grad_norm": 0.8257255848328635, + "learning_rate": 9.850973607166865e-06, + "loss": 0.5773, + "step": 4235 + }, + { + "epoch": 0.5197546012269939, + "grad_norm": 0.8749717663084999, + "learning_rate": 9.8469998761203e-06, + "loss": 0.5787, + "step": 4236 + }, + { + "epoch": 0.5198773006134969, + "grad_norm": 0.8716308704186567, + "learning_rate": 9.843026169238789e-06, + "loss": 0.517, + "step": 4237 + }, + { + "epoch": 0.52, + "grad_norm": 0.8538733234588647, + "learning_rate": 
9.839052487149947e-06, + "loss": 0.5522, + "step": 4238 + }, + { + "epoch": 0.520122699386503, + "grad_norm": 0.8632435786302167, + "learning_rate": 9.835078830481378e-06, + "loss": 0.6046, + "step": 4239 + }, + { + "epoch": 0.5202453987730061, + "grad_norm": 0.9236454714964564, + "learning_rate": 9.83110519986069e-06, + "loss": 0.6011, + "step": 4240 + }, + { + "epoch": 0.5203680981595092, + "grad_norm": 0.8441483875097697, + "learning_rate": 9.827131595915486e-06, + "loss": 0.5329, + "step": 4241 + }, + { + "epoch": 0.5204907975460122, + "grad_norm": 1.1446728778053212, + "learning_rate": 9.823158019273358e-06, + "loss": 0.5807, + "step": 4242 + }, + { + "epoch": 0.5206134969325154, + "grad_norm": 0.9723897596273848, + "learning_rate": 9.819184470561902e-06, + "loss": 0.5133, + "step": 4243 + }, + { + "epoch": 0.5207361963190184, + "grad_norm": 0.9326459471157301, + "learning_rate": 9.815210950408703e-06, + "loss": 0.584, + "step": 4244 + }, + { + "epoch": 0.5208588957055215, + "grad_norm": 1.0046677281352236, + "learning_rate": 9.811237459441346e-06, + "loss": 0.6184, + "step": 4245 + }, + { + "epoch": 0.5209815950920246, + "grad_norm": 0.9194108436571146, + "learning_rate": 9.80726399828741e-06, + "loss": 0.5597, + "step": 4246 + }, + { + "epoch": 0.5211042944785276, + "grad_norm": 0.880733234245664, + "learning_rate": 9.803290567574468e-06, + "loss": 0.6182, + "step": 4247 + }, + { + "epoch": 0.5212269938650307, + "grad_norm": 0.8941359400696923, + "learning_rate": 9.79931716793009e-06, + "loss": 0.582, + "step": 4248 + }, + { + "epoch": 0.5213496932515338, + "grad_norm": 0.9023722468402203, + "learning_rate": 9.79534379998184e-06, + "loss": 0.592, + "step": 4249 + }, + { + "epoch": 0.5214723926380368, + "grad_norm": 0.844191436183329, + "learning_rate": 9.791370464357279e-06, + "loss": 0.6163, + "step": 4250 + }, + { + "epoch": 0.5215950920245399, + "grad_norm": 0.920204306338685, + "learning_rate": 9.787397161683959e-06, + "loss": 0.597, + "step": 4251 + }, + { + "epoch": 0.5217177914110429, + "grad_norm": 0.9881737963428348, + "learning_rate": 9.783423892589428e-06, + "loss": 0.6322, + "step": 4252 + }, + { + "epoch": 0.521840490797546, + "grad_norm": 0.9643922613143182, + "learning_rate": 9.779450657701227e-06, + "loss": 0.6108, + "step": 4253 + }, + { + "epoch": 0.5219631901840491, + "grad_norm": 0.84889371987982, + "learning_rate": 9.775477457646902e-06, + "loss": 0.63, + "step": 4254 + }, + { + "epoch": 0.5220858895705521, + "grad_norm": 0.9329355284641412, + "learning_rate": 9.771504293053985e-06, + "loss": 0.5686, + "step": 4255 + }, + { + "epoch": 0.5222085889570552, + "grad_norm": 0.9695608562475986, + "learning_rate": 9.76753116455e-06, + "loss": 0.6084, + "step": 4256 + }, + { + "epoch": 0.5223312883435582, + "grad_norm": 0.8463819687877435, + "learning_rate": 9.763558072762467e-06, + "loss": 0.5579, + "step": 4257 + }, + { + "epoch": 0.5224539877300614, + "grad_norm": 1.0088044399907268, + "learning_rate": 9.759585018318906e-06, + "loss": 0.5657, + "step": 4258 + }, + { + "epoch": 0.5225766871165645, + "grad_norm": 1.0040126364410005, + "learning_rate": 9.755612001846823e-06, + "loss": 0.6449, + "step": 4259 + }, + { + "epoch": 0.5226993865030675, + "grad_norm": 0.848964686394128, + "learning_rate": 9.751639023973724e-06, + "loss": 0.579, + "step": 4260 + }, + { + "epoch": 0.5228220858895706, + "grad_norm": 0.8401067115947852, + "learning_rate": 9.747666085327104e-06, + "loss": 0.5647, + "step": 4261 + }, + { + "epoch": 0.5229447852760736, + "grad_norm": 
0.9499493081518131, + "learning_rate": 9.743693186534458e-06, + "loss": 0.5572, + "step": 4262 + }, + { + "epoch": 0.5230674846625767, + "grad_norm": 0.8616421037593969, + "learning_rate": 9.73972032822327e-06, + "loss": 0.6035, + "step": 4263 + }, + { + "epoch": 0.5231901840490798, + "grad_norm": 1.15078683284394, + "learning_rate": 9.735747511021017e-06, + "loss": 0.5801, + "step": 4264 + }, + { + "epoch": 0.5233128834355828, + "grad_norm": 1.0051206970697064, + "learning_rate": 9.731774735555174e-06, + "loss": 0.5494, + "step": 4265 + }, + { + "epoch": 0.5234355828220859, + "grad_norm": 0.9141179760316209, + "learning_rate": 9.727802002453205e-06, + "loss": 0.5929, + "step": 4266 + }, + { + "epoch": 0.5235582822085889, + "grad_norm": 0.8711701916101952, + "learning_rate": 9.72382931234257e-06, + "loss": 0.5756, + "step": 4267 + }, + { + "epoch": 0.523680981595092, + "grad_norm": 1.0450340672452616, + "learning_rate": 9.719856665850724e-06, + "loss": 0.5909, + "step": 4268 + }, + { + "epoch": 0.5238036809815951, + "grad_norm": 1.1837783176823335, + "learning_rate": 9.715884063605106e-06, + "loss": 0.6342, + "step": 4269 + }, + { + "epoch": 0.5239263803680981, + "grad_norm": 1.029473967521579, + "learning_rate": 9.711911506233157e-06, + "loss": 0.5554, + "step": 4270 + }, + { + "epoch": 0.5240490797546012, + "grad_norm": 0.8528426308164803, + "learning_rate": 9.707938994362309e-06, + "loss": 0.5865, + "step": 4271 + }, + { + "epoch": 0.5241717791411042, + "grad_norm": 0.8851415412565555, + "learning_rate": 9.703966528619985e-06, + "loss": 0.5677, + "step": 4272 + }, + { + "epoch": 0.5242944785276074, + "grad_norm": 0.9611034630756163, + "learning_rate": 9.699994109633604e-06, + "loss": 0.5869, + "step": 4273 + }, + { + "epoch": 0.5244171779141105, + "grad_norm": 0.8448906179390953, + "learning_rate": 9.696021738030575e-06, + "loss": 0.5765, + "step": 4274 + }, + { + "epoch": 0.5245398773006135, + "grad_norm": 0.9683120669885255, + "learning_rate": 9.692049414438298e-06, + "loss": 0.5636, + "step": 4275 + }, + { + "epoch": 0.5246625766871166, + "grad_norm": 0.8643174211229647, + "learning_rate": 9.68807713948417e-06, + "loss": 0.6019, + "step": 4276 + }, + { + "epoch": 0.5247852760736196, + "grad_norm": 0.8989846052208987, + "learning_rate": 9.684104913795575e-06, + "loss": 0.6, + "step": 4277 + }, + { + "epoch": 0.5249079754601227, + "grad_norm": 0.8343121128094648, + "learning_rate": 9.680132737999891e-06, + "loss": 0.551, + "step": 4278 + }, + { + "epoch": 0.5250306748466258, + "grad_norm": 0.8652745303924864, + "learning_rate": 9.676160612724494e-06, + "loss": 0.5815, + "step": 4279 + }, + { + "epoch": 0.5251533742331288, + "grad_norm": 0.8930611909411619, + "learning_rate": 9.672188538596746e-06, + "loss": 0.5852, + "step": 4280 + }, + { + "epoch": 0.5252760736196319, + "grad_norm": 0.9053603480795893, + "learning_rate": 9.668216516243993e-06, + "loss": 0.5829, + "step": 4281 + }, + { + "epoch": 0.525398773006135, + "grad_norm": 0.9289579945231988, + "learning_rate": 9.664244546293593e-06, + "loss": 0.5896, + "step": 4282 + }, + { + "epoch": 0.525521472392638, + "grad_norm": 0.8726598864672968, + "learning_rate": 9.660272629372881e-06, + "loss": 0.5851, + "step": 4283 + }, + { + "epoch": 0.5256441717791411, + "grad_norm": 0.9686110406897719, + "learning_rate": 9.656300766109186e-06, + "loss": 0.5931, + "step": 4284 + }, + { + "epoch": 0.5257668711656441, + "grad_norm": 0.9791317095134862, + "learning_rate": 9.652328957129831e-06, + "loss": 0.5737, + "step": 4285 + }, + { + "epoch": 
0.5258895705521472, + "grad_norm": 0.8812825375552931, + "learning_rate": 9.648357203062127e-06, + "loss": 0.5566, + "step": 4286 + }, + { + "epoch": 0.5260122699386504, + "grad_norm": 0.9847045573042694, + "learning_rate": 9.644385504533382e-06, + "loss": 0.5203, + "step": 4287 + }, + { + "epoch": 0.5261349693251534, + "grad_norm": 0.9199181687583601, + "learning_rate": 9.640413862170887e-06, + "loss": 0.6534, + "step": 4288 + }, + { + "epoch": 0.5262576687116565, + "grad_norm": 0.8976910648720504, + "learning_rate": 9.636442276601932e-06, + "loss": 0.5186, + "step": 4289 + }, + { + "epoch": 0.5263803680981595, + "grad_norm": 0.9701042745016812, + "learning_rate": 9.632470748453794e-06, + "loss": 0.6279, + "step": 4290 + }, + { + "epoch": 0.5265030674846626, + "grad_norm": 0.8762554741478604, + "learning_rate": 9.62849927835374e-06, + "loss": 0.624, + "step": 4291 + }, + { + "epoch": 0.5266257668711657, + "grad_norm": 1.2426199145512133, + "learning_rate": 9.624527866929033e-06, + "loss": 0.5919, + "step": 4292 + }, + { + "epoch": 0.5267484662576687, + "grad_norm": 0.8882452006516405, + "learning_rate": 9.62055651480692e-06, + "loss": 0.5769, + "step": 4293 + }, + { + "epoch": 0.5268711656441718, + "grad_norm": 0.84307067168205, + "learning_rate": 9.616585222614644e-06, + "loss": 0.5607, + "step": 4294 + }, + { + "epoch": 0.5269938650306748, + "grad_norm": 0.8157292049159339, + "learning_rate": 9.612613990979436e-06, + "loss": 0.5771, + "step": 4295 + }, + { + "epoch": 0.5271165644171779, + "grad_norm": 0.9033320065837888, + "learning_rate": 9.608642820528517e-06, + "loss": 0.5708, + "step": 4296 + }, + { + "epoch": 0.527239263803681, + "grad_norm": 0.8525020141815269, + "learning_rate": 9.6046717118891e-06, + "loss": 0.5687, + "step": 4297 + }, + { + "epoch": 0.527361963190184, + "grad_norm": 1.023197887387874, + "learning_rate": 9.600700665688388e-06, + "loss": 0.5625, + "step": 4298 + }, + { + "epoch": 0.5274846625766871, + "grad_norm": 0.9164952478552542, + "learning_rate": 9.596729682553576e-06, + "loss": 0.6015, + "step": 4299 + }, + { + "epoch": 0.5276073619631901, + "grad_norm": 1.1488118393821842, + "learning_rate": 9.59275876311184e-06, + "loss": 0.6008, + "step": 4300 + }, + { + "epoch": 0.5277300613496932, + "grad_norm": 0.9537828693540807, + "learning_rate": 9.588787907990356e-06, + "loss": 0.5826, + "step": 4301 + }, + { + "epoch": 0.5278527607361964, + "grad_norm": 1.0042823704725192, + "learning_rate": 9.584817117816288e-06, + "loss": 0.6325, + "step": 4302 + }, + { + "epoch": 0.5279754601226994, + "grad_norm": 0.8380228545692372, + "learning_rate": 9.580846393216784e-06, + "loss": 0.6037, + "step": 4303 + }, + { + "epoch": 0.5280981595092025, + "grad_norm": 1.053249093289344, + "learning_rate": 9.576875734818993e-06, + "loss": 0.5577, + "step": 4304 + }, + { + "epoch": 0.5282208588957055, + "grad_norm": 0.9180295656663952, + "learning_rate": 9.572905143250039e-06, + "loss": 0.5369, + "step": 4305 + }, + { + "epoch": 0.5283435582822086, + "grad_norm": 1.475390559830284, + "learning_rate": 9.568934619137048e-06, + "loss": 0.574, + "step": 4306 + }, + { + "epoch": 0.5284662576687117, + "grad_norm": 0.9744858342870557, + "learning_rate": 9.564964163107125e-06, + "loss": 0.5485, + "step": 4307 + }, + { + "epoch": 0.5285889570552147, + "grad_norm": 0.8702218965241452, + "learning_rate": 9.560993775787373e-06, + "loss": 0.558, + "step": 4308 + }, + { + "epoch": 0.5287116564417178, + "grad_norm": 0.9061660443011992, + "learning_rate": 9.55702345780488e-06, + "loss": 0.5823, + 
"step": 4309 + }, + { + "epoch": 0.5288343558282208, + "grad_norm": 0.8352146682456539, + "learning_rate": 9.553053209786725e-06, + "loss": 0.5916, + "step": 4310 + }, + { + "epoch": 0.5289570552147239, + "grad_norm": 1.0210585493992257, + "learning_rate": 9.549083032359972e-06, + "loss": 0.6004, + "step": 4311 + }, + { + "epoch": 0.529079754601227, + "grad_norm": 0.8988762329246424, + "learning_rate": 9.545112926151678e-06, + "loss": 0.5702, + "step": 4312 + }, + { + "epoch": 0.52920245398773, + "grad_norm": 0.8984575197937227, + "learning_rate": 9.541142891788887e-06, + "loss": 0.5483, + "step": 4313 + }, + { + "epoch": 0.5293251533742331, + "grad_norm": 0.9821180819223211, + "learning_rate": 9.53717292989863e-06, + "loss": 0.595, + "step": 4314 + }, + { + "epoch": 0.5294478527607362, + "grad_norm": 0.8929137981474432, + "learning_rate": 9.53320304110793e-06, + "loss": 0.6048, + "step": 4315 + }, + { + "epoch": 0.5295705521472392, + "grad_norm": 0.8473981158956406, + "learning_rate": 9.529233226043799e-06, + "loss": 0.5429, + "step": 4316 + }, + { + "epoch": 0.5296932515337424, + "grad_norm": 0.8870444804762837, + "learning_rate": 9.52526348533323e-06, + "loss": 0.507, + "step": 4317 + }, + { + "epoch": 0.5298159509202454, + "grad_norm": 1.0331051587906999, + "learning_rate": 9.521293819603216e-06, + "loss": 0.5359, + "step": 4318 + }, + { + "epoch": 0.5299386503067485, + "grad_norm": 0.9124984315801901, + "learning_rate": 9.517324229480724e-06, + "loss": 0.56, + "step": 4319 + }, + { + "epoch": 0.5300613496932516, + "grad_norm": 0.9117720604406807, + "learning_rate": 9.513354715592721e-06, + "loss": 0.5454, + "step": 4320 + }, + { + "epoch": 0.5301840490797546, + "grad_norm": 0.8238330370243281, + "learning_rate": 9.509385278566156e-06, + "loss": 0.5687, + "step": 4321 + }, + { + "epoch": 0.5303067484662577, + "grad_norm": 1.0072568723407291, + "learning_rate": 9.505415919027971e-06, + "loss": 0.5795, + "step": 4322 + }, + { + "epoch": 0.5304294478527607, + "grad_norm": 0.8427724449173857, + "learning_rate": 9.501446637605087e-06, + "loss": 0.5682, + "step": 4323 + }, + { + "epoch": 0.5305521472392638, + "grad_norm": 0.9312915864461498, + "learning_rate": 9.49747743492442e-06, + "loss": 0.5841, + "step": 4324 + }, + { + "epoch": 0.5306748466257669, + "grad_norm": 0.9572355615644804, + "learning_rate": 9.493508311612874e-06, + "loss": 0.5607, + "step": 4325 + }, + { + "epoch": 0.5307975460122699, + "grad_norm": 0.8839075637999401, + "learning_rate": 9.489539268297335e-06, + "loss": 0.522, + "step": 4326 + }, + { + "epoch": 0.530920245398773, + "grad_norm": 0.8370958942593687, + "learning_rate": 9.48557030560468e-06, + "loss": 0.5977, + "step": 4327 + }, + { + "epoch": 0.531042944785276, + "grad_norm": 0.8945047426687234, + "learning_rate": 9.481601424161772e-06, + "loss": 0.5236, + "step": 4328 + }, + { + "epoch": 0.5311656441717791, + "grad_norm": 0.8356361721768116, + "learning_rate": 9.477632624595466e-06, + "loss": 0.5471, + "step": 4329 + }, + { + "epoch": 0.5312883435582823, + "grad_norm": 0.943007845909241, + "learning_rate": 9.473663907532593e-06, + "loss": 0.5582, + "step": 4330 + }, + { + "epoch": 0.5314110429447853, + "grad_norm": 1.1868936869632474, + "learning_rate": 9.469695273599982e-06, + "loss": 0.5818, + "step": 4331 + }, + { + "epoch": 0.5315337423312884, + "grad_norm": 0.9112634389123234, + "learning_rate": 9.465726723424441e-06, + "loss": 0.6124, + "step": 4332 + }, + { + "epoch": 0.5316564417177914, + "grad_norm": 0.890981191073261, + "learning_rate": 
9.461758257632772e-06, + "loss": 0.5898, + "step": 4333 + }, + { + "epoch": 0.5317791411042945, + "grad_norm": 0.8167795809258039, + "learning_rate": 9.457789876851759e-06, + "loss": 0.5368, + "step": 4334 + }, + { + "epoch": 0.5319018404907976, + "grad_norm": 0.8290306466732179, + "learning_rate": 9.453821581708174e-06, + "loss": 0.6057, + "step": 4335 + }, + { + "epoch": 0.5320245398773006, + "grad_norm": 0.809259807072771, + "learning_rate": 9.44985337282877e-06, + "loss": 0.6598, + "step": 4336 + }, + { + "epoch": 0.5321472392638037, + "grad_norm": 0.9601876622658689, + "learning_rate": 9.445885250840301e-06, + "loss": 0.593, + "step": 4337 + }, + { + "epoch": 0.5322699386503067, + "grad_norm": 0.947538211903336, + "learning_rate": 9.441917216369491e-06, + "loss": 0.5981, + "step": 4338 + }, + { + "epoch": 0.5323926380368098, + "grad_norm": 0.995838755897113, + "learning_rate": 9.43794927004306e-06, + "loss": 0.5725, + "step": 4339 + }, + { + "epoch": 0.5325153374233129, + "grad_norm": 1.04032312293022, + "learning_rate": 9.433981412487711e-06, + "loss": 0.6049, + "step": 4340 + }, + { + "epoch": 0.5326380368098159, + "grad_norm": 0.867569471626723, + "learning_rate": 9.43001364433013e-06, + "loss": 0.525, + "step": 4341 + }, + { + "epoch": 0.532760736196319, + "grad_norm": 0.8623353628224149, + "learning_rate": 9.426045966196992e-06, + "loss": 0.5887, + "step": 4342 + }, + { + "epoch": 0.5328834355828221, + "grad_norm": 0.9483142143596013, + "learning_rate": 9.422078378714962e-06, + "loss": 0.5781, + "step": 4343 + }, + { + "epoch": 0.5330061349693251, + "grad_norm": 0.930776581597441, + "learning_rate": 9.418110882510683e-06, + "loss": 0.5398, + "step": 4344 + }, + { + "epoch": 0.5331288343558283, + "grad_norm": 0.8294228137830378, + "learning_rate": 9.414143478210786e-06, + "loss": 0.5629, + "step": 4345 + }, + { + "epoch": 0.5332515337423313, + "grad_norm": 0.9459090958879924, + "learning_rate": 9.410176166441892e-06, + "loss": 0.5623, + "step": 4346 + }, + { + "epoch": 0.5333742331288344, + "grad_norm": 1.017296604092439, + "learning_rate": 9.4062089478306e-06, + "loss": 0.5275, + "step": 4347 + }, + { + "epoch": 0.5334969325153375, + "grad_norm": 1.1296767201272524, + "learning_rate": 9.402241823003503e-06, + "loss": 0.6182, + "step": 4348 + }, + { + "epoch": 0.5336196319018405, + "grad_norm": 1.0113309688531589, + "learning_rate": 9.398274792587169e-06, + "loss": 0.5813, + "step": 4349 + }, + { + "epoch": 0.5337423312883436, + "grad_norm": 0.9244401558986175, + "learning_rate": 9.394307857208158e-06, + "loss": 0.5471, + "step": 4350 + }, + { + "epoch": 0.5338650306748466, + "grad_norm": 0.8954620575904432, + "learning_rate": 9.390341017493014e-06, + "loss": 0.5962, + "step": 4351 + }, + { + "epoch": 0.5339877300613497, + "grad_norm": 0.9141951455719061, + "learning_rate": 9.386374274068263e-06, + "loss": 0.5503, + "step": 4352 + }, + { + "epoch": 0.5341104294478528, + "grad_norm": 0.9340019827800186, + "learning_rate": 9.382407627560423e-06, + "loss": 0.5487, + "step": 4353 + }, + { + "epoch": 0.5342331288343558, + "grad_norm": 0.9396218720455425, + "learning_rate": 9.378441078595988e-06, + "loss": 0.5345, + "step": 4354 + }, + { + "epoch": 0.5343558282208589, + "grad_norm": 1.0503057469126285, + "learning_rate": 9.374474627801439e-06, + "loss": 0.5971, + "step": 4355 + }, + { + "epoch": 0.5344785276073619, + "grad_norm": 0.8899804561897248, + "learning_rate": 9.370508275803248e-06, + "loss": 0.5683, + "step": 4356 + }, + { + "epoch": 0.534601226993865, + "grad_norm": 
0.8636440088885338, + "learning_rate": 9.36654202322786e-06, + "loss": 0.5494, + "step": 4357 + }, + { + "epoch": 0.5347239263803681, + "grad_norm": 1.019286717378039, + "learning_rate": 9.362575870701715e-06, + "loss": 0.5852, + "step": 4358 + }, + { + "epoch": 0.5348466257668711, + "grad_norm": 0.952748082051462, + "learning_rate": 9.35860981885123e-06, + "loss": 0.6173, + "step": 4359 + }, + { + "epoch": 0.5349693251533743, + "grad_norm": 1.0074411977379663, + "learning_rate": 9.354643868302813e-06, + "loss": 0.6012, + "step": 4360 + }, + { + "epoch": 0.5350920245398773, + "grad_norm": 0.8969956396778902, + "learning_rate": 9.350678019682847e-06, + "loss": 0.5825, + "step": 4361 + }, + { + "epoch": 0.5352147239263804, + "grad_norm": 0.8775472025200636, + "learning_rate": 9.346712273617704e-06, + "loss": 0.595, + "step": 4362 + }, + { + "epoch": 0.5353374233128835, + "grad_norm": 0.9033375979791144, + "learning_rate": 9.342746630733738e-06, + "loss": 0.5858, + "step": 4363 + }, + { + "epoch": 0.5354601226993865, + "grad_norm": 0.8698394002430587, + "learning_rate": 9.338781091657297e-06, + "loss": 0.5587, + "step": 4364 + }, + { + "epoch": 0.5355828220858896, + "grad_norm": 0.8929199800151273, + "learning_rate": 9.334815657014696e-06, + "loss": 0.5233, + "step": 4365 + }, + { + "epoch": 0.5357055214723926, + "grad_norm": 0.8759696349525469, + "learning_rate": 9.330850327432243e-06, + "loss": 0.5965, + "step": 4366 + }, + { + "epoch": 0.5358282208588957, + "grad_norm": 0.8400795900391423, + "learning_rate": 9.32688510353623e-06, + "loss": 0.5191, + "step": 4367 + }, + { + "epoch": 0.5359509202453988, + "grad_norm": 0.9816743451990181, + "learning_rate": 9.322919985952926e-06, + "loss": 0.6114, + "step": 4368 + }, + { + "epoch": 0.5360736196319018, + "grad_norm": 0.8480773436936393, + "learning_rate": 9.31895497530859e-06, + "loss": 0.5435, + "step": 4369 + }, + { + "epoch": 0.5361963190184049, + "grad_norm": 0.9635623739396498, + "learning_rate": 9.314990072229461e-06, + "loss": 0.5942, + "step": 4370 + }, + { + "epoch": 0.5363190184049079, + "grad_norm": 0.8811195498878455, + "learning_rate": 9.31102527734176e-06, + "loss": 0.5891, + "step": 4371 + }, + { + "epoch": 0.536441717791411, + "grad_norm": 0.874226953042008, + "learning_rate": 9.307060591271694e-06, + "loss": 0.5069, + "step": 4372 + }, + { + "epoch": 0.5365644171779141, + "grad_norm": 0.8281116075060366, + "learning_rate": 9.30309601464545e-06, + "loss": 0.5617, + "step": 4373 + }, + { + "epoch": 0.5366871165644171, + "grad_norm": 0.856358278052138, + "learning_rate": 9.299131548089198e-06, + "loss": 0.5764, + "step": 4374 + }, + { + "epoch": 0.5368098159509203, + "grad_norm": 0.8892265623658975, + "learning_rate": 9.295167192229093e-06, + "loss": 0.5565, + "step": 4375 + }, + { + "epoch": 0.5369325153374234, + "grad_norm": 1.035298179365306, + "learning_rate": 9.291202947691272e-06, + "loss": 0.5839, + "step": 4376 + }, + { + "epoch": 0.5370552147239264, + "grad_norm": 1.2010524156248175, + "learning_rate": 9.28723881510185e-06, + "loss": 0.6174, + "step": 4377 + }, + { + "epoch": 0.5371779141104295, + "grad_norm": 0.8756212190663175, + "learning_rate": 9.28327479508693e-06, + "loss": 0.55, + "step": 4378 + }, + { + "epoch": 0.5373006134969325, + "grad_norm": 0.9316682931274243, + "learning_rate": 9.279310888272596e-06, + "loss": 0.5818, + "step": 4379 + }, + { + "epoch": 0.5374233128834356, + "grad_norm": 0.881082423948891, + "learning_rate": 9.27534709528491e-06, + "loss": 0.5726, + "step": 4380 + }, + { + "epoch": 
0.5375460122699387, + "grad_norm": 0.954886815983628, + "learning_rate": 9.271383416749921e-06, + "loss": 0.5983, + "step": 4381 + }, + { + "epoch": 0.5376687116564417, + "grad_norm": 0.9663080633274992, + "learning_rate": 9.267419853293656e-06, + "loss": 0.5321, + "step": 4382 + }, + { + "epoch": 0.5377914110429448, + "grad_norm": 0.9109959223242343, + "learning_rate": 9.263456405542128e-06, + "loss": 0.5536, + "step": 4383 + }, + { + "epoch": 0.5379141104294478, + "grad_norm": 1.5526673725291844, + "learning_rate": 9.259493074121327e-06, + "loss": 0.5807, + "step": 4384 + }, + { + "epoch": 0.5380368098159509, + "grad_norm": 0.8998902363821281, + "learning_rate": 9.25552985965723e-06, + "loss": 0.5944, + "step": 4385 + }, + { + "epoch": 0.538159509202454, + "grad_norm": 0.9086481466077779, + "learning_rate": 9.251566762775792e-06, + "loss": 0.5474, + "step": 4386 + }, + { + "epoch": 0.538282208588957, + "grad_norm": 1.2185824536248533, + "learning_rate": 9.247603784102947e-06, + "loss": 0.5887, + "step": 4387 + }, + { + "epoch": 0.5384049079754601, + "grad_norm": 0.938194916723247, + "learning_rate": 9.243640924264619e-06, + "loss": 0.5734, + "step": 4388 + }, + { + "epoch": 0.5385276073619631, + "grad_norm": 0.9246020535292067, + "learning_rate": 9.239678183886703e-06, + "loss": 0.5797, + "step": 4389 + }, + { + "epoch": 0.5386503067484663, + "grad_norm": 0.8774878478221507, + "learning_rate": 9.235715563595082e-06, + "loss": 0.5787, + "step": 4390 + }, + { + "epoch": 0.5387730061349694, + "grad_norm": 0.858464527816275, + "learning_rate": 9.231753064015611e-06, + "loss": 0.5681, + "step": 4391 + }, + { + "epoch": 0.5388957055214724, + "grad_norm": 1.1367472220483374, + "learning_rate": 9.227790685774145e-06, + "loss": 0.618, + "step": 4392 + }, + { + "epoch": 0.5390184049079755, + "grad_norm": 0.9741889366780037, + "learning_rate": 9.2238284294965e-06, + "loss": 0.5356, + "step": 4393 + }, + { + "epoch": 0.5391411042944785, + "grad_norm": 0.8338711483265852, + "learning_rate": 9.219866295808481e-06, + "loss": 0.5713, + "step": 4394 + }, + { + "epoch": 0.5392638036809816, + "grad_norm": 0.9600615979382078, + "learning_rate": 9.215904285335876e-06, + "loss": 0.5578, + "step": 4395 + }, + { + "epoch": 0.5393865030674847, + "grad_norm": 0.9096182516720447, + "learning_rate": 9.211942398704447e-06, + "loss": 0.5158, + "step": 4396 + }, + { + "epoch": 0.5395092024539877, + "grad_norm": 0.9035505797755397, + "learning_rate": 9.207980636539943e-06, + "loss": 0.5216, + "step": 4397 + }, + { + "epoch": 0.5396319018404908, + "grad_norm": 0.8594376919098228, + "learning_rate": 9.204018999468086e-06, + "loss": 0.6251, + "step": 4398 + }, + { + "epoch": 0.5397546012269938, + "grad_norm": 0.9109627524308618, + "learning_rate": 9.200057488114585e-06, + "loss": 0.5458, + "step": 4399 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 1.104062552771998, + "learning_rate": 9.196096103105127e-06, + "loss": 0.5911, + "step": 4400 + }, + { + "epoch": 0.54, + "grad_norm": 0.860176233401433, + "learning_rate": 9.192134845065379e-06, + "loss": 0.5364, + "step": 4401 + }, + { + "epoch": 0.540122699386503, + "grad_norm": 1.0122758246040653, + "learning_rate": 9.188173714620985e-06, + "loss": 0.6184, + "step": 4402 + }, + { + "epoch": 0.5402453987730061, + "grad_norm": 0.8693959781435799, + "learning_rate": 9.184212712397574e-06, + "loss": 0.5519, + "step": 4403 + }, + { + "epoch": 0.5403680981595091, + "grad_norm": 0.924482159413508, + "learning_rate": 9.180251839020751e-06, + "loss": 0.579, + "step": 4404 + 
}, + { + "epoch": 0.5404907975460123, + "grad_norm": 0.8851497175403945, + "learning_rate": 9.176291095116104e-06, + "loss": 0.5775, + "step": 4405 + }, + { + "epoch": 0.5406134969325154, + "grad_norm": 0.9302366831638449, + "learning_rate": 9.172330481309195e-06, + "loss": 0.5244, + "step": 4406 + }, + { + "epoch": 0.5407361963190184, + "grad_norm": 0.8877551988167134, + "learning_rate": 9.168369998225572e-06, + "loss": 0.5309, + "step": 4407 + }, + { + "epoch": 0.5408588957055215, + "grad_norm": 0.9701971481412728, + "learning_rate": 9.16440964649076e-06, + "loss": 0.6186, + "step": 4408 + }, + { + "epoch": 0.5409815950920246, + "grad_norm": 0.9639997781466738, + "learning_rate": 9.160449426730261e-06, + "loss": 0.5749, + "step": 4409 + }, + { + "epoch": 0.5411042944785276, + "grad_norm": 0.8353614238334509, + "learning_rate": 9.156489339569555e-06, + "loss": 0.5551, + "step": 4410 + }, + { + "epoch": 0.5412269938650307, + "grad_norm": 0.9314411031106241, + "learning_rate": 9.152529385634106e-06, + "loss": 0.5469, + "step": 4411 + }, + { + "epoch": 0.5413496932515337, + "grad_norm": 0.9920842248077709, + "learning_rate": 9.148569565549355e-06, + "loss": 0.5974, + "step": 4412 + }, + { + "epoch": 0.5414723926380368, + "grad_norm": 0.8716997656600294, + "learning_rate": 9.14460987994072e-06, + "loss": 0.5648, + "step": 4413 + }, + { + "epoch": 0.5415950920245399, + "grad_norm": 0.9765137850836886, + "learning_rate": 9.140650329433602e-06, + "loss": 0.5952, + "step": 4414 + }, + { + "epoch": 0.5417177914110429, + "grad_norm": 0.9132603278810308, + "learning_rate": 9.136690914653377e-06, + "loss": 0.5477, + "step": 4415 + }, + { + "epoch": 0.541840490797546, + "grad_norm": 0.9457104362157008, + "learning_rate": 9.1327316362254e-06, + "loss": 0.5747, + "step": 4416 + }, + { + "epoch": 0.541963190184049, + "grad_norm": 3.2285163575159594, + "learning_rate": 9.128772494775004e-06, + "loss": 0.5839, + "step": 4417 + }, + { + "epoch": 0.5420858895705521, + "grad_norm": 0.9138974530529477, + "learning_rate": 9.1248134909275e-06, + "loss": 0.6003, + "step": 4418 + }, + { + "epoch": 0.5422085889570553, + "grad_norm": 0.911571701629161, + "learning_rate": 9.120854625308184e-06, + "loss": 0.5968, + "step": 4419 + }, + { + "epoch": 0.5423312883435583, + "grad_norm": 0.9557590667016846, + "learning_rate": 9.11689589854232e-06, + "loss": 0.6027, + "step": 4420 + }, + { + "epoch": 0.5424539877300614, + "grad_norm": 0.7752622295104302, + "learning_rate": 9.112937311255158e-06, + "loss": 0.5794, + "step": 4421 + }, + { + "epoch": 0.5425766871165644, + "grad_norm": 0.9835239806119984, + "learning_rate": 9.108978864071922e-06, + "loss": 0.5878, + "step": 4422 + }, + { + "epoch": 0.5426993865030675, + "grad_norm": 0.8803846950021231, + "learning_rate": 9.105020557617815e-06, + "loss": 0.5755, + "step": 4423 + }, + { + "epoch": 0.5428220858895706, + "grad_norm": 0.821166706450734, + "learning_rate": 9.101062392518015e-06, + "loss": 0.5826, + "step": 4424 + }, + { + "epoch": 0.5429447852760736, + "grad_norm": 0.8489931307751636, + "learning_rate": 9.097104369397681e-06, + "loss": 0.5425, + "step": 4425 + }, + { + "epoch": 0.5430674846625767, + "grad_norm": 0.822835824351937, + "learning_rate": 9.09314648888195e-06, + "loss": 0.5968, + "step": 4426 + }, + { + "epoch": 0.5431901840490797, + "grad_norm": 1.0025556085047194, + "learning_rate": 9.089188751595937e-06, + "loss": 0.5705, + "step": 4427 + }, + { + "epoch": 0.5433128834355828, + "grad_norm": 0.9410872919344504, + "learning_rate": 9.085231158164731e-06, + 
"loss": 0.5883, + "step": 4428 + }, + { + "epoch": 0.5434355828220859, + "grad_norm": 0.9275210151250135, + "learning_rate": 9.081273709213396e-06, + "loss": 0.56, + "step": 4429 + }, + { + "epoch": 0.5435582822085889, + "grad_norm": 0.8729549595324313, + "learning_rate": 9.07731640536698e-06, + "loss": 0.5153, + "step": 4430 + }, + { + "epoch": 0.543680981595092, + "grad_norm": 1.070848676517373, + "learning_rate": 9.073359247250508e-06, + "loss": 0.6127, + "step": 4431 + }, + { + "epoch": 0.543803680981595, + "grad_norm": 0.9026302343549883, + "learning_rate": 9.069402235488975e-06, + "loss": 0.6185, + "step": 4432 + }, + { + "epoch": 0.5439263803680982, + "grad_norm": 0.9322453378002516, + "learning_rate": 9.065445370707359e-06, + "loss": 0.5626, + "step": 4433 + }, + { + "epoch": 0.5440490797546013, + "grad_norm": 0.9398552134138479, + "learning_rate": 9.061488653530612e-06, + "loss": 0.5969, + "step": 4434 + }, + { + "epoch": 0.5441717791411043, + "grad_norm": 0.8490367433214207, + "learning_rate": 9.057532084583662e-06, + "loss": 0.5629, + "step": 4435 + }, + { + "epoch": 0.5442944785276074, + "grad_norm": 1.0131267363426935, + "learning_rate": 9.053575664491417e-06, + "loss": 0.5805, + "step": 4436 + }, + { + "epoch": 0.5444171779141105, + "grad_norm": 0.9692479646370897, + "learning_rate": 9.049619393878761e-06, + "loss": 0.5681, + "step": 4437 + }, + { + "epoch": 0.5445398773006135, + "grad_norm": 0.8871212440701839, + "learning_rate": 9.04566327337055e-06, + "loss": 0.5491, + "step": 4438 + }, + { + "epoch": 0.5446625766871166, + "grad_norm": 0.8795268407053924, + "learning_rate": 9.04170730359162e-06, + "loss": 0.6043, + "step": 4439 + }, + { + "epoch": 0.5447852760736196, + "grad_norm": 1.1005606326433257, + "learning_rate": 9.037751485166785e-06, + "loss": 0.5746, + "step": 4440 + }, + { + "epoch": 0.5449079754601227, + "grad_norm": 0.905432176110194, + "learning_rate": 9.03379581872083e-06, + "loss": 0.556, + "step": 4441 + }, + { + "epoch": 0.5450306748466258, + "grad_norm": 0.9036571907475225, + "learning_rate": 9.029840304878517e-06, + "loss": 0.5593, + "step": 4442 + }, + { + "epoch": 0.5451533742331288, + "grad_norm": 0.9960989532356757, + "learning_rate": 9.025884944264588e-06, + "loss": 0.5898, + "step": 4443 + }, + { + "epoch": 0.5452760736196319, + "grad_norm": 0.7601632824053183, + "learning_rate": 9.021929737503757e-06, + "loss": 0.5954, + "step": 4444 + }, + { + "epoch": 0.5453987730061349, + "grad_norm": 0.8390584130362639, + "learning_rate": 9.017974685220716e-06, + "loss": 0.5546, + "step": 4445 + }, + { + "epoch": 0.545521472392638, + "grad_norm": 0.8600672383796178, + "learning_rate": 9.014019788040129e-06, + "loss": 0.5542, + "step": 4446 + }, + { + "epoch": 0.5456441717791412, + "grad_norm": 0.8606687371922812, + "learning_rate": 9.01006504658664e-06, + "loss": 0.5625, + "step": 4447 + }, + { + "epoch": 0.5457668711656442, + "grad_norm": 0.8644997926706859, + "learning_rate": 9.006110461484869e-06, + "loss": 0.58, + "step": 4448 + }, + { + "epoch": 0.5458895705521473, + "grad_norm": 0.8120617254839182, + "learning_rate": 9.002156033359405e-06, + "loss": 0.5865, + "step": 4449 + }, + { + "epoch": 0.5460122699386503, + "grad_norm": 0.7863995549896317, + "learning_rate": 8.998201762834815e-06, + "loss": 0.5292, + "step": 4450 + }, + { + "epoch": 0.5461349693251534, + "grad_norm": 0.8578156826710372, + "learning_rate": 8.994247650535645e-06, + "loss": 0.6326, + "step": 4451 + }, + { + "epoch": 0.5462576687116565, + "grad_norm": 0.8893959125424178, + 
"learning_rate": 8.990293697086415e-06, + "loss": 0.6247, + "step": 4452 + }, + { + "epoch": 0.5463803680981595, + "grad_norm": 0.8227348652593331, + "learning_rate": 8.986339903111611e-06, + "loss": 0.5641, + "step": 4453 + }, + { + "epoch": 0.5465030674846626, + "grad_norm": 0.959804347584065, + "learning_rate": 8.982386269235706e-06, + "loss": 0.5882, + "step": 4454 + }, + { + "epoch": 0.5466257668711656, + "grad_norm": 0.9392346398493796, + "learning_rate": 8.97843279608314e-06, + "loss": 0.6201, + "step": 4455 + }, + { + "epoch": 0.5467484662576687, + "grad_norm": 0.8427311586897612, + "learning_rate": 8.974479484278334e-06, + "loss": 0.5542, + "step": 4456 + }, + { + "epoch": 0.5468711656441718, + "grad_norm": 0.8769215280057973, + "learning_rate": 8.970526334445675e-06, + "loss": 0.6194, + "step": 4457 + }, + { + "epoch": 0.5469938650306748, + "grad_norm": 0.9192230419618034, + "learning_rate": 8.966573347209535e-06, + "loss": 0.5894, + "step": 4458 + }, + { + "epoch": 0.5471165644171779, + "grad_norm": 0.8592691568403932, + "learning_rate": 8.962620523194245e-06, + "loss": 0.5584, + "step": 4459 + }, + { + "epoch": 0.5472392638036809, + "grad_norm": 0.909522178438442, + "learning_rate": 8.958667863024127e-06, + "loss": 0.5652, + "step": 4460 + }, + { + "epoch": 0.547361963190184, + "grad_norm": 0.8520693450449152, + "learning_rate": 8.954715367323468e-06, + "loss": 0.5698, + "step": 4461 + }, + { + "epoch": 0.5474846625766872, + "grad_norm": 0.8905870774955045, + "learning_rate": 8.950763036716528e-06, + "loss": 0.5394, + "step": 4462 + }, + { + "epoch": 0.5476073619631902, + "grad_norm": 0.7839458194142007, + "learning_rate": 8.946810871827548e-06, + "loss": 0.5456, + "step": 4463 + }, + { + "epoch": 0.5477300613496933, + "grad_norm": 1.0195856757803383, + "learning_rate": 8.942858873280735e-06, + "loss": 0.5967, + "step": 4464 + }, + { + "epoch": 0.5478527607361963, + "grad_norm": 0.9905181169289479, + "learning_rate": 8.938907041700275e-06, + "loss": 0.5448, + "step": 4465 + }, + { + "epoch": 0.5479754601226994, + "grad_norm": 1.0389084399331665, + "learning_rate": 8.934955377710326e-06, + "loss": 0.5878, + "step": 4466 + }, + { + "epoch": 0.5480981595092025, + "grad_norm": 0.9322958786022973, + "learning_rate": 8.931003881935019e-06, + "loss": 0.561, + "step": 4467 + }, + { + "epoch": 0.5482208588957055, + "grad_norm": 0.8512368593830069, + "learning_rate": 8.927052554998457e-06, + "loss": 0.5851, + "step": 4468 + }, + { + "epoch": 0.5483435582822086, + "grad_norm": 0.8793700698585322, + "learning_rate": 8.923101397524721e-06, + "loss": 0.6043, + "step": 4469 + }, + { + "epoch": 0.5484662576687117, + "grad_norm": 0.8780285258561982, + "learning_rate": 8.919150410137862e-06, + "loss": 0.5736, + "step": 4470 + }, + { + "epoch": 0.5485889570552147, + "grad_norm": 0.9025212277583998, + "learning_rate": 8.915199593461903e-06, + "loss": 0.6035, + "step": 4471 + }, + { + "epoch": 0.5487116564417178, + "grad_norm": 1.1031231783825817, + "learning_rate": 8.91124894812084e-06, + "loss": 0.5934, + "step": 4472 + }, + { + "epoch": 0.5488343558282208, + "grad_norm": 0.8861815425087276, + "learning_rate": 8.907298474738643e-06, + "loss": 0.5764, + "step": 4473 + }, + { + "epoch": 0.5489570552147239, + "grad_norm": 0.8153873704781186, + "learning_rate": 8.90334817393926e-06, + "loss": 0.529, + "step": 4474 + }, + { + "epoch": 0.549079754601227, + "grad_norm": 0.8950511641671934, + "learning_rate": 8.899398046346608e-06, + "loss": 0.5189, + "step": 4475 + }, + { + "epoch": 0.54920245398773, + 
"grad_norm": 0.8578809656730241, + "learning_rate": 8.89544809258457e-06, + "loss": 0.5226, + "step": 4476 + }, + { + "epoch": 0.5493251533742332, + "grad_norm": 0.870876247636186, + "learning_rate": 8.891498313277013e-06, + "loss": 0.5662, + "step": 4477 + }, + { + "epoch": 0.5494478527607362, + "grad_norm": 0.912567245044219, + "learning_rate": 8.887548709047765e-06, + "loss": 0.5188, + "step": 4478 + }, + { + "epoch": 0.5495705521472393, + "grad_norm": 0.8838283200996705, + "learning_rate": 8.883599280520634e-06, + "loss": 0.5603, + "step": 4479 + }, + { + "epoch": 0.5496932515337424, + "grad_norm": 1.0488587397020024, + "learning_rate": 8.8796500283194e-06, + "loss": 0.5965, + "step": 4480 + }, + { + "epoch": 0.5498159509202454, + "grad_norm": 0.7485242791708292, + "learning_rate": 8.87570095306781e-06, + "loss": 0.551, + "step": 4481 + }, + { + "epoch": 0.5499386503067485, + "grad_norm": 0.8243445530551763, + "learning_rate": 8.871752055389591e-06, + "loss": 0.572, + "step": 4482 + }, + { + "epoch": 0.5500613496932515, + "grad_norm": 0.8195342960153821, + "learning_rate": 8.867803335908433e-06, + "loss": 0.5891, + "step": 4483 + }, + { + "epoch": 0.5501840490797546, + "grad_norm": 0.9191391880498871, + "learning_rate": 8.863854795248007e-06, + "loss": 0.6073, + "step": 4484 + }, + { + "epoch": 0.5503067484662577, + "grad_norm": 0.8420989350940692, + "learning_rate": 8.859906434031947e-06, + "loss": 0.6064, + "step": 4485 + }, + { + "epoch": 0.5504294478527607, + "grad_norm": 0.8527279114094334, + "learning_rate": 8.855958252883865e-06, + "loss": 0.5833, + "step": 4486 + }, + { + "epoch": 0.5505521472392638, + "grad_norm": 0.9056659954601859, + "learning_rate": 8.85201025242734e-06, + "loss": 0.5716, + "step": 4487 + }, + { + "epoch": 0.5506748466257668, + "grad_norm": 0.9081503321450877, + "learning_rate": 8.848062433285927e-06, + "loss": 0.5548, + "step": 4488 + }, + { + "epoch": 0.5507975460122699, + "grad_norm": 0.8050726436231175, + "learning_rate": 8.844114796083152e-06, + "loss": 0.6062, + "step": 4489 + }, + { + "epoch": 0.550920245398773, + "grad_norm": 0.8525508103734813, + "learning_rate": 8.840167341442505e-06, + "loss": 0.5324, + "step": 4490 + }, + { + "epoch": 0.551042944785276, + "grad_norm": 0.8507281277525716, + "learning_rate": 8.836220069987454e-06, + "loss": 0.6065, + "step": 4491 + }, + { + "epoch": 0.5511656441717792, + "grad_norm": 0.9600978811978497, + "learning_rate": 8.83227298234144e-06, + "loss": 0.552, + "step": 4492 + }, + { + "epoch": 0.5512883435582822, + "grad_norm": 0.8845665105408298, + "learning_rate": 8.828326079127867e-06, + "loss": 0.5583, + "step": 4493 + }, + { + "epoch": 0.5514110429447853, + "grad_norm": 0.9126965200719644, + "learning_rate": 8.824379360970118e-06, + "loss": 0.6108, + "step": 4494 + }, + { + "epoch": 0.5515337423312884, + "grad_norm": 1.071946928753183, + "learning_rate": 8.820432828491542e-06, + "loss": 0.5774, + "step": 4495 + }, + { + "epoch": 0.5516564417177914, + "grad_norm": 0.8550914406079004, + "learning_rate": 8.816486482315459e-06, + "loss": 0.5709, + "step": 4496 + }, + { + "epoch": 0.5517791411042945, + "grad_norm": 0.9344928782284762, + "learning_rate": 8.812540323065163e-06, + "loss": 0.6007, + "step": 4497 + }, + { + "epoch": 0.5519018404907975, + "grad_norm": 0.926403872711403, + "learning_rate": 8.808594351363913e-06, + "loss": 0.5799, + "step": 4498 + }, + { + "epoch": 0.5520245398773006, + "grad_norm": 0.8202125281729502, + "learning_rate": 8.804648567834943e-06, + "loss": 0.5467, + "step": 4499 + }, + { + 
"epoch": 0.5521472392638037, + "grad_norm": 0.800874067056231, + "learning_rate": 8.800702973101454e-06, + "loss": 0.5815, + "step": 4500 + }, + { + "epoch": 0.5522699386503067, + "grad_norm": 0.9038499057382431, + "learning_rate": 8.79675756778662e-06, + "loss": 0.5908, + "step": 4501 + }, + { + "epoch": 0.5523926380368098, + "grad_norm": 1.110640623482688, + "learning_rate": 8.792812352513584e-06, + "loss": 0.6015, + "step": 4502 + }, + { + "epoch": 0.5525153374233129, + "grad_norm": 0.9225122130531034, + "learning_rate": 8.788867327905457e-06, + "loss": 0.5404, + "step": 4503 + }, + { + "epoch": 0.5526380368098159, + "grad_norm": 0.8562806632714186, + "learning_rate": 8.784922494585326e-06, + "loss": 0.5593, + "step": 4504 + }, + { + "epoch": 0.552760736196319, + "grad_norm": 1.0530427827992865, + "learning_rate": 8.78097785317624e-06, + "loss": 0.5237, + "step": 4505 + }, + { + "epoch": 0.552883435582822, + "grad_norm": 0.9879316853560969, + "learning_rate": 8.777033404301222e-06, + "loss": 0.5154, + "step": 4506 + }, + { + "epoch": 0.5530061349693252, + "grad_norm": 0.9057953172435281, + "learning_rate": 8.773089148583266e-06, + "loss": 0.5394, + "step": 4507 + }, + { + "epoch": 0.5531288343558283, + "grad_norm": 0.9407172222997103, + "learning_rate": 8.769145086645328e-06, + "loss": 0.5725, + "step": 4508 + }, + { + "epoch": 0.5532515337423313, + "grad_norm": 1.0005262074949737, + "learning_rate": 8.765201219110342e-06, + "loss": 0.5518, + "step": 4509 + }, + { + "epoch": 0.5533742331288344, + "grad_norm": 1.0588667185591365, + "learning_rate": 8.761257546601209e-06, + "loss": 0.6351, + "step": 4510 + }, + { + "epoch": 0.5534969325153374, + "grad_norm": 0.9728755457441192, + "learning_rate": 8.757314069740795e-06, + "loss": 0.5626, + "step": 4511 + }, + { + "epoch": 0.5536196319018405, + "grad_norm": 1.0283619319541435, + "learning_rate": 8.753370789151941e-06, + "loss": 0.5685, + "step": 4512 + }, + { + "epoch": 0.5537423312883436, + "grad_norm": 0.9284382028563193, + "learning_rate": 8.749427705457453e-06, + "loss": 0.586, + "step": 4513 + }, + { + "epoch": 0.5538650306748466, + "grad_norm": 0.8690392893013871, + "learning_rate": 8.745484819280108e-06, + "loss": 0.5872, + "step": 4514 + }, + { + "epoch": 0.5539877300613497, + "grad_norm": 0.8843135868490924, + "learning_rate": 8.741542131242652e-06, + "loss": 0.5537, + "step": 4515 + }, + { + "epoch": 0.5541104294478527, + "grad_norm": 0.9407276242681821, + "learning_rate": 8.737599641967795e-06, + "loss": 0.5565, + "step": 4516 + }, + { + "epoch": 0.5542331288343558, + "grad_norm": 0.8884610875726129, + "learning_rate": 8.733657352078223e-06, + "loss": 0.5179, + "step": 4517 + }, + { + "epoch": 0.5543558282208589, + "grad_norm": 0.9718218624872514, + "learning_rate": 8.729715262196584e-06, + "loss": 0.6083, + "step": 4518 + }, + { + "epoch": 0.5544785276073619, + "grad_norm": 0.8949941010973549, + "learning_rate": 8.725773372945501e-06, + "loss": 0.5454, + "step": 4519 + }, + { + "epoch": 0.554601226993865, + "grad_norm": 0.8559457913061106, + "learning_rate": 8.721831684947557e-06, + "loss": 0.5946, + "step": 4520 + }, + { + "epoch": 0.554723926380368, + "grad_norm": 0.9988573092775652, + "learning_rate": 8.71789019882531e-06, + "loss": 0.5267, + "step": 4521 + }, + { + "epoch": 0.5548466257668712, + "grad_norm": 0.9272796767065092, + "learning_rate": 8.713948915201284e-06, + "loss": 0.6123, + "step": 4522 + }, + { + "epoch": 0.5549693251533743, + "grad_norm": 0.8618491584864043, + "learning_rate": 8.71000783469797e-06, + "loss": 
0.5567, + "step": 4523 + }, + { + "epoch": 0.5550920245398773, + "grad_norm": 0.8423830176767895, + "learning_rate": 8.706066957937829e-06, + "loss": 0.5477, + "step": 4524 + }, + { + "epoch": 0.5552147239263804, + "grad_norm": 0.9449469581141395, + "learning_rate": 8.702126285543286e-06, + "loss": 0.5596, + "step": 4525 + }, + { + "epoch": 0.5553374233128834, + "grad_norm": 0.9538353192575422, + "learning_rate": 8.69818581813674e-06, + "loss": 0.5899, + "step": 4526 + }, + { + "epoch": 0.5554601226993865, + "grad_norm": 0.8342446449212312, + "learning_rate": 8.69424555634055e-06, + "loss": 0.5395, + "step": 4527 + }, + { + "epoch": 0.5555828220858896, + "grad_norm": 0.9048408065531224, + "learning_rate": 8.690305500777052e-06, + "loss": 0.5752, + "step": 4528 + }, + { + "epoch": 0.5557055214723926, + "grad_norm": 0.9270094995415572, + "learning_rate": 8.686365652068536e-06, + "loss": 0.5822, + "step": 4529 + }, + { + "epoch": 0.5558282208588957, + "grad_norm": 0.8386241344362958, + "learning_rate": 8.682426010837274e-06, + "loss": 0.5465, + "step": 4530 + }, + { + "epoch": 0.5559509202453988, + "grad_norm": 0.8843390447940004, + "learning_rate": 8.678486577705496e-06, + "loss": 0.5348, + "step": 4531 + }, + { + "epoch": 0.5560736196319018, + "grad_norm": 0.8817584455479935, + "learning_rate": 8.674547353295401e-06, + "loss": 0.5943, + "step": 4532 + }, + { + "epoch": 0.5561963190184049, + "grad_norm": 0.952695063591393, + "learning_rate": 8.670608338229158e-06, + "loss": 0.5737, + "step": 4533 + }, + { + "epoch": 0.5563190184049079, + "grad_norm": 1.019895105612791, + "learning_rate": 8.6666695331289e-06, + "loss": 0.5478, + "step": 4534 + }, + { + "epoch": 0.556441717791411, + "grad_norm": 1.0433684917005177, + "learning_rate": 8.662730938616724e-06, + "loss": 0.5805, + "step": 4535 + }, + { + "epoch": 0.5565644171779142, + "grad_norm": 0.8729697128793343, + "learning_rate": 8.658792555314701e-06, + "loss": 0.6049, + "step": 4536 + }, + { + "epoch": 0.5566871165644172, + "grad_norm": 0.9660695432915869, + "learning_rate": 8.654854383844862e-06, + "loss": 0.5555, + "step": 4537 + }, + { + "epoch": 0.5568098159509203, + "grad_norm": 0.8886209387757009, + "learning_rate": 8.650916424829213e-06, + "loss": 0.4764, + "step": 4538 + }, + { + "epoch": 0.5569325153374233, + "grad_norm": 0.834954744070091, + "learning_rate": 8.646978678889713e-06, + "loss": 0.6063, + "step": 4539 + }, + { + "epoch": 0.5570552147239264, + "grad_norm": 0.885131749768121, + "learning_rate": 8.643041146648299e-06, + "loss": 0.5915, + "step": 4540 + }, + { + "epoch": 0.5571779141104295, + "grad_norm": 0.8341992035191056, + "learning_rate": 8.63910382872687e-06, + "loss": 0.582, + "step": 4541 + }, + { + "epoch": 0.5573006134969325, + "grad_norm": 0.8563723339748966, + "learning_rate": 8.63516672574729e-06, + "loss": 0.5006, + "step": 4542 + }, + { + "epoch": 0.5574233128834356, + "grad_norm": 0.9018996848990681, + "learning_rate": 8.631229838331392e-06, + "loss": 0.6053, + "step": 4543 + }, + { + "epoch": 0.5575460122699386, + "grad_norm": 0.9429389294880839, + "learning_rate": 8.627293167100975e-06, + "loss": 0.5528, + "step": 4544 + }, + { + "epoch": 0.5576687116564417, + "grad_norm": 0.7641639408554368, + "learning_rate": 8.6233567126778e-06, + "loss": 0.5559, + "step": 4545 + }, + { + "epoch": 0.5577914110429448, + "grad_norm": 0.8103409948746643, + "learning_rate": 8.619420475683597e-06, + "loss": 0.6125, + "step": 4546 + }, + { + "epoch": 0.5579141104294478, + "grad_norm": 0.8787148763842865, + "learning_rate": 
8.615484456740062e-06, + "loss": 0.5726, + "step": 4547 + }, + { + "epoch": 0.5580368098159509, + "grad_norm": 0.9707237397228452, + "learning_rate": 8.611548656468854e-06, + "loss": 0.5618, + "step": 4548 + }, + { + "epoch": 0.5581595092024539, + "grad_norm": 1.12712928674739, + "learning_rate": 8.607613075491597e-06, + "loss": 0.5835, + "step": 4549 + }, + { + "epoch": 0.558282208588957, + "grad_norm": 0.8579734318122003, + "learning_rate": 8.603677714429888e-06, + "loss": 0.5628, + "step": 4550 + }, + { + "epoch": 0.5584049079754602, + "grad_norm": 0.8786812683668723, + "learning_rate": 8.599742573905276e-06, + "loss": 0.5766, + "step": 4551 + }, + { + "epoch": 0.5585276073619632, + "grad_norm": 0.8024051034769022, + "learning_rate": 8.595807654539288e-06, + "loss": 0.602, + "step": 4552 + }, + { + "epoch": 0.5586503067484663, + "grad_norm": 0.958136565387499, + "learning_rate": 8.591872956953409e-06, + "loss": 0.5475, + "step": 4553 + }, + { + "epoch": 0.5587730061349693, + "grad_norm": 0.9882956019924798, + "learning_rate": 8.58793848176909e-06, + "loss": 0.5383, + "step": 4554 + }, + { + "epoch": 0.5588957055214724, + "grad_norm": 0.8790154217996982, + "learning_rate": 8.584004229607747e-06, + "loss": 0.5695, + "step": 4555 + }, + { + "epoch": 0.5590184049079755, + "grad_norm": 0.8491898519794807, + "learning_rate": 8.580070201090759e-06, + "loss": 0.5956, + "step": 4556 + }, + { + "epoch": 0.5591411042944785, + "grad_norm": 0.9011411516897511, + "learning_rate": 8.576136396839482e-06, + "loss": 0.5491, + "step": 4557 + }, + { + "epoch": 0.5592638036809816, + "grad_norm": 0.9025465297814839, + "learning_rate": 8.572202817475217e-06, + "loss": 0.621, + "step": 4558 + }, + { + "epoch": 0.5593865030674846, + "grad_norm": 0.8203862816992951, + "learning_rate": 8.568269463619243e-06, + "loss": 0.5429, + "step": 4559 + }, + { + "epoch": 0.5595092024539877, + "grad_norm": 0.9268787143767696, + "learning_rate": 8.564336335892798e-06, + "loss": 0.5093, + "step": 4560 + }, + { + "epoch": 0.5596319018404908, + "grad_norm": 0.87320449876875, + "learning_rate": 8.560403434917087e-06, + "loss": 0.5416, + "step": 4561 + }, + { + "epoch": 0.5597546012269938, + "grad_norm": 0.9761577692520221, + "learning_rate": 8.556470761313275e-06, + "loss": 0.6203, + "step": 4562 + }, + { + "epoch": 0.5598773006134969, + "grad_norm": 0.8944072363578551, + "learning_rate": 8.552538315702497e-06, + "loss": 0.5207, + "step": 4563 + }, + { + "epoch": 0.56, + "grad_norm": 0.8158253477639013, + "learning_rate": 8.548606098705848e-06, + "loss": 0.5479, + "step": 4564 + }, + { + "epoch": 0.560122699386503, + "grad_norm": 0.8354619258412388, + "learning_rate": 8.54467411094439e-06, + "loss": 0.5091, + "step": 4565 + }, + { + "epoch": 0.5602453987730062, + "grad_norm": 0.9243199815875874, + "learning_rate": 8.54074235303914e-06, + "loss": 0.6038, + "step": 4566 + }, + { + "epoch": 0.5603680981595092, + "grad_norm": 0.8658815609810896, + "learning_rate": 8.536810825611094e-06, + "loss": 0.602, + "step": 4567 + }, + { + "epoch": 0.5604907975460123, + "grad_norm": 0.996597448433458, + "learning_rate": 8.532879529281199e-06, + "loss": 0.5816, + "step": 4568 + }, + { + "epoch": 0.5606134969325154, + "grad_norm": 1.005158219425028, + "learning_rate": 8.528948464670368e-06, + "loss": 0.5451, + "step": 4569 + }, + { + "epoch": 0.5607361963190184, + "grad_norm": 0.8427749201531061, + "learning_rate": 8.52501763239948e-06, + "loss": 0.6086, + "step": 4570 + }, + { + "epoch": 0.5608588957055215, + "grad_norm": 0.8625342508692644, + 
"learning_rate": 8.521087033089374e-06, + "loss": 0.5352, + "step": 4571 + }, + { + "epoch": 0.5609815950920245, + "grad_norm": 1.011700714612849, + "learning_rate": 8.517156667360859e-06, + "loss": 0.5505, + "step": 4572 + }, + { + "epoch": 0.5611042944785276, + "grad_norm": 1.0777638306243098, + "learning_rate": 8.5132265358347e-06, + "loss": 0.561, + "step": 4573 + }, + { + "epoch": 0.5612269938650307, + "grad_norm": 0.8496547844310126, + "learning_rate": 8.509296639131628e-06, + "loss": 0.5326, + "step": 4574 + }, + { + "epoch": 0.5613496932515337, + "grad_norm": 0.8713401237503704, + "learning_rate": 8.505366977872336e-06, + "loss": 0.6022, + "step": 4575 + }, + { + "epoch": 0.5614723926380368, + "grad_norm": 0.8938587656451443, + "learning_rate": 8.50143755267748e-06, + "loss": 0.5301, + "step": 4576 + }, + { + "epoch": 0.5615950920245398, + "grad_norm": 0.9544796612848163, + "learning_rate": 8.497508364167678e-06, + "loss": 0.5726, + "step": 4577 + }, + { + "epoch": 0.5617177914110429, + "grad_norm": 0.8888160798426339, + "learning_rate": 8.493579412963516e-06, + "loss": 0.5314, + "step": 4578 + }, + { + "epoch": 0.561840490797546, + "grad_norm": 0.9928960296715504, + "learning_rate": 8.489650699685531e-06, + "loss": 0.5559, + "step": 4579 + }, + { + "epoch": 0.561963190184049, + "grad_norm": 0.8987273650447843, + "learning_rate": 8.485722224954237e-06, + "loss": 0.5593, + "step": 4580 + }, + { + "epoch": 0.5620858895705522, + "grad_norm": 0.8596626271598117, + "learning_rate": 8.481793989390097e-06, + "loss": 0.4748, + "step": 4581 + }, + { + "epoch": 0.5622085889570552, + "grad_norm": 0.9304667683742157, + "learning_rate": 8.477865993613544e-06, + "loss": 0.6089, + "step": 4582 + }, + { + "epoch": 0.5623312883435583, + "grad_norm": 1.100285665393672, + "learning_rate": 8.473938238244972e-06, + "loss": 0.59, + "step": 4583 + }, + { + "epoch": 0.5624539877300614, + "grad_norm": 0.8570847372535083, + "learning_rate": 8.47001072390473e-06, + "loss": 0.6115, + "step": 4584 + }, + { + "epoch": 0.5625766871165644, + "grad_norm": 0.8985125350894886, + "learning_rate": 8.466083451213145e-06, + "loss": 0.558, + "step": 4585 + }, + { + "epoch": 0.5626993865030675, + "grad_norm": 0.9641043406693717, + "learning_rate": 8.462156420790493e-06, + "loss": 0.5715, + "step": 4586 + }, + { + "epoch": 0.5628220858895705, + "grad_norm": 0.8897617518494254, + "learning_rate": 8.458229633257014e-06, + "loss": 0.544, + "step": 4587 + }, + { + "epoch": 0.5629447852760736, + "grad_norm": 0.9576310144165104, + "learning_rate": 8.454303089232908e-06, + "loss": 0.6471, + "step": 4588 + }, + { + "epoch": 0.5630674846625767, + "grad_norm": 0.8910106186324974, + "learning_rate": 8.45037678933834e-06, + "loss": 0.5944, + "step": 4589 + }, + { + "epoch": 0.5631901840490797, + "grad_norm": 0.8617438248481889, + "learning_rate": 8.446450734193437e-06, + "loss": 0.5773, + "step": 4590 + }, + { + "epoch": 0.5633128834355828, + "grad_norm": 0.890506643284093, + "learning_rate": 8.442524924418282e-06, + "loss": 0.5402, + "step": 4591 + }, + { + "epoch": 0.5634355828220858, + "grad_norm": 1.0035062063875717, + "learning_rate": 8.438599360632927e-06, + "loss": 0.6201, + "step": 4592 + }, + { + "epoch": 0.563558282208589, + "grad_norm": 0.8496081339314459, + "learning_rate": 8.43467404345738e-06, + "loss": 0.5873, + "step": 4593 + }, + { + "epoch": 0.5636809815950921, + "grad_norm": 0.8554845111547902, + "learning_rate": 8.43074897351161e-06, + "loss": 0.5257, + "step": 4594 + }, + { + "epoch": 0.5638036809815951, + 
"grad_norm": 0.9615433910992054, + "learning_rate": 8.426824151415548e-06, + "loss": 0.5683, + "step": 4595 + }, + { + "epoch": 0.5639263803680982, + "grad_norm": 0.9022613425848269, + "learning_rate": 8.422899577789087e-06, + "loss": 0.5423, + "step": 4596 + }, + { + "epoch": 0.5640490797546013, + "grad_norm": 1.0099945842893217, + "learning_rate": 8.418975253252079e-06, + "loss": 0.5788, + "step": 4597 + }, + { + "epoch": 0.5641717791411043, + "grad_norm": 0.8841664605671831, + "learning_rate": 8.415051178424337e-06, + "loss": 0.5775, + "step": 4598 + }, + { + "epoch": 0.5642944785276074, + "grad_norm": 0.8213124830465637, + "learning_rate": 8.411127353925638e-06, + "loss": 0.5642, + "step": 4599 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 0.9098390230787368, + "learning_rate": 8.407203780375711e-06, + "loss": 0.599, + "step": 4600 + }, + { + "epoch": 0.5645398773006135, + "grad_norm": 0.9495592352029959, + "learning_rate": 8.403280458394255e-06, + "loss": 0.5767, + "step": 4601 + }, + { + "epoch": 0.5646625766871166, + "grad_norm": 0.939252370460108, + "learning_rate": 8.399357388600922e-06, + "loss": 0.6029, + "step": 4602 + }, + { + "epoch": 0.5647852760736196, + "grad_norm": 0.78056005614989, + "learning_rate": 8.395434571615328e-06, + "loss": 0.5651, + "step": 4603 + }, + { + "epoch": 0.5649079754601227, + "grad_norm": 0.9249396797049331, + "learning_rate": 8.391512008057049e-06, + "loss": 0.6366, + "step": 4604 + }, + { + "epoch": 0.5650306748466257, + "grad_norm": 0.8030144416895847, + "learning_rate": 8.38758969854562e-06, + "loss": 0.5789, + "step": 4605 + }, + { + "epoch": 0.5651533742331288, + "grad_norm": 0.9671185155645993, + "learning_rate": 8.383667643700536e-06, + "loss": 0.5737, + "step": 4606 + }, + { + "epoch": 0.565276073619632, + "grad_norm": 1.051628101087655, + "learning_rate": 8.379745844141253e-06, + "loss": 0.5604, + "step": 4607 + }, + { + "epoch": 0.565398773006135, + "grad_norm": 0.911822214925995, + "learning_rate": 8.375824300487184e-06, + "loss": 0.5643, + "step": 4608 + }, + { + "epoch": 0.5655214723926381, + "grad_norm": 0.9236155152183466, + "learning_rate": 8.371903013357701e-06, + "loss": 0.5605, + "step": 4609 + }, + { + "epoch": 0.5656441717791411, + "grad_norm": 0.9130716930991978, + "learning_rate": 8.367981983372143e-06, + "loss": 0.5685, + "step": 4610 + }, + { + "epoch": 0.5657668711656442, + "grad_norm": 0.86354648899042, + "learning_rate": 8.364061211149796e-06, + "loss": 0.5981, + "step": 4611 + }, + { + "epoch": 0.5658895705521473, + "grad_norm": 0.8073930840424781, + "learning_rate": 8.36014069730992e-06, + "loss": 0.5223, + "step": 4612 + }, + { + "epoch": 0.5660122699386503, + "grad_norm": 0.8711180640864734, + "learning_rate": 8.35622044247172e-06, + "loss": 0.5146, + "step": 4613 + }, + { + "epoch": 0.5661349693251534, + "grad_norm": 0.9278608317934761, + "learning_rate": 8.352300447254372e-06, + "loss": 0.542, + "step": 4614 + }, + { + "epoch": 0.5662576687116564, + "grad_norm": 0.8677026917912465, + "learning_rate": 8.348380712277002e-06, + "loss": 0.5846, + "step": 4615 + }, + { + "epoch": 0.5663803680981595, + "grad_norm": 1.0324239912912514, + "learning_rate": 8.3444612381587e-06, + "loss": 0.5714, + "step": 4616 + }, + { + "epoch": 0.5665030674846626, + "grad_norm": 0.9087860158289723, + "learning_rate": 8.340542025518512e-06, + "loss": 0.5972, + "step": 4617 + }, + { + "epoch": 0.5666257668711656, + "grad_norm": 0.9479722144240601, + "learning_rate": 8.336623074975447e-06, + "loss": 0.5994, + "step": 4618 + }, + { + 
"epoch": 0.5667484662576687, + "grad_norm": 0.8750486485260855, + "learning_rate": 8.332704387148463e-06, + "loss": 0.619, + "step": 4619 + }, + { + "epoch": 0.5668711656441717, + "grad_norm": 0.8885986782626163, + "learning_rate": 8.32878596265649e-06, + "loss": 0.5849, + "step": 4620 + }, + { + "epoch": 0.5669938650306748, + "grad_norm": 0.8572772212002838, + "learning_rate": 8.324867802118404e-06, + "loss": 0.576, + "step": 4621 + }, + { + "epoch": 0.567116564417178, + "grad_norm": 0.9514562977490599, + "learning_rate": 8.320949906153048e-06, + "loss": 0.5893, + "step": 4622 + }, + { + "epoch": 0.567239263803681, + "grad_norm": 0.8917651891488019, + "learning_rate": 8.31703227537922e-06, + "loss": 0.5698, + "step": 4623 + }, + { + "epoch": 0.5673619631901841, + "grad_norm": 0.8083853522210619, + "learning_rate": 8.313114910415674e-06, + "loss": 0.579, + "step": 4624 + }, + { + "epoch": 0.5674846625766872, + "grad_norm": 0.7961312310285298, + "learning_rate": 8.309197811881128e-06, + "loss": 0.5922, + "step": 4625 + }, + { + "epoch": 0.5676073619631902, + "grad_norm": 0.9641696864653815, + "learning_rate": 8.305280980394248e-06, + "loss": 0.5966, + "step": 4626 + }, + { + "epoch": 0.5677300613496933, + "grad_norm": 3.43764859112132, + "learning_rate": 8.30136441657367e-06, + "loss": 0.5736, + "step": 4627 + }, + { + "epoch": 0.5678527607361963, + "grad_norm": 0.9457089003020361, + "learning_rate": 8.297448121037978e-06, + "loss": 0.5969, + "step": 4628 + }, + { + "epoch": 0.5679754601226994, + "grad_norm": 1.0076746782329082, + "learning_rate": 8.293532094405719e-06, + "loss": 0.5943, + "step": 4629 + }, + { + "epoch": 0.5680981595092025, + "grad_norm": 0.9271247596347866, + "learning_rate": 8.289616337295396e-06, + "loss": 0.5825, + "step": 4630 + }, + { + "epoch": 0.5682208588957055, + "grad_norm": 0.8604502868752186, + "learning_rate": 8.285700850325467e-06, + "loss": 0.5889, + "step": 4631 + }, + { + "epoch": 0.5683435582822086, + "grad_norm": 0.8840280073858309, + "learning_rate": 8.281785634114348e-06, + "loss": 0.5391, + "step": 4632 + }, + { + "epoch": 0.5684662576687116, + "grad_norm": 0.8953518512506345, + "learning_rate": 8.27787068928042e-06, + "loss": 0.5907, + "step": 4633 + }, + { + "epoch": 0.5685889570552147, + "grad_norm": 0.9315706485152896, + "learning_rate": 8.27395601644201e-06, + "loss": 0.5775, + "step": 4634 + }, + { + "epoch": 0.5687116564417178, + "grad_norm": 1.1830275096072, + "learning_rate": 8.270041616217407e-06, + "loss": 0.5901, + "step": 4635 + }, + { + "epoch": 0.5688343558282208, + "grad_norm": 0.9198488268197302, + "learning_rate": 8.266127489224859e-06, + "loss": 0.5609, + "step": 4636 + }, + { + "epoch": 0.568957055214724, + "grad_norm": 0.8840751520092207, + "learning_rate": 8.262213636082567e-06, + "loss": 0.5513, + "step": 4637 + }, + { + "epoch": 0.569079754601227, + "grad_norm": 1.364132818264585, + "learning_rate": 8.258300057408692e-06, + "loss": 0.5951, + "step": 4638 + }, + { + "epoch": 0.5692024539877301, + "grad_norm": 0.8125015028090057, + "learning_rate": 8.254386753821346e-06, + "loss": 0.5177, + "step": 4639 + }, + { + "epoch": 0.5693251533742332, + "grad_norm": 0.9242406992902719, + "learning_rate": 8.250473725938608e-06, + "loss": 0.5526, + "step": 4640 + }, + { + "epoch": 0.5694478527607362, + "grad_norm": 0.9853918458162668, + "learning_rate": 8.246560974378504e-06, + "loss": 0.6235, + "step": 4641 + }, + { + "epoch": 0.5695705521472393, + "grad_norm": 0.91235865894208, + "learning_rate": 8.24264849975902e-06, + "loss": 0.5765, + 
"step": 4642 + }, + { + "epoch": 0.5696932515337423, + "grad_norm": 0.8640156028201377, + "learning_rate": 8.238736302698096e-06, + "loss": 0.5296, + "step": 4643 + }, + { + "epoch": 0.5698159509202454, + "grad_norm": 0.8650817118441516, + "learning_rate": 8.234824383813632e-06, + "loss": 0.5217, + "step": 4644 + }, + { + "epoch": 0.5699386503067485, + "grad_norm": 0.9342843381968361, + "learning_rate": 8.23091274372348e-06, + "loss": 0.5693, + "step": 4645 + }, + { + "epoch": 0.5700613496932515, + "grad_norm": 1.043161281497172, + "learning_rate": 8.227001383045453e-06, + "loss": 0.5592, + "step": 4646 + }, + { + "epoch": 0.5701840490797546, + "grad_norm": 0.9295224611877182, + "learning_rate": 8.223090302397313e-06, + "loss": 0.5665, + "step": 4647 + }, + { + "epoch": 0.5703067484662576, + "grad_norm": 0.9047162670580213, + "learning_rate": 8.219179502396786e-06, + "loss": 0.6004, + "step": 4648 + }, + { + "epoch": 0.5704294478527607, + "grad_norm": 0.9158616889363801, + "learning_rate": 8.215268983661547e-06, + "loss": 0.4897, + "step": 4649 + }, + { + "epoch": 0.5705521472392638, + "grad_norm": 0.8596661184474933, + "learning_rate": 8.211358746809225e-06, + "loss": 0.5723, + "step": 4650 + }, + { + "epoch": 0.5706748466257668, + "grad_norm": 0.8524722858381182, + "learning_rate": 8.207448792457413e-06, + "loss": 0.6026, + "step": 4651 + }, + { + "epoch": 0.57079754601227, + "grad_norm": 0.9696538756853887, + "learning_rate": 8.203539121223653e-06, + "loss": 0.5495, + "step": 4652 + }, + { + "epoch": 0.570920245398773, + "grad_norm": 1.3002502153760183, + "learning_rate": 8.199629733725444e-06, + "loss": 0.5845, + "step": 4653 + }, + { + "epoch": 0.5710429447852761, + "grad_norm": 0.8484922064835649, + "learning_rate": 8.195720630580242e-06, + "loss": 0.5605, + "step": 4654 + }, + { + "epoch": 0.5711656441717792, + "grad_norm": 0.8402354839169389, + "learning_rate": 8.191811812405453e-06, + "loss": 0.5749, + "step": 4655 + }, + { + "epoch": 0.5712883435582822, + "grad_norm": 0.971962448209201, + "learning_rate": 8.187903279818444e-06, + "loss": 0.652, + "step": 4656 + }, + { + "epoch": 0.5714110429447853, + "grad_norm": 1.0699508586157833, + "learning_rate": 8.183995033436534e-06, + "loss": 0.5687, + "step": 4657 + }, + { + "epoch": 0.5715337423312884, + "grad_norm": 0.9310118347902808, + "learning_rate": 8.180087073876995e-06, + "loss": 0.6172, + "step": 4658 + }, + { + "epoch": 0.5716564417177914, + "grad_norm": 0.7793504401032241, + "learning_rate": 8.176179401757057e-06, + "loss": 0.5498, + "step": 4659 + }, + { + "epoch": 0.5717791411042945, + "grad_norm": 0.9149140163285715, + "learning_rate": 8.172272017693903e-06, + "loss": 0.5529, + "step": 4660 + }, + { + "epoch": 0.5719018404907975, + "grad_norm": 0.8921715270399665, + "learning_rate": 8.16836492230467e-06, + "loss": 0.5, + "step": 4661 + }, + { + "epoch": 0.5720245398773006, + "grad_norm": 1.049829892347075, + "learning_rate": 8.16445811620645e-06, + "loss": 0.5952, + "step": 4662 + }, + { + "epoch": 0.5721472392638037, + "grad_norm": 0.8147195472964462, + "learning_rate": 8.16055160001629e-06, + "loss": 0.5232, + "step": 4663 + }, + { + "epoch": 0.5722699386503067, + "grad_norm": 0.9383304561836081, + "learning_rate": 8.156645374351194e-06, + "loss": 0.5235, + "step": 4664 + }, + { + "epoch": 0.5723926380368098, + "grad_norm": 0.859544471367909, + "learning_rate": 8.15273943982811e-06, + "loss": 0.6027, + "step": 4665 + }, + { + "epoch": 0.5725153374233128, + "grad_norm": 0.8322104917494199, + "learning_rate": 
8.148833797063947e-06, + "loss": 0.5853, + "step": 4666 + }, + { + "epoch": 0.572638036809816, + "grad_norm": 0.826367056669286, + "learning_rate": 8.144928446675578e-06, + "loss": 0.5783, + "step": 4667 + }, + { + "epoch": 0.5727607361963191, + "grad_norm": 0.9134302184012763, + "learning_rate": 8.14102338927981e-06, + "loss": 0.6104, + "step": 4668 + }, + { + "epoch": 0.5728834355828221, + "grad_norm": 0.9215332416153489, + "learning_rate": 8.137118625493414e-06, + "loss": 0.6053, + "step": 4669 + }, + { + "epoch": 0.5730061349693252, + "grad_norm": 0.8461848625674859, + "learning_rate": 8.133214155933118e-06, + "loss": 0.5478, + "step": 4670 + }, + { + "epoch": 0.5731288343558282, + "grad_norm": 0.8395888351609905, + "learning_rate": 8.129309981215597e-06, + "loss": 0.5923, + "step": 4671 + }, + { + "epoch": 0.5732515337423313, + "grad_norm": 2.23803883218035, + "learning_rate": 8.125406101957481e-06, + "loss": 0.5909, + "step": 4672 + }, + { + "epoch": 0.5733742331288344, + "grad_norm": 0.9646519144422652, + "learning_rate": 8.121502518775355e-06, + "loss": 0.5852, + "step": 4673 + }, + { + "epoch": 0.5734969325153374, + "grad_norm": 0.8210702409600833, + "learning_rate": 8.117599232285756e-06, + "loss": 0.6047, + "step": 4674 + }, + { + "epoch": 0.5736196319018405, + "grad_norm": 0.7849561061033478, + "learning_rate": 8.113696243105175e-06, + "loss": 0.5721, + "step": 4675 + }, + { + "epoch": 0.5737423312883435, + "grad_norm": 0.8730191639365621, + "learning_rate": 8.109793551850057e-06, + "loss": 0.5044, + "step": 4676 + }, + { + "epoch": 0.5738650306748466, + "grad_norm": 0.9514580135006183, + "learning_rate": 8.105891159136796e-06, + "loss": 0.5417, + "step": 4677 + }, + { + "epoch": 0.5739877300613497, + "grad_norm": 0.9150640304151745, + "learning_rate": 8.101989065581742e-06, + "loss": 0.5103, + "step": 4678 + }, + { + "epoch": 0.5741104294478527, + "grad_norm": 0.8139734455649364, + "learning_rate": 8.0980872718012e-06, + "loss": 0.5624, + "step": 4679 + }, + { + "epoch": 0.5742331288343558, + "grad_norm": 0.905180056326903, + "learning_rate": 8.09418577841142e-06, + "loss": 0.5803, + "step": 4680 + }, + { + "epoch": 0.5743558282208588, + "grad_norm": 0.9101231193339557, + "learning_rate": 8.090284586028614e-06, + "loss": 0.571, + "step": 4681 + }, + { + "epoch": 0.574478527607362, + "grad_norm": 2.130744437279391, + "learning_rate": 8.086383695268937e-06, + "loss": 0.5674, + "step": 4682 + }, + { + "epoch": 0.5746012269938651, + "grad_norm": 1.0144421875474339, + "learning_rate": 8.082483106748506e-06, + "loss": 0.622, + "step": 4683 + }, + { + "epoch": 0.5747239263803681, + "grad_norm": 0.931877183458959, + "learning_rate": 8.078582821083381e-06, + "loss": 0.5169, + "step": 4684 + }, + { + "epoch": 0.5748466257668712, + "grad_norm": 0.8711839470847353, + "learning_rate": 8.074682838889581e-06, + "loss": 0.5898, + "step": 4685 + }, + { + "epoch": 0.5749693251533742, + "grad_norm": 0.9591274138724065, + "learning_rate": 8.070783160783075e-06, + "loss": 0.5719, + "step": 4686 + }, + { + "epoch": 0.5750920245398773, + "grad_norm": 0.8941305063623128, + "learning_rate": 8.066883787379784e-06, + "loss": 0.5534, + "step": 4687 + }, + { + "epoch": 0.5752147239263804, + "grad_norm": 1.01553422311265, + "learning_rate": 8.062984719295576e-06, + "loss": 0.5837, + "step": 4688 + }, + { + "epoch": 0.5753374233128834, + "grad_norm": 0.9467908422957799, + "learning_rate": 8.059085957146283e-06, + "loss": 0.5996, + "step": 4689 + }, + { + "epoch": 0.5754601226993865, + "grad_norm": 
0.9268779890594085, + "learning_rate": 8.055187501547674e-06, + "loss": 0.5781, + "step": 4690 + }, + { + "epoch": 0.5755828220858896, + "grad_norm": 0.9193674939974747, + "learning_rate": 8.051289353115483e-06, + "loss": 0.5804, + "step": 4691 + }, + { + "epoch": 0.5757055214723926, + "grad_norm": 0.8951709836538608, + "learning_rate": 8.047391512465381e-06, + "loss": 0.5488, + "step": 4692 + }, + { + "epoch": 0.5758282208588957, + "grad_norm": 0.9138913258026943, + "learning_rate": 8.043493980213004e-06, + "loss": 0.5954, + "step": 4693 + }, + { + "epoch": 0.5759509202453987, + "grad_norm": 0.8688289867478662, + "learning_rate": 8.039596756973928e-06, + "loss": 0.57, + "step": 4694 + }, + { + "epoch": 0.5760736196319018, + "grad_norm": 0.8156948228836811, + "learning_rate": 8.035699843363696e-06, + "loss": 0.5796, + "step": 4695 + }, + { + "epoch": 0.576196319018405, + "grad_norm": 0.8539647347442147, + "learning_rate": 8.031803239997785e-06, + "loss": 0.534, + "step": 4696 + }, + { + "epoch": 0.576319018404908, + "grad_norm": 0.9013277293999756, + "learning_rate": 8.027906947491634e-06, + "loss": 0.5619, + "step": 4697 + }, + { + "epoch": 0.5764417177914111, + "grad_norm": 1.0626642402232396, + "learning_rate": 8.024010966460624e-06, + "loss": 0.5702, + "step": 4698 + }, + { + "epoch": 0.5765644171779141, + "grad_norm": 0.8531293422598036, + "learning_rate": 8.020115297520093e-06, + "loss": 0.6122, + "step": 4699 + }, + { + "epoch": 0.5766871165644172, + "grad_norm": 0.9006201630441247, + "learning_rate": 8.01621994128533e-06, + "loss": 0.571, + "step": 4700 + }, + { + "epoch": 0.5768098159509203, + "grad_norm": 0.9054938563440028, + "learning_rate": 8.012324898371574e-06, + "loss": 0.6251, + "step": 4701 + }, + { + "epoch": 0.5769325153374233, + "grad_norm": 0.990148848552469, + "learning_rate": 8.00843016939401e-06, + "loss": 0.5782, + "step": 4702 + }, + { + "epoch": 0.5770552147239264, + "grad_norm": 0.8500436727513242, + "learning_rate": 8.00453575496778e-06, + "loss": 0.5508, + "step": 4703 + }, + { + "epoch": 0.5771779141104294, + "grad_norm": 0.8963826832717691, + "learning_rate": 8.00064165570797e-06, + "loss": 0.5743, + "step": 4704 + }, + { + "epoch": 0.5773006134969325, + "grad_norm": 0.7594088523225808, + "learning_rate": 7.996747872229624e-06, + "loss": 0.5786, + "step": 4705 + }, + { + "epoch": 0.5774233128834356, + "grad_norm": 0.8211204885669263, + "learning_rate": 7.99285440514773e-06, + "loss": 0.5209, + "step": 4706 + }, + { + "epoch": 0.5775460122699386, + "grad_norm": 0.9019552760403634, + "learning_rate": 7.988961255077226e-06, + "loss": 0.5468, + "step": 4707 + }, + { + "epoch": 0.5776687116564417, + "grad_norm": 0.8750460030507159, + "learning_rate": 7.985068422633003e-06, + "loss": 0.5644, + "step": 4708 + }, + { + "epoch": 0.5777914110429447, + "grad_norm": 0.7677709988789228, + "learning_rate": 7.9811759084299e-06, + "loss": 0.5648, + "step": 4709 + }, + { + "epoch": 0.5779141104294478, + "grad_norm": 0.871302495618518, + "learning_rate": 7.977283713082706e-06, + "loss": 0.5571, + "step": 4710 + }, + { + "epoch": 0.578036809815951, + "grad_norm": 0.7764294616033187, + "learning_rate": 7.97339183720616e-06, + "loss": 0.6057, + "step": 4711 + }, + { + "epoch": 0.578159509202454, + "grad_norm": 0.9474026732960715, + "learning_rate": 7.969500281414947e-06, + "loss": 0.5402, + "step": 4712 + }, + { + "epoch": 0.5782822085889571, + "grad_norm": 1.0300393184578047, + "learning_rate": 7.965609046323712e-06, + "loss": 0.59, + "step": 4713 + }, + { + "epoch": 
0.5784049079754601, + "grad_norm": 0.7696853611529263, + "learning_rate": 7.961718132547036e-06, + "loss": 0.5837, + "step": 4714 + }, + { + "epoch": 0.5785276073619632, + "grad_norm": 0.9945713843670665, + "learning_rate": 7.95782754069946e-06, + "loss": 0.5615, + "step": 4715 + }, + { + "epoch": 0.5786503067484663, + "grad_norm": 0.9191589364386709, + "learning_rate": 7.953937271395465e-06, + "loss": 0.582, + "step": 4716 + }, + { + "epoch": 0.5787730061349693, + "grad_norm": 0.9088762563619369, + "learning_rate": 7.950047325249488e-06, + "loss": 0.6027, + "step": 4717 + }, + { + "epoch": 0.5788957055214724, + "grad_norm": 1.056854006241706, + "learning_rate": 7.946157702875911e-06, + "loss": 0.5619, + "step": 4718 + }, + { + "epoch": 0.5790184049079755, + "grad_norm": 0.7542439678292422, + "learning_rate": 7.94226840488907e-06, + "loss": 0.5911, + "step": 4719 + }, + { + "epoch": 0.5791411042944785, + "grad_norm": 1.043370573312093, + "learning_rate": 7.938379431903243e-06, + "loss": 0.6211, + "step": 4720 + }, + { + "epoch": 0.5792638036809816, + "grad_norm": 0.8588162553163521, + "learning_rate": 7.934490784532657e-06, + "loss": 0.5721, + "step": 4721 + }, + { + "epoch": 0.5793865030674846, + "grad_norm": 0.8659473237609518, + "learning_rate": 7.930602463391499e-06, + "loss": 0.5247, + "step": 4722 + }, + { + "epoch": 0.5795092024539877, + "grad_norm": 0.8569356422929931, + "learning_rate": 7.926714469093888e-06, + "loss": 0.5524, + "step": 4723 + }, + { + "epoch": 0.5796319018404908, + "grad_norm": 0.9237848501157419, + "learning_rate": 7.922826802253904e-06, + "loss": 0.5918, + "step": 4724 + }, + { + "epoch": 0.5797546012269938, + "grad_norm": 0.9653119640513246, + "learning_rate": 7.91893946348557e-06, + "loss": 0.5556, + "step": 4725 + }, + { + "epoch": 0.579877300613497, + "grad_norm": 1.0113662055007389, + "learning_rate": 7.915052453402854e-06, + "loss": 0.6078, + "step": 4726 + }, + { + "epoch": 0.58, + "grad_norm": 0.9629886935909616, + "learning_rate": 7.91116577261968e-06, + "loss": 0.5844, + "step": 4727 + }, + { + "epoch": 0.5801226993865031, + "grad_norm": 0.81849330042083, + "learning_rate": 7.907279421749916e-06, + "loss": 0.5553, + "step": 4728 + }, + { + "epoch": 0.5802453987730062, + "grad_norm": 0.8644344675182696, + "learning_rate": 7.903393401407373e-06, + "loss": 0.6229, + "step": 4729 + }, + { + "epoch": 0.5803680981595092, + "grad_norm": 0.8884200311348925, + "learning_rate": 7.899507712205818e-06, + "loss": 0.5274, + "step": 4730 + }, + { + "epoch": 0.5804907975460123, + "grad_norm": 1.100355142628209, + "learning_rate": 7.895622354758962e-06, + "loss": 0.5813, + "step": 4731 + }, + { + "epoch": 0.5806134969325153, + "grad_norm": 1.1036103803085635, + "learning_rate": 7.891737329680462e-06, + "loss": 0.6037, + "step": 4732 + }, + { + "epoch": 0.5807361963190184, + "grad_norm": 0.8888956496915142, + "learning_rate": 7.887852637583927e-06, + "loss": 0.5847, + "step": 4733 + }, + { + "epoch": 0.5808588957055215, + "grad_norm": 1.096425520990871, + "learning_rate": 7.88396827908291e-06, + "loss": 0.5756, + "step": 4734 + }, + { + "epoch": 0.5809815950920245, + "grad_norm": 0.8331344666522724, + "learning_rate": 7.880084254790911e-06, + "loss": 0.5138, + "step": 4735 + }, + { + "epoch": 0.5811042944785276, + "grad_norm": 0.8665052683065922, + "learning_rate": 7.876200565321377e-06, + "loss": 0.5861, + "step": 4736 + }, + { + "epoch": 0.5812269938650306, + "grad_norm": 0.8895016386925753, + "learning_rate": 7.872317211287707e-06, + "loss": 0.5609, + "step": 4737 + 
}, + { + "epoch": 0.5813496932515337, + "grad_norm": 0.8934359016563314, + "learning_rate": 7.868434193303241e-06, + "loss": 0.5861, + "step": 4738 + }, + { + "epoch": 0.5814723926380369, + "grad_norm": 0.8767051824223248, + "learning_rate": 7.864551511981269e-06, + "loss": 0.6165, + "step": 4739 + }, + { + "epoch": 0.5815950920245399, + "grad_norm": 0.9411027377495527, + "learning_rate": 7.860669167935028e-06, + "loss": 0.5761, + "step": 4740 + }, + { + "epoch": 0.581717791411043, + "grad_norm": 0.9525450958837491, + "learning_rate": 7.8567871617777e-06, + "loss": 0.5467, + "step": 4741 + }, + { + "epoch": 0.581840490797546, + "grad_norm": 0.9578873230258763, + "learning_rate": 7.852905494122412e-06, + "loss": 0.612, + "step": 4742 + }, + { + "epoch": 0.5819631901840491, + "grad_norm": 0.8814672570223819, + "learning_rate": 7.849024165582242e-06, + "loss": 0.6046, + "step": 4743 + }, + { + "epoch": 0.5820858895705522, + "grad_norm": 0.8652790870858293, + "learning_rate": 7.845143176770214e-06, + "loss": 0.5883, + "step": 4744 + }, + { + "epoch": 0.5822085889570552, + "grad_norm": 0.905590648873165, + "learning_rate": 7.841262528299296e-06, + "loss": 0.5398, + "step": 4745 + }, + { + "epoch": 0.5823312883435583, + "grad_norm": 0.9322640611823896, + "learning_rate": 7.837382220782402e-06, + "loss": 0.5104, + "step": 4746 + }, + { + "epoch": 0.5824539877300613, + "grad_norm": 0.935680935714968, + "learning_rate": 7.833502254832394e-06, + "loss": 0.5733, + "step": 4747 + }, + { + "epoch": 0.5825766871165644, + "grad_norm": 0.9079727268535798, + "learning_rate": 7.829622631062079e-06, + "loss": 0.5827, + "step": 4748 + }, + { + "epoch": 0.5826993865030675, + "grad_norm": 0.8466854959531064, + "learning_rate": 7.825743350084207e-06, + "loss": 0.5022, + "step": 4749 + }, + { + "epoch": 0.5828220858895705, + "grad_norm": 0.8816807428338906, + "learning_rate": 7.821864412511485e-06, + "loss": 0.5695, + "step": 4750 + }, + { + "epoch": 0.5829447852760736, + "grad_norm": 0.9372025921753749, + "learning_rate": 7.817985818956552e-06, + "loss": 0.603, + "step": 4751 + }, + { + "epoch": 0.5830674846625767, + "grad_norm": 0.8909094164181387, + "learning_rate": 7.814107570032e-06, + "loss": 0.5591, + "step": 4752 + }, + { + "epoch": 0.5831901840490797, + "grad_norm": 0.9167418381551842, + "learning_rate": 7.810229666350363e-06, + "loss": 0.5446, + "step": 4753 + }, + { + "epoch": 0.5833128834355829, + "grad_norm": 0.7496720588310142, + "learning_rate": 7.806352108524129e-06, + "loss": 0.5496, + "step": 4754 + }, + { + "epoch": 0.5834355828220859, + "grad_norm": 1.0480758658509965, + "learning_rate": 7.802474897165716e-06, + "loss": 0.6031, + "step": 4755 + }, + { + "epoch": 0.583558282208589, + "grad_norm": 0.9117371243914787, + "learning_rate": 7.798598032887502e-06, + "loss": 0.5515, + "step": 4756 + }, + { + "epoch": 0.5836809815950921, + "grad_norm": 0.9290080348022279, + "learning_rate": 7.794721516301804e-06, + "loss": 0.6235, + "step": 4757 + }, + { + "epoch": 0.5838036809815951, + "grad_norm": 0.8915736868419857, + "learning_rate": 7.790845348020884e-06, + "loss": 0.5714, + "step": 4758 + }, + { + "epoch": 0.5839263803680982, + "grad_norm": 0.8266233006305985, + "learning_rate": 7.786969528656947e-06, + "loss": 0.572, + "step": 4759 + }, + { + "epoch": 0.5840490797546012, + "grad_norm": 1.0073136795740996, + "learning_rate": 7.783094058822147e-06, + "loss": 0.5774, + "step": 4760 + }, + { + "epoch": 0.5841717791411043, + "grad_norm": 0.8941518493801828, + "learning_rate": 7.779218939128582e-06, + 
"loss": 0.5558, + "step": 4761 + }, + { + "epoch": 0.5842944785276074, + "grad_norm": 1.3011880410753078, + "learning_rate": 7.775344170188294e-06, + "loss": 0.5833, + "step": 4762 + }, + { + "epoch": 0.5844171779141104, + "grad_norm": 0.9043593295432817, + "learning_rate": 7.771469752613267e-06, + "loss": 0.5613, + "step": 4763 + }, + { + "epoch": 0.5845398773006135, + "grad_norm": 0.8476409939913013, + "learning_rate": 7.767595687015435e-06, + "loss": 0.5401, + "step": 4764 + }, + { + "epoch": 0.5846625766871165, + "grad_norm": 0.7820948479865107, + "learning_rate": 7.76372197400667e-06, + "loss": 0.5036, + "step": 4765 + }, + { + "epoch": 0.5847852760736196, + "grad_norm": 1.0655427457585336, + "learning_rate": 7.759848614198795e-06, + "loss": 0.5686, + "step": 4766 + }, + { + "epoch": 0.5849079754601227, + "grad_norm": 0.9237106593192432, + "learning_rate": 7.755975608203571e-06, + "loss": 0.5299, + "step": 4767 + }, + { + "epoch": 0.5850306748466257, + "grad_norm": 0.8516403007748681, + "learning_rate": 7.75210295663271e-06, + "loss": 0.5886, + "step": 4768 + }, + { + "epoch": 0.5851533742331289, + "grad_norm": 0.8718285091836657, + "learning_rate": 7.74823066009786e-06, + "loss": 0.5997, + "step": 4769 + }, + { + "epoch": 0.5852760736196319, + "grad_norm": 1.0590220242696153, + "learning_rate": 7.74435871921062e-06, + "loss": 0.6626, + "step": 4770 + }, + { + "epoch": 0.585398773006135, + "grad_norm": 0.8061915470079942, + "learning_rate": 7.740487134582527e-06, + "loss": 0.5161, + "step": 4771 + }, + { + "epoch": 0.5855214723926381, + "grad_norm": 0.8583851481417598, + "learning_rate": 7.736615906825065e-06, + "loss": 0.5541, + "step": 4772 + }, + { + "epoch": 0.5856441717791411, + "grad_norm": 0.8609206609639336, + "learning_rate": 7.732745036549663e-06, + "loss": 0.5429, + "step": 4773 + }, + { + "epoch": 0.5857668711656442, + "grad_norm": 0.8514006631610208, + "learning_rate": 7.72887452436769e-06, + "loss": 0.5905, + "step": 4774 + }, + { + "epoch": 0.5858895705521472, + "grad_norm": 0.9390826483567757, + "learning_rate": 7.72500437089046e-06, + "loss": 0.5964, + "step": 4775 + }, + { + "epoch": 0.5860122699386503, + "grad_norm": 0.8673000911417489, + "learning_rate": 7.721134576729227e-06, + "loss": 0.5238, + "step": 4776 + }, + { + "epoch": 0.5861349693251534, + "grad_norm": 0.766200750922617, + "learning_rate": 7.717265142495203e-06, + "loss": 0.5882, + "step": 4777 + }, + { + "epoch": 0.5862576687116564, + "grad_norm": 0.8903353012236622, + "learning_rate": 7.713396068799521e-06, + "loss": 0.5803, + "step": 4778 + }, + { + "epoch": 0.5863803680981595, + "grad_norm": 0.9218576499347785, + "learning_rate": 7.709527356253274e-06, + "loss": 0.5523, + "step": 4779 + }, + { + "epoch": 0.5865030674846625, + "grad_norm": 0.8261135747383506, + "learning_rate": 7.705659005467489e-06, + "loss": 0.6017, + "step": 4780 + }, + { + "epoch": 0.5866257668711656, + "grad_norm": 0.8164762184574873, + "learning_rate": 7.701791017053137e-06, + "loss": 0.6012, + "step": 4781 + }, + { + "epoch": 0.5867484662576687, + "grad_norm": 0.9733364054222505, + "learning_rate": 7.697923391621139e-06, + "loss": 0.5492, + "step": 4782 + }, + { + "epoch": 0.5868711656441717, + "grad_norm": 1.038295795554746, + "learning_rate": 7.694056129782349e-06, + "loss": 0.6066, + "step": 4783 + }, + { + "epoch": 0.5869938650306749, + "grad_norm": 0.8099023591574817, + "learning_rate": 7.690189232147566e-06, + "loss": 0.5469, + "step": 4784 + }, + { + "epoch": 0.587116564417178, + "grad_norm": 0.9796172500793854, + 
"learning_rate": 7.68632269932754e-06, + "loss": 0.6151, + "step": 4785 + }, + { + "epoch": 0.587239263803681, + "grad_norm": 0.933272816030295, + "learning_rate": 7.68245653193295e-06, + "loss": 0.5758, + "step": 4786 + }, + { + "epoch": 0.5873619631901841, + "grad_norm": 0.8647151911269062, + "learning_rate": 7.678590730574429e-06, + "loss": 0.568, + "step": 4787 + }, + { + "epoch": 0.5874846625766871, + "grad_norm": 0.8896505784487946, + "learning_rate": 7.674725295862542e-06, + "loss": 0.5944, + "step": 4788 + }, + { + "epoch": 0.5876073619631902, + "grad_norm": 0.9402659466612814, + "learning_rate": 7.670860228407806e-06, + "loss": 0.5386, + "step": 4789 + }, + { + "epoch": 0.5877300613496933, + "grad_norm": 1.4452762573679836, + "learning_rate": 7.666995528820673e-06, + "loss": 0.58, + "step": 4790 + }, + { + "epoch": 0.5878527607361963, + "grad_norm": 0.760817179926323, + "learning_rate": 7.663131197711538e-06, + "loss": 0.5116, + "step": 4791 + }, + { + "epoch": 0.5879754601226994, + "grad_norm": 0.8571918404274423, + "learning_rate": 7.659267235690739e-06, + "loss": 0.5528, + "step": 4792 + }, + { + "epoch": 0.5880981595092024, + "grad_norm": 0.979561682177909, + "learning_rate": 7.655403643368557e-06, + "loss": 0.6296, + "step": 4793 + }, + { + "epoch": 0.5882208588957055, + "grad_norm": 0.9203613880387282, + "learning_rate": 7.651540421355212e-06, + "loss": 0.5458, + "step": 4794 + }, + { + "epoch": 0.5883435582822086, + "grad_norm": 0.8692887098136524, + "learning_rate": 7.647677570260868e-06, + "loss": 0.5788, + "step": 4795 + }, + { + "epoch": 0.5884662576687116, + "grad_norm": 0.9158162698887758, + "learning_rate": 7.64381509069563e-06, + "loss": 0.5446, + "step": 4796 + }, + { + "epoch": 0.5885889570552147, + "grad_norm": 0.8931050854432031, + "learning_rate": 7.63995298326954e-06, + "loss": 0.5815, + "step": 4797 + }, + { + "epoch": 0.5887116564417177, + "grad_norm": 0.8927553791852091, + "learning_rate": 7.636091248592588e-06, + "loss": 0.5767, + "step": 4798 + }, + { + "epoch": 0.5888343558282209, + "grad_norm": 0.8213103639108955, + "learning_rate": 7.632229887274699e-06, + "loss": 0.5719, + "step": 4799 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 0.8216352714170434, + "learning_rate": 7.628368899925744e-06, + "loss": 0.564, + "step": 4800 + }, + { + "epoch": 0.589079754601227, + "grad_norm": 0.9097982356088796, + "learning_rate": 7.624508287155534e-06, + "loss": 0.6289, + "step": 4801 + }, + { + "epoch": 0.5892024539877301, + "grad_norm": 0.8654973787522259, + "learning_rate": 7.620648049573815e-06, + "loss": 0.5617, + "step": 4802 + }, + { + "epoch": 0.5893251533742331, + "grad_norm": 0.8747595304375722, + "learning_rate": 7.616788187790283e-06, + "loss": 0.5592, + "step": 4803 + }, + { + "epoch": 0.5894478527607362, + "grad_norm": 0.8766146272517656, + "learning_rate": 7.612928702414561e-06, + "loss": 0.6168, + "step": 4804 + }, + { + "epoch": 0.5895705521472393, + "grad_norm": 0.7598167005038056, + "learning_rate": 7.609069594056234e-06, + "loss": 0.5549, + "step": 4805 + }, + { + "epoch": 0.5896932515337423, + "grad_norm": 0.8894107509775404, + "learning_rate": 7.605210863324809e-06, + "loss": 0.5519, + "step": 4806 + }, + { + "epoch": 0.5898159509202454, + "grad_norm": 0.8251828500204932, + "learning_rate": 7.60135251082974e-06, + "loss": 0.5006, + "step": 4807 + }, + { + "epoch": 0.5899386503067484, + "grad_norm": 0.8290126780274756, + "learning_rate": 7.597494537180423e-06, + "loss": 0.5741, + "step": 4808 + }, + { + "epoch": 0.5900613496932515, + 
"grad_norm": 0.9547640976997975, + "learning_rate": 7.593636942986185e-06, + "loss": 0.5762, + "step": 4809 + }, + { + "epoch": 0.5901840490797546, + "grad_norm": 0.9309395822913324, + "learning_rate": 7.589779728856307e-06, + "loss": 0.658, + "step": 4810 + }, + { + "epoch": 0.5903067484662576, + "grad_norm": 0.8806407257132726, + "learning_rate": 7.585922895399997e-06, + "loss": 0.5493, + "step": 4811 + }, + { + "epoch": 0.5904294478527607, + "grad_norm": 0.9329479127279704, + "learning_rate": 7.582066443226414e-06, + "loss": 0.5559, + "step": 4812 + }, + { + "epoch": 0.5905521472392639, + "grad_norm": 0.792054958777231, + "learning_rate": 7.578210372944648e-06, + "loss": 0.5847, + "step": 4813 + }, + { + "epoch": 0.5906748466257669, + "grad_norm": 0.9553668310837966, + "learning_rate": 7.574354685163732e-06, + "loss": 0.568, + "step": 4814 + }, + { + "epoch": 0.59079754601227, + "grad_norm": 0.9304197800494323, + "learning_rate": 7.570499380492641e-06, + "loss": 0.5859, + "step": 4815 + }, + { + "epoch": 0.590920245398773, + "grad_norm": 0.9708750046388563, + "learning_rate": 7.566644459540285e-06, + "loss": 0.5634, + "step": 4816 + }, + { + "epoch": 0.5910429447852761, + "grad_norm": 0.7811736000046781, + "learning_rate": 7.562789922915518e-06, + "loss": 0.5722, + "step": 4817 + }, + { + "epoch": 0.5911656441717792, + "grad_norm": 0.8159346379131343, + "learning_rate": 7.558935771227129e-06, + "loss": 0.5445, + "step": 4818 + }, + { + "epoch": 0.5912883435582822, + "grad_norm": 0.9059545861430197, + "learning_rate": 7.555082005083852e-06, + "loss": 0.51, + "step": 4819 + }, + { + "epoch": 0.5914110429447853, + "grad_norm": 0.8212890176093667, + "learning_rate": 7.551228625094349e-06, + "loss": 0.5779, + "step": 4820 + }, + { + "epoch": 0.5915337423312883, + "grad_norm": 0.8840010131039331, + "learning_rate": 7.547375631867233e-06, + "loss": 0.5421, + "step": 4821 + }, + { + "epoch": 0.5916564417177914, + "grad_norm": 0.9027541156613718, + "learning_rate": 7.5435230260110506e-06, + "loss": 0.6012, + "step": 4822 + }, + { + "epoch": 0.5917791411042945, + "grad_norm": 1.122978751754099, + "learning_rate": 7.539670808134286e-06, + "loss": 0.5996, + "step": 4823 + }, + { + "epoch": 0.5919018404907975, + "grad_norm": 0.8493833605006482, + "learning_rate": 7.535818978845368e-06, + "loss": 0.576, + "step": 4824 + }, + { + "epoch": 0.5920245398773006, + "grad_norm": 1.040833784796074, + "learning_rate": 7.5319675387526555e-06, + "loss": 0.5302, + "step": 4825 + }, + { + "epoch": 0.5921472392638036, + "grad_norm": 0.8685070635388543, + "learning_rate": 7.528116488464452e-06, + "loss": 0.5445, + "step": 4826 + }, + { + "epoch": 0.5922699386503067, + "grad_norm": 0.904390179127116, + "learning_rate": 7.524265828588999e-06, + "loss": 0.5996, + "step": 4827 + }, + { + "epoch": 0.5923926380368099, + "grad_norm": 0.8472177086837283, + "learning_rate": 7.520415559734474e-06, + "loss": 0.6016, + "step": 4828 + }, + { + "epoch": 0.5925153374233129, + "grad_norm": 0.9388610619901517, + "learning_rate": 7.516565682508994e-06, + "loss": 0.5882, + "step": 4829 + }, + { + "epoch": 0.592638036809816, + "grad_norm": 0.9355394316059938, + "learning_rate": 7.512716197520614e-06, + "loss": 0.592, + "step": 4830 + }, + { + "epoch": 0.592760736196319, + "grad_norm": 0.8598701273711497, + "learning_rate": 7.508867105377328e-06, + "loss": 0.5698, + "step": 4831 + }, + { + "epoch": 0.5928834355828221, + "grad_norm": 0.9182108677946915, + "learning_rate": 7.505018406687062e-06, + "loss": 0.5689, + "step": 4832 + }, + { 
+ "epoch": 0.5930061349693252, + "grad_norm": 0.963873317146466, + "learning_rate": 7.501170102057691e-06, + "loss": 0.6112, + "step": 4833 + }, + { + "epoch": 0.5931288343558282, + "grad_norm": 0.9626485563879238, + "learning_rate": 7.497322192097021e-06, + "loss": 0.6411, + "step": 4834 + }, + { + "epoch": 0.5932515337423313, + "grad_norm": 0.9180410532610561, + "learning_rate": 7.493474677412795e-06, + "loss": 0.591, + "step": 4835 + }, + { + "epoch": 0.5933742331288343, + "grad_norm": 0.8247162013328834, + "learning_rate": 7.489627558612693e-06, + "loss": 0.5951, + "step": 4836 + }, + { + "epoch": 0.5934969325153374, + "grad_norm": 0.966769766401703, + "learning_rate": 7.485780836304337e-06, + "loss": 0.5892, + "step": 4837 + }, + { + "epoch": 0.5936196319018405, + "grad_norm": 0.9337152416655775, + "learning_rate": 7.481934511095286e-06, + "loss": 0.5567, + "step": 4838 + }, + { + "epoch": 0.5937423312883435, + "grad_norm": 0.9186371318826345, + "learning_rate": 7.478088583593026e-06, + "loss": 0.5522, + "step": 4839 + }, + { + "epoch": 0.5938650306748466, + "grad_norm": 0.983676165267744, + "learning_rate": 7.4742430544049945e-06, + "loss": 0.5284, + "step": 4840 + }, + { + "epoch": 0.5939877300613496, + "grad_norm": 0.8791928390994938, + "learning_rate": 7.4703979241385595e-06, + "loss": 0.5437, + "step": 4841 + }, + { + "epoch": 0.5941104294478528, + "grad_norm": 0.9798989771654298, + "learning_rate": 7.466553193401024e-06, + "loss": 0.5744, + "step": 4842 + }, + { + "epoch": 0.5942331288343559, + "grad_norm": 0.9178197254175285, + "learning_rate": 7.46270886279963e-06, + "loss": 0.5789, + "step": 4843 + }, + { + "epoch": 0.5943558282208589, + "grad_norm": 0.8798940402724457, + "learning_rate": 7.458864932941559e-06, + "loss": 0.6342, + "step": 4844 + }, + { + "epoch": 0.594478527607362, + "grad_norm": 0.8797777240573068, + "learning_rate": 7.4550214044339256e-06, + "loss": 0.5233, + "step": 4845 + }, + { + "epoch": 0.5946012269938651, + "grad_norm": 0.9842893212533531, + "learning_rate": 7.451178277883781e-06, + "loss": 0.5782, + "step": 4846 + }, + { + "epoch": 0.5947239263803681, + "grad_norm": 0.8762275080971014, + "learning_rate": 7.447335553898115e-06, + "loss": 0.5302, + "step": 4847 + }, + { + "epoch": 0.5948466257668712, + "grad_norm": 0.9454939972186805, + "learning_rate": 7.443493233083852e-06, + "loss": 0.528, + "step": 4848 + }, + { + "epoch": 0.5949693251533742, + "grad_norm": 0.8668460456978482, + "learning_rate": 7.439651316047856e-06, + "loss": 0.5528, + "step": 4849 + }, + { + "epoch": 0.5950920245398773, + "grad_norm": 0.9346455341067449, + "learning_rate": 7.435809803396923e-06, + "loss": 0.6429, + "step": 4850 + }, + { + "epoch": 0.5952147239263804, + "grad_norm": 0.9100761482898003, + "learning_rate": 7.431968695737786e-06, + "loss": 0.5389, + "step": 4851 + }, + { + "epoch": 0.5953374233128834, + "grad_norm": 0.9595968718755719, + "learning_rate": 7.428127993677116e-06, + "loss": 0.5846, + "step": 4852 + }, + { + "epoch": 0.5954601226993865, + "grad_norm": 0.9261181438591477, + "learning_rate": 7.424287697821517e-06, + "loss": 0.5806, + "step": 4853 + }, + { + "epoch": 0.5955828220858895, + "grad_norm": 0.8626275171349357, + "learning_rate": 7.420447808777533e-06, + "loss": 0.5591, + "step": 4854 + }, + { + "epoch": 0.5957055214723926, + "grad_norm": 0.8890295804501053, + "learning_rate": 7.416608327151642e-06, + "loss": 0.5617, + "step": 4855 + }, + { + "epoch": 0.5958282208588958, + "grad_norm": 1.0611681549549055, + "learning_rate": 7.412769253550255e-06, 
+ "loss": 0.5937, + "step": 4856 + }, + { + "epoch": 0.5959509202453988, + "grad_norm": 0.9365739605401477, + "learning_rate": 7.408930588579721e-06, + "loss": 0.5593, + "step": 4857 + }, + { + "epoch": 0.5960736196319019, + "grad_norm": 0.9544266258268116, + "learning_rate": 7.405092332846327e-06, + "loss": 0.5364, + "step": 4858 + }, + { + "epoch": 0.5961963190184049, + "grad_norm": 0.8936631733833516, + "learning_rate": 7.401254486956285e-06, + "loss": 0.5895, + "step": 4859 + }, + { + "epoch": 0.596319018404908, + "grad_norm": 0.8555473316327803, + "learning_rate": 7.397417051515758e-06, + "loss": 0.5895, + "step": 4860 + }, + { + "epoch": 0.5964417177914111, + "grad_norm": 0.8231343951582827, + "learning_rate": 7.393580027130835e-06, + "loss": 0.545, + "step": 4861 + }, + { + "epoch": 0.5965644171779141, + "grad_norm": 0.7963042814778739, + "learning_rate": 7.389743414407536e-06, + "loss": 0.571, + "step": 4862 + }, + { + "epoch": 0.5966871165644172, + "grad_norm": 0.9716221196484921, + "learning_rate": 7.385907213951826e-06, + "loss": 0.6124, + "step": 4863 + }, + { + "epoch": 0.5968098159509202, + "grad_norm": 1.0989621323366021, + "learning_rate": 7.382071426369597e-06, + "loss": 0.5468, + "step": 4864 + }, + { + "epoch": 0.5969325153374233, + "grad_norm": 0.8035444608802637, + "learning_rate": 7.37823605226668e-06, + "loss": 0.5425, + "step": 4865 + }, + { + "epoch": 0.5970552147239264, + "grad_norm": 0.867882907883842, + "learning_rate": 7.374401092248837e-06, + "loss": 0.6096, + "step": 4866 + }, + { + "epoch": 0.5971779141104294, + "grad_norm": 0.8427153467387455, + "learning_rate": 7.3705665469217715e-06, + "loss": 0.5911, + "step": 4867 + }, + { + "epoch": 0.5973006134969325, + "grad_norm": 0.8532631218076675, + "learning_rate": 7.3667324168911125e-06, + "loss": 0.5865, + "step": 4868 + }, + { + "epoch": 0.5974233128834355, + "grad_norm": 0.9779456929319045, + "learning_rate": 7.362898702762433e-06, + "loss": 0.5828, + "step": 4869 + }, + { + "epoch": 0.5975460122699386, + "grad_norm": 0.9110683630565615, + "learning_rate": 7.359065405141228e-06, + "loss": 0.5434, + "step": 4870 + }, + { + "epoch": 0.5976687116564418, + "grad_norm": 0.883517127056143, + "learning_rate": 7.35523252463294e-06, + "loss": 0.5595, + "step": 4871 + }, + { + "epoch": 0.5977914110429448, + "grad_norm": 0.8399364302464579, + "learning_rate": 7.351400061842935e-06, + "loss": 0.556, + "step": 4872 + }, + { + "epoch": 0.5979141104294479, + "grad_norm": 0.7822923183319365, + "learning_rate": 7.347568017376521e-06, + "loss": 0.5667, + "step": 4873 + }, + { + "epoch": 0.5980368098159509, + "grad_norm": 0.9290780459135278, + "learning_rate": 7.343736391838936e-06, + "loss": 0.5378, + "step": 4874 + }, + { + "epoch": 0.598159509202454, + "grad_norm": 0.8942995756377037, + "learning_rate": 7.33990518583535e-06, + "loss": 0.576, + "step": 4875 + }, + { + "epoch": 0.5982822085889571, + "grad_norm": 0.8461253013775148, + "learning_rate": 7.336074399970872e-06, + "loss": 0.586, + "step": 4876 + }, + { + "epoch": 0.5984049079754601, + "grad_norm": 1.151784642791541, + "learning_rate": 7.33224403485054e-06, + "loss": 0.593, + "step": 4877 + }, + { + "epoch": 0.5985276073619632, + "grad_norm": 0.8527041747300544, + "learning_rate": 7.328414091079327e-06, + "loss": 0.5848, + "step": 4878 + }, + { + "epoch": 0.5986503067484663, + "grad_norm": 0.9716353244595248, + "learning_rate": 7.324584569262141e-06, + "loss": 0.5473, + "step": 4879 + }, + { + "epoch": 0.5987730061349693, + "grad_norm": 0.9509900572117145, + 
"learning_rate": 7.320755470003822e-06, + "loss": 0.5764, + "step": 4880 + }, + { + "epoch": 0.5988957055214724, + "grad_norm": 0.8495375419843939, + "learning_rate": 7.316926793909143e-06, + "loss": 0.5637, + "step": 4881 + }, + { + "epoch": 0.5990184049079754, + "grad_norm": 0.8558640127399479, + "learning_rate": 7.313098541582808e-06, + "loss": 0.5601, + "step": 4882 + }, + { + "epoch": 0.5991411042944785, + "grad_norm": 0.887325902947137, + "learning_rate": 7.309270713629459e-06, + "loss": 0.5807, + "step": 4883 + }, + { + "epoch": 0.5992638036809816, + "grad_norm": 0.887114302535074, + "learning_rate": 7.305443310653668e-06, + "loss": 0.5289, + "step": 4884 + }, + { + "epoch": 0.5993865030674846, + "grad_norm": 0.9033172760031554, + "learning_rate": 7.301616333259942e-06, + "loss": 0.5174, + "step": 4885 + }, + { + "epoch": 0.5995092024539878, + "grad_norm": 0.8233550262375121, + "learning_rate": 7.297789782052716e-06, + "loss": 0.5733, + "step": 4886 + }, + { + "epoch": 0.5996319018404908, + "grad_norm": 0.9433946485829681, + "learning_rate": 7.293963657636363e-06, + "loss": 0.5953, + "step": 4887 + }, + { + "epoch": 0.5997546012269939, + "grad_norm": 1.015025360044391, + "learning_rate": 7.290137960615186e-06, + "loss": 0.5744, + "step": 4888 + }, + { + "epoch": 0.599877300613497, + "grad_norm": 0.8816772534933266, + "learning_rate": 7.2863126915934215e-06, + "loss": 0.5302, + "step": 4889 + }, + { + "epoch": 0.6, + "grad_norm": 0.9116181305863242, + "learning_rate": 7.282487851175237e-06, + "loss": 0.6388, + "step": 4890 + }, + { + "epoch": 0.6001226993865031, + "grad_norm": 0.7865520796965185, + "learning_rate": 7.278663439964735e-06, + "loss": 0.5664, + "step": 4891 + }, + { + "epoch": 0.6002453987730061, + "grad_norm": 0.894179090455795, + "learning_rate": 7.274839458565945e-06, + "loss": 0.5061, + "step": 4892 + }, + { + "epoch": 0.6003680981595092, + "grad_norm": 0.9728757722245269, + "learning_rate": 7.271015907582835e-06, + "loss": 0.5524, + "step": 4893 + }, + { + "epoch": 0.6004907975460123, + "grad_norm": 0.8398047547718338, + "learning_rate": 7.267192787619301e-06, + "loss": 0.5842, + "step": 4894 + }, + { + "epoch": 0.6006134969325153, + "grad_norm": 0.9575959899705923, + "learning_rate": 7.263370099279173e-06, + "loss": 0.5824, + "step": 4895 + }, + { + "epoch": 0.6007361963190184, + "grad_norm": 0.8859064673526172, + "learning_rate": 7.259547843166208e-06, + "loss": 0.5297, + "step": 4896 + }, + { + "epoch": 0.6008588957055214, + "grad_norm": 0.8929528174408927, + "learning_rate": 7.255726019884102e-06, + "loss": 0.5907, + "step": 4897 + }, + { + "epoch": 0.6009815950920245, + "grad_norm": 0.8040996025072934, + "learning_rate": 7.251904630036479e-06, + "loss": 0.5898, + "step": 4898 + }, + { + "epoch": 0.6011042944785276, + "grad_norm": 1.3528428711335017, + "learning_rate": 7.2480836742268955e-06, + "loss": 0.5084, + "step": 4899 + }, + { + "epoch": 0.6012269938650306, + "grad_norm": 0.9370005912424204, + "learning_rate": 7.244263153058835e-06, + "loss": 0.5703, + "step": 4900 + }, + { + "epoch": 0.6013496932515338, + "grad_norm": 0.9267248528196429, + "learning_rate": 7.240443067135718e-06, + "loss": 0.529, + "step": 4901 + }, + { + "epoch": 0.6014723926380368, + "grad_norm": 0.907146763311549, + "learning_rate": 7.236623417060894e-06, + "loss": 0.5149, + "step": 4902 + }, + { + "epoch": 0.6015950920245399, + "grad_norm": 0.8948546522860692, + "learning_rate": 7.232804203437645e-06, + "loss": 0.5428, + "step": 4903 + }, + { + "epoch": 0.601717791411043, + 
"grad_norm": 0.9611071071267715, + "learning_rate": 7.22898542686918e-06, + "loss": 0.5679, + "step": 4904 + }, + { + "epoch": 0.601840490797546, + "grad_norm": 0.9652732639883027, + "learning_rate": 7.225167087958647e-06, + "loss": 0.6197, + "step": 4905 + }, + { + "epoch": 0.6019631901840491, + "grad_norm": 0.905759910526631, + "learning_rate": 7.221349187309115e-06, + "loss": 0.5351, + "step": 4906 + }, + { + "epoch": 0.6020858895705522, + "grad_norm": 1.0224592705466793, + "learning_rate": 7.21753172552359e-06, + "loss": 0.6195, + "step": 4907 + }, + { + "epoch": 0.6022085889570552, + "grad_norm": 0.802659288582675, + "learning_rate": 7.213714703205008e-06, + "loss": 0.524, + "step": 4908 + }, + { + "epoch": 0.6023312883435583, + "grad_norm": 1.0738640755501696, + "learning_rate": 7.2098981209562334e-06, + "loss": 0.5692, + "step": 4909 + }, + { + "epoch": 0.6024539877300613, + "grad_norm": 1.0194490961883365, + "learning_rate": 7.2060819793800665e-06, + "loss": 0.5678, + "step": 4910 + }, + { + "epoch": 0.6025766871165644, + "grad_norm": 1.1579425938716401, + "learning_rate": 7.2022662790792304e-06, + "loss": 0.5771, + "step": 4911 + }, + { + "epoch": 0.6026993865030675, + "grad_norm": 0.9444410561621102, + "learning_rate": 7.198451020656383e-06, + "loss": 0.6589, + "step": 4912 + }, + { + "epoch": 0.6028220858895705, + "grad_norm": 0.8637684597341705, + "learning_rate": 7.194636204714112e-06, + "loss": 0.572, + "step": 4913 + }, + { + "epoch": 0.6029447852760736, + "grad_norm": 0.7440523744228622, + "learning_rate": 7.190821831854928e-06, + "loss": 0.4827, + "step": 4914 + }, + { + "epoch": 0.6030674846625766, + "grad_norm": 0.7905842157194493, + "learning_rate": 7.187007902681289e-06, + "loss": 0.549, + "step": 4915 + }, + { + "epoch": 0.6031901840490798, + "grad_norm": 0.8383451788634262, + "learning_rate": 7.183194417795569e-06, + "loss": 0.568, + "step": 4916 + }, + { + "epoch": 0.6033128834355829, + "grad_norm": 0.9158808551800187, + "learning_rate": 7.179381377800074e-06, + "loss": 0.5742, + "step": 4917 + }, + { + "epoch": 0.6034355828220859, + "grad_norm": 0.9437014541601904, + "learning_rate": 7.175568783297045e-06, + "loss": 0.5983, + "step": 4918 + }, + { + "epoch": 0.603558282208589, + "grad_norm": 0.8838635428459177, + "learning_rate": 7.171756634888641e-06, + "loss": 0.532, + "step": 4919 + }, + { + "epoch": 0.603680981595092, + "grad_norm": 0.9169195089941368, + "learning_rate": 7.16794493317696e-06, + "loss": 0.6008, + "step": 4920 + }, + { + "epoch": 0.6038036809815951, + "grad_norm": 0.972009357470945, + "learning_rate": 7.1641336787640315e-06, + "loss": 0.5811, + "step": 4921 + }, + { + "epoch": 0.6039263803680982, + "grad_norm": 0.9689257723627086, + "learning_rate": 7.160322872251807e-06, + "loss": 0.5226, + "step": 4922 + }, + { + "epoch": 0.6040490797546012, + "grad_norm": 0.9533433194724532, + "learning_rate": 7.156512514242172e-06, + "loss": 0.5629, + "step": 4923 + }, + { + "epoch": 0.6041717791411043, + "grad_norm": 0.9093255537174942, + "learning_rate": 7.152702605336939e-06, + "loss": 0.6003, + "step": 4924 + }, + { + "epoch": 0.6042944785276073, + "grad_norm": 0.8421219215722241, + "learning_rate": 7.148893146137852e-06, + "loss": 0.5609, + "step": 4925 + }, + { + "epoch": 0.6044171779141104, + "grad_norm": 0.9404302625333898, + "learning_rate": 7.1450841372465806e-06, + "loss": 0.5516, + "step": 4926 + }, + { + "epoch": 0.6045398773006135, + "grad_norm": 0.8420514012220138, + "learning_rate": 7.141275579264726e-06, + "loss": 0.5378, + "step": 4927 + }, 
+ { + "epoch": 0.6046625766871165, + "grad_norm": 0.8851715539899216, + "learning_rate": 7.137467472793815e-06, + "loss": 0.6136, + "step": 4928 + }, + { + "epoch": 0.6047852760736196, + "grad_norm": 0.8057566322175758, + "learning_rate": 7.133659818435308e-06, + "loss": 0.5607, + "step": 4929 + }, + { + "epoch": 0.6049079754601226, + "grad_norm": 0.8764734850647414, + "learning_rate": 7.129852616790594e-06, + "loss": 0.6068, + "step": 4930 + }, + { + "epoch": 0.6050306748466258, + "grad_norm": 0.9445712455162504, + "learning_rate": 7.12604586846098e-06, + "loss": 0.5506, + "step": 4931 + }, + { + "epoch": 0.6051533742331289, + "grad_norm": 0.9649881177854657, + "learning_rate": 7.122239574047713e-06, + "loss": 0.639, + "step": 4932 + }, + { + "epoch": 0.6052760736196319, + "grad_norm": 0.82028704039835, + "learning_rate": 7.118433734151966e-06, + "loss": 0.5555, + "step": 4933 + }, + { + "epoch": 0.605398773006135, + "grad_norm": 0.8411209044002799, + "learning_rate": 7.114628349374838e-06, + "loss": 0.6193, + "step": 4934 + }, + { + "epoch": 0.605521472392638, + "grad_norm": 0.8585277418912957, + "learning_rate": 7.110823420317356e-06, + "loss": 0.5532, + "step": 4935 + }, + { + "epoch": 0.6056441717791411, + "grad_norm": 0.9192406088966613, + "learning_rate": 7.107018947580475e-06, + "loss": 0.5465, + "step": 4936 + }, + { + "epoch": 0.6057668711656442, + "grad_norm": 0.9443973396167296, + "learning_rate": 7.10321493176508e-06, + "loss": 0.5652, + "step": 4937 + }, + { + "epoch": 0.6058895705521472, + "grad_norm": 0.8673746259541842, + "learning_rate": 7.099411373471983e-06, + "loss": 0.5393, + "step": 4938 + }, + { + "epoch": 0.6060122699386503, + "grad_norm": 0.8847474817371941, + "learning_rate": 7.095608273301922e-06, + "loss": 0.5777, + "step": 4939 + }, + { + "epoch": 0.6061349693251534, + "grad_norm": 1.0293454853564894, + "learning_rate": 7.091805631855566e-06, + "loss": 0.5236, + "step": 4940 + }, + { + "epoch": 0.6062576687116564, + "grad_norm": 0.918889509407867, + "learning_rate": 7.088003449733507e-06, + "loss": 0.5225, + "step": 4941 + }, + { + "epoch": 0.6063803680981595, + "grad_norm": 0.9363355755612764, + "learning_rate": 7.084201727536267e-06, + "loss": 0.5078, + "step": 4942 + }, + { + "epoch": 0.6065030674846625, + "grad_norm": 0.9513254520259745, + "learning_rate": 7.0804004658642975e-06, + "loss": 0.5928, + "step": 4943 + }, + { + "epoch": 0.6066257668711657, + "grad_norm": 0.8792662113284175, + "learning_rate": 7.076599665317975e-06, + "loss": 0.5385, + "step": 4944 + }, + { + "epoch": 0.6067484662576688, + "grad_norm": 1.0508912263879437, + "learning_rate": 7.072799326497603e-06, + "loss": 0.5561, + "step": 4945 + }, + { + "epoch": 0.6068711656441718, + "grad_norm": 0.8919730658825372, + "learning_rate": 7.068999450003411e-06, + "loss": 0.5232, + "step": 4946 + }, + { + "epoch": 0.6069938650306749, + "grad_norm": 0.9424355738003966, + "learning_rate": 7.065200036435558e-06, + "loss": 0.6416, + "step": 4947 + }, + { + "epoch": 0.6071165644171779, + "grad_norm": 0.8323358268158861, + "learning_rate": 7.061401086394131e-06, + "loss": 0.5816, + "step": 4948 + }, + { + "epoch": 0.607239263803681, + "grad_norm": 0.8327705993936422, + "learning_rate": 7.0576026004791345e-06, + "loss": 0.5326, + "step": 4949 + }, + { + "epoch": 0.6073619631901841, + "grad_norm": 0.8277087809779946, + "learning_rate": 7.053804579290513e-06, + "loss": 0.5091, + "step": 4950 + }, + { + "epoch": 0.6074846625766871, + "grad_norm": 0.9765296583951812, + "learning_rate": 
7.0500070234281294e-06, + "loss": 0.5376, + "step": 4951 + }, + { + "epoch": 0.6076073619631902, + "grad_norm": 0.9445104669633834, + "learning_rate": 7.0462099334917745e-06, + "loss": 0.5485, + "step": 4952 + }, + { + "epoch": 0.6077300613496932, + "grad_norm": 0.9336922256074871, + "learning_rate": 7.042413310081165e-06, + "loss": 0.5867, + "step": 4953 + }, + { + "epoch": 0.6078527607361963, + "grad_norm": 0.9044005106825075, + "learning_rate": 7.038617153795948e-06, + "loss": 0.5346, + "step": 4954 + }, + { + "epoch": 0.6079754601226994, + "grad_norm": 0.9340027697896024, + "learning_rate": 7.034821465235693e-06, + "loss": 0.5668, + "step": 4955 + }, + { + "epoch": 0.6080981595092024, + "grad_norm": 0.7981637006397949, + "learning_rate": 7.031026244999894e-06, + "loss": 0.5538, + "step": 4956 + }, + { + "epoch": 0.6082208588957055, + "grad_norm": 0.8999450636620447, + "learning_rate": 7.027231493687974e-06, + "loss": 0.5653, + "step": 4957 + }, + { + "epoch": 0.6083435582822085, + "grad_norm": 1.009620588935381, + "learning_rate": 7.023437211899285e-06, + "loss": 0.5928, + "step": 4958 + }, + { + "epoch": 0.6084662576687117, + "grad_norm": 0.8433144323941025, + "learning_rate": 7.019643400233096e-06, + "loss": 0.5655, + "step": 4959 + }, + { + "epoch": 0.6085889570552148, + "grad_norm": 0.8692573360020787, + "learning_rate": 7.0158500592886115e-06, + "loss": 0.5896, + "step": 4960 + }, + { + "epoch": 0.6087116564417178, + "grad_norm": 1.0153208465885608, + "learning_rate": 7.012057189664954e-06, + "loss": 0.5323, + "step": 4961 + }, + { + "epoch": 0.6088343558282209, + "grad_norm": 0.8231011585489263, + "learning_rate": 7.008264791961174e-06, + "loss": 0.542, + "step": 4962 + }, + { + "epoch": 0.6089570552147239, + "grad_norm": 0.8822373474219534, + "learning_rate": 7.0044728667762515e-06, + "loss": 0.557, + "step": 4963 + }, + { + "epoch": 0.609079754601227, + "grad_norm": 0.8796782465910279, + "learning_rate": 7.000681414709086e-06, + "loss": 0.5161, + "step": 4964 + }, + { + "epoch": 0.6092024539877301, + "grad_norm": 0.8129015453261629, + "learning_rate": 6.996890436358505e-06, + "loss": 0.5847, + "step": 4965 + }, + { + "epoch": 0.6093251533742331, + "grad_norm": 0.8279189389011009, + "learning_rate": 6.993099932323263e-06, + "loss": 0.5478, + "step": 4966 + }, + { + "epoch": 0.6094478527607362, + "grad_norm": 0.8187736471748275, + "learning_rate": 6.989309903202035e-06, + "loss": 0.5741, + "step": 4967 + }, + { + "epoch": 0.6095705521472392, + "grad_norm": 0.937360342921668, + "learning_rate": 6.985520349593424e-06, + "loss": 0.5454, + "step": 4968 + }, + { + "epoch": 0.6096932515337423, + "grad_norm": 0.799446383205667, + "learning_rate": 6.981731272095955e-06, + "loss": 0.5748, + "step": 4969 + }, + { + "epoch": 0.6098159509202454, + "grad_norm": 0.8864656258467195, + "learning_rate": 6.977942671308087e-06, + "loss": 0.5092, + "step": 4970 + }, + { + "epoch": 0.6099386503067484, + "grad_norm": 0.9579475144164902, + "learning_rate": 6.974154547828191e-06, + "loss": 0.5467, + "step": 4971 + }, + { + "epoch": 0.6100613496932515, + "grad_norm": 0.8761060941280991, + "learning_rate": 6.970366902254573e-06, + "loss": 0.5406, + "step": 4972 + }, + { + "epoch": 0.6101840490797547, + "grad_norm": 0.9000139683901738, + "learning_rate": 6.966579735185455e-06, + "loss": 0.5997, + "step": 4973 + }, + { + "epoch": 0.6103067484662577, + "grad_norm": 0.9173435708647494, + "learning_rate": 6.96279304721899e-06, + "loss": 0.5227, + "step": 4974 + }, + { + "epoch": 0.6104294478527608, + 
"grad_norm": 0.9100839764599116, + "learning_rate": 6.95900683895325e-06, + "loss": 0.5925, + "step": 4975 + }, + { + "epoch": 0.6105521472392638, + "grad_norm": 0.9600670828075654, + "learning_rate": 6.955221110986237e-06, + "loss": 0.5621, + "step": 4976 + }, + { + "epoch": 0.6106748466257669, + "grad_norm": 0.8277266483174558, + "learning_rate": 6.951435863915874e-06, + "loss": 0.5465, + "step": 4977 + }, + { + "epoch": 0.61079754601227, + "grad_norm": 0.8767135697782674, + "learning_rate": 6.947651098340007e-06, + "loss": 0.5104, + "step": 4978 + }, + { + "epoch": 0.610920245398773, + "grad_norm": 1.015025837259412, + "learning_rate": 6.943866814856408e-06, + "loss": 0.5788, + "step": 4979 + }, + { + "epoch": 0.6110429447852761, + "grad_norm": 1.0189804595720593, + "learning_rate": 6.9400830140627705e-06, + "loss": 0.5554, + "step": 4980 + }, + { + "epoch": 0.6111656441717791, + "grad_norm": 0.9213072422293656, + "learning_rate": 6.936299696556714e-06, + "loss": 0.6173, + "step": 4981 + }, + { + "epoch": 0.6112883435582822, + "grad_norm": 0.8650044115551536, + "learning_rate": 6.932516862935783e-06, + "loss": 0.5278, + "step": 4982 + }, + { + "epoch": 0.6114110429447853, + "grad_norm": 0.8401986146995086, + "learning_rate": 6.92873451379744e-06, + "loss": 0.5788, + "step": 4983 + }, + { + "epoch": 0.6115337423312883, + "grad_norm": 0.912545119401174, + "learning_rate": 6.924952649739077e-06, + "loss": 0.6071, + "step": 4984 + }, + { + "epoch": 0.6116564417177914, + "grad_norm": 0.9488728051630271, + "learning_rate": 6.921171271358007e-06, + "loss": 0.5564, + "step": 4985 + }, + { + "epoch": 0.6117791411042944, + "grad_norm": 0.8495258179146326, + "learning_rate": 6.9173903792514654e-06, + "loss": 0.5544, + "step": 4986 + }, + { + "epoch": 0.6119018404907975, + "grad_norm": 0.8439308720868912, + "learning_rate": 6.9136099740166105e-06, + "loss": 0.5866, + "step": 4987 + }, + { + "epoch": 0.6120245398773007, + "grad_norm": 0.7818016358545784, + "learning_rate": 6.909830056250527e-06, + "loss": 0.5007, + "step": 4988 + }, + { + "epoch": 0.6121472392638037, + "grad_norm": 0.852967843947834, + "learning_rate": 6.906050626550219e-06, + "loss": 0.5294, + "step": 4989 + }, + { + "epoch": 0.6122699386503068, + "grad_norm": 0.8682897035082975, + "learning_rate": 6.902271685512616e-06, + "loss": 0.528, + "step": 4990 + }, + { + "epoch": 0.6123926380368098, + "grad_norm": 0.8741988867963967, + "learning_rate": 6.898493233734571e-06, + "loss": 0.5079, + "step": 4991 + }, + { + "epoch": 0.6125153374233129, + "grad_norm": 0.872251694788611, + "learning_rate": 6.894715271812853e-06, + "loss": 0.5735, + "step": 4992 + }, + { + "epoch": 0.612638036809816, + "grad_norm": 0.8953865292541155, + "learning_rate": 6.89093780034416e-06, + "loss": 0.5338, + "step": 4993 + }, + { + "epoch": 0.612760736196319, + "grad_norm": 0.9418835864982833, + "learning_rate": 6.8871608199251135e-06, + "loss": 0.5408, + "step": 4994 + }, + { + "epoch": 0.6128834355828221, + "grad_norm": 0.8655687873265167, + "learning_rate": 6.883384331152254e-06, + "loss": 0.5677, + "step": 4995 + }, + { + "epoch": 0.6130061349693251, + "grad_norm": 1.2188040065743921, + "learning_rate": 6.879608334622046e-06, + "loss": 0.5499, + "step": 4996 + }, + { + "epoch": 0.6131288343558282, + "grad_norm": 0.8726505919040244, + "learning_rate": 6.875832830930872e-06, + "loss": 0.5563, + "step": 4997 + }, + { + "epoch": 0.6132515337423313, + "grad_norm": 0.8943192440290695, + "learning_rate": 6.872057820675047e-06, + "loss": 0.5501, + "step": 4998 + }, 
+ { + "epoch": 0.6133742331288343, + "grad_norm": 0.8744422255332062, + "learning_rate": 6.8682833044507955e-06, + "loss": 0.5458, + "step": 4999 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.9272269531921767, + "learning_rate": 6.864509282854272e-06, + "loss": 0.5244, + "step": 5000 + }, + { + "epoch": 0.6136196319018405, + "grad_norm": 1.0028484046317518, + "learning_rate": 6.860735756481553e-06, + "loss": 0.5775, + "step": 5001 + }, + { + "epoch": 0.6137423312883435, + "grad_norm": 0.9016777981718229, + "learning_rate": 6.8569627259286335e-06, + "loss": 0.5379, + "step": 5002 + }, + { + "epoch": 0.6138650306748467, + "grad_norm": 0.8454669837575917, + "learning_rate": 6.853190191791428e-06, + "loss": 0.5338, + "step": 5003 + }, + { + "epoch": 0.6139877300613497, + "grad_norm": 0.8404854403217523, + "learning_rate": 6.849418154665779e-06, + "loss": 0.5554, + "step": 5004 + }, + { + "epoch": 0.6141104294478528, + "grad_norm": 0.9672544581821766, + "learning_rate": 6.845646615147445e-06, + "loss": 0.6629, + "step": 5005 + }, + { + "epoch": 0.6142331288343559, + "grad_norm": 0.9695373217088706, + "learning_rate": 6.841875573832111e-06, + "loss": 0.6043, + "step": 5006 + }, + { + "epoch": 0.6143558282208589, + "grad_norm": 0.9488856199789801, + "learning_rate": 6.83810503131538e-06, + "loss": 0.5699, + "step": 5007 + }, + { + "epoch": 0.614478527607362, + "grad_norm": 1.3219701677944493, + "learning_rate": 6.834334988192775e-06, + "loss": 0.5748, + "step": 5008 + }, + { + "epoch": 0.614601226993865, + "grad_norm": 0.8322882296412084, + "learning_rate": 6.830565445059745e-06, + "loss": 0.5933, + "step": 5009 + }, + { + "epoch": 0.6147239263803681, + "grad_norm": 0.8282208187223873, + "learning_rate": 6.826796402511653e-06, + "loss": 0.5933, + "step": 5010 + }, + { + "epoch": 0.6148466257668712, + "grad_norm": 0.9088203600801623, + "learning_rate": 6.823027861143788e-06, + "loss": 0.507, + "step": 5011 + }, + { + "epoch": 0.6149693251533742, + "grad_norm": 0.9627645250309221, + "learning_rate": 6.819259821551361e-06, + "loss": 0.5884, + "step": 5012 + }, + { + "epoch": 0.6150920245398773, + "grad_norm": 0.8662605346790573, + "learning_rate": 6.815492284329499e-06, + "loss": 0.561, + "step": 5013 + }, + { + "epoch": 0.6152147239263803, + "grad_norm": 0.9739870664561039, + "learning_rate": 6.8117252500732525e-06, + "loss": 0.5679, + "step": 5014 + }, + { + "epoch": 0.6153374233128834, + "grad_norm": 0.9248041573809405, + "learning_rate": 6.8079587193775935e-06, + "loss": 0.6017, + "step": 5015 + }, + { + "epoch": 0.6154601226993865, + "grad_norm": 0.8198139716753875, + "learning_rate": 6.804192692837412e-06, + "loss": 0.5464, + "step": 5016 + }, + { + "epoch": 0.6155828220858895, + "grad_norm": 0.8616194811830175, + "learning_rate": 6.800427171047519e-06, + "loss": 0.4888, + "step": 5017 + }, + { + "epoch": 0.6157055214723927, + "grad_norm": 0.9085102565935903, + "learning_rate": 6.796662154602648e-06, + "loss": 0.6283, + "step": 5018 + }, + { + "epoch": 0.6158282208588957, + "grad_norm": 0.9337161294864456, + "learning_rate": 6.7928976440974504e-06, + "loss": 0.5732, + "step": 5019 + }, + { + "epoch": 0.6159509202453988, + "grad_norm": 0.8018512158739777, + "learning_rate": 6.789133640126498e-06, + "loss": 0.5136, + "step": 5020 + }, + { + "epoch": 0.6160736196319019, + "grad_norm": 0.9020597804028843, + "learning_rate": 6.785370143284285e-06, + "loss": 0.5893, + "step": 5021 + }, + { + "epoch": 0.6161963190184049, + "grad_norm": 0.8171238153370944, + "learning_rate": 
6.781607154165217e-06, + "loss": 0.5582, + "step": 5022 + }, + { + "epoch": 0.616319018404908, + "grad_norm": 0.9127730367313699, + "learning_rate": 6.7778446733636315e-06, + "loss": 0.5362, + "step": 5023 + }, + { + "epoch": 0.616441717791411, + "grad_norm": 0.9597681778188732, + "learning_rate": 6.774082701473774e-06, + "loss": 0.5198, + "step": 5024 + }, + { + "epoch": 0.6165644171779141, + "grad_norm": 0.927262690560231, + "learning_rate": 6.770321239089825e-06, + "loss": 0.5432, + "step": 5025 + }, + { + "epoch": 0.6166871165644172, + "grad_norm": 0.8993936591627013, + "learning_rate": 6.7665602868058696e-06, + "loss": 0.6199, + "step": 5026 + }, + { + "epoch": 0.6168098159509202, + "grad_norm": 3.256673873644574, + "learning_rate": 6.76279984521592e-06, + "loss": 0.6099, + "step": 5027 + }, + { + "epoch": 0.6169325153374233, + "grad_norm": 0.9193393721555627, + "learning_rate": 6.759039914913905e-06, + "loss": 0.5464, + "step": 5028 + }, + { + "epoch": 0.6170552147239263, + "grad_norm": 0.9029157646883106, + "learning_rate": 6.75528049649367e-06, + "loss": 0.5583, + "step": 5029 + }, + { + "epoch": 0.6171779141104294, + "grad_norm": 1.0480914699520072, + "learning_rate": 6.751521590548986e-06, + "loss": 0.5384, + "step": 5030 + }, + { + "epoch": 0.6173006134969325, + "grad_norm": 0.8432107481461367, + "learning_rate": 6.747763197673538e-06, + "loss": 0.5978, + "step": 5031 + }, + { + "epoch": 0.6174233128834355, + "grad_norm": 0.8260932438157024, + "learning_rate": 6.744005318460934e-06, + "loss": 0.5158, + "step": 5032 + }, + { + "epoch": 0.6175460122699387, + "grad_norm": 0.819690078371365, + "learning_rate": 6.740247953504697e-06, + "loss": 0.584, + "step": 5033 + }, + { + "epoch": 0.6176687116564418, + "grad_norm": 0.8848882260370962, + "learning_rate": 6.736491103398273e-06, + "loss": 0.5294, + "step": 5034 + }, + { + "epoch": 0.6177914110429448, + "grad_norm": 0.8481032882662013, + "learning_rate": 6.732734768735021e-06, + "loss": 0.5379, + "step": 5035 + }, + { + "epoch": 0.6179141104294479, + "grad_norm": 0.8278727697512438, + "learning_rate": 6.728978950108222e-06, + "loss": 0.6079, + "step": 5036 + }, + { + "epoch": 0.6180368098159509, + "grad_norm": 0.8737691594859666, + "learning_rate": 6.725223648111078e-06, + "loss": 0.5303, + "step": 5037 + }, + { + "epoch": 0.618159509202454, + "grad_norm": 0.7922963590377262, + "learning_rate": 6.721468863336704e-06, + "loss": 0.6049, + "step": 5038 + }, + { + "epoch": 0.6182822085889571, + "grad_norm": 0.9080445996798023, + "learning_rate": 6.717714596378138e-06, + "loss": 0.6279, + "step": 5039 + }, + { + "epoch": 0.6184049079754601, + "grad_norm": 0.8112706639855854, + "learning_rate": 6.713960847828335e-06, + "loss": 0.5766, + "step": 5040 + }, + { + "epoch": 0.6185276073619632, + "grad_norm": 0.8610906138638988, + "learning_rate": 6.710207618280162e-06, + "loss": 0.5458, + "step": 5041 + }, + { + "epoch": 0.6186503067484662, + "grad_norm": 0.9235564406417486, + "learning_rate": 6.706454908326414e-06, + "loss": 0.495, + "step": 5042 + }, + { + "epoch": 0.6187730061349693, + "grad_norm": 0.7959583458665483, + "learning_rate": 6.702702718559799e-06, + "loss": 0.6229, + "step": 5043 + }, + { + "epoch": 0.6188957055214724, + "grad_norm": 0.8498795112307176, + "learning_rate": 6.698951049572942e-06, + "loss": 0.5698, + "step": 5044 + }, + { + "epoch": 0.6190184049079754, + "grad_norm": 0.8873265633304861, + "learning_rate": 6.695199901958386e-06, + "loss": 0.499, + "step": 5045 + }, + { + "epoch": 0.6191411042944786, + "grad_norm": 
0.7810512238127565, + "learning_rate": 6.691449276308594e-06, + "loss": 0.5257, + "step": 5046 + }, + { + "epoch": 0.6192638036809816, + "grad_norm": 0.9262409474219487, + "learning_rate": 6.687699173215945e-06, + "loss": 0.5603, + "step": 5047 + }, + { + "epoch": 0.6193865030674847, + "grad_norm": 0.9868321680653064, + "learning_rate": 6.683949593272734e-06, + "loss": 0.5954, + "step": 5048 + }, + { + "epoch": 0.6195092024539878, + "grad_norm": 0.9851758330955999, + "learning_rate": 6.680200537071178e-06, + "loss": 0.5748, + "step": 5049 + }, + { + "epoch": 0.6196319018404908, + "grad_norm": 0.8679405338588927, + "learning_rate": 6.6764520052034054e-06, + "loss": 0.5601, + "step": 5050 + }, + { + "epoch": 0.6197546012269939, + "grad_norm": 0.8497416747678681, + "learning_rate": 6.672703998261465e-06, + "loss": 0.485, + "step": 5051 + }, + { + "epoch": 0.6198773006134969, + "grad_norm": 0.810560919926814, + "learning_rate": 6.668956516837322e-06, + "loss": 0.5595, + "step": 5052 + }, + { + "epoch": 0.62, + "grad_norm": 0.8970077590763583, + "learning_rate": 6.66520956152286e-06, + "loss": 0.5578, + "step": 5053 + }, + { + "epoch": 0.6201226993865031, + "grad_norm": 0.9776928923076248, + "learning_rate": 6.66146313290988e-06, + "loss": 0.5525, + "step": 5054 + }, + { + "epoch": 0.6202453987730061, + "grad_norm": 0.8526450766030655, + "learning_rate": 6.657717231590095e-06, + "loss": 0.5648, + "step": 5055 + }, + { + "epoch": 0.6203680981595092, + "grad_norm": 0.8648784804550131, + "learning_rate": 6.65397185815514e-06, + "loss": 0.5148, + "step": 5056 + }, + { + "epoch": 0.6204907975460122, + "grad_norm": 1.0239936220010413, + "learning_rate": 6.650227013196564e-06, + "loss": 0.5719, + "step": 5057 + }, + { + "epoch": 0.6206134969325153, + "grad_norm": 1.0614023110368236, + "learning_rate": 6.6464826973058314e-06, + "loss": 0.596, + "step": 5058 + }, + { + "epoch": 0.6207361963190184, + "grad_norm": 0.8731441265813166, + "learning_rate": 6.642738911074329e-06, + "loss": 0.5954, + "step": 5059 + }, + { + "epoch": 0.6208588957055214, + "grad_norm": 0.8495146291210899, + "learning_rate": 6.638995655093351e-06, + "loss": 0.5504, + "step": 5060 + }, + { + "epoch": 0.6209815950920246, + "grad_norm": 0.962528033146649, + "learning_rate": 6.635252929954114e-06, + "loss": 0.5644, + "step": 5061 + }, + { + "epoch": 0.6211042944785276, + "grad_norm": 0.9765124894793704, + "learning_rate": 6.631510736247749e-06, + "loss": 0.5363, + "step": 5062 + }, + { + "epoch": 0.6212269938650307, + "grad_norm": 0.8451700318293963, + "learning_rate": 6.6277690745653044e-06, + "loss": 0.5893, + "step": 5063 + }, + { + "epoch": 0.6213496932515338, + "grad_norm": 1.000898713593293, + "learning_rate": 6.624027945497742e-06, + "loss": 0.632, + "step": 5064 + }, + { + "epoch": 0.6214723926380368, + "grad_norm": 0.8876023305610731, + "learning_rate": 6.620287349635942e-06, + "loss": 0.4861, + "step": 5065 + }, + { + "epoch": 0.6215950920245399, + "grad_norm": 0.8954999357566813, + "learning_rate": 6.6165472875707005e-06, + "loss": 0.5711, + "step": 5066 + }, + { + "epoch": 0.621717791411043, + "grad_norm": 0.8674938507118719, + "learning_rate": 6.612807759892726e-06, + "loss": 0.5373, + "step": 5067 + }, + { + "epoch": 0.621840490797546, + "grad_norm": 0.8658863122447922, + "learning_rate": 6.609068767192646e-06, + "loss": 0.4981, + "step": 5068 + }, + { + "epoch": 0.6219631901840491, + "grad_norm": 0.8835664831127374, + "learning_rate": 6.605330310061e-06, + "loss": 0.5537, + "step": 5069 + }, + { + "epoch": 
0.6220858895705521, + "grad_norm": 0.9106337592331579, + "learning_rate": 6.601592389088251e-06, + "loss": 0.5232, + "step": 5070 + }, + { + "epoch": 0.6222085889570552, + "grad_norm": 0.8570834329386189, + "learning_rate": 6.5978550048647645e-06, + "loss": 0.5479, + "step": 5071 + }, + { + "epoch": 0.6223312883435583, + "grad_norm": 0.9018048470713798, + "learning_rate": 6.594118157980833e-06, + "loss": 0.5641, + "step": 5072 + }, + { + "epoch": 0.6224539877300613, + "grad_norm": 0.947077679951778, + "learning_rate": 6.5903818490266554e-06, + "loss": 0.605, + "step": 5073 + }, + { + "epoch": 0.6225766871165644, + "grad_norm": 0.8223270207744092, + "learning_rate": 6.586646078592353e-06, + "loss": 0.6118, + "step": 5074 + }, + { + "epoch": 0.6226993865030674, + "grad_norm": 0.9389856517390077, + "learning_rate": 6.582910847267957e-06, + "loss": 0.5539, + "step": 5075 + }, + { + "epoch": 0.6228220858895706, + "grad_norm": 0.832892170071392, + "learning_rate": 6.579176155643415e-06, + "loss": 0.6057, + "step": 5076 + }, + { + "epoch": 0.6229447852760737, + "grad_norm": 0.8869763873635569, + "learning_rate": 6.5754420043085894e-06, + "loss": 0.5956, + "step": 5077 + }, + { + "epoch": 0.6230674846625767, + "grad_norm": 1.0123517777212603, + "learning_rate": 6.57170839385326e-06, + "loss": 0.5734, + "step": 5078 + }, + { + "epoch": 0.6231901840490798, + "grad_norm": 0.8585072328520806, + "learning_rate": 6.567975324867113e-06, + "loss": 0.5664, + "step": 5079 + }, + { + "epoch": 0.6233128834355828, + "grad_norm": 0.8184261737006295, + "learning_rate": 6.564242797939759e-06, + "loss": 0.6025, + "step": 5080 + }, + { + "epoch": 0.6234355828220859, + "grad_norm": 0.8467751318185724, + "learning_rate": 6.560510813660719e-06, + "loss": 0.5696, + "step": 5081 + }, + { + "epoch": 0.623558282208589, + "grad_norm": 0.8344174999645735, + "learning_rate": 6.556779372619425e-06, + "loss": 0.6163, + "step": 5082 + }, + { + "epoch": 0.623680981595092, + "grad_norm": 0.8695643685375342, + "learning_rate": 6.5530484754052286e-06, + "loss": 0.5777, + "step": 5083 + }, + { + "epoch": 0.6238036809815951, + "grad_norm": 0.8897317519207163, + "learning_rate": 6.549318122607389e-06, + "loss": 0.5713, + "step": 5084 + }, + { + "epoch": 0.6239263803680981, + "grad_norm": 0.771392833144376, + "learning_rate": 6.545588314815088e-06, + "loss": 0.5293, + "step": 5085 + }, + { + "epoch": 0.6240490797546012, + "grad_norm": 0.99221051158562, + "learning_rate": 6.541859052617414e-06, + "loss": 0.5223, + "step": 5086 + }, + { + "epoch": 0.6241717791411043, + "grad_norm": 1.1732942581391212, + "learning_rate": 6.538130336603372e-06, + "loss": 0.5533, + "step": 5087 + }, + { + "epoch": 0.6242944785276073, + "grad_norm": 1.0047920011894906, + "learning_rate": 6.534402167361882e-06, + "loss": 0.5704, + "step": 5088 + }, + { + "epoch": 0.6244171779141104, + "grad_norm": 0.9946129126216652, + "learning_rate": 6.5306745454817766e-06, + "loss": 0.5749, + "step": 5089 + }, + { + "epoch": 0.6245398773006134, + "grad_norm": 0.9599684042048869, + "learning_rate": 6.526947471551799e-06, + "loss": 0.5811, + "step": 5090 + }, + { + "epoch": 0.6246625766871166, + "grad_norm": 1.1486585785992232, + "learning_rate": 6.5232209461606085e-06, + "loss": 0.5577, + "step": 5091 + }, + { + "epoch": 0.6247852760736197, + "grad_norm": 0.8859535560445545, + "learning_rate": 6.519494969896782e-06, + "loss": 0.5257, + "step": 5092 + }, + { + "epoch": 0.6249079754601227, + "grad_norm": 0.9990529928357026, + "learning_rate": 6.5157695433488e-06, + "loss": 
0.5444, + "step": 5093 + }, + { + "epoch": 0.6250306748466258, + "grad_norm": 0.912672214370086, + "learning_rate": 6.512044667105065e-06, + "loss": 0.5761, + "step": 5094 + }, + { + "epoch": 0.6251533742331289, + "grad_norm": 0.9739058410042669, + "learning_rate": 6.508320341753889e-06, + "loss": 0.5697, + "step": 5095 + }, + { + "epoch": 0.6252760736196319, + "grad_norm": 0.8670898526425432, + "learning_rate": 6.504596567883496e-06, + "loss": 0.5991, + "step": 5096 + }, + { + "epoch": 0.625398773006135, + "grad_norm": 0.937556752740679, + "learning_rate": 6.500873346082024e-06, + "loss": 0.537, + "step": 5097 + }, + { + "epoch": 0.625521472392638, + "grad_norm": 0.8354818704396555, + "learning_rate": 6.4971506769375245e-06, + "loss": 0.5695, + "step": 5098 + }, + { + "epoch": 0.6256441717791411, + "grad_norm": 0.8828700587242096, + "learning_rate": 6.4934285610379624e-06, + "loss": 0.4944, + "step": 5099 + }, + { + "epoch": 0.6257668711656442, + "grad_norm": 0.785218672911968, + "learning_rate": 6.489706998971212e-06, + "loss": 0.6041, + "step": 5100 + }, + { + "epoch": 0.6258895705521472, + "grad_norm": 1.0853757579183771, + "learning_rate": 6.485985991325064e-06, + "loss": 0.6085, + "step": 5101 + }, + { + "epoch": 0.6260122699386503, + "grad_norm": 0.8297938859055358, + "learning_rate": 6.482265538687217e-06, + "loss": 0.5379, + "step": 5102 + }, + { + "epoch": 0.6261349693251533, + "grad_norm": 1.191616567425193, + "learning_rate": 6.478545641645284e-06, + "loss": 0.6049, + "step": 5103 + }, + { + "epoch": 0.6262576687116564, + "grad_norm": 0.8836307385782646, + "learning_rate": 6.474826300786794e-06, + "loss": 0.5727, + "step": 5104 + }, + { + "epoch": 0.6263803680981596, + "grad_norm": 0.9035797347418074, + "learning_rate": 6.471107516699183e-06, + "loss": 0.528, + "step": 5105 + }, + { + "epoch": 0.6265030674846626, + "grad_norm": 0.9841698350008851, + "learning_rate": 6.467389289969801e-06, + "loss": 0.5739, + "step": 5106 + }, + { + "epoch": 0.6266257668711657, + "grad_norm": 0.8563667776831102, + "learning_rate": 6.4636716211859076e-06, + "loss": 0.5372, + "step": 5107 + }, + { + "epoch": 0.6267484662576687, + "grad_norm": 0.8977559829304693, + "learning_rate": 6.459954510934684e-06, + "loss": 0.5913, + "step": 5108 + }, + { + "epoch": 0.6268711656441718, + "grad_norm": 0.8780622621974686, + "learning_rate": 6.456237959803209e-06, + "loss": 0.5556, + "step": 5109 + }, + { + "epoch": 0.6269938650306749, + "grad_norm": 0.8529287849401092, + "learning_rate": 6.452521968378482e-06, + "loss": 0.5588, + "step": 5110 + }, + { + "epoch": 0.6271165644171779, + "grad_norm": 0.9007435421780846, + "learning_rate": 6.448806537247412e-06, + "loss": 0.5774, + "step": 5111 + }, + { + "epoch": 0.627239263803681, + "grad_norm": 0.9169069258114142, + "learning_rate": 6.44509166699682e-06, + "loss": 0.5793, + "step": 5112 + }, + { + "epoch": 0.627361963190184, + "grad_norm": 0.9251773764239402, + "learning_rate": 6.441377358213436e-06, + "loss": 0.6033, + "step": 5113 + }, + { + "epoch": 0.6274846625766871, + "grad_norm": 0.850377863886582, + "learning_rate": 6.437663611483905e-06, + "loss": 0.5925, + "step": 5114 + }, + { + "epoch": 0.6276073619631902, + "grad_norm": 0.9654649574467541, + "learning_rate": 6.4339504273947805e-06, + "loss": 0.6029, + "step": 5115 + }, + { + "epoch": 0.6277300613496932, + "grad_norm": 0.9282007202404221, + "learning_rate": 6.430237806532529e-06, + "loss": 0.5541, + "step": 5116 + }, + { + "epoch": 0.6278527607361963, + "grad_norm": 0.8571224443750577, + 
"learning_rate": 6.426525749483526e-06, + "loss": 0.4886, + "step": 5117 + }, + { + "epoch": 0.6279754601226993, + "grad_norm": 1.0682640362610647, + "learning_rate": 6.422814256834058e-06, + "loss": 0.5777, + "step": 5118 + }, + { + "epoch": 0.6280981595092024, + "grad_norm": 0.8432954269185569, + "learning_rate": 6.4191033291703245e-06, + "loss": 0.5383, + "step": 5119 + }, + { + "epoch": 0.6282208588957056, + "grad_norm": 0.8555554944054498, + "learning_rate": 6.415392967078438e-06, + "loss": 0.5979, + "step": 5120 + }, + { + "epoch": 0.6283435582822086, + "grad_norm": 0.9515495187961612, + "learning_rate": 6.4116831711444115e-06, + "loss": 0.5597, + "step": 5121 + }, + { + "epoch": 0.6284662576687117, + "grad_norm": 0.841276712902224, + "learning_rate": 6.407973941954179e-06, + "loss": 0.5486, + "step": 5122 + }, + { + "epoch": 0.6285889570552147, + "grad_norm": 0.7625335497239094, + "learning_rate": 6.404265280093581e-06, + "loss": 0.5676, + "step": 5123 + }, + { + "epoch": 0.6287116564417178, + "grad_norm": 0.8898971541026093, + "learning_rate": 6.400557186148371e-06, + "loss": 0.5379, + "step": 5124 + }, + { + "epoch": 0.6288343558282209, + "grad_norm": 0.8000686729218578, + "learning_rate": 6.396849660704205e-06, + "loss": 0.5817, + "step": 5125 + }, + { + "epoch": 0.6289570552147239, + "grad_norm": 0.840706214401779, + "learning_rate": 6.393142704346661e-06, + "loss": 0.5715, + "step": 5126 + }, + { + "epoch": 0.629079754601227, + "grad_norm": 0.8792731642517094, + "learning_rate": 6.389436317661217e-06, + "loss": 0.6246, + "step": 5127 + }, + { + "epoch": 0.6292024539877301, + "grad_norm": 0.8379522235044276, + "learning_rate": 6.385730501233267e-06, + "loss": 0.5886, + "step": 5128 + }, + { + "epoch": 0.6293251533742331, + "grad_norm": 1.079149580373205, + "learning_rate": 6.382025255648112e-06, + "loss": 0.5929, + "step": 5129 + }, + { + "epoch": 0.6294478527607362, + "grad_norm": 0.8715279181186211, + "learning_rate": 6.378320581490962e-06, + "loss": 0.566, + "step": 5130 + }, + { + "epoch": 0.6295705521472392, + "grad_norm": 1.0760504940892097, + "learning_rate": 6.374616479346943e-06, + "loss": 0.5305, + "step": 5131 + }, + { + "epoch": 0.6296932515337423, + "grad_norm": 1.0053071332002306, + "learning_rate": 6.370912949801083e-06, + "loss": 0.5636, + "step": 5132 + }, + { + "epoch": 0.6298159509202454, + "grad_norm": 1.0545647200264419, + "learning_rate": 6.367209993438319e-06, + "loss": 0.5619, + "step": 5133 + }, + { + "epoch": 0.6299386503067484, + "grad_norm": 0.9077942696579148, + "learning_rate": 6.363507610843504e-06, + "loss": 0.5871, + "step": 5134 + }, + { + "epoch": 0.6300613496932516, + "grad_norm": 0.8060510777249592, + "learning_rate": 6.3598058026013995e-06, + "loss": 0.5011, + "step": 5135 + }, + { + "epoch": 0.6301840490797546, + "grad_norm": 0.844243029139668, + "learning_rate": 6.356104569296675e-06, + "loss": 0.4987, + "step": 5136 + }, + { + "epoch": 0.6303067484662577, + "grad_norm": 0.9037102952281592, + "learning_rate": 6.352403911513907e-06, + "loss": 0.5493, + "step": 5137 + }, + { + "epoch": 0.6304294478527608, + "grad_norm": 1.3229753052855542, + "learning_rate": 6.3487038298375836e-06, + "loss": 0.5771, + "step": 5138 + }, + { + "epoch": 0.6305521472392638, + "grad_norm": 0.8746282544295517, + "learning_rate": 6.345004324852098e-06, + "loss": 0.5224, + "step": 5139 + }, + { + "epoch": 0.6306748466257669, + "grad_norm": 0.8662688442201931, + "learning_rate": 6.3413053971417575e-06, + "loss": 0.5688, + "step": 5140 + }, + { + "epoch": 
0.6307975460122699, + "grad_norm": 0.8848010614414739, + "learning_rate": 6.337607047290774e-06, + "loss": 0.5396, + "step": 5141 + }, + { + "epoch": 0.630920245398773, + "grad_norm": 0.9340755847461911, + "learning_rate": 6.333909275883271e-06, + "loss": 0.5999, + "step": 5142 + }, + { + "epoch": 0.6310429447852761, + "grad_norm": 0.941924860340035, + "learning_rate": 6.33021208350328e-06, + "loss": 0.5738, + "step": 5143 + }, + { + "epoch": 0.6311656441717791, + "grad_norm": 0.9328590808388891, + "learning_rate": 6.326515470734741e-06, + "loss": 0.5758, + "step": 5144 + }, + { + "epoch": 0.6312883435582822, + "grad_norm": 0.9218688041959155, + "learning_rate": 6.322819438161502e-06, + "loss": 0.6073, + "step": 5145 + }, + { + "epoch": 0.6314110429447852, + "grad_norm": 0.8552015008776449, + "learning_rate": 6.319123986367319e-06, + "loss": 0.5912, + "step": 5146 + }, + { + "epoch": 0.6315337423312883, + "grad_norm": 0.853026395087687, + "learning_rate": 6.315429115935858e-06, + "loss": 0.6231, + "step": 5147 + }, + { + "epoch": 0.6316564417177915, + "grad_norm": 0.9961953623459082, + "learning_rate": 6.3117348274506904e-06, + "loss": 0.5361, + "step": 5148 + }, + { + "epoch": 0.6317791411042945, + "grad_norm": 0.8275679360466112, + "learning_rate": 6.308041121495298e-06, + "loss": 0.5612, + "step": 5149 + }, + { + "epoch": 0.6319018404907976, + "grad_norm": 0.880070663877452, + "learning_rate": 6.304347998653074e-06, + "loss": 0.5654, + "step": 5150 + }, + { + "epoch": 0.6320245398773006, + "grad_norm": 0.8766242267033718, + "learning_rate": 6.300655459507309e-06, + "loss": 0.5718, + "step": 5151 + }, + { + "epoch": 0.6321472392638037, + "grad_norm": 0.8568754815017016, + "learning_rate": 6.296963504641209e-06, + "loss": 0.5356, + "step": 5152 + }, + { + "epoch": 0.6322699386503068, + "grad_norm": 0.9121319915316406, + "learning_rate": 6.293272134637889e-06, + "loss": 0.569, + "step": 5153 + }, + { + "epoch": 0.6323926380368098, + "grad_norm": 0.8835387725804008, + "learning_rate": 6.289581350080368e-06, + "loss": 0.5981, + "step": 5154 + }, + { + "epoch": 0.6325153374233129, + "grad_norm": 0.7579089089127218, + "learning_rate": 6.285891151551573e-06, + "loss": 0.5474, + "step": 5155 + }, + { + "epoch": 0.6326380368098159, + "grad_norm": 0.8676792336085611, + "learning_rate": 6.282201539634341e-06, + "loss": 0.5689, + "step": 5156 + }, + { + "epoch": 0.632760736196319, + "grad_norm": 0.791597497277152, + "learning_rate": 6.278512514911412e-06, + "loss": 0.5451, + "step": 5157 + }, + { + "epoch": 0.6328834355828221, + "grad_norm": 0.8199528280337123, + "learning_rate": 6.274824077965438e-06, + "loss": 0.5591, + "step": 5158 + }, + { + "epoch": 0.6330061349693251, + "grad_norm": 0.9063191566900685, + "learning_rate": 6.271136229378976e-06, + "loss": 0.564, + "step": 5159 + }, + { + "epoch": 0.6331288343558282, + "grad_norm": 0.8538212912051455, + "learning_rate": 6.267448969734486e-06, + "loss": 0.6112, + "step": 5160 + }, + { + "epoch": 0.6332515337423313, + "grad_norm": 0.9207296507114558, + "learning_rate": 6.263762299614344e-06, + "loss": 0.5344, + "step": 5161 + }, + { + "epoch": 0.6333742331288343, + "grad_norm": 1.0203474373648624, + "learning_rate": 6.260076219600823e-06, + "loss": 0.6172, + "step": 5162 + }, + { + "epoch": 0.6334969325153375, + "grad_norm": 0.8642243229713779, + "learning_rate": 6.256390730276114e-06, + "loss": 0.5449, + "step": 5163 + }, + { + "epoch": 0.6336196319018405, + "grad_norm": 1.0088937961367348, + "learning_rate": 6.252705832222303e-06, + "loss": 
0.5132, + "step": 5164 + }, + { + "epoch": 0.6337423312883436, + "grad_norm": 0.9673914854864989, + "learning_rate": 6.24902152602139e-06, + "loss": 0.5627, + "step": 5165 + }, + { + "epoch": 0.6338650306748467, + "grad_norm": 0.8553099411770287, + "learning_rate": 6.245337812255281e-06, + "loss": 0.5705, + "step": 5166 + }, + { + "epoch": 0.6339877300613497, + "grad_norm": 0.8833015159759668, + "learning_rate": 6.241654691505783e-06, + "loss": 0.5741, + "step": 5167 + }, + { + "epoch": 0.6341104294478528, + "grad_norm": 0.9596685035291217, + "learning_rate": 6.237972164354617e-06, + "loss": 0.5502, + "step": 5168 + }, + { + "epoch": 0.6342331288343558, + "grad_norm": 0.8097074450486521, + "learning_rate": 6.2342902313834065e-06, + "loss": 0.557, + "step": 5169 + }, + { + "epoch": 0.6343558282208589, + "grad_norm": 0.9005912757802748, + "learning_rate": 6.2306088931736766e-06, + "loss": 0.5908, + "step": 5170 + }, + { + "epoch": 0.634478527607362, + "grad_norm": 0.8735369520821304, + "learning_rate": 6.226928150306866e-06, + "loss": 0.5268, + "step": 5171 + }, + { + "epoch": 0.634601226993865, + "grad_norm": 0.891129544964529, + "learning_rate": 6.223248003364317e-06, + "loss": 0.5297, + "step": 5172 + }, + { + "epoch": 0.6347239263803681, + "grad_norm": 1.070506917312911, + "learning_rate": 6.219568452927277e-06, + "loss": 0.5953, + "step": 5173 + }, + { + "epoch": 0.6348466257668711, + "grad_norm": 0.8063861952098398, + "learning_rate": 6.215889499576898e-06, + "loss": 0.5969, + "step": 5174 + }, + { + "epoch": 0.6349693251533742, + "grad_norm": 1.0375588196450363, + "learning_rate": 6.21221114389424e-06, + "loss": 0.5832, + "step": 5175 + }, + { + "epoch": 0.6350920245398773, + "grad_norm": 0.9023626727383657, + "learning_rate": 6.208533386460269e-06, + "loss": 0.5462, + "step": 5176 + }, + { + "epoch": 0.6352147239263803, + "grad_norm": 1.0796182805048746, + "learning_rate": 6.204856227855853e-06, + "loss": 0.5663, + "step": 5177 + }, + { + "epoch": 0.6353374233128835, + "grad_norm": 0.9023927472291693, + "learning_rate": 6.201179668661768e-06, + "loss": 0.5897, + "step": 5178 + }, + { + "epoch": 0.6354601226993865, + "grad_norm": 0.9004501740033688, + "learning_rate": 6.197503709458696e-06, + "loss": 0.6135, + "step": 5179 + }, + { + "epoch": 0.6355828220858896, + "grad_norm": 0.8778569050472166, + "learning_rate": 6.193828350827222e-06, + "loss": 0.549, + "step": 5180 + }, + { + "epoch": 0.6357055214723927, + "grad_norm": 0.8327302056309065, + "learning_rate": 6.190153593347839e-06, + "loss": 0.5459, + "step": 5181 + }, + { + "epoch": 0.6358282208588957, + "grad_norm": 0.9224940932001452, + "learning_rate": 6.186479437600941e-06, + "loss": 0.6124, + "step": 5182 + }, + { + "epoch": 0.6359509202453988, + "grad_norm": 1.1779350717924721, + "learning_rate": 6.18280588416683e-06, + "loss": 0.5769, + "step": 5183 + }, + { + "epoch": 0.6360736196319018, + "grad_norm": 0.7857238513585045, + "learning_rate": 6.179132933625713e-06, + "loss": 0.5704, + "step": 5184 + }, + { + "epoch": 0.6361963190184049, + "grad_norm": 1.007505249181569, + "learning_rate": 6.175460586557701e-06, + "loss": 0.5861, + "step": 5185 + }, + { + "epoch": 0.636319018404908, + "grad_norm": 0.7860351055157793, + "learning_rate": 6.171788843542809e-06, + "loss": 0.5326, + "step": 5186 + }, + { + "epoch": 0.636441717791411, + "grad_norm": 0.8082738734511099, + "learning_rate": 6.168117705160956e-06, + "loss": 0.5564, + "step": 5187 + }, + { + "epoch": 0.6365644171779141, + "grad_norm": 0.8528397691835896, + 
"learning_rate": 6.1644471719919695e-06, + "loss": 0.5362, + "step": 5188 + }, + { + "epoch": 0.6366871165644172, + "grad_norm": 0.9119946133146491, + "learning_rate": 6.160777244615578e-06, + "loss": 0.5136, + "step": 5189 + }, + { + "epoch": 0.6368098159509202, + "grad_norm": 0.9688092520623429, + "learning_rate": 6.157107923611412e-06, + "loss": 0.6172, + "step": 5190 + }, + { + "epoch": 0.6369325153374233, + "grad_norm": 0.8663167847618725, + "learning_rate": 6.153439209559013e-06, + "loss": 0.5267, + "step": 5191 + }, + { + "epoch": 0.6370552147239263, + "grad_norm": 1.5716340206231458, + "learning_rate": 6.149771103037821e-06, + "loss": 0.5742, + "step": 5192 + }, + { + "epoch": 0.6371779141104295, + "grad_norm": 0.8872159991662576, + "learning_rate": 6.1461036046271824e-06, + "loss": 0.564, + "step": 5193 + }, + { + "epoch": 0.6373006134969326, + "grad_norm": 0.8673043117705134, + "learning_rate": 6.142436714906348e-06, + "loss": 0.5185, + "step": 5194 + }, + { + "epoch": 0.6374233128834356, + "grad_norm": 0.8578812869919099, + "learning_rate": 6.1387704344544684e-06, + "loss": 0.5886, + "step": 5195 + }, + { + "epoch": 0.6375460122699387, + "grad_norm": 0.8633800764998264, + "learning_rate": 6.135104763850605e-06, + "loss": 0.5842, + "step": 5196 + }, + { + "epoch": 0.6376687116564417, + "grad_norm": 0.8785278508692869, + "learning_rate": 6.1314397036737164e-06, + "loss": 0.5788, + "step": 5197 + }, + { + "epoch": 0.6377914110429448, + "grad_norm": 0.8803426511011209, + "learning_rate": 6.127775254502668e-06, + "loss": 0.5143, + "step": 5198 + }, + { + "epoch": 0.6379141104294479, + "grad_norm": 0.954326483969955, + "learning_rate": 6.12411141691623e-06, + "loss": 0.5441, + "step": 5199 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 0.773792939359086, + "learning_rate": 6.120448191493071e-06, + "loss": 0.5597, + "step": 5200 + }, + { + "epoch": 0.638159509202454, + "grad_norm": 0.8339336312641037, + "learning_rate": 6.1167855788117655e-06, + "loss": 0.5522, + "step": 5201 + }, + { + "epoch": 0.638282208588957, + "grad_norm": 0.8950225032296228, + "learning_rate": 6.113123579450795e-06, + "loss": 0.5473, + "step": 5202 + }, + { + "epoch": 0.6384049079754601, + "grad_norm": 1.1113143688335279, + "learning_rate": 6.1094621939885375e-06, + "loss": 0.569, + "step": 5203 + }, + { + "epoch": 0.6385276073619632, + "grad_norm": 0.8603042285317354, + "learning_rate": 6.1058014230032795e-06, + "loss": 0.493, + "step": 5204 + }, + { + "epoch": 0.6386503067484662, + "grad_norm": 0.9216263535438071, + "learning_rate": 6.102141267073207e-06, + "loss": 0.5875, + "step": 5205 + }, + { + "epoch": 0.6387730061349693, + "grad_norm": 1.0242722540287736, + "learning_rate": 6.0984817267764126e-06, + "loss": 0.5792, + "step": 5206 + }, + { + "epoch": 0.6388957055214723, + "grad_norm": 0.8826683677932767, + "learning_rate": 6.094822802690886e-06, + "loss": 0.5808, + "step": 5207 + }, + { + "epoch": 0.6390184049079755, + "grad_norm": 0.8481218020592338, + "learning_rate": 6.091164495394526e-06, + "loss": 0.5368, + "step": 5208 + }, + { + "epoch": 0.6391411042944786, + "grad_norm": 0.904224631348095, + "learning_rate": 6.087506805465127e-06, + "loss": 0.5583, + "step": 5209 + }, + { + "epoch": 0.6392638036809816, + "grad_norm": 0.8486011932625308, + "learning_rate": 6.083849733480394e-06, + "loss": 0.574, + "step": 5210 + }, + { + "epoch": 0.6393865030674847, + "grad_norm": 0.8907408213165094, + "learning_rate": 6.0801932800179275e-06, + "loss": 0.6142, + "step": 5211 + }, + { + "epoch": 
0.6395092024539877, + "grad_norm": 0.8707068300749278, + "learning_rate": 6.076537445655233e-06, + "loss": 0.5457, + "step": 5212 + }, + { + "epoch": 0.6396319018404908, + "grad_norm": 0.8978335529207517, + "learning_rate": 6.072882230969716e-06, + "loss": 0.5713, + "step": 5213 + }, + { + "epoch": 0.6397546012269939, + "grad_norm": 0.8346545982042347, + "learning_rate": 6.069227636538692e-06, + "loss": 0.5207, + "step": 5214 + }, + { + "epoch": 0.6398773006134969, + "grad_norm": 0.906038501929268, + "learning_rate": 6.065573662939367e-06, + "loss": 0.5332, + "step": 5215 + }, + { + "epoch": 0.64, + "grad_norm": 0.8430590952013903, + "learning_rate": 6.061920310748858e-06, + "loss": 0.5915, + "step": 5216 + }, + { + "epoch": 0.640122699386503, + "grad_norm": 0.8449231543670918, + "learning_rate": 6.058267580544175e-06, + "loss": 0.5552, + "step": 5217 + }, + { + "epoch": 0.6402453987730061, + "grad_norm": 0.8922306983937657, + "learning_rate": 6.054615472902247e-06, + "loss": 0.4989, + "step": 5218 + }, + { + "epoch": 0.6403680981595092, + "grad_norm": 0.8218792794429076, + "learning_rate": 6.050963988399883e-06, + "loss": 0.5421, + "step": 5219 + }, + { + "epoch": 0.6404907975460122, + "grad_norm": 0.9127898584430951, + "learning_rate": 6.047313127613808e-06, + "loss": 0.609, + "step": 5220 + }, + { + "epoch": 0.6406134969325153, + "grad_norm": 0.8532832904598825, + "learning_rate": 6.043662891120641e-06, + "loss": 0.5389, + "step": 5221 + }, + { + "epoch": 0.6407361963190185, + "grad_norm": 0.891256494275921, + "learning_rate": 6.040013279496908e-06, + "loss": 0.5113, + "step": 5222 + }, + { + "epoch": 0.6408588957055215, + "grad_norm": 0.7827564385941238, + "learning_rate": 6.036364293319033e-06, + "loss": 0.5374, + "step": 5223 + }, + { + "epoch": 0.6409815950920246, + "grad_norm": 0.9563537208150857, + "learning_rate": 6.03271593316334e-06, + "loss": 0.5442, + "step": 5224 + }, + { + "epoch": 0.6411042944785276, + "grad_norm": 0.9586244293380248, + "learning_rate": 6.0290681996060605e-06, + "loss": 0.57, + "step": 5225 + }, + { + "epoch": 0.6412269938650307, + "grad_norm": 0.912870872096735, + "learning_rate": 6.025421093223318e-06, + "loss": 0.5474, + "step": 5226 + }, + { + "epoch": 0.6413496932515338, + "grad_norm": 0.870401523068652, + "learning_rate": 6.021774614591146e-06, + "loss": 0.4921, + "step": 5227 + }, + { + "epoch": 0.6414723926380368, + "grad_norm": 0.8498787885500226, + "learning_rate": 6.018128764285471e-06, + "loss": 0.5458, + "step": 5228 + }, + { + "epoch": 0.6415950920245399, + "grad_norm": 0.915585859623708, + "learning_rate": 6.014483542882126e-06, + "loss": 0.5868, + "step": 5229 + }, + { + "epoch": 0.6417177914110429, + "grad_norm": 1.0224387254865408, + "learning_rate": 6.010838950956841e-06, + "loss": 0.5494, + "step": 5230 + }, + { + "epoch": 0.641840490797546, + "grad_norm": 0.8006409229164306, + "learning_rate": 6.007194989085247e-06, + "loss": 0.5813, + "step": 5231 + }, + { + "epoch": 0.6419631901840491, + "grad_norm": 0.8551703181600083, + "learning_rate": 6.00355165784288e-06, + "loss": 0.5566, + "step": 5232 + }, + { + "epoch": 0.6420858895705521, + "grad_norm": 0.9170186128794378, + "learning_rate": 5.999908957805169e-06, + "loss": 0.5919, + "step": 5233 + }, + { + "epoch": 0.6422085889570552, + "grad_norm": 0.8107859262429036, + "learning_rate": 5.9962668895474486e-06, + "loss": 0.5655, + "step": 5234 + }, + { + "epoch": 0.6423312883435582, + "grad_norm": 0.8175243285853814, + "learning_rate": 5.992625453644953e-06, + "loss": 0.5598, + "step": 
5235 + }, + { + "epoch": 0.6424539877300613, + "grad_norm": 0.965252986711724, + "learning_rate": 5.988984650672813e-06, + "loss": 0.566, + "step": 5236 + }, + { + "epoch": 0.6425766871165645, + "grad_norm": 0.8680024417806051, + "learning_rate": 5.985344481206067e-06, + "loss": 0.5883, + "step": 5237 + }, + { + "epoch": 0.6426993865030675, + "grad_norm": 0.9179304734236956, + "learning_rate": 5.9817049458196444e-06, + "loss": 0.5645, + "step": 5238 + }, + { + "epoch": 0.6428220858895706, + "grad_norm": 0.905310596254269, + "learning_rate": 5.9780660450883785e-06, + "loss": 0.5802, + "step": 5239 + }, + { + "epoch": 0.6429447852760736, + "grad_norm": 0.9095300994882807, + "learning_rate": 5.974427779587004e-06, + "loss": 0.5635, + "step": 5240 + }, + { + "epoch": 0.6430674846625767, + "grad_norm": 0.9385571490838592, + "learning_rate": 5.970790149890156e-06, + "loss": 0.5955, + "step": 5241 + }, + { + "epoch": 0.6431901840490798, + "grad_norm": 0.8582006003341163, + "learning_rate": 5.967153156572363e-06, + "loss": 0.5303, + "step": 5242 + }, + { + "epoch": 0.6433128834355828, + "grad_norm": 0.8753831958930149, + "learning_rate": 5.963516800208056e-06, + "loss": 0.5974, + "step": 5243 + }, + { + "epoch": 0.6434355828220859, + "grad_norm": 0.9750480075994766, + "learning_rate": 5.95988108137157e-06, + "loss": 0.5297, + "step": 5244 + }, + { + "epoch": 0.6435582822085889, + "grad_norm": 0.9299063843579549, + "learning_rate": 5.9562460006371295e-06, + "loss": 0.4987, + "step": 5245 + }, + { + "epoch": 0.643680981595092, + "grad_norm": 0.9101019931396316, + "learning_rate": 5.952611558578871e-06, + "loss": 0.5932, + "step": 5246 + }, + { + "epoch": 0.6438036809815951, + "grad_norm": 1.0147417079724455, + "learning_rate": 5.948977755770822e-06, + "loss": 0.5778, + "step": 5247 + }, + { + "epoch": 0.6439263803680981, + "grad_norm": 0.8448044586129928, + "learning_rate": 5.94534459278691e-06, + "loss": 0.5741, + "step": 5248 + }, + { + "epoch": 0.6440490797546012, + "grad_norm": 0.8192240730332445, + "learning_rate": 5.9417120702009604e-06, + "loss": 0.5056, + "step": 5249 + }, + { + "epoch": 0.6441717791411042, + "grad_norm": 0.8794643852279302, + "learning_rate": 5.938080188586699e-06, + "loss": 0.5635, + "step": 5250 + }, + { + "epoch": 0.6442944785276074, + "grad_norm": 1.4348934213194227, + "learning_rate": 5.93444894851775e-06, + "loss": 0.6471, + "step": 5251 + }, + { + "epoch": 0.6444171779141105, + "grad_norm": 0.8741445947985661, + "learning_rate": 5.930818350567639e-06, + "loss": 0.6332, + "step": 5252 + }, + { + "epoch": 0.6445398773006135, + "grad_norm": 0.8366338445244722, + "learning_rate": 5.9271883953097855e-06, + "loss": 0.5804, + "step": 5253 + }, + { + "epoch": 0.6446625766871166, + "grad_norm": 0.7802083238442286, + "learning_rate": 5.92355908331751e-06, + "loss": 0.6017, + "step": 5254 + }, + { + "epoch": 0.6447852760736197, + "grad_norm": 0.9080299982764314, + "learning_rate": 5.919930415164033e-06, + "loss": 0.5642, + "step": 5255 + }, + { + "epoch": 0.6449079754601227, + "grad_norm": 0.9191775218680266, + "learning_rate": 5.916302391422468e-06, + "loss": 0.5495, + "step": 5256 + }, + { + "epoch": 0.6450306748466258, + "grad_norm": 1.0376251611319767, + "learning_rate": 5.912675012665833e-06, + "loss": 0.6195, + "step": 5257 + }, + { + "epoch": 0.6451533742331288, + "grad_norm": 0.8284785927373994, + "learning_rate": 5.90904827946704e-06, + "loss": 0.6285, + "step": 5258 + }, + { + "epoch": 0.6452760736196319, + "grad_norm": 0.9002043890566288, + "learning_rate": 
5.905422192398902e-06, + "loss": 0.5572, + "step": 5259 + }, + { + "epoch": 0.645398773006135, + "grad_norm": 0.8941739491612938, + "learning_rate": 5.901796752034128e-06, + "loss": 0.5895, + "step": 5260 + }, + { + "epoch": 0.645521472392638, + "grad_norm": 0.8186596214662912, + "learning_rate": 5.898171958945322e-06, + "loss": 0.5247, + "step": 5261 + }, + { + "epoch": 0.6456441717791411, + "grad_norm": 0.8999413005165515, + "learning_rate": 5.89454781370499e-06, + "loss": 0.5682, + "step": 5262 + }, + { + "epoch": 0.6457668711656441, + "grad_norm": 0.8567173290553713, + "learning_rate": 5.890924316885538e-06, + "loss": 0.5611, + "step": 5263 + }, + { + "epoch": 0.6458895705521472, + "grad_norm": 0.9780163752230666, + "learning_rate": 5.887301469059262e-06, + "loss": 0.55, + "step": 5264 + }, + { + "epoch": 0.6460122699386504, + "grad_norm": 0.8479419967648003, + "learning_rate": 5.883679270798363e-06, + "loss": 0.5527, + "step": 5265 + }, + { + "epoch": 0.6461349693251534, + "grad_norm": 0.9039386906935717, + "learning_rate": 5.880057722674933e-06, + "loss": 0.5558, + "step": 5266 + }, + { + "epoch": 0.6462576687116565, + "grad_norm": 0.8905491679604634, + "learning_rate": 5.876436825260967e-06, + "loss": 0.5446, + "step": 5267 + }, + { + "epoch": 0.6463803680981595, + "grad_norm": 0.8557261243488119, + "learning_rate": 5.8728165791283535e-06, + "loss": 0.538, + "step": 5268 + }, + { + "epoch": 0.6465030674846626, + "grad_norm": 0.8537732822737143, + "learning_rate": 5.869196984848879e-06, + "loss": 0.6049, + "step": 5269 + }, + { + "epoch": 0.6466257668711657, + "grad_norm": 0.9640684630923009, + "learning_rate": 5.865578042994227e-06, + "loss": 0.5365, + "step": 5270 + }, + { + "epoch": 0.6467484662576687, + "grad_norm": 0.8637749632109902, + "learning_rate": 5.8619597541359796e-06, + "loss": 0.5774, + "step": 5271 + }, + { + "epoch": 0.6468711656441718, + "grad_norm": 1.1205284484538172, + "learning_rate": 5.8583421188456125e-06, + "loss": 0.5908, + "step": 5272 + }, + { + "epoch": 0.6469938650306748, + "grad_norm": 0.8673230952191076, + "learning_rate": 5.854725137694501e-06, + "loss": 0.6033, + "step": 5273 + }, + { + "epoch": 0.6471165644171779, + "grad_norm": 0.8920068141078038, + "learning_rate": 5.851108811253918e-06, + "loss": 0.5986, + "step": 5274 + }, + { + "epoch": 0.647239263803681, + "grad_norm": 0.9275687158796954, + "learning_rate": 5.84749314009503e-06, + "loss": 0.5542, + "step": 5275 + }, + { + "epoch": 0.647361963190184, + "grad_norm": 0.9368771109799087, + "learning_rate": 5.8438781247889e-06, + "loss": 0.5799, + "step": 5276 + }, + { + "epoch": 0.6474846625766871, + "grad_norm": 0.9678189263115888, + "learning_rate": 5.84026376590649e-06, + "loss": 0.5834, + "step": 5277 + }, + { + "epoch": 0.6476073619631901, + "grad_norm": 0.8823045937721264, + "learning_rate": 5.836650064018657e-06, + "loss": 0.5468, + "step": 5278 + }, + { + "epoch": 0.6477300613496932, + "grad_norm": 0.8667811450719533, + "learning_rate": 5.8330370196961506e-06, + "loss": 0.5742, + "step": 5279 + }, + { + "epoch": 0.6478527607361964, + "grad_norm": 0.9576270079528981, + "learning_rate": 5.829424633509627e-06, + "loss": 0.5603, + "step": 5280 + }, + { + "epoch": 0.6479754601226994, + "grad_norm": 0.9033030701317698, + "learning_rate": 5.8258129060296235e-06, + "loss": 0.5756, + "step": 5281 + }, + { + "epoch": 0.6480981595092025, + "grad_norm": 0.9312149089729184, + "learning_rate": 5.822201837826589e-06, + "loss": 0.5319, + "step": 5282 + }, + { + "epoch": 0.6482208588957056, + "grad_norm": 
0.9579522896894629, + "learning_rate": 5.818591429470853e-06, + "loss": 0.5634, + "step": 5283 + }, + { + "epoch": 0.6483435582822086, + "grad_norm": 0.8725817293989765, + "learning_rate": 5.814981681532657e-06, + "loss": 0.5713, + "step": 5284 + }, + { + "epoch": 0.6484662576687117, + "grad_norm": 0.772704895597652, + "learning_rate": 5.8113725945821245e-06, + "loss": 0.607, + "step": 5285 + }, + { + "epoch": 0.6485889570552147, + "grad_norm": 0.8537868821607896, + "learning_rate": 5.807764169189276e-06, + "loss": 0.5458, + "step": 5286 + }, + { + "epoch": 0.6487116564417178, + "grad_norm": 0.8825628051523888, + "learning_rate": 5.8041564059240405e-06, + "loss": 0.5397, + "step": 5287 + }, + { + "epoch": 0.6488343558282209, + "grad_norm": 1.0394263950968374, + "learning_rate": 5.800549305356224e-06, + "loss": 0.6214, + "step": 5288 + }, + { + "epoch": 0.6489570552147239, + "grad_norm": 0.9267326496087107, + "learning_rate": 5.796942868055546e-06, + "loss": 0.576, + "step": 5289 + }, + { + "epoch": 0.649079754601227, + "grad_norm": 0.847704452622528, + "learning_rate": 5.7933370945916036e-06, + "loss": 0.577, + "step": 5290 + }, + { + "epoch": 0.64920245398773, + "grad_norm": 0.8848644642074146, + "learning_rate": 5.789731985533906e-06, + "loss": 0.5341, + "step": 5291 + }, + { + "epoch": 0.6493251533742331, + "grad_norm": 0.9241706426430109, + "learning_rate": 5.786127541451841e-06, + "loss": 0.582, + "step": 5292 + }, + { + "epoch": 0.6494478527607362, + "grad_norm": 1.0097046556415497, + "learning_rate": 5.782523762914707e-06, + "loss": 0.5775, + "step": 5293 + }, + { + "epoch": 0.6495705521472392, + "grad_norm": 0.785818707958983, + "learning_rate": 5.7789206504916815e-06, + "loss": 0.4839, + "step": 5294 + }, + { + "epoch": 0.6496932515337424, + "grad_norm": 0.9084150120176895, + "learning_rate": 5.775318204751854e-06, + "loss": 0.5957, + "step": 5295 + }, + { + "epoch": 0.6498159509202454, + "grad_norm": 0.8121582592825418, + "learning_rate": 5.771716426264193e-06, + "loss": 0.5589, + "step": 5296 + }, + { + "epoch": 0.6499386503067485, + "grad_norm": 0.8874212154486337, + "learning_rate": 5.7681153155975755e-06, + "loss": 0.5574, + "step": 5297 + }, + { + "epoch": 0.6500613496932516, + "grad_norm": 1.0335834614666104, + "learning_rate": 5.764514873320761e-06, + "loss": 0.5672, + "step": 5298 + }, + { + "epoch": 0.6501840490797546, + "grad_norm": 0.9378138999309461, + "learning_rate": 5.760915100002407e-06, + "loss": 0.5115, + "step": 5299 + }, + { + "epoch": 0.6503067484662577, + "grad_norm": 0.9123272523439955, + "learning_rate": 5.757315996211066e-06, + "loss": 0.5874, + "step": 5300 + }, + { + "epoch": 0.6504294478527607, + "grad_norm": 0.8191213037998452, + "learning_rate": 5.75371756251519e-06, + "loss": 0.5833, + "step": 5301 + }, + { + "epoch": 0.6505521472392638, + "grad_norm": 0.8507166701925492, + "learning_rate": 5.750119799483123e-06, + "loss": 0.5482, + "step": 5302 + }, + { + "epoch": 0.6506748466257669, + "grad_norm": 0.8796839380632371, + "learning_rate": 5.746522707683093e-06, + "loss": 0.6107, + "step": 5303 + }, + { + "epoch": 0.6507975460122699, + "grad_norm": 0.8179561400153316, + "learning_rate": 5.742926287683237e-06, + "loss": 0.5471, + "step": 5304 + }, + { + "epoch": 0.650920245398773, + "grad_norm": 0.9618276753993911, + "learning_rate": 5.7393305400515755e-06, + "loss": 0.5447, + "step": 5305 + }, + { + "epoch": 0.651042944785276, + "grad_norm": 0.8462008772252385, + "learning_rate": 5.735735465356021e-06, + "loss": 0.5535, + "step": 5306 + }, + { + 
"epoch": 0.6511656441717791, + "grad_norm": 0.8659748130056476, + "learning_rate": 5.7321410641643934e-06, + "loss": 0.5597, + "step": 5307 + }, + { + "epoch": 0.6512883435582822, + "grad_norm": 1.1050891011166566, + "learning_rate": 5.72854733704439e-06, + "loss": 0.5851, + "step": 5308 + }, + { + "epoch": 0.6514110429447852, + "grad_norm": 0.9042080828564203, + "learning_rate": 5.724954284563615e-06, + "loss": 0.5457, + "step": 5309 + }, + { + "epoch": 0.6515337423312884, + "grad_norm": 0.9824806771740037, + "learning_rate": 5.721361907289556e-06, + "loss": 0.5449, + "step": 5310 + }, + { + "epoch": 0.6516564417177914, + "grad_norm": 0.8994276788337894, + "learning_rate": 5.7177702057896015e-06, + "loss": 0.5941, + "step": 5311 + }, + { + "epoch": 0.6517791411042945, + "grad_norm": 1.1141008512079689, + "learning_rate": 5.714179180631024e-06, + "loss": 0.5835, + "step": 5312 + }, + { + "epoch": 0.6519018404907976, + "grad_norm": 0.924593088931173, + "learning_rate": 5.710588832381003e-06, + "loss": 0.5262, + "step": 5313 + }, + { + "epoch": 0.6520245398773006, + "grad_norm": 1.0599728451663255, + "learning_rate": 5.706999161606597e-06, + "loss": 0.5523, + "step": 5314 + }, + { + "epoch": 0.6521472392638037, + "grad_norm": 0.9824618410134808, + "learning_rate": 5.703410168874768e-06, + "loss": 0.5568, + "step": 5315 + }, + { + "epoch": 0.6522699386503068, + "grad_norm": 0.8138474432632337, + "learning_rate": 5.699821854752365e-06, + "loss": 0.4805, + "step": 5316 + }, + { + "epoch": 0.6523926380368098, + "grad_norm": 0.8649505711367934, + "learning_rate": 5.6962342198061275e-06, + "loss": 0.564, + "step": 5317 + }, + { + "epoch": 0.6525153374233129, + "grad_norm": 0.8709109469072662, + "learning_rate": 5.692647264602698e-06, + "loss": 0.6008, + "step": 5318 + }, + { + "epoch": 0.6526380368098159, + "grad_norm": 1.1573384445694488, + "learning_rate": 5.689060989708599e-06, + "loss": 0.5489, + "step": 5319 + }, + { + "epoch": 0.652760736196319, + "grad_norm": 0.8985175641979422, + "learning_rate": 5.685475395690259e-06, + "loss": 0.579, + "step": 5320 + }, + { + "epoch": 0.6528834355828221, + "grad_norm": 0.8983852115744112, + "learning_rate": 5.681890483113982e-06, + "loss": 0.5453, + "step": 5321 + }, + { + "epoch": 0.6530061349693251, + "grad_norm": 1.1118489967970848, + "learning_rate": 5.6783062525459845e-06, + "loss": 0.552, + "step": 5322 + }, + { + "epoch": 0.6531288343558282, + "grad_norm": 0.8601020358301087, + "learning_rate": 5.6747227045523555e-06, + "loss": 0.5562, + "step": 5323 + }, + { + "epoch": 0.6532515337423312, + "grad_norm": 0.8561728614110385, + "learning_rate": 5.671139839699096e-06, + "loss": 0.5836, + "step": 5324 + }, + { + "epoch": 0.6533742331288344, + "grad_norm": 0.8670683946273918, + "learning_rate": 5.667557658552078e-06, + "loss": 0.5883, + "step": 5325 + }, + { + "epoch": 0.6534969325153375, + "grad_norm": 0.8839761712974847, + "learning_rate": 5.663976161677085e-06, + "loss": 0.5304, + "step": 5326 + }, + { + "epoch": 0.6536196319018405, + "grad_norm": 0.8298256908017851, + "learning_rate": 5.660395349639776e-06, + "loss": 0.5525, + "step": 5327 + }, + { + "epoch": 0.6537423312883436, + "grad_norm": 0.8818186283981619, + "learning_rate": 5.656815223005714e-06, + "loss": 0.5449, + "step": 5328 + }, + { + "epoch": 0.6538650306748466, + "grad_norm": 0.7796233566674894, + "learning_rate": 5.653235782340351e-06, + "loss": 0.553, + "step": 5329 + }, + { + "epoch": 0.6539877300613497, + "grad_norm": 0.9363297007584743, + "learning_rate": 5.649657028209024e-06, 
+ "loss": 0.5671, + "step": 5330 + }, + { + "epoch": 0.6541104294478528, + "grad_norm": 0.8564408260787274, + "learning_rate": 5.646078961176971e-06, + "loss": 0.5713, + "step": 5331 + }, + { + "epoch": 0.6542331288343558, + "grad_norm": 0.9241012435410372, + "learning_rate": 5.642501581809312e-06, + "loss": 0.5293, + "step": 5332 + }, + { + "epoch": 0.6543558282208589, + "grad_norm": 0.9104323846877959, + "learning_rate": 5.638924890671069e-06, + "loss": 0.5575, + "step": 5333 + }, + { + "epoch": 0.6544785276073619, + "grad_norm": 0.8815976062530241, + "learning_rate": 5.635348888327148e-06, + "loss": 0.5896, + "step": 5334 + }, + { + "epoch": 0.654601226993865, + "grad_norm": 0.8621911200201399, + "learning_rate": 5.631773575342343e-06, + "loss": 0.5383, + "step": 5335 + }, + { + "epoch": 0.6547239263803681, + "grad_norm": 0.8773598483690144, + "learning_rate": 5.62819895228135e-06, + "loss": 0.6313, + "step": 5336 + }, + { + "epoch": 0.6548466257668711, + "grad_norm": 0.8739737865396848, + "learning_rate": 5.624625019708745e-06, + "loss": 0.5426, + "step": 5337 + }, + { + "epoch": 0.6549693251533742, + "grad_norm": 0.9138311992789179, + "learning_rate": 5.621051778189004e-06, + "loss": 0.6338, + "step": 5338 + }, + { + "epoch": 0.6550920245398772, + "grad_norm": 0.8813121046631149, + "learning_rate": 5.6174792282864865e-06, + "loss": 0.4987, + "step": 5339 + }, + { + "epoch": 0.6552147239263804, + "grad_norm": 0.8652049864896024, + "learning_rate": 5.61390737056545e-06, + "loss": 0.5156, + "step": 5340 + }, + { + "epoch": 0.6553374233128835, + "grad_norm": 0.9229163049004202, + "learning_rate": 5.610336205590033e-06, + "loss": 0.5905, + "step": 5341 + }, + { + "epoch": 0.6554601226993865, + "grad_norm": 0.8634246343928723, + "learning_rate": 5.6067657339242785e-06, + "loss": 0.5521, + "step": 5342 + }, + { + "epoch": 0.6555828220858896, + "grad_norm": 1.0182019946053247, + "learning_rate": 5.6031959561321026e-06, + "loss": 0.6073, + "step": 5343 + }, + { + "epoch": 0.6557055214723926, + "grad_norm": 0.9505535985406276, + "learning_rate": 5.599626872777329e-06, + "loss": 0.572, + "step": 5344 + }, + { + "epoch": 0.6558282208588957, + "grad_norm": 0.8607937446503354, + "learning_rate": 5.5960584844236565e-06, + "loss": 0.5537, + "step": 5345 + }, + { + "epoch": 0.6559509202453988, + "grad_norm": 0.8142569741226627, + "learning_rate": 5.592490791634689e-06, + "loss": 0.5656, + "step": 5346 + }, + { + "epoch": 0.6560736196319018, + "grad_norm": 0.8818528608614034, + "learning_rate": 5.588923794973908e-06, + "loss": 0.5171, + "step": 5347 + }, + { + "epoch": 0.6561963190184049, + "grad_norm": 0.7803143852451753, + "learning_rate": 5.585357495004689e-06, + "loss": 0.4778, + "step": 5348 + }, + { + "epoch": 0.656319018404908, + "grad_norm": 0.8535833231326172, + "learning_rate": 5.581791892290302e-06, + "loss": 0.5654, + "step": 5349 + }, + { + "epoch": 0.656441717791411, + "grad_norm": 0.86711250992231, + "learning_rate": 5.5782269873939e-06, + "loss": 0.5579, + "step": 5350 + }, + { + "epoch": 0.6565644171779141, + "grad_norm": 0.8002364720431162, + "learning_rate": 5.574662780878533e-06, + "loss": 0.5565, + "step": 5351 + }, + { + "epoch": 0.6566871165644171, + "grad_norm": 0.825023753244557, + "learning_rate": 5.571099273307131e-06, + "loss": 0.5159, + "step": 5352 + }, + { + "epoch": 0.6568098159509203, + "grad_norm": 0.9080306786244683, + "learning_rate": 5.567536465242527e-06, + "loss": 0.5749, + "step": 5353 + }, + { + "epoch": 0.6569325153374234, + "grad_norm": 0.9084847229760455, + 
"learning_rate": 5.563974357247428e-06, + "loss": 0.5128, + "step": 5354 + }, + { + "epoch": 0.6570552147239264, + "grad_norm": 0.9201434736118238, + "learning_rate": 5.560412949884442e-06, + "loss": 0.5658, + "step": 5355 + }, + { + "epoch": 0.6571779141104295, + "grad_norm": 0.9055276687931254, + "learning_rate": 5.556852243716065e-06, + "loss": 0.5215, + "step": 5356 + }, + { + "epoch": 0.6573006134969325, + "grad_norm": 0.8818664964989544, + "learning_rate": 5.553292239304675e-06, + "loss": 0.5615, + "step": 5357 + }, + { + "epoch": 0.6574233128834356, + "grad_norm": 0.9622052970290303, + "learning_rate": 5.549732937212551e-06, + "loss": 0.5899, + "step": 5358 + }, + { + "epoch": 0.6575460122699387, + "grad_norm": 1.0937752999866417, + "learning_rate": 5.546174338001846e-06, + "loss": 0.5525, + "step": 5359 + }, + { + "epoch": 0.6576687116564417, + "grad_norm": 0.8951333854423674, + "learning_rate": 5.542616442234618e-06, + "loss": 0.5375, + "step": 5360 + }, + { + "epoch": 0.6577914110429448, + "grad_norm": 1.107466264285304, + "learning_rate": 5.539059250472798e-06, + "loss": 0.5324, + "step": 5361 + }, + { + "epoch": 0.6579141104294478, + "grad_norm": 1.2434882879238423, + "learning_rate": 5.535502763278222e-06, + "loss": 0.5508, + "step": 5362 + }, + { + "epoch": 0.6580368098159509, + "grad_norm": 0.8473789832676505, + "learning_rate": 5.531946981212599e-06, + "loss": 0.5401, + "step": 5363 + }, + { + "epoch": 0.658159509202454, + "grad_norm": 0.8995955214550436, + "learning_rate": 5.5283919048375425e-06, + "loss": 0.5846, + "step": 5364 + }, + { + "epoch": 0.658282208588957, + "grad_norm": 0.8222085377680476, + "learning_rate": 5.52483753471454e-06, + "loss": 0.5604, + "step": 5365 + }, + { + "epoch": 0.6584049079754601, + "grad_norm": 1.066918206955465, + "learning_rate": 5.521283871404972e-06, + "loss": 0.6276, + "step": 5366 + }, + { + "epoch": 0.6585276073619631, + "grad_norm": 1.018813699281511, + "learning_rate": 5.517730915470115e-06, + "loss": 0.5807, + "step": 5367 + }, + { + "epoch": 0.6586503067484663, + "grad_norm": 1.1345296995409135, + "learning_rate": 5.5141786674711226e-06, + "loss": 0.5519, + "step": 5368 + }, + { + "epoch": 0.6587730061349694, + "grad_norm": 0.9313876753870801, + "learning_rate": 5.510627127969048e-06, + "loss": 0.5805, + "step": 5369 + }, + { + "epoch": 0.6588957055214724, + "grad_norm": 0.7719953242916838, + "learning_rate": 5.507076297524818e-06, + "loss": 0.6039, + "step": 5370 + }, + { + "epoch": 0.6590184049079755, + "grad_norm": 0.9342836349293634, + "learning_rate": 5.5035261766992655e-06, + "loss": 0.5552, + "step": 5371 + }, + { + "epoch": 0.6591411042944785, + "grad_norm": 0.8399723301533912, + "learning_rate": 5.4999767660530925e-06, + "loss": 0.4573, + "step": 5372 + }, + { + "epoch": 0.6592638036809816, + "grad_norm": 0.948341891364281, + "learning_rate": 5.496428066146906e-06, + "loss": 0.5348, + "step": 5373 + }, + { + "epoch": 0.6593865030674847, + "grad_norm": 0.9263855851714058, + "learning_rate": 5.492880077541184e-06, + "loss": 0.6031, + "step": 5374 + }, + { + "epoch": 0.6595092024539877, + "grad_norm": 0.8361702121819732, + "learning_rate": 5.48933280079631e-06, + "loss": 0.5741, + "step": 5375 + }, + { + "epoch": 0.6596319018404908, + "grad_norm": 0.8571658578728201, + "learning_rate": 5.485786236472542e-06, + "loss": 0.5208, + "step": 5376 + }, + { + "epoch": 0.6597546012269939, + "grad_norm": 0.9090958140954718, + "learning_rate": 5.482240385130022e-06, + "loss": 0.6011, + "step": 5377 + }, + { + "epoch": 
0.6598773006134969, + "grad_norm": 0.8548847499947853, + "learning_rate": 5.478695247328798e-06, + "loss": 0.5521, + "step": 5378 + }, + { + "epoch": 0.66, + "grad_norm": 0.8881119960784644, + "learning_rate": 5.475150823628786e-06, + "loss": 0.5298, + "step": 5379 + }, + { + "epoch": 0.660122699386503, + "grad_norm": 1.0641842160479815, + "learning_rate": 5.471607114589806e-06, + "loss": 0.5824, + "step": 5380 + }, + { + "epoch": 0.6602453987730061, + "grad_norm": 0.9052416739201684, + "learning_rate": 5.468064120771544e-06, + "loss": 0.5369, + "step": 5381 + }, + { + "epoch": 0.6603680981595093, + "grad_norm": 1.0445488191707006, + "learning_rate": 5.464521842733594e-06, + "loss": 0.5953, + "step": 5382 + }, + { + "epoch": 0.6604907975460123, + "grad_norm": 0.9924777323792325, + "learning_rate": 5.460980281035432e-06, + "loss": 0.5259, + "step": 5383 + }, + { + "epoch": 0.6606134969325154, + "grad_norm": 1.0048413350076089, + "learning_rate": 5.457439436236407e-06, + "loss": 0.595, + "step": 5384 + }, + { + "epoch": 0.6607361963190184, + "grad_norm": 0.8282431722820721, + "learning_rate": 5.453899308895774e-06, + "loss": 0.5448, + "step": 5385 + }, + { + "epoch": 0.6608588957055215, + "grad_norm": 1.0547457535456934, + "learning_rate": 5.450359899572657e-06, + "loss": 0.5573, + "step": 5386 + }, + { + "epoch": 0.6609815950920246, + "grad_norm": 0.8916621752915113, + "learning_rate": 5.446821208826083e-06, + "loss": 0.578, + "step": 5387 + }, + { + "epoch": 0.6611042944785276, + "grad_norm": 0.8394628175286576, + "learning_rate": 5.4432832372149525e-06, + "loss": 0.5769, + "step": 5388 + }, + { + "epoch": 0.6612269938650307, + "grad_norm": 0.8691007319783273, + "learning_rate": 5.439745985298064e-06, + "loss": 0.5711, + "step": 5389 + }, + { + "epoch": 0.6613496932515337, + "grad_norm": 0.9361279975211095, + "learning_rate": 5.436209453634087e-06, + "loss": 0.5471, + "step": 5390 + }, + { + "epoch": 0.6614723926380368, + "grad_norm": 1.0029282820626582, + "learning_rate": 5.432673642781595e-06, + "loss": 0.5907, + "step": 5391 + }, + { + "epoch": 0.6615950920245399, + "grad_norm": 0.8566233599862448, + "learning_rate": 5.429138553299031e-06, + "loss": 0.607, + "step": 5392 + }, + { + "epoch": 0.6617177914110429, + "grad_norm": 0.97789854090709, + "learning_rate": 5.4256041857447415e-06, + "loss": 0.6052, + "step": 5393 + }, + { + "epoch": 0.661840490797546, + "grad_norm": 0.9680004118027647, + "learning_rate": 5.422070540676941e-06, + "loss": 0.5254, + "step": 5394 + }, + { + "epoch": 0.661963190184049, + "grad_norm": 0.8862117090555398, + "learning_rate": 5.418537618653743e-06, + "loss": 0.5947, + "step": 5395 + }, + { + "epoch": 0.6620858895705521, + "grad_norm": 0.7710146064634665, + "learning_rate": 5.415005420233141e-06, + "loss": 0.5626, + "step": 5396 + }, + { + "epoch": 0.6622085889570553, + "grad_norm": 0.9043298046609558, + "learning_rate": 5.411473945973012e-06, + "loss": 0.5613, + "step": 5397 + }, + { + "epoch": 0.6623312883435583, + "grad_norm": 0.8200208800153027, + "learning_rate": 5.407943196431127e-06, + "loss": 0.4981, + "step": 5398 + }, + { + "epoch": 0.6624539877300614, + "grad_norm": 1.2473088865615647, + "learning_rate": 5.404413172165133e-06, + "loss": 0.5661, + "step": 5399 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 0.8614319845551449, + "learning_rate": 5.400883873732574e-06, + "loss": 0.6108, + "step": 5400 + }, + { + "epoch": 0.6626993865030675, + "grad_norm": 0.916725173657014, + "learning_rate": 5.397355301690863e-06, + "loss": 0.5181, + "step": 
5401 + }, + { + "epoch": 0.6628220858895706, + "grad_norm": 0.8885061431629392, + "learning_rate": 5.393827456597317e-06, + "loss": 0.5869, + "step": 5402 + }, + { + "epoch": 0.6629447852760736, + "grad_norm": 0.8888261437549699, + "learning_rate": 5.39030033900912e-06, + "loss": 0.587, + "step": 5403 + }, + { + "epoch": 0.6630674846625767, + "grad_norm": 0.7304620193389448, + "learning_rate": 5.386773949483357e-06, + "loss": 0.5529, + "step": 5404 + }, + { + "epoch": 0.6631901840490797, + "grad_norm": 0.9580088851299714, + "learning_rate": 5.3832482885769855e-06, + "loss": 0.6086, + "step": 5405 + }, + { + "epoch": 0.6633128834355828, + "grad_norm": 0.859334421869225, + "learning_rate": 5.37972335684686e-06, + "loss": 0.5072, + "step": 5406 + }, + { + "epoch": 0.6634355828220859, + "grad_norm": 0.8050284269993048, + "learning_rate": 5.376199154849708e-06, + "loss": 0.5548, + "step": 5407 + }, + { + "epoch": 0.6635582822085889, + "grad_norm": 0.7526077962960801, + "learning_rate": 5.372675683142146e-06, + "loss": 0.5298, + "step": 5408 + }, + { + "epoch": 0.663680981595092, + "grad_norm": 0.9314424029067235, + "learning_rate": 5.3691529422806796e-06, + "loss": 0.5621, + "step": 5409 + }, + { + "epoch": 0.6638036809815951, + "grad_norm": 0.811554157901308, + "learning_rate": 5.365630932821688e-06, + "loss": 0.526, + "step": 5410 + }, + { + "epoch": 0.6639263803680981, + "grad_norm": 0.8567103241484881, + "learning_rate": 5.362109655321455e-06, + "loss": 0.5718, + "step": 5411 + }, + { + "epoch": 0.6640490797546013, + "grad_norm": 0.8070572119584347, + "learning_rate": 5.358589110336125e-06, + "loss": 0.6147, + "step": 5412 + }, + { + "epoch": 0.6641717791411043, + "grad_norm": 0.9363360874353385, + "learning_rate": 5.355069298421747e-06, + "loss": 0.5416, + "step": 5413 + }, + { + "epoch": 0.6642944785276074, + "grad_norm": 0.8936694032439612, + "learning_rate": 5.35155022013424e-06, + "loss": 0.516, + "step": 5414 + }, + { + "epoch": 0.6644171779141105, + "grad_norm": 0.941078304474227, + "learning_rate": 5.3480318760294084e-06, + "loss": 0.5332, + "step": 5415 + }, + { + "epoch": 0.6645398773006135, + "grad_norm": 1.2282131507613923, + "learning_rate": 5.3445142666629525e-06, + "loss": 0.5884, + "step": 5416 + }, + { + "epoch": 0.6646625766871166, + "grad_norm": 0.7808755180919851, + "learning_rate": 5.340997392590439e-06, + "loss": 0.5624, + "step": 5417 + }, + { + "epoch": 0.6647852760736196, + "grad_norm": 0.7709983987543917, + "learning_rate": 5.337481254367338e-06, + "loss": 0.5861, + "step": 5418 + }, + { + "epoch": 0.6649079754601227, + "grad_norm": 0.9939712182927603, + "learning_rate": 5.333965852548984e-06, + "loss": 0.5278, + "step": 5419 + }, + { + "epoch": 0.6650306748466258, + "grad_norm": 0.8603063823099275, + "learning_rate": 5.330451187690614e-06, + "loss": 0.5487, + "step": 5420 + }, + { + "epoch": 0.6651533742331288, + "grad_norm": 0.9193312383175253, + "learning_rate": 5.326937260347329e-06, + "loss": 0.5685, + "step": 5421 + }, + { + "epoch": 0.6652760736196319, + "grad_norm": 0.8912029085917327, + "learning_rate": 5.3234240710741335e-06, + "loss": 0.5326, + "step": 5422 + }, + { + "epoch": 0.6653987730061349, + "grad_norm": 0.9264752191247219, + "learning_rate": 5.319911620425897e-06, + "loss": 0.5735, + "step": 5423 + }, + { + "epoch": 0.665521472392638, + "grad_norm": 0.8900713430918263, + "learning_rate": 5.316399908957388e-06, + "loss": 0.5719, + "step": 5424 + }, + { + "epoch": 0.6656441717791411, + "grad_norm": 0.887968192963091, + "learning_rate": 
5.3128889372232436e-06, + "loss": 0.5859, + "step": 5425 + }, + { + "epoch": 0.6657668711656441, + "grad_norm": 0.9505517476651273, + "learning_rate": 5.309378705778e-06, + "loss": 0.5166, + "step": 5426 + }, + { + "epoch": 0.6658895705521473, + "grad_norm": 0.7578795114020261, + "learning_rate": 5.3058692151760635e-06, + "loss": 0.5691, + "step": 5427 + }, + { + "epoch": 0.6660122699386503, + "grad_norm": 0.8630362020258802, + "learning_rate": 5.302360465971725e-06, + "loss": 0.5856, + "step": 5428 + }, + { + "epoch": 0.6661349693251534, + "grad_norm": 0.8666511240459114, + "learning_rate": 5.298852458719168e-06, + "loss": 0.569, + "step": 5429 + }, + { + "epoch": 0.6662576687116565, + "grad_norm": 0.8623385585686365, + "learning_rate": 5.295345193972445e-06, + "loss": 0.5553, + "step": 5430 + }, + { + "epoch": 0.6663803680981595, + "grad_norm": 0.8059784749987836, + "learning_rate": 5.291838672285506e-06, + "loss": 0.5248, + "step": 5431 + }, + { + "epoch": 0.6665030674846626, + "grad_norm": 0.8830914495550747, + "learning_rate": 5.288332894212168e-06, + "loss": 0.5424, + "step": 5432 + }, + { + "epoch": 0.6666257668711656, + "grad_norm": 0.8853337944652794, + "learning_rate": 5.284827860306146e-06, + "loss": 0.6001, + "step": 5433 + }, + { + "epoch": 0.6667484662576687, + "grad_norm": 0.8523332792512284, + "learning_rate": 5.281323571121023e-06, + "loss": 0.5765, + "step": 5434 + }, + { + "epoch": 0.6668711656441718, + "grad_norm": 0.8483687842010841, + "learning_rate": 5.277820027210279e-06, + "loss": 0.5181, + "step": 5435 + }, + { + "epoch": 0.6669938650306748, + "grad_norm": 0.9203007920902101, + "learning_rate": 5.274317229127259e-06, + "loss": 0.5437, + "step": 5436 + }, + { + "epoch": 0.6671165644171779, + "grad_norm": 0.8899302569947607, + "learning_rate": 5.270815177425207e-06, + "loss": 0.4945, + "step": 5437 + }, + { + "epoch": 0.6672392638036809, + "grad_norm": 0.8806031478252908, + "learning_rate": 5.267313872657242e-06, + "loss": 0.6019, + "step": 5438 + }, + { + "epoch": 0.667361963190184, + "grad_norm": 0.8283473094759024, + "learning_rate": 5.263813315376362e-06, + "loss": 0.5603, + "step": 5439 + }, + { + "epoch": 0.6674846625766871, + "grad_norm": 0.8889799603564084, + "learning_rate": 5.260313506135452e-06, + "loss": 0.5761, + "step": 5440 + }, + { + "epoch": 0.6676073619631901, + "grad_norm": 0.8748667058558659, + "learning_rate": 5.256814445487275e-06, + "loss": 0.517, + "step": 5441 + }, + { + "epoch": 0.6677300613496933, + "grad_norm": 0.915664703639981, + "learning_rate": 5.253316133984481e-06, + "loss": 0.5837, + "step": 5442 + }, + { + "epoch": 0.6678527607361964, + "grad_norm": 0.9318118888059854, + "learning_rate": 5.249818572179592e-06, + "loss": 0.5594, + "step": 5443 + }, + { + "epoch": 0.6679754601226994, + "grad_norm": 0.9163067357521029, + "learning_rate": 5.246321760625025e-06, + "loss": 0.5716, + "step": 5444 + }, + { + "epoch": 0.6680981595092025, + "grad_norm": 0.8661441262667908, + "learning_rate": 5.242825699873068e-06, + "loss": 0.5757, + "step": 5445 + }, + { + "epoch": 0.6682208588957055, + "grad_norm": 0.8816321939470337, + "learning_rate": 5.23933039047589e-06, + "loss": 0.5829, + "step": 5446 + }, + { + "epoch": 0.6683435582822086, + "grad_norm": 0.836557985543053, + "learning_rate": 5.235835832985552e-06, + "loss": 0.5503, + "step": 5447 + }, + { + "epoch": 0.6684662576687117, + "grad_norm": 0.8360587472249933, + "learning_rate": 5.232342027953983e-06, + "loss": 0.559, + "step": 5448 + }, + { + "epoch": 0.6685889570552147, + "grad_norm": 
1.0067619519017743, + "learning_rate": 5.228848975933005e-06, + "loss": 0.5841, + "step": 5449 + }, + { + "epoch": 0.6687116564417178, + "grad_norm": 0.9671397595313375, + "learning_rate": 5.225356677474309e-06, + "loss": 0.5305, + "step": 5450 + }, + { + "epoch": 0.6688343558282208, + "grad_norm": 0.8276805247226594, + "learning_rate": 5.221865133129482e-06, + "loss": 0.5333, + "step": 5451 + }, + { + "epoch": 0.6689570552147239, + "grad_norm": 0.7982910467877341, + "learning_rate": 5.218374343449976e-06, + "loss": 0.5575, + "step": 5452 + }, + { + "epoch": 0.669079754601227, + "grad_norm": 0.900904924056677, + "learning_rate": 5.214884308987136e-06, + "loss": 0.5745, + "step": 5453 + }, + { + "epoch": 0.66920245398773, + "grad_norm": 1.19654567351164, + "learning_rate": 5.211395030292179e-06, + "loss": 0.6275, + "step": 5454 + }, + { + "epoch": 0.6693251533742332, + "grad_norm": 0.9217039830409445, + "learning_rate": 5.2079065079162115e-06, + "loss": 0.5416, + "step": 5455 + }, + { + "epoch": 0.6694478527607362, + "grad_norm": 0.9047909547677173, + "learning_rate": 5.204418742410212e-06, + "loss": 0.5834, + "step": 5456 + }, + { + "epoch": 0.6695705521472393, + "grad_norm": 0.8827724138210784, + "learning_rate": 5.200931734325041e-06, + "loss": 0.5369, + "step": 5457 + }, + { + "epoch": 0.6696932515337424, + "grad_norm": 0.8780330471270014, + "learning_rate": 5.197445484211447e-06, + "loss": 0.5545, + "step": 5458 + }, + { + "epoch": 0.6698159509202454, + "grad_norm": 0.8136339696838238, + "learning_rate": 5.193959992620048e-06, + "loss": 0.571, + "step": 5459 + }, + { + "epoch": 0.6699386503067485, + "grad_norm": 1.0768141860970242, + "learning_rate": 5.190475260101353e-06, + "loss": 0.5359, + "step": 5460 + }, + { + "epoch": 0.6700613496932515, + "grad_norm": 0.8568056138726016, + "learning_rate": 5.186991287205739e-06, + "loss": 0.5813, + "step": 5461 + }, + { + "epoch": 0.6701840490797546, + "grad_norm": 0.9231315494925106, + "learning_rate": 5.183508074483478e-06, + "loss": 0.5552, + "step": 5462 + }, + { + "epoch": 0.6703067484662577, + "grad_norm": 0.9346556508164305, + "learning_rate": 5.180025622484703e-06, + "loss": 0.5319, + "step": 5463 + }, + { + "epoch": 0.6704294478527607, + "grad_norm": 0.8436680297363657, + "learning_rate": 5.176543931759447e-06, + "loss": 0.5716, + "step": 5464 + }, + { + "epoch": 0.6705521472392638, + "grad_norm": 0.8424912611457742, + "learning_rate": 5.1730630028576055e-06, + "loss": 0.5798, + "step": 5465 + }, + { + "epoch": 0.6706748466257668, + "grad_norm": 0.8053880367330661, + "learning_rate": 5.1695828363289635e-06, + "loss": 0.5297, + "step": 5466 + }, + { + "epoch": 0.6707975460122699, + "grad_norm": 0.8534624664562578, + "learning_rate": 5.166103432723191e-06, + "loss": 0.5739, + "step": 5467 + }, + { + "epoch": 0.670920245398773, + "grad_norm": 1.0364281874370698, + "learning_rate": 5.1626247925898175e-06, + "loss": 0.536, + "step": 5468 + }, + { + "epoch": 0.671042944785276, + "grad_norm": 0.8683377098235231, + "learning_rate": 5.159146916478274e-06, + "loss": 0.5225, + "step": 5469 + }, + { + "epoch": 0.6711656441717792, + "grad_norm": 0.9420612922948626, + "learning_rate": 5.155669804937855e-06, + "loss": 0.5419, + "step": 5470 + }, + { + "epoch": 0.6712883435582823, + "grad_norm": 0.8402298234110761, + "learning_rate": 5.1521934585177465e-06, + "loss": 0.5413, + "step": 5471 + }, + { + "epoch": 0.6714110429447853, + "grad_norm": 0.7937718768875395, + "learning_rate": 5.148717877767001e-06, + "loss": 0.5345, + "step": 5472 + }, + { + 
"epoch": 0.6715337423312884, + "grad_norm": 0.8999926240218478, + "learning_rate": 5.1452430632345616e-06, + "loss": 0.527, + "step": 5473 + }, + { + "epoch": 0.6716564417177914, + "grad_norm": 0.7872992161399992, + "learning_rate": 5.14176901546924e-06, + "loss": 0.5203, + "step": 5474 + }, + { + "epoch": 0.6717791411042945, + "grad_norm": 0.9579068239096515, + "learning_rate": 5.138295735019741e-06, + "loss": 0.5131, + "step": 5475 + }, + { + "epoch": 0.6719018404907976, + "grad_norm": 0.8998087012795082, + "learning_rate": 5.134823222434631e-06, + "loss": 0.6269, + "step": 5476 + }, + { + "epoch": 0.6720245398773006, + "grad_norm": 1.2718893759292518, + "learning_rate": 5.131351478262364e-06, + "loss": 0.5681, + "step": 5477 + }, + { + "epoch": 0.6721472392638037, + "grad_norm": 0.8837518731082772, + "learning_rate": 5.127880503051279e-06, + "loss": 0.5382, + "step": 5478 + }, + { + "epoch": 0.6722699386503067, + "grad_norm": 0.8816868967826569, + "learning_rate": 5.124410297349577e-06, + "loss": 0.5415, + "step": 5479 + }, + { + "epoch": 0.6723926380368098, + "grad_norm": 0.8867062347276847, + "learning_rate": 5.120940861705357e-06, + "loss": 0.5562, + "step": 5480 + }, + { + "epoch": 0.6725153374233129, + "grad_norm": 0.9720380641980946, + "learning_rate": 5.117472196666578e-06, + "loss": 0.6062, + "step": 5481 + }, + { + "epoch": 0.6726380368098159, + "grad_norm": 0.9911896702649406, + "learning_rate": 5.1140043027810925e-06, + "loss": 0.5906, + "step": 5482 + }, + { + "epoch": 0.672760736196319, + "grad_norm": 0.9477593093107308, + "learning_rate": 5.11053718059662e-06, + "loss": 0.5697, + "step": 5483 + }, + { + "epoch": 0.672883435582822, + "grad_norm": 0.7776201409630589, + "learning_rate": 5.107070830660765e-06, + "loss": 0.6191, + "step": 5484 + }, + { + "epoch": 0.6730061349693252, + "grad_norm": 0.8795671457461469, + "learning_rate": 5.103605253521007e-06, + "loss": 0.5658, + "step": 5485 + }, + { + "epoch": 0.6731288343558283, + "grad_norm": 0.9146511293365306, + "learning_rate": 5.1001404497247055e-06, + "loss": 0.5662, + "step": 5486 + }, + { + "epoch": 0.6732515337423313, + "grad_norm": 0.8991799468465754, + "learning_rate": 5.096676419819095e-06, + "loss": 0.4795, + "step": 5487 + }, + { + "epoch": 0.6733742331288344, + "grad_norm": 0.8210801197137659, + "learning_rate": 5.093213164351287e-06, + "loss": 0.6087, + "step": 5488 + }, + { + "epoch": 0.6734969325153374, + "grad_norm": 0.8306351830726164, + "learning_rate": 5.089750683868279e-06, + "loss": 0.5113, + "step": 5489 + }, + { + "epoch": 0.6736196319018405, + "grad_norm": 0.9056822484817103, + "learning_rate": 5.086288978916931e-06, + "loss": 0.5962, + "step": 5490 + }, + { + "epoch": 0.6737423312883436, + "grad_norm": 0.9508253904174847, + "learning_rate": 5.082828050044e-06, + "loss": 0.5448, + "step": 5491 + }, + { + "epoch": 0.6738650306748466, + "grad_norm": 0.8511677212385965, + "learning_rate": 5.079367897796102e-06, + "loss": 0.5616, + "step": 5492 + }, + { + "epoch": 0.6739877300613497, + "grad_norm": 1.0071497989392166, + "learning_rate": 5.075908522719739e-06, + "loss": 0.5614, + "step": 5493 + }, + { + "epoch": 0.6741104294478527, + "grad_norm": 0.9073521127097406, + "learning_rate": 5.072449925361296e-06, + "loss": 0.5796, + "step": 5494 + }, + { + "epoch": 0.6742331288343558, + "grad_norm": 0.8439038360518886, + "learning_rate": 5.068992106267021e-06, + "loss": 0.5526, + "step": 5495 + }, + { + "epoch": 0.6743558282208589, + "grad_norm": 0.8619890948539476, + "learning_rate": 5.065535065983053e-06, + 
"loss": 0.5433, + "step": 5496 + }, + { + "epoch": 0.6744785276073619, + "grad_norm": 0.9141394961727356, + "learning_rate": 5.062078805055397e-06, + "loss": 0.5951, + "step": 5497 + }, + { + "epoch": 0.674601226993865, + "grad_norm": 0.7832749872616003, + "learning_rate": 5.058623324029944e-06, + "loss": 0.5564, + "step": 5498 + }, + { + "epoch": 0.674723926380368, + "grad_norm": 0.8012663054826081, + "learning_rate": 5.0551686234524525e-06, + "loss": 0.5448, + "step": 5499 + }, + { + "epoch": 0.6748466257668712, + "grad_norm": 0.858467986598126, + "learning_rate": 5.051714703868569e-06, + "loss": 0.569, + "step": 5500 + }, + { + "epoch": 0.6749693251533743, + "grad_norm": 0.8557442683215597, + "learning_rate": 5.048261565823802e-06, + "loss": 0.5751, + "step": 5501 + }, + { + "epoch": 0.6750920245398773, + "grad_norm": 1.0634047887553404, + "learning_rate": 5.0448092098635546e-06, + "loss": 0.5974, + "step": 5502 + }, + { + "epoch": 0.6752147239263804, + "grad_norm": 0.9151546994342346, + "learning_rate": 5.0413576365330884e-06, + "loss": 0.5564, + "step": 5503 + }, + { + "epoch": 0.6753374233128835, + "grad_norm": 0.888646233723059, + "learning_rate": 5.037906846377556e-06, + "loss": 0.5688, + "step": 5504 + }, + { + "epoch": 0.6754601226993865, + "grad_norm": 0.9482062542452507, + "learning_rate": 5.034456839941979e-06, + "loss": 0.5341, + "step": 5505 + }, + { + "epoch": 0.6755828220858896, + "grad_norm": 1.066389535847748, + "learning_rate": 5.031007617771249e-06, + "loss": 0.5651, + "step": 5506 + }, + { + "epoch": 0.6757055214723926, + "grad_norm": 0.796898703162552, + "learning_rate": 5.027559180410151e-06, + "loss": 0.5301, + "step": 5507 + }, + { + "epoch": 0.6758282208588957, + "grad_norm": 0.8157840187629923, + "learning_rate": 5.0241115284033285e-06, + "loss": 0.5753, + "step": 5508 + }, + { + "epoch": 0.6759509202453988, + "grad_norm": 0.8448850268347678, + "learning_rate": 5.020664662295316e-06, + "loss": 0.5156, + "step": 5509 + }, + { + "epoch": 0.6760736196319018, + "grad_norm": 1.012703340546843, + "learning_rate": 5.017218582630507e-06, + "loss": 0.5921, + "step": 5510 + }, + { + "epoch": 0.6761963190184049, + "grad_norm": 1.1504037433601007, + "learning_rate": 5.013773289953189e-06, + "loss": 0.5456, + "step": 5511 + }, + { + "epoch": 0.6763190184049079, + "grad_norm": 0.7947307468093231, + "learning_rate": 5.010328784807509e-06, + "loss": 0.4785, + "step": 5512 + }, + { + "epoch": 0.676441717791411, + "grad_norm": 0.8542732719271683, + "learning_rate": 5.0068850677375036e-06, + "loss": 0.5514, + "step": 5513 + }, + { + "epoch": 0.6765644171779142, + "grad_norm": 0.9155827298254509, + "learning_rate": 5.003442139287072e-06, + "loss": 0.579, + "step": 5514 + }, + { + "epoch": 0.6766871165644172, + "grad_norm": 0.9006358635337707, + "learning_rate": 5.000000000000003e-06, + "loss": 0.578, + "step": 5515 + }, + { + "epoch": 0.6768098159509203, + "grad_norm": 0.8478394064455256, + "learning_rate": 4.996558650419942e-06, + "loss": 0.5746, + "step": 5516 + }, + { + "epoch": 0.6769325153374233, + "grad_norm": 0.9279994804432068, + "learning_rate": 4.993118091090433e-06, + "loss": 0.4874, + "step": 5517 + }, + { + "epoch": 0.6770552147239264, + "grad_norm": 0.905456726177912, + "learning_rate": 4.989678322554876e-06, + "loss": 0.561, + "step": 5518 + }, + { + "epoch": 0.6771779141104295, + "grad_norm": 0.9240665524015388, + "learning_rate": 4.98623934535655e-06, + "loss": 0.5169, + "step": 5519 + }, + { + "epoch": 0.6773006134969325, + "grad_norm": 0.9574663351684967, + 
"learning_rate": 4.982801160038614e-06, + "loss": 0.5276, + "step": 5520 + }, + { + "epoch": 0.6774233128834356, + "grad_norm": 1.0918354812742406, + "learning_rate": 4.979363767144102e-06, + "loss": 0.5606, + "step": 5521 + }, + { + "epoch": 0.6775460122699386, + "grad_norm": 0.9442545107247444, + "learning_rate": 4.975927167215924e-06, + "loss": 0.5518, + "step": 5522 + }, + { + "epoch": 0.6776687116564417, + "grad_norm": 0.8757573429515212, + "learning_rate": 4.972491360796852e-06, + "loss": 0.5627, + "step": 5523 + }, + { + "epoch": 0.6777914110429448, + "grad_norm": 0.9039345725971072, + "learning_rate": 4.9690563484295525e-06, + "loss": 0.5938, + "step": 5524 + }, + { + "epoch": 0.6779141104294478, + "grad_norm": 1.001857807540022, + "learning_rate": 4.965622130656551e-06, + "loss": 0.5457, + "step": 5525 + }, + { + "epoch": 0.6780368098159509, + "grad_norm": 0.8203921498540652, + "learning_rate": 4.962188708020248e-06, + "loss": 0.5514, + "step": 5526 + }, + { + "epoch": 0.6781595092024539, + "grad_norm": 0.7832810829302278, + "learning_rate": 4.958756081062932e-06, + "loss": 0.5891, + "step": 5527 + }, + { + "epoch": 0.678282208588957, + "grad_norm": 0.9520020350659582, + "learning_rate": 4.955324250326749e-06, + "loss": 0.545, + "step": 5528 + }, + { + "epoch": 0.6784049079754602, + "grad_norm": 0.8195465628091337, + "learning_rate": 4.9518932163537344e-06, + "loss": 0.532, + "step": 5529 + }, + { + "epoch": 0.6785276073619632, + "grad_norm": 0.9111496697238103, + "learning_rate": 4.948462979685783e-06, + "loss": 0.5833, + "step": 5530 + }, + { + "epoch": 0.6786503067484663, + "grad_norm": 0.9454162375321321, + "learning_rate": 4.94503354086468e-06, + "loss": 0.5541, + "step": 5531 + }, + { + "epoch": 0.6787730061349693, + "grad_norm": 0.9627396497841387, + "learning_rate": 4.941604900432065e-06, + "loss": 0.5044, + "step": 5532 + }, + { + "epoch": 0.6788957055214724, + "grad_norm": 0.8120241275074753, + "learning_rate": 4.938177058929474e-06, + "loss": 0.5642, + "step": 5533 + }, + { + "epoch": 0.6790184049079755, + "grad_norm": 0.8898635139199488, + "learning_rate": 4.934750016898295e-06, + "loss": 0.5538, + "step": 5534 + }, + { + "epoch": 0.6791411042944785, + "grad_norm": 0.8734483465160869, + "learning_rate": 4.931323774879807e-06, + "loss": 0.5028, + "step": 5535 + }, + { + "epoch": 0.6792638036809816, + "grad_norm": 0.8189825373733587, + "learning_rate": 4.927898333415154e-06, + "loss": 0.5982, + "step": 5536 + }, + { + "epoch": 0.6793865030674847, + "grad_norm": 0.8673522945239736, + "learning_rate": 4.924473693045349e-06, + "loss": 0.5429, + "step": 5537 + }, + { + "epoch": 0.6795092024539877, + "grad_norm": 0.9190740725955047, + "learning_rate": 4.921049854311293e-06, + "loss": 0.5423, + "step": 5538 + }, + { + "epoch": 0.6796319018404908, + "grad_norm": 0.8604273611531236, + "learning_rate": 4.9176268177537445e-06, + "loss": 0.4939, + "step": 5539 + }, + { + "epoch": 0.6797546012269938, + "grad_norm": 1.0251912790515545, + "learning_rate": 4.914204583913349e-06, + "loss": 0.5474, + "step": 5540 + }, + { + "epoch": 0.6798773006134969, + "grad_norm": 0.9109859462170536, + "learning_rate": 4.910783153330613e-06, + "loss": 0.5814, + "step": 5541 + }, + { + "epoch": 0.68, + "grad_norm": 0.8584399421578391, + "learning_rate": 4.907362526545929e-06, + "loss": 0.5642, + "step": 5542 + }, + { + "epoch": 0.680122699386503, + "grad_norm": 0.8847199459795413, + "learning_rate": 4.903942704099548e-06, + "loss": 0.5717, + "step": 5543 + }, + { + "epoch": 0.6802453987730062, + 
"grad_norm": 1.0843726065076533, + "learning_rate": 4.900523686531609e-06, + "loss": 0.5389, + "step": 5544 + }, + { + "epoch": 0.6803680981595092, + "grad_norm": 1.039792540204254, + "learning_rate": 4.897105474382109e-06, + "loss": 0.5782, + "step": 5545 + }, + { + "epoch": 0.6804907975460123, + "grad_norm": 0.8129654272171549, + "learning_rate": 4.893688068190933e-06, + "loss": 0.5577, + "step": 5546 + }, + { + "epoch": 0.6806134969325154, + "grad_norm": 0.8312386978877238, + "learning_rate": 4.890271468497822e-06, + "loss": 0.5546, + "step": 5547 + }, + { + "epoch": 0.6807361963190184, + "grad_norm": 0.8601495752071782, + "learning_rate": 4.886855675842406e-06, + "loss": 0.5673, + "step": 5548 + }, + { + "epoch": 0.6808588957055215, + "grad_norm": 0.8309892391276632, + "learning_rate": 4.8834406907641784e-06, + "loss": 0.542, + "step": 5549 + }, + { + "epoch": 0.6809815950920245, + "grad_norm": 0.8402143437823436, + "learning_rate": 4.880026513802504e-06, + "loss": 0.5783, + "step": 5550 + }, + { + "epoch": 0.6811042944785276, + "grad_norm": 0.8507060596247448, + "learning_rate": 4.876613145496627e-06, + "loss": 0.5676, + "step": 5551 + }, + { + "epoch": 0.6812269938650307, + "grad_norm": 0.8855253625748596, + "learning_rate": 4.8732005863856545e-06, + "loss": 0.5515, + "step": 5552 + }, + { + "epoch": 0.6813496932515337, + "grad_norm": 0.8792798976652361, + "learning_rate": 4.869788837008577e-06, + "loss": 0.5684, + "step": 5553 + }, + { + "epoch": 0.6814723926380368, + "grad_norm": 0.9225323260891823, + "learning_rate": 4.866377897904248e-06, + "loss": 0.5857, + "step": 5554 + }, + { + "epoch": 0.6815950920245398, + "grad_norm": 0.8692614595642842, + "learning_rate": 4.862967769611389e-06, + "loss": 0.5896, + "step": 5555 + }, + { + "epoch": 0.6817177914110429, + "grad_norm": 0.9773774673592832, + "learning_rate": 4.859558452668614e-06, + "loss": 0.5757, + "step": 5556 + }, + { + "epoch": 0.681840490797546, + "grad_norm": 0.9048361539147013, + "learning_rate": 4.8561499476143835e-06, + "loss": 0.5211, + "step": 5557 + }, + { + "epoch": 0.681963190184049, + "grad_norm": 0.8801037492132778, + "learning_rate": 4.8527422549870495e-06, + "loss": 0.5478, + "step": 5558 + }, + { + "epoch": 0.6820858895705522, + "grad_norm": 0.8323504579567264, + "learning_rate": 4.849335375324821e-06, + "loss": 0.5399, + "step": 5559 + }, + { + "epoch": 0.6822085889570552, + "grad_norm": 0.9450637929322525, + "learning_rate": 4.845929309165793e-06, + "loss": 0.6105, + "step": 5560 + }, + { + "epoch": 0.6823312883435583, + "grad_norm": 0.8652138561430397, + "learning_rate": 4.842524057047916e-06, + "loss": 0.5918, + "step": 5561 + }, + { + "epoch": 0.6824539877300614, + "grad_norm": 0.8343835076463767, + "learning_rate": 4.83911961950903e-06, + "loss": 0.5784, + "step": 5562 + }, + { + "epoch": 0.6825766871165644, + "grad_norm": 0.8808028706887406, + "learning_rate": 4.835715997086825e-06, + "loss": 0.5509, + "step": 5563 + }, + { + "epoch": 0.6826993865030675, + "grad_norm": 0.9410717627865092, + "learning_rate": 4.832313190318886e-06, + "loss": 0.5332, + "step": 5564 + }, + { + "epoch": 0.6828220858895706, + "grad_norm": 0.8814983463647472, + "learning_rate": 4.828911199742646e-06, + "loss": 0.5694, + "step": 5565 + }, + { + "epoch": 0.6829447852760736, + "grad_norm": 0.9692148441641973, + "learning_rate": 4.825510025895429e-06, + "loss": 0.5611, + "step": 5566 + }, + { + "epoch": 0.6830674846625767, + "grad_norm": 0.8416108665421779, + "learning_rate": 4.822109669314419e-06, + "loss": 0.5363, + "step": 
5567 + }, + { + "epoch": 0.6831901840490797, + "grad_norm": 0.7997904025554, + "learning_rate": 4.818710130536667e-06, + "loss": 0.5118, + "step": 5568 + }, + { + "epoch": 0.6833128834355828, + "grad_norm": 0.8019852910252565, + "learning_rate": 4.81531141009911e-06, + "loss": 0.5491, + "step": 5569 + }, + { + "epoch": 0.6834355828220859, + "grad_norm": 0.8485744557421132, + "learning_rate": 4.8119135085385375e-06, + "loss": 0.5325, + "step": 5570 + }, + { + "epoch": 0.6835582822085889, + "grad_norm": 0.7785811912043009, + "learning_rate": 4.808516426391627e-06, + "loss": 0.5749, + "step": 5571 + }, + { + "epoch": 0.683680981595092, + "grad_norm": 0.8534512033803185, + "learning_rate": 4.805120164194912e-06, + "loss": 0.572, + "step": 5572 + }, + { + "epoch": 0.683803680981595, + "grad_norm": 0.8172803837337691, + "learning_rate": 4.801724722484809e-06, + "loss": 0.5403, + "step": 5573 + }, + { + "epoch": 0.6839263803680982, + "grad_norm": 0.8780357717419992, + "learning_rate": 4.7983301017975934e-06, + "loss": 0.6059, + "step": 5574 + }, + { + "epoch": 0.6840490797546013, + "grad_norm": 0.899156659927648, + "learning_rate": 4.794936302669417e-06, + "loss": 0.5846, + "step": 5575 + }, + { + "epoch": 0.6841717791411043, + "grad_norm": 0.9181044890037218, + "learning_rate": 4.791543325636307e-06, + "loss": 0.5336, + "step": 5576 + }, + { + "epoch": 0.6842944785276074, + "grad_norm": 0.9303770236695226, + "learning_rate": 4.788151171234149e-06, + "loss": 0.5408, + "step": 5577 + }, + { + "epoch": 0.6844171779141104, + "grad_norm": 0.8475893055069719, + "learning_rate": 4.784759839998709e-06, + "loss": 0.5725, + "step": 5578 + }, + { + "epoch": 0.6845398773006135, + "grad_norm": 0.8475965272860778, + "learning_rate": 4.78136933246561e-06, + "loss": 0.4978, + "step": 5579 + }, + { + "epoch": 0.6846625766871166, + "grad_norm": 0.8556005559441567, + "learning_rate": 4.777979649170367e-06, + "loss": 0.6135, + "step": 5580 + }, + { + "epoch": 0.6847852760736196, + "grad_norm": 0.863285958733861, + "learning_rate": 4.774590790648338e-06, + "loss": 0.5055, + "step": 5581 + }, + { + "epoch": 0.6849079754601227, + "grad_norm": 0.9197322803176449, + "learning_rate": 4.771202757434774e-06, + "loss": 0.5756, + "step": 5582 + }, + { + "epoch": 0.6850306748466257, + "grad_norm": 0.8416159874170376, + "learning_rate": 4.767815550064778e-06, + "loss": 0.6242, + "step": 5583 + }, + { + "epoch": 0.6851533742331288, + "grad_norm": 0.8435047866903647, + "learning_rate": 4.764429169073339e-06, + "loss": 0.5454, + "step": 5584 + }, + { + "epoch": 0.6852760736196319, + "grad_norm": 0.8291723327002769, + "learning_rate": 4.7610436149953e-06, + "loss": 0.5571, + "step": 5585 + }, + { + "epoch": 0.6853987730061349, + "grad_norm": 0.914484197588841, + "learning_rate": 4.75765888836538e-06, + "loss": 0.5044, + "step": 5586 + }, + { + "epoch": 0.685521472392638, + "grad_norm": 0.8634743704993627, + "learning_rate": 4.7542749897181715e-06, + "loss": 0.4577, + "step": 5587 + }, + { + "epoch": 0.685644171779141, + "grad_norm": 0.8166593056652277, + "learning_rate": 4.750891919588127e-06, + "loss": 0.5559, + "step": 5588 + }, + { + "epoch": 0.6857668711656442, + "grad_norm": 0.8429012804563537, + "learning_rate": 4.74750967850958e-06, + "loss": 0.5881, + "step": 5589 + }, + { + "epoch": 0.6858895705521473, + "grad_norm": 1.0791186274314226, + "learning_rate": 4.744128267016719e-06, + "loss": 0.5746, + "step": 5590 + }, + { + "epoch": 0.6860122699386503, + "grad_norm": 1.0219234370452999, + "learning_rate": 
4.7407476856436166e-06, + "loss": 0.6022, + "step": 5591 + }, + { + "epoch": 0.6861349693251534, + "grad_norm": 0.8374975959901955, + "learning_rate": 4.737367934924198e-06, + "loss": 0.5081, + "step": 5592 + }, + { + "epoch": 0.6862576687116564, + "grad_norm": 1.2296535441470908, + "learning_rate": 4.733989015392275e-06, + "loss": 0.5531, + "step": 5593 + }, + { + "epoch": 0.6863803680981595, + "grad_norm": 0.9125834415279683, + "learning_rate": 4.730610927581511e-06, + "loss": 0.5025, + "step": 5594 + }, + { + "epoch": 0.6865030674846626, + "grad_norm": 0.8489838021360729, + "learning_rate": 4.727233672025453e-06, + "loss": 0.5935, + "step": 5595 + }, + { + "epoch": 0.6866257668711656, + "grad_norm": 0.9837737279781502, + "learning_rate": 4.723857249257502e-06, + "loss": 0.5981, + "step": 5596 + }, + { + "epoch": 0.6867484662576687, + "grad_norm": 0.8348748395283676, + "learning_rate": 4.720481659810941e-06, + "loss": 0.5731, + "step": 5597 + }, + { + "epoch": 0.6868711656441718, + "grad_norm": 0.9804021771664142, + "learning_rate": 4.7171069042189146e-06, + "loss": 0.5771, + "step": 5598 + }, + { + "epoch": 0.6869938650306748, + "grad_norm": 0.951850603961663, + "learning_rate": 4.713732983014431e-06, + "loss": 0.5382, + "step": 5599 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 0.885883559314245, + "learning_rate": 4.710359896730379e-06, + "loss": 0.5116, + "step": 5600 + }, + { + "epoch": 0.6872392638036809, + "grad_norm": 0.8586995821241667, + "learning_rate": 4.706987645899502e-06, + "loss": 0.5248, + "step": 5601 + }, + { + "epoch": 0.687361963190184, + "grad_norm": 0.847515764769254, + "learning_rate": 4.7036162310544255e-06, + "loss": 0.5689, + "step": 5602 + }, + { + "epoch": 0.6874846625766872, + "grad_norm": 0.9231075656575526, + "learning_rate": 4.700245652727623e-06, + "loss": 0.5064, + "step": 5603 + }, + { + "epoch": 0.6876073619631902, + "grad_norm": 0.8341798153302397, + "learning_rate": 4.696875911451466e-06, + "loss": 0.5231, + "step": 5604 + }, + { + "epoch": 0.6877300613496933, + "grad_norm": 0.8683901264076159, + "learning_rate": 4.693507007758165e-06, + "loss": 0.5531, + "step": 5605 + }, + { + "epoch": 0.6878527607361963, + "grad_norm": 0.7600190105692584, + "learning_rate": 4.690138942179809e-06, + "loss": 0.5381, + "step": 5606 + }, + { + "epoch": 0.6879754601226994, + "grad_norm": 0.9760607022698272, + "learning_rate": 4.686771715248362e-06, + "loss": 0.5796, + "step": 5607 + }, + { + "epoch": 0.6880981595092025, + "grad_norm": 0.8805635688645239, + "learning_rate": 4.683405327495638e-06, + "loss": 0.5565, + "step": 5608 + }, + { + "epoch": 0.6882208588957055, + "grad_norm": 0.8120218974020268, + "learning_rate": 4.68003977945334e-06, + "loss": 0.5263, + "step": 5609 + }, + { + "epoch": 0.6883435582822086, + "grad_norm": 0.9120686035142728, + "learning_rate": 4.676675071653019e-06, + "loss": 0.608, + "step": 5610 + }, + { + "epoch": 0.6884662576687116, + "grad_norm": 1.1047201417759327, + "learning_rate": 4.6733112046261075e-06, + "loss": 0.5559, + "step": 5611 + }, + { + "epoch": 0.6885889570552147, + "grad_norm": 0.9403380292710056, + "learning_rate": 4.669948178903895e-06, + "loss": 0.5607, + "step": 5612 + }, + { + "epoch": 0.6887116564417178, + "grad_norm": 0.9976256579517165, + "learning_rate": 4.666585995017546e-06, + "loss": 0.5588, + "step": 5613 + }, + { + "epoch": 0.6888343558282208, + "grad_norm": 0.889381152351003, + "learning_rate": 4.663224653498084e-06, + "loss": 0.5729, + "step": 5614 + }, + { + "epoch": 0.688957055214724, + 
"grad_norm": 0.8905417330597439, + "learning_rate": 4.659864154876411e-06, + "loss": 0.5445, + "step": 5615 + }, + { + "epoch": 0.689079754601227, + "grad_norm": 0.898954099114576, + "learning_rate": 4.656504499683285e-06, + "loss": 0.5256, + "step": 5616 + }, + { + "epoch": 0.6892024539877301, + "grad_norm": 0.7909666758262894, + "learning_rate": 4.65314568844933e-06, + "loss": 0.5625, + "step": 5617 + }, + { + "epoch": 0.6893251533742332, + "grad_norm": 0.9737196620900388, + "learning_rate": 4.6497877217050505e-06, + "loss": 0.5692, + "step": 5618 + }, + { + "epoch": 0.6894478527607362, + "grad_norm": 0.7481095986138221, + "learning_rate": 4.646430599980799e-06, + "loss": 0.5222, + "step": 5619 + }, + { + "epoch": 0.6895705521472393, + "grad_norm": 0.9825790065964856, + "learning_rate": 4.643074323806813e-06, + "loss": 0.6102, + "step": 5620 + }, + { + "epoch": 0.6896932515337423, + "grad_norm": 0.886894455209758, + "learning_rate": 4.6397188937131785e-06, + "loss": 0.4972, + "step": 5621 + }, + { + "epoch": 0.6898159509202454, + "grad_norm": 0.9448318639707636, + "learning_rate": 4.6363643102298675e-06, + "loss": 0.5787, + "step": 5622 + }, + { + "epoch": 0.6899386503067485, + "grad_norm": 0.932264711337477, + "learning_rate": 4.633010573886696e-06, + "loss": 0.5494, + "step": 5623 + }, + { + "epoch": 0.6900613496932515, + "grad_norm": 0.8268045498925868, + "learning_rate": 4.629657685213368e-06, + "loss": 0.5735, + "step": 5624 + }, + { + "epoch": 0.6901840490797546, + "grad_norm": 0.8682957265776536, + "learning_rate": 4.626305644739435e-06, + "loss": 0.5336, + "step": 5625 + }, + { + "epoch": 0.6903067484662576, + "grad_norm": 0.8884580171316653, + "learning_rate": 4.622954452994332e-06, + "loss": 0.5524, + "step": 5626 + }, + { + "epoch": 0.6904294478527607, + "grad_norm": 0.8964404134123963, + "learning_rate": 4.6196041105073444e-06, + "loss": 0.5536, + "step": 5627 + }, + { + "epoch": 0.6905521472392638, + "grad_norm": 1.1262111988015986, + "learning_rate": 4.6162546178076275e-06, + "loss": 0.5694, + "step": 5628 + }, + { + "epoch": 0.6906748466257668, + "grad_norm": 0.8822884084244963, + "learning_rate": 4.612905975424213e-06, + "loss": 0.4947, + "step": 5629 + }, + { + "epoch": 0.69079754601227, + "grad_norm": 1.5290852277289264, + "learning_rate": 4.609558183885979e-06, + "loss": 0.6135, + "step": 5630 + }, + { + "epoch": 0.6909202453987731, + "grad_norm": 0.8443052674682479, + "learning_rate": 4.606211243721694e-06, + "loss": 0.5226, + "step": 5631 + }, + { + "epoch": 0.6910429447852761, + "grad_norm": 1.0594865418750574, + "learning_rate": 4.602865155459969e-06, + "loss": 0.5255, + "step": 5632 + }, + { + "epoch": 0.6911656441717792, + "grad_norm": 0.8841707371307638, + "learning_rate": 4.599519919629297e-06, + "loss": 0.5039, + "step": 5633 + }, + { + "epoch": 0.6912883435582822, + "grad_norm": 0.9266743113556172, + "learning_rate": 4.596175536758024e-06, + "loss": 0.5522, + "step": 5634 + }, + { + "epoch": 0.6914110429447853, + "grad_norm": 0.9130555235095232, + "learning_rate": 4.592832007374364e-06, + "loss": 0.5596, + "step": 5635 + }, + { + "epoch": 0.6915337423312884, + "grad_norm": 0.8269916997167892, + "learning_rate": 4.589489332006406e-06, + "loss": 0.5344, + "step": 5636 + }, + { + "epoch": 0.6916564417177914, + "grad_norm": 0.8520371101353198, + "learning_rate": 4.5861475111820885e-06, + "loss": 0.5316, + "step": 5637 + }, + { + "epoch": 0.6917791411042945, + "grad_norm": 0.788596836653355, + "learning_rate": 4.582806545429232e-06, + "loss": 0.5367, + "step": 
5638 + }, + { + "epoch": 0.6919018404907975, + "grad_norm": 0.8582985456396278, + "learning_rate": 4.579466435275506e-06, + "loss": 0.5746, + "step": 5639 + }, + { + "epoch": 0.6920245398773006, + "grad_norm": 0.8835070903251449, + "learning_rate": 4.576127181248459e-06, + "loss": 0.5648, + "step": 5640 + }, + { + "epoch": 0.6921472392638037, + "grad_norm": 0.8872651148234104, + "learning_rate": 4.57278878387549e-06, + "loss": 0.5736, + "step": 5641 + }, + { + "epoch": 0.6922699386503067, + "grad_norm": 0.8881049024413932, + "learning_rate": 4.569451243683878e-06, + "loss": 0.5482, + "step": 5642 + }, + { + "epoch": 0.6923926380368098, + "grad_norm": 0.868867494345852, + "learning_rate": 4.566114561200751e-06, + "loss": 0.5703, + "step": 5643 + }, + { + "epoch": 0.6925153374233128, + "grad_norm": 0.8105154849905138, + "learning_rate": 4.562778736953116e-06, + "loss": 0.5322, + "step": 5644 + }, + { + "epoch": 0.692638036809816, + "grad_norm": 0.8262966790319893, + "learning_rate": 4.559443771467833e-06, + "loss": 0.561, + "step": 5645 + }, + { + "epoch": 0.6927607361963191, + "grad_norm": 0.9262123804269962, + "learning_rate": 4.556109665271635e-06, + "loss": 0.6064, + "step": 5646 + }, + { + "epoch": 0.6928834355828221, + "grad_norm": 0.7537625782695039, + "learning_rate": 4.552776418891114e-06, + "loss": 0.5227, + "step": 5647 + }, + { + "epoch": 0.6930061349693252, + "grad_norm": 0.9324783177172138, + "learning_rate": 4.549444032852724e-06, + "loss": 0.5983, + "step": 5648 + }, + { + "epoch": 0.6931288343558282, + "grad_norm": 1.0161021930304535, + "learning_rate": 4.5461125076827925e-06, + "loss": 0.5157, + "step": 5649 + }, + { + "epoch": 0.6932515337423313, + "grad_norm": 0.8864800547440889, + "learning_rate": 4.542781843907499e-06, + "loss": 0.5374, + "step": 5650 + }, + { + "epoch": 0.6933742331288344, + "grad_norm": 0.8444854751617935, + "learning_rate": 4.539452042052901e-06, + "loss": 0.5662, + "step": 5651 + }, + { + "epoch": 0.6934969325153374, + "grad_norm": 0.8451486714227794, + "learning_rate": 4.536123102644904e-06, + "loss": 0.5791, + "step": 5652 + }, + { + "epoch": 0.6936196319018405, + "grad_norm": 0.9071532600009357, + "learning_rate": 4.532795026209292e-06, + "loss": 0.573, + "step": 5653 + }, + { + "epoch": 0.6937423312883435, + "grad_norm": 0.8407671849315971, + "learning_rate": 4.5294678132717e-06, + "loss": 0.583, + "step": 5654 + }, + { + "epoch": 0.6938650306748466, + "grad_norm": 0.9996419193662182, + "learning_rate": 4.5261414643576396e-06, + "loss": 0.5202, + "step": 5655 + }, + { + "epoch": 0.6939877300613497, + "grad_norm": 0.9687344906879156, + "learning_rate": 4.522815979992472e-06, + "loss": 0.5254, + "step": 5656 + }, + { + "epoch": 0.6941104294478527, + "grad_norm": 0.8248673759029268, + "learning_rate": 4.519491360701435e-06, + "loss": 0.4437, + "step": 5657 + }, + { + "epoch": 0.6942331288343558, + "grad_norm": 0.8523351123987659, + "learning_rate": 4.516167607009618e-06, + "loss": 0.5317, + "step": 5658 + }, + { + "epoch": 0.694355828220859, + "grad_norm": 0.8929374166901277, + "learning_rate": 4.512844719441982e-06, + "loss": 0.6025, + "step": 5659 + }, + { + "epoch": 0.694478527607362, + "grad_norm": 0.882283178441259, + "learning_rate": 4.509522698523352e-06, + "loss": 0.5007, + "step": 5660 + }, + { + "epoch": 0.6946012269938651, + "grad_norm": 0.8209574538174199, + "learning_rate": 4.506201544778406e-06, + "loss": 0.5732, + "step": 5661 + }, + { + "epoch": 0.6947239263803681, + "grad_norm": 0.8134131451345574, + "learning_rate": 
4.5028812587316985e-06, + "loss": 0.5663, + "step": 5662 + }, + { + "epoch": 0.6948466257668712, + "grad_norm": 0.8675689139795031, + "learning_rate": 4.499561840907633e-06, + "loss": 0.603, + "step": 5663 + }, + { + "epoch": 0.6949693251533743, + "grad_norm": 0.9356520173927807, + "learning_rate": 4.49624329183049e-06, + "loss": 0.5924, + "step": 5664 + }, + { + "epoch": 0.6950920245398773, + "grad_norm": 0.9322765882668008, + "learning_rate": 4.492925612024402e-06, + "loss": 0.6069, + "step": 5665 + }, + { + "epoch": 0.6952147239263804, + "grad_norm": 0.8882555807210235, + "learning_rate": 4.489608802013367e-06, + "loss": 0.5639, + "step": 5666 + }, + { + "epoch": 0.6953374233128834, + "grad_norm": 1.3620670919423568, + "learning_rate": 4.48629286232125e-06, + "loss": 0.5688, + "step": 5667 + }, + { + "epoch": 0.6954601226993865, + "grad_norm": 0.8829885550767022, + "learning_rate": 4.482977793471769e-06, + "loss": 0.5751, + "step": 5668 + }, + { + "epoch": 0.6955828220858896, + "grad_norm": 0.8792732070204349, + "learning_rate": 4.4796635959885195e-06, + "loss": 0.5496, + "step": 5669 + }, + { + "epoch": 0.6957055214723926, + "grad_norm": 0.9568716218637832, + "learning_rate": 4.476350270394942e-06, + "loss": 0.589, + "step": 5670 + }, + { + "epoch": 0.6958282208588957, + "grad_norm": 0.9100984967709912, + "learning_rate": 4.473037817214355e-06, + "loss": 0.5793, + "step": 5671 + }, + { + "epoch": 0.6959509202453987, + "grad_norm": 0.9426856972609496, + "learning_rate": 4.469726236969926e-06, + "loss": 0.5417, + "step": 5672 + }, + { + "epoch": 0.6960736196319018, + "grad_norm": 0.719418804759808, + "learning_rate": 4.466415530184696e-06, + "loss": 0.5059, + "step": 5673 + }, + { + "epoch": 0.696196319018405, + "grad_norm": 0.8388306327147628, + "learning_rate": 4.463105697381556e-06, + "loss": 0.5444, + "step": 5674 + }, + { + "epoch": 0.696319018404908, + "grad_norm": 0.7974762051749911, + "learning_rate": 4.4597967390832745e-06, + "loss": 0.5658, + "step": 5675 + }, + { + "epoch": 0.6964417177914111, + "grad_norm": 0.8545172140347228, + "learning_rate": 4.4564886558124635e-06, + "loss": 0.5511, + "step": 5676 + }, + { + "epoch": 0.6965644171779141, + "grad_norm": 0.7971143146463304, + "learning_rate": 4.453181448091614e-06, + "loss": 0.6166, + "step": 5677 + }, + { + "epoch": 0.6966871165644172, + "grad_norm": 0.9388016264025429, + "learning_rate": 4.449875116443069e-06, + "loss": 0.5096, + "step": 5678 + }, + { + "epoch": 0.6968098159509203, + "grad_norm": 0.8554565082849349, + "learning_rate": 4.446569661389029e-06, + "loss": 0.596, + "step": 5679 + }, + { + "epoch": 0.6969325153374233, + "grad_norm": 0.913893641956967, + "learning_rate": 4.4432650834515735e-06, + "loss": 0.5867, + "step": 5680 + }, + { + "epoch": 0.6970552147239264, + "grad_norm": 0.8508081020785245, + "learning_rate": 4.43996138315262e-06, + "loss": 0.5522, + "step": 5681 + }, + { + "epoch": 0.6971779141104294, + "grad_norm": 0.8185486084658948, + "learning_rate": 4.43665856101397e-06, + "loss": 0.5498, + "step": 5682 + }, + { + "epoch": 0.6973006134969325, + "grad_norm": 0.916142460910008, + "learning_rate": 4.433356617557268e-06, + "loss": 0.5671, + "step": 5683 + }, + { + "epoch": 0.6974233128834356, + "grad_norm": 1.1905413212400993, + "learning_rate": 4.430055553304034e-06, + "loss": 0.5509, + "step": 5684 + }, + { + "epoch": 0.6975460122699386, + "grad_norm": 0.8797060980310756, + "learning_rate": 4.426755368775637e-06, + "loss": 0.519, + "step": 5685 + }, + { + "epoch": 0.6976687116564417, + "grad_norm": 
0.8513771600256511, + "learning_rate": 4.4234560644933145e-06, + "loss": 0.5435, + "step": 5686 + }, + { + "epoch": 0.6977914110429447, + "grad_norm": 0.8308079179517006, + "learning_rate": 4.420157640978169e-06, + "loss": 0.5646, + "step": 5687 + }, + { + "epoch": 0.6979141104294478, + "grad_norm": 0.7145573782352175, + "learning_rate": 4.416860098751149e-06, + "loss": 0.5127, + "step": 5688 + }, + { + "epoch": 0.698036809815951, + "grad_norm": 0.8894916596964216, + "learning_rate": 4.413563438333081e-06, + "loss": 0.5738, + "step": 5689 + }, + { + "epoch": 0.698159509202454, + "grad_norm": 0.9227482700032016, + "learning_rate": 4.4102676602446375e-06, + "loss": 0.5355, + "step": 5690 + }, + { + "epoch": 0.6982822085889571, + "grad_norm": 0.8936509768835723, + "learning_rate": 4.4069727650063634e-06, + "loss": 0.5913, + "step": 5691 + }, + { + "epoch": 0.6984049079754602, + "grad_norm": 0.8330859254058107, + "learning_rate": 4.403678753138655e-06, + "loss": 0.572, + "step": 5692 + }, + { + "epoch": 0.6985276073619632, + "grad_norm": 0.8468777349994878, + "learning_rate": 4.4003856251617775e-06, + "loss": 0.5849, + "step": 5693 + }, + { + "epoch": 0.6986503067484663, + "grad_norm": 0.8944948655677117, + "learning_rate": 4.397093381595846e-06, + "loss": 0.6021, + "step": 5694 + }, + { + "epoch": 0.6987730061349693, + "grad_norm": 0.8380856590997314, + "learning_rate": 4.3938020229608506e-06, + "loss": 0.5416, + "step": 5695 + }, + { + "epoch": 0.6988957055214724, + "grad_norm": 0.9600121063984559, + "learning_rate": 4.390511549776628e-06, + "loss": 0.584, + "step": 5696 + }, + { + "epoch": 0.6990184049079755, + "grad_norm": 0.8727668193097582, + "learning_rate": 4.387221962562876e-06, + "loss": 0.594, + "step": 5697 + }, + { + "epoch": 0.6991411042944785, + "grad_norm": 0.8022982341234973, + "learning_rate": 4.3839332618391665e-06, + "loss": 0.5527, + "step": 5698 + }, + { + "epoch": 0.6992638036809816, + "grad_norm": 0.8259337015821042, + "learning_rate": 4.380645448124911e-06, + "loss": 0.5189, + "step": 5699 + }, + { + "epoch": 0.6993865030674846, + "grad_norm": 0.7851591378428237, + "learning_rate": 4.377358521939401e-06, + "loss": 0.5548, + "step": 5700 + }, + { + "epoch": 0.6995092024539877, + "grad_norm": 0.8606150444680782, + "learning_rate": 4.374072483801769e-06, + "loss": 0.6278, + "step": 5701 + }, + { + "epoch": 0.6996319018404908, + "grad_norm": 1.003189275275012, + "learning_rate": 4.370787334231026e-06, + "loss": 0.5611, + "step": 5702 + }, + { + "epoch": 0.6997546012269938, + "grad_norm": 1.022910234651313, + "learning_rate": 4.367503073746025e-06, + "loss": 0.5757, + "step": 5703 + }, + { + "epoch": 0.699877300613497, + "grad_norm": 0.8953426491475427, + "learning_rate": 4.364219702865492e-06, + "loss": 0.532, + "step": 5704 + }, + { + "epoch": 0.7, + "grad_norm": 0.7985189245543501, + "learning_rate": 4.360937222108002e-06, + "loss": 0.5775, + "step": 5705 + }, + { + "epoch": 0.7001226993865031, + "grad_norm": 0.9621514064763981, + "learning_rate": 4.357655631992004e-06, + "loss": 0.5362, + "step": 5706 + }, + { + "epoch": 0.7002453987730062, + "grad_norm": 0.9000246322533744, + "learning_rate": 4.354374933035789e-06, + "loss": 0.4846, + "step": 5707 + }, + { + "epoch": 0.7003680981595092, + "grad_norm": 0.8989703513606971, + "learning_rate": 4.351095125757513e-06, + "loss": 0.5513, + "step": 5708 + }, + { + "epoch": 0.7004907975460123, + "grad_norm": 0.8982462326028307, + "learning_rate": 4.347816210675202e-06, + "loss": 0.5242, + "step": 5709 + }, + { + "epoch": 
0.7006134969325153, + "grad_norm": 0.8941314865696025, + "learning_rate": 4.344538188306723e-06, + "loss": 0.5341, + "step": 5710 + }, + { + "epoch": 0.7007361963190184, + "grad_norm": 0.8304319213858187, + "learning_rate": 4.341261059169821e-06, + "loss": 0.5696, + "step": 5711 + }, + { + "epoch": 0.7008588957055215, + "grad_norm": 0.7891209294346195, + "learning_rate": 4.337984823782082e-06, + "loss": 0.5661, + "step": 5712 + }, + { + "epoch": 0.7009815950920245, + "grad_norm": 0.8955189109122405, + "learning_rate": 4.334709482660962e-06, + "loss": 0.5788, + "step": 5713 + }, + { + "epoch": 0.7011042944785276, + "grad_norm": 0.8941688457453529, + "learning_rate": 4.331435036323778e-06, + "loss": 0.5861, + "step": 5714 + }, + { + "epoch": 0.7012269938650306, + "grad_norm": 0.8925046178967864, + "learning_rate": 4.328161485287693e-06, + "loss": 0.5936, + "step": 5715 + }, + { + "epoch": 0.7013496932515337, + "grad_norm": 0.904902147475755, + "learning_rate": 4.324888830069743e-06, + "loss": 0.6144, + "step": 5716 + }, + { + "epoch": 0.7014723926380368, + "grad_norm": 0.9976315834934731, + "learning_rate": 4.32161707118681e-06, + "loss": 0.5776, + "step": 5717 + }, + { + "epoch": 0.7015950920245398, + "grad_norm": 0.8990682226066912, + "learning_rate": 4.318346209155645e-06, + "loss": 0.5216, + "step": 5718 + }, + { + "epoch": 0.701717791411043, + "grad_norm": 0.8525115892257614, + "learning_rate": 4.315076244492847e-06, + "loss": 0.5704, + "step": 5719 + }, + { + "epoch": 0.701840490797546, + "grad_norm": 0.8344099261233584, + "learning_rate": 4.3118071777148865e-06, + "loss": 0.5395, + "step": 5720 + }, + { + "epoch": 0.7019631901840491, + "grad_norm": 0.8918055421787153, + "learning_rate": 4.308539009338075e-06, + "loss": 0.6186, + "step": 5721 + }, + { + "epoch": 0.7020858895705522, + "grad_norm": 0.9116233008648327, + "learning_rate": 4.305271739878601e-06, + "loss": 0.5787, + "step": 5722 + }, + { + "epoch": 0.7022085889570552, + "grad_norm": 0.8589870008260887, + "learning_rate": 4.3020053698524946e-06, + "loss": 0.517, + "step": 5723 + }, + { + "epoch": 0.7023312883435583, + "grad_norm": 0.9091350403708893, + "learning_rate": 4.298739899775656e-06, + "loss": 0.5229, + "step": 5724 + }, + { + "epoch": 0.7024539877300614, + "grad_norm": 0.8224719189601762, + "learning_rate": 4.295475330163832e-06, + "loss": 0.5724, + "step": 5725 + }, + { + "epoch": 0.7025766871165644, + "grad_norm": 0.7956452202192202, + "learning_rate": 4.292211661532641e-06, + "loss": 0.5623, + "step": 5726 + }, + { + "epoch": 0.7026993865030675, + "grad_norm": 0.8758823256499005, + "learning_rate": 4.288948894397547e-06, + "loss": 0.4948, + "step": 5727 + }, + { + "epoch": 0.7028220858895705, + "grad_norm": 0.9725198135276879, + "learning_rate": 4.285687029273872e-06, + "loss": 0.5829, + "step": 5728 + }, + { + "epoch": 0.7029447852760736, + "grad_norm": 0.8792319570738406, + "learning_rate": 4.282426066676808e-06, + "loss": 0.5342, + "step": 5729 + }, + { + "epoch": 0.7030674846625767, + "grad_norm": 0.8873731297513283, + "learning_rate": 4.279166007121389e-06, + "loss": 0.5532, + "step": 5730 + }, + { + "epoch": 0.7031901840490797, + "grad_norm": 0.8107514783537977, + "learning_rate": 4.27590685112252e-06, + "loss": 0.5779, + "step": 5731 + }, + { + "epoch": 0.7033128834355828, + "grad_norm": 0.8782932377664391, + "learning_rate": 4.272648599194948e-06, + "loss": 0.55, + "step": 5732 + }, + { + "epoch": 0.7034355828220858, + "grad_norm": 0.9608288540399598, + "learning_rate": 4.269391251853296e-06, + "loss": 
0.4867, + "step": 5733 + }, + { + "epoch": 0.703558282208589, + "grad_norm": 0.8156956614670542, + "learning_rate": 4.2661348096120246e-06, + "loss": 0.5829, + "step": 5734 + }, + { + "epoch": 0.7036809815950921, + "grad_norm": 0.8583793079566093, + "learning_rate": 4.262879272985468e-06, + "loss": 0.5822, + "step": 5735 + }, + { + "epoch": 0.7038036809815951, + "grad_norm": 1.0207311122902982, + "learning_rate": 4.2596246424878055e-06, + "loss": 0.5878, + "step": 5736 + }, + { + "epoch": 0.7039263803680982, + "grad_norm": 0.8335869724426668, + "learning_rate": 4.256370918633081e-06, + "loss": 0.5367, + "step": 5737 + }, + { + "epoch": 0.7040490797546012, + "grad_norm": 0.8140885672702712, + "learning_rate": 4.253118101935193e-06, + "loss": 0.577, + "step": 5738 + }, + { + "epoch": 0.7041717791411043, + "grad_norm": 0.7919686579088944, + "learning_rate": 4.24986619290789e-06, + "loss": 0.5765, + "step": 5739 + }, + { + "epoch": 0.7042944785276074, + "grad_norm": 0.9286397677266376, + "learning_rate": 4.246615192064787e-06, + "loss": 0.5135, + "step": 5740 + }, + { + "epoch": 0.7044171779141104, + "grad_norm": 0.886804689235316, + "learning_rate": 4.243365099919352e-06, + "loss": 0.5444, + "step": 5741 + }, + { + "epoch": 0.7045398773006135, + "grad_norm": 0.854987068476323, + "learning_rate": 4.240115916984911e-06, + "loss": 0.6392, + "step": 5742 + }, + { + "epoch": 0.7046625766871165, + "grad_norm": 0.8502760401321097, + "learning_rate": 4.23686764377464e-06, + "loss": 0.57, + "step": 5743 + }, + { + "epoch": 0.7047852760736196, + "grad_norm": 0.8665057991277577, + "learning_rate": 4.233620280801582e-06, + "loss": 0.5584, + "step": 5744 + }, + { + "epoch": 0.7049079754601227, + "grad_norm": 0.8453224700081132, + "learning_rate": 4.230373828578626e-06, + "loss": 0.6114, + "step": 5745 + }, + { + "epoch": 0.7050306748466257, + "grad_norm": 0.7874235222777748, + "learning_rate": 4.227128287618517e-06, + "loss": 0.4956, + "step": 5746 + }, + { + "epoch": 0.7051533742331288, + "grad_norm": 0.8654925077683349, + "learning_rate": 4.223883658433869e-06, + "loss": 0.5511, + "step": 5747 + }, + { + "epoch": 0.7052760736196318, + "grad_norm": 0.8264681101864462, + "learning_rate": 4.220639941537136e-06, + "loss": 0.5029, + "step": 5748 + }, + { + "epoch": 0.705398773006135, + "grad_norm": 0.8780331464810784, + "learning_rate": 4.217397137440641e-06, + "loss": 0.5254, + "step": 5749 + }, + { + "epoch": 0.7055214723926381, + "grad_norm": 0.9037502792844, + "learning_rate": 4.21415524665655e-06, + "loss": 0.535, + "step": 5750 + }, + { + "epoch": 0.7056441717791411, + "grad_norm": 0.8348920194458128, + "learning_rate": 4.2109142696969e-06, + "loss": 0.5103, + "step": 5751 + }, + { + "epoch": 0.7057668711656442, + "grad_norm": 0.8885845270804373, + "learning_rate": 4.207674207073569e-06, + "loss": 0.5813, + "step": 5752 + }, + { + "epoch": 0.7058895705521473, + "grad_norm": 0.9320711509764239, + "learning_rate": 4.204435059298303e-06, + "loss": 0.5999, + "step": 5753 + }, + { + "epoch": 0.7060122699386503, + "grad_norm": 0.8620348519761196, + "learning_rate": 4.2011968268826895e-06, + "loss": 0.5077, + "step": 5754 + }, + { + "epoch": 0.7061349693251534, + "grad_norm": 0.8065492429204013, + "learning_rate": 4.197959510338187e-06, + "loss": 0.5577, + "step": 5755 + }, + { + "epoch": 0.7062576687116564, + "grad_norm": 0.9673952316330711, + "learning_rate": 4.1947231101761e-06, + "loss": 0.5482, + "step": 5756 + }, + { + "epoch": 0.7063803680981595, + "grad_norm": 1.2883962093030192, + "learning_rate": 
4.191487626907586e-06, + "loss": 0.5273, + "step": 5757 + }, + { + "epoch": 0.7065030674846626, + "grad_norm": 0.8852877040725271, + "learning_rate": 4.188253061043666e-06, + "loss": 0.585, + "step": 5758 + }, + { + "epoch": 0.7066257668711656, + "grad_norm": 0.763369879428104, + "learning_rate": 4.185019413095208e-06, + "loss": 0.5413, + "step": 5759 + }, + { + "epoch": 0.7067484662576687, + "grad_norm": 0.9311242137200026, + "learning_rate": 4.181786683572946e-06, + "loss": 0.567, + "step": 5760 + }, + { + "epoch": 0.7068711656441717, + "grad_norm": 0.8753366469281313, + "learning_rate": 4.178554872987452e-06, + "loss": 0.5645, + "step": 5761 + }, + { + "epoch": 0.7069938650306749, + "grad_norm": 0.8848877503359904, + "learning_rate": 4.175323981849173e-06, + "loss": 0.5419, + "step": 5762 + }, + { + "epoch": 0.707116564417178, + "grad_norm": 0.8388094574659776, + "learning_rate": 4.1720940106683915e-06, + "loss": 0.4996, + "step": 5763 + }, + { + "epoch": 0.707239263803681, + "grad_norm": 0.9692478601882646, + "learning_rate": 4.168864959955261e-06, + "loss": 0.6019, + "step": 5764 + }, + { + "epoch": 0.7073619631901841, + "grad_norm": 0.8777239474387459, + "learning_rate": 4.165636830219776e-06, + "loss": 0.5412, + "step": 5765 + }, + { + "epoch": 0.7074846625766871, + "grad_norm": 0.9789871706531612, + "learning_rate": 4.162409621971797e-06, + "loss": 0.5128, + "step": 5766 + }, + { + "epoch": 0.7076073619631902, + "grad_norm": 0.8848693612111684, + "learning_rate": 4.1591833357210285e-06, + "loss": 0.4838, + "step": 5767 + }, + { + "epoch": 0.7077300613496933, + "grad_norm": 0.7798352071095319, + "learning_rate": 4.155957971977038e-06, + "loss": 0.5461, + "step": 5768 + }, + { + "epoch": 0.7078527607361963, + "grad_norm": 0.941307505850355, + "learning_rate": 4.152733531249248e-06, + "loss": 0.5285, + "step": 5769 + }, + { + "epoch": 0.7079754601226994, + "grad_norm": 0.9855992432875191, + "learning_rate": 4.149510014046922e-06, + "loss": 0.5551, + "step": 5770 + }, + { + "epoch": 0.7080981595092024, + "grad_norm": 0.8746492102637198, + "learning_rate": 4.146287420879196e-06, + "loss": 0.4656, + "step": 5771 + }, + { + "epoch": 0.7082208588957055, + "grad_norm": 0.925967308224309, + "learning_rate": 4.143065752255043e-06, + "loss": 0.5726, + "step": 5772 + }, + { + "epoch": 0.7083435582822086, + "grad_norm": 0.7592286860784045, + "learning_rate": 4.139845008683305e-06, + "loss": 0.4804, + "step": 5773 + }, + { + "epoch": 0.7084662576687116, + "grad_norm": 1.015845495421699, + "learning_rate": 4.136625190672664e-06, + "loss": 0.5748, + "step": 5774 + }, + { + "epoch": 0.7085889570552147, + "grad_norm": 0.797989717854119, + "learning_rate": 4.1334062987316695e-06, + "loss": 0.5307, + "step": 5775 + }, + { + "epoch": 0.7087116564417177, + "grad_norm": 0.8238959904487441, + "learning_rate": 4.130188333368713e-06, + "loss": 0.5736, + "step": 5776 + }, + { + "epoch": 0.7088343558282209, + "grad_norm": 0.9546770411194669, + "learning_rate": 4.126971295092043e-06, + "loss": 0.5528, + "step": 5777 + }, + { + "epoch": 0.708957055214724, + "grad_norm": 0.9010378619080089, + "learning_rate": 4.1237551844097686e-06, + "loss": 0.5583, + "step": 5778 + }, + { + "epoch": 0.709079754601227, + "grad_norm": 0.8667374829645222, + "learning_rate": 4.120540001829841e-06, + "loss": 0.5581, + "step": 5779 + }, + { + "epoch": 0.7092024539877301, + "grad_norm": 1.0714831929338122, + "learning_rate": 4.117325747860077e-06, + "loss": 0.5754, + "step": 5780 + }, + { + "epoch": 0.7093251533742331, + "grad_norm": 
1.0346198079302946, + "learning_rate": 4.114112423008133e-06, + "loss": 0.5497, + "step": 5781 + }, + { + "epoch": 0.7094478527607362, + "grad_norm": 0.7804707519888849, + "learning_rate": 4.110900027781534e-06, + "loss": 0.5653, + "step": 5782 + }, + { + "epoch": 0.7095705521472393, + "grad_norm": 0.8738690728429983, + "learning_rate": 4.107688562687642e-06, + "loss": 0.5441, + "step": 5783 + }, + { + "epoch": 0.7096932515337423, + "grad_norm": 0.9266000036792097, + "learning_rate": 4.104478028233687e-06, + "loss": 0.5632, + "step": 5784 + }, + { + "epoch": 0.7098159509202454, + "grad_norm": 0.9634584236096968, + "learning_rate": 4.101268424926741e-06, + "loss": 0.5847, + "step": 5785 + }, + { + "epoch": 0.7099386503067485, + "grad_norm": 0.924531907200577, + "learning_rate": 4.098059753273738e-06, + "loss": 0.5176, + "step": 5786 + }, + { + "epoch": 0.7100613496932515, + "grad_norm": 0.8912821876496119, + "learning_rate": 4.094852013781456e-06, + "loss": 0.631, + "step": 5787 + }, + { + "epoch": 0.7101840490797546, + "grad_norm": 0.884318440380154, + "learning_rate": 4.091645206956528e-06, + "loss": 0.566, + "step": 5788 + }, + { + "epoch": 0.7103067484662576, + "grad_norm": 1.0005635389585235, + "learning_rate": 4.088439333305449e-06, + "loss": 0.6101, + "step": 5789 + }, + { + "epoch": 0.7104294478527607, + "grad_norm": 0.9116260847708174, + "learning_rate": 4.085234393334551e-06, + "loss": 0.5536, + "step": 5790 + }, + { + "epoch": 0.7105521472392639, + "grad_norm": 1.135861965618094, + "learning_rate": 4.082030387550033e-06, + "loss": 0.6295, + "step": 5791 + }, + { + "epoch": 0.7106748466257669, + "grad_norm": 0.9154112317573012, + "learning_rate": 4.078827316457935e-06, + "loss": 0.5479, + "step": 5792 + }, + { + "epoch": 0.71079754601227, + "grad_norm": 0.7935170163890024, + "learning_rate": 4.07562518056416e-06, + "loss": 0.5966, + "step": 5793 + }, + { + "epoch": 0.710920245398773, + "grad_norm": 0.9348410594976638, + "learning_rate": 4.0724239803744524e-06, + "loss": 0.556, + "step": 5794 + }, + { + "epoch": 0.7110429447852761, + "grad_norm": 0.9249929682306032, + "learning_rate": 4.069223716394419e-06, + "loss": 0.5489, + "step": 5795 + }, + { + "epoch": 0.7111656441717792, + "grad_norm": 0.9608207493054495, + "learning_rate": 4.06602438912951e-06, + "loss": 0.5203, + "step": 5796 + }, + { + "epoch": 0.7112883435582822, + "grad_norm": 0.8679223011306194, + "learning_rate": 4.062825999085031e-06, + "loss": 0.5879, + "step": 5797 + }, + { + "epoch": 0.7114110429447853, + "grad_norm": 0.8252848019818918, + "learning_rate": 4.059628546766149e-06, + "loss": 0.5084, + "step": 5798 + }, + { + "epoch": 0.7115337423312883, + "grad_norm": 0.8483336850286776, + "learning_rate": 4.056432032677863e-06, + "loss": 0.5632, + "step": 5799 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 0.8034007343198188, + "learning_rate": 4.053236457325043e-06, + "loss": 0.5144, + "step": 5800 + }, + { + "epoch": 0.7117791411042945, + "grad_norm": 0.9057931160867181, + "learning_rate": 4.050041821212396e-06, + "loss": 0.5859, + "step": 5801 + }, + { + "epoch": 0.7119018404907975, + "grad_norm": 0.8559058144275377, + "learning_rate": 4.046848124844495e-06, + "loss": 0.5835, + "step": 5802 + }, + { + "epoch": 0.7120245398773006, + "grad_norm": 0.8586041875366577, + "learning_rate": 4.043655368725747e-06, + "loss": 0.5692, + "step": 5803 + }, + { + "epoch": 0.7121472392638036, + "grad_norm": 0.9252622997915474, + "learning_rate": 4.040463553360431e-06, + "loss": 0.58, + "step": 5804 + }, + { + "epoch": 
0.7122699386503067, + "grad_norm": 0.9554162522998897, + "learning_rate": 4.0372726792526614e-06, + "loss": 0.5401, + "step": 5805 + }, + { + "epoch": 0.7123926380368099, + "grad_norm": 0.8397458810392147, + "learning_rate": 4.034082746906406e-06, + "loss": 0.5558, + "step": 5806 + }, + { + "epoch": 0.7125153374233129, + "grad_norm": 0.8441716696672663, + "learning_rate": 4.030893756825495e-06, + "loss": 0.5932, + "step": 5807 + }, + { + "epoch": 0.712638036809816, + "grad_norm": 0.8783279636243219, + "learning_rate": 4.027705709513593e-06, + "loss": 0.5224, + "step": 5808 + }, + { + "epoch": 0.712760736196319, + "grad_norm": 0.8766646211058259, + "learning_rate": 4.024518605474233e-06, + "loss": 0.5805, + "step": 5809 + }, + { + "epoch": 0.7128834355828221, + "grad_norm": 0.9475318629178413, + "learning_rate": 4.021332445210785e-06, + "loss": 0.6186, + "step": 5810 + }, + { + "epoch": 0.7130061349693252, + "grad_norm": 0.8558699565346601, + "learning_rate": 4.018147229226481e-06, + "loss": 0.5166, + "step": 5811 + }, + { + "epoch": 0.7131288343558282, + "grad_norm": 0.9280318614201685, + "learning_rate": 4.014962958024391e-06, + "loss": 0.6051, + "step": 5812 + }, + { + "epoch": 0.7132515337423313, + "grad_norm": 0.986819197760245, + "learning_rate": 4.011779632107451e-06, + "loss": 0.4968, + "step": 5813 + }, + { + "epoch": 0.7133742331288343, + "grad_norm": 0.9426363810125358, + "learning_rate": 4.0085972519784335e-06, + "loss": 0.5768, + "step": 5814 + }, + { + "epoch": 0.7134969325153374, + "grad_norm": 0.922516539815654, + "learning_rate": 4.005415818139975e-06, + "loss": 0.5991, + "step": 5815 + }, + { + "epoch": 0.7136196319018405, + "grad_norm": 0.9074996750851935, + "learning_rate": 4.0022353310945474e-06, + "loss": 0.5645, + "step": 5816 + }, + { + "epoch": 0.7137423312883435, + "grad_norm": 0.7919664167345406, + "learning_rate": 3.99905579134449e-06, + "loss": 0.5401, + "step": 5817 + }, + { + "epoch": 0.7138650306748466, + "grad_norm": 0.7899996315090403, + "learning_rate": 3.995877199391977e-06, + "loss": 0.6145, + "step": 5818 + }, + { + "epoch": 0.7139877300613497, + "grad_norm": 0.971984966333715, + "learning_rate": 3.992699555739041e-06, + "loss": 0.5282, + "step": 5819 + }, + { + "epoch": 0.7141104294478527, + "grad_norm": 0.7813817150611861, + "learning_rate": 3.989522860887567e-06, + "loss": 0.5442, + "step": 5820 + }, + { + "epoch": 0.7142331288343559, + "grad_norm": 0.893337486989592, + "learning_rate": 3.986347115339281e-06, + "loss": 0.561, + "step": 5821 + }, + { + "epoch": 0.7143558282208589, + "grad_norm": 0.8693146439269812, + "learning_rate": 3.98317231959577e-06, + "loss": 0.5399, + "step": 5822 + }, + { + "epoch": 0.714478527607362, + "grad_norm": 0.7891560609599478, + "learning_rate": 3.979998474158459e-06, + "loss": 0.5691, + "step": 5823 + }, + { + "epoch": 0.7146012269938651, + "grad_norm": 0.8874604031439713, + "learning_rate": 3.9768255795286395e-06, + "loss": 0.5325, + "step": 5824 + }, + { + "epoch": 0.7147239263803681, + "grad_norm": 0.9565541489368284, + "learning_rate": 3.973653636207437e-06, + "loss": 0.5392, + "step": 5825 + }, + { + "epoch": 0.7148466257668712, + "grad_norm": 0.8216724052933704, + "learning_rate": 3.970482644695831e-06, + "loss": 0.573, + "step": 5826 + }, + { + "epoch": 0.7149693251533742, + "grad_norm": 0.9436208182167425, + "learning_rate": 3.967312605494658e-06, + "loss": 0.513, + "step": 5827 + }, + { + "epoch": 0.7150920245398773, + "grad_norm": 0.9287518067179927, + "learning_rate": 3.964143519104591e-06, + "loss": 
0.5657, + "step": 5828 + }, + { + "epoch": 0.7152147239263804, + "grad_norm": 0.9270180728286131, + "learning_rate": 3.9609753860261675e-06, + "loss": 0.4637, + "step": 5829 + }, + { + "epoch": 0.7153374233128834, + "grad_norm": 0.9101867389148356, + "learning_rate": 3.95780820675976e-06, + "loss": 0.5724, + "step": 5830 + }, + { + "epoch": 0.7154601226993865, + "grad_norm": 0.9434021929075804, + "learning_rate": 3.9546419818056045e-06, + "loss": 0.559, + "step": 5831 + }, + { + "epoch": 0.7155828220858895, + "grad_norm": 0.9226047686864911, + "learning_rate": 3.951476711663772e-06, + "loss": 0.5806, + "step": 5832 + }, + { + "epoch": 0.7157055214723926, + "grad_norm": 0.8603551166834885, + "learning_rate": 3.948312396834195e-06, + "loss": 0.5091, + "step": 5833 + }, + { + "epoch": 0.7158282208588957, + "grad_norm": 0.8966927334121427, + "learning_rate": 3.945149037816644e-06, + "loss": 0.5059, + "step": 5834 + }, + { + "epoch": 0.7159509202453987, + "grad_norm": 0.8904664649234296, + "learning_rate": 3.941986635110754e-06, + "loss": 0.5797, + "step": 5835 + }, + { + "epoch": 0.7160736196319019, + "grad_norm": 0.9698658898652825, + "learning_rate": 3.938825189215991e-06, + "loss": 0.5911, + "step": 5836 + }, + { + "epoch": 0.7161963190184049, + "grad_norm": 0.8773395763268, + "learning_rate": 3.935664700631679e-06, + "loss": 0.5041, + "step": 5837 + }, + { + "epoch": 0.716319018404908, + "grad_norm": 0.8649714030383312, + "learning_rate": 3.932505169856993e-06, + "loss": 0.5574, + "step": 5838 + }, + { + "epoch": 0.7164417177914111, + "grad_norm": 0.8101793729321027, + "learning_rate": 3.929346597390949e-06, + "loss": 0.5219, + "step": 5839 + }, + { + "epoch": 0.7165644171779141, + "grad_norm": 0.8385743259048786, + "learning_rate": 3.9261889837324245e-06, + "loss": 0.4727, + "step": 5840 + }, + { + "epoch": 0.7166871165644172, + "grad_norm": 0.9195027241090795, + "learning_rate": 3.9230323293801275e-06, + "loss": 0.5947, + "step": 5841 + }, + { + "epoch": 0.7168098159509202, + "grad_norm": 0.9407714673966466, + "learning_rate": 3.9198766348326335e-06, + "loss": 0.5554, + "step": 5842 + }, + { + "epoch": 0.7169325153374233, + "grad_norm": 1.0304122046498445, + "learning_rate": 3.9167219005883495e-06, + "loss": 0.5535, + "step": 5843 + }, + { + "epoch": 0.7170552147239264, + "grad_norm": 0.8655790300433683, + "learning_rate": 3.9135681271455476e-06, + "loss": 0.5677, + "step": 5844 + }, + { + "epoch": 0.7171779141104294, + "grad_norm": 0.7646233902996908, + "learning_rate": 3.910415315002328e-06, + "loss": 0.5263, + "step": 5845 + }, + { + "epoch": 0.7173006134969325, + "grad_norm": 0.8917100204064037, + "learning_rate": 3.907263464656662e-06, + "loss": 0.5349, + "step": 5846 + }, + { + "epoch": 0.7174233128834356, + "grad_norm": 1.137696448477357, + "learning_rate": 3.904112576606347e-06, + "loss": 0.5411, + "step": 5847 + }, + { + "epoch": 0.7175460122699386, + "grad_norm": 0.8723690592622518, + "learning_rate": 3.900962651349048e-06, + "loss": 0.4989, + "step": 5848 + }, + { + "epoch": 0.7176687116564417, + "grad_norm": 0.9046137084573214, + "learning_rate": 3.897813689382262e-06, + "loss": 0.5455, + "step": 5849 + }, + { + "epoch": 0.7177914110429447, + "grad_norm": 0.9315022146608988, + "learning_rate": 3.89466569120334e-06, + "loss": 0.5667, + "step": 5850 + }, + { + "epoch": 0.7179141104294479, + "grad_norm": 0.8771491072124704, + "learning_rate": 3.891518657309482e-06, + "loss": 0.5352, + "step": 5851 + }, + { + "epoch": 0.718036809815951, + "grad_norm": 0.809597458322939, + 
"learning_rate": 3.888372588197738e-06, + "loss": 0.5144, + "step": 5852 + }, + { + "epoch": 0.718159509202454, + "grad_norm": 0.8409591418309341, + "learning_rate": 3.885227484365005e-06, + "loss": 0.5648, + "step": 5853 + }, + { + "epoch": 0.7182822085889571, + "grad_norm": 0.938916006263557, + "learning_rate": 3.882083346308017e-06, + "loss": 0.5374, + "step": 5854 + }, + { + "epoch": 0.7184049079754601, + "grad_norm": 0.959545778081247, + "learning_rate": 3.878940174523371e-06, + "loss": 0.5506, + "step": 5855 + }, + { + "epoch": 0.7185276073619632, + "grad_norm": 0.8905302219023441, + "learning_rate": 3.875797969507502e-06, + "loss": 0.5431, + "step": 5856 + }, + { + "epoch": 0.7186503067484663, + "grad_norm": 0.748976317394863, + "learning_rate": 3.8726567317566885e-06, + "loss": 0.5705, + "step": 5857 + }, + { + "epoch": 0.7187730061349693, + "grad_norm": 0.8921175515269131, + "learning_rate": 3.86951646176707e-06, + "loss": 0.5159, + "step": 5858 + }, + { + "epoch": 0.7188957055214724, + "grad_norm": 0.7882916302785927, + "learning_rate": 3.866377160034619e-06, + "loss": 0.5907, + "step": 5859 + }, + { + "epoch": 0.7190184049079754, + "grad_norm": 0.9123959397078153, + "learning_rate": 3.8632388270551665e-06, + "loss": 0.6006, + "step": 5860 + }, + { + "epoch": 0.7191411042944785, + "grad_norm": 0.8994475519523284, + "learning_rate": 3.860101463324379e-06, + "loss": 0.5861, + "step": 5861 + }, + { + "epoch": 0.7192638036809816, + "grad_norm": 0.8969986254739969, + "learning_rate": 3.856965069337785e-06, + "loss": 0.4555, + "step": 5862 + }, + { + "epoch": 0.7193865030674846, + "grad_norm": 0.9187634541884784, + "learning_rate": 3.8538296455907395e-06, + "loss": 0.5596, + "step": 5863 + }, + { + "epoch": 0.7195092024539878, + "grad_norm": 0.8349349507389197, + "learning_rate": 3.850695192578467e-06, + "loss": 0.5041, + "step": 5864 + }, + { + "epoch": 0.7196319018404908, + "grad_norm": 0.8632477361538388, + "learning_rate": 3.847561710796019e-06, + "loss": 0.5646, + "step": 5865 + }, + { + "epoch": 0.7197546012269939, + "grad_norm": 0.8420167763016597, + "learning_rate": 3.844429200738309e-06, + "loss": 0.4959, + "step": 5866 + }, + { + "epoch": 0.719877300613497, + "grad_norm": 1.2076417886855473, + "learning_rate": 3.841297662900085e-06, + "loss": 0.5516, + "step": 5867 + }, + { + "epoch": 0.72, + "grad_norm": 0.7811817804265716, + "learning_rate": 3.838167097775945e-06, + "loss": 0.571, + "step": 5868 + }, + { + "epoch": 0.7201226993865031, + "grad_norm": 0.8778681672644203, + "learning_rate": 3.835037505860342e-06, + "loss": 0.5861, + "step": 5869 + }, + { + "epoch": 0.7202453987730061, + "grad_norm": 0.8377526895710913, + "learning_rate": 3.8319088876475595e-06, + "loss": 0.5481, + "step": 5870 + }, + { + "epoch": 0.7203680981595092, + "grad_norm": 0.8468446454248969, + "learning_rate": 3.828781243631744e-06, + "loss": 0.5564, + "step": 5871 + }, + { + "epoch": 0.7204907975460123, + "grad_norm": 0.8871797921790248, + "learning_rate": 3.825654574306873e-06, + "loss": 0.5689, + "step": 5872 + }, + { + "epoch": 0.7206134969325153, + "grad_norm": 0.8469651135765706, + "learning_rate": 3.822528880166783e-06, + "loss": 0.4678, + "step": 5873 + }, + { + "epoch": 0.7207361963190184, + "grad_norm": 0.9109140810792221, + "learning_rate": 3.819404161705144e-06, + "loss": 0.5762, + "step": 5874 + }, + { + "epoch": 0.7208588957055214, + "grad_norm": 0.8676763011820039, + "learning_rate": 3.816280419415487e-06, + "loss": 0.533, + "step": 5875 + }, + { + "epoch": 0.7209815950920245, + 
"grad_norm": 0.9454017775402023, + "learning_rate": 3.813157653791171e-06, + "loss": 0.5827, + "step": 5876 + }, + { + "epoch": 0.7211042944785276, + "grad_norm": 0.8819179187058639, + "learning_rate": 3.8100358653254177e-06, + "loss": 0.5802, + "step": 5877 + }, + { + "epoch": 0.7212269938650306, + "grad_norm": 0.9275333942306917, + "learning_rate": 3.806915054511281e-06, + "loss": 0.6038, + "step": 5878 + }, + { + "epoch": 0.7213496932515338, + "grad_norm": 1.03414722192433, + "learning_rate": 3.8037952218416672e-06, + "loss": 0.5479, + "step": 5879 + }, + { + "epoch": 0.7214723926380369, + "grad_norm": 0.8292950446080234, + "learning_rate": 3.8006763678093326e-06, + "loss": 0.5516, + "step": 5880 + }, + { + "epoch": 0.7215950920245399, + "grad_norm": 0.8272203136484029, + "learning_rate": 3.7975584929068653e-06, + "loss": 0.5779, + "step": 5881 + }, + { + "epoch": 0.721717791411043, + "grad_norm": 0.8952981273758582, + "learning_rate": 3.7944415976267147e-06, + "loss": 0.5919, + "step": 5882 + }, + { + "epoch": 0.721840490797546, + "grad_norm": 0.875981757490247, + "learning_rate": 3.79132568246116e-06, + "loss": 0.5569, + "step": 5883 + }, + { + "epoch": 0.7219631901840491, + "grad_norm": 0.8849723191002984, + "learning_rate": 3.78821074790234e-06, + "loss": 0.565, + "step": 5884 + }, + { + "epoch": 0.7220858895705522, + "grad_norm": 0.9002987906085361, + "learning_rate": 3.785096794442229e-06, + "loss": 0.5653, + "step": 5885 + }, + { + "epoch": 0.7222085889570552, + "grad_norm": 0.7880776518736731, + "learning_rate": 3.781983822572646e-06, + "loss": 0.5684, + "step": 5886 + }, + { + "epoch": 0.7223312883435583, + "grad_norm": 0.936285908437408, + "learning_rate": 3.7788718327852625e-06, + "loss": 0.5154, + "step": 5887 + }, + { + "epoch": 0.7224539877300613, + "grad_norm": 0.8851957200104832, + "learning_rate": 3.775760825571587e-06, + "loss": 0.4977, + "step": 5888 + }, + { + "epoch": 0.7225766871165644, + "grad_norm": 0.9873647341330447, + "learning_rate": 3.7726508014229825e-06, + "loss": 0.533, + "step": 5889 + }, + { + "epoch": 0.7226993865030675, + "grad_norm": 0.800847259819679, + "learning_rate": 3.7695417608306415e-06, + "loss": 0.5653, + "step": 5890 + }, + { + "epoch": 0.7228220858895705, + "grad_norm": 0.9469349333208045, + "learning_rate": 3.766433704285619e-06, + "loss": 0.5338, + "step": 5891 + }, + { + "epoch": 0.7229447852760736, + "grad_norm": 0.8685942975117497, + "learning_rate": 3.763326632278799e-06, + "loss": 0.6257, + "step": 5892 + }, + { + "epoch": 0.7230674846625766, + "grad_norm": 0.936470875216317, + "learning_rate": 3.760220545300923e-06, + "loss": 0.6146, + "step": 5893 + }, + { + "epoch": 0.7231901840490798, + "grad_norm": 0.9224130093855736, + "learning_rate": 3.757115443842565e-06, + "loss": 0.5363, + "step": 5894 + }, + { + "epoch": 0.7233128834355829, + "grad_norm": 0.9209244470356109, + "learning_rate": 3.7540113283941536e-06, + "loss": 0.5745, + "step": 5895 + }, + { + "epoch": 0.7234355828220859, + "grad_norm": 1.1588616129881038, + "learning_rate": 3.750908199445953e-06, + "loss": 0.5424, + "step": 5896 + }, + { + "epoch": 0.723558282208589, + "grad_norm": 0.8267069525417874, + "learning_rate": 3.7478060574880805e-06, + "loss": 0.522, + "step": 5897 + }, + { + "epoch": 0.723680981595092, + "grad_norm": 0.8972726731734684, + "learning_rate": 3.7447049030104897e-06, + "loss": 0.5719, + "step": 5898 + }, + { + "epoch": 0.7238036809815951, + "grad_norm": 0.9772708480688703, + "learning_rate": 3.7416047365029793e-06, + "loss": 0.5652, + "step": 5899 
+ }, + { + "epoch": 0.7239263803680982, + "grad_norm": 0.909713151580799, + "learning_rate": 3.7385055584552e-06, + "loss": 0.517, + "step": 5900 + }, + { + "epoch": 0.7240490797546012, + "grad_norm": 0.7625237182477735, + "learning_rate": 3.735407369356633e-06, + "loss": 0.5593, + "step": 5901 + }, + { + "epoch": 0.7241717791411043, + "grad_norm": 0.9060899331476285, + "learning_rate": 3.732310169696618e-06, + "loss": 0.5084, + "step": 5902 + }, + { + "epoch": 0.7242944785276073, + "grad_norm": 0.9072735513583101, + "learning_rate": 3.729213959964323e-06, + "loss": 0.571, + "step": 5903 + }, + { + "epoch": 0.7244171779141104, + "grad_norm": 1.0550307433758535, + "learning_rate": 3.7261187406487764e-06, + "loss": 0.575, + "step": 5904 + }, + { + "epoch": 0.7245398773006135, + "grad_norm": 0.8692171290929895, + "learning_rate": 3.723024512238833e-06, + "loss": 0.583, + "step": 5905 + }, + { + "epoch": 0.7246625766871165, + "grad_norm": 0.8492417416558704, + "learning_rate": 3.7199312752232053e-06, + "loss": 0.483, + "step": 5906 + }, + { + "epoch": 0.7247852760736196, + "grad_norm": 0.8916211395741879, + "learning_rate": 3.7168390300904445e-06, + "loss": 0.5752, + "step": 5907 + }, + { + "epoch": 0.7249079754601226, + "grad_norm": 0.9149248475081564, + "learning_rate": 3.7137477773289377e-06, + "loss": 0.5858, + "step": 5908 + }, + { + "epoch": 0.7250306748466258, + "grad_norm": 0.8919931227652167, + "learning_rate": 3.71065751742693e-06, + "loss": 0.5633, + "step": 5909 + }, + { + "epoch": 0.7251533742331289, + "grad_norm": 0.9773911572948109, + "learning_rate": 3.707568250872493e-06, + "loss": 0.5528, + "step": 5910 + }, + { + "epoch": 0.7252760736196319, + "grad_norm": 0.8620379082080892, + "learning_rate": 3.7044799781535568e-06, + "loss": 0.5233, + "step": 5911 + }, + { + "epoch": 0.725398773006135, + "grad_norm": 0.8707948486962749, + "learning_rate": 3.701392699757882e-06, + "loss": 0.5432, + "step": 5912 + }, + { + "epoch": 0.7255214723926381, + "grad_norm": 0.8794400438392137, + "learning_rate": 3.698306416173083e-06, + "loss": 0.5171, + "step": 5913 + }, + { + "epoch": 0.7256441717791411, + "grad_norm": 0.797127933448321, + "learning_rate": 3.6952211278866058e-06, + "loss": 0.5799, + "step": 5914 + }, + { + "epoch": 0.7257668711656442, + "grad_norm": 0.8991832569171784, + "learning_rate": 3.6921368353857524e-06, + "loss": 0.5873, + "step": 5915 + }, + { + "epoch": 0.7258895705521472, + "grad_norm": 0.990519281848287, + "learning_rate": 3.6890535391576554e-06, + "loss": 0.4728, + "step": 5916 + }, + { + "epoch": 0.7260122699386503, + "grad_norm": 1.0009552711277194, + "learning_rate": 3.685971239689292e-06, + "loss": 0.6334, + "step": 5917 + }, + { + "epoch": 0.7261349693251534, + "grad_norm": 0.8301590529080924, + "learning_rate": 3.6828899374674933e-06, + "loss": 0.5359, + "step": 5918 + }, + { + "epoch": 0.7262576687116564, + "grad_norm": 0.914431311913465, + "learning_rate": 3.6798096329789144e-06, + "loss": 0.5698, + "step": 5919 + }, + { + "epoch": 0.7263803680981595, + "grad_norm": 0.8325089515319308, + "learning_rate": 3.676730326710074e-06, + "loss": 0.559, + "step": 5920 + }, + { + "epoch": 0.7265030674846625, + "grad_norm": 0.9391218226474937, + "learning_rate": 3.673652019147311e-06, + "loss": 0.5667, + "step": 5921 + }, + { + "epoch": 0.7266257668711656, + "grad_norm": 0.9152281120040063, + "learning_rate": 3.6705747107768275e-06, + "loss": 0.5835, + "step": 5922 + }, + { + "epoch": 0.7267484662576688, + "grad_norm": 0.8662550272064778, + "learning_rate": 
3.6674984020846503e-06, + "loss": 0.586, + "step": 5923 + }, + { + "epoch": 0.7268711656441718, + "grad_norm": 0.8945467996108597, + "learning_rate": 3.6644230935566614e-06, + "loss": 0.5264, + "step": 5924 + }, + { + "epoch": 0.7269938650306749, + "grad_norm": 0.9419757247779371, + "learning_rate": 3.6613487856785744e-06, + "loss": 0.533, + "step": 5925 + }, + { + "epoch": 0.7271165644171779, + "grad_norm": 0.7981790696309919, + "learning_rate": 3.6582754789359553e-06, + "loss": 0.5721, + "step": 5926 + }, + { + "epoch": 0.727239263803681, + "grad_norm": 0.8791942277668675, + "learning_rate": 3.6552031738142004e-06, + "loss": 0.5207, + "step": 5927 + }, + { + "epoch": 0.7273619631901841, + "grad_norm": 0.8923915797319895, + "learning_rate": 3.652131870798561e-06, + "loss": 0.4803, + "step": 5928 + }, + { + "epoch": 0.7274846625766871, + "grad_norm": 1.0231300929863791, + "learning_rate": 3.6490615703741184e-06, + "loss": 0.5312, + "step": 5929 + }, + { + "epoch": 0.7276073619631902, + "grad_norm": 0.7994240134697922, + "learning_rate": 3.645992273025797e-06, + "loss": 0.474, + "step": 5930 + }, + { + "epoch": 0.7277300613496932, + "grad_norm": 0.8845883303200093, + "learning_rate": 3.642923979238373e-06, + "loss": 0.5349, + "step": 5931 + }, + { + "epoch": 0.7278527607361963, + "grad_norm": 0.8361574282401886, + "learning_rate": 3.6398566894964515e-06, + "loss": 0.5839, + "step": 5932 + }, + { + "epoch": 0.7279754601226994, + "grad_norm": 0.9124649042260095, + "learning_rate": 3.6367904042844857e-06, + "loss": 0.5321, + "step": 5933 + }, + { + "epoch": 0.7280981595092024, + "grad_norm": 0.8329213592426048, + "learning_rate": 3.6337251240867744e-06, + "loss": 0.5494, + "step": 5934 + }, + { + "epoch": 0.7282208588957055, + "grad_norm": 0.8606494807012754, + "learning_rate": 3.630660849387444e-06, + "loss": 0.5134, + "step": 5935 + }, + { + "epoch": 0.7283435582822085, + "grad_norm": 0.9022543847681632, + "learning_rate": 3.6275975806704777e-06, + "loss": 0.5722, + "step": 5936 + }, + { + "epoch": 0.7284662576687116, + "grad_norm": 0.96344225849891, + "learning_rate": 3.624535318419685e-06, + "loss": 0.5434, + "step": 5937 + }, + { + "epoch": 0.7285889570552148, + "grad_norm": 0.7922137494591112, + "learning_rate": 3.621474063118733e-06, + "loss": 0.551, + "step": 5938 + }, + { + "epoch": 0.7287116564417178, + "grad_norm": 1.046777365217492, + "learning_rate": 3.6184138152511107e-06, + "loss": 0.5177, + "step": 5939 + }, + { + "epoch": 0.7288343558282209, + "grad_norm": 0.9724676618899649, + "learning_rate": 3.6153545753001663e-06, + "loss": 0.5491, + "step": 5940 + }, + { + "epoch": 0.728957055214724, + "grad_norm": 0.9498935611483804, + "learning_rate": 3.6122963437490745e-06, + "loss": 0.5541, + "step": 5941 + }, + { + "epoch": 0.729079754601227, + "grad_norm": 0.9112513167604674, + "learning_rate": 3.609239121080862e-06, + "loss": 0.5772, + "step": 5942 + }, + { + "epoch": 0.7292024539877301, + "grad_norm": 0.9613615232400975, + "learning_rate": 3.6061829077783837e-06, + "loss": 0.5078, + "step": 5943 + }, + { + "epoch": 0.7293251533742331, + "grad_norm": 0.8802928079952381, + "learning_rate": 3.6031277043243505e-06, + "loss": 0.5082, + "step": 5944 + }, + { + "epoch": 0.7294478527607362, + "grad_norm": 0.8501066160354503, + "learning_rate": 3.6000735112012984e-06, + "loss": 0.4944, + "step": 5945 + }, + { + "epoch": 0.7295705521472393, + "grad_norm": 0.8002592171050578, + "learning_rate": 3.5970203288916183e-06, + "loss": 0.6084, + "step": 5946 + }, + { + "epoch": 0.7296932515337423, + 
"grad_norm": 0.9428372353132212, + "learning_rate": 3.593968157877529e-06, + "loss": 0.544, + "step": 5947 + }, + { + "epoch": 0.7298159509202454, + "grad_norm": 0.7987293204123412, + "learning_rate": 3.590916998641092e-06, + "loss": 0.498, + "step": 5948 + }, + { + "epoch": 0.7299386503067484, + "grad_norm": 0.8690546782592344, + "learning_rate": 3.587866851664219e-06, + "loss": 0.4929, + "step": 5949 + }, + { + "epoch": 0.7300613496932515, + "grad_norm": 1.0065200471528863, + "learning_rate": 3.584817717428647e-06, + "loss": 0.5188, + "step": 5950 + }, + { + "epoch": 0.7301840490797546, + "grad_norm": 0.8344103047986509, + "learning_rate": 3.5817695964159673e-06, + "loss": 0.5635, + "step": 5951 + }, + { + "epoch": 0.7303067484662576, + "grad_norm": 0.8615727833391371, + "learning_rate": 3.578722489107599e-06, + "loss": 0.5615, + "step": 5952 + }, + { + "epoch": 0.7304294478527608, + "grad_norm": 0.8332640903442158, + "learning_rate": 3.5756763959848117e-06, + "loss": 0.5025, + "step": 5953 + }, + { + "epoch": 0.7305521472392638, + "grad_norm": 0.9753622971997661, + "learning_rate": 3.572631317528703e-06, + "loss": 0.5731, + "step": 5954 + }, + { + "epoch": 0.7306748466257669, + "grad_norm": 0.8237677571890211, + "learning_rate": 3.569587254220225e-06, + "loss": 0.5732, + "step": 5955 + }, + { + "epoch": 0.73079754601227, + "grad_norm": 0.862373850670542, + "learning_rate": 3.566544206540152e-06, + "loss": 0.5486, + "step": 5956 + }, + { + "epoch": 0.730920245398773, + "grad_norm": 0.7972425056831559, + "learning_rate": 3.563502174969117e-06, + "loss": 0.5316, + "step": 5957 + }, + { + "epoch": 0.7310429447852761, + "grad_norm": 0.8691992035758921, + "learning_rate": 3.560461159987577e-06, + "loss": 0.4825, + "step": 5958 + }, + { + "epoch": 0.7311656441717791, + "grad_norm": 0.9924377520202345, + "learning_rate": 3.5574211620758327e-06, + "loss": 0.5111, + "step": 5959 + }, + { + "epoch": 0.7312883435582822, + "grad_norm": 0.8261230627541942, + "learning_rate": 3.5543821817140313e-06, + "loss": 0.5564, + "step": 5960 + }, + { + "epoch": 0.7314110429447853, + "grad_norm": 0.999158062318105, + "learning_rate": 3.5513442193821425e-06, + "loss": 0.6177, + "step": 5961 + }, + { + "epoch": 0.7315337423312883, + "grad_norm": 0.9191182401903785, + "learning_rate": 3.5483072755600012e-06, + "loss": 0.5653, + "step": 5962 + }, + { + "epoch": 0.7316564417177914, + "grad_norm": 0.8756571351044651, + "learning_rate": 3.545271350727257e-06, + "loss": 0.5369, + "step": 5963 + }, + { + "epoch": 0.7317791411042944, + "grad_norm": 0.8894162824002799, + "learning_rate": 3.542236445363414e-06, + "loss": 0.5147, + "step": 5964 + }, + { + "epoch": 0.7319018404907975, + "grad_norm": 0.9383722661220145, + "learning_rate": 3.5392025599478053e-06, + "loss": 0.5475, + "step": 5965 + }, + { + "epoch": 0.7320245398773007, + "grad_norm": 0.7989885067433995, + "learning_rate": 3.5361696949596046e-06, + "loss": 0.5601, + "step": 5966 + }, + { + "epoch": 0.7321472392638037, + "grad_norm": 0.8887604629630381, + "learning_rate": 3.5331378508778336e-06, + "loss": 0.577, + "step": 5967 + }, + { + "epoch": 0.7322699386503068, + "grad_norm": 0.8695995906153897, + "learning_rate": 3.53010702818134e-06, + "loss": 0.5434, + "step": 5968 + }, + { + "epoch": 0.7323926380368098, + "grad_norm": 0.8844336108958155, + "learning_rate": 3.5270772273488206e-06, + "loss": 0.5409, + "step": 5969 + }, + { + "epoch": 0.7325153374233129, + "grad_norm": 0.8756485915094504, + "learning_rate": 3.5240484488588012e-06, + "loss": 0.5345, + "step": 
5970 + }, + { + "epoch": 0.732638036809816, + "grad_norm": 1.0257356723196096, + "learning_rate": 3.5210206931896586e-06, + "loss": 0.4912, + "step": 5971 + }, + { + "epoch": 0.732760736196319, + "grad_norm": 0.8161301468114641, + "learning_rate": 3.5179939608195935e-06, + "loss": 0.5309, + "step": 5972 + }, + { + "epoch": 0.7328834355828221, + "grad_norm": 0.9254231127188198, + "learning_rate": 3.5149682522266582e-06, + "loss": 0.5211, + "step": 5973 + }, + { + "epoch": 0.7330061349693252, + "grad_norm": 0.7751734297564792, + "learning_rate": 3.5119435678887328e-06, + "loss": 0.4659, + "step": 5974 + }, + { + "epoch": 0.7331288343558282, + "grad_norm": 0.8712153107916724, + "learning_rate": 3.5089199082835436e-06, + "loss": 0.5735, + "step": 5975 + }, + { + "epoch": 0.7332515337423313, + "grad_norm": 0.8716351283856082, + "learning_rate": 3.5058972738886476e-06, + "loss": 0.5168, + "step": 5976 + }, + { + "epoch": 0.7333742331288343, + "grad_norm": 0.8448156154316994, + "learning_rate": 3.502875665181449e-06, + "loss": 0.579, + "step": 5977 + }, + { + "epoch": 0.7334969325153374, + "grad_norm": 0.8496817890303003, + "learning_rate": 3.499855082639183e-06, + "loss": 0.5827, + "step": 5978 + }, + { + "epoch": 0.7336196319018405, + "grad_norm": 0.9145828373791982, + "learning_rate": 3.496835526738921e-06, + "loss": 0.5537, + "step": 5979 + }, + { + "epoch": 0.7337423312883435, + "grad_norm": 0.7921122066345749, + "learning_rate": 3.493816997957582e-06, + "loss": 0.5436, + "step": 5980 + }, + { + "epoch": 0.7338650306748467, + "grad_norm": 0.7870041788183836, + "learning_rate": 3.4907994967719096e-06, + "loss": 0.5685, + "step": 5981 + }, + { + "epoch": 0.7339877300613497, + "grad_norm": 0.9907792491532603, + "learning_rate": 3.4877830236584997e-06, + "loss": 0.5557, + "step": 5982 + }, + { + "epoch": 0.7341104294478528, + "grad_norm": 0.9266931881024175, + "learning_rate": 3.484767579093772e-06, + "loss": 0.5753, + "step": 5983 + }, + { + "epoch": 0.7342331288343559, + "grad_norm": 1.5191371621615362, + "learning_rate": 3.4817531635539946e-06, + "loss": 0.5315, + "step": 5984 + }, + { + "epoch": 0.7343558282208589, + "grad_norm": 0.8380728905406227, + "learning_rate": 3.478739777515264e-06, + "loss": 0.5285, + "step": 5985 + }, + { + "epoch": 0.734478527607362, + "grad_norm": 0.8120959446456106, + "learning_rate": 3.4757274214535254e-06, + "loss": 0.4912, + "step": 5986 + }, + { + "epoch": 0.734601226993865, + "grad_norm": 1.007777577820347, + "learning_rate": 3.472716095844547e-06, + "loss": 0.5452, + "step": 5987 + }, + { + "epoch": 0.7347239263803681, + "grad_norm": 1.060972671693418, + "learning_rate": 3.469705801163944e-06, + "loss": 0.5608, + "step": 5988 + }, + { + "epoch": 0.7348466257668712, + "grad_norm": 0.8755320910104148, + "learning_rate": 3.466696537887172e-06, + "loss": 0.5857, + "step": 5989 + }, + { + "epoch": 0.7349693251533742, + "grad_norm": 0.8529630221340826, + "learning_rate": 3.463688306489511e-06, + "loss": 0.5944, + "step": 5990 + }, + { + "epoch": 0.7350920245398773, + "grad_norm": 0.9981451212731883, + "learning_rate": 3.460681107446091e-06, + "loss": 0.566, + "step": 5991 + }, + { + "epoch": 0.7352147239263803, + "grad_norm": 0.8968633808788442, + "learning_rate": 3.4576749412318676e-06, + "loss": 0.5076, + "step": 5992 + }, + { + "epoch": 0.7353374233128834, + "grad_norm": 0.8868488594841593, + "learning_rate": 3.454669808321646e-06, + "loss": 0.5219, + "step": 5993 + }, + { + "epoch": 0.7354601226993865, + "grad_norm": 0.9958545926429931, + "learning_rate": 
3.4516657091900517e-06, + "loss": 0.5482, + "step": 5994 + }, + { + "epoch": 0.7355828220858895, + "grad_norm": 0.8438977382990828, + "learning_rate": 3.448662644311567e-06, + "loss": 0.5709, + "step": 5995 + }, + { + "epoch": 0.7357055214723927, + "grad_norm": 0.8492353122602644, + "learning_rate": 3.4456606141604932e-06, + "loss": 0.5555, + "step": 5996 + }, + { + "epoch": 0.7358282208588957, + "grad_norm": 0.841081156939197, + "learning_rate": 3.442659619210974e-06, + "loss": 0.4801, + "step": 5997 + }, + { + "epoch": 0.7359509202453988, + "grad_norm": 1.1678298798270907, + "learning_rate": 3.439659659936997e-06, + "loss": 0.5953, + "step": 5998 + }, + { + "epoch": 0.7360736196319019, + "grad_norm": 0.832666029448814, + "learning_rate": 3.4366607368123727e-06, + "loss": 0.5366, + "step": 5999 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.8884923922974549, + "learning_rate": 3.433662850310763e-06, + "loss": 0.5278, + "step": 6000 + }, + { + "epoch": 0.736319018404908, + "grad_norm": 0.8256402595949277, + "learning_rate": 3.43066600090565e-06, + "loss": 0.5281, + "step": 6001 + }, + { + "epoch": 0.736441717791411, + "grad_norm": 0.9888724758638817, + "learning_rate": 3.427670189070369e-06, + "loss": 0.5371, + "step": 6002 + }, + { + "epoch": 0.7365644171779141, + "grad_norm": 0.913094867902773, + "learning_rate": 3.424675415278075e-06, + "loss": 0.5851, + "step": 6003 + }, + { + "epoch": 0.7366871165644172, + "grad_norm": 0.9688913546291684, + "learning_rate": 3.4216816800017727e-06, + "loss": 0.5161, + "step": 6004 + }, + { + "epoch": 0.7368098159509202, + "grad_norm": 0.9499500708041309, + "learning_rate": 3.418688983714291e-06, + "loss": 0.5305, + "step": 6005 + }, + { + "epoch": 0.7369325153374233, + "grad_norm": 2.0328164655542182, + "learning_rate": 3.4156973268883076e-06, + "loss": 0.548, + "step": 6006 + }, + { + "epoch": 0.7370552147239264, + "grad_norm": 0.7972367937058669, + "learning_rate": 3.412706709996325e-06, + "loss": 0.534, + "step": 6007 + }, + { + "epoch": 0.7371779141104294, + "grad_norm": 0.9706489592672032, + "learning_rate": 3.409717133510683e-06, + "loss": 0.5278, + "step": 6008 + }, + { + "epoch": 0.7373006134969325, + "grad_norm": 0.8611707684106815, + "learning_rate": 3.4067285979035637e-06, + "loss": 0.5301, + "step": 6009 + }, + { + "epoch": 0.7374233128834355, + "grad_norm": 0.9173109544317213, + "learning_rate": 3.403741103646977e-06, + "loss": 0.5852, + "step": 6010 + }, + { + "epoch": 0.7375460122699387, + "grad_norm": 0.8499915803308463, + "learning_rate": 3.4007546512127764e-06, + "loss": 0.5281, + "step": 6011 + }, + { + "epoch": 0.7376687116564418, + "grad_norm": 0.8628787653577114, + "learning_rate": 3.3977692410726416e-06, + "loss": 0.5162, + "step": 6012 + }, + { + "epoch": 0.7377914110429448, + "grad_norm": 0.8429521332302653, + "learning_rate": 3.394784873698098e-06, + "loss": 0.539, + "step": 6013 + }, + { + "epoch": 0.7379141104294479, + "grad_norm": 0.8263003054885073, + "learning_rate": 3.391801549560494e-06, + "loss": 0.5461, + "step": 6014 + }, + { + "epoch": 0.7380368098159509, + "grad_norm": 0.9579295709166281, + "learning_rate": 3.3888192691310262e-06, + "loss": 0.5212, + "step": 6015 + }, + { + "epoch": 0.738159509202454, + "grad_norm": 1.0339951647881938, + "learning_rate": 3.385838032880715e-06, + "loss": 0.5498, + "step": 6016 + }, + { + "epoch": 0.7382822085889571, + "grad_norm": 0.8932049975110159, + "learning_rate": 3.3828578412804235e-06, + "loss": 0.518, + "step": 6017 + }, + { + "epoch": 0.7384049079754601, + 
"grad_norm": 0.9202376281860855, + "learning_rate": 3.3798786948008498e-06, + "loss": 0.501, + "step": 6018 + }, + { + "epoch": 0.7385276073619632, + "grad_norm": 0.9505406358054351, + "learning_rate": 3.3769005939125198e-06, + "loss": 0.5289, + "step": 6019 + }, + { + "epoch": 0.7386503067484662, + "grad_norm": 0.8529960812772104, + "learning_rate": 3.373923539085805e-06, + "loss": 0.4823, + "step": 6020 + }, + { + "epoch": 0.7387730061349693, + "grad_norm": 0.8690982648373852, + "learning_rate": 3.3709475307908967e-06, + "loss": 0.5512, + "step": 6021 + }, + { + "epoch": 0.7388957055214724, + "grad_norm": 1.0440971127852292, + "learning_rate": 3.3679725694978395e-06, + "loss": 0.518, + "step": 6022 + }, + { + "epoch": 0.7390184049079754, + "grad_norm": 0.9558387816098507, + "learning_rate": 3.364998655676496e-06, + "loss": 0.5299, + "step": 6023 + }, + { + "epoch": 0.7391411042944785, + "grad_norm": 1.0316817363557247, + "learning_rate": 3.3620257897965748e-06, + "loss": 0.5502, + "step": 6024 + }, + { + "epoch": 0.7392638036809815, + "grad_norm": 0.9295192392604673, + "learning_rate": 3.3590539723276083e-06, + "loss": 0.5315, + "step": 6025 + }, + { + "epoch": 0.7393865030674847, + "grad_norm": 0.8861325530096887, + "learning_rate": 3.356083203738978e-06, + "loss": 0.5415, + "step": 6026 + }, + { + "epoch": 0.7395092024539878, + "grad_norm": 0.8601138185677717, + "learning_rate": 3.353113484499887e-06, + "loss": 0.5842, + "step": 6027 + }, + { + "epoch": 0.7396319018404908, + "grad_norm": 0.866431954638484, + "learning_rate": 3.3501448150793737e-06, + "loss": 0.5804, + "step": 6028 + }, + { + "epoch": 0.7397546012269939, + "grad_norm": 0.9069328454322261, + "learning_rate": 3.3471771959463195e-06, + "loss": 0.5889, + "step": 6029 + }, + { + "epoch": 0.7398773006134969, + "grad_norm": 1.0570637234839857, + "learning_rate": 3.3442106275694295e-06, + "loss": 0.5952, + "step": 6030 + }, + { + "epoch": 0.74, + "grad_norm": 0.9700234338143797, + "learning_rate": 3.341245110417253e-06, + "loss": 0.5452, + "step": 6031 + }, + { + "epoch": 0.7401226993865031, + "grad_norm": 1.0137485045727805, + "learning_rate": 3.338280644958162e-06, + "loss": 0.5496, + "step": 6032 + }, + { + "epoch": 0.7402453987730061, + "grad_norm": 0.8944514123209392, + "learning_rate": 3.3353172316603744e-06, + "loss": 0.5533, + "step": 6033 + }, + { + "epoch": 0.7403680981595092, + "grad_norm": 0.9821032754352604, + "learning_rate": 3.3323548709919286e-06, + "loss": 0.6032, + "step": 6034 + }, + { + "epoch": 0.7404907975460123, + "grad_norm": 0.8329604582694184, + "learning_rate": 3.329393563420713e-06, + "loss": 0.5264, + "step": 6035 + }, + { + "epoch": 0.7406134969325153, + "grad_norm": 0.9176832479169291, + "learning_rate": 3.3264333094144317e-06, + "loss": 0.5114, + "step": 6036 + }, + { + "epoch": 0.7407361963190184, + "grad_norm": 0.9193494971285681, + "learning_rate": 3.323474109440639e-06, + "loss": 0.5631, + "step": 6037 + }, + { + "epoch": 0.7408588957055214, + "grad_norm": 0.9382948332406009, + "learning_rate": 3.3205159639667117e-06, + "loss": 0.5152, + "step": 6038 + }, + { + "epoch": 0.7409815950920245, + "grad_norm": 0.9467533593148808, + "learning_rate": 3.3175588734598597e-06, + "loss": 0.6221, + "step": 6039 + }, + { + "epoch": 0.7411042944785277, + "grad_norm": 0.8893990974785184, + "learning_rate": 3.3146028383871363e-06, + "loss": 0.5445, + "step": 6040 + }, + { + "epoch": 0.7412269938650307, + "grad_norm": 0.838620628427, + "learning_rate": 3.3116478592154177e-06, + "loss": 0.4564, + "step": 6041 + 
}, + { + "epoch": 0.7413496932515338, + "grad_norm": 0.9520578221981473, + "learning_rate": 3.308693936411421e-06, + "loss": 0.5013, + "step": 6042 + }, + { + "epoch": 0.7414723926380368, + "grad_norm": 0.8338291444317085, + "learning_rate": 3.305741070441685e-06, + "loss": 0.5187, + "step": 6043 + }, + { + "epoch": 0.7415950920245399, + "grad_norm": 0.8980929214861042, + "learning_rate": 3.302789261772601e-06, + "loss": 0.5781, + "step": 6044 + }, + { + "epoch": 0.741717791411043, + "grad_norm": 1.1041592513324783, + "learning_rate": 3.2998385108703766e-06, + "loss": 0.584, + "step": 6045 + }, + { + "epoch": 0.741840490797546, + "grad_norm": 0.8660733404468619, + "learning_rate": 3.296888818201054e-06, + "loss": 0.4768, + "step": 6046 + }, + { + "epoch": 0.7419631901840491, + "grad_norm": 0.8802624349364299, + "learning_rate": 3.2939401842305187e-06, + "loss": 0.5691, + "step": 6047 + }, + { + "epoch": 0.7420858895705521, + "grad_norm": 0.9940141239277193, + "learning_rate": 3.290992609424475e-06, + "loss": 0.5398, + "step": 6048 + }, + { + "epoch": 0.7422085889570552, + "grad_norm": 0.9560071162365051, + "learning_rate": 3.2880460942484727e-06, + "loss": 0.5577, + "step": 6049 + }, + { + "epoch": 0.7423312883435583, + "grad_norm": 0.8499538897719283, + "learning_rate": 3.285100639167883e-06, + "loss": 0.5722, + "step": 6050 + }, + { + "epoch": 0.7424539877300613, + "grad_norm": 0.9132265956261326, + "learning_rate": 3.2821562446479215e-06, + "loss": 0.586, + "step": 6051 + }, + { + "epoch": 0.7425766871165644, + "grad_norm": 0.8520613669658071, + "learning_rate": 3.279212911153624e-06, + "loss": 0.5573, + "step": 6052 + }, + { + "epoch": 0.7426993865030674, + "grad_norm": 0.9017408525589339, + "learning_rate": 3.2762706391498712e-06, + "loss": 0.5732, + "step": 6053 + }, + { + "epoch": 0.7428220858895705, + "grad_norm": 0.9491878381811709, + "learning_rate": 3.273329429101362e-06, + "loss": 0.5822, + "step": 6054 + }, + { + "epoch": 0.7429447852760737, + "grad_norm": 0.8000171126467939, + "learning_rate": 3.2703892814726436e-06, + "loss": 0.495, + "step": 6055 + }, + { + "epoch": 0.7430674846625767, + "grad_norm": 0.9194081018372622, + "learning_rate": 3.2674501967280814e-06, + "loss": 0.5533, + "step": 6056 + }, + { + "epoch": 0.7431901840490798, + "grad_norm": 0.856020454857949, + "learning_rate": 3.2645121753318777e-06, + "loss": 0.5446, + "step": 6057 + }, + { + "epoch": 0.7433128834355828, + "grad_norm": 0.8001189246037217, + "learning_rate": 3.261575217748072e-06, + "loss": 0.4909, + "step": 6058 + }, + { + "epoch": 0.7434355828220859, + "grad_norm": 0.8138467725441894, + "learning_rate": 3.258639324440527e-06, + "loss": 0.6021, + "step": 6059 + }, + { + "epoch": 0.743558282208589, + "grad_norm": 0.7701503849158087, + "learning_rate": 3.2557044958729466e-06, + "loss": 0.5482, + "step": 6060 + }, + { + "epoch": 0.743680981595092, + "grad_norm": 0.8852709801603755, + "learning_rate": 3.2527707325088577e-06, + "loss": 0.5481, + "step": 6061 + }, + { + "epoch": 0.7438036809815951, + "grad_norm": 0.8800986922968741, + "learning_rate": 3.2498380348116264e-06, + "loss": 0.6067, + "step": 6062 + }, + { + "epoch": 0.7439263803680981, + "grad_norm": 0.8385834723838042, + "learning_rate": 3.246906403244443e-06, + "loss": 0.5719, + "step": 6063 + }, + { + "epoch": 0.7440490797546012, + "grad_norm": 0.8120291002951624, + "learning_rate": 3.243975838270339e-06, + "loss": 0.5322, + "step": 6064 + }, + { + "epoch": 0.7441717791411043, + "grad_norm": 0.9063085139159662, + "learning_rate": 
3.2410463403521653e-06, + "loss": 0.5266, + "step": 6065 + }, + { + "epoch": 0.7442944785276073, + "grad_norm": 0.8942488186503772, + "learning_rate": 3.2381179099526173e-06, + "loss": 0.5866, + "step": 6066 + }, + { + "epoch": 0.7444171779141104, + "grad_norm": 0.8473879923995854, + "learning_rate": 3.2351905475342094e-06, + "loss": 0.6016, + "step": 6067 + }, + { + "epoch": 0.7445398773006136, + "grad_norm": 0.8137004902127468, + "learning_rate": 3.2322642535592994e-06, + "loss": 0.603, + "step": 6068 + }, + { + "epoch": 0.7446625766871166, + "grad_norm": 0.8307879223020188, + "learning_rate": 3.229339028490068e-06, + "loss": 0.5342, + "step": 6069 + }, + { + "epoch": 0.7447852760736197, + "grad_norm": 0.8240314539290481, + "learning_rate": 3.2264148727885257e-06, + "loss": 0.5647, + "step": 6070 + }, + { + "epoch": 0.7449079754601227, + "grad_norm": 0.903956191532969, + "learning_rate": 3.2234917869165203e-06, + "loss": 0.5402, + "step": 6071 + }, + { + "epoch": 0.7450306748466258, + "grad_norm": 0.7936281424154724, + "learning_rate": 3.2205697713357286e-06, + "loss": 0.5209, + "step": 6072 + }, + { + "epoch": 0.7451533742331289, + "grad_norm": 0.8937957134249706, + "learning_rate": 3.21764882650766e-06, + "loss": 0.5858, + "step": 6073 + }, + { + "epoch": 0.7452760736196319, + "grad_norm": 0.8616091720940388, + "learning_rate": 3.214728952893649e-06, + "loss": 0.5558, + "step": 6074 + }, + { + "epoch": 0.745398773006135, + "grad_norm": 0.8101175656056372, + "learning_rate": 3.211810150954867e-06, + "loss": 0.5379, + "step": 6075 + }, + { + "epoch": 0.745521472392638, + "grad_norm": 0.8022460711758092, + "learning_rate": 3.208892421152314e-06, + "loss": 0.5302, + "step": 6076 + }, + { + "epoch": 0.7456441717791411, + "grad_norm": 1.1496765624435659, + "learning_rate": 3.205975763946815e-06, + "loss": 0.574, + "step": 6077 + }, + { + "epoch": 0.7457668711656442, + "grad_norm": 1.1638907894183657, + "learning_rate": 3.203060179799038e-06, + "loss": 0.5687, + "step": 6078 + }, + { + "epoch": 0.7458895705521472, + "grad_norm": 0.908015537871933, + "learning_rate": 3.200145669169469e-06, + "loss": 0.5524, + "step": 6079 + }, + { + "epoch": 0.7460122699386503, + "grad_norm": 0.8262773802670051, + "learning_rate": 3.1972322325184347e-06, + "loss": 0.6288, + "step": 6080 + }, + { + "epoch": 0.7461349693251533, + "grad_norm": 0.862024101968302, + "learning_rate": 3.1943198703060816e-06, + "loss": 0.517, + "step": 6081 + }, + { + "epoch": 0.7462576687116564, + "grad_norm": 0.8243906656205485, + "learning_rate": 3.191408582992399e-06, + "loss": 0.4589, + "step": 6082 + }, + { + "epoch": 0.7463803680981596, + "grad_norm": 0.8588758922991857, + "learning_rate": 3.188498371037193e-06, + "loss": 0.6133, + "step": 6083 + }, + { + "epoch": 0.7465030674846626, + "grad_norm": 0.9946637157744813, + "learning_rate": 3.185589234900113e-06, + "loss": 0.5367, + "step": 6084 + }, + { + "epoch": 0.7466257668711657, + "grad_norm": 0.9305923777195153, + "learning_rate": 3.182681175040625e-06, + "loss": 0.5521, + "step": 6085 + }, + { + "epoch": 0.7467484662576687, + "grad_norm": 0.8610496101257872, + "learning_rate": 3.1797741919180403e-06, + "loss": 0.574, + "step": 6086 + }, + { + "epoch": 0.7468711656441718, + "grad_norm": 0.8550436492419169, + "learning_rate": 3.176868285991487e-06, + "loss": 0.5732, + "step": 6087 + }, + { + "epoch": 0.7469938650306749, + "grad_norm": 0.9492327082272696, + "learning_rate": 3.173963457719924e-06, + "loss": 0.5976, + "step": 6088 + }, + { + "epoch": 0.7471165644171779, + 
"grad_norm": 1.0532819979835648, + "learning_rate": 3.171059707562153e-06, + "loss": 0.5651, + "step": 6089 + }, + { + "epoch": 0.747239263803681, + "grad_norm": 0.8047759430913369, + "learning_rate": 3.1681570359767875e-06, + "loss": 0.5715, + "step": 6090 + }, + { + "epoch": 0.747361963190184, + "grad_norm": 0.8912032469327904, + "learning_rate": 3.165255443422288e-06, + "loss": 0.5583, + "step": 6091 + }, + { + "epoch": 0.7474846625766871, + "grad_norm": 0.9167109177567565, + "learning_rate": 3.162354930356929e-06, + "loss": 0.5605, + "step": 6092 + }, + { + "epoch": 0.7476073619631902, + "grad_norm": 0.9437161077433047, + "learning_rate": 3.159455497238827e-06, + "loss": 0.6148, + "step": 6093 + }, + { + "epoch": 0.7477300613496932, + "grad_norm": 0.832720462472306, + "learning_rate": 3.1565571445259168e-06, + "loss": 0.5017, + "step": 6094 + }, + { + "epoch": 0.7478527607361963, + "grad_norm": 0.9038006110404464, + "learning_rate": 3.1536598726759747e-06, + "loss": 0.5639, + "step": 6095 + }, + { + "epoch": 0.7479754601226993, + "grad_norm": 0.812678526936845, + "learning_rate": 3.1507636821465936e-06, + "loss": 0.5333, + "step": 6096 + }, + { + "epoch": 0.7480981595092024, + "grad_norm": 0.9841537590421471, + "learning_rate": 3.1478685733952076e-06, + "loss": 0.586, + "step": 6097 + }, + { + "epoch": 0.7482208588957056, + "grad_norm": 1.015760898145509, + "learning_rate": 3.144974546879069e-06, + "loss": 0.5489, + "step": 6098 + }, + { + "epoch": 0.7483435582822086, + "grad_norm": 0.9713423096908153, + "learning_rate": 3.142081603055267e-06, + "loss": 0.5763, + "step": 6099 + }, + { + "epoch": 0.7484662576687117, + "grad_norm": 0.9367877537565446, + "learning_rate": 3.1391897423807204e-06, + "loss": 0.5813, + "step": 6100 + }, + { + "epoch": 0.7485889570552148, + "grad_norm": 0.9430356132661342, + "learning_rate": 3.136298965312168e-06, + "loss": 0.5662, + "step": 6101 + }, + { + "epoch": 0.7487116564417178, + "grad_norm": 0.9590164016430083, + "learning_rate": 3.133409272306188e-06, + "loss": 0.5331, + "step": 6102 + }, + { + "epoch": 0.7488343558282209, + "grad_norm": 0.9860441666832167, + "learning_rate": 3.1305206638191774e-06, + "loss": 0.5537, + "step": 6103 + }, + { + "epoch": 0.7489570552147239, + "grad_norm": 0.8265768306300566, + "learning_rate": 3.1276331403073733e-06, + "loss": 0.5777, + "step": 6104 + }, + { + "epoch": 0.749079754601227, + "grad_norm": 0.8857626391523563, + "learning_rate": 3.1247467022268284e-06, + "loss": 0.5375, + "step": 6105 + }, + { + "epoch": 0.7492024539877301, + "grad_norm": 0.8690257745156846, + "learning_rate": 3.1218613500334382e-06, + "loss": 0.5328, + "step": 6106 + }, + { + "epoch": 0.7493251533742331, + "grad_norm": 0.8977782359152555, + "learning_rate": 3.1189770841829147e-06, + "loss": 0.591, + "step": 6107 + }, + { + "epoch": 0.7494478527607362, + "grad_norm": 0.8983910706767674, + "learning_rate": 3.1160939051308004e-06, + "loss": 0.5549, + "step": 6108 + }, + { + "epoch": 0.7495705521472392, + "grad_norm": 0.865825833769718, + "learning_rate": 3.1132118133324753e-06, + "loss": 0.5799, + "step": 6109 + }, + { + "epoch": 0.7496932515337423, + "grad_norm": 0.8240393115781055, + "learning_rate": 3.110330809243134e-06, + "loss": 0.5112, + "step": 6110 + }, + { + "epoch": 0.7498159509202454, + "grad_norm": 0.793901783560232, + "learning_rate": 3.107450893317814e-06, + "loss": 0.5475, + "step": 6111 + }, + { + "epoch": 0.7499386503067484, + "grad_norm": 0.8587859574023196, + "learning_rate": 3.1045720660113655e-06, + "loss": 0.521, + 
"step": 6112 + }, + { + "epoch": 0.7500613496932516, + "grad_norm": 0.9376023259487695, + "learning_rate": 3.1016943277784806e-06, + "loss": 0.5819, + "step": 6113 + }, + { + "epoch": 0.7501840490797546, + "grad_norm": 547.4739110462629, + "learning_rate": 3.098817679073668e-06, + "loss": 0.5109, + "step": 6114 + }, + { + "epoch": 0.7503067484662577, + "grad_norm": 0.8636392397663669, + "learning_rate": 3.095942120351276e-06, + "loss": 0.5881, + "step": 6115 + }, + { + "epoch": 0.7504294478527608, + "grad_norm": 0.8562717655076523, + "learning_rate": 3.093067652065468e-06, + "loss": 0.5582, + "step": 6116 + }, + { + "epoch": 0.7505521472392638, + "grad_norm": 0.9607407740274069, + "learning_rate": 3.0901942746702464e-06, + "loss": 0.5498, + "step": 6117 + }, + { + "epoch": 0.7506748466257669, + "grad_norm": 0.8199243296523513, + "learning_rate": 3.087321988619435e-06, + "loss": 0.5135, + "step": 6118 + }, + { + "epoch": 0.7507975460122699, + "grad_norm": 0.9398048986976759, + "learning_rate": 3.0844507943666834e-06, + "loss": 0.559, + "step": 6119 + }, + { + "epoch": 0.750920245398773, + "grad_norm": 0.8198357308735361, + "learning_rate": 3.081580692365478e-06, + "loss": 0.5641, + "step": 6120 + }, + { + "epoch": 0.7510429447852761, + "grad_norm": 0.9027065513426287, + "learning_rate": 3.07871168306912e-06, + "loss": 0.579, + "step": 6121 + }, + { + "epoch": 0.7511656441717791, + "grad_norm": 0.9211830567424443, + "learning_rate": 3.0758437669307516e-06, + "loss": 0.5527, + "step": 6122 + }, + { + "epoch": 0.7512883435582822, + "grad_norm": 0.8610051294706452, + "learning_rate": 3.0729769444033286e-06, + "loss": 0.563, + "step": 6123 + }, + { + "epoch": 0.7514110429447852, + "grad_norm": 0.812961311159405, + "learning_rate": 3.0701112159396486e-06, + "loss": 0.4703, + "step": 6124 + }, + { + "epoch": 0.7515337423312883, + "grad_norm": 0.9308485039712158, + "learning_rate": 3.0672465819923215e-06, + "loss": 0.6077, + "step": 6125 + }, + { + "epoch": 0.7516564417177914, + "grad_norm": 0.9407422760110136, + "learning_rate": 3.0643830430137933e-06, + "loss": 0.5473, + "step": 6126 + }, + { + "epoch": 0.7517791411042944, + "grad_norm": 0.8825427735093756, + "learning_rate": 3.0615205994563412e-06, + "loss": 0.5604, + "step": 6127 + }, + { + "epoch": 0.7519018404907976, + "grad_norm": 0.9018556555679089, + "learning_rate": 3.058659251772057e-06, + "loss": 0.5518, + "step": 6128 + }, + { + "epoch": 0.7520245398773007, + "grad_norm": 1.064423761950228, + "learning_rate": 3.0557990004128703e-06, + "loss": 0.5047, + "step": 6129 + }, + { + "epoch": 0.7521472392638037, + "grad_norm": 1.6782367863873608, + "learning_rate": 3.052939845830528e-06, + "loss": 0.5422, + "step": 6130 + }, + { + "epoch": 0.7522699386503068, + "grad_norm": 0.9317923669630455, + "learning_rate": 3.0500817884766155e-06, + "loss": 0.4994, + "step": 6131 + }, + { + "epoch": 0.7523926380368098, + "grad_norm": 0.8892417645237023, + "learning_rate": 3.047224828802532e-06, + "loss": 0.5111, + "step": 6132 + }, + { + "epoch": 0.7525153374233129, + "grad_norm": 0.9512698931000392, + "learning_rate": 3.0443689672595154e-06, + "loss": 0.5467, + "step": 6133 + }, + { + "epoch": 0.752638036809816, + "grad_norm": 0.9025959801915759, + "learning_rate": 3.041514204298619e-06, + "loss": 0.5805, + "step": 6134 + }, + { + "epoch": 0.752760736196319, + "grad_norm": 0.8489119036327046, + "learning_rate": 3.0386605403707347e-06, + "loss": 0.594, + "step": 6135 + }, + { + "epoch": 0.7528834355828221, + "grad_norm": 0.9034006493657629, + 
"learning_rate": 3.035807975926569e-06, + "loss": 0.4745, + "step": 6136 + }, + { + "epoch": 0.7530061349693251, + "grad_norm": 0.9850710561689212, + "learning_rate": 3.0329565114166592e-06, + "loss": 0.5425, + "step": 6137 + }, + { + "epoch": 0.7531288343558282, + "grad_norm": 0.9945091231785493, + "learning_rate": 3.030106147291375e-06, + "loss": 0.5987, + "step": 6138 + }, + { + "epoch": 0.7532515337423313, + "grad_norm": 0.8306463083559636, + "learning_rate": 3.0272568840008996e-06, + "loss": 0.5194, + "step": 6139 + }, + { + "epoch": 0.7533742331288343, + "grad_norm": 0.8857908079799662, + "learning_rate": 3.0244087219952565e-06, + "loss": 0.5202, + "step": 6140 + }, + { + "epoch": 0.7534969325153374, + "grad_norm": 0.8869769514615079, + "learning_rate": 3.0215616617242836e-06, + "loss": 0.5471, + "step": 6141 + }, + { + "epoch": 0.7536196319018404, + "grad_norm": 0.8537650332817519, + "learning_rate": 3.0187157036376548e-06, + "loss": 0.4977, + "step": 6142 + }, + { + "epoch": 0.7537423312883436, + "grad_norm": 0.7604073483412293, + "learning_rate": 3.0158708481848577e-06, + "loss": 0.5434, + "step": 6143 + }, + { + "epoch": 0.7538650306748467, + "grad_norm": 1.0835100828018074, + "learning_rate": 3.0130270958152196e-06, + "loss": 0.5698, + "step": 6144 + }, + { + "epoch": 0.7539877300613497, + "grad_norm": 0.8745230194796783, + "learning_rate": 3.0101844469778797e-06, + "loss": 0.581, + "step": 6145 + }, + { + "epoch": 0.7541104294478528, + "grad_norm": 0.8816796404728799, + "learning_rate": 3.007342902121818e-06, + "loss": 0.5618, + "step": 6146 + }, + { + "epoch": 0.7542331288343558, + "grad_norm": 0.9668510866569915, + "learning_rate": 3.004502461695825e-06, + "loss": 0.6155, + "step": 6147 + }, + { + "epoch": 0.7543558282208589, + "grad_norm": 0.8679398175695923, + "learning_rate": 3.0016631261485296e-06, + "loss": 0.5982, + "step": 6148 + }, + { + "epoch": 0.754478527607362, + "grad_norm": 0.8910233278049393, + "learning_rate": 2.9988248959283784e-06, + "loss": 0.5566, + "step": 6149 + }, + { + "epoch": 0.754601226993865, + "grad_norm": 1.0821268984499373, + "learning_rate": 2.9959877714836406e-06, + "loss": 0.5125, + "step": 6150 + }, + { + "epoch": 0.7547239263803681, + "grad_norm": 0.8833333854184637, + "learning_rate": 2.9931517532624233e-06, + "loss": 0.5408, + "step": 6151 + }, + { + "epoch": 0.7548466257668711, + "grad_norm": 0.9870749878571086, + "learning_rate": 2.990316841712644e-06, + "loss": 0.5724, + "step": 6152 + }, + { + "epoch": 0.7549693251533742, + "grad_norm": 0.9214905451741058, + "learning_rate": 2.9874830372820607e-06, + "loss": 0.5453, + "step": 6153 + }, + { + "epoch": 0.7550920245398773, + "grad_norm": 0.8827833628546871, + "learning_rate": 2.984650340418236e-06, + "loss": 0.5596, + "step": 6154 + }, + { + "epoch": 0.7552147239263803, + "grad_norm": 0.8114593537614493, + "learning_rate": 2.981818751568586e-06, + "loss": 0.5853, + "step": 6155 + }, + { + "epoch": 0.7553374233128834, + "grad_norm": 0.9898900362871053, + "learning_rate": 2.9789882711803262e-06, + "loss": 0.5528, + "step": 6156 + }, + { + "epoch": 0.7554601226993864, + "grad_norm": 0.7943212857424583, + "learning_rate": 2.9761588997005065e-06, + "loss": 0.5224, + "step": 6157 + }, + { + "epoch": 0.7555828220858896, + "grad_norm": 0.8450616394334264, + "learning_rate": 2.973330637576006e-06, + "loss": 0.5692, + "step": 6158 + }, + { + "epoch": 0.7557055214723927, + "grad_norm": 1.0177367626553937, + "learning_rate": 2.9705034852535187e-06, + "loss": 0.5944, + "step": 6159 + }, + { + 
"epoch": 0.7558282208588957, + "grad_norm": 1.0055255485064258, + "learning_rate": 2.9676774431795752e-06, + "loss": 0.5516, + "step": 6160 + }, + { + "epoch": 0.7559509202453988, + "grad_norm": 0.8603164653538137, + "learning_rate": 2.964852511800519e-06, + "loss": 0.545, + "step": 6161 + }, + { + "epoch": 0.7560736196319019, + "grad_norm": 0.9250559446010559, + "learning_rate": 2.9620286915625296e-06, + "loss": 0.5674, + "step": 6162 + }, + { + "epoch": 0.7561963190184049, + "grad_norm": 0.966376546736886, + "learning_rate": 2.959205982911599e-06, + "loss": 0.5942, + "step": 6163 + }, + { + "epoch": 0.756319018404908, + "grad_norm": 1.01072040791173, + "learning_rate": 2.956384386293555e-06, + "loss": 0.4711, + "step": 6164 + }, + { + "epoch": 0.756441717791411, + "grad_norm": 0.7849538293885328, + "learning_rate": 2.95356390215404e-06, + "loss": 0.4595, + "step": 6165 + }, + { + "epoch": 0.7565644171779141, + "grad_norm": 0.9488100509378573, + "learning_rate": 2.95074453093853e-06, + "loss": 0.5566, + "step": 6166 + }, + { + "epoch": 0.7566871165644172, + "grad_norm": 0.7988669243806954, + "learning_rate": 2.9479262730923165e-06, + "loss": 0.5278, + "step": 6167 + }, + { + "epoch": 0.7568098159509202, + "grad_norm": 0.8091928259185477, + "learning_rate": 2.945109129060519e-06, + "loss": 0.5828, + "step": 6168 + }, + { + "epoch": 0.7569325153374233, + "grad_norm": 0.8775568362982188, + "learning_rate": 2.942293099288085e-06, + "loss": 0.5845, + "step": 6169 + }, + { + "epoch": 0.7570552147239263, + "grad_norm": 0.8093250923643728, + "learning_rate": 2.939478184219777e-06, + "loss": 0.5104, + "step": 6170 + }, + { + "epoch": 0.7571779141104295, + "grad_norm": 0.885537238669747, + "learning_rate": 2.936664384300192e-06, + "loss": 0.5984, + "step": 6171 + }, + { + "epoch": 0.7573006134969326, + "grad_norm": 0.8669818390303639, + "learning_rate": 2.933851699973741e-06, + "loss": 0.5503, + "step": 6172 + }, + { + "epoch": 0.7574233128834356, + "grad_norm": 0.8360586197061124, + "learning_rate": 2.931040131684667e-06, + "loss": 0.5093, + "step": 6173 + }, + { + "epoch": 0.7575460122699387, + "grad_norm": 0.8539864568023647, + "learning_rate": 2.9282296798770293e-06, + "loss": 0.536, + "step": 6174 + }, + { + "epoch": 0.7576687116564417, + "grad_norm": 0.8803972867686285, + "learning_rate": 2.9254203449947196e-06, + "loss": 0.5047, + "step": 6175 + }, + { + "epoch": 0.7577914110429448, + "grad_norm": 0.8563947069556673, + "learning_rate": 2.922612127481441e-06, + "loss": 0.5541, + "step": 6176 + }, + { + "epoch": 0.7579141104294479, + "grad_norm": 0.8167210153430196, + "learning_rate": 2.9198050277807354e-06, + "loss": 0.5463, + "step": 6177 + }, + { + "epoch": 0.7580368098159509, + "grad_norm": 0.9328020392947657, + "learning_rate": 2.9169990463359556e-06, + "loss": 0.551, + "step": 6178 + }, + { + "epoch": 0.758159509202454, + "grad_norm": 0.8710621234948446, + "learning_rate": 2.9141941835902796e-06, + "loss": 0.5678, + "step": 6179 + }, + { + "epoch": 0.758282208588957, + "grad_norm": 0.9480386646942809, + "learning_rate": 2.9113904399867188e-06, + "loss": 0.5029, + "step": 6180 + }, + { + "epoch": 0.7584049079754601, + "grad_norm": 0.8834465140883921, + "learning_rate": 2.908587815968089e-06, + "loss": 0.5443, + "step": 6181 + }, + { + "epoch": 0.7585276073619632, + "grad_norm": 0.791280258745228, + "learning_rate": 2.905786311977055e-06, + "loss": 0.5499, + "step": 6182 + }, + { + "epoch": 0.7586503067484662, + "grad_norm": 0.8258533631690017, + "learning_rate": 2.902985928456079e-06, + 
"loss": 0.572, + "step": 6183 + }, + { + "epoch": 0.7587730061349693, + "grad_norm": 0.8705157232861185, + "learning_rate": 2.900186665847464e-06, + "loss": 0.6032, + "step": 6184 + }, + { + "epoch": 0.7588957055214723, + "grad_norm": 0.714961675564289, + "learning_rate": 2.8973885245933287e-06, + "loss": 0.5259, + "step": 6185 + }, + { + "epoch": 0.7590184049079755, + "grad_norm": 1.0124826857011802, + "learning_rate": 2.894591505135609e-06, + "loss": 0.5933, + "step": 6186 + }, + { + "epoch": 0.7591411042944786, + "grad_norm": 0.7845493374245917, + "learning_rate": 2.891795607916079e-06, + "loss": 0.6047, + "step": 6187 + }, + { + "epoch": 0.7592638036809816, + "grad_norm": 0.8616527426069122, + "learning_rate": 2.8890008333763187e-06, + "loss": 0.5663, + "step": 6188 + }, + { + "epoch": 0.7593865030674847, + "grad_norm": 0.942465006447271, + "learning_rate": 2.8862071819577453e-06, + "loss": 0.5917, + "step": 6189 + }, + { + "epoch": 0.7595092024539877, + "grad_norm": 0.8931253832510038, + "learning_rate": 2.8834146541015874e-06, + "loss": 0.5737, + "step": 6190 + }, + { + "epoch": 0.7596319018404908, + "grad_norm": 0.8315861456581892, + "learning_rate": 2.880623250248904e-06, + "loss": 0.5548, + "step": 6191 + }, + { + "epoch": 0.7597546012269939, + "grad_norm": 0.844326893973884, + "learning_rate": 2.8778329708405694e-06, + "loss": 0.5646, + "step": 6192 + }, + { + "epoch": 0.7598773006134969, + "grad_norm": 0.8229970105957892, + "learning_rate": 2.87504381631729e-06, + "loss": 0.5359, + "step": 6193 + }, + { + "epoch": 0.76, + "grad_norm": 0.8789469367228939, + "learning_rate": 2.8722557871195822e-06, + "loss": 0.549, + "step": 6194 + }, + { + "epoch": 0.7601226993865031, + "grad_norm": 0.8956903355120999, + "learning_rate": 2.869468883687798e-06, + "loss": 0.599, + "step": 6195 + }, + { + "epoch": 0.7602453987730061, + "grad_norm": 0.8849100992271762, + "learning_rate": 2.8666831064620983e-06, + "loss": 0.5288, + "step": 6196 + }, + { + "epoch": 0.7603680981595092, + "grad_norm": 0.9021275525686351, + "learning_rate": 2.8638984558824777e-06, + "loss": 0.5532, + "step": 6197 + }, + { + "epoch": 0.7604907975460122, + "grad_norm": 0.7894436707260927, + "learning_rate": 2.8611149323887466e-06, + "loss": 0.4654, + "step": 6198 + }, + { + "epoch": 0.7606134969325153, + "grad_norm": 0.884662279774404, + "learning_rate": 2.858332536420535e-06, + "loss": 0.5495, + "step": 6199 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 0.825795061255919, + "learning_rate": 2.855551268417305e-06, + "loss": 0.5911, + "step": 6200 + }, + { + "epoch": 0.7608588957055215, + "grad_norm": 0.9655225696590644, + "learning_rate": 2.8527711288183256e-06, + "loss": 0.5201, + "step": 6201 + }, + { + "epoch": 0.7609815950920246, + "grad_norm": 0.9454138122836264, + "learning_rate": 2.8499921180627044e-06, + "loss": 0.5491, + "step": 6202 + }, + { + "epoch": 0.7611042944785276, + "grad_norm": 0.9243478973762776, + "learning_rate": 2.847214236589356e-06, + "loss": 0.5522, + "step": 6203 + }, + { + "epoch": 0.7612269938650307, + "grad_norm": 0.9098546110548985, + "learning_rate": 2.8444374848370272e-06, + "loss": 0.5371, + "step": 6204 + }, + { + "epoch": 0.7613496932515338, + "grad_norm": 0.8766459102837334, + "learning_rate": 2.8416618632442785e-06, + "loss": 0.5777, + "step": 6205 + }, + { + "epoch": 0.7614723926380368, + "grad_norm": 0.9086889788578605, + "learning_rate": 2.838887372249499e-06, + "loss": 0.5523, + "step": 6206 + }, + { + "epoch": 0.7615950920245399, + "grad_norm": 0.8924093595741234, + 
"learning_rate": 2.8361140122908913e-06, + "loss": 0.6018, + "step": 6207 + }, + { + "epoch": 0.7617177914110429, + "grad_norm": 0.8852860176796001, + "learning_rate": 2.8333417838064902e-06, + "loss": 0.5914, + "step": 6208 + }, + { + "epoch": 0.761840490797546, + "grad_norm": 0.8576784029503802, + "learning_rate": 2.830570687234138e-06, + "loss": 0.5599, + "step": 6209 + }, + { + "epoch": 0.7619631901840491, + "grad_norm": 0.9370912222504394, + "learning_rate": 2.827800723011508e-06, + "loss": 0.6115, + "step": 6210 + }, + { + "epoch": 0.7620858895705521, + "grad_norm": 0.9266420234247684, + "learning_rate": 2.825031891576098e-06, + "loss": 0.5371, + "step": 6211 + }, + { + "epoch": 0.7622085889570552, + "grad_norm": 0.7862943138373307, + "learning_rate": 2.822264193365212e-06, + "loss": 0.5792, + "step": 6212 + }, + { + "epoch": 0.7623312883435582, + "grad_norm": 0.8705316253999754, + "learning_rate": 2.8194976288159927e-06, + "loss": 0.5185, + "step": 6213 + }, + { + "epoch": 0.7624539877300613, + "grad_norm": 0.9338034873532529, + "learning_rate": 2.8167321983653873e-06, + "loss": 0.5421, + "step": 6214 + }, + { + "epoch": 0.7625766871165645, + "grad_norm": 0.9048840690064047, + "learning_rate": 2.813967902450179e-06, + "loss": 0.5561, + "step": 6215 + }, + { + "epoch": 0.7626993865030675, + "grad_norm": 0.807241027268681, + "learning_rate": 2.811204741506961e-06, + "loss": 0.5749, + "step": 6216 + }, + { + "epoch": 0.7628220858895706, + "grad_norm": 0.9043404467710682, + "learning_rate": 2.808442715972147e-06, + "loss": 0.5779, + "step": 6217 + }, + { + "epoch": 0.7629447852760736, + "grad_norm": 0.8850948474848894, + "learning_rate": 2.805681826281983e-06, + "loss": 0.6003, + "step": 6218 + }, + { + "epoch": 0.7630674846625767, + "grad_norm": 0.8274400976562802, + "learning_rate": 2.8029220728725213e-06, + "loss": 0.5059, + "step": 6219 + }, + { + "epoch": 0.7631901840490798, + "grad_norm": 0.9270591019245973, + "learning_rate": 2.8001634561796463e-06, + "loss": 0.5737, + "step": 6220 + }, + { + "epoch": 0.7633128834355828, + "grad_norm": 0.9009857565746803, + "learning_rate": 2.7974059766390527e-06, + "loss": 0.5584, + "step": 6221 + }, + { + "epoch": 0.7634355828220859, + "grad_norm": 0.9301146811992831, + "learning_rate": 2.7946496346862662e-06, + "loss": 0.4997, + "step": 6222 + }, + { + "epoch": 0.763558282208589, + "grad_norm": 0.8267695496097874, + "learning_rate": 2.791894430756621e-06, + "loss": 0.4888, + "step": 6223 + }, + { + "epoch": 0.763680981595092, + "grad_norm": 0.8635103196273879, + "learning_rate": 2.7891403652852844e-06, + "loss": 0.5482, + "step": 6224 + }, + { + "epoch": 0.7638036809815951, + "grad_norm": 0.9146250305942519, + "learning_rate": 2.786387438707231e-06, + "loss": 0.581, + "step": 6225 + }, + { + "epoch": 0.7639263803680981, + "grad_norm": 0.8304391602085948, + "learning_rate": 2.7836356514572684e-06, + "loss": 0.5942, + "step": 6226 + }, + { + "epoch": 0.7640490797546012, + "grad_norm": 0.7674915349555599, + "learning_rate": 2.780885003970012e-06, + "loss": 0.5874, + "step": 6227 + }, + { + "epoch": 0.7641717791411043, + "grad_norm": 0.8825813098375718, + "learning_rate": 2.778135496679908e-06, + "loss": 0.5, + "step": 6228 + }, + { + "epoch": 0.7642944785276073, + "grad_norm": 0.8565612660214982, + "learning_rate": 2.775387130021214e-06, + "loss": 0.5218, + "step": 6229 + }, + { + "epoch": 0.7644171779141105, + "grad_norm": 0.966086214719309, + "learning_rate": 2.7726399044280107e-06, + "loss": 0.5531, + "step": 6230 + }, + { + "epoch": 
0.7645398773006135, + "grad_norm": 0.8851585376988486, + "learning_rate": 2.769893820334202e-06, + "loss": 0.5211, + "step": 6231 + }, + { + "epoch": 0.7646625766871166, + "grad_norm": 0.8974546895627689, + "learning_rate": 2.767148878173502e-06, + "loss": 0.5242, + "step": 6232 + }, + { + "epoch": 0.7647852760736197, + "grad_norm": 0.9466377433257902, + "learning_rate": 2.7644050783794586e-06, + "loss": 0.5305, + "step": 6233 + }, + { + "epoch": 0.7649079754601227, + "grad_norm": 0.8454914172469348, + "learning_rate": 2.7616624213854247e-06, + "loss": 0.5653, + "step": 6234 + }, + { + "epoch": 0.7650306748466258, + "grad_norm": 0.8739560694909212, + "learning_rate": 2.758920907624585e-06, + "loss": 0.5583, + "step": 6235 + }, + { + "epoch": 0.7651533742331288, + "grad_norm": 0.8387545319637697, + "learning_rate": 2.756180537529932e-06, + "loss": 0.5379, + "step": 6236 + }, + { + "epoch": 0.7652760736196319, + "grad_norm": 0.8633596055050818, + "learning_rate": 2.7534413115342874e-06, + "loss": 0.5114, + "step": 6237 + }, + { + "epoch": 0.765398773006135, + "grad_norm": 0.7607611092006873, + "learning_rate": 2.750703230070291e-06, + "loss": 0.5573, + "step": 6238 + }, + { + "epoch": 0.765521472392638, + "grad_norm": 0.8129790013859539, + "learning_rate": 2.747966293570392e-06, + "loss": 0.5409, + "step": 6239 + }, + { + "epoch": 0.7656441717791411, + "grad_norm": 0.880935355351362, + "learning_rate": 2.7452305024668747e-06, + "loss": 0.547, + "step": 6240 + }, + { + "epoch": 0.7657668711656441, + "grad_norm": 0.8649086099850031, + "learning_rate": 2.7424958571918247e-06, + "loss": 0.5429, + "step": 6241 + }, + { + "epoch": 0.7658895705521472, + "grad_norm": 0.9476238826431106, + "learning_rate": 2.7397623581771638e-06, + "loss": 0.5666, + "step": 6242 + }, + { + "epoch": 0.7660122699386503, + "grad_norm": 0.956707089649415, + "learning_rate": 2.7370300058546182e-06, + "loss": 0.5546, + "step": 6243 + }, + { + "epoch": 0.7661349693251533, + "grad_norm": 0.9478542309103092, + "learning_rate": 2.734298800655746e-06, + "loss": 0.5792, + "step": 6244 + }, + { + "epoch": 0.7662576687116565, + "grad_norm": 0.9729344886635936, + "learning_rate": 2.7315687430119097e-06, + "loss": 0.4812, + "step": 6245 + }, + { + "epoch": 0.7663803680981595, + "grad_norm": 0.9705776613762003, + "learning_rate": 2.7288398333543063e-06, + "loss": 0.5596, + "step": 6246 + }, + { + "epoch": 0.7665030674846626, + "grad_norm": 0.8730638062676479, + "learning_rate": 2.72611207211394e-06, + "loss": 0.5404, + "step": 6247 + }, + { + "epoch": 0.7666257668711657, + "grad_norm": 1.0300226043197995, + "learning_rate": 2.7233854597216335e-06, + "loss": 0.5359, + "step": 6248 + }, + { + "epoch": 0.7667484662576687, + "grad_norm": 0.9061202446509312, + "learning_rate": 2.7206599966080394e-06, + "loss": 0.4511, + "step": 6249 + }, + { + "epoch": 0.7668711656441718, + "grad_norm": 0.9052242628288175, + "learning_rate": 2.7179356832036142e-06, + "loss": 0.5253, + "step": 6250 + }, + { + "epoch": 0.7669938650306748, + "grad_norm": 0.9678321211082401, + "learning_rate": 2.715212519938646e-06, + "loss": 0.5171, + "step": 6251 + }, + { + "epoch": 0.7671165644171779, + "grad_norm": 0.8470445490598517, + "learning_rate": 2.7124905072432275e-06, + "loss": 0.5608, + "step": 6252 + }, + { + "epoch": 0.767239263803681, + "grad_norm": 0.836385920505488, + "learning_rate": 2.7097696455472855e-06, + "loss": 0.5356, + "step": 6253 + }, + { + "epoch": 0.767361963190184, + "grad_norm": 0.8557862018633525, + "learning_rate": 2.70704993528055e-06, + 
"loss": 0.5755, + "step": 6254 + }, + { + "epoch": 0.7674846625766871, + "grad_norm": 0.9958832342467033, + "learning_rate": 2.704331376872581e-06, + "loss": 0.5527, + "step": 6255 + }, + { + "epoch": 0.7676073619631902, + "grad_norm": 0.88011704461052, + "learning_rate": 2.7016139707527465e-06, + "loss": 0.5577, + "step": 6256 + }, + { + "epoch": 0.7677300613496932, + "grad_norm": 1.3367916185042186, + "learning_rate": 2.698897717350243e-06, + "loss": 0.606, + "step": 6257 + }, + { + "epoch": 0.7678527607361963, + "grad_norm": 0.8592470117794108, + "learning_rate": 2.696182617094076e-06, + "loss": 0.5551, + "step": 6258 + }, + { + "epoch": 0.7679754601226993, + "grad_norm": 0.9092612108260262, + "learning_rate": 2.6934686704130698e-06, + "loss": 0.559, + "step": 6259 + }, + { + "epoch": 0.7680981595092025, + "grad_norm": 0.9039128641171968, + "learning_rate": 2.6907558777358756e-06, + "loss": 0.5484, + "step": 6260 + }, + { + "epoch": 0.7682208588957056, + "grad_norm": 0.9268428319213802, + "learning_rate": 2.688044239490948e-06, + "loss": 0.544, + "step": 6261 + }, + { + "epoch": 0.7683435582822086, + "grad_norm": 0.8872891742450851, + "learning_rate": 2.685333756106573e-06, + "loss": 0.55, + "step": 6262 + }, + { + "epoch": 0.7684662576687117, + "grad_norm": 0.9321645715596395, + "learning_rate": 2.6826244280108438e-06, + "loss": 0.5743, + "step": 6263 + }, + { + "epoch": 0.7685889570552147, + "grad_norm": 0.8394281359337723, + "learning_rate": 2.6799162556316784e-06, + "loss": 0.5353, + "step": 6264 + }, + { + "epoch": 0.7687116564417178, + "grad_norm": 1.0866524391632277, + "learning_rate": 2.677209239396811e-06, + "loss": 0.5563, + "step": 6265 + }, + { + "epoch": 0.7688343558282209, + "grad_norm": 0.8722661623377856, + "learning_rate": 2.674503379733785e-06, + "loss": 0.5461, + "step": 6266 + }, + { + "epoch": 0.7689570552147239, + "grad_norm": 0.8988149462222237, + "learning_rate": 2.6717986770699757e-06, + "loss": 0.5904, + "step": 6267 + }, + { + "epoch": 0.769079754601227, + "grad_norm": 0.8410668787767036, + "learning_rate": 2.6690951318325608e-06, + "loss": 0.5172, + "step": 6268 + }, + { + "epoch": 0.76920245398773, + "grad_norm": 0.8400760184519279, + "learning_rate": 2.6663927444485484e-06, + "loss": 0.5301, + "step": 6269 + }, + { + "epoch": 0.7693251533742331, + "grad_norm": 0.782760415781665, + "learning_rate": 2.6636915153447494e-06, + "loss": 0.5786, + "step": 6270 + }, + { + "epoch": 0.7694478527607362, + "grad_norm": 0.8842969316022133, + "learning_rate": 2.660991444947808e-06, + "loss": 0.5209, + "step": 6271 + }, + { + "epoch": 0.7695705521472392, + "grad_norm": 0.9002087380404971, + "learning_rate": 2.6582925336841705e-06, + "loss": 0.5803, + "step": 6272 + }, + { + "epoch": 0.7696932515337424, + "grad_norm": 0.7992580941411249, + "learning_rate": 2.6555947819801133e-06, + "loss": 0.5479, + "step": 6273 + }, + { + "epoch": 0.7698159509202454, + "grad_norm": 0.9289055214095501, + "learning_rate": 2.6528981902617146e-06, + "loss": 0.5173, + "step": 6274 + }, + { + "epoch": 0.7699386503067485, + "grad_norm": 0.9223080645286637, + "learning_rate": 2.650202758954886e-06, + "loss": 0.5825, + "step": 6275 + }, + { + "epoch": 0.7700613496932516, + "grad_norm": 0.8892998448627754, + "learning_rate": 2.6475084884853418e-06, + "loss": 0.5058, + "step": 6276 + }, + { + "epoch": 0.7701840490797546, + "grad_norm": 0.881472423941979, + "learning_rate": 2.6448153792786248e-06, + "loss": 0.5831, + "step": 6277 + }, + { + "epoch": 0.7703067484662577, + "grad_norm": 
0.8551217618633913, + "learning_rate": 2.6421234317600842e-06, + "loss": 0.5165, + "step": 6278 + }, + { + "epoch": 0.7704294478527607, + "grad_norm": 0.824878042023435, + "learning_rate": 2.639432646354888e-06, + "loss": 0.583, + "step": 6279 + }, + { + "epoch": 0.7705521472392638, + "grad_norm": 0.848939657294153, + "learning_rate": 2.6367430234880286e-06, + "loss": 0.4404, + "step": 6280 + }, + { + "epoch": 0.7706748466257669, + "grad_norm": 0.8684829057667617, + "learning_rate": 2.6340545635843027e-06, + "loss": 0.5403, + "step": 6281 + }, + { + "epoch": 0.7707975460122699, + "grad_norm": 0.8982607844362396, + "learning_rate": 2.6313672670683355e-06, + "loss": 0.5502, + "step": 6282 + }, + { + "epoch": 0.770920245398773, + "grad_norm": 0.8399246348854323, + "learning_rate": 2.6286811343645557e-06, + "loss": 0.5479, + "step": 6283 + }, + { + "epoch": 0.771042944785276, + "grad_norm": 0.8511269734261426, + "learning_rate": 2.625996165897222e-06, + "loss": 0.5593, + "step": 6284 + }, + { + "epoch": 0.7711656441717791, + "grad_norm": 0.8700476478217616, + "learning_rate": 2.6233123620903946e-06, + "loss": 0.5598, + "step": 6285 + }, + { + "epoch": 0.7712883435582822, + "grad_norm": 0.7998354633135235, + "learning_rate": 2.6206297233679645e-06, + "loss": 0.5627, + "step": 6286 + }, + { + "epoch": 0.7714110429447852, + "grad_norm": 0.9644607303287098, + "learning_rate": 2.617948250153626e-06, + "loss": 0.5339, + "step": 6287 + }, + { + "epoch": 0.7715337423312884, + "grad_norm": 0.9059511083392224, + "learning_rate": 2.6152679428708983e-06, + "loss": 0.5542, + "step": 6288 + }, + { + "epoch": 0.7716564417177915, + "grad_norm": 0.9620882790487847, + "learning_rate": 2.6125888019431124e-06, + "loss": 0.6055, + "step": 6289 + }, + { + "epoch": 0.7717791411042945, + "grad_norm": 0.8289200961075109, + "learning_rate": 2.6099108277934105e-06, + "loss": 0.5441, + "step": 6290 + }, + { + "epoch": 0.7719018404907976, + "grad_norm": 1.2136754231581137, + "learning_rate": 2.60723402084476e-06, + "loss": 0.5802, + "step": 6291 + }, + { + "epoch": 0.7720245398773006, + "grad_norm": 0.8799947775197976, + "learning_rate": 2.6045583815199394e-06, + "loss": 0.5731, + "step": 6292 + }, + { + "epoch": 0.7721472392638037, + "grad_norm": 0.8024320073327512, + "learning_rate": 2.601883910241546e-06, + "loss": 0.5652, + "step": 6293 + }, + { + "epoch": 0.7722699386503068, + "grad_norm": 1.0997237250008227, + "learning_rate": 2.5992106074319835e-06, + "loss": 0.5948, + "step": 6294 + }, + { + "epoch": 0.7723926380368098, + "grad_norm": 0.9904088751637293, + "learning_rate": 2.5965384735134825e-06, + "loss": 0.5386, + "step": 6295 + }, + { + "epoch": 0.7725153374233129, + "grad_norm": 1.0344802729996927, + "learning_rate": 2.59386750890808e-06, + "loss": 0.5838, + "step": 6296 + }, + { + "epoch": 0.7726380368098159, + "grad_norm": 0.9442689441861124, + "learning_rate": 2.591197714037631e-06, + "loss": 0.4948, + "step": 6297 + }, + { + "epoch": 0.772760736196319, + "grad_norm": 0.8469573590790231, + "learning_rate": 2.588529089323811e-06, + "loss": 0.5734, + "step": 6298 + }, + { + "epoch": 0.7728834355828221, + "grad_norm": 0.8841935057074073, + "learning_rate": 2.5858616351881006e-06, + "loss": 0.5872, + "step": 6299 + }, + { + "epoch": 0.7730061349693251, + "grad_norm": 0.8785042011140887, + "learning_rate": 2.583195352051808e-06, + "loss": 0.5698, + "step": 6300 + }, + { + "epoch": 0.7731288343558282, + "grad_norm": 0.8504990680215293, + "learning_rate": 2.580530240336043e-06, + "loss": 0.538, + "step": 6301 + }, 
+ { + "epoch": 0.7732515337423312, + "grad_norm": 0.9242853579565328, + "learning_rate": 2.577866300461743e-06, + "loss": 0.6153, + "step": 6302 + }, + { + "epoch": 0.7733742331288344, + "grad_norm": 1.0166741942875397, + "learning_rate": 2.575203532849648e-06, + "loss": 0.5527, + "step": 6303 + }, + { + "epoch": 0.7734969325153375, + "grad_norm": 0.9206963934617466, + "learning_rate": 2.5725419379203275e-06, + "loss": 0.5366, + "step": 6304 + }, + { + "epoch": 0.7736196319018405, + "grad_norm": 0.8826099363054313, + "learning_rate": 2.5698815160941494e-06, + "loss": 0.53, + "step": 6305 + }, + { + "epoch": 0.7737423312883436, + "grad_norm": 0.9218696284294353, + "learning_rate": 2.5672222677913106e-06, + "loss": 0.5643, + "step": 6306 + }, + { + "epoch": 0.7738650306748466, + "grad_norm": 0.7336992481924712, + "learning_rate": 2.564564193431813e-06, + "loss": 0.5357, + "step": 6307 + }, + { + "epoch": 0.7739877300613497, + "grad_norm": 0.9453346584928816, + "learning_rate": 2.5619072934354762e-06, + "loss": 0.5088, + "step": 6308 + }, + { + "epoch": 0.7741104294478528, + "grad_norm": 1.0698648744035801, + "learning_rate": 2.5592515682219386e-06, + "loss": 0.5492, + "step": 6309 + }, + { + "epoch": 0.7742331288343558, + "grad_norm": 0.8864487787879501, + "learning_rate": 2.5565970182106425e-06, + "loss": 0.5088, + "step": 6310 + }, + { + "epoch": 0.7743558282208589, + "grad_norm": 0.8631944471417621, + "learning_rate": 2.55394364382086e-06, + "loss": 0.5391, + "step": 6311 + }, + { + "epoch": 0.7744785276073619, + "grad_norm": 0.8976315424001405, + "learning_rate": 2.55129144547166e-06, + "loss": 0.5844, + "step": 6312 + }, + { + "epoch": 0.774601226993865, + "grad_norm": 1.2522347318036142, + "learning_rate": 2.548640423581942e-06, + "loss": 0.5582, + "step": 6313 + }, + { + "epoch": 0.7747239263803681, + "grad_norm": 0.9394670125035529, + "learning_rate": 2.545990578570404e-06, + "loss": 0.5155, + "step": 6314 + }, + { + "epoch": 0.7748466257668711, + "grad_norm": 0.8699558735089595, + "learning_rate": 2.5433419108555758e-06, + "loss": 0.5144, + "step": 6315 + }, + { + "epoch": 0.7749693251533742, + "grad_norm": 0.9272092282597066, + "learning_rate": 2.5406944208557825e-06, + "loss": 0.5591, + "step": 6316 + }, + { + "epoch": 0.7750920245398774, + "grad_norm": 0.9128680824391777, + "learning_rate": 2.5380481089891804e-06, + "loss": 0.4931, + "step": 6317 + }, + { + "epoch": 0.7752147239263804, + "grad_norm": 0.9230581022621793, + "learning_rate": 2.535402975673725e-06, + "loss": 0.5403, + "step": 6318 + }, + { + "epoch": 0.7753374233128835, + "grad_norm": 0.9824761357114197, + "learning_rate": 2.5327590213271957e-06, + "loss": 0.5262, + "step": 6319 + }, + { + "epoch": 0.7754601226993865, + "grad_norm": 1.0236038310892581, + "learning_rate": 2.5301162463671845e-06, + "loss": 0.5634, + "step": 6320 + }, + { + "epoch": 0.7755828220858896, + "grad_norm": 0.8733987380032039, + "learning_rate": 2.527474651211089e-06, + "loss": 0.6057, + "step": 6321 + }, + { + "epoch": 0.7757055214723927, + "grad_norm": 0.8754256810106041, + "learning_rate": 2.5248342362761336e-06, + "loss": 0.4748, + "step": 6322 + }, + { + "epoch": 0.7758282208588957, + "grad_norm": 0.7686941533851631, + "learning_rate": 2.522195001979343e-06, + "loss": 0.4121, + "step": 6323 + }, + { + "epoch": 0.7759509202453988, + "grad_norm": 0.8895566823034029, + "learning_rate": 2.519556948737566e-06, + "loss": 0.5694, + "step": 6324 + }, + { + "epoch": 0.7760736196319018, + "grad_norm": 0.9995082565547646, + "learning_rate": 
2.516920076967455e-06, + "loss": 0.5499, + "step": 6325 + }, + { + "epoch": 0.7761963190184049, + "grad_norm": 0.9099586488101986, + "learning_rate": 2.514284387085488e-06, + "loss": 0.6148, + "step": 6326 + }, + { + "epoch": 0.776319018404908, + "grad_norm": 0.9290225834656006, + "learning_rate": 2.511649879507947e-06, + "loss": 0.5103, + "step": 6327 + }, + { + "epoch": 0.776441717791411, + "grad_norm": 0.8304254994103386, + "learning_rate": 2.509016554650926e-06, + "loss": 0.4612, + "step": 6328 + }, + { + "epoch": 0.7765644171779141, + "grad_norm": 0.7830329081528479, + "learning_rate": 2.5063844129303416e-06, + "loss": 0.5815, + "step": 6329 + }, + { + "epoch": 0.7766871165644171, + "grad_norm": 0.8237797257541956, + "learning_rate": 2.5037534547619125e-06, + "loss": 0.6043, + "step": 6330 + }, + { + "epoch": 0.7768098159509202, + "grad_norm": 0.9320395452382325, + "learning_rate": 2.5011236805611818e-06, + "loss": 0.5349, + "step": 6331 + }, + { + "epoch": 0.7769325153374234, + "grad_norm": 0.8617048114276779, + "learning_rate": 2.4984950907434934e-06, + "loss": 0.599, + "step": 6332 + }, + { + "epoch": 0.7770552147239264, + "grad_norm": 0.8124295623752148, + "learning_rate": 2.4958676857240184e-06, + "loss": 0.5882, + "step": 6333 + }, + { + "epoch": 0.7771779141104295, + "grad_norm": 1.0538019197368373, + "learning_rate": 2.493241465917724e-06, + "loss": 0.5705, + "step": 6334 + }, + { + "epoch": 0.7773006134969325, + "grad_norm": 0.9945534787587729, + "learning_rate": 2.4906164317394067e-06, + "loss": 0.5542, + "step": 6335 + }, + { + "epoch": 0.7774233128834356, + "grad_norm": 0.8329085500773072, + "learning_rate": 2.487992583603661e-06, + "loss": 0.4843, + "step": 6336 + }, + { + "epoch": 0.7775460122699387, + "grad_norm": 0.9789713482391719, + "learning_rate": 2.4853699219249083e-06, + "loss": 0.538, + "step": 6337 + }, + { + "epoch": 0.7776687116564417, + "grad_norm": 0.9335231286556507, + "learning_rate": 2.4827484471173726e-06, + "loss": 0.5516, + "step": 6338 + }, + { + "epoch": 0.7777914110429448, + "grad_norm": 0.8826297495561642, + "learning_rate": 2.480128159595089e-06, + "loss": 0.5625, + "step": 6339 + }, + { + "epoch": 0.7779141104294478, + "grad_norm": 0.7869725513658761, + "learning_rate": 2.4775090597719163e-06, + "loss": 0.5203, + "step": 6340 + }, + { + "epoch": 0.7780368098159509, + "grad_norm": 0.8784850522781141, + "learning_rate": 2.4748911480615135e-06, + "loss": 0.5606, + "step": 6341 + }, + { + "epoch": 0.778159509202454, + "grad_norm": 0.8213292748959642, + "learning_rate": 2.472274424877361e-06, + "loss": 0.5277, + "step": 6342 + }, + { + "epoch": 0.778282208588957, + "grad_norm": 0.8706725407934411, + "learning_rate": 2.4696588906327433e-06, + "loss": 0.5724, + "step": 6343 + }, + { + "epoch": 0.7784049079754601, + "grad_norm": 0.9767407682347026, + "learning_rate": 2.467044545740769e-06, + "loss": 0.5908, + "step": 6344 + }, + { + "epoch": 0.7785276073619631, + "grad_norm": 0.892705755851941, + "learning_rate": 2.4644313906143414e-06, + "loss": 0.5792, + "step": 6345 + }, + { + "epoch": 0.7786503067484662, + "grad_norm": 0.9067993369287317, + "learning_rate": 2.461819425666193e-06, + "loss": 0.4784, + "step": 6346 + }, + { + "epoch": 0.7787730061349694, + "grad_norm": 0.9438289961320364, + "learning_rate": 2.459208651308862e-06, + "loss": 0.5806, + "step": 6347 + }, + { + "epoch": 0.7788957055214724, + "grad_norm": 0.7706668214551738, + "learning_rate": 2.4565990679546913e-06, + "loss": 0.5272, + "step": 6348 + }, + { + "epoch": 0.7790184049079755, + 
"grad_norm": 0.908296659719137, + "learning_rate": 2.4539906760158495e-06, + "loss": 0.5858, + "step": 6349 + }, + { + "epoch": 0.7791411042944786, + "grad_norm": 0.8472467356779886, + "learning_rate": 2.451383475904304e-06, + "loss": 0.4585, + "step": 6350 + }, + { + "epoch": 0.7792638036809816, + "grad_norm": 0.9316698255449214, + "learning_rate": 2.4487774680318444e-06, + "loss": 0.5269, + "step": 6351 + }, + { + "epoch": 0.7793865030674847, + "grad_norm": 0.9923952455874712, + "learning_rate": 2.4461726528100615e-06, + "loss": 0.6054, + "step": 6352 + }, + { + "epoch": 0.7795092024539877, + "grad_norm": 0.9813744254444092, + "learning_rate": 2.44356903065037e-06, + "loss": 0.5082, + "step": 6353 + }, + { + "epoch": 0.7796319018404908, + "grad_norm": 0.8948150534786463, + "learning_rate": 2.440966601963983e-06, + "loss": 0.5853, + "step": 6354 + }, + { + "epoch": 0.7797546012269939, + "grad_norm": 0.9361300882756853, + "learning_rate": 2.438365367161939e-06, + "loss": 0.5758, + "step": 6355 + }, + { + "epoch": 0.7798773006134969, + "grad_norm": 0.9470270061326957, + "learning_rate": 2.435765326655073e-06, + "loss": 0.551, + "step": 6356 + }, + { + "epoch": 0.78, + "grad_norm": 0.8818785347635615, + "learning_rate": 2.4331664808540458e-06, + "loss": 0.5301, + "step": 6357 + }, + { + "epoch": 0.780122699386503, + "grad_norm": 0.949288735710694, + "learning_rate": 2.4305688301693196e-06, + "loss": 0.5987, + "step": 6358 + }, + { + "epoch": 0.7802453987730061, + "grad_norm": 1.0499143466802832, + "learning_rate": 2.4279723750111683e-06, + "loss": 0.5122, + "step": 6359 + }, + { + "epoch": 0.7803680981595092, + "grad_norm": 0.8631609511254096, + "learning_rate": 2.4253771157896856e-06, + "loss": 0.5391, + "step": 6360 + }, + { + "epoch": 0.7804907975460122, + "grad_norm": 0.7945981366288127, + "learning_rate": 2.4227830529147645e-06, + "loss": 0.5524, + "step": 6361 + }, + { + "epoch": 0.7806134969325154, + "grad_norm": 0.8776401169893697, + "learning_rate": 2.42019018679612e-06, + "loss": 0.6489, + "step": 6362 + }, + { + "epoch": 0.7807361963190184, + "grad_norm": 0.9033408071535943, + "learning_rate": 2.4175985178432683e-06, + "loss": 0.5494, + "step": 6363 + }, + { + "epoch": 0.7808588957055215, + "grad_norm": 1.1506752299416414, + "learning_rate": 2.4150080464655457e-06, + "loss": 0.5923, + "step": 6364 + }, + { + "epoch": 0.7809815950920246, + "grad_norm": 0.8849169586775628, + "learning_rate": 2.4124187730720916e-06, + "loss": 0.5832, + "step": 6365 + }, + { + "epoch": 0.7811042944785276, + "grad_norm": 0.9474722798521764, + "learning_rate": 2.4098306980718622e-06, + "loss": 0.5545, + "step": 6366 + }, + { + "epoch": 0.7812269938650307, + "grad_norm": 0.8880339038071821, + "learning_rate": 2.4072438218736184e-06, + "loss": 0.5395, + "step": 6367 + }, + { + "epoch": 0.7813496932515337, + "grad_norm": 0.7697529332673366, + "learning_rate": 2.4046581448859387e-06, + "loss": 0.5273, + "step": 6368 + }, + { + "epoch": 0.7814723926380368, + "grad_norm": 1.134865587758651, + "learning_rate": 2.4020736675172073e-06, + "loss": 0.5942, + "step": 6369 + }, + { + "epoch": 0.7815950920245399, + "grad_norm": 0.9912538572510521, + "learning_rate": 2.3994903901756163e-06, + "loss": 0.5553, + "step": 6370 + }, + { + "epoch": 0.7817177914110429, + "grad_norm": 0.9908495948311703, + "learning_rate": 2.3969083132691785e-06, + "loss": 0.521, + "step": 6371 + }, + { + "epoch": 0.781840490797546, + "grad_norm": 0.8351583851495803, + "learning_rate": 2.3943274372057058e-06, + "loss": 0.5129, + "step": 6372 + 
}, + { + "epoch": 0.781963190184049, + "grad_norm": 0.9562163771315976, + "learning_rate": 2.3917477623928286e-06, + "loss": 0.5216, + "step": 6373 + }, + { + "epoch": 0.7820858895705521, + "grad_norm": 0.9009801974137172, + "learning_rate": 2.389169289237978e-06, + "loss": 0.5626, + "step": 6374 + }, + { + "epoch": 0.7822085889570553, + "grad_norm": 0.824700327714275, + "learning_rate": 2.3865920181484127e-06, + "loss": 0.4995, + "step": 6375 + }, + { + "epoch": 0.7823312883435583, + "grad_norm": 0.8636494115134975, + "learning_rate": 2.3840159495311842e-06, + "loss": 0.5286, + "step": 6376 + }, + { + "epoch": 0.7824539877300614, + "grad_norm": 0.9019925035329666, + "learning_rate": 2.3814410837931577e-06, + "loss": 0.5724, + "step": 6377 + }, + { + "epoch": 0.7825766871165644, + "grad_norm": 0.8107960777329211, + "learning_rate": 2.378867421341018e-06, + "loss": 0.5605, + "step": 6378 + }, + { + "epoch": 0.7826993865030675, + "grad_norm": 0.9633334156404918, + "learning_rate": 2.3762949625812448e-06, + "loss": 0.5159, + "step": 6379 + }, + { + "epoch": 0.7828220858895706, + "grad_norm": 0.8086301064119364, + "learning_rate": 2.3737237079201437e-06, + "loss": 0.5271, + "step": 6380 + }, + { + "epoch": 0.7829447852760736, + "grad_norm": 0.7323048551127876, + "learning_rate": 2.371153657763816e-06, + "loss": 0.5227, + "step": 6381 + }, + { + "epoch": 0.7830674846625767, + "grad_norm": 0.966315199331785, + "learning_rate": 2.3685848125181843e-06, + "loss": 0.5521, + "step": 6382 + }, + { + "epoch": 0.7831901840490798, + "grad_norm": 0.8852235082148927, + "learning_rate": 2.3660171725889703e-06, + "loss": 0.5734, + "step": 6383 + }, + { + "epoch": 0.7833128834355828, + "grad_norm": 1.1671845586638279, + "learning_rate": 2.363450738381715e-06, + "loss": 0.5807, + "step": 6384 + }, + { + "epoch": 0.7834355828220859, + "grad_norm": 0.8931694118223578, + "learning_rate": 2.3608855103017613e-06, + "loss": 0.593, + "step": 6385 + }, + { + "epoch": 0.7835582822085889, + "grad_norm": 0.977203799674938, + "learning_rate": 2.3583214887542705e-06, + "loss": 0.5477, + "step": 6386 + }, + { + "epoch": 0.783680981595092, + "grad_norm": 0.824073775766963, + "learning_rate": 2.355758674144203e-06, + "loss": 0.5509, + "step": 6387 + }, + { + "epoch": 0.7838036809815951, + "grad_norm": 0.7931284983472844, + "learning_rate": 2.3531970668763305e-06, + "loss": 0.5497, + "step": 6388 + }, + { + "epoch": 0.7839263803680981, + "grad_norm": 0.8852652983535539, + "learning_rate": 2.350636667355244e-06, + "loss": 0.5553, + "step": 6389 + }, + { + "epoch": 0.7840490797546013, + "grad_norm": 0.8747820354225659, + "learning_rate": 2.3480774759853307e-06, + "loss": 0.5508, + "step": 6390 + }, + { + "epoch": 0.7841717791411043, + "grad_norm": 0.9306456620404856, + "learning_rate": 2.345519493170798e-06, + "loss": 0.5352, + "step": 6391 + }, + { + "epoch": 0.7842944785276074, + "grad_norm": 0.9287886328515597, + "learning_rate": 2.3429627193156513e-06, + "loss": 0.593, + "step": 6392 + }, + { + "epoch": 0.7844171779141105, + "grad_norm": 0.9417181056521587, + "learning_rate": 2.3404071548237183e-06, + "loss": 0.5534, + "step": 6393 + }, + { + "epoch": 0.7845398773006135, + "grad_norm": 0.9215381845609574, + "learning_rate": 2.3378528000986213e-06, + "loss": 0.4884, + "step": 6394 + }, + { + "epoch": 0.7846625766871166, + "grad_norm": 0.9242645428174622, + "learning_rate": 2.3352996555438036e-06, + "loss": 0.5623, + "step": 6395 + }, + { + "epoch": 0.7847852760736196, + "grad_norm": 0.930182200108994, + "learning_rate": 
2.3327477215625094e-06, + "loss": 0.558, + "step": 6396 + }, + { + "epoch": 0.7849079754601227, + "grad_norm": 0.8773620037361087, + "learning_rate": 2.3301969985577975e-06, + "loss": 0.5479, + "step": 6397 + }, + { + "epoch": 0.7850306748466258, + "grad_norm": 0.866415263411179, + "learning_rate": 2.3276474869325295e-06, + "loss": 0.571, + "step": 6398 + }, + { + "epoch": 0.7851533742331288, + "grad_norm": 1.031387404403391, + "learning_rate": 2.3250991870893837e-06, + "loss": 0.5456, + "step": 6399 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 0.8942815636751069, + "learning_rate": 2.3225520994308382e-06, + "loss": 0.6017, + "step": 6400 + }, + { + "epoch": 0.7853987730061349, + "grad_norm": 0.9983984427939996, + "learning_rate": 2.320006224359178e-06, + "loss": 0.5257, + "step": 6401 + }, + { + "epoch": 0.785521472392638, + "grad_norm": 0.8552043329320427, + "learning_rate": 2.317461562276515e-06, + "loss": 0.4844, + "step": 6402 + }, + { + "epoch": 0.7856441717791411, + "grad_norm": 0.9184148523757963, + "learning_rate": 2.3149181135847475e-06, + "loss": 0.5649, + "step": 6403 + }, + { + "epoch": 0.7857668711656441, + "grad_norm": 0.7959926566971384, + "learning_rate": 2.312375878685598e-06, + "loss": 0.4457, + "step": 6404 + }, + { + "epoch": 0.7858895705521473, + "grad_norm": 1.2655447148119419, + "learning_rate": 2.309834857980583e-06, + "loss": 0.5478, + "step": 6405 + }, + { + "epoch": 0.7860122699386503, + "grad_norm": 0.8980343462266352, + "learning_rate": 2.307295051871041e-06, + "loss": 0.5323, + "step": 6406 + }, + { + "epoch": 0.7861349693251534, + "grad_norm": 0.8942627726054845, + "learning_rate": 2.304756460758111e-06, + "loss": 0.5351, + "step": 6407 + }, + { + "epoch": 0.7862576687116565, + "grad_norm": 0.8597272314792082, + "learning_rate": 2.3022190850427386e-06, + "loss": 0.5526, + "step": 6408 + }, + { + "epoch": 0.7863803680981595, + "grad_norm": 1.0161289934643827, + "learning_rate": 2.299682925125685e-06, + "loss": 0.5375, + "step": 6409 + }, + { + "epoch": 0.7865030674846626, + "grad_norm": 1.0163337832699473, + "learning_rate": 2.297147981407509e-06, + "loss": 0.604, + "step": 6410 + }, + { + "epoch": 0.7866257668711657, + "grad_norm": 0.7665014608375627, + "learning_rate": 2.2946142542885895e-06, + "loss": 0.5893, + "step": 6411 + }, + { + "epoch": 0.7867484662576687, + "grad_norm": 0.7889035989325581, + "learning_rate": 2.2920817441691024e-06, + "loss": 0.5341, + "step": 6412 + }, + { + "epoch": 0.7868711656441718, + "grad_norm": 0.9428926301444834, + "learning_rate": 2.2895504514490397e-06, + "loss": 0.5126, + "step": 6413 + }, + { + "epoch": 0.7869938650306748, + "grad_norm": 0.8625504329565008, + "learning_rate": 2.287020376528193e-06, + "loss": 0.5555, + "step": 6414 + }, + { + "epoch": 0.7871165644171779, + "grad_norm": 0.8566024569021179, + "learning_rate": 2.2844915198061714e-06, + "loss": 0.547, + "step": 6415 + }, + { + "epoch": 0.787239263803681, + "grad_norm": 0.8560795658692578, + "learning_rate": 2.2819638816823796e-06, + "loss": 0.4989, + "step": 6416 + }, + { + "epoch": 0.787361963190184, + "grad_norm": 0.8653174458100663, + "learning_rate": 2.2794374625560423e-06, + "loss": 0.539, + "step": 6417 + }, + { + "epoch": 0.7874846625766871, + "grad_norm": 0.9429797901662791, + "learning_rate": 2.276912262826183e-06, + "loss": 0.6057, + "step": 6418 + }, + { + "epoch": 0.7876073619631901, + "grad_norm": 1.0678935070320505, + "learning_rate": 2.274388282891632e-06, + "loss": 0.5633, + "step": 6419 + }, + { + "epoch": 0.7877300613496933, + 
"grad_norm": 0.8569591531679419, + "learning_rate": 2.2718655231510368e-06, + "loss": 0.5156, + "step": 6420 + }, + { + "epoch": 0.7878527607361964, + "grad_norm": 0.8733251634648385, + "learning_rate": 2.2693439840028387e-06, + "loss": 0.579, + "step": 6421 + }, + { + "epoch": 0.7879754601226994, + "grad_norm": 0.8926324246046327, + "learning_rate": 2.2668236658453e-06, + "loss": 0.5842, + "step": 6422 + }, + { + "epoch": 0.7880981595092025, + "grad_norm": 0.931526590395179, + "learning_rate": 2.2643045690764774e-06, + "loss": 0.5042, + "step": 6423 + }, + { + "epoch": 0.7882208588957055, + "grad_norm": 0.9104728694549975, + "learning_rate": 2.2617866940942467e-06, + "loss": 0.5278, + "step": 6424 + }, + { + "epoch": 0.7883435582822086, + "grad_norm": 0.9104498255284545, + "learning_rate": 2.2592700412962775e-06, + "loss": 0.5966, + "step": 6425 + }, + { + "epoch": 0.7884662576687117, + "grad_norm": 0.9041122847444971, + "learning_rate": 2.2567546110800598e-06, + "loss": 0.5112, + "step": 6426 + }, + { + "epoch": 0.7885889570552147, + "grad_norm": 0.8942178207470172, + "learning_rate": 2.254240403842878e-06, + "loss": 0.5208, + "step": 6427 + }, + { + "epoch": 0.7887116564417178, + "grad_norm": 0.9175808332400223, + "learning_rate": 2.251727419981836e-06, + "loss": 0.5229, + "step": 6428 + }, + { + "epoch": 0.7888343558282208, + "grad_norm": 0.9793291542935054, + "learning_rate": 2.249215659893833e-06, + "loss": 0.5008, + "step": 6429 + }, + { + "epoch": 0.7889570552147239, + "grad_norm": 0.7887403119831412, + "learning_rate": 2.246705123975582e-06, + "loss": 0.5617, + "step": 6430 + }, + { + "epoch": 0.789079754601227, + "grad_norm": 0.9007838006848147, + "learning_rate": 2.244195812623602e-06, + "loss": 0.5466, + "step": 6431 + }, + { + "epoch": 0.78920245398773, + "grad_norm": 0.9978766705460054, + "learning_rate": 2.241687726234214e-06, + "loss": 0.5381, + "step": 6432 + }, + { + "epoch": 0.7893251533742331, + "grad_norm": 0.7986411840322267, + "learning_rate": 2.239180865203552e-06, + "loss": 0.514, + "step": 6433 + }, + { + "epoch": 0.7894478527607361, + "grad_norm": 0.804210258599009, + "learning_rate": 2.2366752299275497e-06, + "loss": 0.4891, + "step": 6434 + }, + { + "epoch": 0.7895705521472393, + "grad_norm": 0.9073081600935783, + "learning_rate": 2.234170820801954e-06, + "loss": 0.5201, + "step": 6435 + }, + { + "epoch": 0.7896932515337424, + "grad_norm": 1.0895907298593759, + "learning_rate": 2.2316676382223134e-06, + "loss": 0.5635, + "step": 6436 + }, + { + "epoch": 0.7898159509202454, + "grad_norm": 0.8753662000325216, + "learning_rate": 2.2291656825839814e-06, + "loss": 0.5633, + "step": 6437 + }, + { + "epoch": 0.7899386503067485, + "grad_norm": 0.8597371585279941, + "learning_rate": 2.2266649542821263e-06, + "loss": 0.593, + "step": 6438 + }, + { + "epoch": 0.7900613496932515, + "grad_norm": 0.9505089297083199, + "learning_rate": 2.22416545371171e-06, + "loss": 0.5365, + "step": 6439 + }, + { + "epoch": 0.7901840490797546, + "grad_norm": 0.911805336930484, + "learning_rate": 2.2216671812675118e-06, + "loss": 0.5576, + "step": 6440 + }, + { + "epoch": 0.7903067484662577, + "grad_norm": 0.7955344849730641, + "learning_rate": 2.2191701373441087e-06, + "loss": 0.5328, + "step": 6441 + }, + { + "epoch": 0.7904294478527607, + "grad_norm": 0.858875370953626, + "learning_rate": 2.2166743223358923e-06, + "loss": 0.5449, + "step": 6442 + }, + { + "epoch": 0.7905521472392638, + "grad_norm": 0.9873085547199739, + "learning_rate": 2.2141797366370498e-06, + "loss": 0.5152, + "step": 
6443 + }, + { + "epoch": 0.7906748466257669, + "grad_norm": 1.0060586014313184, + "learning_rate": 2.2116863806415843e-06, + "loss": 0.5508, + "step": 6444 + }, + { + "epoch": 0.7907975460122699, + "grad_norm": 0.7937900892691276, + "learning_rate": 2.209194254743295e-06, + "loss": 0.5599, + "step": 6445 + }, + { + "epoch": 0.790920245398773, + "grad_norm": 0.8824294193764151, + "learning_rate": 2.2067033593357977e-06, + "loss": 0.5248, + "step": 6446 + }, + { + "epoch": 0.791042944785276, + "grad_norm": 0.8068368621669486, + "learning_rate": 2.2042136948125014e-06, + "loss": 0.5677, + "step": 6447 + }, + { + "epoch": 0.7911656441717791, + "grad_norm": 0.9109501538790725, + "learning_rate": 2.201725261566634e-06, + "loss": 0.5484, + "step": 6448 + }, + { + "epoch": 0.7912883435582823, + "grad_norm": 0.8991320453254948, + "learning_rate": 2.1992380599912176e-06, + "loss": 0.5415, + "step": 6449 + }, + { + "epoch": 0.7914110429447853, + "grad_norm": 0.8669433238753704, + "learning_rate": 2.196752090479083e-06, + "loss": 0.5697, + "step": 6450 + }, + { + "epoch": 0.7915337423312884, + "grad_norm": 0.8494079757163666, + "learning_rate": 2.194267353422873e-06, + "loss": 0.4882, + "step": 6451 + }, + { + "epoch": 0.7916564417177914, + "grad_norm": 0.9317093377260794, + "learning_rate": 2.1917838492150245e-06, + "loss": 0.5158, + "step": 6452 + }, + { + "epoch": 0.7917791411042945, + "grad_norm": 0.8935237335695831, + "learning_rate": 2.189301578247791e-06, + "loss": 0.4959, + "step": 6453 + }, + { + "epoch": 0.7919018404907976, + "grad_norm": 0.8914782368213114, + "learning_rate": 2.1868205409132205e-06, + "loss": 0.576, + "step": 6454 + }, + { + "epoch": 0.7920245398773006, + "grad_norm": 1.0217606406688675, + "learning_rate": 2.184340737603178e-06, + "loss": 0.5116, + "step": 6455 + }, + { + "epoch": 0.7921472392638037, + "grad_norm": 0.8590542821838095, + "learning_rate": 2.1818621687093213e-06, + "loss": 0.5315, + "step": 6456 + }, + { + "epoch": 0.7922699386503067, + "grad_norm": 0.8031382676075073, + "learning_rate": 2.1793848346231195e-06, + "loss": 0.5481, + "step": 6457 + }, + { + "epoch": 0.7923926380368098, + "grad_norm": 0.848995616577967, + "learning_rate": 2.176908735735852e-06, + "loss": 0.5688, + "step": 6458 + }, + { + "epoch": 0.7925153374233129, + "grad_norm": 0.8821012847602615, + "learning_rate": 2.1744338724385906e-06, + "loss": 0.4546, + "step": 6459 + }, + { + "epoch": 0.7926380368098159, + "grad_norm": 0.7948991154038145, + "learning_rate": 2.1719602451222245e-06, + "loss": 0.5701, + "step": 6460 + }, + { + "epoch": 0.792760736196319, + "grad_norm": 0.9077662394919123, + "learning_rate": 2.1694878541774355e-06, + "loss": 0.5909, + "step": 6461 + }, + { + "epoch": 0.792883435582822, + "grad_norm": 0.9592308054258385, + "learning_rate": 2.1670166999947228e-06, + "loss": 0.601, + "step": 6462 + }, + { + "epoch": 0.7930061349693251, + "grad_norm": 0.7684500987162692, + "learning_rate": 2.1645467829643786e-06, + "loss": 0.5332, + "step": 6463 + }, + { + "epoch": 0.7931288343558283, + "grad_norm": 0.8749502758346829, + "learning_rate": 2.1620781034765106e-06, + "loss": 0.5602, + "step": 6464 + }, + { + "epoch": 0.7932515337423313, + "grad_norm": 0.9255649944503427, + "learning_rate": 2.159610661921018e-06, + "loss": 0.5491, + "step": 6465 + }, + { + "epoch": 0.7933742331288344, + "grad_norm": 0.9520211099230025, + "learning_rate": 2.15714445868762e-06, + "loss": 0.5341, + "step": 6466 + }, + { + "epoch": 0.7934969325153374, + "grad_norm": 0.7462345948937332, + "learning_rate": 
2.154679494165829e-06, + "loss": 0.5188, + "step": 6467 + }, + { + "epoch": 0.7936196319018405, + "grad_norm": 0.9869357488883812, + "learning_rate": 2.1522157687449608e-06, + "loss": 0.5452, + "step": 6468 + }, + { + "epoch": 0.7937423312883436, + "grad_norm": 0.7687034767496871, + "learning_rate": 2.1497532828141463e-06, + "loss": 0.4993, + "step": 6469 + }, + { + "epoch": 0.7938650306748466, + "grad_norm": 1.0136949217710491, + "learning_rate": 2.1472920367623094e-06, + "loss": 0.5715, + "step": 6470 + }, + { + "epoch": 0.7939877300613497, + "grad_norm": 0.8479539782449588, + "learning_rate": 2.1448320309781855e-06, + "loss": 0.5528, + "step": 6471 + }, + { + "epoch": 0.7941104294478527, + "grad_norm": 0.8579265863545134, + "learning_rate": 2.142373265850309e-06, + "loss": 0.582, + "step": 6472 + }, + { + "epoch": 0.7942331288343558, + "grad_norm": 0.8983342926764901, + "learning_rate": 2.1399157417670233e-06, + "loss": 0.5257, + "step": 6473 + }, + { + "epoch": 0.7943558282208589, + "grad_norm": 0.8076165729377454, + "learning_rate": 2.13745945911647e-06, + "loss": 0.5264, + "step": 6474 + }, + { + "epoch": 0.7944785276073619, + "grad_norm": 0.9374832712783021, + "learning_rate": 2.1350044182866025e-06, + "loss": 0.5368, + "step": 6475 + }, + { + "epoch": 0.794601226993865, + "grad_norm": 0.9357247307423373, + "learning_rate": 2.132550619665168e-06, + "loss": 0.523, + "step": 6476 + }, + { + "epoch": 0.7947239263803682, + "grad_norm": 0.8774673465518094, + "learning_rate": 2.130098063639727e-06, + "loss": 0.494, + "step": 6477 + }, + { + "epoch": 0.7948466257668712, + "grad_norm": 0.8665505310324556, + "learning_rate": 2.127646750597636e-06, + "loss": 0.5625, + "step": 6478 + }, + { + "epoch": 0.7949693251533743, + "grad_norm": 0.8898624674409802, + "learning_rate": 2.1251966809260647e-06, + "loss": 0.5313, + "step": 6479 + }, + { + "epoch": 0.7950920245398773, + "grad_norm": 0.977037545248552, + "learning_rate": 2.1227478550119763e-06, + "loss": 0.5545, + "step": 6480 + }, + { + "epoch": 0.7952147239263804, + "grad_norm": 0.9522702350250302, + "learning_rate": 2.1203002732421386e-06, + "loss": 0.5833, + "step": 6481 + }, + { + "epoch": 0.7953374233128835, + "grad_norm": 1.0574240809271387, + "learning_rate": 2.1178539360031335e-06, + "loss": 0.5569, + "step": 6482 + }, + { + "epoch": 0.7954601226993865, + "grad_norm": 0.8625559753797555, + "learning_rate": 2.1154088436813314e-06, + "loss": 0.5155, + "step": 6483 + }, + { + "epoch": 0.7955828220858896, + "grad_norm": 1.0167777644749492, + "learning_rate": 2.1129649966629185e-06, + "loss": 0.5914, + "step": 6484 + }, + { + "epoch": 0.7957055214723926, + "grad_norm": 0.9489985155743519, + "learning_rate": 2.1105223953338805e-06, + "loss": 0.5275, + "step": 6485 + }, + { + "epoch": 0.7958282208588957, + "grad_norm": 1.012384362904631, + "learning_rate": 2.10808104008e-06, + "loss": 0.5752, + "step": 6486 + }, + { + "epoch": 0.7959509202453988, + "grad_norm": 0.9443020941769509, + "learning_rate": 2.1056409312868754e-06, + "loss": 0.5469, + "step": 6487 + }, + { + "epoch": 0.7960736196319018, + "grad_norm": 0.775971458926631, + "learning_rate": 2.1032020693398925e-06, + "loss": 0.5591, + "step": 6488 + }, + { + "epoch": 0.7961963190184049, + "grad_norm": 0.8301015023148359, + "learning_rate": 2.100764454624257e-06, + "loss": 0.5006, + "step": 6489 + }, + { + "epoch": 0.7963190184049079, + "grad_norm": 0.7736858696617622, + "learning_rate": 2.09832808752496e-06, + "loss": 0.5511, + "step": 6490 + }, + { + "epoch": 0.796441717791411, + 
"grad_norm": 0.838057219486667, + "learning_rate": 2.095892968426815e-06, + "loss": 0.5015, + "step": 6491 + }, + { + "epoch": 0.7965644171779142, + "grad_norm": 0.9614569703699929, + "learning_rate": 2.0934590977144186e-06, + "loss": 0.5321, + "step": 6492 + }, + { + "epoch": 0.7966871165644172, + "grad_norm": 1.0647665747773396, + "learning_rate": 2.0910264757721866e-06, + "loss": 0.5295, + "step": 6493 + }, + { + "epoch": 0.7968098159509203, + "grad_norm": 0.9599855741739739, + "learning_rate": 2.0885951029843256e-06, + "loss": 0.5174, + "step": 6494 + }, + { + "epoch": 0.7969325153374233, + "grad_norm": 0.7918067819920878, + "learning_rate": 2.086164979734856e-06, + "loss": 0.5186, + "step": 6495 + }, + { + "epoch": 0.7970552147239264, + "grad_norm": 0.8598923373901719, + "learning_rate": 2.083736106407588e-06, + "loss": 0.5569, + "step": 6496 + }, + { + "epoch": 0.7971779141104295, + "grad_norm": 0.8342401274244906, + "learning_rate": 2.0813084833861473e-06, + "loss": 0.4847, + "step": 6497 + }, + { + "epoch": 0.7973006134969325, + "grad_norm": 0.8647454866181763, + "learning_rate": 2.0788821110539536e-06, + "loss": 0.5267, + "step": 6498 + }, + { + "epoch": 0.7974233128834356, + "grad_norm": 0.9567031501744785, + "learning_rate": 2.076456989794229e-06, + "loss": 0.5329, + "step": 6499 + }, + { + "epoch": 0.7975460122699386, + "grad_norm": 0.8652018152696551, + "learning_rate": 2.0740331199900053e-06, + "loss": 0.5563, + "step": 6500 + }, + { + "epoch": 0.7976687116564417, + "grad_norm": 0.933924050247351, + "learning_rate": 2.0716105020241074e-06, + "loss": 0.5139, + "step": 6501 + }, + { + "epoch": 0.7977914110429448, + "grad_norm": 0.9177583445992002, + "learning_rate": 2.0691891362791715e-06, + "loss": 0.5933, + "step": 6502 + }, + { + "epoch": 0.7979141104294478, + "grad_norm": 0.9932005439318173, + "learning_rate": 2.0667690231376257e-06, + "loss": 0.5583, + "step": 6503 + }, + { + "epoch": 0.7980368098159509, + "grad_norm": 0.8568447608637619, + "learning_rate": 2.064350162981714e-06, + "loss": 0.57, + "step": 6504 + }, + { + "epoch": 0.798159509202454, + "grad_norm": 0.9880766599335951, + "learning_rate": 2.0619325561934658e-06, + "loss": 0.5171, + "step": 6505 + }, + { + "epoch": 0.798282208588957, + "grad_norm": 0.8321675009569491, + "learning_rate": 2.0595162031547287e-06, + "loss": 0.5278, + "step": 6506 + }, + { + "epoch": 0.7984049079754602, + "grad_norm": 0.9063314373509933, + "learning_rate": 2.05710110424714e-06, + "loss": 0.5048, + "step": 6507 + }, + { + "epoch": 0.7985276073619632, + "grad_norm": 0.8904686547762493, + "learning_rate": 2.0546872598521474e-06, + "loss": 0.5258, + "step": 6508 + }, + { + "epoch": 0.7986503067484663, + "grad_norm": 0.912110508385768, + "learning_rate": 2.0522746703509944e-06, + "loss": 0.5796, + "step": 6509 + }, + { + "epoch": 0.7987730061349694, + "grad_norm": 0.976473337741852, + "learning_rate": 2.0498633361247278e-06, + "loss": 0.44, + "step": 6510 + }, + { + "epoch": 0.7988957055214724, + "grad_norm": 0.859005642303997, + "learning_rate": 2.047453257554202e-06, + "loss": 0.5568, + "step": 6511 + }, + { + "epoch": 0.7990184049079755, + "grad_norm": 0.8240026672321273, + "learning_rate": 2.0450444350200584e-06, + "loss": 0.5302, + "step": 6512 + }, + { + "epoch": 0.7991411042944785, + "grad_norm": 0.9137102080925293, + "learning_rate": 2.042636868902761e-06, + "loss": 0.5252, + "step": 6513 + }, + { + "epoch": 0.7992638036809816, + "grad_norm": 1.0165924606883723, + "learning_rate": 2.040230559582558e-06, + "loss": 0.5516, + 
"step": 6514 + }, + { + "epoch": 0.7993865030674847, + "grad_norm": 0.8818613942923158, + "learning_rate": 2.0378255074395094e-06, + "loss": 0.5282, + "step": 6515 + }, + { + "epoch": 0.7995092024539877, + "grad_norm": 0.8562576149653613, + "learning_rate": 2.0354217128534704e-06, + "loss": 0.5031, + "step": 6516 + }, + { + "epoch": 0.7996319018404908, + "grad_norm": 0.826832640539906, + "learning_rate": 2.033019176204095e-06, + "loss": 0.5637, + "step": 6517 + }, + { + "epoch": 0.7997546012269938, + "grad_norm": 0.8677357647015903, + "learning_rate": 2.030617897870851e-06, + "loss": 0.542, + "step": 6518 + }, + { + "epoch": 0.7998773006134969, + "grad_norm": 0.9644964031129206, + "learning_rate": 2.028217878232993e-06, + "loss": 0.543, + "step": 6519 + }, + { + "epoch": 0.8, + "grad_norm": 0.8998296564726476, + "learning_rate": 2.0258191176695896e-06, + "loss": 0.4991, + "step": 6520 + }, + { + "epoch": 0.800122699386503, + "grad_norm": 0.8098497182820869, + "learning_rate": 2.0234216165594988e-06, + "loss": 0.4947, + "step": 6521 + }, + { + "epoch": 0.8002453987730062, + "grad_norm": 0.8544199063691265, + "learning_rate": 2.0210253752813903e-06, + "loss": 0.555, + "step": 6522 + }, + { + "epoch": 0.8003680981595092, + "grad_norm": 0.8183426725522255, + "learning_rate": 2.018630394213724e-06, + "loss": 0.4802, + "step": 6523 + }, + { + "epoch": 0.8004907975460123, + "grad_norm": 0.8480633456228946, + "learning_rate": 2.0162366737347738e-06, + "loss": 0.5936, + "step": 6524 + }, + { + "epoch": 0.8006134969325154, + "grad_norm": 0.9230335934129994, + "learning_rate": 2.0138442142226e-06, + "loss": 0.6001, + "step": 6525 + }, + { + "epoch": 0.8007361963190184, + "grad_norm": 0.7729539697059667, + "learning_rate": 2.011453016055077e-06, + "loss": 0.5169, + "step": 6526 + }, + { + "epoch": 0.8008588957055215, + "grad_norm": 0.8465711043728422, + "learning_rate": 2.0090630796098688e-06, + "loss": 0.5414, + "step": 6527 + }, + { + "epoch": 0.8009815950920245, + "grad_norm": 0.8271952290574116, + "learning_rate": 2.0066744052644505e-06, + "loss": 0.5562, + "step": 6528 + }, + { + "epoch": 0.8011042944785276, + "grad_norm": 0.8696999215871748, + "learning_rate": 2.0042869933960906e-06, + "loss": 0.506, + "step": 6529 + }, + { + "epoch": 0.8012269938650307, + "grad_norm": 1.0645684980718486, + "learning_rate": 2.001900844381857e-06, + "loss": 0.5734, + "step": 6530 + }, + { + "epoch": 0.8013496932515337, + "grad_norm": 0.835586341298176, + "learning_rate": 1.999515958598626e-06, + "loss": 0.5929, + "step": 6531 + }, + { + "epoch": 0.8014723926380368, + "grad_norm": 0.8012701600710459, + "learning_rate": 1.9971323364230653e-06, + "loss": 0.5357, + "step": 6532 + }, + { + "epoch": 0.8015950920245398, + "grad_norm": 0.838102306463735, + "learning_rate": 1.9947499782316537e-06, + "loss": 0.5549, + "step": 6533 + }, + { + "epoch": 0.8017177914110429, + "grad_norm": 0.9100832671470853, + "learning_rate": 1.992368884400657e-06, + "loss": 0.6004, + "step": 6534 + }, + { + "epoch": 0.801840490797546, + "grad_norm": 0.9352492400096544, + "learning_rate": 1.9899890553061565e-06, + "loss": 0.5877, + "step": 6535 + }, + { + "epoch": 0.801963190184049, + "grad_norm": 0.7597032300534069, + "learning_rate": 1.9876104913240167e-06, + "loss": 0.4937, + "step": 6536 + }, + { + "epoch": 0.8020858895705522, + "grad_norm": 0.8551271947479283, + "learning_rate": 1.9852331928299205e-06, + "loss": 0.5018, + "step": 6537 + }, + { + "epoch": 0.8022085889570553, + "grad_norm": 1.0421677918327892, + "learning_rate": 
1.982857160199334e-06, + "loss": 0.5637, + "step": 6538 + }, + { + "epoch": 0.8023312883435583, + "grad_norm": 0.9298626312520283, + "learning_rate": 1.9804823938075344e-06, + "loss": 0.5852, + "step": 6539 + }, + { + "epoch": 0.8024539877300614, + "grad_norm": 0.969745616042372, + "learning_rate": 1.978108894029598e-06, + "loss": 0.6246, + "step": 6540 + }, + { + "epoch": 0.8025766871165644, + "grad_norm": 0.8428584321363278, + "learning_rate": 1.9757366612403938e-06, + "loss": 0.5441, + "step": 6541 + }, + { + "epoch": 0.8026993865030675, + "grad_norm": 0.8726629587672163, + "learning_rate": 1.973365695814602e-06, + "loss": 0.5358, + "step": 6542 + }, + { + "epoch": 0.8028220858895706, + "grad_norm": 0.8821280272976575, + "learning_rate": 1.970995998126688e-06, + "loss": 0.5645, + "step": 6543 + }, + { + "epoch": 0.8029447852760736, + "grad_norm": 0.9562637287705603, + "learning_rate": 1.9686275685509337e-06, + "loss": 0.5704, + "step": 6544 + }, + { + "epoch": 0.8030674846625767, + "grad_norm": 0.8948754959606136, + "learning_rate": 1.9662604074614044e-06, + "loss": 0.639, + "step": 6545 + }, + { + "epoch": 0.8031901840490797, + "grad_norm": 0.8349960579456965, + "learning_rate": 1.96389451523198e-06, + "loss": 0.5194, + "step": 6546 + }, + { + "epoch": 0.8033128834355828, + "grad_norm": 0.8451417768200428, + "learning_rate": 1.961529892236328e-06, + "loss": 0.4863, + "step": 6547 + }, + { + "epoch": 0.8034355828220859, + "grad_norm": 0.8791349714824086, + "learning_rate": 1.9591665388479196e-06, + "loss": 0.5338, + "step": 6548 + }, + { + "epoch": 0.8035582822085889, + "grad_norm": 0.872138929243202, + "learning_rate": 1.9568044554400313e-06, + "loss": 0.5252, + "step": 6549 + }, + { + "epoch": 0.803680981595092, + "grad_norm": 0.8669977024696464, + "learning_rate": 1.954443642385727e-06, + "loss": 0.4913, + "step": 6550 + }, + { + "epoch": 0.803803680981595, + "grad_norm": 1.004914626783484, + "learning_rate": 1.952084100057884e-06, + "loss": 0.5503, + "step": 6551 + }, + { + "epoch": 0.8039263803680982, + "grad_norm": 0.8664777213880824, + "learning_rate": 1.9497258288291655e-06, + "loss": 0.6072, + "step": 6552 + }, + { + "epoch": 0.8040490797546013, + "grad_norm": 0.909804264586807, + "learning_rate": 1.947368829072046e-06, + "loss": 0.5632, + "step": 6553 + }, + { + "epoch": 0.8041717791411043, + "grad_norm": 0.8674950649783028, + "learning_rate": 1.945013101158787e-06, + "loss": 0.5181, + "step": 6554 + }, + { + "epoch": 0.8042944785276074, + "grad_norm": 0.9624687046187714, + "learning_rate": 1.9426586454614617e-06, + "loss": 0.517, + "step": 6555 + }, + { + "epoch": 0.8044171779141104, + "grad_norm": 0.9374018276582398, + "learning_rate": 1.9403054623519303e-06, + "loss": 0.4816, + "step": 6556 + }, + { + "epoch": 0.8045398773006135, + "grad_norm": 1.4750978040719351, + "learning_rate": 1.9379535522018623e-06, + "loss": 0.5292, + "step": 6557 + }, + { + "epoch": 0.8046625766871166, + "grad_norm": 0.7541973661953241, + "learning_rate": 1.9356029153827215e-06, + "loss": 0.5645, + "step": 6558 + }, + { + "epoch": 0.8047852760736196, + "grad_norm": 0.870038311253158, + "learning_rate": 1.933253552265767e-06, + "loss": 0.4715, + "step": 6559 + }, + { + "epoch": 0.8049079754601227, + "grad_norm": 0.8740173617385413, + "learning_rate": 1.9309054632220645e-06, + "loss": 0.608, + "step": 6560 + }, + { + "epoch": 0.8050306748466257, + "grad_norm": 0.9752095169831656, + "learning_rate": 1.9285586486224705e-06, + "loss": 0.5419, + "step": 6561 + }, + { + "epoch": 0.8051533742331288, + 
"grad_norm": 0.8100965619499285, + "learning_rate": 1.926213108837649e-06, + "loss": 0.5441, + "step": 6562 + }, + { + "epoch": 0.8052760736196319, + "grad_norm": 0.9020772166133411, + "learning_rate": 1.923868844238054e-06, + "loss": 0.5031, + "step": 6563 + }, + { + "epoch": 0.8053987730061349, + "grad_norm": 0.8575704976620127, + "learning_rate": 1.9215258551939443e-06, + "loss": 0.5718, + "step": 6564 + }, + { + "epoch": 0.805521472392638, + "grad_norm": 0.8893535387963964, + "learning_rate": 1.919184142075372e-06, + "loss": 0.5536, + "step": 6565 + }, + { + "epoch": 0.805644171779141, + "grad_norm": 0.8654214865844677, + "learning_rate": 1.916843705252195e-06, + "loss": 0.5576, + "step": 6566 + }, + { + "epoch": 0.8057668711656442, + "grad_norm": 0.9776910968022675, + "learning_rate": 1.9145045450940604e-06, + "loss": 0.5781, + "step": 6567 + }, + { + "epoch": 0.8058895705521473, + "grad_norm": 0.7726252271908352, + "learning_rate": 1.9121666619704204e-06, + "loss": 0.5931, + "step": 6568 + }, + { + "epoch": 0.8060122699386503, + "grad_norm": 0.9060928516230027, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.5255, + "step": 6569 + }, + { + "epoch": 0.8061349693251534, + "grad_norm": 0.8245376544900588, + "learning_rate": 1.9074947283034206e-06, + "loss": 0.5183, + "step": 6570 + }, + { + "epoch": 0.8062576687116565, + "grad_norm": 0.8810178029730472, + "learning_rate": 1.9051606784979515e-06, + "loss": 0.5756, + "step": 6571 + }, + { + "epoch": 0.8063803680981595, + "grad_norm": 0.8851528232094373, + "learning_rate": 1.9028279072027589e-06, + "loss": 0.5371, + "step": 6572 + }, + { + "epoch": 0.8065030674846626, + "grad_norm": 1.1232925230149684, + "learning_rate": 1.9004964147862882e-06, + "loss": 0.5654, + "step": 6573 + }, + { + "epoch": 0.8066257668711656, + "grad_norm": 0.8012008072804317, + "learning_rate": 1.8981662016167734e-06, + "loss": 0.5109, + "step": 6574 + }, + { + "epoch": 0.8067484662576687, + "grad_norm": 0.8493891001396279, + "learning_rate": 1.895837268062256e-06, + "loss": 0.5192, + "step": 6575 + }, + { + "epoch": 0.8068711656441718, + "grad_norm": 0.8217058424926507, + "learning_rate": 1.8935096144905686e-06, + "loss": 0.5825, + "step": 6576 + }, + { + "epoch": 0.8069938650306748, + "grad_norm": 0.9115234948741071, + "learning_rate": 1.8911832412693464e-06, + "loss": 0.5659, + "step": 6577 + }, + { + "epoch": 0.8071165644171779, + "grad_norm": 0.855631877377554, + "learning_rate": 1.8888581487660185e-06, + "loss": 0.5605, + "step": 6578 + }, + { + "epoch": 0.8072392638036809, + "grad_norm": 0.9079422926992295, + "learning_rate": 1.8865343373478118e-06, + "loss": 0.5108, + "step": 6579 + }, + { + "epoch": 0.807361963190184, + "grad_norm": 0.9355117304187693, + "learning_rate": 1.884211807381755e-06, + "loss": 0.548, + "step": 6580 + }, + { + "epoch": 0.8074846625766872, + "grad_norm": 0.8684158364516086, + "learning_rate": 1.881890559234668e-06, + "loss": 0.5615, + "step": 6581 + }, + { + "epoch": 0.8076073619631902, + "grad_norm": 0.904845909586359, + "learning_rate": 1.8795705932731778e-06, + "loss": 0.5735, + "step": 6582 + }, + { + "epoch": 0.8077300613496933, + "grad_norm": 0.895558654279662, + "learning_rate": 1.8772519098636976e-06, + "loss": 0.5752, + "step": 6583 + }, + { + "epoch": 0.8078527607361963, + "grad_norm": 0.8679898390354236, + "learning_rate": 1.874934509372448e-06, + "loss": 0.5758, + "step": 6584 + }, + { + "epoch": 0.8079754601226994, + "grad_norm": 0.9974813465218, + "learning_rate": 1.8726183921654373e-06, + "loss": 0.584, + 
"step": 6585 + }, + { + "epoch": 0.8080981595092025, + "grad_norm": 0.7882760637739831, + "learning_rate": 1.8703035586084817e-06, + "loss": 0.5745, + "step": 6586 + }, + { + "epoch": 0.8082208588957055, + "grad_norm": 0.8824086582832861, + "learning_rate": 1.8679900090671854e-06, + "loss": 0.5703, + "step": 6587 + }, + { + "epoch": 0.8083435582822086, + "grad_norm": 0.9993810326628568, + "learning_rate": 1.8656777439069561e-06, + "loss": 0.6005, + "step": 6588 + }, + { + "epoch": 0.8084662576687116, + "grad_norm": 1.000065040422675, + "learning_rate": 1.8633667634929963e-06, + "loss": 0.5623, + "step": 6589 + }, + { + "epoch": 0.8085889570552147, + "grad_norm": 0.9518885251241391, + "learning_rate": 1.8610570681903018e-06, + "loss": 0.4945, + "step": 6590 + }, + { + "epoch": 0.8087116564417178, + "grad_norm": 0.8701048920119479, + "learning_rate": 1.8587486583636727e-06, + "loss": 0.5803, + "step": 6591 + }, + { + "epoch": 0.8088343558282208, + "grad_norm": 0.9112032436895804, + "learning_rate": 1.856441534377701e-06, + "loss": 0.588, + "step": 6592 + }, + { + "epoch": 0.8089570552147239, + "grad_norm": 0.9020203711309351, + "learning_rate": 1.8541356965967782e-06, + "loss": 0.5643, + "step": 6593 + }, + { + "epoch": 0.8090797546012269, + "grad_norm": 0.9311728799314307, + "learning_rate": 1.8518311453850868e-06, + "loss": 0.5642, + "step": 6594 + }, + { + "epoch": 0.80920245398773, + "grad_norm": 0.9815902168665703, + "learning_rate": 1.8495278811066197e-06, + "loss": 0.5292, + "step": 6595 + }, + { + "epoch": 0.8093251533742332, + "grad_norm": 0.900277714857578, + "learning_rate": 1.8472259041251538e-06, + "loss": 0.5504, + "step": 6596 + }, + { + "epoch": 0.8094478527607362, + "grad_norm": 0.8715856073605809, + "learning_rate": 1.8449252148042617e-06, + "loss": 0.5297, + "step": 6597 + }, + { + "epoch": 0.8095705521472393, + "grad_norm": 0.8550567907375418, + "learning_rate": 1.842625813507325e-06, + "loss": 0.538, + "step": 6598 + }, + { + "epoch": 0.8096932515337424, + "grad_norm": 1.2183606426932656, + "learning_rate": 1.8403277005975084e-06, + "loss": 0.566, + "step": 6599 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 0.9060034791725023, + "learning_rate": 1.8380308764377841e-06, + "loss": 0.5357, + "step": 6600 + }, + { + "epoch": 0.8099386503067485, + "grad_norm": 0.9026229654982314, + "learning_rate": 1.8357353413909106e-06, + "loss": 0.527, + "step": 6601 + }, + { + "epoch": 0.8100613496932515, + "grad_norm": 0.8658531927151749, + "learning_rate": 1.8334410958194527e-06, + "loss": 0.5414, + "step": 6602 + }, + { + "epoch": 0.8101840490797546, + "grad_norm": 0.8870883170302762, + "learning_rate": 1.8311481400857622e-06, + "loss": 0.5325, + "step": 6603 + }, + { + "epoch": 0.8103067484662577, + "grad_norm": 0.952287166935617, + "learning_rate": 1.8288564745519966e-06, + "loss": 0.6009, + "step": 6604 + }, + { + "epoch": 0.8104294478527607, + "grad_norm": 1.0036972097887795, + "learning_rate": 1.8265660995801004e-06, + "loss": 0.5524, + "step": 6605 + }, + { + "epoch": 0.8105521472392638, + "grad_norm": 0.8165522489918626, + "learning_rate": 1.8242770155318223e-06, + "loss": 0.5364, + "step": 6606 + }, + { + "epoch": 0.8106748466257668, + "grad_norm": 0.8737411446533367, + "learning_rate": 1.8219892227687018e-06, + "loss": 0.5969, + "step": 6607 + }, + { + "epoch": 0.8107975460122699, + "grad_norm": 0.8521016410062174, + "learning_rate": 1.8197027216520734e-06, + "loss": 0.5619, + "step": 6608 + }, + { + "epoch": 0.810920245398773, + "grad_norm": 0.8813783385893844, + 
"learning_rate": 1.817417512543076e-06, + "loss": 0.5194, + "step": 6609 + }, + { + "epoch": 0.811042944785276, + "grad_norm": 0.9282863267916092, + "learning_rate": 1.8151335958026317e-06, + "loss": 0.5294, + "step": 6610 + }, + { + "epoch": 0.8111656441717792, + "grad_norm": 0.7585803607266042, + "learning_rate": 1.8128509717914733e-06, + "loss": 0.4315, + "step": 6611 + }, + { + "epoch": 0.8112883435582822, + "grad_norm": 0.9454145429270343, + "learning_rate": 1.8105696408701147e-06, + "loss": 0.4903, + "step": 6612 + }, + { + "epoch": 0.8114110429447853, + "grad_norm": 0.8790388212890932, + "learning_rate": 1.8082896033988784e-06, + "loss": 0.4935, + "step": 6613 + }, + { + "epoch": 0.8115337423312884, + "grad_norm": 0.7907878098636995, + "learning_rate": 1.8060108597378712e-06, + "loss": 0.5372, + "step": 6614 + }, + { + "epoch": 0.8116564417177914, + "grad_norm": 0.8720504457110897, + "learning_rate": 1.803733410247006e-06, + "loss": 0.536, + "step": 6615 + }, + { + "epoch": 0.8117791411042945, + "grad_norm": 0.9006352571613861, + "learning_rate": 1.8014572552859821e-06, + "loss": 0.4951, + "step": 6616 + }, + { + "epoch": 0.8119018404907975, + "grad_norm": 0.951746327321133, + "learning_rate": 1.7991823952143028e-06, + "loss": 0.5509, + "step": 6617 + }, + { + "epoch": 0.8120245398773006, + "grad_norm": 0.8177748199002272, + "learning_rate": 1.7969088303912574e-06, + "loss": 0.5477, + "step": 6618 + }, + { + "epoch": 0.8121472392638037, + "grad_norm": 0.8983507033291406, + "learning_rate": 1.794636561175942e-06, + "loss": 0.5786, + "step": 6619 + }, + { + "epoch": 0.8122699386503067, + "grad_norm": 0.8106913490779231, + "learning_rate": 1.7923655879272395e-06, + "loss": 0.5701, + "step": 6620 + }, + { + "epoch": 0.8123926380368098, + "grad_norm": 0.7830108617576624, + "learning_rate": 1.7900959110038274e-06, + "loss": 0.5392, + "step": 6621 + }, + { + "epoch": 0.8125153374233128, + "grad_norm": 0.8185643921120551, + "learning_rate": 1.7878275307641846e-06, + "loss": 0.6276, + "step": 6622 + }, + { + "epoch": 0.812638036809816, + "grad_norm": 0.8824969562109206, + "learning_rate": 1.785560447566581e-06, + "loss": 0.4987, + "step": 6623 + }, + { + "epoch": 0.8127607361963191, + "grad_norm": 0.918257682741519, + "learning_rate": 1.7832946617690872e-06, + "loss": 0.5402, + "step": 6624 + }, + { + "epoch": 0.8128834355828221, + "grad_norm": 0.9027439495730281, + "learning_rate": 1.7810301737295588e-06, + "loss": 0.5673, + "step": 6625 + }, + { + "epoch": 0.8130061349693252, + "grad_norm": 0.8647649541759735, + "learning_rate": 1.7787669838056575e-06, + "loss": 0.5762, + "step": 6626 + }, + { + "epoch": 0.8131288343558282, + "grad_norm": 0.8515830283169451, + "learning_rate": 1.776505092354831e-06, + "loss": 0.5145, + "step": 6627 + }, + { + "epoch": 0.8132515337423313, + "grad_norm": 0.8178787752492946, + "learning_rate": 1.7742444997343245e-06, + "loss": 0.5783, + "step": 6628 + }, + { + "epoch": 0.8133742331288344, + "grad_norm": 0.8582292652256964, + "learning_rate": 1.7719852063011844e-06, + "loss": 0.4942, + "step": 6629 + }, + { + "epoch": 0.8134969325153374, + "grad_norm": 0.8651500148305459, + "learning_rate": 1.76972721241224e-06, + "loss": 0.5739, + "step": 6630 + }, + { + "epoch": 0.8136196319018405, + "grad_norm": 0.9296997868948025, + "learning_rate": 1.767470518424129e-06, + "loss": 0.51, + "step": 6631 + }, + { + "epoch": 0.8137423312883436, + "grad_norm": 0.8559515350215907, + "learning_rate": 1.7652151246932703e-06, + "loss": 0.5224, + "step": 6632 + }, + { + "epoch": 
0.8138650306748466, + "grad_norm": 0.9383222225809347, + "learning_rate": 1.7629610315758893e-06, + "loss": 0.5743, + "step": 6633 + }, + { + "epoch": 0.8139877300613497, + "grad_norm": 0.7991503960057167, + "learning_rate": 1.7607082394279963e-06, + "loss": 0.5244, + "step": 6634 + }, + { + "epoch": 0.8141104294478527, + "grad_norm": 0.7703694184321719, + "learning_rate": 1.7584567486054039e-06, + "loss": 0.515, + "step": 6635 + }, + { + "epoch": 0.8142331288343558, + "grad_norm": 0.9610000944011363, + "learning_rate": 1.7562065594637124e-06, + "loss": 0.5403, + "step": 6636 + }, + { + "epoch": 0.814355828220859, + "grad_norm": 0.911386753153724, + "learning_rate": 1.753957672358324e-06, + "loss": 0.5922, + "step": 6637 + }, + { + "epoch": 0.814478527607362, + "grad_norm": 0.9609859307044175, + "learning_rate": 1.7517100876444294e-06, + "loss": 0.5594, + "step": 6638 + }, + { + "epoch": 0.8146012269938651, + "grad_norm": 0.968752746601028, + "learning_rate": 1.7494638056770119e-06, + "loss": 0.554, + "step": 6639 + }, + { + "epoch": 0.8147239263803681, + "grad_norm": 1.0233233555444512, + "learning_rate": 1.7472188268108569e-06, + "loss": 0.602, + "step": 6640 + }, + { + "epoch": 0.8148466257668712, + "grad_norm": 0.8688650667049777, + "learning_rate": 1.7449751514005365e-06, + "loss": 0.539, + "step": 6641 + }, + { + "epoch": 0.8149693251533743, + "grad_norm": 0.9070448544880922, + "learning_rate": 1.7427327798004235e-06, + "loss": 0.5421, + "step": 6642 + }, + { + "epoch": 0.8150920245398773, + "grad_norm": 0.8526832185114633, + "learning_rate": 1.7404917123646757e-06, + "loss": 0.5913, + "step": 6643 + }, + { + "epoch": 0.8152147239263804, + "grad_norm": 0.9823473395435084, + "learning_rate": 1.7382519494472571e-06, + "loss": 0.5833, + "step": 6644 + }, + { + "epoch": 0.8153374233128834, + "grad_norm": 0.8671455920415909, + "learning_rate": 1.7360134914019122e-06, + "loss": 0.5797, + "step": 6645 + }, + { + "epoch": 0.8154601226993865, + "grad_norm": 0.9257088120808078, + "learning_rate": 1.7337763385821926e-06, + "loss": 0.518, + "step": 6646 + }, + { + "epoch": 0.8155828220858896, + "grad_norm": 0.8796090161275594, + "learning_rate": 1.7315404913414325e-06, + "loss": 0.5477, + "step": 6647 + }, + { + "epoch": 0.8157055214723926, + "grad_norm": 0.8617021706906645, + "learning_rate": 1.729305950032768e-06, + "loss": 0.5563, + "step": 6648 + }, + { + "epoch": 0.8158282208588957, + "grad_norm": 0.7892168514354706, + "learning_rate": 1.7270727150091215e-06, + "loss": 0.5162, + "step": 6649 + }, + { + "epoch": 0.8159509202453987, + "grad_norm": 0.9728058683186198, + "learning_rate": 1.7248407866232175e-06, + "loss": 0.4815, + "step": 6650 + }, + { + "epoch": 0.8160736196319018, + "grad_norm": 0.9174223112205531, + "learning_rate": 1.7226101652275695e-06, + "loss": 0.5344, + "step": 6651 + }, + { + "epoch": 0.816196319018405, + "grad_norm": 0.8064975444246848, + "learning_rate": 1.7203808511744824e-06, + "loss": 0.5126, + "step": 6652 + }, + { + "epoch": 0.816319018404908, + "grad_norm": 0.7748783584834578, + "learning_rate": 1.7181528448160611e-06, + "loss": 0.5583, + "step": 6653 + }, + { + "epoch": 0.8164417177914111, + "grad_norm": 0.8395924822915994, + "learning_rate": 1.7159261465041954e-06, + "loss": 0.5743, + "step": 6654 + }, + { + "epoch": 0.8165644171779141, + "grad_norm": 0.787438530518658, + "learning_rate": 1.7137007565905772e-06, + "loss": 0.5554, + "step": 6655 + }, + { + "epoch": 0.8166871165644172, + "grad_norm": 0.8852664521356371, + "learning_rate": 
1.7114766754266842e-06, + "loss": 0.5214, + "step": 6656 + }, + { + "epoch": 0.8168098159509203, + "grad_norm": 0.9193758851078817, + "learning_rate": 1.7092539033637946e-06, + "loss": 0.5896, + "step": 6657 + }, + { + "epoch": 0.8169325153374233, + "grad_norm": 0.9987023713485751, + "learning_rate": 1.7070324407529749e-06, + "loss": 0.5297, + "step": 6658 + }, + { + "epoch": 0.8170552147239264, + "grad_norm": 0.8757960656250047, + "learning_rate": 1.7048122879450812e-06, + "loss": 0.5451, + "step": 6659 + }, + { + "epoch": 0.8171779141104294, + "grad_norm": 0.9514531269065271, + "learning_rate": 1.7025934452907755e-06, + "loss": 0.5436, + "step": 6660 + }, + { + "epoch": 0.8173006134969325, + "grad_norm": 0.937854536252588, + "learning_rate": 1.7003759131404984e-06, + "loss": 0.5209, + "step": 6661 + }, + { + "epoch": 0.8174233128834356, + "grad_norm": 0.9545672177223857, + "learning_rate": 1.6981596918444953e-06, + "loss": 0.5449, + "step": 6662 + }, + { + "epoch": 0.8175460122699386, + "grad_norm": 0.9711698809914905, + "learning_rate": 1.695944781752793e-06, + "loss": 0.5606, + "step": 6663 + }, + { + "epoch": 0.8176687116564417, + "grad_norm": 0.8912618285939552, + "learning_rate": 1.6937311832152248e-06, + "loss": 0.5397, + "step": 6664 + }, + { + "epoch": 0.8177914110429448, + "grad_norm": 0.9223086473701997, + "learning_rate": 1.6915188965814034e-06, + "loss": 0.563, + "step": 6665 + }, + { + "epoch": 0.8179141104294478, + "grad_norm": 0.9655356083624042, + "learning_rate": 1.6893079222007458e-06, + "loss": 0.5595, + "step": 6666 + }, + { + "epoch": 0.818036809815951, + "grad_norm": 0.9661929085703875, + "learning_rate": 1.6870982604224506e-06, + "loss": 0.5377, + "step": 6667 + }, + { + "epoch": 0.818159509202454, + "grad_norm": 0.8792921865439494, + "learning_rate": 1.6848899115955208e-06, + "loss": 0.5373, + "step": 6668 + }, + { + "epoch": 0.8182822085889571, + "grad_norm": 0.9206117049839854, + "learning_rate": 1.682682876068743e-06, + "loss": 0.5546, + "step": 6669 + }, + { + "epoch": 0.8184049079754602, + "grad_norm": 0.9662001832166099, + "learning_rate": 1.6804771541906972e-06, + "loss": 0.5394, + "step": 6670 + }, + { + "epoch": 0.8185276073619632, + "grad_norm": 0.896940717072658, + "learning_rate": 1.6782727463097626e-06, + "loss": 0.5216, + "step": 6671 + }, + { + "epoch": 0.8186503067484663, + "grad_norm": 0.832223360497499, + "learning_rate": 1.6760696527741027e-06, + "loss": 0.5483, + "step": 6672 + }, + { + "epoch": 0.8187730061349693, + "grad_norm": 1.0207852137975277, + "learning_rate": 1.6738678739316816e-06, + "loss": 0.5779, + "step": 6673 + }, + { + "epoch": 0.8188957055214724, + "grad_norm": 0.9468078129372328, + "learning_rate": 1.6716674101302465e-06, + "loss": 0.4722, + "step": 6674 + }, + { + "epoch": 0.8190184049079755, + "grad_norm": 0.7699254387079634, + "learning_rate": 1.6694682617173452e-06, + "loss": 0.5151, + "step": 6675 + }, + { + "epoch": 0.8191411042944785, + "grad_norm": 0.8575136809555483, + "learning_rate": 1.6672704290403119e-06, + "loss": 0.5552, + "step": 6676 + }, + { + "epoch": 0.8192638036809816, + "grad_norm": 0.9092530347028304, + "learning_rate": 1.6650739124462766e-06, + "loss": 0.5377, + "step": 6677 + }, + { + "epoch": 0.8193865030674846, + "grad_norm": 0.9191822921252538, + "learning_rate": 1.6628787122821622e-06, + "loss": 0.5512, + "step": 6678 + }, + { + "epoch": 0.8195092024539877, + "grad_norm": 0.9101591247439599, + "learning_rate": 1.6606848288946776e-06, + "loss": 0.5802, + "step": 6679 + }, + { + "epoch": 
0.8196319018404908, + "grad_norm": 0.9251941662053921, + "learning_rate": 1.6584922626303325e-06, + "loss": 0.5467, + "step": 6680 + }, + { + "epoch": 0.8197546012269938, + "grad_norm": 1.1147091519933943, + "learning_rate": 1.6563010138354185e-06, + "loss": 0.5417, + "step": 6681 + }, + { + "epoch": 0.819877300613497, + "grad_norm": 0.8287489634804838, + "learning_rate": 1.6541110828560303e-06, + "loss": 0.5349, + "step": 6682 + }, + { + "epoch": 0.82, + "grad_norm": 0.8873621161730125, + "learning_rate": 1.6519224700380422e-06, + "loss": 0.5467, + "step": 6683 + }, + { + "epoch": 0.8201226993865031, + "grad_norm": 0.8127673613678572, + "learning_rate": 1.649735175727134e-06, + "loss": 0.5509, + "step": 6684 + }, + { + "epoch": 0.8202453987730062, + "grad_norm": 0.9353848921795529, + "learning_rate": 1.6475492002687632e-06, + "loss": 0.5809, + "step": 6685 + }, + { + "epoch": 0.8203680981595092, + "grad_norm": 0.8903758300409808, + "learning_rate": 1.645364544008191e-06, + "loss": 0.5555, + "step": 6686 + }, + { + "epoch": 0.8204907975460123, + "grad_norm": 0.8265335293274719, + "learning_rate": 1.643181207290464e-06, + "loss": 0.5125, + "step": 6687 + }, + { + "epoch": 0.8206134969325153, + "grad_norm": 0.7724972233047553, + "learning_rate": 1.6409991904604173e-06, + "loss": 0.5374, + "step": 6688 + }, + { + "epoch": 0.8207361963190184, + "grad_norm": 0.8970124149823062, + "learning_rate": 1.6388184938626871e-06, + "loss": 0.5606, + "step": 6689 + }, + { + "epoch": 0.8208588957055215, + "grad_norm": 0.8397762869926751, + "learning_rate": 1.6366391178416918e-06, + "loss": 0.5291, + "step": 6690 + }, + { + "epoch": 0.8209815950920245, + "grad_norm": 0.846132242639059, + "learning_rate": 1.634461062741649e-06, + "loss": 0.5763, + "step": 6691 + }, + { + "epoch": 0.8211042944785276, + "grad_norm": 0.8532017541392048, + "learning_rate": 1.6322843289065581e-06, + "loss": 0.527, + "step": 6692 + }, + { + "epoch": 0.8212269938650307, + "grad_norm": 0.9452532072216833, + "learning_rate": 1.6301089166802232e-06, + "loss": 0.5636, + "step": 6693 + }, + { + "epoch": 0.8213496932515337, + "grad_norm": 0.8728328715288038, + "learning_rate": 1.6279348264062233e-06, + "loss": 0.5939, + "step": 6694 + }, + { + "epoch": 0.8214723926380368, + "grad_norm": 0.8888430390107439, + "learning_rate": 1.6257620584279454e-06, + "loss": 0.5759, + "step": 6695 + }, + { + "epoch": 0.8215950920245398, + "grad_norm": 1.0143737277576828, + "learning_rate": 1.6235906130885536e-06, + "loss": 0.577, + "step": 6696 + }, + { + "epoch": 0.821717791411043, + "grad_norm": 0.807257007375475, + "learning_rate": 1.6214204907310128e-06, + "loss": 0.5199, + "step": 6697 + }, + { + "epoch": 0.8218404907975461, + "grad_norm": 1.13622497953347, + "learning_rate": 1.6192516916980717e-06, + "loss": 0.5557, + "step": 6698 + }, + { + "epoch": 0.8219631901840491, + "grad_norm": 1.3476955390508023, + "learning_rate": 1.6170842163322775e-06, + "loss": 0.5768, + "step": 6699 + }, + { + "epoch": 0.8220858895705522, + "grad_norm": 0.9127366281252768, + "learning_rate": 1.6149180649759622e-06, + "loss": 0.5751, + "step": 6700 + }, + { + "epoch": 0.8222085889570552, + "grad_norm": 0.8091382990676321, + "learning_rate": 1.6127532379712473e-06, + "loss": 0.5497, + "step": 6701 + }, + { + "epoch": 0.8223312883435583, + "grad_norm": 0.9088132112311289, + "learning_rate": 1.6105897356600553e-06, + "loss": 0.547, + "step": 6702 + }, + { + "epoch": 0.8224539877300614, + "grad_norm": 0.8809176300987825, + "learning_rate": 1.608427558384088e-06, + "loss": 
0.5035, + "step": 6703 + }, + { + "epoch": 0.8225766871165644, + "grad_norm": 0.8464008058764535, + "learning_rate": 1.6062667064848424e-06, + "loss": 0.5135, + "step": 6704 + }, + { + "epoch": 0.8226993865030675, + "grad_norm": 1.0346649804182453, + "learning_rate": 1.60410718030361e-06, + "loss": 0.573, + "step": 6705 + }, + { + "epoch": 0.8228220858895705, + "grad_norm": 0.8414139068322378, + "learning_rate": 1.6019489801814693e-06, + "loss": 0.4981, + "step": 6706 + }, + { + "epoch": 0.8229447852760736, + "grad_norm": 0.7922156926247995, + "learning_rate": 1.599792106459288e-06, + "loss": 0.4719, + "step": 6707 + }, + { + "epoch": 0.8230674846625767, + "grad_norm": 0.9313950482622767, + "learning_rate": 1.5976365594777233e-06, + "loss": 0.5463, + "step": 6708 + }, + { + "epoch": 0.8231901840490797, + "grad_norm": 1.0262946107161166, + "learning_rate": 1.59548233957723e-06, + "loss": 0.5328, + "step": 6709 + }, + { + "epoch": 0.8233128834355828, + "grad_norm": 0.9921963215294416, + "learning_rate": 1.5933294470980443e-06, + "loss": 0.5231, + "step": 6710 + }, + { + "epoch": 0.8234355828220858, + "grad_norm": 0.8366110067113568, + "learning_rate": 1.5911778823802004e-06, + "loss": 0.4838, + "step": 6711 + }, + { + "epoch": 0.823558282208589, + "grad_norm": 0.9134489691789538, + "learning_rate": 1.5890276457635167e-06, + "loss": 0.5815, + "step": 6712 + }, + { + "epoch": 0.8236809815950921, + "grad_norm": 0.8394388926146419, + "learning_rate": 1.5868787375876082e-06, + "loss": 0.5556, + "step": 6713 + }, + { + "epoch": 0.8238036809815951, + "grad_norm": 1.1645268995124531, + "learning_rate": 1.5847311581918712e-06, + "loss": 0.5232, + "step": 6714 + }, + { + "epoch": 0.8239263803680982, + "grad_norm": 0.8483461110699625, + "learning_rate": 1.5825849079155032e-06, + "loss": 0.5458, + "step": 6715 + }, + { + "epoch": 0.8240490797546012, + "grad_norm": 0.9132517338458201, + "learning_rate": 1.5804399870974806e-06, + "loss": 0.5222, + "step": 6716 + }, + { + "epoch": 0.8241717791411043, + "grad_norm": 0.8346272848969352, + "learning_rate": 1.5782963960765807e-06, + "loss": 0.5529, + "step": 6717 + }, + { + "epoch": 0.8242944785276074, + "grad_norm": 0.9330802197502315, + "learning_rate": 1.5761541351913623e-06, + "loss": 0.5874, + "step": 6718 + }, + { + "epoch": 0.8244171779141104, + "grad_norm": 0.9026130318388336, + "learning_rate": 1.5740132047801738e-06, + "loss": 0.4912, + "step": 6719 + }, + { + "epoch": 0.8245398773006135, + "grad_norm": 0.8290932206318583, + "learning_rate": 1.5718736051811634e-06, + "loss": 0.5564, + "step": 6720 + }, + { + "epoch": 0.8246625766871165, + "grad_norm": 0.8997368064107947, + "learning_rate": 1.5697353367322564e-06, + "loss": 0.5463, + "step": 6721 + }, + { + "epoch": 0.8247852760736196, + "grad_norm": 0.8975400036735657, + "learning_rate": 1.5675983997711797e-06, + "loss": 0.5442, + "step": 6722 + }, + { + "epoch": 0.8249079754601227, + "grad_norm": 0.802969772186422, + "learning_rate": 1.5654627946354384e-06, + "loss": 0.5557, + "step": 6723 + }, + { + "epoch": 0.8250306748466257, + "grad_norm": 0.880392554023856, + "learning_rate": 1.5633285216623384e-06, + "loss": 0.5351, + "step": 6724 + }, + { + "epoch": 0.8251533742331288, + "grad_norm": 0.9651596595728303, + "learning_rate": 1.5611955811889645e-06, + "loss": 0.5043, + "step": 6725 + }, + { + "epoch": 0.825276073619632, + "grad_norm": 0.8318637669570874, + "learning_rate": 1.5590639735522006e-06, + "loss": 0.562, + "step": 6726 + }, + { + "epoch": 0.825398773006135, + "grad_norm": 
0.8956540669847836, + "learning_rate": 1.556933699088713e-06, + "loss": 0.4922, + "step": 6727 + }, + { + "epoch": 0.8255214723926381, + "grad_norm": 0.8283809304804435, + "learning_rate": 1.5548047581349624e-06, + "loss": 0.5173, + "step": 6728 + }, + { + "epoch": 0.8256441717791411, + "grad_norm": 0.8407053834380167, + "learning_rate": 1.5526771510271942e-06, + "loss": 0.538, + "step": 6729 + }, + { + "epoch": 0.8257668711656442, + "grad_norm": 0.8585455869069737, + "learning_rate": 1.5505508781014489e-06, + "loss": 0.5104, + "step": 6730 + }, + { + "epoch": 0.8258895705521473, + "grad_norm": 0.8671740692317732, + "learning_rate": 1.5484259396935508e-06, + "loss": 0.6014, + "step": 6731 + }, + { + "epoch": 0.8260122699386503, + "grad_norm": 0.8808698347236269, + "learning_rate": 1.5463023361391128e-06, + "loss": 0.529, + "step": 6732 + }, + { + "epoch": 0.8261349693251534, + "grad_norm": 0.7618683146412004, + "learning_rate": 1.5441800677735463e-06, + "loss": 0.5286, + "step": 6733 + }, + { + "epoch": 0.8262576687116564, + "grad_norm": 0.8939146479454746, + "learning_rate": 1.5420591349320401e-06, + "loss": 0.5356, + "step": 6734 + }, + { + "epoch": 0.8263803680981595, + "grad_norm": 0.834779364227376, + "learning_rate": 1.539939537949583e-06, + "loss": 0.5025, + "step": 6735 + }, + { + "epoch": 0.8265030674846626, + "grad_norm": 0.9090466961174998, + "learning_rate": 1.5378212771609425e-06, + "loss": 0.5492, + "step": 6736 + }, + { + "epoch": 0.8266257668711656, + "grad_norm": 0.7922905613105632, + "learning_rate": 1.5357043529006777e-06, + "loss": 0.5595, + "step": 6737 + }, + { + "epoch": 0.8267484662576687, + "grad_norm": 0.8409788422529515, + "learning_rate": 1.5335887655031446e-06, + "loss": 0.5835, + "step": 6738 + }, + { + "epoch": 0.8268711656441717, + "grad_norm": 0.8245810852517655, + "learning_rate": 1.5314745153024768e-06, + "loss": 0.505, + "step": 6739 + }, + { + "epoch": 0.8269938650306748, + "grad_norm": 0.8724534618914845, + "learning_rate": 1.5293616026326053e-06, + "loss": 0.5629, + "step": 6740 + }, + { + "epoch": 0.827116564417178, + "grad_norm": 0.959555475188923, + "learning_rate": 1.527250027827244e-06, + "loss": 0.5334, + "step": 6741 + }, + { + "epoch": 0.827239263803681, + "grad_norm": 0.8208163038248572, + "learning_rate": 1.5251397912199017e-06, + "loss": 0.5366, + "step": 6742 + }, + { + "epoch": 0.8273619631901841, + "grad_norm": 0.7542741463566516, + "learning_rate": 1.5230308931438675e-06, + "loss": 0.5365, + "step": 6743 + }, + { + "epoch": 0.8274846625766871, + "grad_norm": 0.8700264829762993, + "learning_rate": 1.520923333932227e-06, + "loss": 0.5434, + "step": 6744 + }, + { + "epoch": 0.8276073619631902, + "grad_norm": 0.8090007679353642, + "learning_rate": 1.5188171139178486e-06, + "loss": 0.4784, + "step": 6745 + }, + { + "epoch": 0.8277300613496933, + "grad_norm": 0.8681664082742047, + "learning_rate": 1.516712233433396e-06, + "loss": 0.5061, + "step": 6746 + }, + { + "epoch": 0.8278527607361963, + "grad_norm": 0.9512406482661137, + "learning_rate": 1.5146086928113113e-06, + "loss": 0.5868, + "step": 6747 + }, + { + "epoch": 0.8279754601226994, + "grad_norm": 0.9104756419952568, + "learning_rate": 1.512506492383835e-06, + "loss": 0.5214, + "step": 6748 + }, + { + "epoch": 0.8280981595092024, + "grad_norm": 0.9671109480864956, + "learning_rate": 1.5104056324829908e-06, + "loss": 0.572, + "step": 6749 + }, + { + "epoch": 0.8282208588957055, + "grad_norm": 0.9441475884354189, + "learning_rate": 1.5083061134405874e-06, + "loss": 0.5625, + "step": 6750 
+ }, + { + "epoch": 0.8283435582822086, + "grad_norm": 0.9234656713785683, + "learning_rate": 1.506207935588232e-06, + "loss": 0.5438, + "step": 6751 + }, + { + "epoch": 0.8284662576687116, + "grad_norm": 0.8692550979739614, + "learning_rate": 1.5041110992573083e-06, + "loss": 0.4917, + "step": 6752 + }, + { + "epoch": 0.8285889570552147, + "grad_norm": 0.9398449026046084, + "learning_rate": 1.502015604778999e-06, + "loss": 0.4855, + "step": 6753 + }, + { + "epoch": 0.8287116564417177, + "grad_norm": 0.8547073578644478, + "learning_rate": 1.499921452484263e-06, + "loss": 0.4955, + "step": 6754 + }, + { + "epoch": 0.8288343558282208, + "grad_norm": 0.9143059021246231, + "learning_rate": 1.4978286427038602e-06, + "loss": 0.5436, + "step": 6755 + }, + { + "epoch": 0.828957055214724, + "grad_norm": 0.8800710714911247, + "learning_rate": 1.4957371757683258e-06, + "loss": 0.4938, + "step": 6756 + }, + { + "epoch": 0.829079754601227, + "grad_norm": 0.9213324672123073, + "learning_rate": 1.493647052007995e-06, + "loss": 0.5348, + "step": 6757 + }, + { + "epoch": 0.8292024539877301, + "grad_norm": 0.8810224483016798, + "learning_rate": 1.4915582717529797e-06, + "loss": 0.5293, + "step": 6758 + }, + { + "epoch": 0.8293251533742332, + "grad_norm": 0.8737998404935466, + "learning_rate": 1.4894708353331887e-06, + "loss": 0.5386, + "step": 6759 + }, + { + "epoch": 0.8294478527607362, + "grad_norm": 0.8358524570628727, + "learning_rate": 1.4873847430783118e-06, + "loss": 0.5227, + "step": 6760 + }, + { + "epoch": 0.8295705521472393, + "grad_norm": 0.9398097681458862, + "learning_rate": 1.4852999953178293e-06, + "loss": 0.5774, + "step": 6761 + }, + { + "epoch": 0.8296932515337423, + "grad_norm": 0.8277686112367623, + "learning_rate": 1.4832165923810137e-06, + "loss": 0.5205, + "step": 6762 + }, + { + "epoch": 0.8298159509202454, + "grad_norm": 1.1942210830806217, + "learning_rate": 1.481134534596914e-06, + "loss": 0.5149, + "step": 6763 + }, + { + "epoch": 0.8299386503067485, + "grad_norm": 1.1563029222773569, + "learning_rate": 1.4790538222943795e-06, + "loss": 0.5621, + "step": 6764 + }, + { + "epoch": 0.8300613496932515, + "grad_norm": 1.152371976507019, + "learning_rate": 1.476974455802036e-06, + "loss": 0.5548, + "step": 6765 + }, + { + "epoch": 0.8301840490797546, + "grad_norm": 0.7713948916256862, + "learning_rate": 1.4748964354483053e-06, + "loss": 0.5832, + "step": 6766 + }, + { + "epoch": 0.8303067484662576, + "grad_norm": 0.9188219131237926, + "learning_rate": 1.4728197615613905e-06, + "loss": 0.5066, + "step": 6767 + }, + { + "epoch": 0.8304294478527607, + "grad_norm": 0.9245634636582519, + "learning_rate": 1.4707444344692833e-06, + "loss": 0.5779, + "step": 6768 + }, + { + "epoch": 0.8305521472392638, + "grad_norm": 0.9155847261726279, + "learning_rate": 1.4686704544997666e-06, + "loss": 0.5254, + "step": 6769 + }, + { + "epoch": 0.8306748466257668, + "grad_norm": 0.8728322613536593, + "learning_rate": 1.4665978219804056e-06, + "loss": 0.5427, + "step": 6770 + }, + { + "epoch": 0.83079754601227, + "grad_norm": 0.8708303866913919, + "learning_rate": 1.4645265372385574e-06, + "loss": 0.4923, + "step": 6771 + }, + { + "epoch": 0.830920245398773, + "grad_norm": 0.9500717051944281, + "learning_rate": 1.4624566006013596e-06, + "loss": 0.6013, + "step": 6772 + }, + { + "epoch": 0.8310429447852761, + "grad_norm": 0.8944277089442718, + "learning_rate": 1.4603880123957448e-06, + "loss": 0.5486, + "step": 6773 + }, + { + "epoch": 0.8311656441717792, + "grad_norm": 0.9085557891678718, + "learning_rate": 
1.458320772948426e-06, + "loss": 0.5919, + "step": 6774 + }, + { + "epoch": 0.8312883435582822, + "grad_norm": 0.8806119362583532, + "learning_rate": 1.4562548825859092e-06, + "loss": 0.5572, + "step": 6775 + }, + { + "epoch": 0.8314110429447853, + "grad_norm": 0.7662922687614963, + "learning_rate": 1.4541903416344783e-06, + "loss": 0.6125, + "step": 6776 + }, + { + "epoch": 0.8315337423312883, + "grad_norm": 0.8806468622702248, + "learning_rate": 1.452127150420216e-06, + "loss": 0.4205, + "step": 6777 + }, + { + "epoch": 0.8316564417177914, + "grad_norm": 0.8915762805385924, + "learning_rate": 1.450065309268982e-06, + "loss": 0.4866, + "step": 6778 + }, + { + "epoch": 0.8317791411042945, + "grad_norm": 0.9092139070724266, + "learning_rate": 1.4480048185064288e-06, + "loss": 0.5354, + "step": 6779 + }, + { + "epoch": 0.8319018404907975, + "grad_norm": 0.8274760090018434, + "learning_rate": 1.4459456784579917e-06, + "loss": 0.5585, + "step": 6780 + }, + { + "epoch": 0.8320245398773006, + "grad_norm": 0.9053116134504778, + "learning_rate": 1.4438878894488917e-06, + "loss": 0.4812, + "step": 6781 + }, + { + "epoch": 0.8321472392638036, + "grad_norm": 0.9714579671552708, + "learning_rate": 1.4418314518041432e-06, + "loss": 0.6113, + "step": 6782 + }, + { + "epoch": 0.8322699386503067, + "grad_norm": 0.7757944020385821, + "learning_rate": 1.4397763658485397e-06, + "loss": 0.5266, + "step": 6783 + }, + { + "epoch": 0.8323926380368099, + "grad_norm": 0.8535043352754723, + "learning_rate": 1.4377226319066673e-06, + "loss": 0.5685, + "step": 6784 + }, + { + "epoch": 0.8325153374233129, + "grad_norm": 0.8463098366177758, + "learning_rate": 1.435670250302892e-06, + "loss": 0.4859, + "step": 6785 + }, + { + "epoch": 0.832638036809816, + "grad_norm": 0.8936871511690065, + "learning_rate": 1.4336192213613742e-06, + "loss": 0.562, + "step": 6786 + }, + { + "epoch": 0.8327607361963191, + "grad_norm": 0.83022574242133, + "learning_rate": 1.4315695454060519e-06, + "loss": 0.5492, + "step": 6787 + }, + { + "epoch": 0.8328834355828221, + "grad_norm": 0.870746296461119, + "learning_rate": 1.4295212227606548e-06, + "loss": 0.5362, + "step": 6788 + }, + { + "epoch": 0.8330061349693252, + "grad_norm": 0.8239978980065945, + "learning_rate": 1.427474253748702e-06, + "loss": 0.5116, + "step": 6789 + }, + { + "epoch": 0.8331288343558282, + "grad_norm": 0.835847456010034, + "learning_rate": 1.425428638693489e-06, + "loss": 0.5051, + "step": 6790 + }, + { + "epoch": 0.8332515337423313, + "grad_norm": 0.8735526870102082, + "learning_rate": 1.4233843779181079e-06, + "loss": 0.535, + "step": 6791 + }, + { + "epoch": 0.8333742331288344, + "grad_norm": 0.8737089822616221, + "learning_rate": 1.4213414717454278e-06, + "loss": 0.5266, + "step": 6792 + }, + { + "epoch": 0.8334969325153374, + "grad_norm": 0.8548816123337662, + "learning_rate": 1.4192999204981128e-06, + "loss": 0.5376, + "step": 6793 + }, + { + "epoch": 0.8336196319018405, + "grad_norm": 0.8505925614060493, + "learning_rate": 1.417259724498603e-06, + "loss": 0.5585, + "step": 6794 + }, + { + "epoch": 0.8337423312883435, + "grad_norm": 0.924846030222089, + "learning_rate": 1.415220884069135e-06, + "loss": 0.5149, + "step": 6795 + }, + { + "epoch": 0.8338650306748466, + "grad_norm": 0.8406314410781507, + "learning_rate": 1.4131833995317212e-06, + "loss": 0.554, + "step": 6796 + }, + { + "epoch": 0.8339877300613497, + "grad_norm": 0.8859539646963046, + "learning_rate": 1.4111472712081698e-06, + "loss": 0.5476, + "step": 6797 + }, + { + "epoch": 0.8341104294478527, + 
"grad_norm": 1.0907743599689121, + "learning_rate": 1.4091124994200666e-06, + "loss": 0.556, + "step": 6798 + }, + { + "epoch": 0.8342331288343559, + "grad_norm": 0.8649955438330592, + "learning_rate": 1.407079084488786e-06, + "loss": 0.4836, + "step": 6799 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 0.830785272372693, + "learning_rate": 1.405047026735491e-06, + "loss": 0.5443, + "step": 6800 + }, + { + "epoch": 0.834478527607362, + "grad_norm": 1.0047462243077054, + "learning_rate": 1.4030163264811237e-06, + "loss": 0.5219, + "step": 6801 + }, + { + "epoch": 0.8346012269938651, + "grad_norm": 0.983808735495109, + "learning_rate": 1.4009869840464195e-06, + "loss": 0.5166, + "step": 6802 + }, + { + "epoch": 0.8347239263803681, + "grad_norm": 0.8929713543714688, + "learning_rate": 1.3989589997518937e-06, + "loss": 0.485, + "step": 6803 + }, + { + "epoch": 0.8348466257668712, + "grad_norm": 0.8127015427042539, + "learning_rate": 1.3969323739178497e-06, + "loss": 0.5315, + "step": 6804 + }, + { + "epoch": 0.8349693251533742, + "grad_norm": 0.9287303761939749, + "learning_rate": 1.394907106864375e-06, + "loss": 0.4571, + "step": 6805 + }, + { + "epoch": 0.8350920245398773, + "grad_norm": 0.8125326075536082, + "learning_rate": 1.3928831989113444e-06, + "loss": 0.5403, + "step": 6806 + }, + { + "epoch": 0.8352147239263804, + "grad_norm": 0.9523871535765622, + "learning_rate": 1.390860650378414e-06, + "loss": 0.5822, + "step": 6807 + }, + { + "epoch": 0.8353374233128834, + "grad_norm": 0.8893858157611106, + "learning_rate": 1.388839461585032e-06, + "loss": 0.5335, + "step": 6808 + }, + { + "epoch": 0.8354601226993865, + "grad_norm": 0.8023695324984063, + "learning_rate": 1.3868196328504258e-06, + "loss": 0.5329, + "step": 6809 + }, + { + "epoch": 0.8355828220858895, + "grad_norm": 1.0438815816811096, + "learning_rate": 1.3848011644936077e-06, + "loss": 0.5505, + "step": 6810 + }, + { + "epoch": 0.8357055214723926, + "grad_norm": 1.0671741971621354, + "learning_rate": 1.3827840568333816e-06, + "loss": 0.5499, + "step": 6811 + }, + { + "epoch": 0.8358282208588957, + "grad_norm": 0.8230600812763589, + "learning_rate": 1.3807683101883285e-06, + "loss": 0.531, + "step": 6812 + }, + { + "epoch": 0.8359509202453987, + "grad_norm": 0.9060028284145519, + "learning_rate": 1.3787539248768212e-06, + "loss": 0.4755, + "step": 6813 + }, + { + "epoch": 0.8360736196319019, + "grad_norm": 0.8634997237219246, + "learning_rate": 1.3767409012170119e-06, + "loss": 0.5302, + "step": 6814 + }, + { + "epoch": 0.8361963190184049, + "grad_norm": 0.9870242790895374, + "learning_rate": 1.3747292395268407e-06, + "loss": 0.5314, + "step": 6815 + }, + { + "epoch": 0.836319018404908, + "grad_norm": 0.8171943997605975, + "learning_rate": 1.372718940124036e-06, + "loss": 0.4843, + "step": 6816 + }, + { + "epoch": 0.8364417177914111, + "grad_norm": 0.8244113003305606, + "learning_rate": 1.3707100033261035e-06, + "loss": 0.541, + "step": 6817 + }, + { + "epoch": 0.8365644171779141, + "grad_norm": 0.9680147584552812, + "learning_rate": 1.3687024294503392e-06, + "loss": 0.5466, + "step": 6818 + }, + { + "epoch": 0.8366871165644172, + "grad_norm": 0.9367884152888448, + "learning_rate": 1.366696218813821e-06, + "loss": 0.5892, + "step": 6819 + }, + { + "epoch": 0.8368098159509203, + "grad_norm": 0.867292331092751, + "learning_rate": 1.3646913717334142e-06, + "loss": 0.4822, + "step": 6820 + }, + { + "epoch": 0.8369325153374233, + "grad_norm": 0.9020727067629887, + "learning_rate": 1.3626878885257644e-06, + "loss": 0.5133, + 
"step": 6821 + }, + { + "epoch": 0.8370552147239264, + "grad_norm": 0.8881429642859411, + "learning_rate": 1.3606857695073095e-06, + "loss": 0.4941, + "step": 6822 + }, + { + "epoch": 0.8371779141104294, + "grad_norm": 0.9415936075469074, + "learning_rate": 1.3586850149942598e-06, + "loss": 0.5588, + "step": 6823 + }, + { + "epoch": 0.8373006134969325, + "grad_norm": 0.844828918894492, + "learning_rate": 1.356685625302625e-06, + "loss": 0.5459, + "step": 6824 + }, + { + "epoch": 0.8374233128834356, + "grad_norm": 1.1313291250503814, + "learning_rate": 1.3546876007481847e-06, + "loss": 0.5393, + "step": 6825 + }, + { + "epoch": 0.8375460122699386, + "grad_norm": 0.834160622286363, + "learning_rate": 1.3526909416465162e-06, + "loss": 0.5228, + "step": 6826 + }, + { + "epoch": 0.8376687116564417, + "grad_norm": 0.9549680003433615, + "learning_rate": 1.3506956483129674e-06, + "loss": 0.5312, + "step": 6827 + }, + { + "epoch": 0.8377914110429447, + "grad_norm": 0.851619550342218, + "learning_rate": 1.3487017210626852e-06, + "loss": 0.5538, + "step": 6828 + }, + { + "epoch": 0.8379141104294479, + "grad_norm": 0.8913607296532218, + "learning_rate": 1.3467091602105886e-06, + "loss": 0.5156, + "step": 6829 + }, + { + "epoch": 0.838036809815951, + "grad_norm": 0.9991974846862147, + "learning_rate": 1.344717966071385e-06, + "loss": 0.4664, + "step": 6830 + }, + { + "epoch": 0.838159509202454, + "grad_norm": 0.9028647286475832, + "learning_rate": 1.3427281389595702e-06, + "loss": 0.5891, + "step": 6831 + }, + { + "epoch": 0.8382822085889571, + "grad_norm": 0.8066205697225681, + "learning_rate": 1.3407396791894156e-06, + "loss": 0.4987, + "step": 6832 + }, + { + "epoch": 0.8384049079754601, + "grad_norm": 0.8910481771480886, + "learning_rate": 1.3387525870749851e-06, + "loss": 0.5161, + "step": 6833 + }, + { + "epoch": 0.8385276073619632, + "grad_norm": 0.9338903650051112, + "learning_rate": 1.33676686293012e-06, + "loss": 0.5333, + "step": 6834 + }, + { + "epoch": 0.8386503067484663, + "grad_norm": 0.8606146649743837, + "learning_rate": 1.3347825070684518e-06, + "loss": 0.5502, + "step": 6835 + }, + { + "epoch": 0.8387730061349693, + "grad_norm": 0.8130618248044634, + "learning_rate": 1.3327995198033883e-06, + "loss": 0.481, + "step": 6836 + }, + { + "epoch": 0.8388957055214724, + "grad_norm": 1.1068712181954432, + "learning_rate": 1.3308179014481292e-06, + "loss": 0.5663, + "step": 6837 + }, + { + "epoch": 0.8390184049079754, + "grad_norm": 0.8358518929611956, + "learning_rate": 1.328837652315651e-06, + "loss": 0.5261, + "step": 6838 + }, + { + "epoch": 0.8391411042944785, + "grad_norm": 0.9539847193021758, + "learning_rate": 1.3268587727187198e-06, + "loss": 0.5963, + "step": 6839 + }, + { + "epoch": 0.8392638036809816, + "grad_norm": 1.0319837084119874, + "learning_rate": 1.3248812629698815e-06, + "loss": 0.5511, + "step": 6840 + }, + { + "epoch": 0.8393865030674846, + "grad_norm": 0.926516885232379, + "learning_rate": 1.3229051233814637e-06, + "loss": 0.5911, + "step": 6841 + }, + { + "epoch": 0.8395092024539877, + "grad_norm": 0.8294603295937173, + "learning_rate": 1.3209303542655837e-06, + "loss": 0.5178, + "step": 6842 + }, + { + "epoch": 0.8396319018404907, + "grad_norm": 0.9051568772237387, + "learning_rate": 1.3189569559341397e-06, + "loss": 0.537, + "step": 6843 + }, + { + "epoch": 0.8397546012269939, + "grad_norm": 0.8737827172284973, + "learning_rate": 1.3169849286988134e-06, + "loss": 0.5514, + "step": 6844 + }, + { + "epoch": 0.839877300613497, + "grad_norm": 0.8944251538554838, + 
"learning_rate": 1.3150142728710669e-06, + "loss": 0.5572, + "step": 6845 + }, + { + "epoch": 0.84, + "grad_norm": 1.1490556764718836, + "learning_rate": 1.313044988762151e-06, + "loss": 0.5235, + "step": 6846 + }, + { + "epoch": 0.8401226993865031, + "grad_norm": 0.8761231389259917, + "learning_rate": 1.3110770766830972e-06, + "loss": 0.5625, + "step": 6847 + }, + { + "epoch": 0.8402453987730061, + "grad_norm": 1.0173686209999009, + "learning_rate": 1.3091105369447166e-06, + "loss": 0.5743, + "step": 6848 + }, + { + "epoch": 0.8403680981595092, + "grad_norm": 1.1172230368496283, + "learning_rate": 1.307145369857612e-06, + "loss": 0.5697, + "step": 6849 + }, + { + "epoch": 0.8404907975460123, + "grad_norm": 0.8110082806855367, + "learning_rate": 1.3051815757321607e-06, + "loss": 0.5609, + "step": 6850 + }, + { + "epoch": 0.8406134969325153, + "grad_norm": 0.9043401145872848, + "learning_rate": 1.30321915487853e-06, + "loss": 0.5573, + "step": 6851 + }, + { + "epoch": 0.8407361963190184, + "grad_norm": 1.3332018810228483, + "learning_rate": 1.3012581076066654e-06, + "loss": 0.5399, + "step": 6852 + }, + { + "epoch": 0.8408588957055215, + "grad_norm": 0.8879906391010832, + "learning_rate": 1.2992984342262993e-06, + "loss": 0.501, + "step": 6853 + }, + { + "epoch": 0.8409815950920245, + "grad_norm": 0.8411306917267924, + "learning_rate": 1.2973401350469428e-06, + "loss": 0.5044, + "step": 6854 + }, + { + "epoch": 0.8411042944785276, + "grad_norm": 0.8118990348299885, + "learning_rate": 1.295383210377895e-06, + "loss": 0.5555, + "step": 6855 + }, + { + "epoch": 0.8412269938650306, + "grad_norm": 0.9183170238969355, + "learning_rate": 1.2934276605282336e-06, + "loss": 0.5887, + "step": 6856 + }, + { + "epoch": 0.8413496932515337, + "grad_norm": 0.9461916475697275, + "learning_rate": 1.2914734858068233e-06, + "loss": 0.5234, + "step": 6857 + }, + { + "epoch": 0.8414723926380369, + "grad_norm": 0.9532793973608866, + "learning_rate": 1.2895206865223065e-06, + "loss": 0.5633, + "step": 6858 + }, + { + "epoch": 0.8415950920245399, + "grad_norm": 0.8806229010528528, + "learning_rate": 1.2875692629831105e-06, + "loss": 0.5316, + "step": 6859 + }, + { + "epoch": 0.841717791411043, + "grad_norm": 0.9422880249896333, + "learning_rate": 1.2856192154974488e-06, + "loss": 0.5154, + "step": 6860 + }, + { + "epoch": 0.841840490797546, + "grad_norm": 0.9435118521407824, + "learning_rate": 1.2836705443733122e-06, + "loss": 0.5622, + "step": 6861 + }, + { + "epoch": 0.8419631901840491, + "grad_norm": 1.0871067098792178, + "learning_rate": 1.2817232499184784e-06, + "loss": 0.5145, + "step": 6862 + }, + { + "epoch": 0.8420858895705522, + "grad_norm": 0.7670190520310308, + "learning_rate": 1.2797773324405039e-06, + "loss": 0.5248, + "step": 6863 + }, + { + "epoch": 0.8422085889570552, + "grad_norm": 0.9895216803465211, + "learning_rate": 1.2778327922467327e-06, + "loss": 0.5909, + "step": 6864 + }, + { + "epoch": 0.8423312883435583, + "grad_norm": 1.0171182286773182, + "learning_rate": 1.2758896296442834e-06, + "loss": 0.562, + "step": 6865 + }, + { + "epoch": 0.8424539877300613, + "grad_norm": 0.8678374456199709, + "learning_rate": 1.2739478449400667e-06, + "loss": 0.5134, + "step": 6866 + }, + { + "epoch": 0.8425766871165644, + "grad_norm": 0.8461578835145293, + "learning_rate": 1.2720074384407667e-06, + "loss": 0.589, + "step": 6867 + }, + { + "epoch": 0.8426993865030675, + "grad_norm": 1.1748858016584651, + "learning_rate": 1.2700684104528582e-06, + "loss": 0.5917, + "step": 6868 + }, + { + "epoch": 
0.8428220858895705, + "grad_norm": 0.8464410794663499, + "learning_rate": 1.26813076128259e-06, + "loss": 0.5684, + "step": 6869 + }, + { + "epoch": 0.8429447852760736, + "grad_norm": 0.8323384954566982, + "learning_rate": 1.266194491235998e-06, + "loss": 0.5802, + "step": 6870 + }, + { + "epoch": 0.8430674846625766, + "grad_norm": 0.9126084659498658, + "learning_rate": 1.2642596006189034e-06, + "loss": 0.5676, + "step": 6871 + }, + { + "epoch": 0.8431901840490797, + "grad_norm": 0.900002978160435, + "learning_rate": 1.2623260897368994e-06, + "loss": 0.5676, + "step": 6872 + }, + { + "epoch": 0.8433128834355829, + "grad_norm": 0.8030392996013049, + "learning_rate": 1.2603939588953728e-06, + "loss": 0.475, + "step": 6873 + }, + { + "epoch": 0.8434355828220859, + "grad_norm": 0.8697911250186267, + "learning_rate": 1.258463208399483e-06, + "loss": 0.5698, + "step": 6874 + }, + { + "epoch": 0.843558282208589, + "grad_norm": 0.7368343985265526, + "learning_rate": 1.2565338385541792e-06, + "loss": 0.5362, + "step": 6875 + }, + { + "epoch": 0.843680981595092, + "grad_norm": 0.8274404807292491, + "learning_rate": 1.2546058496641843e-06, + "loss": 0.5202, + "step": 6876 + }, + { + "epoch": 0.8438036809815951, + "grad_norm": 1.0483363796803633, + "learning_rate": 1.2526792420340117e-06, + "loss": 0.5738, + "step": 6877 + }, + { + "epoch": 0.8439263803680982, + "grad_norm": 0.9102282961397405, + "learning_rate": 1.250754015967951e-06, + "loss": 0.4898, + "step": 6878 + }, + { + "epoch": 0.8440490797546012, + "grad_norm": 1.0082257958207332, + "learning_rate": 1.2488301717700735e-06, + "loss": 0.5556, + "step": 6879 + }, + { + "epoch": 0.8441717791411043, + "grad_norm": 0.9114365029375638, + "learning_rate": 1.2469077097442372e-06, + "loss": 0.5157, + "step": 6880 + }, + { + "epoch": 0.8442944785276074, + "grad_norm": 0.8889476975237984, + "learning_rate": 1.2449866301940738e-06, + "loss": 0.5142, + "step": 6881 + }, + { + "epoch": 0.8444171779141104, + "grad_norm": 0.9554255352816143, + "learning_rate": 1.2430669334230062e-06, + "loss": 0.5816, + "step": 6882 + }, + { + "epoch": 0.8445398773006135, + "grad_norm": 0.8969623527517774, + "learning_rate": 1.2411486197342293e-06, + "loss": 0.6037, + "step": 6883 + }, + { + "epoch": 0.8446625766871165, + "grad_norm": 0.8317560367856028, + "learning_rate": 1.23923168943073e-06, + "loss": 0.4997, + "step": 6884 + }, + { + "epoch": 0.8447852760736196, + "grad_norm": 0.824230945912552, + "learning_rate": 1.2373161428152647e-06, + "loss": 0.4959, + "step": 6885 + }, + { + "epoch": 0.8449079754601228, + "grad_norm": 0.8875344021949954, + "learning_rate": 1.2354019801903828e-06, + "loss": 0.5419, + "step": 6886 + }, + { + "epoch": 0.8450306748466258, + "grad_norm": 0.7500815229131853, + "learning_rate": 1.2334892018584055e-06, + "loss": 0.5579, + "step": 6887 + }, + { + "epoch": 0.8451533742331289, + "grad_norm": 1.1521273620076873, + "learning_rate": 1.2315778081214435e-06, + "loss": 0.512, + "step": 6888 + }, + { + "epoch": 0.8452760736196319, + "grad_norm": 0.8684468808486853, + "learning_rate": 1.2296677992813844e-06, + "loss": 0.4961, + "step": 6889 + }, + { + "epoch": 0.845398773006135, + "grad_norm": 0.8909396009280075, + "learning_rate": 1.2277591756398933e-06, + "loss": 0.5606, + "step": 6890 + }, + { + "epoch": 0.8455214723926381, + "grad_norm": 0.9076955706970559, + "learning_rate": 1.2258519374984268e-06, + "loss": 0.5068, + "step": 6891 + }, + { + "epoch": 0.8456441717791411, + "grad_norm": 0.8727811053683643, + "learning_rate": 
1.2239460851582118e-06, + "loss": 0.4995, + "step": 6892 + }, + { + "epoch": 0.8457668711656442, + "grad_norm": 0.8604156436705646, + "learning_rate": 1.2220416189202656e-06, + "loss": 0.5542, + "step": 6893 + }, + { + "epoch": 0.8458895705521472, + "grad_norm": 0.9345843902467692, + "learning_rate": 1.220138539085378e-06, + "loss": 0.5347, + "step": 6894 + }, + { + "epoch": 0.8460122699386503, + "grad_norm": 0.9775031634349522, + "learning_rate": 1.2182368459541294e-06, + "loss": 0.5387, + "step": 6895 + }, + { + "epoch": 0.8461349693251534, + "grad_norm": 0.8392328829341317, + "learning_rate": 1.21633653982687e-06, + "loss": 0.4999, + "step": 6896 + }, + { + "epoch": 0.8462576687116564, + "grad_norm": 0.9107165652713368, + "learning_rate": 1.21443762100374e-06, + "loss": 0.5674, + "step": 6897 + }, + { + "epoch": 0.8463803680981595, + "grad_norm": 0.8408291097307093, + "learning_rate": 1.2125400897846584e-06, + "loss": 0.5923, + "step": 6898 + }, + { + "epoch": 0.8465030674846625, + "grad_norm": 0.8576727823989307, + "learning_rate": 1.210643946469321e-06, + "loss": 0.5326, + "step": 6899 + }, + { + "epoch": 0.8466257668711656, + "grad_norm": 0.9223934393897382, + "learning_rate": 1.2087491913572103e-06, + "loss": 0.5327, + "step": 6900 + }, + { + "epoch": 0.8467484662576688, + "grad_norm": 0.9182927159470242, + "learning_rate": 1.2068558247475837e-06, + "loss": 0.5839, + "step": 6901 + }, + { + "epoch": 0.8468711656441718, + "grad_norm": 0.8300548349825813, + "learning_rate": 1.2049638469394854e-06, + "loss": 0.5974, + "step": 6902 + }, + { + "epoch": 0.8469938650306749, + "grad_norm": 0.8715515476850575, + "learning_rate": 1.203073258231733e-06, + "loss": 0.5631, + "step": 6903 + }, + { + "epoch": 0.8471165644171779, + "grad_norm": 0.8207539815954199, + "learning_rate": 1.2011840589229339e-06, + "loss": 0.5642, + "step": 6904 + }, + { + "epoch": 0.847239263803681, + "grad_norm": 1.000660333076164, + "learning_rate": 1.1992962493114645e-06, + "loss": 0.5614, + "step": 6905 + }, + { + "epoch": 0.8473619631901841, + "grad_norm": 1.0290587464958667, + "learning_rate": 1.197409829695495e-06, + "loss": 0.5689, + "step": 6906 + }, + { + "epoch": 0.8474846625766871, + "grad_norm": 0.9795523711891181, + "learning_rate": 1.1955248003729626e-06, + "loss": 0.5481, + "step": 6907 + }, + { + "epoch": 0.8476073619631902, + "grad_norm": 0.7991326706942767, + "learning_rate": 1.1936411616415966e-06, + "loss": 0.5276, + "step": 6908 + }, + { + "epoch": 0.8477300613496932, + "grad_norm": 0.8728100446659637, + "learning_rate": 1.1917589137989006e-06, + "loss": 0.5565, + "step": 6909 + }, + { + "epoch": 0.8478527607361963, + "grad_norm": 0.8600706712379023, + "learning_rate": 1.1898780571421554e-06, + "loss": 0.5581, + "step": 6910 + }, + { + "epoch": 0.8479754601226994, + "grad_norm": 0.8542392241492417, + "learning_rate": 1.1879985919684312e-06, + "loss": 0.5566, + "step": 6911 + }, + { + "epoch": 0.8480981595092024, + "grad_norm": 0.9703642899806496, + "learning_rate": 1.1861205185745694e-06, + "loss": 0.584, + "step": 6912 + }, + { + "epoch": 0.8482208588957055, + "grad_norm": 0.8842605560575844, + "learning_rate": 1.1842438372571996e-06, + "loss": 0.5223, + "step": 6913 + }, + { + "epoch": 0.8483435582822086, + "grad_norm": 0.8388442722651023, + "learning_rate": 1.1823685483127234e-06, + "loss": 0.5327, + "step": 6914 + }, + { + "epoch": 0.8484662576687116, + "grad_norm": 0.8655352797875301, + "learning_rate": 1.1804946520373307e-06, + "loss": 0.5211, + "step": 6915 + }, + { + "epoch": 
0.8485889570552148, + "grad_norm": 0.9059248061041579, + "learning_rate": 1.1786221487269822e-06, + "loss": 0.564, + "step": 6916 + }, + { + "epoch": 0.8487116564417178, + "grad_norm": 0.9293267915754222, + "learning_rate": 1.1767510386774294e-06, + "loss": 0.5512, + "step": 6917 + }, + { + "epoch": 0.8488343558282209, + "grad_norm": 0.9648736281194694, + "learning_rate": 1.1748813221841926e-06, + "loss": 0.551, + "step": 6918 + }, + { + "epoch": 0.848957055214724, + "grad_norm": 0.8175324714300974, + "learning_rate": 1.1730129995425831e-06, + "loss": 0.4987, + "step": 6919 + }, + { + "epoch": 0.849079754601227, + "grad_norm": 1.2255871046774047, + "learning_rate": 1.171146071047683e-06, + "loss": 0.5306, + "step": 6920 + }, + { + "epoch": 0.8492024539877301, + "grad_norm": 1.098141858567006, + "learning_rate": 1.169280536994355e-06, + "loss": 0.5419, + "step": 6921 + }, + { + "epoch": 0.8493251533742331, + "grad_norm": 0.8523736955079717, + "learning_rate": 1.1674163976772502e-06, + "loss": 0.6121, + "step": 6922 + }, + { + "epoch": 0.8494478527607362, + "grad_norm": 1.1798557297056957, + "learning_rate": 1.1655536533907874e-06, + "loss": 0.5545, + "step": 6923 + }, + { + "epoch": 0.8495705521472393, + "grad_norm": 0.8155220778319877, + "learning_rate": 1.163692304429177e-06, + "loss": 0.5984, + "step": 6924 + }, + { + "epoch": 0.8496932515337423, + "grad_norm": 0.8631063046954967, + "learning_rate": 1.161832351086396e-06, + "loss": 0.4929, + "step": 6925 + }, + { + "epoch": 0.8498159509202454, + "grad_norm": 1.0859949444566674, + "learning_rate": 1.159973793656215e-06, + "loss": 0.5688, + "step": 6926 + }, + { + "epoch": 0.8499386503067484, + "grad_norm": 1.125386339667288, + "learning_rate": 1.1581166324321759e-06, + "loss": 0.6271, + "step": 6927 + }, + { + "epoch": 0.8500613496932515, + "grad_norm": 0.8392369598243069, + "learning_rate": 1.156260867707596e-06, + "loss": 0.5764, + "step": 6928 + }, + { + "epoch": 0.8501840490797546, + "grad_norm": 0.8719557597644185, + "learning_rate": 1.1544064997755843e-06, + "loss": 0.5742, + "step": 6929 + }, + { + "epoch": 0.8503067484662576, + "grad_norm": 0.8992944116964621, + "learning_rate": 1.1525535289290168e-06, + "loss": 0.5304, + "step": 6930 + }, + { + "epoch": 0.8504294478527608, + "grad_norm": 0.8753545184031071, + "learning_rate": 1.1507019554605581e-06, + "loss": 0.5234, + "step": 6931 + }, + { + "epoch": 0.8505521472392638, + "grad_norm": 0.8072714574601205, + "learning_rate": 1.1488517796626453e-06, + "loss": 0.5174, + "step": 6932 + }, + { + "epoch": 0.8506748466257669, + "grad_norm": 0.8852865458337166, + "learning_rate": 1.1470030018275014e-06, + "loss": 0.5759, + "step": 6933 + }, + { + "epoch": 0.85079754601227, + "grad_norm": 0.8841572807156646, + "learning_rate": 1.1451556222471206e-06, + "loss": 0.5496, + "step": 6934 + }, + { + "epoch": 0.850920245398773, + "grad_norm": 1.057612268421732, + "learning_rate": 1.1433096412132838e-06, + "loss": 0.5603, + "step": 6935 + }, + { + "epoch": 0.8510429447852761, + "grad_norm": 0.9521677702026842, + "learning_rate": 1.1414650590175459e-06, + "loss": 0.5985, + "step": 6936 + }, + { + "epoch": 0.8511656441717791, + "grad_norm": 0.855936460801999, + "learning_rate": 1.1396218759512456e-06, + "loss": 0.5234, + "step": 6937 + }, + { + "epoch": 0.8512883435582822, + "grad_norm": 1.1071254528808954, + "learning_rate": 1.1377800923054961e-06, + "loss": 0.5415, + "step": 6938 + }, + { + "epoch": 0.8514110429447853, + "grad_norm": 0.8392269485190591, + "learning_rate": 1.1359397083711898e-06, 
+ "loss": 0.579, + "step": 6939 + }, + { + "epoch": 0.8515337423312883, + "grad_norm": 0.9511860068570326, + "learning_rate": 1.1341007244390023e-06, + "loss": 0.5386, + "step": 6940 + }, + { + "epoch": 0.8516564417177914, + "grad_norm": 0.9466500421822619, + "learning_rate": 1.132263140799381e-06, + "loss": 0.5971, + "step": 6941 + }, + { + "epoch": 0.8517791411042944, + "grad_norm": 0.9821888982981792, + "learning_rate": 1.1304269577425632e-06, + "loss": 0.5954, + "step": 6942 + }, + { + "epoch": 0.8519018404907975, + "grad_norm": 1.0289496590931517, + "learning_rate": 1.1285921755585504e-06, + "loss": 0.551, + "step": 6943 + }, + { + "epoch": 0.8520245398773006, + "grad_norm": 0.818281669578713, + "learning_rate": 1.1267587945371383e-06, + "loss": 0.5423, + "step": 6944 + }, + { + "epoch": 0.8521472392638036, + "grad_norm": 0.9100171489447638, + "learning_rate": 1.124926814967887e-06, + "loss": 0.5744, + "step": 6945 + }, + { + "epoch": 0.8522699386503068, + "grad_norm": 0.803903974911459, + "learning_rate": 1.1230962371401466e-06, + "loss": 0.5301, + "step": 6946 + }, + { + "epoch": 0.8523926380368099, + "grad_norm": 0.8578465489687388, + "learning_rate": 1.1212670613430388e-06, + "loss": 0.5529, + "step": 6947 + }, + { + "epoch": 0.8525153374233129, + "grad_norm": 0.8097947845217983, + "learning_rate": 1.1194392878654681e-06, + "loss": 0.5147, + "step": 6948 + }, + { + "epoch": 0.852638036809816, + "grad_norm": 0.8308673465437696, + "learning_rate": 1.1176129169961126e-06, + "loss": 0.5519, + "step": 6949 + }, + { + "epoch": 0.852760736196319, + "grad_norm": 0.8143484205140145, + "learning_rate": 1.1157879490234346e-06, + "loss": 0.552, + "step": 6950 + }, + { + "epoch": 0.8528834355828221, + "grad_norm": 1.0762123421438305, + "learning_rate": 1.113964384235673e-06, + "loss": 0.4809, + "step": 6951 + }, + { + "epoch": 0.8530061349693252, + "grad_norm": 0.817783106811496, + "learning_rate": 1.1121422229208368e-06, + "loss": 0.4616, + "step": 6952 + }, + { + "epoch": 0.8531288343558282, + "grad_norm": 0.9451310012169849, + "learning_rate": 1.1103214653667306e-06, + "loss": 0.5638, + "step": 6953 + }, + { + "epoch": 0.8532515337423313, + "grad_norm": 0.8337332494059526, + "learning_rate": 1.1085021118609207e-06, + "loss": 0.5852, + "step": 6954 + }, + { + "epoch": 0.8533742331288343, + "grad_norm": 0.7918998176248052, + "learning_rate": 1.1066841626907633e-06, + "loss": 0.5681, + "step": 6955 + }, + { + "epoch": 0.8534969325153374, + "grad_norm": 1.0765885596847191, + "learning_rate": 1.1048676181433837e-06, + "loss": 0.5624, + "step": 6956 + }, + { + "epoch": 0.8536196319018405, + "grad_norm": 0.8201097246293843, + "learning_rate": 1.1030524785056916e-06, + "loss": 0.5188, + "step": 6957 + }, + { + "epoch": 0.8537423312883435, + "grad_norm": 0.9873832291198879, + "learning_rate": 1.1012387440643735e-06, + "loss": 0.5592, + "step": 6958 + }, + { + "epoch": 0.8538650306748466, + "grad_norm": 0.8332917774923608, + "learning_rate": 1.0994264151058897e-06, + "loss": 0.5276, + "step": 6959 + }, + { + "epoch": 0.8539877300613496, + "grad_norm": 1.1002607576786572, + "learning_rate": 1.097615491916485e-06, + "loss": 0.5851, + "step": 6960 + }, + { + "epoch": 0.8541104294478528, + "grad_norm": 1.0770851931766396, + "learning_rate": 1.095805974782177e-06, + "loss": 0.5256, + "step": 6961 + }, + { + "epoch": 0.8542331288343559, + "grad_norm": 0.9780461932100392, + "learning_rate": 1.0939978639887661e-06, + "loss": 0.5814, + "step": 6962 + }, + { + "epoch": 0.8543558282208589, + "grad_norm": 
0.8708780997884171, + "learning_rate": 1.092191159821825e-06, + "loss": 0.5298, + "step": 6963 + }, + { + "epoch": 0.854478527607362, + "grad_norm": 0.9708714448455381, + "learning_rate": 1.0903858625667097e-06, + "loss": 0.5647, + "step": 6964 + }, + { + "epoch": 0.854601226993865, + "grad_norm": 0.8974118314580496, + "learning_rate": 1.088581972508549e-06, + "loss": 0.5042, + "step": 6965 + }, + { + "epoch": 0.8547239263803681, + "grad_norm": 0.928265483592387, + "learning_rate": 1.0867794899322537e-06, + "loss": 0.4928, + "step": 6966 + }, + { + "epoch": 0.8548466257668712, + "grad_norm": 0.7576119804424963, + "learning_rate": 1.0849784151225084e-06, + "loss": 0.5385, + "step": 6967 + }, + { + "epoch": 0.8549693251533742, + "grad_norm": 0.8601947896864326, + "learning_rate": 1.0831787483637812e-06, + "loss": 0.5568, + "step": 6968 + }, + { + "epoch": 0.8550920245398773, + "grad_norm": 0.9472786608731, + "learning_rate": 1.0813804899403101e-06, + "loss": 0.599, + "step": 6969 + }, + { + "epoch": 0.8552147239263803, + "grad_norm": 0.8828286453810454, + "learning_rate": 1.0795836401361148e-06, + "loss": 0.4918, + "step": 6970 + }, + { + "epoch": 0.8553374233128834, + "grad_norm": 0.8524038022414937, + "learning_rate": 1.0777881992349959e-06, + "loss": 0.5507, + "step": 6971 + }, + { + "epoch": 0.8554601226993865, + "grad_norm": 0.8998652550199303, + "learning_rate": 1.0759941675205221e-06, + "loss": 0.5249, + "step": 6972 + }, + { + "epoch": 0.8555828220858895, + "grad_norm": 0.933212004826475, + "learning_rate": 1.0742015452760512e-06, + "loss": 0.5585, + "step": 6973 + }, + { + "epoch": 0.8557055214723926, + "grad_norm": 0.8418538749124761, + "learning_rate": 1.072410332784708e-06, + "loss": 0.5345, + "step": 6974 + }, + { + "epoch": 0.8558282208588958, + "grad_norm": 0.8611512623684134, + "learning_rate": 1.0706205303294025e-06, + "loss": 0.4956, + "step": 6975 + }, + { + "epoch": 0.8559509202453988, + "grad_norm": 0.830814621390022, + "learning_rate": 1.0688321381928147e-06, + "loss": 0.5096, + "step": 6976 + }, + { + "epoch": 0.8560736196319019, + "grad_norm": 0.9403280820449206, + "learning_rate": 1.0670451566574102e-06, + "loss": 0.5383, + "step": 6977 + }, + { + "epoch": 0.8561963190184049, + "grad_norm": 1.038070019435437, + "learning_rate": 1.0652595860054237e-06, + "loss": 0.5418, + "step": 6978 + }, + { + "epoch": 0.856319018404908, + "grad_norm": 0.8593160201248756, + "learning_rate": 1.063475426518874e-06, + "loss": 0.5759, + "step": 6979 + }, + { + "epoch": 0.8564417177914111, + "grad_norm": 0.8173055309385752, + "learning_rate": 1.0616926784795511e-06, + "loss": 0.4975, + "step": 6980 + }, + { + "epoch": 0.8565644171779141, + "grad_norm": 0.8303500348803763, + "learning_rate": 1.0599113421690244e-06, + "loss": 0.4685, + "step": 6981 + }, + { + "epoch": 0.8566871165644172, + "grad_norm": 1.0337154075432378, + "learning_rate": 1.0581314178686442e-06, + "loss": 0.4856, + "step": 6982 + }, + { + "epoch": 0.8568098159509202, + "grad_norm": 0.8359004994718947, + "learning_rate": 1.0563529058595302e-06, + "loss": 0.5405, + "step": 6983 + }, + { + "epoch": 0.8569325153374233, + "grad_norm": 0.9272259986893939, + "learning_rate": 1.0545758064225864e-06, + "loss": 0.5506, + "step": 6984 + }, + { + "epoch": 0.8570552147239264, + "grad_norm": 0.8564854567701929, + "learning_rate": 1.0528001198384862e-06, + "loss": 0.5864, + "step": 6985 + }, + { + "epoch": 0.8571779141104294, + "grad_norm": 0.7963356237557425, + "learning_rate": 1.0510258463876899e-06, + "loss": 0.4719, + "step": 6986 
+ }, + { + "epoch": 0.8573006134969325, + "grad_norm": 0.936194826396358, + "learning_rate": 1.0492529863504253e-06, + "loss": 0.5541, + "step": 6987 + }, + { + "epoch": 0.8574233128834355, + "grad_norm": 1.623064055560171, + "learning_rate": 1.0474815400066984e-06, + "loss": 0.5871, + "step": 6988 + }, + { + "epoch": 0.8575460122699387, + "grad_norm": 0.885768818519405, + "learning_rate": 1.0457115076362978e-06, + "loss": 0.549, + "step": 6989 + }, + { + "epoch": 0.8576687116564418, + "grad_norm": 0.8838305349975877, + "learning_rate": 1.043942889518782e-06, + "loss": 0.5645, + "step": 6990 + }, + { + "epoch": 0.8577914110429448, + "grad_norm": 0.8034910606709506, + "learning_rate": 1.0421756859334908e-06, + "loss": 0.5109, + "step": 6991 + }, + { + "epoch": 0.8579141104294479, + "grad_norm": 0.8855384453324868, + "learning_rate": 1.0404098971595378e-06, + "loss": 0.5097, + "step": 6992 + }, + { + "epoch": 0.8580368098159509, + "grad_norm": 0.8315422870016285, + "learning_rate": 1.0386455234758152e-06, + "loss": 0.5052, + "step": 6993 + }, + { + "epoch": 0.858159509202454, + "grad_norm": 0.8212402701372251, + "learning_rate": 1.0368825651609893e-06, + "loss": 0.5594, + "step": 6994 + }, + { + "epoch": 0.8582822085889571, + "grad_norm": 0.8203223788006281, + "learning_rate": 1.035121022493506e-06, + "loss": 0.4948, + "step": 6995 + }, + { + "epoch": 0.8584049079754601, + "grad_norm": 0.9016479844623038, + "learning_rate": 1.033360895751584e-06, + "loss": 0.5234, + "step": 6996 + }, + { + "epoch": 0.8585276073619632, + "grad_norm": 0.896979036633696, + "learning_rate": 1.031602185213222e-06, + "loss": 0.5583, + "step": 6997 + }, + { + "epoch": 0.8586503067484662, + "grad_norm": 0.8448790832852231, + "learning_rate": 1.0298448911561899e-06, + "loss": 0.556, + "step": 6998 + }, + { + "epoch": 0.8587730061349693, + "grad_norm": 1.0427188082735714, + "learning_rate": 1.0280890138580425e-06, + "loss": 0.5375, + "step": 6999 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.9170293920851218, + "learning_rate": 1.026334553596101e-06, + "loss": 0.5202, + "step": 7000 + }, + { + "epoch": 0.8590184049079754, + "grad_norm": 1.040103278349523, + "learning_rate": 1.0245815106474677e-06, + "loss": 0.5623, + "step": 7001 + }, + { + "epoch": 0.8591411042944785, + "grad_norm": 0.870622816338693, + "learning_rate": 1.022829885289024e-06, + "loss": 0.5069, + "step": 7002 + }, + { + "epoch": 0.8592638036809815, + "grad_norm": 0.8612345399461827, + "learning_rate": 1.0210796777974196e-06, + "loss": 0.5315, + "step": 7003 + }, + { + "epoch": 0.8593865030674847, + "grad_norm": 0.8851071377641954, + "learning_rate": 1.0193308884490894e-06, + "loss": 0.4831, + "step": 7004 + }, + { + "epoch": 0.8595092024539878, + "grad_norm": 0.889493169118982, + "learning_rate": 1.0175835175202341e-06, + "loss": 0.5061, + "step": 7005 + }, + { + "epoch": 0.8596319018404908, + "grad_norm": 0.8401399642833383, + "learning_rate": 1.0158375652868425e-06, + "loss": 0.5142, + "step": 7006 + }, + { + "epoch": 0.8597546012269939, + "grad_norm": 0.784353453793335, + "learning_rate": 1.0140930320246667e-06, + "loss": 0.5571, + "step": 7007 + }, + { + "epoch": 0.859877300613497, + "grad_norm": 0.9391698053012294, + "learning_rate": 1.0123499180092422e-06, + "loss": 0.543, + "step": 7008 + }, + { + "epoch": 0.86, + "grad_norm": 0.896690340816686, + "learning_rate": 1.010608223515883e-06, + "loss": 0.4779, + "step": 7009 + }, + { + "epoch": 0.8601226993865031, + "grad_norm": 0.8171129408289278, + "learning_rate": 
1.0088679488196695e-06, + "loss": 0.4753, + "step": 7010 + }, + { + "epoch": 0.8602453987730061, + "grad_norm": 0.7560921935043973, + "learning_rate": 1.007129094195468e-06, + "loss": 0.5205, + "step": 7011 + }, + { + "epoch": 0.8603680981595092, + "grad_norm": 1.0115506930407507, + "learning_rate": 1.0053916599179104e-06, + "loss": 0.5684, + "step": 7012 + }, + { + "epoch": 0.8604907975460123, + "grad_norm": 0.9110920861701297, + "learning_rate": 1.0036556462614133e-06, + "loss": 0.5213, + "step": 7013 + }, + { + "epoch": 0.8606134969325153, + "grad_norm": 0.9195517681661372, + "learning_rate": 1.0019210535001633e-06, + "loss": 0.5545, + "step": 7014 + }, + { + "epoch": 0.8607361963190184, + "grad_norm": 0.9400395731757898, + "learning_rate": 1.0001878819081268e-06, + "loss": 0.5101, + "step": 7015 + }, + { + "epoch": 0.8608588957055214, + "grad_norm": 0.8747126960616377, + "learning_rate": 9.984561317590392e-07, + "loss": 0.5917, + "step": 7016 + }, + { + "epoch": 0.8609815950920245, + "grad_norm": 0.8303355812747572, + "learning_rate": 9.967258033264205e-07, + "loss": 0.5687, + "step": 7017 + }, + { + "epoch": 0.8611042944785277, + "grad_norm": 0.8623532190887733, + "learning_rate": 9.949968968835578e-07, + "loss": 0.56, + "step": 7018 + }, + { + "epoch": 0.8612269938650307, + "grad_norm": 0.8715083783989391, + "learning_rate": 9.932694127035169e-07, + "loss": 0.5901, + "step": 7019 + }, + { + "epoch": 0.8613496932515338, + "grad_norm": 0.8461951859797269, + "learning_rate": 9.91543351059141e-07, + "loss": 0.5126, + "step": 7020 + }, + { + "epoch": 0.8614723926380368, + "grad_norm": 0.9543099927859581, + "learning_rate": 9.89818712223044e-07, + "loss": 0.6198, + "step": 7021 + }, + { + "epoch": 0.8615950920245399, + "grad_norm": 0.8356040236876422, + "learning_rate": 9.880954964676226e-07, + "loss": 0.5857, + "step": 7022 + }, + { + "epoch": 0.861717791411043, + "grad_norm": 0.9042723301893344, + "learning_rate": 9.863737040650378e-07, + "loss": 0.5218, + "step": 7023 + }, + { + "epoch": 0.861840490797546, + "grad_norm": 0.8893797486398394, + "learning_rate": 9.846533352872368e-07, + "loss": 0.5209, + "step": 7024 + }, + { + "epoch": 0.8619631901840491, + "grad_norm": 1.052410535359856, + "learning_rate": 9.829343904059342e-07, + "loss": 0.5832, + "step": 7025 + }, + { + "epoch": 0.8620858895705521, + "grad_norm": 0.9692667202450497, + "learning_rate": 9.812168696926238e-07, + "loss": 0.5589, + "step": 7026 + }, + { + "epoch": 0.8622085889570552, + "grad_norm": 0.8956305855789364, + "learning_rate": 9.795007734185724e-07, + "loss": 0.5713, + "step": 7027 + }, + { + "epoch": 0.8623312883435583, + "grad_norm": 0.9771877196957063, + "learning_rate": 9.77786101854825e-07, + "loss": 0.5698, + "step": 7028 + }, + { + "epoch": 0.8624539877300613, + "grad_norm": 0.99811463279742, + "learning_rate": 9.760728552721955e-07, + "loss": 0.5909, + "step": 7029 + }, + { + "epoch": 0.8625766871165644, + "grad_norm": 0.9145079961768467, + "learning_rate": 9.743610339412801e-07, + "loss": 0.6024, + "step": 7030 + }, + { + "epoch": 0.8626993865030674, + "grad_norm": 0.794269955263444, + "learning_rate": 9.726506381324451e-07, + "loss": 0.5494, + "step": 7031 + }, + { + "epoch": 0.8628220858895705, + "grad_norm": 1.0475267276841336, + "learning_rate": 9.709416681158301e-07, + "loss": 0.5918, + "step": 7032 + }, + { + "epoch": 0.8629447852760737, + "grad_norm": 0.8544009296628897, + "learning_rate": 9.692341241613567e-07, + "loss": 0.5271, + "step": 7033 + }, + { + "epoch": 0.8630674846625767, + "grad_norm": 
0.8806839929718743, + "learning_rate": 9.675280065387117e-07, + "loss": 0.5735, + "step": 7034 + }, + { + "epoch": 0.8631901840490798, + "grad_norm": 0.8978612849678443, + "learning_rate": 9.658233155173657e-07, + "loss": 0.545, + "step": 7035 + }, + { + "epoch": 0.8633128834355828, + "grad_norm": 0.9151879264978947, + "learning_rate": 9.641200513665605e-07, + "loss": 0.535, + "step": 7036 + }, + { + "epoch": 0.8634355828220859, + "grad_norm": 0.8525584190521983, + "learning_rate": 9.624182143553073e-07, + "loss": 0.5532, + "step": 7037 + }, + { + "epoch": 0.863558282208589, + "grad_norm": 0.8073584924513302, + "learning_rate": 9.607178047524024e-07, + "loss": 0.5572, + "step": 7038 + }, + { + "epoch": 0.863680981595092, + "grad_norm": 0.7866567228752965, + "learning_rate": 9.590188228264064e-07, + "loss": 0.5917, + "step": 7039 + }, + { + "epoch": 0.8638036809815951, + "grad_norm": 0.8390731160902485, + "learning_rate": 9.573212688456635e-07, + "loss": 0.5562, + "step": 7040 + }, + { + "epoch": 0.8639263803680982, + "grad_norm": 0.858017267609799, + "learning_rate": 9.556251430782816e-07, + "loss": 0.5128, + "step": 7041 + }, + { + "epoch": 0.8640490797546012, + "grad_norm": 0.8334277974615858, + "learning_rate": 9.53930445792155e-07, + "loss": 0.5369, + "step": 7042 + }, + { + "epoch": 0.8641717791411043, + "grad_norm": 0.8625244286878716, + "learning_rate": 9.522371772549421e-07, + "loss": 0.5696, + "step": 7043 + }, + { + "epoch": 0.8642944785276073, + "grad_norm": 0.9758662098857653, + "learning_rate": 9.505453377340834e-07, + "loss": 0.5742, + "step": 7044 + }, + { + "epoch": 0.8644171779141104, + "grad_norm": 0.8285482629400578, + "learning_rate": 9.488549274967873e-07, + "loss": 0.5263, + "step": 7045 + }, + { + "epoch": 0.8645398773006135, + "grad_norm": 0.8295399862451146, + "learning_rate": 9.471659468100436e-07, + "loss": 0.5789, + "step": 7046 + }, + { + "epoch": 0.8646625766871165, + "grad_norm": 0.8652214110330964, + "learning_rate": 9.454783959406089e-07, + "loss": 0.5524, + "step": 7047 + }, + { + "epoch": 0.8647852760736197, + "grad_norm": 0.8884198462414112, + "learning_rate": 9.437922751550188e-07, + "loss": 0.5584, + "step": 7048 + }, + { + "epoch": 0.8649079754601227, + "grad_norm": 0.8090166187731133, + "learning_rate": 9.421075847195827e-07, + "loss": 0.4664, + "step": 7049 + }, + { + "epoch": 0.8650306748466258, + "grad_norm": 0.8536034367812615, + "learning_rate": 9.404243249003786e-07, + "loss": 0.52, + "step": 7050 + }, + { + "epoch": 0.8651533742331289, + "grad_norm": 1.1109236390148285, + "learning_rate": 9.387424959632685e-07, + "loss": 0.5667, + "step": 7051 + }, + { + "epoch": 0.8652760736196319, + "grad_norm": 1.0112861677740344, + "learning_rate": 9.370620981738776e-07, + "loss": 0.5206, + "step": 7052 + }, + { + "epoch": 0.865398773006135, + "grad_norm": 0.8102561040856545, + "learning_rate": 9.353831317976148e-07, + "loss": 0.5051, + "step": 7053 + }, + { + "epoch": 0.865521472392638, + "grad_norm": 0.9072442247289817, + "learning_rate": 9.337055970996545e-07, + "loss": 0.5258, + "step": 7054 + }, + { + "epoch": 0.8656441717791411, + "grad_norm": 0.8880143907414202, + "learning_rate": 9.320294943449537e-07, + "loss": 0.5775, + "step": 7055 + }, + { + "epoch": 0.8657668711656442, + "grad_norm": 0.8388693373851239, + "learning_rate": 9.30354823798233e-07, + "loss": 0.5227, + "step": 7056 + }, + { + "epoch": 0.8658895705521472, + "grad_norm": 0.9191401232010848, + "learning_rate": 9.286815857239961e-07, + "loss": 0.4743, + "step": 7057 + }, + { + "epoch": 
0.8660122699386503, + "grad_norm": 0.833742134715623, + "learning_rate": 9.270097803865141e-07, + "loss": 0.4448, + "step": 7058 + }, + { + "epoch": 0.8661349693251533, + "grad_norm": 1.3770161136483023, + "learning_rate": 9.253394080498368e-07, + "loss": 0.5231, + "step": 7059 + }, + { + "epoch": 0.8662576687116564, + "grad_norm": 0.8449795475079193, + "learning_rate": 9.236704689777842e-07, + "loss": 0.4777, + "step": 7060 + }, + { + "epoch": 0.8663803680981595, + "grad_norm": 0.7389167320359757, + "learning_rate": 9.220029634339489e-07, + "loss": 0.5391, + "step": 7061 + }, + { + "epoch": 0.8665030674846625, + "grad_norm": 0.8604106845748556, + "learning_rate": 9.203368916817012e-07, + "loss": 0.5319, + "step": 7062 + }, + { + "epoch": 0.8666257668711657, + "grad_norm": 0.9172959947945968, + "learning_rate": 9.186722539841797e-07, + "loss": 0.4913, + "step": 7063 + }, + { + "epoch": 0.8667484662576687, + "grad_norm": 0.8015060909815277, + "learning_rate": 9.170090506043061e-07, + "loss": 0.4915, + "step": 7064 + }, + { + "epoch": 0.8668711656441718, + "grad_norm": 1.0060061630917225, + "learning_rate": 9.153472818047627e-07, + "loss": 0.5494, + "step": 7065 + }, + { + "epoch": 0.8669938650306749, + "grad_norm": 0.831549040217711, + "learning_rate": 9.136869478480159e-07, + "loss": 0.532, + "step": 7066 + }, + { + "epoch": 0.8671165644171779, + "grad_norm": 0.8579199784204135, + "learning_rate": 9.120280489962985e-07, + "loss": 0.5501, + "step": 7067 + }, + { + "epoch": 0.867239263803681, + "grad_norm": 0.9488188341345584, + "learning_rate": 9.103705855116196e-07, + "loss": 0.5469, + "step": 7068 + }, + { + "epoch": 0.8673619631901841, + "grad_norm": 0.8911845104507523, + "learning_rate": 9.087145576557621e-07, + "loss": 0.4864, + "step": 7069 + }, + { + "epoch": 0.8674846625766871, + "grad_norm": 0.8056166476310836, + "learning_rate": 9.070599656902801e-07, + "loss": 0.589, + "step": 7070 + }, + { + "epoch": 0.8676073619631902, + "grad_norm": 0.9099280730293839, + "learning_rate": 9.054068098765056e-07, + "loss": 0.5765, + "step": 7071 + }, + { + "epoch": 0.8677300613496932, + "grad_norm": 0.8735595407193677, + "learning_rate": 9.037550904755355e-07, + "loss": 0.5656, + "step": 7072 + }, + { + "epoch": 0.8678527607361963, + "grad_norm": 0.8915226613437626, + "learning_rate": 9.021048077482486e-07, + "loss": 0.4497, + "step": 7073 + }, + { + "epoch": 0.8679754601226994, + "grad_norm": 0.8358561872263173, + "learning_rate": 9.004559619552899e-07, + "loss": 0.4254, + "step": 7074 + }, + { + "epoch": 0.8680981595092024, + "grad_norm": 0.8436232245917917, + "learning_rate": 8.988085533570833e-07, + "loss": 0.4781, + "step": 7075 + }, + { + "epoch": 0.8682208588957055, + "grad_norm": 0.8380002414896118, + "learning_rate": 8.971625822138197e-07, + "loss": 0.5863, + "step": 7076 + }, + { + "epoch": 0.8683435582822085, + "grad_norm": 0.8817499323356982, + "learning_rate": 8.955180487854698e-07, + "loss": 0.5789, + "step": 7077 + }, + { + "epoch": 0.8684662576687117, + "grad_norm": 0.7661589467229551, + "learning_rate": 8.938749533317703e-07, + "loss": 0.4725, + "step": 7078 + }, + { + "epoch": 0.8685889570552148, + "grad_norm": 0.8750030453744679, + "learning_rate": 8.92233296112236e-07, + "loss": 0.5644, + "step": 7079 + }, + { + "epoch": 0.8687116564417178, + "grad_norm": 0.9268922248936463, + "learning_rate": 8.905930773861527e-07, + "loss": 0.5304, + "step": 7080 + }, + { + "epoch": 0.8688343558282209, + "grad_norm": 0.9097190724611974, + "learning_rate": 8.889542974125753e-07, + "loss": 
0.5298, + "step": 7081 + }, + { + "epoch": 0.8689570552147239, + "grad_norm": 0.9384025767117816, + "learning_rate": 8.873169564503392e-07, + "loss": 0.494, + "step": 7082 + }, + { + "epoch": 0.869079754601227, + "grad_norm": 0.8313984792607836, + "learning_rate": 8.856810547580452e-07, + "loss": 0.5701, + "step": 7083 + }, + { + "epoch": 0.8692024539877301, + "grad_norm": 0.877907714580634, + "learning_rate": 8.840465925940733e-07, + "loss": 0.5326, + "step": 7084 + }, + { + "epoch": 0.8693251533742331, + "grad_norm": 0.9283670302230844, + "learning_rate": 8.824135702165693e-07, + "loss": 0.5693, + "step": 7085 + }, + { + "epoch": 0.8694478527607362, + "grad_norm": 0.7700118507342533, + "learning_rate": 8.807819878834578e-07, + "loss": 0.5566, + "step": 7086 + }, + { + "epoch": 0.8695705521472392, + "grad_norm": 0.8772327554456935, + "learning_rate": 8.791518458524307e-07, + "loss": 0.4907, + "step": 7087 + }, + { + "epoch": 0.8696932515337423, + "grad_norm": 0.8546109911838545, + "learning_rate": 8.775231443809574e-07, + "loss": 0.4972, + "step": 7088 + }, + { + "epoch": 0.8698159509202454, + "grad_norm": 0.8605874886772109, + "learning_rate": 8.758958837262754e-07, + "loss": 0.5464, + "step": 7089 + }, + { + "epoch": 0.8699386503067484, + "grad_norm": 0.9975393995475681, + "learning_rate": 8.74270064145396e-07, + "loss": 0.5641, + "step": 7090 + }, + { + "epoch": 0.8700613496932516, + "grad_norm": 0.8155240540654124, + "learning_rate": 8.726456858951082e-07, + "loss": 0.5182, + "step": 7091 + }, + { + "epoch": 0.8701840490797546, + "grad_norm": 0.8827685428928541, + "learning_rate": 8.710227492319622e-07, + "loss": 0.5945, + "step": 7092 + }, + { + "epoch": 0.8703067484662577, + "grad_norm": 0.8679507161155494, + "learning_rate": 8.694012544122909e-07, + "loss": 0.5547, + "step": 7093 + }, + { + "epoch": 0.8704294478527608, + "grad_norm": 0.8726354684144337, + "learning_rate": 8.677812016921938e-07, + "loss": 0.5279, + "step": 7094 + }, + { + "epoch": 0.8705521472392638, + "grad_norm": 0.8297812137595807, + "learning_rate": 8.661625913275463e-07, + "loss": 0.5334, + "step": 7095 + }, + { + "epoch": 0.8706748466257669, + "grad_norm": 1.1291075637150083, + "learning_rate": 8.645454235739903e-07, + "loss": 0.5387, + "step": 7096 + }, + { + "epoch": 0.8707975460122699, + "grad_norm": 0.8350057318239398, + "learning_rate": 8.629296986869473e-07, + "loss": 0.5379, + "step": 7097 + }, + { + "epoch": 0.870920245398773, + "grad_norm": 0.7694545959122362, + "learning_rate": 8.613154169216065e-07, + "loss": 0.5656, + "step": 7098 + }, + { + "epoch": 0.8710429447852761, + "grad_norm": 0.8511618629176536, + "learning_rate": 8.597025785329271e-07, + "loss": 0.5205, + "step": 7099 + }, + { + "epoch": 0.8711656441717791, + "grad_norm": 0.7839051656450572, + "learning_rate": 8.580911837756467e-07, + "loss": 0.5564, + "step": 7100 + }, + { + "epoch": 0.8712883435582822, + "grad_norm": 0.959781863975948, + "learning_rate": 8.564812329042671e-07, + "loss": 0.5288, + "step": 7101 + }, + { + "epoch": 0.8714110429447853, + "grad_norm": 0.8316705262016714, + "learning_rate": 8.548727261730705e-07, + "loss": 0.4948, + "step": 7102 + }, + { + "epoch": 0.8715337423312883, + "grad_norm": 0.8737607130632717, + "learning_rate": 8.532656638361025e-07, + "loss": 0.5249, + "step": 7103 + }, + { + "epoch": 0.8716564417177914, + "grad_norm": 0.9986816924982682, + "learning_rate": 8.516600461471902e-07, + "loss": 0.5627, + "step": 7104 + }, + { + "epoch": 0.8717791411042944, + "grad_norm": 1.1090281769857395, + 
"learning_rate": 8.500558733599206e-07, + "loss": 0.6003, + "step": 7105 + }, + { + "epoch": 0.8719018404907976, + "grad_norm": 0.830973769266513, + "learning_rate": 8.484531457276657e-07, + "loss": 0.4989, + "step": 7106 + }, + { + "epoch": 0.8720245398773007, + "grad_norm": 1.1067731900386881, + "learning_rate": 8.468518635035561e-07, + "loss": 0.5412, + "step": 7107 + }, + { + "epoch": 0.8721472392638037, + "grad_norm": 0.871640775702948, + "learning_rate": 8.452520269405062e-07, + "loss": 0.5323, + "step": 7108 + }, + { + "epoch": 0.8722699386503068, + "grad_norm": 0.8400170428462626, + "learning_rate": 8.43653636291194e-07, + "loss": 0.5407, + "step": 7109 + }, + { + "epoch": 0.8723926380368098, + "grad_norm": 0.8344005864618846, + "learning_rate": 8.420566918080686e-07, + "loss": 0.5726, + "step": 7110 + }, + { + "epoch": 0.8725153374233129, + "grad_norm": 0.9168474022905813, + "learning_rate": 8.404611937433594e-07, + "loss": 0.5444, + "step": 7111 + }, + { + "epoch": 0.872638036809816, + "grad_norm": 0.9140905513433688, + "learning_rate": 8.388671423490568e-07, + "loss": 0.5584, + "step": 7112 + }, + { + "epoch": 0.872760736196319, + "grad_norm": 0.7960596920107944, + "learning_rate": 8.37274537876931e-07, + "loss": 0.4874, + "step": 7113 + }, + { + "epoch": 0.8728834355828221, + "grad_norm": 0.8696472744205076, + "learning_rate": 8.356833805785169e-07, + "loss": 0.5602, + "step": 7114 + }, + { + "epoch": 0.8730061349693251, + "grad_norm": 0.7683626761880956, + "learning_rate": 8.340936707051273e-07, + "loss": 0.5038, + "step": 7115 + }, + { + "epoch": 0.8731288343558282, + "grad_norm": 0.8272288341853847, + "learning_rate": 8.325054085078399e-07, + "loss": 0.6258, + "step": 7116 + }, + { + "epoch": 0.8732515337423313, + "grad_norm": 0.8549211007119751, + "learning_rate": 8.309185942375109e-07, + "loss": 0.5172, + "step": 7117 + }, + { + "epoch": 0.8733742331288343, + "grad_norm": 0.8731742615554199, + "learning_rate": 8.293332281447608e-07, + "loss": 0.5374, + "step": 7118 + }, + { + "epoch": 0.8734969325153374, + "grad_norm": 0.8502041512272449, + "learning_rate": 8.27749310479985e-07, + "loss": 0.5028, + "step": 7119 + }, + { + "epoch": 0.8736196319018404, + "grad_norm": 0.9453032018521413, + "learning_rate": 8.261668414933521e-07, + "loss": 0.5351, + "step": 7120 + }, + { + "epoch": 0.8737423312883436, + "grad_norm": 0.7951029696152805, + "learning_rate": 8.245858214347969e-07, + "loss": 0.5321, + "step": 7121 + }, + { + "epoch": 0.8738650306748467, + "grad_norm": 0.8577030716600735, + "learning_rate": 8.230062505540292e-07, + "loss": 0.5914, + "step": 7122 + }, + { + "epoch": 0.8739877300613497, + "grad_norm": 0.8417138472734071, + "learning_rate": 8.214281291005277e-07, + "loss": 0.4638, + "step": 7123 + }, + { + "epoch": 0.8741104294478528, + "grad_norm": 0.8220267733690095, + "learning_rate": 8.19851457323545e-07, + "loss": 0.5538, + "step": 7124 + }, + { + "epoch": 0.8742331288343558, + "grad_norm": 0.9749381319064273, + "learning_rate": 8.182762354720985e-07, + "loss": 0.5293, + "step": 7125 + }, + { + "epoch": 0.8743558282208589, + "grad_norm": 0.9094767713564798, + "learning_rate": 8.167024637949872e-07, + "loss": 0.5667, + "step": 7126 + }, + { + "epoch": 0.874478527607362, + "grad_norm": 0.8820753729784686, + "learning_rate": 8.151301425407699e-07, + "loss": 0.5477, + "step": 7127 + }, + { + "epoch": 0.874601226993865, + "grad_norm": 0.8929184349316969, + "learning_rate": 8.135592719577833e-07, + "loss": 0.4626, + "step": 7128 + }, + { + "epoch": 0.8747239263803681, + 
"grad_norm": 1.2066533500364884, + "learning_rate": 8.119898522941338e-07, + "loss": 0.5085, + "step": 7129 + }, + { + "epoch": 0.8748466257668711, + "grad_norm": 0.8136981850958541, + "learning_rate": 8.10421883797694e-07, + "loss": 0.5447, + "step": 7130 + }, + { + "epoch": 0.8749693251533742, + "grad_norm": 1.0337488428272104, + "learning_rate": 8.088553667161158e-07, + "loss": 0.5414, + "step": 7131 + }, + { + "epoch": 0.8750920245398773, + "grad_norm": 0.8658925434879352, + "learning_rate": 8.072903012968125e-07, + "loss": 0.5447, + "step": 7132 + }, + { + "epoch": 0.8752147239263803, + "grad_norm": 0.9433818596233657, + "learning_rate": 8.057266877869763e-07, + "loss": 0.6063, + "step": 7133 + }, + { + "epoch": 0.8753374233128834, + "grad_norm": 0.8915008050457346, + "learning_rate": 8.04164526433564e-07, + "loss": 0.5215, + "step": 7134 + }, + { + "epoch": 0.8754601226993866, + "grad_norm": 0.8985718087372805, + "learning_rate": 8.026038174833085e-07, + "loss": 0.5192, + "step": 7135 + }, + { + "epoch": 0.8755828220858896, + "grad_norm": 0.9284207154539359, + "learning_rate": 8.010445611827067e-07, + "loss": 0.5059, + "step": 7136 + }, + { + "epoch": 0.8757055214723927, + "grad_norm": 0.9477178011344192, + "learning_rate": 7.994867577780341e-07, + "loss": 0.5055, + "step": 7137 + }, + { + "epoch": 0.8758282208588957, + "grad_norm": 0.873785287516004, + "learning_rate": 7.979304075153271e-07, + "loss": 0.5417, + "step": 7138 + }, + { + "epoch": 0.8759509202453988, + "grad_norm": 0.6955144222982487, + "learning_rate": 7.963755106404036e-07, + "loss": 0.56, + "step": 7139 + }, + { + "epoch": 0.8760736196319019, + "grad_norm": 0.8281196646811245, + "learning_rate": 7.948220673988427e-07, + "loss": 0.466, + "step": 7140 + }, + { + "epoch": 0.8761963190184049, + "grad_norm": 0.8123011682517706, + "learning_rate": 7.932700780359959e-07, + "loss": 0.5532, + "step": 7141 + }, + { + "epoch": 0.876319018404908, + "grad_norm": 1.0746707666304682, + "learning_rate": 7.917195427969904e-07, + "loss": 0.5157, + "step": 7142 + }, + { + "epoch": 0.876441717791411, + "grad_norm": 0.9103783754450562, + "learning_rate": 7.90170461926717e-07, + "loss": 0.5678, + "step": 7143 + }, + { + "epoch": 0.8765644171779141, + "grad_norm": 0.9854238007949769, + "learning_rate": 7.886228356698422e-07, + "loss": 0.5432, + "step": 7144 + }, + { + "epoch": 0.8766871165644172, + "grad_norm": 0.8145957601213488, + "learning_rate": 7.87076664270795e-07, + "loss": 0.5581, + "step": 7145 + }, + { + "epoch": 0.8768098159509202, + "grad_norm": 0.953963339914824, + "learning_rate": 7.855319479737877e-07, + "loss": 0.5532, + "step": 7146 + }, + { + "epoch": 0.8769325153374233, + "grad_norm": 0.8623639917830896, + "learning_rate": 7.839886870227909e-07, + "loss": 0.5329, + "step": 7147 + }, + { + "epoch": 0.8770552147239263, + "grad_norm": 0.9459485819247287, + "learning_rate": 7.824468816615482e-07, + "loss": 0.5131, + "step": 7148 + }, + { + "epoch": 0.8771779141104294, + "grad_norm": 1.3814313632729924, + "learning_rate": 7.809065321335774e-07, + "loss": 0.5477, + "step": 7149 + }, + { + "epoch": 0.8773006134969326, + "grad_norm": 0.8709552092679929, + "learning_rate": 7.793676386821602e-07, + "loss": 0.4935, + "step": 7150 + }, + { + "epoch": 0.8774233128834356, + "grad_norm": 0.9289368054196445, + "learning_rate": 7.778302015503547e-07, + "loss": 0.519, + "step": 7151 + }, + { + "epoch": 0.8775460122699387, + "grad_norm": 0.8469028090644883, + "learning_rate": 7.762942209809831e-07, + "loss": 0.4707, + "step": 7152 + }, + { 
+ "epoch": 0.8776687116564417, + "grad_norm": 0.9562863052748979, + "learning_rate": 7.747596972166426e-07, + "loss": 0.543, + "step": 7153 + }, + { + "epoch": 0.8777914110429448, + "grad_norm": 0.9202066184253018, + "learning_rate": 7.732266304996971e-07, + "loss": 0.5274, + "step": 7154 + }, + { + "epoch": 0.8779141104294479, + "grad_norm": 1.002238366114945, + "learning_rate": 7.716950210722818e-07, + "loss": 0.5507, + "step": 7155 + }, + { + "epoch": 0.8780368098159509, + "grad_norm": 0.9288915367105812, + "learning_rate": 7.701648691762997e-07, + "loss": 0.5305, + "step": 7156 + }, + { + "epoch": 0.878159509202454, + "grad_norm": 0.8750628258196377, + "learning_rate": 7.686361750534266e-07, + "loss": 0.5095, + "step": 7157 + }, + { + "epoch": 0.878282208588957, + "grad_norm": 0.9370551026151221, + "learning_rate": 7.671089389451059e-07, + "loss": 0.5124, + "step": 7158 + }, + { + "epoch": 0.8784049079754601, + "grad_norm": 0.821247246154517, + "learning_rate": 7.655831610925535e-07, + "loss": 0.5325, + "step": 7159 + }, + { + "epoch": 0.8785276073619632, + "grad_norm": 0.8484149844074282, + "learning_rate": 7.6405884173675e-07, + "loss": 0.5721, + "step": 7160 + }, + { + "epoch": 0.8786503067484662, + "grad_norm": 0.7988301437457307, + "learning_rate": 7.625359811184496e-07, + "loss": 0.54, + "step": 7161 + }, + { + "epoch": 0.8787730061349693, + "grad_norm": 0.8533234896783398, + "learning_rate": 7.610145794781753e-07, + "loss": 0.5747, + "step": 7162 + }, + { + "epoch": 0.8788957055214724, + "grad_norm": 0.8030512881713802, + "learning_rate": 7.594946370562184e-07, + "loss": 0.5782, + "step": 7163 + }, + { + "epoch": 0.8790184049079754, + "grad_norm": 0.8591010534738366, + "learning_rate": 7.579761540926434e-07, + "loss": 0.5148, + "step": 7164 + }, + { + "epoch": 0.8791411042944786, + "grad_norm": 0.7924893830882056, + "learning_rate": 7.564591308272773e-07, + "loss": 0.5544, + "step": 7165 + }, + { + "epoch": 0.8792638036809816, + "grad_norm": 0.8094652473497129, + "learning_rate": 7.549435674997252e-07, + "loss": 0.5532, + "step": 7166 + }, + { + "epoch": 0.8793865030674847, + "grad_norm": 0.8907730494062406, + "learning_rate": 7.534294643493545e-07, + "loss": 0.5156, + "step": 7167 + }, + { + "epoch": 0.8795092024539878, + "grad_norm": 0.9868443018253313, + "learning_rate": 7.519168216153061e-07, + "loss": 0.5503, + "step": 7168 + }, + { + "epoch": 0.8796319018404908, + "grad_norm": 0.8136491758892466, + "learning_rate": 7.504056395364879e-07, + "loss": 0.5291, + "step": 7169 + }, + { + "epoch": 0.8797546012269939, + "grad_norm": 0.9078202832206522, + "learning_rate": 7.488959183515809e-07, + "loss": 0.505, + "step": 7170 + }, + { + "epoch": 0.8798773006134969, + "grad_norm": 0.9707741991157871, + "learning_rate": 7.473876582990292e-07, + "loss": 0.5676, + "step": 7171 + }, + { + "epoch": 0.88, + "grad_norm": 0.856872514548765, + "learning_rate": 7.458808596170497e-07, + "loss": 0.5133, + "step": 7172 + }, + { + "epoch": 0.8801226993865031, + "grad_norm": 0.833841684159388, + "learning_rate": 7.4437552254363e-07, + "loss": 0.5631, + "step": 7173 + }, + { + "epoch": 0.8802453987730061, + "grad_norm": 1.2773990599417362, + "learning_rate": 7.428716473165232e-07, + "loss": 0.6043, + "step": 7174 + }, + { + "epoch": 0.8803680981595092, + "grad_norm": 0.8666135264561443, + "learning_rate": 7.413692341732582e-07, + "loss": 0.5306, + "step": 7175 + }, + { + "epoch": 0.8804907975460122, + "grad_norm": 0.8809562897186419, + "learning_rate": 7.398682833511217e-07, + "loss": 0.5851, + 
"step": 7176 + }, + { + "epoch": 0.8806134969325153, + "grad_norm": 1.0141826497077204, + "learning_rate": 7.38368795087181e-07, + "loss": 0.5557, + "step": 7177 + }, + { + "epoch": 0.8807361963190184, + "grad_norm": 0.9915335056016483, + "learning_rate": 7.368707696182653e-07, + "loss": 0.5435, + "step": 7178 + }, + { + "epoch": 0.8808588957055214, + "grad_norm": 0.9047742135421275, + "learning_rate": 7.353742071809744e-07, + "loss": 0.4971, + "step": 7179 + }, + { + "epoch": 0.8809815950920246, + "grad_norm": 0.8570773325584096, + "learning_rate": 7.338791080116792e-07, + "loss": 0.53, + "step": 7180 + }, + { + "epoch": 0.8811042944785276, + "grad_norm": 0.8759235004681858, + "learning_rate": 7.32385472346514e-07, + "loss": 0.5378, + "step": 7181 + }, + { + "epoch": 0.8812269938650307, + "grad_norm": 0.8232290944798223, + "learning_rate": 7.308933004213914e-07, + "loss": 0.5556, + "step": 7182 + }, + { + "epoch": 0.8813496932515338, + "grad_norm": 0.8557731783903151, + "learning_rate": 7.294025924719816e-07, + "loss": 0.4894, + "step": 7183 + }, + { + "epoch": 0.8814723926380368, + "grad_norm": 0.830206179005088, + "learning_rate": 7.279133487337342e-07, + "loss": 0.5532, + "step": 7184 + }, + { + "epoch": 0.8815950920245399, + "grad_norm": 1.3642277392028666, + "learning_rate": 7.264255694418576e-07, + "loss": 0.5274, + "step": 7185 + }, + { + "epoch": 0.8817177914110429, + "grad_norm": 0.8684490081220496, + "learning_rate": 7.249392548313372e-07, + "loss": 0.5497, + "step": 7186 + }, + { + "epoch": 0.881840490797546, + "grad_norm": 0.7993710192693029, + "learning_rate": 7.234544051369218e-07, + "loss": 0.5442, + "step": 7187 + }, + { + "epoch": 0.8819631901840491, + "grad_norm": 0.834104625930123, + "learning_rate": 7.219710205931318e-07, + "loss": 0.576, + "step": 7188 + }, + { + "epoch": 0.8820858895705521, + "grad_norm": 0.8490112126742042, + "learning_rate": 7.204891014342552e-07, + "loss": 0.5073, + "step": 7189 + }, + { + "epoch": 0.8822085889570552, + "grad_norm": 0.8618265692450275, + "learning_rate": 7.190086478943459e-07, + "loss": 0.5662, + "step": 7190 + }, + { + "epoch": 0.8823312883435582, + "grad_norm": 0.8925153306321646, + "learning_rate": 7.175296602072313e-07, + "loss": 0.4651, + "step": 7191 + }, + { + "epoch": 0.8824539877300613, + "grad_norm": 0.8763307132686226, + "learning_rate": 7.160521386065034e-07, + "loss": 0.4857, + "step": 7192 + }, + { + "epoch": 0.8825766871165645, + "grad_norm": 0.968567867958054, + "learning_rate": 7.145760833255267e-07, + "loss": 0.6052, + "step": 7193 + }, + { + "epoch": 0.8826993865030675, + "grad_norm": 0.8021921333992993, + "learning_rate": 7.131014945974269e-07, + "loss": 0.5001, + "step": 7194 + }, + { + "epoch": 0.8828220858895706, + "grad_norm": 0.8778400405206256, + "learning_rate": 7.116283726551077e-07, + "loss": 0.4984, + "step": 7195 + }, + { + "epoch": 0.8829447852760737, + "grad_norm": 0.8928225327441737, + "learning_rate": 7.101567177312307e-07, + "loss": 0.5885, + "step": 7196 + }, + { + "epoch": 0.8830674846625767, + "grad_norm": 0.8815154422598817, + "learning_rate": 7.086865300582368e-07, + "loss": 0.5096, + "step": 7197 + }, + { + "epoch": 0.8831901840490798, + "grad_norm": 0.9864785340043798, + "learning_rate": 7.072178098683247e-07, + "loss": 0.5292, + "step": 7198 + }, + { + "epoch": 0.8833128834355828, + "grad_norm": 0.8760382009079953, + "learning_rate": 7.057505573934686e-07, + "loss": 0.4482, + "step": 7199 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 0.917393010636608, + "learning_rate": 
7.042847728654078e-07, + "loss": 0.5067, + "step": 7200 + }, + { + "epoch": 0.883558282208589, + "grad_norm": 0.8466750858371468, + "learning_rate": 7.028204565156494e-07, + "loss": 0.5804, + "step": 7201 + }, + { + "epoch": 0.883680981595092, + "grad_norm": 0.8422149054874519, + "learning_rate": 7.013576085754725e-07, + "loss": 0.5633, + "step": 7202 + }, + { + "epoch": 0.8838036809815951, + "grad_norm": 0.8419928654368432, + "learning_rate": 6.998962292759182e-07, + "loss": 0.5604, + "step": 7203 + }, + { + "epoch": 0.8839263803680981, + "grad_norm": 1.0730922930846836, + "learning_rate": 6.984363188478016e-07, + "loss": 0.557, + "step": 7204 + }, + { + "epoch": 0.8840490797546012, + "grad_norm": 0.8359720054862496, + "learning_rate": 6.969778775217007e-07, + "loss": 0.5286, + "step": 7205 + }, + { + "epoch": 0.8841717791411043, + "grad_norm": 0.9632891870476772, + "learning_rate": 6.955209055279644e-07, + "loss": 0.5311, + "step": 7206 + }, + { + "epoch": 0.8842944785276073, + "grad_norm": 0.8596608375809558, + "learning_rate": 6.940654030967086e-07, + "loss": 0.5405, + "step": 7207 + }, + { + "epoch": 0.8844171779141105, + "grad_norm": 0.881486980351892, + "learning_rate": 6.926113704578197e-07, + "loss": 0.4944, + "step": 7208 + }, + { + "epoch": 0.8845398773006135, + "grad_norm": 0.9850232536808561, + "learning_rate": 6.91158807840947e-07, + "loss": 0.5754, + "step": 7209 + }, + { + "epoch": 0.8846625766871166, + "grad_norm": 0.7805649187556867, + "learning_rate": 6.897077154755094e-07, + "loss": 0.5249, + "step": 7210 + }, + { + "epoch": 0.8847852760736197, + "grad_norm": 0.8854559180052757, + "learning_rate": 6.88258093590699e-07, + "loss": 0.535, + "step": 7211 + }, + { + "epoch": 0.8849079754601227, + "grad_norm": 0.8333052986768116, + "learning_rate": 6.868099424154662e-07, + "loss": 0.6086, + "step": 7212 + }, + { + "epoch": 0.8850306748466258, + "grad_norm": 0.9282081065697225, + "learning_rate": 6.853632621785366e-07, + "loss": 0.5257, + "step": 7213 + }, + { + "epoch": 0.8851533742331288, + "grad_norm": 0.8479015401987308, + "learning_rate": 6.839180531084e-07, + "loss": 0.5441, + "step": 7214 + }, + { + "epoch": 0.8852760736196319, + "grad_norm": 0.9590184941844406, + "learning_rate": 6.824743154333157e-07, + "loss": 0.5841, + "step": 7215 + }, + { + "epoch": 0.885398773006135, + "grad_norm": 0.8511143807894943, + "learning_rate": 6.810320493813083e-07, + "loss": 0.5539, + "step": 7216 + }, + { + "epoch": 0.885521472392638, + "grad_norm": 0.9798220274942011, + "learning_rate": 6.795912551801742e-07, + "loss": 0.5782, + "step": 7217 + }, + { + "epoch": 0.8856441717791411, + "grad_norm": 0.8722837745377222, + "learning_rate": 6.781519330574693e-07, + "loss": 0.5367, + "step": 7218 + }, + { + "epoch": 0.8857668711656441, + "grad_norm": 0.8315644621669793, + "learning_rate": 6.767140832405283e-07, + "loss": 0.5137, + "step": 7219 + }, + { + "epoch": 0.8858895705521472, + "grad_norm": 0.89400978088476, + "learning_rate": 6.752777059564431e-07, + "loss": 0.5358, + "step": 7220 + }, + { + "epoch": 0.8860122699386503, + "grad_norm": 0.9017138876662185, + "learning_rate": 6.738428014320775e-07, + "loss": 0.5167, + "step": 7221 + }, + { + "epoch": 0.8861349693251533, + "grad_norm": 0.9044660226184077, + "learning_rate": 6.724093698940637e-07, + "loss": 0.5183, + "step": 7222 + }, + { + "epoch": 0.8862576687116565, + "grad_norm": 0.8749853693956128, + "learning_rate": 6.709774115687983e-07, + "loss": 0.5206, + "step": 7223 + }, + { + "epoch": 0.8863803680981595, + "grad_norm": 
0.8806046012274441, + "learning_rate": 6.695469266824494e-07, + "loss": 0.5681, + "step": 7224 + }, + { + "epoch": 0.8865030674846626, + "grad_norm": 1.0456088372343852, + "learning_rate": 6.681179154609463e-07, + "loss": 0.5581, + "step": 7225 + }, + { + "epoch": 0.8866257668711657, + "grad_norm": 0.9188853405261224, + "learning_rate": 6.666903781299927e-07, + "loss": 0.5664, + "step": 7226 + }, + { + "epoch": 0.8867484662576687, + "grad_norm": 0.875432516786441, + "learning_rate": 6.652643149150518e-07, + "loss": 0.5547, + "step": 7227 + }, + { + "epoch": 0.8868711656441718, + "grad_norm": 0.8871902358631771, + "learning_rate": 6.638397260413609e-07, + "loss": 0.5316, + "step": 7228 + }, + { + "epoch": 0.8869938650306749, + "grad_norm": 0.8546740179572856, + "learning_rate": 6.624166117339215e-07, + "loss": 0.5208, + "step": 7229 + }, + { + "epoch": 0.8871165644171779, + "grad_norm": 0.7927401643136057, + "learning_rate": 6.609949722175013e-07, + "loss": 0.4376, + "step": 7230 + }, + { + "epoch": 0.887239263803681, + "grad_norm": 0.8753190461864451, + "learning_rate": 6.595748077166375e-07, + "loss": 0.501, + "step": 7231 + }, + { + "epoch": 0.887361963190184, + "grad_norm": 0.8896254715682409, + "learning_rate": 6.581561184556296e-07, + "loss": 0.5135, + "step": 7232 + }, + { + "epoch": 0.8874846625766871, + "grad_norm": 1.002499931301515, + "learning_rate": 6.567389046585515e-07, + "loss": 0.5582, + "step": 7233 + }, + { + "epoch": 0.8876073619631902, + "grad_norm": 0.8850076232731283, + "learning_rate": 6.553231665492366e-07, + "loss": 0.5844, + "step": 7234 + }, + { + "epoch": 0.8877300613496932, + "grad_norm": 0.8779953137836296, + "learning_rate": 6.539089043512914e-07, + "loss": 0.5529, + "step": 7235 + }, + { + "epoch": 0.8878527607361963, + "grad_norm": 0.8486726772585156, + "learning_rate": 6.524961182880841e-07, + "loss": 0.5165, + "step": 7236 + }, + { + "epoch": 0.8879754601226993, + "grad_norm": 0.8573403130708411, + "learning_rate": 6.510848085827536e-07, + "loss": 0.5367, + "step": 7237 + }, + { + "epoch": 0.8880981595092025, + "grad_norm": 0.8488254861553273, + "learning_rate": 6.496749754582043e-07, + "loss": 0.5205, + "step": 7238 + }, + { + "epoch": 0.8882208588957056, + "grad_norm": 0.7644010158650457, + "learning_rate": 6.482666191371045e-07, + "loss": 0.5372, + "step": 7239 + }, + { + "epoch": 0.8883435582822086, + "grad_norm": 0.9187773236565998, + "learning_rate": 6.468597398418952e-07, + "loss": 0.496, + "step": 7240 + }, + { + "epoch": 0.8884662576687117, + "grad_norm": 0.9511499227286658, + "learning_rate": 6.454543377947786e-07, + "loss": 0.5056, + "step": 7241 + }, + { + "epoch": 0.8885889570552147, + "grad_norm": 0.856803557491644, + "learning_rate": 6.440504132177284e-07, + "loss": 0.5335, + "step": 7242 + }, + { + "epoch": 0.8887116564417178, + "grad_norm": 0.8771329470657344, + "learning_rate": 6.426479663324792e-07, + "loss": 0.503, + "step": 7243 + }, + { + "epoch": 0.8888343558282209, + "grad_norm": 0.8673780705026184, + "learning_rate": 6.412469973605384e-07, + "loss": 0.5179, + "step": 7244 + }, + { + "epoch": 0.8889570552147239, + "grad_norm": 0.777191743332289, + "learning_rate": 6.398475065231746e-07, + "loss": 0.5173, + "step": 7245 + }, + { + "epoch": 0.889079754601227, + "grad_norm": 0.8910110802525131, + "learning_rate": 6.384494940414288e-07, + "loss": 0.539, + "step": 7246 + }, + { + "epoch": 0.88920245398773, + "grad_norm": 0.9757006128337494, + "learning_rate": 6.370529601360997e-07, + "loss": 0.5882, + "step": 7247 + }, + { + "epoch": 
0.8893251533742331, + "grad_norm": 0.8875879148401925, + "learning_rate": 6.356579050277634e-07, + "loss": 0.5102, + "step": 7248 + }, + { + "epoch": 0.8894478527607362, + "grad_norm": 0.9315335694652412, + "learning_rate": 6.342643289367523e-07, + "loss": 0.5751, + "step": 7249 + }, + { + "epoch": 0.8895705521472392, + "grad_norm": 0.9646952929271388, + "learning_rate": 6.328722320831737e-07, + "loss": 0.634, + "step": 7250 + }, + { + "epoch": 0.8896932515337423, + "grad_norm": 0.9431454366345261, + "learning_rate": 6.314816146868951e-07, + "loss": 0.5484, + "step": 7251 + }, + { + "epoch": 0.8898159509202453, + "grad_norm": 1.3949850433245234, + "learning_rate": 6.30092476967552e-07, + "loss": 0.5828, + "step": 7252 + }, + { + "epoch": 0.8899386503067485, + "grad_norm": 0.8338000301674846, + "learning_rate": 6.287048191445488e-07, + "loss": 0.4701, + "step": 7253 + }, + { + "epoch": 0.8900613496932516, + "grad_norm": 0.7972722879044751, + "learning_rate": 6.273186414370513e-07, + "loss": 0.5653, + "step": 7254 + }, + { + "epoch": 0.8901840490797546, + "grad_norm": 0.8716215728179733, + "learning_rate": 6.259339440639966e-07, + "loss": 0.4982, + "step": 7255 + }, + { + "epoch": 0.8903067484662577, + "grad_norm": 0.8640769506943996, + "learning_rate": 6.245507272440843e-07, + "loss": 0.5234, + "step": 7256 + }, + { + "epoch": 0.8904294478527608, + "grad_norm": 0.8832367156596578, + "learning_rate": 6.23168991195785e-07, + "loss": 0.5272, + "step": 7257 + }, + { + "epoch": 0.8905521472392638, + "grad_norm": 0.8157530340205192, + "learning_rate": 6.217887361373287e-07, + "loss": 0.5022, + "step": 7258 + }, + { + "epoch": 0.8906748466257669, + "grad_norm": 0.8419897010992797, + "learning_rate": 6.204099622867144e-07, + "loss": 0.4924, + "step": 7259 + }, + { + "epoch": 0.8907975460122699, + "grad_norm": 0.9083675471336001, + "learning_rate": 6.1903266986171e-07, + "loss": 0.5119, + "step": 7260 + }, + { + "epoch": 0.890920245398773, + "grad_norm": 0.8131696291322972, + "learning_rate": 6.176568590798448e-07, + "loss": 0.5046, + "step": 7261 + }, + { + "epoch": 0.8910429447852761, + "grad_norm": 1.018758773127332, + "learning_rate": 6.162825301584185e-07, + "loss": 0.5884, + "step": 7262 + }, + { + "epoch": 0.8911656441717791, + "grad_norm": 0.8867034403181866, + "learning_rate": 6.149096833144919e-07, + "loss": 0.5228, + "step": 7263 + }, + { + "epoch": 0.8912883435582822, + "grad_norm": 0.9171044943873421, + "learning_rate": 6.135383187648969e-07, + "loss": 0.5012, + "step": 7264 + }, + { + "epoch": 0.8914110429447852, + "grad_norm": 0.891419121383737, + "learning_rate": 6.121684367262271e-07, + "loss": 0.5281, + "step": 7265 + }, + { + "epoch": 0.8915337423312883, + "grad_norm": 0.7911775037251412, + "learning_rate": 6.108000374148448e-07, + "loss": 0.5849, + "step": 7266 + }, + { + "epoch": 0.8916564417177915, + "grad_norm": 0.8343047172810568, + "learning_rate": 6.094331210468751e-07, + "loss": 0.5631, + "step": 7267 + }, + { + "epoch": 0.8917791411042945, + "grad_norm": 0.8622548338663106, + "learning_rate": 6.08067687838213e-07, + "loss": 0.5317, + "step": 7268 + }, + { + "epoch": 0.8919018404907976, + "grad_norm": 0.9157964313664975, + "learning_rate": 6.06703738004516e-07, + "loss": 0.5377, + "step": 7269 + }, + { + "epoch": 0.8920245398773006, + "grad_norm": 0.8316683940172581, + "learning_rate": 6.053412717612061e-07, + "loss": 0.4756, + "step": 7270 + }, + { + "epoch": 0.8921472392638037, + "grad_norm": 1.013941174998429, + "learning_rate": 6.039802893234769e-07, + "loss": 0.5234, 
+ "step": 7271 + }, + { + "epoch": 0.8922699386503068, + "grad_norm": 1.2676732602832441, + "learning_rate": 6.026207909062809e-07, + "loss": 0.576, + "step": 7272 + }, + { + "epoch": 0.8923926380368098, + "grad_norm": 0.8648120746110559, + "learning_rate": 6.012627767243417e-07, + "loss": 0.5555, + "step": 7273 + }, + { + "epoch": 0.8925153374233129, + "grad_norm": 0.9103972677166715, + "learning_rate": 5.999062469921424e-07, + "loss": 0.576, + "step": 7274 + }, + { + "epoch": 0.8926380368098159, + "grad_norm": 0.9536644649639898, + "learning_rate": 5.985512019239392e-07, + "loss": 0.4792, + "step": 7275 + }, + { + "epoch": 0.892760736196319, + "grad_norm": 0.9321322994711734, + "learning_rate": 5.971976417337466e-07, + "loss": 0.4922, + "step": 7276 + }, + { + "epoch": 0.8928834355828221, + "grad_norm": 0.9794730414045809, + "learning_rate": 5.958455666353502e-07, + "loss": 0.5431, + "step": 7277 + }, + { + "epoch": 0.8930061349693251, + "grad_norm": 0.8129313786371742, + "learning_rate": 5.944949768422969e-07, + "loss": 0.4848, + "step": 7278 + }, + { + "epoch": 0.8931288343558282, + "grad_norm": 0.8976819018923246, + "learning_rate": 5.931458725679018e-07, + "loss": 0.5532, + "step": 7279 + }, + { + "epoch": 0.8932515337423312, + "grad_norm": 0.9305364580507547, + "learning_rate": 5.917982540252442e-07, + "loss": 0.5382, + "step": 7280 + }, + { + "epoch": 0.8933742331288343, + "grad_norm": 0.8529154224978902, + "learning_rate": 5.904521214271685e-07, + "loss": 0.5604, + "step": 7281 + }, + { + "epoch": 0.8934969325153375, + "grad_norm": 0.9088644250690437, + "learning_rate": 5.891074749862857e-07, + "loss": 0.5743, + "step": 7282 + }, + { + "epoch": 0.8936196319018405, + "grad_norm": 0.8008316064257691, + "learning_rate": 5.877643149149669e-07, + "loss": 0.5271, + "step": 7283 + }, + { + "epoch": 0.8937423312883436, + "grad_norm": 1.0027731762709016, + "learning_rate": 5.864226414253604e-07, + "loss": 0.5578, + "step": 7284 + }, + { + "epoch": 0.8938650306748466, + "grad_norm": 0.9027421757100359, + "learning_rate": 5.850824547293655e-07, + "loss": 0.498, + "step": 7285 + }, + { + "epoch": 0.8939877300613497, + "grad_norm": 0.8686049268654136, + "learning_rate": 5.83743755038656e-07, + "loss": 0.4995, + "step": 7286 + }, + { + "epoch": 0.8941104294478528, + "grad_norm": 0.9836237288236549, + "learning_rate": 5.824065425646685e-07, + "loss": 0.5907, + "step": 7287 + }, + { + "epoch": 0.8942331288343558, + "grad_norm": 0.8283053081822848, + "learning_rate": 5.810708175186008e-07, + "loss": 0.639, + "step": 7288 + }, + { + "epoch": 0.8943558282208589, + "grad_norm": 0.8439718796740718, + "learning_rate": 5.797365801114241e-07, + "loss": 0.4905, + "step": 7289 + }, + { + "epoch": 0.894478527607362, + "grad_norm": 0.8307233177309722, + "learning_rate": 5.784038305538653e-07, + "loss": 0.498, + "step": 7290 + }, + { + "epoch": 0.894601226993865, + "grad_norm": 0.9959189854776495, + "learning_rate": 5.770725690564238e-07, + "loss": 0.5367, + "step": 7291 + }, + { + "epoch": 0.8947239263803681, + "grad_norm": 0.8697291837643057, + "learning_rate": 5.757427958293582e-07, + "loss": 0.5245, + "step": 7292 + }, + { + "epoch": 0.8948466257668711, + "grad_norm": 0.8902274721847552, + "learning_rate": 5.744145110826971e-07, + "loss": 0.5968, + "step": 7293 + }, + { + "epoch": 0.8949693251533742, + "grad_norm": 0.8436425870479907, + "learning_rate": 5.730877150262304e-07, + "loss": 0.5758, + "step": 7294 + }, + { + "epoch": 0.8950920245398774, + "grad_norm": 0.874348516511612, + "learning_rate": 
5.71762407869515e-07, + "loss": 0.5155, + "step": 7295 + }, + { + "epoch": 0.8952147239263804, + "grad_norm": 1.002566139418447, + "learning_rate": 5.704385898218712e-07, + "loss": 0.5256, + "step": 7296 + }, + { + "epoch": 0.8953374233128835, + "grad_norm": 0.8268704323550572, + "learning_rate": 5.69116261092385e-07, + "loss": 0.5251, + "step": 7297 + }, + { + "epoch": 0.8954601226993865, + "grad_norm": 1.0495451737057817, + "learning_rate": 5.677954218899063e-07, + "loss": 0.5251, + "step": 7298 + }, + { + "epoch": 0.8955828220858896, + "grad_norm": 0.8875136524415617, + "learning_rate": 5.664760724230512e-07, + "loss": 0.4651, + "step": 7299 + }, + { + "epoch": 0.8957055214723927, + "grad_norm": 0.8429894535687202, + "learning_rate": 5.651582129001987e-07, + "loss": 0.448, + "step": 7300 + }, + { + "epoch": 0.8958282208588957, + "grad_norm": 0.8544518618816165, + "learning_rate": 5.638418435294935e-07, + "loss": 0.4797, + "step": 7301 + }, + { + "epoch": 0.8959509202453988, + "grad_norm": 0.8519002109301935, + "learning_rate": 5.625269645188458e-07, + "loss": 0.4476, + "step": 7302 + }, + { + "epoch": 0.8960736196319018, + "grad_norm": 0.8891607153360795, + "learning_rate": 5.612135760759263e-07, + "loss": 0.5228, + "step": 7303 + }, + { + "epoch": 0.8961963190184049, + "grad_norm": 1.0460683396060824, + "learning_rate": 5.59901678408179e-07, + "loss": 0.558, + "step": 7304 + }, + { + "epoch": 0.896319018404908, + "grad_norm": 0.9767256076562602, + "learning_rate": 5.585912717228015e-07, + "loss": 0.5745, + "step": 7305 + }, + { + "epoch": 0.896441717791411, + "grad_norm": 0.8299849239965319, + "learning_rate": 5.572823562267649e-07, + "loss": 0.5166, + "step": 7306 + }, + { + "epoch": 0.8965644171779141, + "grad_norm": 0.8336338184259705, + "learning_rate": 5.559749321267971e-07, + "loss": 0.4644, + "step": 7307 + }, + { + "epoch": 0.8966871165644171, + "grad_norm": 0.8112828842731644, + "learning_rate": 5.546689996293996e-07, + "loss": 0.5757, + "step": 7308 + }, + { + "epoch": 0.8968098159509202, + "grad_norm": 0.9058192836491206, + "learning_rate": 5.533645589408298e-07, + "loss": 0.4668, + "step": 7309 + }, + { + "epoch": 0.8969325153374234, + "grad_norm": 0.8094366868708536, + "learning_rate": 5.520616102671128e-07, + "loss": 0.5636, + "step": 7310 + }, + { + "epoch": 0.8970552147239264, + "grad_norm": 0.9596540851320362, + "learning_rate": 5.507601538140406e-07, + "loss": 0.5934, + "step": 7311 + }, + { + "epoch": 0.8971779141104295, + "grad_norm": 0.80310128949796, + "learning_rate": 5.494601897871643e-07, + "loss": 0.5763, + "step": 7312 + }, + { + "epoch": 0.8973006134969325, + "grad_norm": 0.8338049157082933, + "learning_rate": 5.481617183918053e-07, + "loss": 0.5418, + "step": 7313 + }, + { + "epoch": 0.8974233128834356, + "grad_norm": 0.9522677478241932, + "learning_rate": 5.468647398330418e-07, + "loss": 0.5632, + "step": 7314 + }, + { + "epoch": 0.8975460122699387, + "grad_norm": 0.88892138079441, + "learning_rate": 5.455692543157243e-07, + "loss": 0.5635, + "step": 7315 + }, + { + "epoch": 0.8976687116564417, + "grad_norm": 0.8053818941270895, + "learning_rate": 5.442752620444602e-07, + "loss": 0.5618, + "step": 7316 + }, + { + "epoch": 0.8977914110429448, + "grad_norm": 0.815970152770482, + "learning_rate": 5.429827632236284e-07, + "loss": 0.5518, + "step": 7317 + }, + { + "epoch": 0.8979141104294478, + "grad_norm": 0.8941225367591471, + "learning_rate": 5.416917580573644e-07, + "loss": 0.5561, + "step": 7318 + }, + { + "epoch": 0.8980368098159509, + "grad_norm": 
0.782629999506735, + "learning_rate": 5.404022467495728e-07, + "loss": 0.5732, + "step": 7319 + }, + { + "epoch": 0.898159509202454, + "grad_norm": 0.9489448907697519, + "learning_rate": 5.391142295039209e-07, + "loss": 0.5545, + "step": 7320 + }, + { + "epoch": 0.898282208588957, + "grad_norm": 0.8559365232853875, + "learning_rate": 5.37827706523839e-07, + "loss": 0.4618, + "step": 7321 + }, + { + "epoch": 0.8984049079754601, + "grad_norm": 0.876148636638479, + "learning_rate": 5.36542678012525e-07, + "loss": 0.5702, + "step": 7322 + }, + { + "epoch": 0.8985276073619632, + "grad_norm": 0.8479032834274418, + "learning_rate": 5.352591441729337e-07, + "loss": 0.452, + "step": 7323 + }, + { + "epoch": 0.8986503067484662, + "grad_norm": 0.8220895530380199, + "learning_rate": 5.339771052077935e-07, + "loss": 0.532, + "step": 7324 + }, + { + "epoch": 0.8987730061349694, + "grad_norm": 0.9085341860821898, + "learning_rate": 5.326965613195867e-07, + "loss": 0.5687, + "step": 7325 + }, + { + "epoch": 0.8988957055214724, + "grad_norm": 0.857975304009353, + "learning_rate": 5.314175127105669e-07, + "loss": 0.5266, + "step": 7326 + }, + { + "epoch": 0.8990184049079755, + "grad_norm": 0.8472923559086589, + "learning_rate": 5.301399595827483e-07, + "loss": 0.5422, + "step": 7327 + }, + { + "epoch": 0.8991411042944786, + "grad_norm": 0.8345889492920929, + "learning_rate": 5.288639021379094e-07, + "loss": 0.5494, + "step": 7328 + }, + { + "epoch": 0.8992638036809816, + "grad_norm": 0.8881659786695356, + "learning_rate": 5.275893405775911e-07, + "loss": 0.5343, + "step": 7329 + }, + { + "epoch": 0.8993865030674847, + "grad_norm": 0.9108161487134665, + "learning_rate": 5.263162751031025e-07, + "loss": 0.5467, + "step": 7330 + }, + { + "epoch": 0.8995092024539877, + "grad_norm": 0.8279606152000988, + "learning_rate": 5.250447059155117e-07, + "loss": 0.5128, + "step": 7331 + }, + { + "epoch": 0.8996319018404908, + "grad_norm": 0.9719108367794117, + "learning_rate": 5.237746332156501e-07, + "loss": 0.5945, + "step": 7332 + }, + { + "epoch": 0.8997546012269939, + "grad_norm": 0.953020287137752, + "learning_rate": 5.225060572041186e-07, + "loss": 0.5782, + "step": 7333 + }, + { + "epoch": 0.8998773006134969, + "grad_norm": 0.8517930960339081, + "learning_rate": 5.212389780812733e-07, + "loss": 0.481, + "step": 7334 + }, + { + "epoch": 0.9, + "grad_norm": 0.9212180712479511, + "learning_rate": 5.199733960472431e-07, + "loss": 0.4721, + "step": 7335 + }, + { + "epoch": 0.900122699386503, + "grad_norm": 0.9475677618157754, + "learning_rate": 5.187093113019126e-07, + "loss": 0.548, + "step": 7336 + }, + { + "epoch": 0.9002453987730061, + "grad_norm": 0.8568241939911996, + "learning_rate": 5.174467240449366e-07, + "loss": 0.5383, + "step": 7337 + }, + { + "epoch": 0.9003680981595092, + "grad_norm": 0.810997124728237, + "learning_rate": 5.161856344757243e-07, + "loss": 0.5848, + "step": 7338 + }, + { + "epoch": 0.9004907975460122, + "grad_norm": 1.0476281187995442, + "learning_rate": 5.149260427934588e-07, + "loss": 0.5574, + "step": 7339 + }, + { + "epoch": 0.9006134969325154, + "grad_norm": 0.8544386348969634, + "learning_rate": 5.136679491970809e-07, + "loss": 0.5217, + "step": 7340 + }, + { + "epoch": 0.9007361963190184, + "grad_norm": 0.934152984656196, + "learning_rate": 5.124113538852937e-07, + "loss": 0.5184, + "step": 7341 + }, + { + "epoch": 0.9008588957055215, + "grad_norm": 0.8743836172136829, + "learning_rate": 5.111562570565687e-07, + "loss": 0.5662, + "step": 7342 + }, + { + "epoch": 0.9009815950920246, 
+ "grad_norm": 0.7610650112765248, + "learning_rate": 5.099026589091338e-07, + "loss": 0.5323, + "step": 7343 + }, + { + "epoch": 0.9011042944785276, + "grad_norm": 0.8528309676592901, + "learning_rate": 5.086505596409885e-07, + "loss": 0.5573, + "step": 7344 + }, + { + "epoch": 0.9012269938650307, + "grad_norm": 0.8818721994723767, + "learning_rate": 5.073999594498869e-07, + "loss": 0.5405, + "step": 7345 + }, + { + "epoch": 0.9013496932515337, + "grad_norm": 0.9117811468854542, + "learning_rate": 5.06150858533353e-07, + "loss": 0.4975, + "step": 7346 + }, + { + "epoch": 0.9014723926380368, + "grad_norm": 0.8781544175184623, + "learning_rate": 5.049032570886703e-07, + "loss": 0.538, + "step": 7347 + }, + { + "epoch": 0.9015950920245399, + "grad_norm": 0.9815109589716281, + "learning_rate": 5.036571553128888e-07, + "loss": 0.6014, + "step": 7348 + }, + { + "epoch": 0.9017177914110429, + "grad_norm": 0.8569584063602896, + "learning_rate": 5.024125534028179e-07, + "loss": 0.5534, + "step": 7349 + }, + { + "epoch": 0.901840490797546, + "grad_norm": 0.8225710414417818, + "learning_rate": 5.011694515550303e-07, + "loss": 0.5524, + "step": 7350 + }, + { + "epoch": 0.9019631901840491, + "grad_norm": 0.8486655882132156, + "learning_rate": 4.999278499658667e-07, + "loss": 0.568, + "step": 7351 + }, + { + "epoch": 0.9020858895705521, + "grad_norm": 0.8607435036944419, + "learning_rate": 4.986877488314246e-07, + "loss": 0.5198, + "step": 7352 + }, + { + "epoch": 0.9022085889570552, + "grad_norm": 0.9389482976610397, + "learning_rate": 4.974491483475697e-07, + "loss": 0.5752, + "step": 7353 + }, + { + "epoch": 0.9023312883435582, + "grad_norm": 1.0070663783674443, + "learning_rate": 4.962120487099253e-07, + "loss": 0.5746, + "step": 7354 + }, + { + "epoch": 0.9024539877300614, + "grad_norm": 0.9776289629442374, + "learning_rate": 4.949764501138832e-07, + "loss": 0.57, + "step": 7355 + }, + { + "epoch": 0.9025766871165645, + "grad_norm": 0.8676002164623542, + "learning_rate": 4.937423527545937e-07, + "loss": 0.5044, + "step": 7356 + }, + { + "epoch": 0.9026993865030675, + "grad_norm": 0.8794950980637707, + "learning_rate": 4.925097568269743e-07, + "loss": 0.5185, + "step": 7357 + }, + { + "epoch": 0.9028220858895706, + "grad_norm": 0.9170696085226201, + "learning_rate": 4.912786625257005e-07, + "loss": 0.5595, + "step": 7358 + }, + { + "epoch": 0.9029447852760736, + "grad_norm": 0.8596916724769632, + "learning_rate": 4.900490700452155e-07, + "loss": 0.5302, + "step": 7359 + }, + { + "epoch": 0.9030674846625767, + "grad_norm": 0.8143670273449951, + "learning_rate": 4.888209795797205e-07, + "loss": 0.5598, + "step": 7360 + }, + { + "epoch": 0.9031901840490798, + "grad_norm": 0.8572540147776502, + "learning_rate": 4.875943913231818e-07, + "loss": 0.5832, + "step": 7361 + }, + { + "epoch": 0.9033128834355828, + "grad_norm": 0.8949684028850637, + "learning_rate": 4.863693054693308e-07, + "loss": 0.5355, + "step": 7362 + }, + { + "epoch": 0.9034355828220859, + "grad_norm": 1.030952416996802, + "learning_rate": 4.851457222116562e-07, + "loss": 0.536, + "step": 7363 + }, + { + "epoch": 0.9035582822085889, + "grad_norm": 0.9471413417773435, + "learning_rate": 4.839236417434157e-07, + "loss": 0.5086, + "step": 7364 + }, + { + "epoch": 0.903680981595092, + "grad_norm": 0.834498796000116, + "learning_rate": 4.827030642576236e-07, + "loss": 0.5149, + "step": 7365 + }, + { + "epoch": 0.9038036809815951, + "grad_norm": 0.8517916196201927, + "learning_rate": 4.814839899470614e-07, + "loss": 0.5517, + "step": 7366 + }, 
+ { + "epoch": 0.9039263803680981, + "grad_norm": 0.8593764731891269, + "learning_rate": 4.802664190042716e-07, + "loss": 0.56, + "step": 7367 + }, + { + "epoch": 0.9040490797546012, + "grad_norm": 0.9060668714228005, + "learning_rate": 4.790503516215572e-07, + "loss": 0.5334, + "step": 7368 + }, + { + "epoch": 0.9041717791411042, + "grad_norm": 0.8572388922035084, + "learning_rate": 4.778357879909878e-07, + "loss": 0.5993, + "step": 7369 + }, + { + "epoch": 0.9042944785276074, + "grad_norm": 0.9027693401290185, + "learning_rate": 4.766227283043912e-07, + "loss": 0.5714, + "step": 7370 + }, + { + "epoch": 0.9044171779141105, + "grad_norm": 0.9705814431193551, + "learning_rate": 4.754111727533617e-07, + "loss": 0.542, + "step": 7371 + }, + { + "epoch": 0.9045398773006135, + "grad_norm": 0.9218992798161079, + "learning_rate": 4.742011215292519e-07, + "loss": 0.4875, + "step": 7372 + }, + { + "epoch": 0.9046625766871166, + "grad_norm": 1.0057163966381872, + "learning_rate": 4.7299257482318116e-07, + "loss": 0.5365, + "step": 7373 + }, + { + "epoch": 0.9047852760736196, + "grad_norm": 0.8290240298094179, + "learning_rate": 4.717855328260279e-07, + "loss": 0.4627, + "step": 7374 + }, + { + "epoch": 0.9049079754601227, + "grad_norm": 0.8040959623401928, + "learning_rate": 4.7057999572843516e-07, + "loss": 0.4983, + "step": 7375 + }, + { + "epoch": 0.9050306748466258, + "grad_norm": 1.0016704926657671, + "learning_rate": 4.6937596372080397e-07, + "loss": 0.5449, + "step": 7376 + }, + { + "epoch": 0.9051533742331288, + "grad_norm": 0.9002997087077681, + "learning_rate": 4.6817343699330554e-07, + "loss": 0.5452, + "step": 7377 + }, + { + "epoch": 0.9052760736196319, + "grad_norm": 0.9264572178852299, + "learning_rate": 4.669724157358646e-07, + "loss": 0.532, + "step": 7378 + }, + { + "epoch": 0.9053987730061349, + "grad_norm": 0.8513293435909433, + "learning_rate": 4.65772900138175e-07, + "loss": 0.5709, + "step": 7379 + }, + { + "epoch": 0.905521472392638, + "grad_norm": 0.8853943343973996, + "learning_rate": 4.645748903896885e-07, + "loss": 0.5435, + "step": 7380 + }, + { + "epoch": 0.9056441717791411, + "grad_norm": 0.9695635092674078, + "learning_rate": 4.633783866796193e-07, + "loss": 0.5671, + "step": 7381 + }, + { + "epoch": 0.9057668711656441, + "grad_norm": 0.8397513070659617, + "learning_rate": 4.621833891969474e-07, + "loss": 0.5471, + "step": 7382 + }, + { + "epoch": 0.9058895705521472, + "grad_norm": 0.880865538844151, + "learning_rate": 4.609898981304095e-07, + "loss": 0.5548, + "step": 7383 + }, + { + "epoch": 0.9060122699386504, + "grad_norm": 0.8169969374613577, + "learning_rate": 4.5979791366851046e-07, + "loss": 0.5101, + "step": 7384 + }, + { + "epoch": 0.9061349693251534, + "grad_norm": 0.8873005181499007, + "learning_rate": 4.5860743599951186e-07, + "loss": 0.5413, + "step": 7385 + }, + { + "epoch": 0.9062576687116565, + "grad_norm": 1.0168638569071868, + "learning_rate": 4.5741846531143995e-07, + "loss": 0.5254, + "step": 7386 + }, + { + "epoch": 0.9063803680981595, + "grad_norm": 0.9052171292262319, + "learning_rate": 4.5623100179208235e-07, + "loss": 0.5165, + "step": 7387 + }, + { + "epoch": 0.9065030674846626, + "grad_norm": 0.9558593643649128, + "learning_rate": 4.550450456289901e-07, + "loss": 0.5479, + "step": 7388 + }, + { + "epoch": 0.9066257668711657, + "grad_norm": 0.908966596429183, + "learning_rate": 4.5386059700947116e-07, + "loss": 0.5347, + "step": 7389 + }, + { + "epoch": 0.9067484662576687, + "grad_norm": 0.826862126988917, + "learning_rate": 
4.5267765612060253e-07, + "loss": 0.5302, + "step": 7390 + }, + { + "epoch": 0.9068711656441718, + "grad_norm": 0.8755037505607719, + "learning_rate": 4.5149622314921925e-07, + "loss": 0.5637, + "step": 7391 + }, + { + "epoch": 0.9069938650306748, + "grad_norm": 1.015874852628626, + "learning_rate": 4.503162982819165e-07, + "loss": 0.5216, + "step": 7392 + }, + { + "epoch": 0.9071165644171779, + "grad_norm": 0.9193008702963879, + "learning_rate": 4.491378817050529e-07, + "loss": 0.5803, + "step": 7393 + }, + { + "epoch": 0.907239263803681, + "grad_norm": 0.9146145076922529, + "learning_rate": 4.4796097360475186e-07, + "loss": 0.5758, + "step": 7394 + }, + { + "epoch": 0.907361963190184, + "grad_norm": 0.940647976081337, + "learning_rate": 4.4678557416689586e-07, + "loss": 0.5322, + "step": 7395 + }, + { + "epoch": 0.9074846625766871, + "grad_norm": 0.9539755337229432, + "learning_rate": 4.4561168357712626e-07, + "loss": 0.582, + "step": 7396 + }, + { + "epoch": 0.9076073619631901, + "grad_norm": 0.9981332163234304, + "learning_rate": 4.444393020208526e-07, + "loss": 0.6128, + "step": 7397 + }, + { + "epoch": 0.9077300613496933, + "grad_norm": 0.9530154001950696, + "learning_rate": 4.432684296832401e-07, + "loss": 0.5294, + "step": 7398 + }, + { + "epoch": 0.9078527607361964, + "grad_norm": 1.0156491864491293, + "learning_rate": 4.4209906674921755e-07, + "loss": 0.5898, + "step": 7399 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 0.848209677188701, + "learning_rate": 4.4093121340347824e-07, + "loss": 0.4878, + "step": 7400 + }, + { + "epoch": 0.9080981595092025, + "grad_norm": 0.889932685315895, + "learning_rate": 4.397648698304713e-07, + "loss": 0.5705, + "step": 7401 + }, + { + "epoch": 0.9082208588957055, + "grad_norm": 0.859285220977202, + "learning_rate": 4.3860003621441384e-07, + "loss": 0.5407, + "step": 7402 + }, + { + "epoch": 0.9083435582822086, + "grad_norm": 0.9382801267166786, + "learning_rate": 4.3743671273927867e-07, + "loss": 0.5285, + "step": 7403 + }, + { + "epoch": 0.9084662576687117, + "grad_norm": 0.8799252667245347, + "learning_rate": 4.3627489958880443e-07, + "loss": 0.4476, + "step": 7404 + }, + { + "epoch": 0.9085889570552147, + "grad_norm": 0.850487508933098, + "learning_rate": 4.3511459694648873e-07, + "loss": 0.5457, + "step": 7405 + }, + { + "epoch": 0.9087116564417178, + "grad_norm": 0.9652347794285479, + "learning_rate": 4.3395580499559276e-07, + "loss": 0.5254, + "step": 7406 + }, + { + "epoch": 0.9088343558282208, + "grad_norm": 0.9328661976631238, + "learning_rate": 4.3279852391913456e-07, + "loss": 0.5888, + "step": 7407 + }, + { + "epoch": 0.9089570552147239, + "grad_norm": 0.8697855784717021, + "learning_rate": 4.316427538999013e-07, + "loss": 0.5555, + "step": 7408 + }, + { + "epoch": 0.909079754601227, + "grad_norm": 0.8184751331558405, + "learning_rate": 4.3048849512043245e-07, + "loss": 0.5408, + "step": 7409 + }, + { + "epoch": 0.90920245398773, + "grad_norm": 0.8104611668604433, + "learning_rate": 4.2933574776303664e-07, + "loss": 0.5476, + "step": 7410 + }, + { + "epoch": 0.9093251533742331, + "grad_norm": 0.8801975506741833, + "learning_rate": 4.281845120097794e-07, + "loss": 0.5122, + "step": 7411 + }, + { + "epoch": 0.9094478527607361, + "grad_norm": 0.8838846274594853, + "learning_rate": 4.270347880424863e-07, + "loss": 0.4971, + "step": 7412 + }, + { + "epoch": 0.9095705521472393, + "grad_norm": 0.8529627985195479, + "learning_rate": 4.2588657604274994e-07, + "loss": 0.4947, + "step": 7413 + }, + { + "epoch": 0.9096932515337424, + 
"grad_norm": 1.0293312450734802, + "learning_rate": 4.247398761919175e-07, + "loss": 0.5719, + "step": 7414 + }, + { + "epoch": 0.9098159509202454, + "grad_norm": 0.8078712826610389, + "learning_rate": 4.235946886711018e-07, + "loss": 0.5071, + "step": 7415 + }, + { + "epoch": 0.9099386503067485, + "grad_norm": 0.848541757573401, + "learning_rate": 4.224510136611759e-07, + "loss": 0.5669, + "step": 7416 + }, + { + "epoch": 0.9100613496932516, + "grad_norm": 0.9538720331137459, + "learning_rate": 4.213088513427721e-07, + "loss": 0.5508, + "step": 7417 + }, + { + "epoch": 0.9101840490797546, + "grad_norm": 0.902356426672235, + "learning_rate": 4.2016820189628603e-07, + "loss": 0.4622, + "step": 7418 + }, + { + "epoch": 0.9103067484662577, + "grad_norm": 0.8666655459406466, + "learning_rate": 4.190290655018736e-07, + "loss": 0.5658, + "step": 7419 + }, + { + "epoch": 0.9104294478527607, + "grad_norm": 0.9062113109835003, + "learning_rate": 4.1789144233945087e-07, + "loss": 0.5161, + "step": 7420 + }, + { + "epoch": 0.9105521472392638, + "grad_norm": 0.7969859230251519, + "learning_rate": 4.1675533258869525e-07, + "loss": 0.4473, + "step": 7421 + }, + { + "epoch": 0.9106748466257669, + "grad_norm": 0.830412160266288, + "learning_rate": 4.1562073642904763e-07, + "loss": 0.4861, + "step": 7422 + }, + { + "epoch": 0.9107975460122699, + "grad_norm": 1.032660197358741, + "learning_rate": 4.144876540397047e-07, + "loss": 0.5141, + "step": 7423 + }, + { + "epoch": 0.910920245398773, + "grad_norm": 0.7847965373401324, + "learning_rate": 4.133560855996299e-07, + "loss": 0.5141, + "step": 7424 + }, + { + "epoch": 0.911042944785276, + "grad_norm": 0.9606505326020213, + "learning_rate": 4.122260312875437e-07, + "loss": 0.5264, + "step": 7425 + }, + { + "epoch": 0.9111656441717791, + "grad_norm": 1.0270659334800423, + "learning_rate": 4.1109749128192875e-07, + "loss": 0.5299, + "step": 7426 + }, + { + "epoch": 0.9112883435582823, + "grad_norm": 0.8204411819909975, + "learning_rate": 4.0997046576102597e-07, + "loss": 0.533, + "step": 7427 + }, + { + "epoch": 0.9114110429447853, + "grad_norm": 0.7648476797634133, + "learning_rate": 4.088449549028428e-07, + "loss": 0.5048, + "step": 7428 + }, + { + "epoch": 0.9115337423312884, + "grad_norm": 0.8874535961098823, + "learning_rate": 4.0772095888514274e-07, + "loss": 0.488, + "step": 7429 + }, + { + "epoch": 0.9116564417177914, + "grad_norm": 1.0557010770431392, + "learning_rate": 4.0659847788544926e-07, + "loss": 0.5622, + "step": 7430 + }, + { + "epoch": 0.9117791411042945, + "grad_norm": 0.9449779739530665, + "learning_rate": 4.0547751208105166e-07, + "loss": 0.4699, + "step": 7431 + }, + { + "epoch": 0.9119018404907976, + "grad_norm": 0.9656441049405622, + "learning_rate": 4.0435806164899507e-07, + "loss": 0.5142, + "step": 7432 + }, + { + "epoch": 0.9120245398773006, + "grad_norm": 1.1177052974153212, + "learning_rate": 4.032401267660879e-07, + "loss": 0.5601, + "step": 7433 + }, + { + "epoch": 0.9121472392638037, + "grad_norm": 0.8487260541301022, + "learning_rate": 4.021237076088957e-07, + "loss": 0.4469, + "step": 7434 + }, + { + "epoch": 0.9122699386503067, + "grad_norm": 0.8837786744185944, + "learning_rate": 4.010088043537519e-07, + "loss": 0.5694, + "step": 7435 + }, + { + "epoch": 0.9123926380368098, + "grad_norm": 0.8465164918896511, + "learning_rate": 3.998954171767422e-07, + "loss": 0.5429, + "step": 7436 + }, + { + "epoch": 0.9125153374233129, + "grad_norm": 0.9423243267531874, + "learning_rate": 3.9878354625371927e-07, + "loss": 0.633, + "step": 
7437 + }, + { + "epoch": 0.9126380368098159, + "grad_norm": 0.8352655937170084, + "learning_rate": 3.976731917602905e-07, + "loss": 0.5858, + "step": 7438 + }, + { + "epoch": 0.912760736196319, + "grad_norm": 0.8917722941975623, + "learning_rate": 3.9656435387183e-07, + "loss": 0.5565, + "step": 7439 + }, + { + "epoch": 0.912883435582822, + "grad_norm": 0.942757064611391, + "learning_rate": 3.954570327634677e-07, + "loss": 0.5265, + "step": 7440 + }, + { + "epoch": 0.9130061349693251, + "grad_norm": 0.830235516480063, + "learning_rate": 3.943512286100937e-07, + "loss": 0.5466, + "step": 7441 + }, + { + "epoch": 0.9131288343558283, + "grad_norm": 0.827327722870745, + "learning_rate": 3.9324694158636266e-07, + "loss": 0.5717, + "step": 7442 + }, + { + "epoch": 0.9132515337423313, + "grad_norm": 0.956783769817238, + "learning_rate": 3.921441718666863e-07, + "loss": 0.5559, + "step": 7443 + }, + { + "epoch": 0.9133742331288344, + "grad_norm": 0.8916235207455981, + "learning_rate": 3.910429196252386e-07, + "loss": 0.4909, + "step": 7444 + }, + { + "epoch": 0.9134969325153375, + "grad_norm": 0.8650931156699319, + "learning_rate": 3.899431850359503e-07, + "loss": 0.5599, + "step": 7445 + }, + { + "epoch": 0.9136196319018405, + "grad_norm": 0.8661079066729692, + "learning_rate": 3.888449682725193e-07, + "loss": 0.5429, + "step": 7446 + }, + { + "epoch": 0.9137423312883436, + "grad_norm": 0.9146790040170167, + "learning_rate": 3.877482695083945e-07, + "loss": 0.5631, + "step": 7447 + }, + { + "epoch": 0.9138650306748466, + "grad_norm": 0.8383102470221414, + "learning_rate": 3.86653088916793e-07, + "loss": 0.5937, + "step": 7448 + }, + { + "epoch": 0.9139877300613497, + "grad_norm": 0.9051441090726875, + "learning_rate": 3.855594266706908e-07, + "loss": 0.5931, + "step": 7449 + }, + { + "epoch": 0.9141104294478528, + "grad_norm": 0.8984880830121664, + "learning_rate": 3.8446728294281865e-07, + "loss": 0.5022, + "step": 7450 + }, + { + "epoch": 0.9142331288343558, + "grad_norm": 0.937066296643228, + "learning_rate": 3.83376657905673e-07, + "loss": 0.469, + "step": 7451 + }, + { + "epoch": 0.9143558282208589, + "grad_norm": 0.9149480485727726, + "learning_rate": 3.8228755173150943e-07, + "loss": 0.4924, + "step": 7452 + }, + { + "epoch": 0.9144785276073619, + "grad_norm": 0.8947095421910775, + "learning_rate": 3.8119996459234144e-07, + "loss": 0.5062, + "step": 7453 + }, + { + "epoch": 0.914601226993865, + "grad_norm": 0.7923631095923394, + "learning_rate": 3.801138966599449e-07, + "loss": 0.6012, + "step": 7454 + }, + { + "epoch": 0.9147239263803681, + "grad_norm": 1.0272177809021807, + "learning_rate": 3.7902934810585603e-07, + "loss": 0.5516, + "step": 7455 + }, + { + "epoch": 0.9148466257668711, + "grad_norm": 0.9593131742467184, + "learning_rate": 3.779463191013666e-07, + "loss": 0.5167, + "step": 7456 + }, + { + "epoch": 0.9149693251533743, + "grad_norm": 0.7549672641036818, + "learning_rate": 3.7686480981753537e-07, + "loss": 0.5251, + "step": 7457 + }, + { + "epoch": 0.9150920245398773, + "grad_norm": 0.880273278436911, + "learning_rate": 3.757848204251746e-07, + "loss": 0.5006, + "step": 7458 + }, + { + "epoch": 0.9152147239263804, + "grad_norm": 0.8988573665425276, + "learning_rate": 3.747063510948623e-07, + "loss": 0.5458, + "step": 7459 + }, + { + "epoch": 0.9153374233128835, + "grad_norm": 0.8413713382931423, + "learning_rate": 3.736294019969311e-07, + "loss": 0.5158, + "step": 7460 + }, + { + "epoch": 0.9154601226993865, + "grad_norm": 0.8109363913856971, + "learning_rate": 
3.72553973301476e-07, + "loss": 0.5678, + "step": 7461 + }, + { + "epoch": 0.9155828220858896, + "grad_norm": 0.7970213440167664, + "learning_rate": 3.7148006517835233e-07, + "loss": 0.4866, + "step": 7462 + }, + { + "epoch": 0.9157055214723926, + "grad_norm": 0.6836740513137952, + "learning_rate": 3.7040767779717435e-07, + "loss": 0.5817, + "step": 7463 + }, + { + "epoch": 0.9158282208588957, + "grad_norm": 0.935417661480535, + "learning_rate": 3.693368113273177e-07, + "loss": 0.5083, + "step": 7464 + }, + { + "epoch": 0.9159509202453988, + "grad_norm": 0.9202132811578458, + "learning_rate": 3.682674659379137e-07, + "loss": 0.5216, + "step": 7465 + }, + { + "epoch": 0.9160736196319018, + "grad_norm": 0.7813202286805786, + "learning_rate": 3.671996417978596e-07, + "loss": 0.5714, + "step": 7466 + }, + { + "epoch": 0.9161963190184049, + "grad_norm": 1.1645564624807967, + "learning_rate": 3.6613333907580595e-07, + "loss": 0.5701, + "step": 7467 + }, + { + "epoch": 0.9163190184049079, + "grad_norm": 0.8314928312503118, + "learning_rate": 3.650685579401692e-07, + "loss": 0.5384, + "step": 7468 + }, + { + "epoch": 0.916441717791411, + "grad_norm": 0.7544333968928595, + "learning_rate": 3.640052985591191e-07, + "loss": 0.549, + "step": 7469 + }, + { + "epoch": 0.9165644171779141, + "grad_norm": 0.934993633828897, + "learning_rate": 3.629435611005916e-07, + "loss": 0.5384, + "step": 7470 + }, + { + "epoch": 0.9166871165644171, + "grad_norm": 0.9009132756780067, + "learning_rate": 3.618833457322779e-07, + "loss": 0.4978, + "step": 7471 + }, + { + "epoch": 0.9168098159509203, + "grad_norm": 0.9918067306038892, + "learning_rate": 3.608246526216275e-07, + "loss": 0.6006, + "step": 7472 + }, + { + "epoch": 0.9169325153374233, + "grad_norm": 0.8906649422561299, + "learning_rate": 3.5976748193585544e-07, + "loss": 0.579, + "step": 7473 + }, + { + "epoch": 0.9170552147239264, + "grad_norm": 0.9101459245661885, + "learning_rate": 3.587118338419293e-07, + "loss": 0.5368, + "step": 7474 + }, + { + "epoch": 0.9171779141104295, + "grad_norm": 0.9804277605063257, + "learning_rate": 3.5765770850658244e-07, + "loss": 0.5737, + "step": 7475 + }, + { + "epoch": 0.9173006134969325, + "grad_norm": 1.867124675145869, + "learning_rate": 3.566051060963016e-07, + "loss": 0.5313, + "step": 7476 + }, + { + "epoch": 0.9174233128834356, + "grad_norm": 0.8571712548384298, + "learning_rate": 3.5555402677734164e-07, + "loss": 0.5584, + "step": 7477 + }, + { + "epoch": 0.9175460122699387, + "grad_norm": 0.7448969011203989, + "learning_rate": 3.545044707157075e-07, + "loss": 0.5693, + "step": 7478 + }, + { + "epoch": 0.9176687116564417, + "grad_norm": 0.8844194640669231, + "learning_rate": 3.5345643807716765e-07, + "loss": 0.4626, + "step": 7479 + }, + { + "epoch": 0.9177914110429448, + "grad_norm": 0.9627117340811194, + "learning_rate": 3.5240992902725204e-07, + "loss": 0.5111, + "step": 7480 + }, + { + "epoch": 0.9179141104294478, + "grad_norm": 0.7990484611069706, + "learning_rate": 3.5136494373124495e-07, + "loss": 0.5453, + "step": 7481 + }, + { + "epoch": 0.9180368098159509, + "grad_norm": 0.8586140760148265, + "learning_rate": 3.503214823541945e-07, + "loss": 0.5517, + "step": 7482 + }, + { + "epoch": 0.918159509202454, + "grad_norm": 0.9828157095252509, + "learning_rate": 3.4927954506090654e-07, + "loss": 0.5587, + "step": 7483 + }, + { + "epoch": 0.918282208588957, + "grad_norm": 0.9734644485757105, + "learning_rate": 3.4823913201594616e-07, + "loss": 0.5136, + "step": 7484 + }, + { + "epoch": 0.9184049079754601, + 
"grad_norm": 1.0046303135260963, + "learning_rate": 3.4720024338363633e-07, + "loss": 0.5324, + "step": 7485 + }, + { + "epoch": 0.9185276073619631, + "grad_norm": 0.7960869777283264, + "learning_rate": 3.4616287932806246e-07, + "loss": 0.5745, + "step": 7486 + }, + { + "epoch": 0.9186503067484663, + "grad_norm": 1.0554984912105172, + "learning_rate": 3.451270400130646e-07, + "loss": 0.552, + "step": 7487 + }, + { + "epoch": 0.9187730061349694, + "grad_norm": 0.8194516388846872, + "learning_rate": 3.440927256022486e-07, + "loss": 0.4913, + "step": 7488 + }, + { + "epoch": 0.9188957055214724, + "grad_norm": 0.9258796407007607, + "learning_rate": 3.4305993625897263e-07, + "loss": 0.5465, + "step": 7489 + }, + { + "epoch": 0.9190184049079755, + "grad_norm": 1.0719219651275331, + "learning_rate": 3.420286721463562e-07, + "loss": 0.6, + "step": 7490 + }, + { + "epoch": 0.9191411042944785, + "grad_norm": 1.0312436203496027, + "learning_rate": 3.409989334272812e-07, + "loss": 0.5117, + "step": 7491 + }, + { + "epoch": 0.9192638036809816, + "grad_norm": 0.829002983020245, + "learning_rate": 3.3997072026438425e-07, + "loss": 0.5617, + "step": 7492 + }, + { + "epoch": 0.9193865030674847, + "grad_norm": 0.8529647378011763, + "learning_rate": 3.389440328200644e-07, + "loss": 0.5431, + "step": 7493 + }, + { + "epoch": 0.9195092024539877, + "grad_norm": 0.8620917823194322, + "learning_rate": 3.3791887125647515e-07, + "loss": 0.5783, + "step": 7494 + }, + { + "epoch": 0.9196319018404908, + "grad_norm": 0.8571305474068214, + "learning_rate": 3.3689523573553597e-07, + "loss": 0.5075, + "step": 7495 + }, + { + "epoch": 0.9197546012269938, + "grad_norm": 0.9069457156852427, + "learning_rate": 3.3587312641891747e-07, + "loss": 0.5857, + "step": 7496 + }, + { + "epoch": 0.9198773006134969, + "grad_norm": 0.8096601422907336, + "learning_rate": 3.3485254346805497e-07, + "loss": 0.5564, + "step": 7497 + }, + { + "epoch": 0.92, + "grad_norm": 0.8590056413833924, + "learning_rate": 3.3383348704414065e-07, + "loss": 0.5849, + "step": 7498 + }, + { + "epoch": 0.920122699386503, + "grad_norm": 1.0851676380957984, + "learning_rate": 3.328159573081258e-07, + "loss": 0.5375, + "step": 7499 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 1.0104866854461876, + "learning_rate": 3.3179995442071956e-07, + "loss": 0.5589, + "step": 7500 + }, + { + "epoch": 0.9203680981595092, + "grad_norm": 0.8466006419386655, + "learning_rate": 3.3078547854239254e-07, + "loss": 0.5439, + "step": 7501 + }, + { + "epoch": 0.9204907975460123, + "grad_norm": 1.0742497711275067, + "learning_rate": 3.2977252983337204e-07, + "loss": 0.5564, + "step": 7502 + }, + { + "epoch": 0.9206134969325154, + "grad_norm": 0.9454706854807093, + "learning_rate": 3.287611084536413e-07, + "loss": 0.5168, + "step": 7503 + }, + { + "epoch": 0.9207361963190184, + "grad_norm": 0.7909942029464009, + "learning_rate": 3.2775121456295024e-07, + "loss": 0.527, + "step": 7504 + }, + { + "epoch": 0.9208588957055215, + "grad_norm": 0.8828204206625895, + "learning_rate": 3.2674284832080127e-07, + "loss": 0.5608, + "step": 7505 + }, + { + "epoch": 0.9209815950920245, + "grad_norm": 0.9524356101231691, + "learning_rate": 3.25736009886457e-07, + "loss": 0.5056, + "step": 7506 + }, + { + "epoch": 0.9211042944785276, + "grad_norm": 0.9505804573837306, + "learning_rate": 3.2473069941893807e-07, + "loss": 0.5838, + "step": 7507 + }, + { + "epoch": 0.9212269938650307, + "grad_norm": 0.8343141085396999, + "learning_rate": 3.237269170770263e-07, + "loss": 0.5483, + "step": 7508 + 
}, + { + "epoch": 0.9213496932515337, + "grad_norm": 0.9642411764710196, + "learning_rate": 3.227246630192593e-07, + "loss": 0.5264, + "step": 7509 + }, + { + "epoch": 0.9214723926380368, + "grad_norm": 0.9676332289962836, + "learning_rate": 3.217239374039338e-07, + "loss": 0.5222, + "step": 7510 + }, + { + "epoch": 0.9215950920245399, + "grad_norm": 0.8256805572210026, + "learning_rate": 3.2072474038910895e-07, + "loss": 0.5704, + "step": 7511 + }, + { + "epoch": 0.9217177914110429, + "grad_norm": 0.915062310366043, + "learning_rate": 3.197270721325951e-07, + "loss": 0.524, + "step": 7512 + }, + { + "epoch": 0.921840490797546, + "grad_norm": 0.8942509334295079, + "learning_rate": 3.1873093279196854e-07, + "loss": 0.5751, + "step": 7513 + }, + { + "epoch": 0.921963190184049, + "grad_norm": 0.8226166434047, + "learning_rate": 3.1773632252455886e-07, + "loss": 0.4973, + "step": 7514 + }, + { + "epoch": 0.9220858895705522, + "grad_norm": 0.968065570765519, + "learning_rate": 3.1674324148745827e-07, + "loss": 0.56, + "step": 7515 + }, + { + "epoch": 0.9222085889570553, + "grad_norm": 0.8288068507398434, + "learning_rate": 3.1575168983751345e-07, + "loss": 0.5493, + "step": 7516 + }, + { + "epoch": 0.9223312883435583, + "grad_norm": 0.9856777333942789, + "learning_rate": 3.1476166773133254e-07, + "loss": 0.5235, + "step": 7517 + }, + { + "epoch": 0.9224539877300614, + "grad_norm": 0.8973701289843019, + "learning_rate": 3.137731753252804e-07, + "loss": 0.5151, + "step": 7518 + }, + { + "epoch": 0.9225766871165644, + "grad_norm": 0.8678522847252154, + "learning_rate": 3.127862127754822e-07, + "loss": 0.5578, + "step": 7519 + }, + { + "epoch": 0.9226993865030675, + "grad_norm": 0.9453305797676378, + "learning_rate": 3.118007802378198e-07, + "loss": 0.4835, + "step": 7520 + }, + { + "epoch": 0.9228220858895706, + "grad_norm": 0.9315943749750814, + "learning_rate": 3.10816877867931e-07, + "loss": 0.5208, + "step": 7521 + }, + { + "epoch": 0.9229447852760736, + "grad_norm": 1.2704634918234092, + "learning_rate": 3.098345058212193e-07, + "loss": 0.5228, + "step": 7522 + }, + { + "epoch": 0.9230674846625767, + "grad_norm": 0.9215286594472275, + "learning_rate": 3.0885366425283835e-07, + "loss": 0.5536, + "step": 7523 + }, + { + "epoch": 0.9231901840490797, + "grad_norm": 0.8872809742973694, + "learning_rate": 3.078743533177053e-07, + "loss": 0.5781, + "step": 7524 + }, + { + "epoch": 0.9233128834355828, + "grad_norm": 0.83803179400091, + "learning_rate": 3.0689657317049205e-07, + "loss": 0.505, + "step": 7525 + }, + { + "epoch": 0.9234355828220859, + "grad_norm": 0.845420465833472, + "learning_rate": 3.0592032396563385e-07, + "loss": 0.5446, + "step": 7526 + }, + { + "epoch": 0.9235582822085889, + "grad_norm": 0.8460236394733033, + "learning_rate": 3.049456058573175e-07, + "loss": 0.5705, + "step": 7527 + }, + { + "epoch": 0.923680981595092, + "grad_norm": 0.8989492439462027, + "learning_rate": 3.0397241899949414e-07, + "loss": 0.5438, + "step": 7528 + }, + { + "epoch": 0.923803680981595, + "grad_norm": 1.1734177136172927, + "learning_rate": 3.030007635458687e-07, + "loss": 0.5445, + "step": 7529 + }, + { + "epoch": 0.9239263803680982, + "grad_norm": 0.9285602570624981, + "learning_rate": 3.020306396499062e-07, + "loss": 0.5147, + "step": 7530 + }, + { + "epoch": 0.9240490797546013, + "grad_norm": 0.9727077663914121, + "learning_rate": 3.010620474648285e-07, + "loss": 0.5867, + "step": 7531 + }, + { + "epoch": 0.9241717791411043, + "grad_norm": 0.8961654356022976, + "learning_rate": 
3.000949871436165e-07, + "loss": 0.5397, + "step": 7532 + }, + { + "epoch": 0.9242944785276074, + "grad_norm": 1.3354074284093773, + "learning_rate": 2.9912945883901144e-07, + "loss": 0.5571, + "step": 7533 + }, + { + "epoch": 0.9244171779141104, + "grad_norm": 0.8816096725732552, + "learning_rate": 2.98165462703508e-07, + "loss": 0.551, + "step": 7534 + }, + { + "epoch": 0.9245398773006135, + "grad_norm": 0.8221115730403888, + "learning_rate": 2.972029988893621e-07, + "loss": 0.5469, + "step": 7535 + }, + { + "epoch": 0.9246625766871166, + "grad_norm": 0.8184402548015195, + "learning_rate": 2.9624206754858443e-07, + "loss": 0.5807, + "step": 7536 + }, + { + "epoch": 0.9247852760736196, + "grad_norm": 0.8441298330531539, + "learning_rate": 2.952826688329491e-07, + "loss": 0.596, + "step": 7537 + }, + { + "epoch": 0.9249079754601227, + "grad_norm": 0.7713913078096033, + "learning_rate": 2.943248028939838e-07, + "loss": 0.5229, + "step": 7538 + }, + { + "epoch": 0.9250306748466258, + "grad_norm": 0.8929092419775989, + "learning_rate": 2.933684698829731e-07, + "loss": 0.5549, + "step": 7539 + }, + { + "epoch": 0.9251533742331288, + "grad_norm": 0.8549783651188786, + "learning_rate": 2.9241366995096387e-07, + "loss": 0.5592, + "step": 7540 + }, + { + "epoch": 0.9252760736196319, + "grad_norm": 0.8203378296346878, + "learning_rate": 2.9146040324875777e-07, + "loss": 0.5222, + "step": 7541 + }, + { + "epoch": 0.9253987730061349, + "grad_norm": 0.9987912823028519, + "learning_rate": 2.9050866992691553e-07, + "loss": 0.529, + "step": 7542 + }, + { + "epoch": 0.925521472392638, + "grad_norm": 0.9641903941566279, + "learning_rate": 2.8955847013575457e-07, + "loss": 0.5587, + "step": 7543 + }, + { + "epoch": 0.9256441717791412, + "grad_norm": 0.9450199654049142, + "learning_rate": 2.8860980402535156e-07, + "loss": 0.5935, + "step": 7544 + }, + { + "epoch": 0.9257668711656442, + "grad_norm": 0.800390942284583, + "learning_rate": 2.8766267174553884e-07, + "loss": 0.5256, + "step": 7545 + }, + { + "epoch": 0.9258895705521473, + "grad_norm": 0.9424307017600538, + "learning_rate": 2.8671707344591016e-07, + "loss": 0.5568, + "step": 7546 + }, + { + "epoch": 0.9260122699386503, + "grad_norm": 0.8853648252513517, + "learning_rate": 2.857730092758115e-07, + "loss": 0.5129, + "step": 7547 + }, + { + "epoch": 0.9261349693251534, + "grad_norm": 0.8090251116408296, + "learning_rate": 2.848304793843526e-07, + "loss": 0.5378, + "step": 7548 + }, + { + "epoch": 0.9262576687116565, + "grad_norm": 0.8759796499068394, + "learning_rate": 2.838894839203965e-07, + "loss": 0.5223, + "step": 7549 + }, + { + "epoch": 0.9263803680981595, + "grad_norm": 0.9482932472275186, + "learning_rate": 2.8295002303256546e-07, + "loss": 0.5168, + "step": 7550 + }, + { + "epoch": 0.9265030674846626, + "grad_norm": 0.8187116187521393, + "learning_rate": 2.820120968692397e-07, + "loss": 0.5178, + "step": 7551 + }, + { + "epoch": 0.9266257668711656, + "grad_norm": 0.9148589207928729, + "learning_rate": 2.810757055785551e-07, + "loss": 0.5475, + "step": 7552 + }, + { + "epoch": 0.9267484662576687, + "grad_norm": 0.9916133796342039, + "learning_rate": 2.80140849308409e-07, + "loss": 0.5475, + "step": 7553 + }, + { + "epoch": 0.9268711656441718, + "grad_norm": 0.9798153974806058, + "learning_rate": 2.7920752820645105e-07, + "loss": 0.5261, + "step": 7554 + }, + { + "epoch": 0.9269938650306748, + "grad_norm": 0.9036054160776944, + "learning_rate": 2.7827574242009434e-07, + "loss": 0.5028, + "step": 7555 + }, + { + "epoch": 0.9271165644171779, + 
"grad_norm": 0.9298008136060799, + "learning_rate": 2.773454920965046e-07, + "loss": 0.4905, + "step": 7556 + }, + { + "epoch": 0.9272392638036809, + "grad_norm": 0.8561541989823532, + "learning_rate": 2.7641677738260763e-07, + "loss": 0.5813, + "step": 7557 + }, + { + "epoch": 0.927361963190184, + "grad_norm": 0.8356321346019722, + "learning_rate": 2.754895984250838e-07, + "loss": 0.5714, + "step": 7558 + }, + { + "epoch": 0.9274846625766872, + "grad_norm": 0.9196948180115966, + "learning_rate": 2.7456395537037605e-07, + "loss": 0.5954, + "step": 7559 + }, + { + "epoch": 0.9276073619631902, + "grad_norm": 0.9416633025360703, + "learning_rate": 2.736398483646807e-07, + "loss": 0.4666, + "step": 7560 + }, + { + "epoch": 0.9277300613496933, + "grad_norm": 0.9036766443398977, + "learning_rate": 2.727172775539522e-07, + "loss": 0.5323, + "step": 7561 + }, + { + "epoch": 0.9278527607361963, + "grad_norm": 0.8976458448003696, + "learning_rate": 2.7179624308390387e-07, + "loss": 0.5446, + "step": 7562 + }, + { + "epoch": 0.9279754601226994, + "grad_norm": 0.8585975502843252, + "learning_rate": 2.708767451000016e-07, + "loss": 0.444, + "step": 7563 + }, + { + "epoch": 0.9280981595092025, + "grad_norm": 0.8639406488641246, + "learning_rate": 2.699587837474771e-07, + "loss": 0.4857, + "step": 7564 + }, + { + "epoch": 0.9282208588957055, + "grad_norm": 0.8854337291849325, + "learning_rate": 2.6904235917131094e-07, + "loss": 0.5659, + "step": 7565 + }, + { + "epoch": 0.9283435582822086, + "grad_norm": 0.8655500273630606, + "learning_rate": 2.681274715162463e-07, + "loss": 0.5474, + "step": 7566 + }, + { + "epoch": 0.9284662576687116, + "grad_norm": 0.9017144082508365, + "learning_rate": 2.6721412092677976e-07, + "loss": 0.4899, + "step": 7567 + }, + { + "epoch": 0.9285889570552147, + "grad_norm": 0.8179622963208683, + "learning_rate": 2.6630230754717047e-07, + "loss": 0.5435, + "step": 7568 + }, + { + "epoch": 0.9287116564417178, + "grad_norm": 0.9521427992472443, + "learning_rate": 2.653920315214287e-07, + "loss": 0.5407, + "step": 7569 + }, + { + "epoch": 0.9288343558282208, + "grad_norm": 0.8789366482087265, + "learning_rate": 2.64483292993325e-07, + "loss": 0.4425, + "step": 7570 + }, + { + "epoch": 0.9289570552147239, + "grad_norm": 0.9028609320628551, + "learning_rate": 2.63576092106389e-07, + "loss": 0.6178, + "step": 7571 + }, + { + "epoch": 0.929079754601227, + "grad_norm": 0.9115679715287299, + "learning_rate": 2.6267042900390173e-07, + "loss": 0.481, + "step": 7572 + }, + { + "epoch": 0.92920245398773, + "grad_norm": 0.8878041442834597, + "learning_rate": 2.6176630382890864e-07, + "loss": 0.4664, + "step": 7573 + }, + { + "epoch": 0.9293251533742332, + "grad_norm": 0.8932057058738765, + "learning_rate": 2.6086371672420565e-07, + "loss": 0.513, + "step": 7574 + }, + { + "epoch": 0.9294478527607362, + "grad_norm": 0.8177341624903088, + "learning_rate": 2.599626678323508e-07, + "loss": 0.5703, + "step": 7575 + }, + { + "epoch": 0.9295705521472393, + "grad_norm": 0.9181622375883447, + "learning_rate": 2.590631572956559e-07, + "loss": 0.5875, + "step": 7576 + }, + { + "epoch": 0.9296932515337424, + "grad_norm": 0.8122047965367964, + "learning_rate": 2.581651852561906e-07, + "loss": 0.5622, + "step": 7577 + }, + { + "epoch": 0.9298159509202454, + "grad_norm": 0.9258701796637068, + "learning_rate": 2.5726875185578147e-07, + "loss": 0.4756, + "step": 7578 + }, + { + "epoch": 0.9299386503067485, + "grad_norm": 0.9268989978629428, + "learning_rate": 2.563738572360153e-07, + "loss": 0.5427, + "step": 
7579 + }, + { + "epoch": 0.9300613496932515, + "grad_norm": 0.8917366543733088, + "learning_rate": 2.554805015382289e-07, + "loss": 0.5362, + "step": 7580 + }, + { + "epoch": 0.9301840490797546, + "grad_norm": 0.8995970174155917, + "learning_rate": 2.5458868490352395e-07, + "loss": 0.5361, + "step": 7581 + }, + { + "epoch": 0.9303067484662577, + "grad_norm": 0.7834572883482875, + "learning_rate": 2.536984074727522e-07, + "loss": 0.5357, + "step": 7582 + }, + { + "epoch": 0.9304294478527607, + "grad_norm": 0.8257532550787948, + "learning_rate": 2.528096693865267e-07, + "loss": 0.5446, + "step": 7583 + }, + { + "epoch": 0.9305521472392638, + "grad_norm": 0.8745583127758826, + "learning_rate": 2.519224707852164e-07, + "loss": 0.5043, + "step": 7584 + }, + { + "epoch": 0.9306748466257668, + "grad_norm": 1.2171262296651313, + "learning_rate": 2.5103681180894566e-07, + "loss": 0.5621, + "step": 7585 + }, + { + "epoch": 0.9307975460122699, + "grad_norm": 0.8602589078978837, + "learning_rate": 2.501526925975961e-07, + "loss": 0.5066, + "step": 7586 + }, + { + "epoch": 0.930920245398773, + "grad_norm": 0.9607643756777814, + "learning_rate": 2.492701132908093e-07, + "loss": 0.571, + "step": 7587 + }, + { + "epoch": 0.931042944785276, + "grad_norm": 0.8988623152199674, + "learning_rate": 2.483890740279793e-07, + "loss": 0.5108, + "step": 7588 + }, + { + "epoch": 0.9311656441717792, + "grad_norm": 0.8980205256719431, + "learning_rate": 2.4750957494826033e-07, + "loss": 0.5543, + "step": 7589 + }, + { + "epoch": 0.9312883435582822, + "grad_norm": 0.8685832784635897, + "learning_rate": 2.4663161619055797e-07, + "loss": 0.5536, + "step": 7590 + }, + { + "epoch": 0.9314110429447853, + "grad_norm": 0.8719090062423321, + "learning_rate": 2.457551978935424e-07, + "loss": 0.5596, + "step": 7591 + }, + { + "epoch": 0.9315337423312884, + "grad_norm": 0.9610072179891403, + "learning_rate": 2.44880320195634e-07, + "loss": 0.5363, + "step": 7592 + }, + { + "epoch": 0.9316564417177914, + "grad_norm": 0.8310964290772538, + "learning_rate": 2.4400698323501224e-07, + "loss": 0.5588, + "step": 7593 + }, + { + "epoch": 0.9317791411042945, + "grad_norm": 1.081876073908358, + "learning_rate": 2.431351871496146e-07, + "loss": 0.5657, + "step": 7594 + }, + { + "epoch": 0.9319018404907975, + "grad_norm": 0.8479888391156825, + "learning_rate": 2.422649320771331e-07, + "loss": 0.54, + "step": 7595 + }, + { + "epoch": 0.9320245398773006, + "grad_norm": 0.8500261692107108, + "learning_rate": 2.413962181550156e-07, + "loss": 0.5476, + "step": 7596 + }, + { + "epoch": 0.9321472392638037, + "grad_norm": 0.8553951419614974, + "learning_rate": 2.4052904552047006e-07, + "loss": 0.544, + "step": 7597 + }, + { + "epoch": 0.9322699386503067, + "grad_norm": 0.7415674542923785, + "learning_rate": 2.3966341431045813e-07, + "loss": 0.4743, + "step": 7598 + }, + { + "epoch": 0.9323926380368098, + "grad_norm": 1.0651240002236335, + "learning_rate": 2.387993246616993e-07, + "loss": 0.5682, + "step": 7599 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 0.8655865613754825, + "learning_rate": 2.3793677671066882e-07, + "loss": 0.505, + "step": 7600 + }, + { + "epoch": 0.9326380368098159, + "grad_norm": 0.8542283346978339, + "learning_rate": 2.3707577059359665e-07, + "loss": 0.5809, + "step": 7601 + }, + { + "epoch": 0.932760736196319, + "grad_norm": 0.9080965938263591, + "learning_rate": 2.362163064464751e-07, + "loss": 0.572, + "step": 7602 + }, + { + "epoch": 0.932883435582822, + "grad_norm": 0.8955925709757271, + "learning_rate": 
2.3535838440504554e-07, + "loss": 0.5199, + "step": 7603 + }, + { + "epoch": 0.9330061349693252, + "grad_norm": 0.7904860606142682, + "learning_rate": 2.3450200460481298e-07, + "loss": 0.515, + "step": 7604 + }, + { + "epoch": 0.9331288343558283, + "grad_norm": 0.8743855811541881, + "learning_rate": 2.3364716718103143e-07, + "loss": 0.5171, + "step": 7605 + }, + { + "epoch": 0.9332515337423313, + "grad_norm": 0.8744709683961568, + "learning_rate": 2.3279387226871842e-07, + "loss": 0.589, + "step": 7606 + }, + { + "epoch": 0.9333742331288344, + "grad_norm": 0.8621733817566045, + "learning_rate": 2.3194212000264283e-07, + "loss": 0.5319, + "step": 7607 + }, + { + "epoch": 0.9334969325153374, + "grad_norm": 0.9335473582405158, + "learning_rate": 2.310919105173315e-07, + "loss": 0.5601, + "step": 7608 + }, + { + "epoch": 0.9336196319018405, + "grad_norm": 0.7732937497079384, + "learning_rate": 2.302432439470681e-07, + "loss": 0.5633, + "step": 7609 + }, + { + "epoch": 0.9337423312883436, + "grad_norm": 0.9803789962548172, + "learning_rate": 2.293961204258932e-07, + "loss": 0.5792, + "step": 7610 + }, + { + "epoch": 0.9338650306748466, + "grad_norm": 0.8530329599144056, + "learning_rate": 2.2855054008760202e-07, + "loss": 0.5053, + "step": 7611 + }, + { + "epoch": 0.9339877300613497, + "grad_norm": 0.8765599561040889, + "learning_rate": 2.2770650306574438e-07, + "loss": 0.4545, + "step": 7612 + }, + { + "epoch": 0.9341104294478527, + "grad_norm": 0.8828643506759196, + "learning_rate": 2.2686400949363142e-07, + "loss": 0.5338, + "step": 7613 + }, + { + "epoch": 0.9342331288343558, + "grad_norm": 0.8336649754316103, + "learning_rate": 2.2602305950432669e-07, + "loss": 0.5746, + "step": 7614 + }, + { + "epoch": 0.9343558282208589, + "grad_norm": 0.8797789660178233, + "learning_rate": 2.2518365323065284e-07, + "loss": 0.5622, + "step": 7615 + }, + { + "epoch": 0.9344785276073619, + "grad_norm": 0.8594839357571816, + "learning_rate": 2.2434579080518382e-07, + "loss": 0.5327, + "step": 7616 + }, + { + "epoch": 0.934601226993865, + "grad_norm": 0.8166807945920967, + "learning_rate": 2.23509472360256e-07, + "loss": 0.5557, + "step": 7617 + }, + { + "epoch": 0.934723926380368, + "grad_norm": 0.8135498539483398, + "learning_rate": 2.2267469802795705e-07, + "loss": 0.5558, + "step": 7618 + }, + { + "epoch": 0.9348466257668712, + "grad_norm": 0.8700040381602447, + "learning_rate": 2.218414679401304e-07, + "loss": 0.4733, + "step": 7619 + }, + { + "epoch": 0.9349693251533743, + "grad_norm": 0.8315400348015413, + "learning_rate": 2.2100978222838186e-07, + "loss": 0.5811, + "step": 7620 + }, + { + "epoch": 0.9350920245398773, + "grad_norm": 0.9401154718990399, + "learning_rate": 2.2017964102406532e-07, + "loss": 0.5282, + "step": 7621 + }, + { + "epoch": 0.9352147239263804, + "grad_norm": 0.9500045429462696, + "learning_rate": 2.193510444582958e-07, + "loss": 0.5346, + "step": 7622 + }, + { + "epoch": 0.9353374233128834, + "grad_norm": 0.7914214971648168, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.559, + "step": 7623 + }, + { + "epoch": 0.9354601226993865, + "grad_norm": 0.8119545736654481, + "learning_rate": 2.176984857656339e-07, + "loss": 0.5252, + "step": 7624 + }, + { + "epoch": 0.9355828220858896, + "grad_norm": 1.0233526220058693, + "learning_rate": 2.1687452389974829e-07, + "loss": 0.5464, + "step": 7625 + }, + { + "epoch": 0.9357055214723926, + "grad_norm": 0.8091866580863111, + "learning_rate": 2.1605210719442548e-07, + "loss": 0.5072, + "step": 7626 + }, + { + "epoch": 
0.9358282208588957, + "grad_norm": 0.8813932050483524, + "learning_rate": 2.1523123577955717e-07, + "loss": 0.5265, + "step": 7627 + }, + { + "epoch": 0.9359509202453987, + "grad_norm": 0.8929870905472025, + "learning_rate": 2.144119097847941e-07, + "loss": 0.5456, + "step": 7628 + }, + { + "epoch": 0.9360736196319018, + "grad_norm": 0.8782916594008786, + "learning_rate": 2.135941293395416e-07, + "loss": 0.5847, + "step": 7629 + }, + { + "epoch": 0.9361963190184049, + "grad_norm": 0.8565034196301514, + "learning_rate": 2.1277789457296306e-07, + "loss": 0.5632, + "step": 7630 + }, + { + "epoch": 0.9363190184049079, + "grad_norm": 0.844899767743473, + "learning_rate": 2.11963205613972e-07, + "loss": 0.5434, + "step": 7631 + }, + { + "epoch": 0.936441717791411, + "grad_norm": 0.9689671949174902, + "learning_rate": 2.1115006259124326e-07, + "loss": 0.5356, + "step": 7632 + }, + { + "epoch": 0.9365644171779142, + "grad_norm": 0.9707503040787632, + "learning_rate": 2.1033846563320748e-07, + "loss": 0.5437, + "step": 7633 + }, + { + "epoch": 0.9366871165644172, + "grad_norm": 0.9249252945357465, + "learning_rate": 2.0952841486804653e-07, + "loss": 0.5628, + "step": 7634 + }, + { + "epoch": 0.9368098159509203, + "grad_norm": 0.8195038281796737, + "learning_rate": 2.0871991042370255e-07, + "loss": 0.5267, + "step": 7635 + }, + { + "epoch": 0.9369325153374233, + "grad_norm": 0.8997642795849274, + "learning_rate": 2.0791295242787112e-07, + "loss": 0.5266, + "step": 7636 + }, + { + "epoch": 0.9370552147239264, + "grad_norm": 0.8856971853879255, + "learning_rate": 2.071075410080059e-07, + "loss": 0.5305, + "step": 7637 + }, + { + "epoch": 0.9371779141104295, + "grad_norm": 0.9472506976944727, + "learning_rate": 2.0630367629131288e-07, + "loss": 0.5389, + "step": 7638 + }, + { + "epoch": 0.9373006134969325, + "grad_norm": 0.8510422845502141, + "learning_rate": 2.0550135840475605e-07, + "loss": 0.5462, + "step": 7639 + }, + { + "epoch": 0.9374233128834356, + "grad_norm": 0.871285391453543, + "learning_rate": 2.0470058747505516e-07, + "loss": 0.4836, + "step": 7640 + }, + { + "epoch": 0.9375460122699386, + "grad_norm": 0.9429115913112948, + "learning_rate": 2.0390136362868462e-07, + "loss": 0.5552, + "step": 7641 + }, + { + "epoch": 0.9376687116564417, + "grad_norm": 0.8272794751615381, + "learning_rate": 2.0310368699187565e-07, + "loss": 0.5393, + "step": 7642 + }, + { + "epoch": 0.9377914110429448, + "grad_norm": 0.8172633875037947, + "learning_rate": 2.0230755769061306e-07, + "loss": 0.5735, + "step": 7643 + }, + { + "epoch": 0.9379141104294478, + "grad_norm": 0.8245565627624354, + "learning_rate": 2.0151297585064067e-07, + "loss": 0.5792, + "step": 7644 + }, + { + "epoch": 0.938036809815951, + "grad_norm": 0.9262135645727948, + "learning_rate": 2.0071994159745367e-07, + "loss": 0.5743, + "step": 7645 + }, + { + "epoch": 0.938159509202454, + "grad_norm": 0.7550855662096896, + "learning_rate": 1.9992845505630741e-07, + "loss": 0.5085, + "step": 7646 + }, + { + "epoch": 0.9382822085889571, + "grad_norm": 0.9187299015455275, + "learning_rate": 1.9913851635220748e-07, + "loss": 0.5845, + "step": 7647 + }, + { + "epoch": 0.9384049079754602, + "grad_norm": 0.8909059619840108, + "learning_rate": 1.9835012560992074e-07, + "loss": 0.5101, + "step": 7648 + }, + { + "epoch": 0.9385276073619632, + "grad_norm": 0.7804747592469741, + "learning_rate": 1.9756328295396532e-07, + "loss": 0.5248, + "step": 7649 + }, + { + "epoch": 0.9386503067484663, + "grad_norm": 0.8819618326410679, + "learning_rate": 
1.9677798850861517e-07, + "loss": 0.5259, + "step": 7650 + }, + { + "epoch": 0.9387730061349693, + "grad_norm": 0.8246214009270953, + "learning_rate": 1.959942423979022e-07, + "loss": 0.5239, + "step": 7651 + }, + { + "epoch": 0.9388957055214724, + "grad_norm": 0.9543069258305177, + "learning_rate": 1.9521204474561294e-07, + "loss": 0.5711, + "step": 7652 + }, + { + "epoch": 0.9390184049079755, + "grad_norm": 0.814633320660075, + "learning_rate": 1.9443139567528747e-07, + "loss": 0.5955, + "step": 7653 + }, + { + "epoch": 0.9391411042944785, + "grad_norm": 0.8090273065169049, + "learning_rate": 1.9365229531022267e-07, + "loss": 0.5988, + "step": 7654 + }, + { + "epoch": 0.9392638036809816, + "grad_norm": 0.9133785033004433, + "learning_rate": 1.9287474377347238e-07, + "loss": 0.5347, + "step": 7655 + }, + { + "epoch": 0.9393865030674846, + "grad_norm": 0.9310218975200225, + "learning_rate": 1.9209874118784167e-07, + "loss": 0.5342, + "step": 7656 + }, + { + "epoch": 0.9395092024539877, + "grad_norm": 0.9211088784579949, + "learning_rate": 1.9132428767589471e-07, + "loss": 0.5309, + "step": 7657 + }, + { + "epoch": 0.9396319018404908, + "grad_norm": 0.9924862283270086, + "learning_rate": 1.905513833599504e-07, + "loss": 0.5051, + "step": 7658 + }, + { + "epoch": 0.9397546012269938, + "grad_norm": 0.81239560396409, + "learning_rate": 1.897800283620821e-07, + "loss": 0.5916, + "step": 7659 + }, + { + "epoch": 0.939877300613497, + "grad_norm": 0.9719289386464892, + "learning_rate": 1.8901022280411906e-07, + "loss": 0.5463, + "step": 7660 + }, + { + "epoch": 0.94, + "grad_norm": 0.8670357732796907, + "learning_rate": 1.8824196680764296e-07, + "loss": 0.4647, + "step": 7661 + }, + { + "epoch": 0.9401226993865031, + "grad_norm": 0.9939757483102359, + "learning_rate": 1.8747526049399668e-07, + "loss": 0.5652, + "step": 7662 + }, + { + "epoch": 0.9402453987730062, + "grad_norm": 1.221556425444844, + "learning_rate": 1.8671010398427225e-07, + "loss": 0.5715, + "step": 7663 + }, + { + "epoch": 0.9403680981595092, + "grad_norm": 0.9403638653569756, + "learning_rate": 1.859464973993208e-07, + "loss": 0.5096, + "step": 7664 + }, + { + "epoch": 0.9404907975460123, + "grad_norm": 0.8293658238133718, + "learning_rate": 1.8518444085974697e-07, + "loss": 0.5363, + "step": 7665 + }, + { + "epoch": 0.9406134969325154, + "grad_norm": 0.8710341325565231, + "learning_rate": 1.8442393448591113e-07, + "loss": 0.5432, + "step": 7666 + }, + { + "epoch": 0.9407361963190184, + "grad_norm": 0.8069240700291898, + "learning_rate": 1.836649783979294e-07, + "loss": 0.5135, + "step": 7667 + }, + { + "epoch": 0.9408588957055215, + "grad_norm": 0.9092357850140995, + "learning_rate": 1.829075727156715e-07, + "loss": 0.5815, + "step": 7668 + }, + { + "epoch": 0.9409815950920245, + "grad_norm": 0.9225845271122357, + "learning_rate": 1.8215171755876393e-07, + "loss": 0.5064, + "step": 7669 + }, + { + "epoch": 0.9411042944785276, + "grad_norm": 0.8264506850899066, + "learning_rate": 1.8139741304658566e-07, + "loss": 0.5742, + "step": 7670 + }, + { + "epoch": 0.9412269938650307, + "grad_norm": 0.9552035010259564, + "learning_rate": 1.806446592982758e-07, + "loss": 0.5679, + "step": 7671 + }, + { + "epoch": 0.9413496932515337, + "grad_norm": 0.9579406160522664, + "learning_rate": 1.798934564327226e-07, + "loss": 0.5152, + "step": 7672 + }, + { + "epoch": 0.9414723926380368, + "grad_norm": 0.81901206047957, + "learning_rate": 1.791438045685734e-07, + "loss": 0.5116, + "step": 7673 + }, + { + "epoch": 0.9415950920245398, + "grad_norm": 
0.8558304063110608, + "learning_rate": 1.783957038242279e-07, + "loss": 0.5009, + "step": 7674 + }, + { + "epoch": 0.941717791411043, + "grad_norm": 0.8299164660552679, + "learning_rate": 1.776491543178438e-07, + "loss": 0.4387, + "step": 7675 + }, + { + "epoch": 0.9418404907975461, + "grad_norm": 0.9999853101741261, + "learning_rate": 1.7690415616733124e-07, + "loss": 0.541, + "step": 7676 + }, + { + "epoch": 0.9419631901840491, + "grad_norm": 0.7709762741294895, + "learning_rate": 1.7616070949035835e-07, + "loss": 0.583, + "step": 7677 + }, + { + "epoch": 0.9420858895705522, + "grad_norm": 0.8454471510472822, + "learning_rate": 1.7541881440434227e-07, + "loss": 0.4878, + "step": 7678 + }, + { + "epoch": 0.9422085889570552, + "grad_norm": 0.9798398583392104, + "learning_rate": 1.7467847102646262e-07, + "loss": 0.5362, + "step": 7679 + }, + { + "epoch": 0.9423312883435583, + "grad_norm": 0.8296712597070192, + "learning_rate": 1.739396794736481e-07, + "loss": 0.5908, + "step": 7680 + }, + { + "epoch": 0.9424539877300614, + "grad_norm": 0.8291492741170345, + "learning_rate": 1.7320243986258423e-07, + "loss": 0.4702, + "step": 7681 + }, + { + "epoch": 0.9425766871165644, + "grad_norm": 0.8627463716517106, + "learning_rate": 1.7246675230971345e-07, + "loss": 0.4814, + "step": 7682 + }, + { + "epoch": 0.9426993865030675, + "grad_norm": 0.7810393588881607, + "learning_rate": 1.7173261693122945e-07, + "loss": 0.5354, + "step": 7683 + }, + { + "epoch": 0.9428220858895705, + "grad_norm": 0.8900708208270579, + "learning_rate": 1.7100003384308506e-07, + "loss": 0.5126, + "step": 7684 + }, + { + "epoch": 0.9429447852760736, + "grad_norm": 1.0069723758875695, + "learning_rate": 1.7026900316098217e-07, + "loss": 0.4687, + "step": 7685 + }, + { + "epoch": 0.9430674846625767, + "grad_norm": 0.8609129075940629, + "learning_rate": 1.6953952500038396e-07, + "loss": 0.5421, + "step": 7686 + }, + { + "epoch": 0.9431901840490797, + "grad_norm": 0.7839481639626672, + "learning_rate": 1.6881159947650272e-07, + "loss": 0.5053, + "step": 7687 + }, + { + "epoch": 0.9433128834355828, + "grad_norm": 0.7807776822210837, + "learning_rate": 1.6808522670430981e-07, + "loss": 0.5487, + "step": 7688 + }, + { + "epoch": 0.9434355828220858, + "grad_norm": 0.8703237969122459, + "learning_rate": 1.6736040679852906e-07, + "loss": 0.5683, + "step": 7689 + }, + { + "epoch": 0.943558282208589, + "grad_norm": 0.8388813005696386, + "learning_rate": 1.6663713987363882e-07, + "loss": 0.5389, + "step": 7690 + }, + { + "epoch": 0.9436809815950921, + "grad_norm": 0.8756883477732642, + "learning_rate": 1.6591542604387445e-07, + "loss": 0.5614, + "step": 7691 + }, + { + "epoch": 0.9438036809815951, + "grad_norm": 1.0051975971181581, + "learning_rate": 1.651952654232225e-07, + "loss": 0.4999, + "step": 7692 + }, + { + "epoch": 0.9439263803680982, + "grad_norm": 0.9583848977325528, + "learning_rate": 1.6447665812542758e-07, + "loss": 0.5877, + "step": 7693 + }, + { + "epoch": 0.9440490797546012, + "grad_norm": 0.8995788508169946, + "learning_rate": 1.6375960426398552e-07, + "loss": 0.5172, + "step": 7694 + }, + { + "epoch": 0.9441717791411043, + "grad_norm": 0.8292480748842063, + "learning_rate": 1.6304410395215243e-07, + "loss": 0.5173, + "step": 7695 + }, + { + "epoch": 0.9442944785276074, + "grad_norm": 0.7552486294400121, + "learning_rate": 1.6233015730293123e-07, + "loss": 0.5517, + "step": 7696 + }, + { + "epoch": 0.9444171779141104, + "grad_norm": 0.8217640799123916, + "learning_rate": 1.6161776442908728e-07, + "loss": 0.5239, + 
"step": 7697 + }, + { + "epoch": 0.9445398773006135, + "grad_norm": 0.8574742569147638, + "learning_rate": 1.6090692544313502e-07, + "loss": 0.4863, + "step": 7698 + }, + { + "epoch": 0.9446625766871166, + "grad_norm": 0.9242108827457631, + "learning_rate": 1.6019764045734576e-07, + "loss": 0.5477, + "step": 7699 + }, + { + "epoch": 0.9447852760736196, + "grad_norm": 0.9049158279481482, + "learning_rate": 1.5948990958374543e-07, + "loss": 0.49, + "step": 7700 + }, + { + "epoch": 0.9449079754601227, + "grad_norm": 0.9115432983091347, + "learning_rate": 1.5878373293411242e-07, + "loss": 0.5813, + "step": 7701 + }, + { + "epoch": 0.9450306748466257, + "grad_norm": 0.9292352098211931, + "learning_rate": 1.5807911061998304e-07, + "loss": 0.5694, + "step": 7702 + }, + { + "epoch": 0.9451533742331288, + "grad_norm": 0.8073206602144581, + "learning_rate": 1.5737604275264496e-07, + "loss": 0.4357, + "step": 7703 + }, + { + "epoch": 0.945276073619632, + "grad_norm": 0.8766738765983215, + "learning_rate": 1.5667452944314377e-07, + "loss": 0.5222, + "step": 7704 + }, + { + "epoch": 0.945398773006135, + "grad_norm": 0.8139228996573203, + "learning_rate": 1.559745708022753e-07, + "loss": 0.5097, + "step": 7705 + }, + { + "epoch": 0.9455214723926381, + "grad_norm": 0.9513168724377031, + "learning_rate": 1.5527616694059333e-07, + "loss": 0.5119, + "step": 7706 + }, + { + "epoch": 0.9456441717791411, + "grad_norm": 0.7889601690576005, + "learning_rate": 1.5457931796840298e-07, + "loss": 0.5614, + "step": 7707 + }, + { + "epoch": 0.9457668711656442, + "grad_norm": 0.9838178381617247, + "learning_rate": 1.538840239957684e-07, + "loss": 0.521, + "step": 7708 + }, + { + "epoch": 0.9458895705521473, + "grad_norm": 0.8744909983388373, + "learning_rate": 1.5319028513250177e-07, + "loss": 0.4966, + "step": 7709 + }, + { + "epoch": 0.9460122699386503, + "grad_norm": 0.8921724605540216, + "learning_rate": 1.5249810148817658e-07, + "loss": 0.5687, + "step": 7710 + }, + { + "epoch": 0.9461349693251534, + "grad_norm": 1.209113002907251, + "learning_rate": 1.518074731721153e-07, + "loss": 0.5876, + "step": 7711 + }, + { + "epoch": 0.9462576687116564, + "grad_norm": 0.9635637891950973, + "learning_rate": 1.5111840029339743e-07, + "loss": 0.5495, + "step": 7712 + }, + { + "epoch": 0.9463803680981595, + "grad_norm": 1.1200147223676544, + "learning_rate": 1.5043088296085583e-07, + "loss": 0.5408, + "step": 7713 + }, + { + "epoch": 0.9465030674846626, + "grad_norm": 0.8573047039496922, + "learning_rate": 1.4974492128307817e-07, + "loss": 0.5255, + "step": 7714 + }, + { + "epoch": 0.9466257668711656, + "grad_norm": 0.7657788614244504, + "learning_rate": 1.490605153684066e-07, + "loss": 0.5362, + "step": 7715 + }, + { + "epoch": 0.9467484662576687, + "grad_norm": 0.9773277321900748, + "learning_rate": 1.4837766532493469e-07, + "loss": 0.5498, + "step": 7716 + }, + { + "epoch": 0.9468711656441717, + "grad_norm": 0.902585608696185, + "learning_rate": 1.4769637126051618e-07, + "loss": 0.5674, + "step": 7717 + }, + { + "epoch": 0.9469938650306748, + "grad_norm": 0.8763207549232158, + "learning_rate": 1.4701663328275383e-07, + "loss": 0.5285, + "step": 7718 + }, + { + "epoch": 0.947116564417178, + "grad_norm": 0.8768281618984947, + "learning_rate": 1.4633845149900738e-07, + "loss": 0.5541, + "step": 7719 + }, + { + "epoch": 0.947239263803681, + "grad_norm": 0.8869515042917104, + "learning_rate": 1.4566182601638779e-07, + "loss": 0.5143, + "step": 7720 + }, + { + "epoch": 0.9473619631901841, + "grad_norm": 0.9368384003538297, + 
"learning_rate": 1.4498675694176402e-07, + "loss": 0.5812, + "step": 7721 + }, + { + "epoch": 0.9474846625766871, + "grad_norm": 0.9469996357829905, + "learning_rate": 1.4431324438175742e-07, + "loss": 0.6059, + "step": 7722 + }, + { + "epoch": 0.9476073619631902, + "grad_norm": 0.9447165122015475, + "learning_rate": 1.4364128844274295e-07, + "loss": 0.5044, + "step": 7723 + }, + { + "epoch": 0.9477300613496933, + "grad_norm": 0.8359623355929172, + "learning_rate": 1.4297088923084902e-07, + "loss": 0.5105, + "step": 7724 + }, + { + "epoch": 0.9478527607361963, + "grad_norm": 0.859800367679731, + "learning_rate": 1.4230204685196202e-07, + "loss": 0.5108, + "step": 7725 + }, + { + "epoch": 0.9479754601226994, + "grad_norm": 0.8114845303622485, + "learning_rate": 1.4163476141171862e-07, + "loss": 0.5743, + "step": 7726 + }, + { + "epoch": 0.9480981595092025, + "grad_norm": 0.8753835570874497, + "learning_rate": 1.4096903301551e-07, + "loss": 0.5424, + "step": 7727 + }, + { + "epoch": 0.9482208588957055, + "grad_norm": 0.8554718470001779, + "learning_rate": 1.4030486176848434e-07, + "loss": 0.5074, + "step": 7728 + }, + { + "epoch": 0.9483435582822086, + "grad_norm": 0.8498620305048771, + "learning_rate": 1.3964224777553992e-07, + "loss": 0.5463, + "step": 7729 + }, + { + "epoch": 0.9484662576687116, + "grad_norm": 0.880876781415818, + "learning_rate": 1.3898119114133192e-07, + "loss": 0.5623, + "step": 7730 + }, + { + "epoch": 0.9485889570552147, + "grad_norm": 0.8412871902254361, + "learning_rate": 1.383216919702679e-07, + "loss": 0.5531, + "step": 7731 + }, + { + "epoch": 0.9487116564417178, + "grad_norm": 0.9101691869268816, + "learning_rate": 1.3766375036650904e-07, + "loss": 0.5205, + "step": 7732 + }, + { + "epoch": 0.9488343558282208, + "grad_norm": 0.8573290697680922, + "learning_rate": 1.3700736643397437e-07, + "loss": 0.5142, + "step": 7733 + }, + { + "epoch": 0.948957055214724, + "grad_norm": 0.9351292303897464, + "learning_rate": 1.3635254027633105e-07, + "loss": 0.5781, + "step": 7734 + }, + { + "epoch": 0.949079754601227, + "grad_norm": 0.8407709967312395, + "learning_rate": 1.3569927199700628e-07, + "loss": 0.6015, + "step": 7735 + }, + { + "epoch": 0.9492024539877301, + "grad_norm": 0.8774246007694776, + "learning_rate": 1.3504756169917533e-07, + "loss": 0.5167, + "step": 7736 + }, + { + "epoch": 0.9493251533742332, + "grad_norm": 0.8265276484014248, + "learning_rate": 1.343974094857725e-07, + "loss": 0.489, + "step": 7737 + }, + { + "epoch": 0.9494478527607362, + "grad_norm": 0.8853157842515883, + "learning_rate": 1.3374881545948237e-07, + "loss": 0.5035, + "step": 7738 + }, + { + "epoch": 0.9495705521472393, + "grad_norm": 0.8867693023704892, + "learning_rate": 1.3310177972274517e-07, + "loss": 0.5308, + "step": 7739 + }, + { + "epoch": 0.9496932515337423, + "grad_norm": 0.8723773894096034, + "learning_rate": 1.3245630237775585e-07, + "loss": 0.5335, + "step": 7740 + }, + { + "epoch": 0.9498159509202454, + "grad_norm": 0.9478362344404202, + "learning_rate": 1.3181238352645843e-07, + "loss": 0.5277, + "step": 7741 + }, + { + "epoch": 0.9499386503067485, + "grad_norm": 0.8861646119533891, + "learning_rate": 1.3117002327055927e-07, + "loss": 0.5175, + "step": 7742 + }, + { + "epoch": 0.9500613496932515, + "grad_norm": 0.9130511710171312, + "learning_rate": 1.305292217115095e-07, + "loss": 0.5837, + "step": 7743 + }, + { + "epoch": 0.9501840490797546, + "grad_norm": 0.8981303377360058, + "learning_rate": 1.2988997895052037e-07, + "loss": 0.5317, + "step": 7744 + }, + { + 
"epoch": 0.9503067484662576, + "grad_norm": 0.9874484901335712, + "learning_rate": 1.292522950885533e-07, + "loss": 0.5483, + "step": 7745 + }, + { + "epoch": 0.9504294478527607, + "grad_norm": 0.8515289866811133, + "learning_rate": 1.2861617022632667e-07, + "loss": 0.4798, + "step": 7746 + }, + { + "epoch": 0.9505521472392638, + "grad_norm": 0.8417154807941243, + "learning_rate": 1.2798160446431006e-07, + "loss": 0.5288, + "step": 7747 + }, + { + "epoch": 0.9506748466257668, + "grad_norm": 0.9542716074677744, + "learning_rate": 1.273485979027278e-07, + "loss": 0.5571, + "step": 7748 + }, + { + "epoch": 0.95079754601227, + "grad_norm": 0.8628894677987349, + "learning_rate": 1.2671715064155765e-07, + "loss": 0.5031, + "step": 7749 + }, + { + "epoch": 0.950920245398773, + "grad_norm": 1.0415300905469151, + "learning_rate": 1.2608726278053208e-07, + "loss": 0.5121, + "step": 7750 + }, + { + "epoch": 0.9510429447852761, + "grad_norm": 0.980966805252537, + "learning_rate": 1.2545893441913592e-07, + "loss": 0.6092, + "step": 7751 + }, + { + "epoch": 0.9511656441717792, + "grad_norm": 0.8700405127484439, + "learning_rate": 1.248321656566065e-07, + "loss": 0.4731, + "step": 7752 + }, + { + "epoch": 0.9512883435582822, + "grad_norm": 0.8176524198357842, + "learning_rate": 1.2420695659194016e-07, + "loss": 0.4732, + "step": 7753 + }, + { + "epoch": 0.9514110429447853, + "grad_norm": 0.890108134653391, + "learning_rate": 1.2358330732388012e-07, + "loss": 0.5475, + "step": 7754 + }, + { + "epoch": 0.9515337423312883, + "grad_norm": 0.9276231225541093, + "learning_rate": 1.2296121795092874e-07, + "loss": 0.5284, + "step": 7755 + }, + { + "epoch": 0.9516564417177914, + "grad_norm": 0.8186090998042662, + "learning_rate": 1.223406885713374e-07, + "loss": 0.5214, + "step": 7756 + }, + { + "epoch": 0.9517791411042945, + "grad_norm": 0.8417570086944437, + "learning_rate": 1.217217192831155e-07, + "loss": 0.5191, + "step": 7757 + }, + { + "epoch": 0.9519018404907975, + "grad_norm": 0.955530278604229, + "learning_rate": 1.2110431018402258e-07, + "loss": 0.5202, + "step": 7758 + }, + { + "epoch": 0.9520245398773006, + "grad_norm": 0.8982446888594119, + "learning_rate": 1.20488461371574e-07, + "loss": 0.5278, + "step": 7759 + }, + { + "epoch": 0.9521472392638037, + "grad_norm": 0.93340889052929, + "learning_rate": 1.1987417294303748e-07, + "loss": 0.5407, + "step": 7760 + }, + { + "epoch": 0.9522699386503067, + "grad_norm": 0.9615022376186584, + "learning_rate": 1.1926144499543325e-07, + "loss": 0.5317, + "step": 7761 + }, + { + "epoch": 0.9523926380368098, + "grad_norm": 0.9575243826329489, + "learning_rate": 1.1865027762553827e-07, + "loss": 0.5392, + "step": 7762 + }, + { + "epoch": 0.9525153374233128, + "grad_norm": 0.9402445946195164, + "learning_rate": 1.1804067092987981e-07, + "loss": 0.4714, + "step": 7763 + }, + { + "epoch": 0.952638036809816, + "grad_norm": 0.9638715885655834, + "learning_rate": 1.1743262500474195e-07, + "loss": 0.5603, + "step": 7764 + }, + { + "epoch": 0.9527607361963191, + "grad_norm": 0.9217246517962062, + "learning_rate": 1.1682613994615788e-07, + "loss": 0.5143, + "step": 7765 + }, + { + "epoch": 0.9528834355828221, + "grad_norm": 0.8749784309148486, + "learning_rate": 1.1622121584991874e-07, + "loss": 0.5591, + "step": 7766 + }, + { + "epoch": 0.9530061349693252, + "grad_norm": 1.1563693715239711, + "learning_rate": 1.156178528115648e-07, + "loss": 0.5264, + "step": 7767 + }, + { + "epoch": 0.9531288343558282, + "grad_norm": 0.9729307791904171, + "learning_rate": 
1.1501605092639534e-07, + "loss": 0.5588, + "step": 7768 + }, + { + "epoch": 0.9532515337423313, + "grad_norm": 0.7796899170080228, + "learning_rate": 1.1441581028945659e-07, + "loss": 0.5611, + "step": 7769 + }, + { + "epoch": 0.9533742331288344, + "grad_norm": 0.838321117897018, + "learning_rate": 1.1381713099555381e-07, + "loss": 0.5332, + "step": 7770 + }, + { + "epoch": 0.9534969325153374, + "grad_norm": 1.0385110217171194, + "learning_rate": 1.1322001313924247e-07, + "loss": 0.5283, + "step": 7771 + }, + { + "epoch": 0.9536196319018405, + "grad_norm": 0.9693628503823956, + "learning_rate": 1.1262445681483048e-07, + "loss": 0.5794, + "step": 7772 + }, + { + "epoch": 0.9537423312883435, + "grad_norm": 0.8135169135628242, + "learning_rate": 1.1203046211638258e-07, + "loss": 0.5412, + "step": 7773 + }, + { + "epoch": 0.9538650306748466, + "grad_norm": 0.866447773242277, + "learning_rate": 1.1143802913771485e-07, + "loss": 0.5306, + "step": 7774 + }, + { + "epoch": 0.9539877300613497, + "grad_norm": 0.8933450329739944, + "learning_rate": 1.1084715797239798e-07, + "loss": 0.46, + "step": 7775 + }, + { + "epoch": 0.9541104294478527, + "grad_norm": 1.020618655245341, + "learning_rate": 1.102578487137529e-07, + "loss": 0.5745, + "step": 7776 + }, + { + "epoch": 0.9542331288343558, + "grad_norm": 0.8966398271059919, + "learning_rate": 1.0967010145485735e-07, + "loss": 0.5429, + "step": 7777 + }, + { + "epoch": 0.9543558282208588, + "grad_norm": 0.9350917098754012, + "learning_rate": 1.0908391628854042e-07, + "loss": 0.5724, + "step": 7778 + }, + { + "epoch": 0.954478527607362, + "grad_norm": 0.9111005045130933, + "learning_rate": 1.0849929330738474e-07, + "loss": 0.5636, + "step": 7779 + }, + { + "epoch": 0.9546012269938651, + "grad_norm": 0.944969987184775, + "learning_rate": 1.0791623260372863e-07, + "loss": 0.5655, + "step": 7780 + }, + { + "epoch": 0.9547239263803681, + "grad_norm": 0.9720851204862422, + "learning_rate": 1.0733473426965846e-07, + "loss": 0.5616, + "step": 7781 + }, + { + "epoch": 0.9548466257668712, + "grad_norm": 0.807260289799555, + "learning_rate": 1.0675479839701852e-07, + "loss": 0.5225, + "step": 7782 + }, + { + "epoch": 0.9549693251533742, + "grad_norm": 0.8695835838784529, + "learning_rate": 1.0617642507740444e-07, + "loss": 0.5381, + "step": 7783 + }, + { + "epoch": 0.9550920245398773, + "grad_norm": 0.837902061867829, + "learning_rate": 1.0559961440216538e-07, + "loss": 0.5397, + "step": 7784 + }, + { + "epoch": 0.9552147239263804, + "grad_norm": 0.8495919935985407, + "learning_rate": 1.0502436646240399e-07, + "loss": 0.527, + "step": 7785 + }, + { + "epoch": 0.9553374233128834, + "grad_norm": 0.844393263396338, + "learning_rate": 1.044506813489754e-07, + "loss": 0.5242, + "step": 7786 + }, + { + "epoch": 0.9554601226993865, + "grad_norm": 0.8757893045701635, + "learning_rate": 1.0387855915248712e-07, + "loss": 0.5563, + "step": 7787 + }, + { + "epoch": 0.9555828220858895, + "grad_norm": 0.8919459584661319, + "learning_rate": 1.0330799996330354e-07, + "loss": 0.5902, + "step": 7788 + }, + { + "epoch": 0.9557055214723926, + "grad_norm": 0.8938119536012505, + "learning_rate": 1.0273900387153702e-07, + "loss": 0.4916, + "step": 7789 + }, + { + "epoch": 0.9558282208588957, + "grad_norm": 0.8278102298037713, + "learning_rate": 1.0217157096705676e-07, + "loss": 0.5271, + "step": 7790 + }, + { + "epoch": 0.9559509202453987, + "grad_norm": 0.9878232319491567, + "learning_rate": 1.0160570133948333e-07, + "loss": 0.5231, + "step": 7791 + }, + { + "epoch": 
0.9560736196319018, + "grad_norm": 0.822354840196603, + "learning_rate": 1.0104139507819189e-07, + "loss": 0.5267, + "step": 7792 + }, + { + "epoch": 0.956196319018405, + "grad_norm": 0.9717105981161684, + "learning_rate": 1.0047865227230891e-07, + "loss": 0.6069, + "step": 7793 + }, + { + "epoch": 0.956319018404908, + "grad_norm": 1.049036639372752, + "learning_rate": 9.991747301071553e-08, + "loss": 0.5669, + "step": 7794 + }, + { + "epoch": 0.9564417177914111, + "grad_norm": 0.8567246784239424, + "learning_rate": 9.935785738204417e-08, + "loss": 0.5509, + "step": 7795 + }, + { + "epoch": 0.9565644171779141, + "grad_norm": 0.9548707226621533, + "learning_rate": 9.879980547468193e-08, + "loss": 0.5204, + "step": 7796 + }, + { + "epoch": 0.9566871165644172, + "grad_norm": 0.8746304447511603, + "learning_rate": 9.82433173767694e-08, + "loss": 0.5322, + "step": 7797 + }, + { + "epoch": 0.9568098159509203, + "grad_norm": 0.7885248900339835, + "learning_rate": 9.768839317619739e-08, + "loss": 0.5062, + "step": 7798 + }, + { + "epoch": 0.9569325153374233, + "grad_norm": 0.8969734245520984, + "learning_rate": 9.713503296061133e-08, + "loss": 0.5662, + "step": 7799 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 0.899542395502555, + "learning_rate": 9.658323681741133e-08, + "loss": 0.4593, + "step": 7800 + }, + { + "epoch": 0.9571779141104294, + "grad_norm": 0.8657024444193843, + "learning_rate": 9.603300483374767e-08, + "loss": 0.4844, + "step": 7801 + }, + { + "epoch": 0.9573006134969325, + "grad_norm": 0.9577649688764471, + "learning_rate": 9.548433709652527e-08, + "loss": 0.5014, + "step": 7802 + }, + { + "epoch": 0.9574233128834356, + "grad_norm": 1.0312324681010985, + "learning_rate": 9.493723369240038e-08, + "loss": 0.5565, + "step": 7803 + }, + { + "epoch": 0.9575460122699386, + "grad_norm": 0.9309046443129293, + "learning_rate": 9.439169470778497e-08, + "loss": 0.5004, + "step": 7804 + }, + { + "epoch": 0.9576687116564417, + "grad_norm": 0.8843468904565137, + "learning_rate": 9.384772022884015e-08, + "loss": 0.5361, + "step": 7805 + }, + { + "epoch": 0.9577914110429447, + "grad_norm": 0.8065558212017239, + "learning_rate": 9.330531034148382e-08, + "loss": 0.5167, + "step": 7806 + }, + { + "epoch": 0.9579141104294479, + "grad_norm": 0.9527636565500864, + "learning_rate": 9.276446513138416e-08, + "loss": 0.5834, + "step": 7807 + }, + { + "epoch": 0.958036809815951, + "grad_norm": 0.819469556285485, + "learning_rate": 9.222518468396391e-08, + "loss": 0.5203, + "step": 7808 + }, + { + "epoch": 0.958159509202454, + "grad_norm": 1.0330281855714196, + "learning_rate": 9.168746908439718e-08, + "loss": 0.6157, + "step": 7809 + }, + { + "epoch": 0.9582822085889571, + "grad_norm": 0.8562182245680645, + "learning_rate": 9.11513184176116e-08, + "loss": 0.5261, + "step": 7810 + }, + { + "epoch": 0.9584049079754601, + "grad_norm": 0.9682259166144994, + "learning_rate": 9.06167327682872e-08, + "loss": 0.5401, + "step": 7811 + }, + { + "epoch": 0.9585276073619632, + "grad_norm": 0.7921453080998789, + "learning_rate": 9.008371222085643e-08, + "loss": 0.5358, + "step": 7812 + }, + { + "epoch": 0.9586503067484663, + "grad_norm": 0.8302670271107327, + "learning_rate": 8.955225685950753e-08, + "loss": 0.5415, + "step": 7813 + }, + { + "epoch": 0.9587730061349693, + "grad_norm": 0.8059395467384586, + "learning_rate": 8.902236676817666e-08, + "loss": 0.4897, + "step": 7814 + }, + { + "epoch": 0.9588957055214724, + "grad_norm": 0.8844476760209218, + "learning_rate": 8.8494042030558e-08, + "loss": 0.5641, 
+ "step": 7815 + }, + { + "epoch": 0.9590184049079754, + "grad_norm": 0.8718193246409499, + "learning_rate": 8.796728273009481e-08, + "loss": 0.4381, + "step": 7816 + }, + { + "epoch": 0.9591411042944785, + "grad_norm": 0.8512293445932777, + "learning_rate": 8.744208894998384e-08, + "loss": 0.5499, + "step": 7817 + }, + { + "epoch": 0.9592638036809816, + "grad_norm": 0.8552054052271991, + "learning_rate": 8.691846077317435e-08, + "loss": 0.5687, + "step": 7818 + }, + { + "epoch": 0.9593865030674846, + "grad_norm": 0.9420190094468442, + "learning_rate": 8.639639828237012e-08, + "loss": 0.5282, + "step": 7819 + }, + { + "epoch": 0.9595092024539877, + "grad_norm": 0.8927992932059855, + "learning_rate": 8.587590156002635e-08, + "loss": 0.4995, + "step": 7820 + }, + { + "epoch": 0.9596319018404909, + "grad_norm": 0.8524820979060257, + "learning_rate": 8.535697068835059e-08, + "loss": 0.5503, + "step": 7821 + }, + { + "epoch": 0.9597546012269939, + "grad_norm": 0.8611142009205897, + "learning_rate": 8.483960574930395e-08, + "loss": 0.4978, + "step": 7822 + }, + { + "epoch": 0.959877300613497, + "grad_norm": 0.8952816554115514, + "learning_rate": 8.432380682459884e-08, + "loss": 0.5777, + "step": 7823 + }, + { + "epoch": 0.96, + "grad_norm": 0.935929340832493, + "learning_rate": 8.380957399570234e-08, + "loss": 0.5544, + "step": 7824 + }, + { + "epoch": 0.9601226993865031, + "grad_norm": 0.8582600905329063, + "learning_rate": 8.329690734383278e-08, + "loss": 0.4983, + "step": 7825 + }, + { + "epoch": 0.9602453987730062, + "grad_norm": 1.0087332904651958, + "learning_rate": 8.278580694996097e-08, + "loss": 0.5699, + "step": 7826 + }, + { + "epoch": 0.9603680981595092, + "grad_norm": 0.8642846714021349, + "learning_rate": 8.227627289481121e-08, + "loss": 0.5045, + "step": 7827 + }, + { + "epoch": 0.9604907975460123, + "grad_norm": 0.7186233926068776, + "learning_rate": 8.176830525886026e-08, + "loss": 0.5012, + "step": 7828 + }, + { + "epoch": 0.9606134969325153, + "grad_norm": 0.9436017860717782, + "learning_rate": 8.126190412233615e-08, + "loss": 0.5664, + "step": 7829 + }, + { + "epoch": 0.9607361963190184, + "grad_norm": 0.8958094977405879, + "learning_rate": 8.075706956522156e-08, + "loss": 0.5122, + "step": 7830 + }, + { + "epoch": 0.9608588957055215, + "grad_norm": 1.0592450875652175, + "learning_rate": 8.025380166725161e-08, + "loss": 0.543, + "step": 7831 + }, + { + "epoch": 0.9609815950920245, + "grad_norm": 0.8852996180036554, + "learning_rate": 7.97521005079116e-08, + "loss": 0.4835, + "step": 7832 + }, + { + "epoch": 0.9611042944785276, + "grad_norm": 0.8733782139718971, + "learning_rate": 7.925196616644148e-08, + "loss": 0.5535, + "step": 7833 + }, + { + "epoch": 0.9612269938650306, + "grad_norm": 0.788261622913343, + "learning_rate": 7.875339872183252e-08, + "loss": 0.5357, + "step": 7834 + }, + { + "epoch": 0.9613496932515337, + "grad_norm": 0.9430663886674372, + "learning_rate": 7.825639825282949e-08, + "loss": 0.6135, + "step": 7835 + }, + { + "epoch": 0.9614723926380369, + "grad_norm": 0.9217766901072477, + "learning_rate": 7.776096483793072e-08, + "loss": 0.5063, + "step": 7836 + }, + { + "epoch": 0.9615950920245399, + "grad_norm": 0.8275758820180856, + "learning_rate": 7.726709855538472e-08, + "loss": 0.4919, + "step": 7837 + }, + { + "epoch": 0.961717791411043, + "grad_norm": 1.0342834405526526, + "learning_rate": 7.677479948319244e-08, + "loss": 0.58, + "step": 7838 + }, + { + "epoch": 0.961840490797546, + "grad_norm": 0.8620288703296337, + "learning_rate": 
7.628406769910946e-08, + "loss": 0.4879, + "step": 7839 + }, + { + "epoch": 0.9619631901840491, + "grad_norm": 0.9228039767138025, + "learning_rate": 7.579490328064265e-08, + "loss": 0.496, + "step": 7840 + }, + { + "epoch": 0.9620858895705522, + "grad_norm": 0.9106190546652047, + "learning_rate": 7.530730630505023e-08, + "loss": 0.5224, + "step": 7841 + }, + { + "epoch": 0.9622085889570552, + "grad_norm": 0.8717463723811062, + "learning_rate": 7.482127684934504e-08, + "loss": 0.5349, + "step": 7842 + }, + { + "epoch": 0.9623312883435583, + "grad_norm": 0.9509768089897846, + "learning_rate": 7.433681499029122e-08, + "loss": 0.5443, + "step": 7843 + }, + { + "epoch": 0.9624539877300613, + "grad_norm": 0.8322589288808772, + "learning_rate": 7.385392080440535e-08, + "loss": 0.4973, + "step": 7844 + }, + { + "epoch": 0.9625766871165644, + "grad_norm": 0.7764034744864902, + "learning_rate": 7.33725943679553e-08, + "loss": 0.517, + "step": 7845 + }, + { + "epoch": 0.9626993865030675, + "grad_norm": 0.8025413073356797, + "learning_rate": 7.289283575696359e-08, + "loss": 0.553, + "step": 7846 + }, + { + "epoch": 0.9628220858895705, + "grad_norm": 0.94930962028317, + "learning_rate": 7.241464504720298e-08, + "loss": 0.4938, + "step": 7847 + }, + { + "epoch": 0.9629447852760736, + "grad_norm": 0.9456668661757258, + "learning_rate": 7.193802231420189e-08, + "loss": 0.5475, + "step": 7848 + }, + { + "epoch": 0.9630674846625766, + "grad_norm": 0.7886809791023701, + "learning_rate": 7.146296763323569e-08, + "loss": 0.5058, + "step": 7849 + }, + { + "epoch": 0.9631901840490797, + "grad_norm": 0.8130588047983544, + "learning_rate": 7.098948107933656e-08, + "loss": 0.4427, + "step": 7850 + }, + { + "epoch": 0.9633128834355829, + "grad_norm": 0.8517412503021314, + "learning_rate": 7.051756272728915e-08, + "loss": 0.5785, + "step": 7851 + }, + { + "epoch": 0.9634355828220859, + "grad_norm": 0.8469710395108591, + "learning_rate": 7.004721265162607e-08, + "loss": 0.5271, + "step": 7852 + }, + { + "epoch": 0.963558282208589, + "grad_norm": 1.0102834416073587, + "learning_rate": 6.957843092663675e-08, + "loss": 0.5363, + "step": 7853 + }, + { + "epoch": 0.9636809815950921, + "grad_norm": 0.8498728112683706, + "learning_rate": 6.911121762636086e-08, + "loss": 0.5297, + "step": 7854 + }, + { + "epoch": 0.9638036809815951, + "grad_norm": 0.9864327624840302, + "learning_rate": 6.864557282459162e-08, + "loss": 0.5568, + "step": 7855 + }, + { + "epoch": 0.9639263803680982, + "grad_norm": 0.9047807195210469, + "learning_rate": 6.818149659487127e-08, + "loss": 0.5565, + "step": 7856 + }, + { + "epoch": 0.9640490797546012, + "grad_norm": 0.8593967707275513, + "learning_rate": 6.771898901050011e-08, + "loss": 0.4955, + "step": 7857 + }, + { + "epoch": 0.9641717791411043, + "grad_norm": 0.8701349816474246, + "learning_rate": 6.725805014452414e-08, + "loss": 0.5594, + "step": 7858 + }, + { + "epoch": 0.9642944785276074, + "grad_norm": 0.8090602598593859, + "learning_rate": 6.679868006974621e-08, + "loss": 0.5498, + "step": 7859 + }, + { + "epoch": 0.9644171779141104, + "grad_norm": 0.8744195988417013, + "learning_rate": 6.634087885871832e-08, + "loss": 0.5264, + "step": 7860 + }, + { + "epoch": 0.9645398773006135, + "grad_norm": 0.9535880846424842, + "learning_rate": 6.588464658374816e-08, + "loss": 0.5685, + "step": 7861 + }, + { + "epoch": 0.9646625766871165, + "grad_norm": 0.8181373472519384, + "learning_rate": 6.542998331689365e-08, + "loss": 0.6015, + "step": 7862 + }, + { + "epoch": 0.9647852760736196, + "grad_norm": 
1.1037629797325823, + "learning_rate": 6.497688912996403e-08, + "loss": 0.5447, + "step": 7863 + }, + { + "epoch": 0.9649079754601227, + "grad_norm": 0.8086803099972152, + "learning_rate": 6.452536409452204e-08, + "loss": 0.5263, + "step": 7864 + }, + { + "epoch": 0.9650306748466257, + "grad_norm": 0.9180245752601098, + "learning_rate": 6.407540828188175e-08, + "loss": 0.566, + "step": 7865 + }, + { + "epoch": 0.9651533742331289, + "grad_norm": 0.7998471151009293, + "learning_rate": 6.362702176310964e-08, + "loss": 0.5886, + "step": 7866 + }, + { + "epoch": 0.9652760736196319, + "grad_norm": 0.8531444976888216, + "learning_rate": 6.318020460902574e-08, + "loss": 0.557, + "step": 7867 + }, + { + "epoch": 0.965398773006135, + "grad_norm": 0.9002857383163099, + "learning_rate": 6.273495689020026e-08, + "loss": 0.5466, + "step": 7868 + }, + { + "epoch": 0.9655214723926381, + "grad_norm": 0.9025088357022731, + "learning_rate": 6.229127867695584e-08, + "loss": 0.508, + "step": 7869 + }, + { + "epoch": 0.9656441717791411, + "grad_norm": 1.0157162126176977, + "learning_rate": 6.184917003936752e-08, + "loss": 0.5295, + "step": 7870 + }, + { + "epoch": 0.9657668711656442, + "grad_norm": 0.9896717514558906, + "learning_rate": 6.140863104726391e-08, + "loss": 0.5884, + "step": 7871 + }, + { + "epoch": 0.9658895705521472, + "grad_norm": 0.8235128955560136, + "learning_rate": 6.096966177022267e-08, + "loss": 0.5465, + "step": 7872 + }, + { + "epoch": 0.9660122699386503, + "grad_norm": 0.7996196096100325, + "learning_rate": 6.053226227757614e-08, + "loss": 0.4377, + "step": 7873 + }, + { + "epoch": 0.9661349693251534, + "grad_norm": 0.8350183250522375, + "learning_rate": 6.009643263840791e-08, + "loss": 0.45, + "step": 7874 + }, + { + "epoch": 0.9662576687116564, + "grad_norm": 0.8932354145354491, + "learning_rate": 5.966217292155296e-08, + "loss": 0.5384, + "step": 7875 + }, + { + "epoch": 0.9663803680981595, + "grad_norm": 0.9065829786491345, + "learning_rate": 5.922948319559973e-08, + "loss": 0.5208, + "step": 7876 + }, + { + "epoch": 0.9665030674846625, + "grad_norm": 0.9165782157224539, + "learning_rate": 5.8798363528886904e-08, + "loss": 0.5187, + "step": 7877 + }, + { + "epoch": 0.9666257668711656, + "grad_norm": 0.898353378005197, + "learning_rate": 5.836881398950667e-08, + "loss": 0.513, + "step": 7878 + }, + { + "epoch": 0.9667484662576687, + "grad_norm": 0.8146594437006073, + "learning_rate": 5.794083464530254e-08, + "loss": 0.5658, + "step": 7879 + }, + { + "epoch": 0.9668711656441717, + "grad_norm": 0.809281314106449, + "learning_rate": 5.7514425563870436e-08, + "loss": 0.517, + "step": 7880 + }, + { + "epoch": 0.9669938650306749, + "grad_norm": 0.8551861406483837, + "learning_rate": 5.7089586812557605e-08, + "loss": 0.5827, + "step": 7881 + }, + { + "epoch": 0.9671165644171779, + "grad_norm": 0.954928774937728, + "learning_rate": 5.666631845846371e-08, + "loss": 0.556, + "step": 7882 + }, + { + "epoch": 0.967239263803681, + "grad_norm": 0.7996869715239464, + "learning_rate": 5.624462056844082e-08, + "loss": 0.567, + "step": 7883 + }, + { + "epoch": 0.9673619631901841, + "grad_norm": 0.901615927061668, + "learning_rate": 5.582449320909233e-08, + "loss": 0.5721, + "step": 7884 + }, + { + "epoch": 0.9674846625766871, + "grad_norm": 0.942206495444459, + "learning_rate": 5.540593644677295e-08, + "loss": 0.5577, + "step": 7885 + }, + { + "epoch": 0.9676073619631902, + "grad_norm": 0.7884916974925734, + "learning_rate": 5.498895034759088e-08, + "loss": 0.4665, + "step": 7886 + }, + { + "epoch": 
0.9677300613496933, + "grad_norm": 0.907063207666415, + "learning_rate": 5.45735349774057e-08, + "loss": 0.5069, + "step": 7887 + }, + { + "epoch": 0.9678527607361963, + "grad_norm": 0.7792526737572882, + "learning_rate": 5.4159690401828225e-08, + "loss": 0.5387, + "step": 7888 + }, + { + "epoch": 0.9679754601226994, + "grad_norm": 0.9071903787697326, + "learning_rate": 5.374741668622063e-08, + "loss": 0.6151, + "step": 7889 + }, + { + "epoch": 0.9680981595092024, + "grad_norm": 0.9073963393388981, + "learning_rate": 5.333671389569972e-08, + "loss": 0.5627, + "step": 7890 + }, + { + "epoch": 0.9682208588957055, + "grad_norm": 0.9354569222388865, + "learning_rate": 5.292758209513249e-08, + "loss": 0.5028, + "step": 7891 + }, + { + "epoch": 0.9683435582822086, + "grad_norm": 1.0788980577146985, + "learning_rate": 5.252002134913725e-08, + "loss": 0.5629, + "step": 7892 + }, + { + "epoch": 0.9684662576687116, + "grad_norm": 0.8437863981642614, + "learning_rate": 5.2114031722084734e-08, + "loss": 0.5639, + "step": 7893 + }, + { + "epoch": 0.9685889570552147, + "grad_norm": 0.8086778217099491, + "learning_rate": 5.170961327809698e-08, + "loss": 0.5321, + "step": 7894 + }, + { + "epoch": 0.9687116564417177, + "grad_norm": 0.8566744025783166, + "learning_rate": 5.1306766081048456e-08, + "loss": 0.5724, + "step": 7895 + }, + { + "epoch": 0.9688343558282209, + "grad_norm": 1.0823141233002551, + "learning_rate": 5.090549019456603e-08, + "loss": 0.4952, + "step": 7896 + }, + { + "epoch": 0.968957055214724, + "grad_norm": 0.8825026053741528, + "learning_rate": 5.050578568202791e-08, + "loss": 0.5243, + "step": 7897 + }, + { + "epoch": 0.969079754601227, + "grad_norm": 0.8251024493275593, + "learning_rate": 5.0107652606563585e-08, + "loss": 0.5159, + "step": 7898 + }, + { + "epoch": 0.9692024539877301, + "grad_norm": 0.8690194352509677, + "learning_rate": 4.971109103105498e-08, + "loss": 0.5056, + "step": 7899 + }, + { + "epoch": 0.9693251533742331, + "grad_norm": 1.038085355604145, + "learning_rate": 4.931610101813533e-08, + "loss": 0.5613, + "step": 7900 + }, + { + "epoch": 0.9694478527607362, + "grad_norm": 0.9058863542644363, + "learning_rate": 4.892268263019029e-08, + "loss": 0.5175, + "step": 7901 + }, + { + "epoch": 0.9695705521472393, + "grad_norm": 0.8562062499483117, + "learning_rate": 4.85308359293557e-08, + "loss": 0.5328, + "step": 7902 + }, + { + "epoch": 0.9696932515337423, + "grad_norm": 0.7935619742281944, + "learning_rate": 4.8140560977522065e-08, + "loss": 0.568, + "step": 7903 + }, + { + "epoch": 0.9698159509202454, + "grad_norm": 0.8382006521975622, + "learning_rate": 4.7751857836328966e-08, + "loss": 0.51, + "step": 7904 + }, + { + "epoch": 0.9699386503067484, + "grad_norm": 0.8270117312752767, + "learning_rate": 4.73647265671684e-08, + "loss": 0.5551, + "step": 7905 + }, + { + "epoch": 0.9700613496932515, + "grad_norm": 0.9386442604317384, + "learning_rate": 4.6979167231185894e-08, + "loss": 0.5805, + "step": 7906 + }, + { + "epoch": 0.9701840490797546, + "grad_norm": 0.8474699714351389, + "learning_rate": 4.659517988927609e-08, + "loss": 0.5269, + "step": 7907 + }, + { + "epoch": 0.9703067484662576, + "grad_norm": 0.9152540464792616, + "learning_rate": 4.6212764602086017e-08, + "loss": 0.6239, + "step": 7908 + }, + { + "epoch": 0.9704294478527608, + "grad_norm": 0.7683690241240541, + "learning_rate": 4.583192143001625e-08, + "loss": 0.5089, + "step": 7909 + }, + { + "epoch": 0.9705521472392638, + "grad_norm": 0.8299363221378673, + "learning_rate": 4.545265043321645e-08, + "loss": 
0.5285, + "step": 7910 + }, + { + "epoch": 0.9706748466257669, + "grad_norm": 0.9915977950231374, + "learning_rate": 4.507495167159093e-08, + "loss": 0.5485, + "step": 7911 + }, + { + "epoch": 0.97079754601227, + "grad_norm": 0.9070297824165975, + "learning_rate": 4.469882520479196e-08, + "loss": 0.5292, + "step": 7912 + }, + { + "epoch": 0.970920245398773, + "grad_norm": 0.8907852716952651, + "learning_rate": 4.432427109222648e-08, + "loss": 0.5256, + "step": 7913 + }, + { + "epoch": 0.9710429447852761, + "grad_norm": 0.957078511598773, + "learning_rate": 4.39512893930516e-08, + "loss": 0.5197, + "step": 7914 + }, + { + "epoch": 0.9711656441717792, + "grad_norm": 0.8236108665377776, + "learning_rate": 4.357988016617687e-08, + "loss": 0.5415, + "step": 7915 + }, + { + "epoch": 0.9712883435582822, + "grad_norm": 0.9421658172637425, + "learning_rate": 4.321004347026314e-08, + "loss": 0.5432, + "step": 7916 + }, + { + "epoch": 0.9714110429447853, + "grad_norm": 0.9314342058597289, + "learning_rate": 4.284177936372369e-08, + "loss": 0.5381, + "step": 7917 + }, + { + "epoch": 0.9715337423312883, + "grad_norm": 1.0450813232653806, + "learning_rate": 4.247508790472199e-08, + "loss": 0.5388, + "step": 7918 + }, + { + "epoch": 0.9716564417177914, + "grad_norm": 0.7751488308062672, + "learning_rate": 4.210996915117283e-08, + "loss": 0.4546, + "step": 7919 + }, + { + "epoch": 0.9717791411042945, + "grad_norm": 0.8671811528332855, + "learning_rate": 4.174642316074562e-08, + "loss": 0.549, + "step": 7920 + }, + { + "epoch": 0.9719018404907975, + "grad_norm": 0.913403609575526, + "learning_rate": 4.138444999085778e-08, + "loss": 0.4752, + "step": 7921 + }, + { + "epoch": 0.9720245398773006, + "grad_norm": 0.9738115737937914, + "learning_rate": 4.102404969868024e-08, + "loss": 0.5847, + "step": 7922 + }, + { + "epoch": 0.9721472392638036, + "grad_norm": 0.9374794845686899, + "learning_rate": 4.066522234113523e-08, + "loss": 0.5527, + "step": 7923 + }, + { + "epoch": 0.9722699386503068, + "grad_norm": 0.9843203541648861, + "learning_rate": 4.0307967974897446e-08, + "loss": 0.4631, + "step": 7924 + }, + { + "epoch": 0.9723926380368099, + "grad_norm": 0.7924832313259182, + "learning_rate": 3.9952286656389506e-08, + "loss": 0.6116, + "step": 7925 + }, + { + "epoch": 0.9725153374233129, + "grad_norm": 0.8760326451917355, + "learning_rate": 3.9598178441790924e-08, + "loss": 0.572, + "step": 7926 + }, + { + "epoch": 0.972638036809816, + "grad_norm": 0.813609841744047, + "learning_rate": 3.924564338702919e-08, + "loss": 0.5073, + "step": 7927 + }, + { + "epoch": 0.972760736196319, + "grad_norm": 0.8584677483250254, + "learning_rate": 3.889468154778309e-08, + "loss": 0.5559, + "step": 7928 + }, + { + "epoch": 0.9728834355828221, + "grad_norm": 0.9787905051130329, + "learning_rate": 3.854529297948606e-08, + "loss": 0.5414, + "step": 7929 + }, + { + "epoch": 0.9730061349693252, + "grad_norm": 0.8593239839014529, + "learning_rate": 3.819747773731841e-08, + "loss": 0.5643, + "step": 7930 + }, + { + "epoch": 0.9731288343558282, + "grad_norm": 0.8412318239676891, + "learning_rate": 3.785123587621731e-08, + "loss": 0.5259, + "step": 7931 + }, + { + "epoch": 0.9732515337423313, + "grad_norm": 0.9940355731288762, + "learning_rate": 3.750656745086678e-08, + "loss": 0.4784, + "step": 7932 + }, + { + "epoch": 0.9733742331288343, + "grad_norm": 0.8836018615018122, + "learning_rate": 3.716347251570551e-08, + "loss": 0.4717, + "step": 7933 + }, + { + "epoch": 0.9734969325153374, + "grad_norm": 0.9125431226570788, + 
"learning_rate": 3.6821951124921264e-08, + "loss": 0.5614, + "step": 7934 + }, + { + "epoch": 0.9736196319018405, + "grad_norm": 0.8396914085308703, + "learning_rate": 3.648200333245422e-08, + "loss": 0.5089, + "step": 7935 + }, + { + "epoch": 0.9737423312883435, + "grad_norm": 0.7967302554357093, + "learning_rate": 3.6143629191998096e-08, + "loss": 0.5459, + "step": 7936 + }, + { + "epoch": 0.9738650306748466, + "grad_norm": 0.8273488008282219, + "learning_rate": 3.580682875699459e-08, + "loss": 0.4633, + "step": 7937 + }, + { + "epoch": 0.9739877300613496, + "grad_norm": 1.0232395275281465, + "learning_rate": 3.547160208063782e-08, + "loss": 0.5684, + "step": 7938 + }, + { + "epoch": 0.9741104294478528, + "grad_norm": 0.8096068652763262, + "learning_rate": 3.5137949215876544e-08, + "loss": 0.518, + "step": 7939 + }, + { + "epoch": 0.9742331288343559, + "grad_norm": 0.8276211745856081, + "learning_rate": 3.480587021540527e-08, + "loss": 0.5465, + "step": 7940 + }, + { + "epoch": 0.9743558282208589, + "grad_norm": 0.8800585867027165, + "learning_rate": 3.447536513167538e-08, + "loss": 0.5144, + "step": 7941 + }, + { + "epoch": 0.974478527607362, + "grad_norm": 0.7931623752494039, + "learning_rate": 3.4146434016886223e-08, + "loss": 0.5549, + "step": 7942 + }, + { + "epoch": 0.974601226993865, + "grad_norm": 0.8122212441568787, + "learning_rate": 3.381907692299069e-08, + "loss": 0.4461, + "step": 7943 + }, + { + "epoch": 0.9747239263803681, + "grad_norm": 0.7828445503092291, + "learning_rate": 3.349329390168965e-08, + "loss": 0.5254, + "step": 7944 + }, + { + "epoch": 0.9748466257668712, + "grad_norm": 0.8501257950389189, + "learning_rate": 3.316908500443972e-08, + "loss": 0.5622, + "step": 7945 + }, + { + "epoch": 0.9749693251533742, + "grad_norm": 0.8881370801224874, + "learning_rate": 3.284645028244771e-08, + "loss": 0.5273, + "step": 7946 + }, + { + "epoch": 0.9750920245398773, + "grad_norm": 0.785692303930699, + "learning_rate": 3.252538978666841e-08, + "loss": 0.5725, + "step": 7947 + }, + { + "epoch": 0.9752147239263804, + "grad_norm": 1.006232530524126, + "learning_rate": 3.220590356781239e-08, + "loss": 0.5943, + "step": 7948 + }, + { + "epoch": 0.9753374233128834, + "grad_norm": 0.9832692815290839, + "learning_rate": 3.188799167633927e-08, + "loss": 0.5673, + "step": 7949 + }, + { + "epoch": 0.9754601226993865, + "grad_norm": 0.7855752587733366, + "learning_rate": 3.1571654162461107e-08, + "loss": 0.523, + "step": 7950 + }, + { + "epoch": 0.9755828220858895, + "grad_norm": 0.8204304869768528, + "learning_rate": 3.125689107613905e-08, + "loss": 0.4873, + "step": 7951 + }, + { + "epoch": 0.9757055214723926, + "grad_norm": 0.8856225384791921, + "learning_rate": 3.094370246708889e-08, + "loss": 0.5278, + "step": 7952 + }, + { + "epoch": 0.9758282208588958, + "grad_norm": 0.754791549550983, + "learning_rate": 3.063208838477549e-08, + "loss": 0.5676, + "step": 7953 + }, + { + "epoch": 0.9759509202453988, + "grad_norm": 0.9487115151647902, + "learning_rate": 3.0322048878415055e-08, + "loss": 0.5775, + "step": 7954 + }, + { + "epoch": 0.9760736196319019, + "grad_norm": 1.106371219371213, + "learning_rate": 3.001358399697618e-08, + "loss": 0.5758, + "step": 7955 + }, + { + "epoch": 0.9761963190184049, + "grad_norm": 0.9189250870329098, + "learning_rate": 2.9706693789178788e-08, + "loss": 0.5469, + "step": 7956 + }, + { + "epoch": 0.976319018404908, + "grad_norm": 1.0106081653455334, + "learning_rate": 2.9401378303492988e-08, + "loss": 0.5528, + "step": 7957 + }, + { + "epoch": 
0.9764417177914111, + "grad_norm": 0.8659866478848717, + "learning_rate": 2.9097637588140216e-08, + "loss": 0.547, + "step": 7958 + }, + { + "epoch": 0.9765644171779141, + "grad_norm": 0.8307047777641547, + "learning_rate": 2.8795471691094313e-08, + "loss": 0.5418, + "step": 7959 + }, + { + "epoch": 0.9766871165644172, + "grad_norm": 1.1989004262959937, + "learning_rate": 2.8494880660080437e-08, + "loss": 0.4933, + "step": 7960 + }, + { + "epoch": 0.9768098159509202, + "grad_norm": 0.8352418639260725, + "learning_rate": 2.8195864542572837e-08, + "loss": 0.5364, + "step": 7961 + }, + { + "epoch": 0.9769325153374233, + "grad_norm": 0.9927350791758488, + "learning_rate": 2.789842338579929e-08, + "loss": 0.5412, + "step": 7962 + }, + { + "epoch": 0.9770552147239264, + "grad_norm": 0.8842681982562148, + "learning_rate": 2.760255723673888e-08, + "loss": 0.5064, + "step": 7963 + }, + { + "epoch": 0.9771779141104294, + "grad_norm": 0.8863364429622417, + "learning_rate": 2.7308266142119788e-08, + "loss": 0.5509, + "step": 7964 + }, + { + "epoch": 0.9773006134969325, + "grad_norm": 0.8340596874033134, + "learning_rate": 2.7015550148423718e-08, + "loss": 0.5022, + "step": 7965 + }, + { + "epoch": 0.9774233128834355, + "grad_norm": 0.8883975515210084, + "learning_rate": 2.672440930188147e-08, + "loss": 0.4602, + "step": 7966 + }, + { + "epoch": 0.9775460122699386, + "grad_norm": 0.8507460694011764, + "learning_rate": 2.643484364847737e-08, + "loss": 0.5254, + "step": 7967 + }, + { + "epoch": 0.9776687116564418, + "grad_norm": 0.8322483467610219, + "learning_rate": 2.6146853233945946e-08, + "loss": 0.5277, + "step": 7968 + }, + { + "epoch": 0.9777914110429448, + "grad_norm": 0.8011705041970553, + "learning_rate": 2.5860438103771924e-08, + "loss": 0.5253, + "step": 7969 + }, + { + "epoch": 0.9779141104294479, + "grad_norm": 0.8383396458688473, + "learning_rate": 2.557559830319245e-08, + "loss": 0.5323, + "step": 7970 + }, + { + "epoch": 0.9780368098159509, + "grad_norm": 0.8311496995307499, + "learning_rate": 2.5292333877195985e-08, + "loss": 0.5402, + "step": 7971 + }, + { + "epoch": 0.978159509202454, + "grad_norm": 0.9691050145090433, + "learning_rate": 2.5010644870520073e-08, + "loss": 0.5536, + "step": 7972 + }, + { + "epoch": 0.9782822085889571, + "grad_norm": 0.8355377330143107, + "learning_rate": 2.4730531327658014e-08, + "loss": 0.5545, + "step": 7973 + }, + { + "epoch": 0.9784049079754601, + "grad_norm": 0.8959701046857947, + "learning_rate": 2.4451993292848864e-08, + "loss": 0.4584, + "step": 7974 + }, + { + "epoch": 0.9785276073619632, + "grad_norm": 0.9411455913231079, + "learning_rate": 2.417503081008632e-08, + "loss": 0.5659, + "step": 7975 + }, + { + "epoch": 0.9786503067484662, + "grad_norm": 0.9493304372130247, + "learning_rate": 2.389964392311317e-08, + "loss": 0.5505, + "step": 7976 + }, + { + "epoch": 0.9787730061349693, + "grad_norm": 0.9058919268526394, + "learning_rate": 2.362583267542684e-08, + "loss": 0.5626, + "step": 7977 + }, + { + "epoch": 0.9788957055214724, + "grad_norm": 0.9451647449489633, + "learning_rate": 2.335359711027052e-08, + "loss": 0.5241, + "step": 7978 + }, + { + "epoch": 0.9790184049079754, + "grad_norm": 0.9871652897387921, + "learning_rate": 2.3082937270643145e-08, + "loss": 0.5333, + "step": 7979 + }, + { + "epoch": 0.9791411042944785, + "grad_norm": 0.8746659274885416, + "learning_rate": 2.2813853199292745e-08, + "loss": 0.5386, + "step": 7980 + }, + { + "epoch": 0.9792638036809816, + "grad_norm": 0.9534190025357069, + "learning_rate": 
2.2546344938718655e-08, + "loss": 0.5396, + "step": 7981 + }, + { + "epoch": 0.9793865030674846, + "grad_norm": 0.9111582556185539, + "learning_rate": 2.2280412531172635e-08, + "loss": 0.4981, + "step": 7982 + }, + { + "epoch": 0.9795092024539878, + "grad_norm": 0.8043164165485571, + "learning_rate": 2.201605601865442e-08, + "loss": 0.5589, + "step": 7983 + }, + { + "epoch": 0.9796319018404908, + "grad_norm": 0.8088653224153336, + "learning_rate": 2.1753275442918387e-08, + "loss": 0.5031, + "step": 7984 + }, + { + "epoch": 0.9797546012269939, + "grad_norm": 0.9116928681222749, + "learning_rate": 2.1492070845468005e-08, + "loss": 0.5683, + "step": 7985 + }, + { + "epoch": 0.979877300613497, + "grad_norm": 0.8794729458377741, + "learning_rate": 2.123244226755805e-08, + "loss": 0.4805, + "step": 7986 + }, + { + "epoch": 0.98, + "grad_norm": 0.8282781442204029, + "learning_rate": 2.097438975019572e-08, + "loss": 0.6069, + "step": 7987 + }, + { + "epoch": 0.9801226993865031, + "grad_norm": 0.8187214742849125, + "learning_rate": 2.0717913334136197e-08, + "loss": 0.5053, + "step": 7988 + }, + { + "epoch": 0.9802453987730061, + "grad_norm": 1.25287369238123, + "learning_rate": 2.0463013059889293e-08, + "loss": 0.5357, + "step": 7989 + }, + { + "epoch": 0.9803680981595092, + "grad_norm": 0.8936668635474296, + "learning_rate": 2.0209688967713914e-08, + "loss": 0.5034, + "step": 7990 + }, + { + "epoch": 0.9804907975460123, + "grad_norm": 0.9228090356990621, + "learning_rate": 1.9957941097620283e-08, + "loss": 0.5279, + "step": 7991 + }, + { + "epoch": 0.9806134969325153, + "grad_norm": 1.0135227800869049, + "learning_rate": 1.970776948937103e-08, + "loss": 0.5787, + "step": 7992 + }, + { + "epoch": 0.9807361963190184, + "grad_norm": 0.8740265280439142, + "learning_rate": 1.945917418247567e-08, + "loss": 0.5822, + "step": 7993 + }, + { + "epoch": 0.9808588957055214, + "grad_norm": 0.8649679645177933, + "learning_rate": 1.9212155216200567e-08, + "loss": 0.5326, + "step": 7994 + }, + { + "epoch": 0.9809815950920245, + "grad_norm": 0.830398073077827, + "learning_rate": 1.896671262955896e-08, + "loss": 0.5128, + "step": 7995 + }, + { + "epoch": 0.9811042944785276, + "grad_norm": 0.9090198458810024, + "learning_rate": 1.872284646131761e-08, + "loss": 0.4886, + "step": 7996 + }, + { + "epoch": 0.9812269938650306, + "grad_norm": 0.8510253740586807, + "learning_rate": 1.8480556749991274e-08, + "loss": 0.471, + "step": 7997 + }, + { + "epoch": 0.9813496932515338, + "grad_norm": 0.9506837878051284, + "learning_rate": 1.8239843533848222e-08, + "loss": 0.4659, + "step": 7998 + }, + { + "epoch": 0.9814723926380368, + "grad_norm": 0.8501063407706189, + "learning_rate": 1.8000706850906934e-08, + "loss": 0.4511, + "step": 7999 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.9208543830727108, + "learning_rate": 1.7763146738938307e-08, + "loss": 0.5175, + "step": 8000 + }, + { + "epoch": 0.981717791411043, + "grad_norm": 0.9239409951711733, + "learning_rate": 1.752716323546011e-08, + "loss": 0.535, + "step": 8001 + }, + { + "epoch": 0.981840490797546, + "grad_norm": 0.8492456372616244, + "learning_rate": 1.7292756377746965e-08, + "loss": 0.5098, + "step": 8002 + }, + { + "epoch": 0.9819631901840491, + "grad_norm": 0.8370996597542587, + "learning_rate": 1.7059926202820376e-08, + "loss": 0.5412, + "step": 8003 + }, + { + "epoch": 0.9820858895705521, + "grad_norm": 0.8878603328210576, + "learning_rate": 1.6828672747453146e-08, + "loss": 0.454, + "step": 8004 + }, + { + "epoch": 0.9822085889570552, + "grad_norm": 
0.9198002775618334, + "learning_rate": 1.659899604816939e-08, + "loss": 0.4856, + "step": 8005 + }, + { + "epoch": 0.9823312883435583, + "grad_norm": 0.9802582572267956, + "learning_rate": 1.6370896141246762e-08, + "loss": 0.5967, + "step": 8006 + }, + { + "epoch": 0.9824539877300613, + "grad_norm": 0.8707017905022766, + "learning_rate": 1.6144373062709772e-08, + "loss": 0.5545, + "step": 8007 + }, + { + "epoch": 0.9825766871165644, + "grad_norm": 0.867545641271299, + "learning_rate": 1.5919426848336473e-08, + "loss": 0.5634, + "step": 8008 + }, + { + "epoch": 0.9826993865030675, + "grad_norm": 0.8590501301248262, + "learning_rate": 1.5696057533655108e-08, + "loss": 0.5674, + "step": 8009 + }, + { + "epoch": 0.9828220858895705, + "grad_norm": 0.8233266264586266, + "learning_rate": 1.5474265153944124e-08, + "loss": 0.4639, + "step": 8010 + }, + { + "epoch": 0.9829447852760737, + "grad_norm": 0.88613478938243, + "learning_rate": 1.5254049744235498e-08, + "loss": 0.5775, + "step": 8011 + }, + { + "epoch": 0.9830674846625767, + "grad_norm": 0.8433415680605006, + "learning_rate": 1.503541133930919e-08, + "loss": 0.5571, + "step": 8012 + }, + { + "epoch": 0.9831901840490798, + "grad_norm": 0.9859549860459219, + "learning_rate": 1.4818349973697577e-08, + "loss": 0.553, + "step": 8013 + }, + { + "epoch": 0.9833128834355829, + "grad_norm": 1.0147521922354465, + "learning_rate": 1.4602865681682122e-08, + "loss": 0.5348, + "step": 8014 + }, + { + "epoch": 0.9834355828220859, + "grad_norm": 0.8954473365383864, + "learning_rate": 1.4388958497300043e-08, + "loss": 0.5278, + "step": 8015 + }, + { + "epoch": 0.983558282208589, + "grad_norm": 0.8780945824185079, + "learning_rate": 1.4176628454332097e-08, + "loss": 0.5405, + "step": 8016 + }, + { + "epoch": 0.983680981595092, + "grad_norm": 1.0688271171418156, + "learning_rate": 1.3965875586318122e-08, + "loss": 0.5751, + "step": 8017 + }, + { + "epoch": 0.9838036809815951, + "grad_norm": 0.8670146969149469, + "learning_rate": 1.3756699926541495e-08, + "loss": 0.509, + "step": 8018 + }, + { + "epoch": 0.9839263803680982, + "grad_norm": 0.8976139823325352, + "learning_rate": 1.354910150804023e-08, + "loss": 0.574, + "step": 8019 + }, + { + "epoch": 0.9840490797546012, + "grad_norm": 0.8790312166527992, + "learning_rate": 1.3343080363604766e-08, + "loss": 0.4838, + "step": 8020 + }, + { + "epoch": 0.9841717791411043, + "grad_norm": 0.8388723290467638, + "learning_rate": 1.3138636525772408e-08, + "loss": 0.5537, + "step": 8021 + }, + { + "epoch": 0.9842944785276073, + "grad_norm": 0.8004624334036268, + "learning_rate": 1.2935770026833994e-08, + "loss": 0.4314, + "step": 8022 + }, + { + "epoch": 0.9844171779141104, + "grad_norm": 0.8162497267600809, + "learning_rate": 1.273448089882945e-08, + "loss": 0.4985, + "step": 8023 + }, + { + "epoch": 0.9845398773006135, + "grad_norm": 0.8017739120710097, + "learning_rate": 1.253476917355334e-08, + "loss": 0.4714, + "step": 8024 + }, + { + "epoch": 0.9846625766871165, + "grad_norm": 0.9006083375752278, + "learning_rate": 1.2336634882544885e-08, + "loss": 0.512, + "step": 8025 + }, + { + "epoch": 0.9847852760736197, + "grad_norm": 0.9984487978070483, + "learning_rate": 1.2140078057101269e-08, + "loss": 0.5854, + "step": 8026 + }, + { + "epoch": 0.9849079754601227, + "grad_norm": 0.8759698302919335, + "learning_rate": 1.1945098728263215e-08, + "loss": 0.4759, + "step": 8027 + }, + { + "epoch": 0.9850306748466258, + "grad_norm": 0.8623436920214094, + "learning_rate": 1.1751696926829425e-08, + "loss": 0.5311, + "step": 8028 
+ }, + { + "epoch": 0.9851533742331289, + "grad_norm": 0.9563749723732631, + "learning_rate": 1.1559872683344352e-08, + "loss": 0.541, + "step": 8029 + }, + { + "epoch": 0.9852760736196319, + "grad_norm": 1.0403877558211114, + "learning_rate": 1.1369626028104874e-08, + "loss": 0.5393, + "step": 8030 + }, + { + "epoch": 0.985398773006135, + "grad_norm": 0.8821621253114168, + "learning_rate": 1.1180956991160285e-08, + "loss": 0.4942, + "step": 8031 + }, + { + "epoch": 0.985521472392638, + "grad_norm": 0.7267949315371341, + "learning_rate": 1.0993865602306753e-08, + "loss": 0.5161, + "step": 8032 + }, + { + "epoch": 0.9856441717791411, + "grad_norm": 0.805588025301413, + "learning_rate": 1.0808351891096191e-08, + "loss": 0.4556, + "step": 8033 + }, + { + "epoch": 0.9857668711656442, + "grad_norm": 0.8927511105803629, + "learning_rate": 1.0624415886827389e-08, + "loss": 0.5279, + "step": 8034 + }, + { + "epoch": 0.9858895705521472, + "grad_norm": 0.8459323726144836, + "learning_rate": 1.0442057618551549e-08, + "loss": 0.4843, + "step": 8035 + }, + { + "epoch": 0.9860122699386503, + "grad_norm": 0.8753366294860945, + "learning_rate": 1.0261277115071188e-08, + "loss": 0.501, + "step": 8036 + }, + { + "epoch": 0.9861349693251533, + "grad_norm": 0.9306000217812646, + "learning_rate": 1.0082074404940134e-08, + "loss": 0.5169, + "step": 8037 + }, + { + "epoch": 0.9862576687116564, + "grad_norm": 0.7312309543909025, + "learning_rate": 9.904449516459081e-09, + "loss": 0.531, + "step": 8038 + }, + { + "epoch": 0.9863803680981595, + "grad_norm": 0.9083110409540003, + "learning_rate": 9.728402477684472e-09, + "loss": 0.4119, + "step": 8039 + }, + { + "epoch": 0.9865030674846625, + "grad_norm": 0.8655446990850848, + "learning_rate": 9.553933316420739e-09, + "loss": 0.5468, + "step": 8040 + }, + { + "epoch": 0.9866257668711657, + "grad_norm": 0.861511367758594, + "learning_rate": 9.38104206022361e-09, + "loss": 0.4669, + "step": 8041 + }, + { + "epoch": 0.9867484662576688, + "grad_norm": 0.8693267015325528, + "learning_rate": 9.209728736400136e-09, + "loss": 0.5258, + "step": 8042 + }, + { + "epoch": 0.9868711656441718, + "grad_norm": 0.9771570998618054, + "learning_rate": 9.039993372007561e-09, + "loss": 0.4855, + "step": 8043 + }, + { + "epoch": 0.9869938650306749, + "grad_norm": 0.8955437844374673, + "learning_rate": 8.871835993854439e-09, + "loss": 0.4915, + "step": 8044 + }, + { + "epoch": 0.9871165644171779, + "grad_norm": 0.8585584827180223, + "learning_rate": 8.705256628499525e-09, + "loss": 0.531, + "step": 8045 + }, + { + "epoch": 0.987239263803681, + "grad_norm": 0.8542543399260744, + "learning_rate": 8.540255302252887e-09, + "loss": 0.5431, + "step": 8046 + }, + { + "epoch": 0.9873619631901841, + "grad_norm": 0.8228956650446958, + "learning_rate": 8.376832041174787e-09, + "loss": 0.5423, + "step": 8047 + }, + { + "epoch": 0.9874846625766871, + "grad_norm": 0.9198179381114585, + "learning_rate": 8.214986871076803e-09, + "loss": 0.5514, + "step": 8048 + }, + { + "epoch": 0.9876073619631902, + "grad_norm": 0.8816583365177868, + "learning_rate": 8.054719817519596e-09, + "loss": 0.5577, + "step": 8049 + }, + { + "epoch": 0.9877300613496932, + "grad_norm": 0.9052543239389714, + "learning_rate": 7.896030905818474e-09, + "loss": 0.5354, + "step": 8050 + }, + { + "epoch": 0.9878527607361963, + "grad_norm": 0.8543715000014107, + "learning_rate": 7.738920161035613e-09, + "loss": 0.5261, + "step": 8051 + }, + { + "epoch": 0.9879754601226994, + "grad_norm": 0.9198759569116836, + "learning_rate": 
7.583387607984493e-09, + "loss": 0.4823, + "step": 8052 + }, + { + "epoch": 0.9880981595092024, + "grad_norm": 0.9578516460389854, + "learning_rate": 7.429433271231024e-09, + "loss": 0.5572, + "step": 8053 + }, + { + "epoch": 0.9882208588957055, + "grad_norm": 0.9081208917932677, + "learning_rate": 7.277057175091307e-09, + "loss": 0.4946, + "step": 8054 + }, + { + "epoch": 0.9883435582822085, + "grad_norm": 0.8747611139039927, + "learning_rate": 7.126259343631648e-09, + "loss": 0.5524, + "step": 8055 + }, + { + "epoch": 0.9884662576687117, + "grad_norm": 0.8787984601811579, + "learning_rate": 6.97703980066855e-09, + "loss": 0.5103, + "step": 8056 + }, + { + "epoch": 0.9885889570552148, + "grad_norm": 0.87072259959036, + "learning_rate": 6.82939856977094e-09, + "loss": 0.5474, + "step": 8057 + }, + { + "epoch": 0.9887116564417178, + "grad_norm": 0.8564390476942163, + "learning_rate": 6.68333567425683e-09, + "loss": 0.5322, + "step": 8058 + }, + { + "epoch": 0.9888343558282209, + "grad_norm": 0.854695122142646, + "learning_rate": 6.538851137196656e-09, + "loss": 0.4934, + "step": 8059 + }, + { + "epoch": 0.9889570552147239, + "grad_norm": 0.9056194701761259, + "learning_rate": 6.39594498140883e-09, + "loss": 0.519, + "step": 8060 + }, + { + "epoch": 0.989079754601227, + "grad_norm": 0.8422972208929244, + "learning_rate": 6.254617229464188e-09, + "loss": 0.5039, + "step": 8061 + }, + { + "epoch": 0.9892024539877301, + "grad_norm": 0.9256714827204287, + "learning_rate": 6.1148679036859836e-09, + "loss": 0.5247, + "step": 8062 + }, + { + "epoch": 0.9893251533742331, + "grad_norm": 0.8624395770871067, + "learning_rate": 5.97669702614545e-09, + "loss": 0.5476, + "step": 8063 + }, + { + "epoch": 0.9894478527607362, + "grad_norm": 1.0688125470041885, + "learning_rate": 5.840104618665132e-09, + "loss": 0.5801, + "step": 8064 + }, + { + "epoch": 0.9895705521472392, + "grad_norm": 0.8190728932369684, + "learning_rate": 5.705090702819993e-09, + "loss": 0.5366, + "step": 8065 + }, + { + "epoch": 0.9896932515337423, + "grad_norm": 0.8482950462843711, + "learning_rate": 5.571655299931866e-09, + "loss": 0.579, + "step": 8066 + }, + { + "epoch": 0.9898159509202454, + "grad_norm": 0.8557424497071219, + "learning_rate": 5.439798431078336e-09, + "loss": 0.5821, + "step": 8067 + }, + { + "epoch": 0.9899386503067484, + "grad_norm": 0.8383896917763847, + "learning_rate": 5.309520117082745e-09, + "loss": 0.5722, + "step": 8068 + }, + { + "epoch": 0.9900613496932515, + "grad_norm": 0.9178972492705302, + "learning_rate": 5.180820378524187e-09, + "loss": 0.5422, + "step": 8069 + }, + { + "epoch": 0.9901840490797545, + "grad_norm": 0.9600285398137407, + "learning_rate": 5.053699235726406e-09, + "loss": 0.5252, + "step": 8070 + }, + { + "epoch": 0.9903067484662577, + "grad_norm": 0.8162668048804597, + "learning_rate": 4.9281567087700044e-09, + "loss": 0.4932, + "step": 8071 + }, + { + "epoch": 0.9904294478527608, + "grad_norm": 0.8142952720052488, + "learning_rate": 4.804192817481345e-09, + "loss": 0.4845, + "step": 8072 + }, + { + "epoch": 0.9905521472392638, + "grad_norm": 0.8057179393570743, + "learning_rate": 4.681807581440323e-09, + "loss": 0.5525, + "step": 8073 + }, + { + "epoch": 0.9906748466257669, + "grad_norm": 0.9366795662116038, + "learning_rate": 4.5610010199770295e-09, + "loss": 0.5666, + "step": 8074 + }, + { + "epoch": 0.99079754601227, + "grad_norm": 0.8662139568499244, + "learning_rate": 4.4417731521717576e-09, + "loss": 0.515, + "step": 8075 + }, + { + "epoch": 0.990920245398773, + "grad_norm": 
0.9590081911308814, + "learning_rate": 4.324123996853891e-09, + "loss": 0.5811, + "step": 8076 + }, + { + "epoch": 0.9910429447852761, + "grad_norm": 0.8870411957538403, + "learning_rate": 4.208053572606341e-09, + "loss": 0.495, + "step": 8077 + }, + { + "epoch": 0.9911656441717791, + "grad_norm": 0.8344581413975124, + "learning_rate": 4.093561897762222e-09, + "loss": 0.5469, + "step": 8078 + }, + { + "epoch": 0.9912883435582822, + "grad_norm": 1.0099098315622514, + "learning_rate": 3.980648990403735e-09, + "loss": 0.577, + "step": 8079 + }, + { + "epoch": 0.9914110429447853, + "grad_norm": 0.9563147486294613, + "learning_rate": 3.869314868363283e-09, + "loss": 0.5218, + "step": 8080 + }, + { + "epoch": 0.9915337423312883, + "grad_norm": 0.8611158842908662, + "learning_rate": 3.759559549227909e-09, + "loss": 0.5954, + "step": 8081 + }, + { + "epoch": 0.9916564417177914, + "grad_norm": 0.9097672699169956, + "learning_rate": 3.6513830503293047e-09, + "loss": 0.5247, + "step": 8082 + }, + { + "epoch": 0.9917791411042944, + "grad_norm": 0.7652367179069026, + "learning_rate": 3.5447853887560226e-09, + "loss": 0.6219, + "step": 8083 + }, + { + "epoch": 0.9919018404907975, + "grad_norm": 0.9393390578529786, + "learning_rate": 3.439766581342374e-09, + "loss": 0.4957, + "step": 8084 + }, + { + "epoch": 0.9920245398773007, + "grad_norm": 0.8373375281498044, + "learning_rate": 3.3363266446750918e-09, + "loss": 0.4627, + "step": 8085 + }, + { + "epoch": 0.9921472392638037, + "grad_norm": 0.9034458528380025, + "learning_rate": 3.2344655950922176e-09, + "loss": 0.4945, + "step": 8086 + }, + { + "epoch": 0.9922699386503068, + "grad_norm": 0.9526404878453558, + "learning_rate": 3.1341834486831035e-09, + "loss": 0.5604, + "step": 8087 + }, + { + "epoch": 0.9923926380368098, + "grad_norm": 0.8977836091910447, + "learning_rate": 3.0354802212839705e-09, + "loss": 0.5696, + "step": 8088 + }, + { + "epoch": 0.9925153374233129, + "grad_norm": 0.8715294699020937, + "learning_rate": 2.938355928485681e-09, + "loss": 0.5148, + "step": 8089 + }, + { + "epoch": 0.992638036809816, + "grad_norm": 0.9853394089797015, + "learning_rate": 2.842810585627076e-09, + "loss": 0.5578, + "step": 8090 + }, + { + "epoch": 0.992760736196319, + "grad_norm": 0.9274242502223354, + "learning_rate": 2.7488442078005273e-09, + "loss": 0.5316, + "step": 8091 + }, + { + "epoch": 0.9928834355828221, + "grad_norm": 1.0413802994917132, + "learning_rate": 2.656456809846386e-09, + "loss": 0.5213, + "step": 8092 + }, + { + "epoch": 0.9930061349693251, + "grad_norm": 0.8100579106661275, + "learning_rate": 2.5656484063552035e-09, + "loss": 0.5716, + "step": 8093 + }, + { + "epoch": 0.9931288343558282, + "grad_norm": 0.8899447691177579, + "learning_rate": 2.4764190116710607e-09, + "loss": 0.5314, + "step": 8094 + }, + { + "epoch": 0.9932515337423313, + "grad_norm": 0.9222314718459492, + "learning_rate": 2.388768639886019e-09, + "loss": 0.5352, + "step": 8095 + }, + { + "epoch": 0.9933742331288343, + "grad_norm": 0.9069918617561391, + "learning_rate": 2.302697304843449e-09, + "loss": 0.4852, + "step": 8096 + }, + { + "epoch": 0.9934969325153374, + "grad_norm": 0.8137172378200116, + "learning_rate": 2.2182050201391414e-09, + "loss": 0.5232, + "step": 8097 + }, + { + "epoch": 0.9936196319018404, + "grad_norm": 0.8874866091069821, + "learning_rate": 2.1352917991168674e-09, + "loss": 0.5463, + "step": 8098 + }, + { + "epoch": 0.9937423312883435, + "grad_norm": 0.8136865280677629, + "learning_rate": 2.053957654871708e-09, + "loss": 0.5901, + "step": 8099 + 
}, + { + "epoch": 0.9938650306748467, + "grad_norm": 0.8814384764896394, + "learning_rate": 1.9742026002500526e-09, + "loss": 0.5862, + "step": 8100 + }, + { + "epoch": 0.9939877300613497, + "grad_norm": 0.8328294556953024, + "learning_rate": 1.8960266478484924e-09, + "loss": 0.5798, + "step": 8101 + }, + { + "epoch": 0.9941104294478528, + "grad_norm": 0.8106544334113898, + "learning_rate": 1.8194298100149277e-09, + "loss": 0.526, + "step": 8102 + }, + { + "epoch": 0.9942331288343559, + "grad_norm": 0.8340288064955491, + "learning_rate": 1.7444120988452385e-09, + "loss": 0.5601, + "step": 8103 + }, + { + "epoch": 0.9943558282208589, + "grad_norm": 0.8984481256929826, + "learning_rate": 1.6709735261910553e-09, + "loss": 0.5399, + "step": 8104 + }, + { + "epoch": 0.994478527607362, + "grad_norm": 0.9303033322102344, + "learning_rate": 1.5991141036475478e-09, + "loss": 0.5549, + "step": 8105 + }, + { + "epoch": 0.994601226993865, + "grad_norm": 0.85853611283566, + "learning_rate": 1.5288338425678562e-09, + "loss": 0.484, + "step": 8106 + }, + { + "epoch": 0.9947239263803681, + "grad_norm": 0.8930763887982528, + "learning_rate": 1.46013275404977e-09, + "loss": 0.5165, + "step": 8107 + }, + { + "epoch": 0.9948466257668712, + "grad_norm": 0.8438699545783744, + "learning_rate": 1.3930108489446093e-09, + "loss": 0.5223, + "step": 8108 + }, + { + "epoch": 0.9949693251533742, + "grad_norm": 0.8399309561118803, + "learning_rate": 1.3274681378538933e-09, + "loss": 0.5421, + "step": 8109 + }, + { + "epoch": 0.9950920245398773, + "grad_norm": 0.8905008251946089, + "learning_rate": 1.263504631129342e-09, + "loss": 0.5374, + "step": 8110 + }, + { + "epoch": 0.9952147239263803, + "grad_norm": 0.8830448931043606, + "learning_rate": 1.2011203388739844e-09, + "loss": 0.5364, + "step": 8111 + }, + { + "epoch": 0.9953374233128834, + "grad_norm": 1.0417689216822437, + "learning_rate": 1.14031527093994e-09, + "loss": 0.5166, + "step": 8112 + }, + { + "epoch": 0.9954601226993866, + "grad_norm": 1.0347947003396731, + "learning_rate": 1.081089436931748e-09, + "loss": 0.6003, + "step": 8113 + }, + { + "epoch": 0.9955828220858896, + "grad_norm": 0.8792457303148355, + "learning_rate": 1.0234428462030377e-09, + "loss": 0.5029, + "step": 8114 + }, + { + "epoch": 0.9957055214723927, + "grad_norm": 0.8686081457398056, + "learning_rate": 9.673755078598578e-10, + "loss": 0.5278, + "step": 8115 + }, + { + "epoch": 0.9958282208588957, + "grad_norm": 0.8058753311555389, + "learning_rate": 9.128874307551273e-10, + "loss": 0.5146, + "step": 8116 + }, + { + "epoch": 0.9959509202453988, + "grad_norm": 0.9367899368919581, + "learning_rate": 8.59978623497515e-10, + "loss": 0.5681, + "step": 8117 + }, + { + "epoch": 0.9960736196319019, + "grad_norm": 0.971898249370446, + "learning_rate": 8.086490944414494e-10, + "loss": 0.5797, + "step": 8118 + }, + { + "epoch": 0.9961963190184049, + "grad_norm": 0.9683313291599472, + "learning_rate": 7.588988516937789e-10, + "loss": 0.5146, + "step": 8119 + }, + { + "epoch": 0.996319018404908, + "grad_norm": 0.8045901559914443, + "learning_rate": 7.107279031148828e-10, + "loss": 0.5079, + "step": 8120 + }, + { + "epoch": 0.996441717791411, + "grad_norm": 0.9186592668519871, + "learning_rate": 6.641362563097886e-10, + "loss": 0.5177, + "step": 8121 + }, + { + "epoch": 0.9965644171779141, + "grad_norm": 0.9199785348879954, + "learning_rate": 6.19123918639275e-10, + "loss": 0.5649, + "step": 8122 + }, + { + "epoch": 0.9966871165644172, + "grad_norm": 0.8840120731042634, + "learning_rate": 
5.756908972109898e-10, + "loss": 0.5299, + "step": 8123 + }, + { + "epoch": 0.9968098159509202, + "grad_norm": 1.083641587766273, + "learning_rate": 5.338371988872216e-10, + "loss": 0.5669, + "step": 8124 + }, + { + "epoch": 0.9969325153374233, + "grad_norm": 0.9769653047897521, + "learning_rate": 4.935628302760175e-10, + "loss": 0.5578, + "step": 8125 + }, + { + "epoch": 0.9970552147239263, + "grad_norm": 0.8170174473131367, + "learning_rate": 4.5486779773895595e-10, + "loss": 0.5487, + "step": 8126 + }, + { + "epoch": 0.9971779141104294, + "grad_norm": 0.8420874721010575, + "learning_rate": 4.177521073889246e-10, + "loss": 0.4847, + "step": 8127 + }, + { + "epoch": 0.9973006134969326, + "grad_norm": 0.896033199112176, + "learning_rate": 3.822157650867908e-10, + "loss": 0.5669, + "step": 8128 + }, + { + "epoch": 0.9974233128834356, + "grad_norm": 0.85675688846535, + "learning_rate": 3.4825877644473207e-10, + "loss": 0.5506, + "step": 8129 + }, + { + "epoch": 0.9975460122699387, + "grad_norm": 0.8311186703343291, + "learning_rate": 3.158811468273459e-10, + "loss": 0.5423, + "step": 8130 + }, + { + "epoch": 0.9976687116564417, + "grad_norm": 0.8426355738890635, + "learning_rate": 2.8508288134720954e-10, + "loss": 0.5768, + "step": 8131 + }, + { + "epoch": 0.9977914110429448, + "grad_norm": 0.7893457484790143, + "learning_rate": 2.5586398486932004e-10, + "loss": 0.5387, + "step": 8132 + }, + { + "epoch": 0.9979141104294479, + "grad_norm": 0.8013536913736445, + "learning_rate": 2.282244620088747e-10, + "loss": 0.5362, + "step": 8133 + }, + { + "epoch": 0.9980368098159509, + "grad_norm": 0.9864595014884832, + "learning_rate": 2.021643171312704e-10, + "loss": 0.5421, + "step": 8134 + }, + { + "epoch": 0.998159509202454, + "grad_norm": 0.920543704113992, + "learning_rate": 1.776835543509936e-10, + "loss": 0.5527, + "step": 8135 + }, + { + "epoch": 0.9982822085889571, + "grad_norm": 0.8915800376660765, + "learning_rate": 1.547821775360614e-10, + "loss": 0.5585, + "step": 8136 + }, + { + "epoch": 0.9984049079754601, + "grad_norm": 0.9062536648460732, + "learning_rate": 1.3346019030247016e-10, + "loss": 0.5451, + "step": 8137 + }, + { + "epoch": 0.9985276073619632, + "grad_norm": 0.8328876678582104, + "learning_rate": 1.1371759601974674e-10, + "loss": 0.5397, + "step": 8138 + }, + { + "epoch": 0.9986503067484662, + "grad_norm": 0.8866597538851861, + "learning_rate": 9.555439780428721e-11, + "loss": 0.5712, + "step": 8139 + }, + { + "epoch": 0.9987730061349693, + "grad_norm": 0.8750166818728027, + "learning_rate": 7.897059852490785e-11, + "loss": 0.5362, + "step": 8140 + }, + { + "epoch": 0.9988957055214724, + "grad_norm": 0.9581191474743798, + "learning_rate": 6.396620080173499e-11, + "loss": 0.5489, + "step": 8141 + }, + { + "epoch": 0.9990184049079754, + "grad_norm": 0.8510303887755856, + "learning_rate": 5.054120700287435e-11, + "loss": 0.5471, + "step": 8142 + }, + { + "epoch": 0.9991411042944786, + "grad_norm": 0.8110949132893618, + "learning_rate": 3.8695619251072345e-11, + "loss": 0.5199, + "step": 8143 + }, + { + "epoch": 0.9992638036809816, + "grad_norm": 0.8832142190327922, + "learning_rate": 2.8429439417054783e-11, + "loss": 0.5182, + "step": 8144 + }, + { + "epoch": 0.9993865030674847, + "grad_norm": 0.8881623546886691, + "learning_rate": 1.9742669119526824e-11, + "loss": 0.5213, + "step": 8145 + }, + { + "epoch": 0.9995092024539878, + "grad_norm": 0.977282339442972, + "learning_rate": 1.2635309732944579e-11, + "loss": 0.5183, + "step": 8146 + }, + { + "epoch": 0.9996319018404908, + 
"grad_norm": 0.8807034701591079, + "learning_rate": 7.107362379743521e-12, + "loss": 0.5397, + "step": 8147 + }, + { + "epoch": 0.9997546012269939, + "grad_norm": 0.8336735776679527, + "learning_rate": 3.15882793255895e-12, + "loss": 0.4885, + "step": 8148 + }, + { + "epoch": 0.9998773006134969, + "grad_norm": 0.8306700824474735, + "learning_rate": 7.897070142259822e-13, + "loss": 0.4691, + "step": 8149 + }, + { + "epoch": 1.0, + "grad_norm": 0.8556391278665927, + "learning_rate": 0.0, + "loss": 0.5802, + "step": 8150 + }, + { + "epoch": 1.0, + "step": 8150, + "total_flos": 7.786043613317693e+17, + "train_loss": 0.042634430275372934, + "train_runtime": 8950.8274, + "train_samples_per_second": 233.09, + "train_steps_per_second": 0.911 + } + ], + "logging_steps": 1.0, + "max_steps": 8150, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.786043613317693e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}