{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.981177899210686, "eval_steps": 26, "global_step": 822, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024286581663630845, "grad_norm": 3.421875, "learning_rate": 1.25e-06, "loss": 0.9095, "step": 1 }, { "epoch": 0.0024286581663630845, "eval_loss": 0.8089314699172974, "eval_runtime": 98.8099, "eval_samples_per_second": 30.361, "eval_steps_per_second": 3.795, "step": 1 }, { "epoch": 0.004857316332726169, "grad_norm": 3.40625, "learning_rate": 2.5e-06, "loss": 0.8146, "step": 2 }, { "epoch": 0.007285974499089253, "grad_norm": 3.265625, "learning_rate": 3.7500000000000005e-06, "loss": 0.806, "step": 3 }, { "epoch": 0.009714632665452338, "grad_norm": 2.453125, "learning_rate": 5e-06, "loss": 0.781, "step": 4 }, { "epoch": 0.012143290831815421, "grad_norm": 2.015625, "learning_rate": 6.25e-06, "loss": 0.7774, "step": 5 }, { "epoch": 0.014571948998178506, "grad_norm": 1.953125, "learning_rate": 7.500000000000001e-06, "loss": 0.776, "step": 6 }, { "epoch": 0.01700060716454159, "grad_norm": 2.03125, "learning_rate": 8.750000000000001e-06, "loss": 0.7554, "step": 7 }, { "epoch": 0.019429265330904676, "grad_norm": 1.1640625, "learning_rate": 1e-05, "loss": 0.7362, "step": 8 }, { "epoch": 0.02185792349726776, "grad_norm": 0.97265625, "learning_rate": 1.125e-05, "loss": 0.7365, "step": 9 }, { "epoch": 0.024286581663630843, "grad_norm": 1.1640625, "learning_rate": 1.25e-05, "loss": 0.7183, "step": 10 }, { "epoch": 0.02671523982999393, "grad_norm": 1.1953125, "learning_rate": 1.375e-05, "loss": 0.7153, "step": 11 }, { "epoch": 0.029143897996357013, "grad_norm": 1.2109375, "learning_rate": 1.5000000000000002e-05, "loss": 0.8111, "step": 12 }, { "epoch": 0.031572556162720096, "grad_norm": 0.78125, "learning_rate": 1.6250000000000002e-05, "loss": 0.6966, "step": 13 }, { "epoch": 0.03400121432908318, "grad_norm": 0.640625, "learning_rate": 1.7500000000000002e-05, "loss": 0.7068, "step": 14 }, { "epoch": 0.03642987249544627, "grad_norm": 0.6484375, "learning_rate": 1.8750000000000002e-05, "loss": 0.6915, "step": 15 }, { "epoch": 0.03885853066180935, "grad_norm": 0.6796875, "learning_rate": 2e-05, "loss": 0.6878, "step": 16 }, { "epoch": 0.041287188828172436, "grad_norm": 0.61328125, "learning_rate": 1.999992403752328e-05, "loss": 0.6902, "step": 17 }, { "epoch": 0.04371584699453552, "grad_norm": 0.55859375, "learning_rate": 1.999969615124717e-05, "loss": 0.6818, "step": 18 }, { "epoch": 0.0461445051608986, "grad_norm": 0.52734375, "learning_rate": 1.999931634463383e-05, "loss": 0.6732, "step": 19 }, { "epoch": 0.048573163327261686, "grad_norm": 0.47265625, "learning_rate": 1.9998784623453477e-05, "loss": 0.6693, "step": 20 }, { "epoch": 0.051001821493624776, "grad_norm": 0.47265625, "learning_rate": 1.999810099578428e-05, "loss": 0.6663, "step": 21 }, { "epoch": 0.05343047965998786, "grad_norm": 0.72265625, "learning_rate": 1.9997265472012247e-05, "loss": 0.7473, "step": 22 }, { "epoch": 0.05585913782635094, "grad_norm": 0.462890625, "learning_rate": 1.999627806483107e-05, "loss": 0.6457, "step": 23 }, { "epoch": 0.058287795992714025, "grad_norm": 0.70703125, "learning_rate": 1.999513878924193e-05, "loss": 0.7388, "step": 24 }, { "epoch": 0.06071645415907711, "grad_norm": 0.447265625, "learning_rate": 1.9993847662553264e-05, "loss": 0.6505, "step": 25 }, { "epoch": 0.06314511232544019, "grad_norm": 0.4453125, "learning_rate": 1.9992404704380513e-05, "loss": 0.6388, "step": 26 }, { "epoch": 0.06314511232544019, "eval_loss": 0.649046003818512, "eval_runtime": 97.2348, "eval_samples_per_second": 30.853, "eval_steps_per_second": 3.857, "step": 26 }, { "epoch": 0.06557377049180328, "grad_norm": 0.416015625, "learning_rate": 1.9990809936645804e-05, "loss": 0.6507, "step": 27 }, { "epoch": 0.06800242865816636, "grad_norm": 0.42578125, "learning_rate": 1.9989063383577644e-05, "loss": 0.6536, "step": 28 }, { "epoch": 0.07043108682452945, "grad_norm": 0.42578125, "learning_rate": 1.998716507171053e-05, "loss": 0.6508, "step": 29 }, { "epoch": 0.07285974499089254, "grad_norm": 0.412109375, "learning_rate": 1.9985115029884556e-05, "loss": 0.6465, "step": 30 }, { "epoch": 0.07528840315725562, "grad_norm": 0.5546875, "learning_rate": 1.9982913289244977e-05, "loss": 0.7309, "step": 31 }, { "epoch": 0.0777170613236187, "grad_norm": 0.396484375, "learning_rate": 1.9980559883241723e-05, "loss": 0.6319, "step": 32 }, { "epoch": 0.08014571948998178, "grad_norm": 0.392578125, "learning_rate": 1.9978054847628908e-05, "loss": 0.6309, "step": 33 }, { "epoch": 0.08257437765634487, "grad_norm": 0.392578125, "learning_rate": 1.9975398220464268e-05, "loss": 0.6301, "step": 34 }, { "epoch": 0.08500303582270795, "grad_norm": 0.392578125, "learning_rate": 1.9972590042108605e-05, "loss": 0.6364, "step": 35 }, { "epoch": 0.08743169398907104, "grad_norm": 0.400390625, "learning_rate": 1.996963035522515e-05, "loss": 0.6303, "step": 36 }, { "epoch": 0.08986035215543413, "grad_norm": 0.384765625, "learning_rate": 1.9966519204778937e-05, "loss": 0.6374, "step": 37 }, { "epoch": 0.0922890103217972, "grad_norm": 0.390625, "learning_rate": 1.99632566380361e-05, "loss": 0.6177, "step": 38 }, { "epoch": 0.0947176684881603, "grad_norm": 0.38671875, "learning_rate": 1.995984270456317e-05, "loss": 0.6259, "step": 39 }, { "epoch": 0.09714632665452337, "grad_norm": 0.380859375, "learning_rate": 1.995627745622632e-05, "loss": 0.6311, "step": 40 }, { "epoch": 0.09957498482088646, "grad_norm": 0.39453125, "learning_rate": 1.9952560947190568e-05, "loss": 0.6254, "step": 41 }, { "epoch": 0.10200364298724955, "grad_norm": 0.376953125, "learning_rate": 1.994869323391895e-05, "loss": 0.6197, "step": 42 }, { "epoch": 0.10443230115361263, "grad_norm": 0.373046875, "learning_rate": 1.9944674375171697e-05, "loss": 0.6147, "step": 43 }, { "epoch": 0.10686095931997572, "grad_norm": 0.380859375, "learning_rate": 1.9940504432005293e-05, "loss": 0.6281, "step": 44 }, { "epoch": 0.1092896174863388, "grad_norm": 0.36328125, "learning_rate": 1.993618346777158e-05, "loss": 0.6142, "step": 45 }, { "epoch": 0.11171827565270188, "grad_norm": 0.373046875, "learning_rate": 1.993171154811679e-05, "loss": 0.6182, "step": 46 }, { "epoch": 0.11414693381906496, "grad_norm": 0.376953125, "learning_rate": 1.992708874098054e-05, "loss": 0.6181, "step": 47 }, { "epoch": 0.11657559198542805, "grad_norm": 0.375, "learning_rate": 1.992231511659481e-05, "loss": 0.6136, "step": 48 }, { "epoch": 0.11900425015179114, "grad_norm": 0.376953125, "learning_rate": 1.9917390747482855e-05, "loss": 0.6052, "step": 49 }, { "epoch": 0.12143290831815422, "grad_norm": 0.369140625, "learning_rate": 1.9912315708458144e-05, "loss": 0.6087, "step": 50 }, { "epoch": 0.12386156648451731, "grad_norm": 0.359375, "learning_rate": 1.9907090076623174e-05, "loss": 0.6031, "step": 51 }, { "epoch": 0.12629022465088038, "grad_norm": 0.384765625, "learning_rate": 1.9901713931368333e-05, "loss": 0.6131, "step": 52 }, { "epoch": 0.12629022465088038, "eval_loss": 0.612246572971344, "eval_runtime": 97.1281, "eval_samples_per_second": 30.887, "eval_steps_per_second": 3.861, "step": 52 }, { "epoch": 0.12871888281724347, "grad_norm": 0.455078125, "learning_rate": 1.989618735437069e-05, "loss": 0.702, "step": 53 }, { "epoch": 0.13114754098360656, "grad_norm": 0.380859375, "learning_rate": 1.989051042959273e-05, "loss": 0.6192, "step": 54 }, { "epoch": 0.13357619914996965, "grad_norm": 0.388671875, "learning_rate": 1.9884683243281117e-05, "loss": 0.612, "step": 55 }, { "epoch": 0.13600485731633272, "grad_norm": 0.376953125, "learning_rate": 1.9878705883965342e-05, "loss": 0.6026, "step": 56 }, { "epoch": 0.1384335154826958, "grad_norm": 0.380859375, "learning_rate": 1.9872578442456415e-05, "loss": 0.6044, "step": 57 }, { "epoch": 0.1408621736490589, "grad_norm": 0.404296875, "learning_rate": 1.986630101184546e-05, "loss": 0.6061, "step": 58 }, { "epoch": 0.143290831815422, "grad_norm": 0.390625, "learning_rate": 1.9859873687502317e-05, "loss": 0.6113, "step": 59 }, { "epoch": 0.14571948998178508, "grad_norm": 0.384765625, "learning_rate": 1.9853296567074075e-05, "loss": 0.5933, "step": 60 }, { "epoch": 0.14814814814814814, "grad_norm": 0.38671875, "learning_rate": 1.9846569750483605e-05, "loss": 0.6046, "step": 61 }, { "epoch": 0.15057680631451123, "grad_norm": 0.3984375, "learning_rate": 1.983969333992804e-05, "loss": 0.6079, "step": 62 }, { "epoch": 0.15300546448087432, "grad_norm": 0.392578125, "learning_rate": 1.9832667439877217e-05, "loss": 0.6098, "step": 63 }, { "epoch": 0.1554341226472374, "grad_norm": 0.373046875, "learning_rate": 1.982549215707209e-05, "loss": 0.5942, "step": 64 }, { "epoch": 0.15786278081360047, "grad_norm": 0.376953125, "learning_rate": 1.98181676005231e-05, "loss": 0.6082, "step": 65 }, { "epoch": 0.16029143897996356, "grad_norm": 0.478515625, "learning_rate": 1.9810693881508548e-05, "loss": 0.6838, "step": 66 }, { "epoch": 0.16272009714632665, "grad_norm": 0.390625, "learning_rate": 1.980307111357288e-05, "loss": 0.5919, "step": 67 }, { "epoch": 0.16514875531268974, "grad_norm": 0.369140625, "learning_rate": 1.9795299412524948e-05, "loss": 0.5769, "step": 68 }, { "epoch": 0.16757741347905283, "grad_norm": 0.3828125, "learning_rate": 1.9787378896436292e-05, "loss": 0.6, "step": 69 }, { "epoch": 0.1700060716454159, "grad_norm": 0.369140625, "learning_rate": 1.9779309685639317e-05, "loss": 0.5963, "step": 70 }, { "epoch": 0.172434729811779, "grad_norm": 0.373046875, "learning_rate": 1.9771091902725465e-05, "loss": 0.5954, "step": 71 }, { "epoch": 0.17486338797814208, "grad_norm": 0.384765625, "learning_rate": 1.9762725672543372e-05, "loss": 0.5892, "step": 72 }, { "epoch": 0.17729204614450517, "grad_norm": 0.376953125, "learning_rate": 1.9754211122196945e-05, "loss": 0.5883, "step": 73 }, { "epoch": 0.17972070431086826, "grad_norm": 0.373046875, "learning_rate": 1.9745548381043454e-05, "loss": 0.5925, "step": 74 }, { "epoch": 0.18214936247723132, "grad_norm": 0.39453125, "learning_rate": 1.9736737580691553e-05, "loss": 0.5867, "step": 75 }, { "epoch": 0.1845780206435944, "grad_norm": 0.373046875, "learning_rate": 1.9727778854999283e-05, "loss": 0.5931, "step": 76 }, { "epoch": 0.1870066788099575, "grad_norm": 0.384765625, "learning_rate": 1.9718672340072044e-05, "loss": 0.5858, "step": 77 }, { "epoch": 0.1894353369763206, "grad_norm": 0.380859375, "learning_rate": 1.9709418174260523e-05, "loss": 0.5933, "step": 78 }, { "epoch": 0.1894353369763206, "eval_loss": 0.5919594168663025, "eval_runtime": 97.3358, "eval_samples_per_second": 30.821, "eval_steps_per_second": 3.853, "step": 78 }, { "epoch": 0.19186399514268368, "grad_norm": 0.3671875, "learning_rate": 1.970001649815859e-05, "loss": 0.5753, "step": 79 }, { "epoch": 0.19429265330904674, "grad_norm": 0.380859375, "learning_rate": 1.969046745460116e-05, "loss": 0.5892, "step": 80 }, { "epoch": 0.19672131147540983, "grad_norm": 0.3828125, "learning_rate": 1.9680771188662044e-05, "loss": 0.5917, "step": 81 }, { "epoch": 0.19914996964177292, "grad_norm": 0.38671875, "learning_rate": 1.9670927847651707e-05, "loss": 0.5913, "step": 82 }, { "epoch": 0.201578627808136, "grad_norm": 0.37890625, "learning_rate": 1.9660937581115073e-05, "loss": 0.5787, "step": 83 }, { "epoch": 0.2040072859744991, "grad_norm": 0.37109375, "learning_rate": 1.9650800540829204e-05, "loss": 0.5779, "step": 84 }, { "epoch": 0.20643594414086217, "grad_norm": 0.376953125, "learning_rate": 1.964051688080105e-05, "loss": 0.5912, "step": 85 }, { "epoch": 0.20886460230722526, "grad_norm": 0.380859375, "learning_rate": 1.963008675726506e-05, "loss": 0.5879, "step": 86 }, { "epoch": 0.21129326047358835, "grad_norm": 0.3671875, "learning_rate": 1.9619510328680847e-05, "loss": 0.5905, "step": 87 }, { "epoch": 0.21372191863995144, "grad_norm": 0.375, "learning_rate": 1.9608787755730746e-05, "loss": 0.5789, "step": 88 }, { "epoch": 0.2161505768063145, "grad_norm": 0.37890625, "learning_rate": 1.9597919201317393e-05, "loss": 0.5824, "step": 89 }, { "epoch": 0.2185792349726776, "grad_norm": 0.37109375, "learning_rate": 1.958690483056126e-05, "loss": 0.5841, "step": 90 }, { "epoch": 0.22100789313904068, "grad_norm": 0.37109375, "learning_rate": 1.9575744810798118e-05, "loss": 0.5709, "step": 91 }, { "epoch": 0.22343655130540377, "grad_norm": 0.369140625, "learning_rate": 1.9564439311576515e-05, "loss": 0.5799, "step": 92 }, { "epoch": 0.22586520947176686, "grad_norm": 0.369140625, "learning_rate": 1.9552988504655194e-05, "loss": 0.5757, "step": 93 }, { "epoch": 0.22829386763812992, "grad_norm": 0.365234375, "learning_rate": 1.954139256400049e-05, "loss": 0.5768, "step": 94 }, { "epoch": 0.230722525804493, "grad_norm": 0.470703125, "learning_rate": 1.9529651665783675e-05, "loss": 0.6447, "step": 95 }, { "epoch": 0.2331511839708561, "grad_norm": 0.375, "learning_rate": 1.951776598837829e-05, "loss": 0.5888, "step": 96 }, { "epoch": 0.2355798421372192, "grad_norm": 0.4453125, "learning_rate": 1.9505735712357437e-05, "loss": 0.6567, "step": 97 }, { "epoch": 0.23800850030358228, "grad_norm": 0.376953125, "learning_rate": 1.9493561020491024e-05, "loss": 0.5866, "step": 98 }, { "epoch": 0.24043715846994534, "grad_norm": 0.376953125, "learning_rate": 1.9481242097743002e-05, "loss": 0.5775, "step": 99 }, { "epoch": 0.24286581663630843, "grad_norm": 0.369140625, "learning_rate": 1.9468779131268553e-05, "loss": 0.5796, "step": 100 }, { "epoch": 0.24529447480267152, "grad_norm": 0.375, "learning_rate": 1.9456172310411228e-05, "loss": 0.5763, "step": 101 }, { "epoch": 0.24772313296903462, "grad_norm": 0.3828125, "learning_rate": 1.9443421826700096e-05, "loss": 0.5766, "step": 102 }, { "epoch": 0.2501517911353977, "grad_norm": 0.373046875, "learning_rate": 1.9430527873846826e-05, "loss": 0.5766, "step": 103 }, { "epoch": 0.25258044930176077, "grad_norm": 0.3671875, "learning_rate": 1.9417490647742738e-05, "loss": 0.5796, "step": 104 }, { "epoch": 0.25258044930176077, "eval_loss": 0.5772241950035095, "eval_runtime": 97.0571, "eval_samples_per_second": 30.91, "eval_steps_per_second": 3.864, "step": 104 }, { "epoch": 0.2550091074681239, "grad_norm": 0.376953125, "learning_rate": 1.9404310346455822e-05, "loss": 0.5762, "step": 105 }, { "epoch": 0.25743776563448695, "grad_norm": 0.3828125, "learning_rate": 1.9390987170227746e-05, "loss": 0.5833, "step": 106 }, { "epoch": 0.25986642380085, "grad_norm": 0.37890625, "learning_rate": 1.9377521321470806e-05, "loss": 0.5739, "step": 107 }, { "epoch": 0.26229508196721313, "grad_norm": 0.380859375, "learning_rate": 1.9363913004764847e-05, "loss": 0.5771, "step": 108 }, { "epoch": 0.2647237401335762, "grad_norm": 0.361328125, "learning_rate": 1.9350162426854152e-05, "loss": 0.5674, "step": 109 }, { "epoch": 0.2671523982999393, "grad_norm": 0.36328125, "learning_rate": 1.9336269796644314e-05, "loss": 0.5698, "step": 110 }, { "epoch": 0.26958105646630237, "grad_norm": 0.376953125, "learning_rate": 1.9322235325199054e-05, "loss": 0.5681, "step": 111 }, { "epoch": 0.27200971463266543, "grad_norm": 0.37109375, "learning_rate": 1.9308059225737015e-05, "loss": 0.5615, "step": 112 }, { "epoch": 0.27443837279902855, "grad_norm": 0.373046875, "learning_rate": 1.9293741713628518e-05, "loss": 0.5765, "step": 113 }, { "epoch": 0.2768670309653916, "grad_norm": 0.375, "learning_rate": 1.9279283006392304e-05, "loss": 0.5633, "step": 114 }, { "epoch": 0.27929568913175473, "grad_norm": 0.37890625, "learning_rate": 1.9264683323692213e-05, "loss": 0.5629, "step": 115 }, { "epoch": 0.2817243472981178, "grad_norm": 0.376953125, "learning_rate": 1.924994288733386e-05, "loss": 0.5707, "step": 116 }, { "epoch": 0.28415300546448086, "grad_norm": 0.3828125, "learning_rate": 1.9235061921261248e-05, "loss": 0.5658, "step": 117 }, { "epoch": 0.286581663630844, "grad_norm": 0.376953125, "learning_rate": 1.9220040651553388e-05, "loss": 0.5672, "step": 118 }, { "epoch": 0.28901032179720704, "grad_norm": 0.427734375, "learning_rate": 1.9204879306420852e-05, "loss": 0.5644, "step": 119 }, { "epoch": 0.29143897996357016, "grad_norm": 0.515625, "learning_rate": 1.918957811620231e-05, "loss": 0.658, "step": 120 }, { "epoch": 0.2938676381299332, "grad_norm": 0.384765625, "learning_rate": 1.9174137313361012e-05, "loss": 0.5673, "step": 121 }, { "epoch": 0.2962962962962963, "grad_norm": 0.419921875, "learning_rate": 1.915855713248129e-05, "loss": 0.5713, "step": 122 }, { "epoch": 0.2987249544626594, "grad_norm": 0.376953125, "learning_rate": 1.9142837810264972e-05, "loss": 0.5605, "step": 123 }, { "epoch": 0.30115361262902246, "grad_norm": 0.451171875, "learning_rate": 1.912697958552778e-05, "loss": 0.634, "step": 124 }, { "epoch": 0.3035822707953855, "grad_norm": 0.39453125, "learning_rate": 1.9110982699195724e-05, "loss": 0.5743, "step": 125 }, { "epoch": 0.30601092896174864, "grad_norm": 0.41015625, "learning_rate": 1.9094847394301427e-05, "loss": 0.5743, "step": 126 }, { "epoch": 0.3084395871281117, "grad_norm": 0.388671875, "learning_rate": 1.907857391598043e-05, "loss": 0.5685, "step": 127 }, { "epoch": 0.3108682452944748, "grad_norm": 0.37890625, "learning_rate": 1.906216251146748e-05, "loss": 0.5718, "step": 128 }, { "epoch": 0.3132969034608379, "grad_norm": 0.392578125, "learning_rate": 1.904561343009276e-05, "loss": 0.5666, "step": 129 }, { "epoch": 0.31572556162720095, "grad_norm": 0.380859375, "learning_rate": 1.902892692327811e-05, "loss": 0.5487, "step": 130 }, { "epoch": 0.31572556162720095, "eval_loss": 0.565579354763031, "eval_runtime": 96.8785, "eval_samples_per_second": 30.967, "eval_steps_per_second": 3.871, "step": 130 }, { "epoch": 0.31815421979356406, "grad_norm": 0.400390625, "learning_rate": 1.9012103244533217e-05, "loss": 0.5662, "step": 131 }, { "epoch": 0.3205828779599271, "grad_norm": 0.376953125, "learning_rate": 1.899514264945173e-05, "loss": 0.5692, "step": 132 }, { "epoch": 0.32301153612629024, "grad_norm": 0.37109375, "learning_rate": 1.897804539570742e-05, "loss": 0.5571, "step": 133 }, { "epoch": 0.3254401942926533, "grad_norm": 0.384765625, "learning_rate": 1.8960811743050227e-05, "loss": 0.553, "step": 134 }, { "epoch": 0.32786885245901637, "grad_norm": 0.3828125, "learning_rate": 1.8943441953302346e-05, "loss": 0.5598, "step": 135 }, { "epoch": 0.3302975106253795, "grad_norm": 0.388671875, "learning_rate": 1.8925936290354224e-05, "loss": 0.5624, "step": 136 }, { "epoch": 0.33272616879174255, "grad_norm": 0.37109375, "learning_rate": 1.890829502016056e-05, "loss": 0.5597, "step": 137 }, { "epoch": 0.33515482695810567, "grad_norm": 0.380859375, "learning_rate": 1.8890518410736275e-05, "loss": 0.5575, "step": 138 }, { "epoch": 0.33758348512446873, "grad_norm": 0.37109375, "learning_rate": 1.8872606732152426e-05, "loss": 0.5575, "step": 139 }, { "epoch": 0.3400121432908318, "grad_norm": 0.373046875, "learning_rate": 1.8854560256532098e-05, "loss": 0.5549, "step": 140 }, { "epoch": 0.3424408014571949, "grad_norm": 0.388671875, "learning_rate": 1.8836379258046298e-05, "loss": 0.5671, "step": 141 }, { "epoch": 0.344869459623558, "grad_norm": 0.546875, "learning_rate": 1.8818064012909755e-05, "loss": 0.639, "step": 142 }, { "epoch": 0.3472981177899211, "grad_norm": 0.5078125, "learning_rate": 1.8799614799376743e-05, "loss": 0.6433, "step": 143 }, { "epoch": 0.34972677595628415, "grad_norm": 0.380859375, "learning_rate": 1.878103189773686e-05, "loss": 0.5656, "step": 144 }, { "epoch": 0.3521554341226472, "grad_norm": 0.404296875, "learning_rate": 1.876231559031075e-05, "loss": 0.5631, "step": 145 }, { "epoch": 0.35458409228901033, "grad_norm": 0.3828125, "learning_rate": 1.8743466161445823e-05, "loss": 0.5563, "step": 146 }, { "epoch": 0.3570127504553734, "grad_norm": 0.388671875, "learning_rate": 1.872448389751194e-05, "loss": 0.5569, "step": 147 }, { "epoch": 0.3594414086217365, "grad_norm": 0.3671875, "learning_rate": 1.8705369086897063e-05, "loss": 0.5548, "step": 148 }, { "epoch": 0.3618700667880996, "grad_norm": 0.39453125, "learning_rate": 1.8686122020002857e-05, "loss": 0.5587, "step": 149 }, { "epoch": 0.36429872495446264, "grad_norm": 0.390625, "learning_rate": 1.86667429892403e-05, "loss": 0.5508, "step": 150 }, { "epoch": 0.36672738312082576, "grad_norm": 0.373046875, "learning_rate": 1.8647232289025223e-05, "loss": 0.5594, "step": 151 }, { "epoch": 0.3691560412871888, "grad_norm": 0.37890625, "learning_rate": 1.862759021577385e-05, "loss": 0.5579, "step": 152 }, { "epoch": 0.37158469945355194, "grad_norm": 0.373046875, "learning_rate": 1.860781706789829e-05, "loss": 0.5503, "step": 153 }, { "epoch": 0.374013357619915, "grad_norm": 0.373046875, "learning_rate": 1.8587913145801998e-05, "loss": 0.5601, "step": 154 }, { "epoch": 0.37644201578627806, "grad_norm": 0.380859375, "learning_rate": 1.8567878751875218e-05, "loss": 0.5516, "step": 155 }, { "epoch": 0.3788706739526412, "grad_norm": 0.365234375, "learning_rate": 1.8547714190490385e-05, "loss": 0.552, "step": 156 }, { "epoch": 0.3788706739526412, "eval_loss": 0.5556911826133728, "eval_runtime": 96.9623, "eval_samples_per_second": 30.94, "eval_steps_per_second": 3.867, "step": 156 }, { "epoch": 0.38129933211900424, "grad_norm": 0.3828125, "learning_rate": 1.8527419767997506e-05, "loss": 0.5618, "step": 157 }, { "epoch": 0.38372799028536736, "grad_norm": 0.390625, "learning_rate": 1.8506995792719498e-05, "loss": 0.5561, "step": 158 }, { "epoch": 0.3861566484517304, "grad_norm": 0.3671875, "learning_rate": 1.848644257494751e-05, "loss": 0.5486, "step": 159 }, { "epoch": 0.3885853066180935, "grad_norm": 0.369140625, "learning_rate": 1.8465760426936212e-05, "loss": 0.5521, "step": 160 }, { "epoch": 0.3910139647844566, "grad_norm": 0.373046875, "learning_rate": 1.8444949662899038e-05, "loss": 0.5474, "step": 161 }, { "epoch": 0.39344262295081966, "grad_norm": 0.37109375, "learning_rate": 1.8424010599003424e-05, "loss": 0.5508, "step": 162 }, { "epoch": 0.3958712811171828, "grad_norm": 0.388671875, "learning_rate": 1.8402943553365998e-05, "loss": 0.5483, "step": 163 }, { "epoch": 0.39829993928354585, "grad_norm": 0.369140625, "learning_rate": 1.838174884604776e-05, "loss": 0.5525, "step": 164 }, { "epoch": 0.4007285974499089, "grad_norm": 0.376953125, "learning_rate": 1.8360426799049197e-05, "loss": 0.5512, "step": 165 }, { "epoch": 0.403157255616272, "grad_norm": 0.369140625, "learning_rate": 1.8338977736305408e-05, "loss": 0.5509, "step": 166 }, { "epoch": 0.4055859137826351, "grad_norm": 0.37890625, "learning_rate": 1.831740198368118e-05, "loss": 0.5403, "step": 167 }, { "epoch": 0.4080145719489982, "grad_norm": 0.3671875, "learning_rate": 1.8295699868966038e-05, "loss": 0.5507, "step": 168 }, { "epoch": 0.41044323011536127, "grad_norm": 0.376953125, "learning_rate": 1.8273871721869256e-05, "loss": 0.5354, "step": 169 }, { "epoch": 0.41287188828172433, "grad_norm": 0.361328125, "learning_rate": 1.8251917874014854e-05, "loss": 0.5483, "step": 170 }, { "epoch": 0.41530054644808745, "grad_norm": 0.373046875, "learning_rate": 1.8229838658936566e-05, "loss": 0.5416, "step": 171 }, { "epoch": 0.4177292046144505, "grad_norm": 0.38671875, "learning_rate": 1.8207634412072765e-05, "loss": 0.5547, "step": 172 }, { "epoch": 0.4201578627808136, "grad_norm": 0.384765625, "learning_rate": 1.8185305470761366e-05, "loss": 0.548, "step": 173 }, { "epoch": 0.4225865209471767, "grad_norm": 0.59375, "learning_rate": 1.8162852174234712e-05, "loss": 0.6328, "step": 174 }, { "epoch": 0.42501517911353975, "grad_norm": 0.37109375, "learning_rate": 1.81402748636144e-05, "loss": 0.5406, "step": 175 }, { "epoch": 0.42744383727990287, "grad_norm": 0.37890625, "learning_rate": 1.8117573881906114e-05, "loss": 0.5446, "step": 176 }, { "epoch": 0.42987249544626593, "grad_norm": 0.3828125, "learning_rate": 1.809474957399442e-05, "loss": 0.5591, "step": 177 }, { "epoch": 0.432301153612629, "grad_norm": 0.376953125, "learning_rate": 1.8071802286637505e-05, "loss": 0.5415, "step": 178 }, { "epoch": 0.4347298117789921, "grad_norm": 0.384765625, "learning_rate": 1.8048732368461927e-05, "loss": 0.5362, "step": 179 }, { "epoch": 0.4371584699453552, "grad_norm": 0.37890625, "learning_rate": 1.8025540169957315e-05, "loss": 0.5464, "step": 180 }, { "epoch": 0.4395871281117183, "grad_norm": 0.384765625, "learning_rate": 1.8002226043471025e-05, "loss": 0.544, "step": 181 }, { "epoch": 0.44201578627808136, "grad_norm": 0.388671875, "learning_rate": 1.7978790343202826e-05, "loss": 0.5567, "step": 182 }, { "epoch": 0.44201578627808136, "eval_loss": 0.5476920008659363, "eval_runtime": 97.0095, "eval_samples_per_second": 30.925, "eval_steps_per_second": 3.866, "step": 182 }, { "epoch": 0.4444444444444444, "grad_norm": 0.369140625, "learning_rate": 1.795523342519948e-05, "loss": 0.5349, "step": 183 }, { "epoch": 0.44687310261080754, "grad_norm": 0.38671875, "learning_rate": 1.7931555647349358e-05, "loss": 0.5494, "step": 184 }, { "epoch": 0.4493017607771706, "grad_norm": 0.37890625, "learning_rate": 1.7907757369376984e-05, "loss": 0.5431, "step": 185 }, { "epoch": 0.4517304189435337, "grad_norm": 0.380859375, "learning_rate": 1.7883838952837595e-05, "loss": 0.5455, "step": 186 }, { "epoch": 0.4541590771098968, "grad_norm": 0.40625, "learning_rate": 1.785980076111161e-05, "loss": 0.5475, "step": 187 }, { "epoch": 0.45658773527625984, "grad_norm": 0.384765625, "learning_rate": 1.7835643159399156e-05, "loss": 0.5426, "step": 188 }, { "epoch": 0.45901639344262296, "grad_norm": 0.384765625, "learning_rate": 1.7811366514714475e-05, "loss": 0.549, "step": 189 }, { "epoch": 0.461445051608986, "grad_norm": 0.3671875, "learning_rate": 1.778697119588039e-05, "loss": 0.5409, "step": 190 }, { "epoch": 0.46387370977534914, "grad_norm": 0.55078125, "learning_rate": 1.7762457573522658e-05, "loss": 0.6053, "step": 191 }, { "epoch": 0.4663023679417122, "grad_norm": 0.375, "learning_rate": 1.7737826020064377e-05, "loss": 0.5487, "step": 192 }, { "epoch": 0.46873102610807527, "grad_norm": 0.4140625, "learning_rate": 1.771307690972031e-05, "loss": 0.5347, "step": 193 }, { "epoch": 0.4711596842744384, "grad_norm": 0.37109375, "learning_rate": 1.76882106184912e-05, "loss": 0.5525, "step": 194 }, { "epoch": 0.47358834244080145, "grad_norm": 0.37890625, "learning_rate": 1.7663227524158053e-05, "loss": 0.5423, "step": 195 }, { "epoch": 0.47601700060716456, "grad_norm": 0.38671875, "learning_rate": 1.7638128006276422e-05, "loss": 0.5526, "step": 196 }, { "epoch": 0.4784456587735276, "grad_norm": 0.369140625, "learning_rate": 1.7612912446170615e-05, "loss": 0.5464, "step": 197 }, { "epoch": 0.4808743169398907, "grad_norm": 0.69921875, "learning_rate": 1.758758122692791e-05, "loss": 0.6096, "step": 198 }, { "epoch": 0.4833029751062538, "grad_norm": 0.384765625, "learning_rate": 1.7562134733392736e-05, "loss": 0.5399, "step": 199 }, { "epoch": 0.48573163327261687, "grad_norm": 0.39453125, "learning_rate": 1.753657335216083e-05, "loss": 0.5503, "step": 200 }, { "epoch": 0.48816029143898, "grad_norm": 0.373046875, "learning_rate": 1.751089747157336e-05, "loss": 0.5389, "step": 201 }, { "epoch": 0.49058894960534305, "grad_norm": 0.3828125, "learning_rate": 1.7485107481711014e-05, "loss": 0.548, "step": 202 }, { "epoch": 0.4930176077717061, "grad_norm": 0.412109375, "learning_rate": 1.7459203774388097e-05, "loss": 0.5404, "step": 203 }, { "epoch": 0.49544626593806923, "grad_norm": 0.37890625, "learning_rate": 1.743318674314656e-05, "loss": 0.5497, "step": 204 }, { "epoch": 0.4978749241044323, "grad_norm": 0.373046875, "learning_rate": 1.740705678325004e-05, "loss": 0.5313, "step": 205 }, { "epoch": 0.5003035822707954, "grad_norm": 0.375, "learning_rate": 1.7380814291677818e-05, "loss": 0.5446, "step": 206 }, { "epoch": 0.5027322404371585, "grad_norm": 0.50390625, "learning_rate": 1.7354459667118825e-05, "loss": 0.6115, "step": 207 }, { "epoch": 0.5051608986035215, "grad_norm": 0.37109375, "learning_rate": 1.7327993309965583e-05, "loss": 0.5263, "step": 208 }, { "epoch": 0.5051608986035215, "eval_loss": 0.5407972931861877, "eval_runtime": 97.5769, "eval_samples_per_second": 30.745, "eval_steps_per_second": 3.843, "step": 208 }, { "epoch": 0.5075895567698846, "grad_norm": 0.3671875, "learning_rate": 1.730141562230809e-05, "loss": 0.5454, "step": 209 }, { "epoch": 0.5100182149362478, "grad_norm": 0.373046875, "learning_rate": 1.7274727007927747e-05, "loss": 0.5417, "step": 210 }, { "epoch": 0.5124468731026108, "grad_norm": 0.365234375, "learning_rate": 1.72479278722912e-05, "loss": 0.5337, "step": 211 }, { "epoch": 0.5148755312689739, "grad_norm": 0.373046875, "learning_rate": 1.7221018622544197e-05, "loss": 0.5477, "step": 212 }, { "epoch": 0.517304189435337, "grad_norm": 0.373046875, "learning_rate": 1.7193999667505387e-05, "loss": 0.533, "step": 213 }, { "epoch": 0.5197328476017, "grad_norm": 0.369140625, "learning_rate": 1.7166871417660116e-05, "loss": 0.5203, "step": 214 }, { "epoch": 0.5221615057680632, "grad_norm": 0.373046875, "learning_rate": 1.7139634285154198e-05, "loss": 0.5326, "step": 215 }, { "epoch": 0.5245901639344263, "grad_norm": 0.57421875, "learning_rate": 1.7112288683787637e-05, "loss": 0.6092, "step": 216 }, { "epoch": 0.5270188221007893, "grad_norm": 0.3671875, "learning_rate": 1.708483502900836e-05, "loss": 0.5417, "step": 217 }, { "epoch": 0.5294474802671524, "grad_norm": 0.373046875, "learning_rate": 1.7057273737905887e-05, "loss": 0.5347, "step": 218 }, { "epoch": 0.5318761384335154, "grad_norm": 0.37890625, "learning_rate": 1.7029605229205005e-05, "loss": 0.523, "step": 219 }, { "epoch": 0.5343047965998786, "grad_norm": 0.37890625, "learning_rate": 1.70018299232594e-05, "loss": 0.5363, "step": 220 }, { "epoch": 0.5367334547662417, "grad_norm": 0.361328125, "learning_rate": 1.6973948242045284e-05, "loss": 0.5287, "step": 221 }, { "epoch": 0.5391621129326047, "grad_norm": 0.37109375, "learning_rate": 1.6945960609154966e-05, "loss": 0.5396, "step": 222 }, { "epoch": 0.5415907710989678, "grad_norm": 0.3828125, "learning_rate": 1.6917867449790432e-05, "loss": 0.5198, "step": 223 }, { "epoch": 0.5440194292653309, "grad_norm": 0.44921875, "learning_rate": 1.688966919075687e-05, "loss": 0.6069, "step": 224 }, { "epoch": 0.546448087431694, "grad_norm": 0.380859375, "learning_rate": 1.68613662604562e-05, "loss": 0.5376, "step": 225 }, { "epoch": 0.5488767455980571, "grad_norm": 0.375, "learning_rate": 1.6832959088880557e-05, "loss": 0.5264, "step": 226 }, { "epoch": 0.5513054037644202, "grad_norm": 0.369140625, "learning_rate": 1.6804448107605767e-05, "loss": 0.5369, "step": 227 }, { "epoch": 0.5537340619307832, "grad_norm": 0.375, "learning_rate": 1.677583374978478e-05, "loss": 0.537, "step": 228 }, { "epoch": 0.5561627200971463, "grad_norm": 0.380859375, "learning_rate": 1.6747116450141092e-05, "loss": 0.5257, "step": 229 }, { "epoch": 0.5585913782635095, "grad_norm": 0.369140625, "learning_rate": 1.6718296644962146e-05, "loss": 0.532, "step": 230 }, { "epoch": 0.5610200364298725, "grad_norm": 0.3671875, "learning_rate": 1.6689374772092695e-05, "loss": 0.5382, "step": 231 }, { "epoch": 0.5634486945962356, "grad_norm": 0.373046875, "learning_rate": 1.6660351270928164e-05, "loss": 0.5313, "step": 232 }, { "epoch": 0.5658773527625987, "grad_norm": 0.37109375, "learning_rate": 1.6631226582407954e-05, "loss": 0.5283, "step": 233 }, { "epoch": 0.5683060109289617, "grad_norm": 0.361328125, "learning_rate": 1.660200114900876e-05, "loss": 0.5466, "step": 234 }, { "epoch": 0.5683060109289617, "eval_loss": 0.5350908637046814, "eval_runtime": 97.0805, "eval_samples_per_second": 30.902, "eval_steps_per_second": 3.863, "step": 234 }, { "epoch": 0.5707346690953249, "grad_norm": 0.3671875, "learning_rate": 1.6572675414737844e-05, "loss": 0.5343, "step": 235 }, { "epoch": 0.573163327261688, "grad_norm": 0.375, "learning_rate": 1.6543249825126285e-05, "loss": 0.5405, "step": 236 }, { "epoch": 0.575591985428051, "grad_norm": 0.37109375, "learning_rate": 1.6513724827222225e-05, "loss": 0.5252, "step": 237 }, { "epoch": 0.5780206435944141, "grad_norm": 0.365234375, "learning_rate": 1.6484100869584044e-05, "loss": 0.5295, "step": 238 }, { "epoch": 0.5804493017607771, "grad_norm": 0.361328125, "learning_rate": 1.645437840227359e-05, "loss": 0.5331, "step": 239 }, { "epoch": 0.5828779599271403, "grad_norm": 0.36328125, "learning_rate": 1.6424557876849308e-05, "loss": 0.5274, "step": 240 }, { "epoch": 0.5853066180935034, "grad_norm": 0.373046875, "learning_rate": 1.639463974635939e-05, "loss": 0.5303, "step": 241 }, { "epoch": 0.5877352762598664, "grad_norm": 0.369140625, "learning_rate": 1.636462446533489e-05, "loss": 0.5319, "step": 242 }, { "epoch": 0.5901639344262295, "grad_norm": 0.369140625, "learning_rate": 1.6334512489782833e-05, "loss": 0.5316, "step": 243 }, { "epoch": 0.5925925925925926, "grad_norm": 0.388671875, "learning_rate": 1.6304304277179267e-05, "loss": 0.5291, "step": 244 }, { "epoch": 0.5950212507589556, "grad_norm": 0.373046875, "learning_rate": 1.627400028646231e-05, "loss": 0.5341, "step": 245 }, { "epoch": 0.5974499089253188, "grad_norm": 0.37109375, "learning_rate": 1.6243600978025215e-05, "loss": 0.5233, "step": 246 }, { "epoch": 0.5998785670916819, "grad_norm": 0.37109375, "learning_rate": 1.6213106813709328e-05, "loss": 0.5251, "step": 247 }, { "epoch": 0.6023072252580449, "grad_norm": 0.376953125, "learning_rate": 1.6182518256797095e-05, "loss": 0.534, "step": 248 }, { "epoch": 0.604735883424408, "grad_norm": 0.365234375, "learning_rate": 1.6151835772005028e-05, "loss": 0.5215, "step": 249 }, { "epoch": 0.607164541590771, "grad_norm": 0.375, "learning_rate": 1.612105982547663e-05, "loss": 0.5391, "step": 250 }, { "epoch": 0.6095931997571342, "grad_norm": 0.37109375, "learning_rate": 1.6090190884775333e-05, "loss": 0.5316, "step": 251 }, { "epoch": 0.6120218579234973, "grad_norm": 0.3671875, "learning_rate": 1.605922941887737e-05, "loss": 0.5251, "step": 252 }, { "epoch": 0.6144505160898603, "grad_norm": 0.359375, "learning_rate": 1.6028175898164665e-05, "loss": 0.5239, "step": 253 }, { "epoch": 0.6168791742562234, "grad_norm": 0.3671875, "learning_rate": 1.599703079441769e-05, "loss": 0.5229, "step": 254 }, { "epoch": 0.6193078324225865, "grad_norm": 0.3828125, "learning_rate": 1.5965794580808292e-05, "loss": 0.5311, "step": 255 }, { "epoch": 0.6217364905889496, "grad_norm": 0.36328125, "learning_rate": 1.5934467731892497e-05, "loss": 0.5217, "step": 256 }, { "epoch": 0.6241651487553127, "grad_norm": 0.365234375, "learning_rate": 1.590305072360331e-05, "loss": 0.5299, "step": 257 }, { "epoch": 0.6265938069216758, "grad_norm": 0.375, "learning_rate": 1.5871544033243488e-05, "loss": 0.52, "step": 258 }, { "epoch": 0.6290224650880388, "grad_norm": 0.369140625, "learning_rate": 1.583994813947827e-05, "loss": 0.5168, "step": 259 }, { "epoch": 0.6314511232544019, "grad_norm": 0.494140625, "learning_rate": 1.5808263522328137e-05, "loss": 0.6037, "step": 260 }, { "epoch": 0.6314511232544019, "eval_loss": 0.5299703478813171, "eval_runtime": 96.9378, "eval_samples_per_second": 30.948, "eval_steps_per_second": 3.868, "step": 260 }, { "epoch": 0.6338797814207651, "grad_norm": 0.365234375, "learning_rate": 1.5776490663161474e-05, "loss": 0.517, "step": 261 }, { "epoch": 0.6363084395871281, "grad_norm": 0.357421875, "learning_rate": 1.5744630044687307e-05, "loss": 0.5182, "step": 262 }, { "epoch": 0.6387370977534912, "grad_norm": 0.369140625, "learning_rate": 1.5712682150947926e-05, "loss": 0.5219, "step": 263 }, { "epoch": 0.6411657559198543, "grad_norm": 0.373046875, "learning_rate": 1.568064746731156e-05, "loss": 0.5323, "step": 264 }, { "epoch": 0.6435944140862173, "grad_norm": 0.447265625, "learning_rate": 1.5648526480464995e-05, "loss": 0.5902, "step": 265 }, { "epoch": 0.6460230722525805, "grad_norm": 0.37109375, "learning_rate": 1.561631967840617e-05, "loss": 0.5374, "step": 266 }, { "epoch": 0.6484517304189436, "grad_norm": 0.36328125, "learning_rate": 1.558402755043677e-05, "loss": 0.5145, "step": 267 }, { "epoch": 0.6508803885853066, "grad_norm": 0.37109375, "learning_rate": 1.5551650587154815e-05, "loss": 0.5213, "step": 268 }, { "epoch": 0.6533090467516697, "grad_norm": 0.361328125, "learning_rate": 1.5519189280447153e-05, "loss": 0.5192, "step": 269 }, { "epoch": 0.6557377049180327, "grad_norm": 0.369140625, "learning_rate": 1.5486644123482047e-05, "loss": 0.5325, "step": 270 }, { "epoch": 0.6581663630843959, "grad_norm": 0.375, "learning_rate": 1.545401561070163e-05, "loss": 0.5286, "step": 271 }, { "epoch": 0.660595021250759, "grad_norm": 0.37890625, "learning_rate": 1.542130423781444e-05, "loss": 0.526, "step": 272 }, { "epoch": 0.663023679417122, "grad_norm": 0.37109375, "learning_rate": 1.5388510501787855e-05, "loss": 0.5317, "step": 273 }, { "epoch": 0.6654523375834851, "grad_norm": 0.361328125, "learning_rate": 1.5355634900840558e-05, "loss": 0.5204, "step": 274 }, { "epoch": 0.6678809957498482, "grad_norm": 0.369140625, "learning_rate": 1.5322677934434965e-05, "loss": 0.5215, "step": 275 }, { "epoch": 0.6703096539162113, "grad_norm": 0.36328125, "learning_rate": 1.5289640103269626e-05, "loss": 0.5247, "step": 276 }, { "epoch": 0.6727383120825744, "grad_norm": 0.369140625, "learning_rate": 1.5256521909271644e-05, "loss": 0.5163, "step": 277 }, { "epoch": 0.6751669702489375, "grad_norm": 0.36328125, "learning_rate": 1.5223323855589027e-05, "loss": 0.5335, "step": 278 }, { "epoch": 0.6775956284153005, "grad_norm": 0.36328125, "learning_rate": 1.519004644658305e-05, "loss": 0.5199, "step": 279 }, { "epoch": 0.6800242865816636, "grad_norm": 0.3671875, "learning_rate": 1.5156690187820596e-05, "loss": 0.5294, "step": 280 }, { "epoch": 0.6824529447480268, "grad_norm": 0.3671875, "learning_rate": 1.5123255586066467e-05, "loss": 0.5248, "step": 281 }, { "epoch": 0.6848816029143898, "grad_norm": 0.369140625, "learning_rate": 1.50897431492757e-05, "loss": 0.5261, "step": 282 }, { "epoch": 0.6873102610807529, "grad_norm": 0.36328125, "learning_rate": 1.5056153386585828e-05, "loss": 0.5246, "step": 283 }, { "epoch": 0.689738919247116, "grad_norm": 0.3671875, "learning_rate": 1.5022486808309171e-05, "loss": 0.518, "step": 284 }, { "epoch": 0.692167577413479, "grad_norm": 0.373046875, "learning_rate": 1.498874392592506e-05, "loss": 0.5222, "step": 285 }, { "epoch": 0.6945962355798422, "grad_norm": 0.36328125, "learning_rate": 1.4954925252072077e-05, "loss": 0.5333, "step": 286 }, { "epoch": 0.6945962355798422, "eval_loss": 0.5256316661834717, "eval_runtime": 97.1941, "eval_samples_per_second": 30.866, "eval_steps_per_second": 3.858, "step": 286 }, { "epoch": 0.6970248937462052, "grad_norm": 0.37109375, "learning_rate": 1.4921031300540268e-05, "loss": 0.5385, "step": 287 }, { "epoch": 0.6994535519125683, "grad_norm": 0.36328125, "learning_rate": 1.4887062586263334e-05, "loss": 0.5203, "step": 288 }, { "epoch": 0.7018822100789314, "grad_norm": 0.3671875, "learning_rate": 1.4853019625310813e-05, "loss": 0.5163, "step": 289 }, { "epoch": 0.7043108682452944, "grad_norm": 0.359375, "learning_rate": 1.4818902934880222e-05, "loss": 0.5211, "step": 290 }, { "epoch": 0.7067395264116576, "grad_norm": 0.36328125, "learning_rate": 1.4784713033289228e-05, "loss": 0.5251, "step": 291 }, { "epoch": 0.7091681845780207, "grad_norm": 0.4765625, "learning_rate": 1.4750450439967751e-05, "loss": 0.5817, "step": 292 }, { "epoch": 0.7115968427443837, "grad_norm": 0.373046875, "learning_rate": 1.4716115675450078e-05, "loss": 0.5178, "step": 293 }, { "epoch": 0.7140255009107468, "grad_norm": 0.3828125, "learning_rate": 1.4681709261366963e-05, "loss": 0.5317, "step": 294 }, { "epoch": 0.7164541590771099, "grad_norm": 0.3671875, "learning_rate": 1.4647231720437687e-05, "loss": 0.535, "step": 295 }, { "epoch": 0.718882817243473, "grad_norm": 0.376953125, "learning_rate": 1.4612683576462135e-05, "loss": 0.5263, "step": 296 }, { "epoch": 0.7213114754098361, "grad_norm": 0.376953125, "learning_rate": 1.4578065354312816e-05, "loss": 0.5162, "step": 297 }, { "epoch": 0.7237401335761992, "grad_norm": 0.369140625, "learning_rate": 1.4543377579926915e-05, "loss": 0.5262, "step": 298 }, { "epoch": 0.7261687917425622, "grad_norm": 0.390625, "learning_rate": 1.4508620780298288e-05, "loss": 0.5242, "step": 299 }, { "epoch": 0.7285974499089253, "grad_norm": 0.384765625, "learning_rate": 1.4473795483469442e-05, "loss": 0.5258, "step": 300 }, { "epoch": 0.7310261080752884, "grad_norm": 0.515625, "learning_rate": 1.4438902218523537e-05, "loss": 0.5909, "step": 301 }, { "epoch": 0.7334547662416515, "grad_norm": 0.375, "learning_rate": 1.4403941515576344e-05, "loss": 0.5213, "step": 302 }, { "epoch": 0.7358834244080146, "grad_norm": 0.3828125, "learning_rate": 1.4368913905768178e-05, "loss": 0.5192, "step": 303 }, { "epoch": 0.7383120825743776, "grad_norm": 0.55078125, "learning_rate": 1.4333819921255836e-05, "loss": 0.5678, "step": 304 }, { "epoch": 0.7407407407407407, "grad_norm": 0.3671875, "learning_rate": 1.4298660095204516e-05, "loss": 0.5247, "step": 305 }, { "epoch": 0.7431693989071039, "grad_norm": 0.380859375, "learning_rate": 1.4263434961779709e-05, "loss": 0.5291, "step": 306 }, { "epoch": 0.7455980570734669, "grad_norm": 0.392578125, "learning_rate": 1.4228145056139097e-05, "loss": 0.5241, "step": 307 }, { "epoch": 0.74802671523983, "grad_norm": 0.37109375, "learning_rate": 1.41927909144244e-05, "loss": 0.5199, "step": 308 }, { "epoch": 0.7504553734061931, "grad_norm": 0.361328125, "learning_rate": 1.4157373073753255e-05, "loss": 0.5341, "step": 309 }, { "epoch": 0.7528840315725561, "grad_norm": 0.375, "learning_rate": 1.412189207221104e-05, "loss": 0.5282, "step": 310 }, { "epoch": 0.7553126897389193, "grad_norm": 0.376953125, "learning_rate": 1.4086348448842707e-05, "loss": 0.5194, "step": 311 }, { "epoch": 0.7577413479052824, "grad_norm": 0.36328125, "learning_rate": 1.4050742743644588e-05, "loss": 0.5139, "step": 312 }, { "epoch": 0.7577413479052824, "eval_loss": 0.5217667818069458, "eval_runtime": 96.9922, "eval_samples_per_second": 30.93, "eval_steps_per_second": 3.866, "step": 312 }, { "epoch": 0.7601700060716454, "grad_norm": 0.37109375, "learning_rate": 1.4015075497556193e-05, "loss": 0.5176, "step": 313 }, { "epoch": 0.7625986642380085, "grad_norm": 0.38671875, "learning_rate": 1.3979347252451994e-05, "loss": 0.5178, "step": 314 }, { "epoch": 0.7650273224043715, "grad_norm": 0.3828125, "learning_rate": 1.3943558551133186e-05, "loss": 0.5258, "step": 315 }, { "epoch": 0.7674559805707347, "grad_norm": 0.3671875, "learning_rate": 1.3907709937319451e-05, "loss": 0.5176, "step": 316 }, { "epoch": 0.7698846387370978, "grad_norm": 0.625, "learning_rate": 1.3871801955640682e-05, "loss": 0.5865, "step": 317 }, { "epoch": 0.7723132969034608, "grad_norm": 0.380859375, "learning_rate": 1.3835835151628728e-05, "loss": 0.5194, "step": 318 }, { "epoch": 0.7747419550698239, "grad_norm": 0.396484375, "learning_rate": 1.3799810071709088e-05, "loss": 0.5213, "step": 319 }, { "epoch": 0.777170613236187, "grad_norm": 0.37890625, "learning_rate": 1.3763727263192626e-05, "loss": 0.5276, "step": 320 }, { "epoch": 0.7795992714025501, "grad_norm": 0.37890625, "learning_rate": 1.3727587274267235e-05, "loss": 0.5214, "step": 321 }, { "epoch": 0.7820279295689132, "grad_norm": 0.384765625, "learning_rate": 1.3691390653989536e-05, "loss": 0.5307, "step": 322 }, { "epoch": 0.7844565877352763, "grad_norm": 0.37890625, "learning_rate": 1.365513795227651e-05, "loss": 0.5252, "step": 323 }, { "epoch": 0.7868852459016393, "grad_norm": 0.359375, "learning_rate": 1.3618829719897158e-05, "loss": 0.5186, "step": 324 }, { "epoch": 0.7893139040680024, "grad_norm": 0.384765625, "learning_rate": 1.3582466508464132e-05, "loss": 0.5191, "step": 325 }, { "epoch": 0.7917425622343656, "grad_norm": 0.3671875, "learning_rate": 1.3546048870425356e-05, "loss": 0.5268, "step": 326 }, { "epoch": 0.7941712204007286, "grad_norm": 0.376953125, "learning_rate": 1.3509577359055627e-05, "loss": 0.53, "step": 327 }, { "epoch": 0.7965998785670917, "grad_norm": 0.3671875, "learning_rate": 1.3473052528448203e-05, "loss": 0.5142, "step": 328 }, { "epoch": 0.7990285367334548, "grad_norm": 0.384765625, "learning_rate": 1.3436474933506412e-05, "loss": 0.5148, "step": 329 }, { "epoch": 0.8014571948998178, "grad_norm": 0.37109375, "learning_rate": 1.3399845129935191e-05, "loss": 0.5223, "step": 330 }, { "epoch": 0.803885853066181, "grad_norm": 0.361328125, "learning_rate": 1.3363163674232663e-05, "loss": 0.5247, "step": 331 }, { "epoch": 0.806314511232544, "grad_norm": 0.365234375, "learning_rate": 1.3326431123681667e-05, "loss": 0.52, "step": 332 }, { "epoch": 0.8087431693989071, "grad_norm": 0.373046875, "learning_rate": 1.328964803634131e-05, "loss": 0.5172, "step": 333 }, { "epoch": 0.8111718275652702, "grad_norm": 0.37890625, "learning_rate": 1.3252814971038477e-05, "loss": 0.5226, "step": 334 }, { "epoch": 0.8136004857316332, "grad_norm": 0.369140625, "learning_rate": 1.3215932487359338e-05, "loss": 0.5214, "step": 335 }, { "epoch": 0.8160291438979964, "grad_norm": 0.375, "learning_rate": 1.3179001145640856e-05, "loss": 0.5234, "step": 336 }, { "epoch": 0.8184578020643595, "grad_norm": 0.39453125, "learning_rate": 1.314202150696227e-05, "loss": 0.5195, "step": 337 }, { "epoch": 0.8208864602307225, "grad_norm": 0.359375, "learning_rate": 1.3104994133136563e-05, "loss": 0.5212, "step": 338 }, { "epoch": 0.8208864602307225, "eval_loss": 0.5185486674308777, "eval_runtime": 97.0358, "eval_samples_per_second": 30.916, "eval_steps_per_second": 3.865, "step": 338 }, { "epoch": 0.8233151183970856, "grad_norm": 0.369140625, "learning_rate": 1.3067919586701948e-05, "loss": 0.5108, "step": 339 }, { "epoch": 0.8257437765634487, "grad_norm": 0.37890625, "learning_rate": 1.3030798430913289e-05, "loss": 0.5175, "step": 340 }, { "epoch": 0.8281724347298117, "grad_norm": 0.373046875, "learning_rate": 1.2993631229733584e-05, "loss": 0.5165, "step": 341 }, { "epoch": 0.8306010928961749, "grad_norm": 0.369140625, "learning_rate": 1.295641854782535e-05, "loss": 0.5096, "step": 342 }, { "epoch": 0.833029751062538, "grad_norm": 0.37109375, "learning_rate": 1.2919160950542095e-05, "loss": 0.5231, "step": 343 }, { "epoch": 0.835458409228901, "grad_norm": 0.373046875, "learning_rate": 1.2881859003919688e-05, "loss": 0.512, "step": 344 }, { "epoch": 0.8378870673952641, "grad_norm": 0.36328125, "learning_rate": 1.284451327466778e-05, "loss": 0.5081, "step": 345 }, { "epoch": 0.8403157255616271, "grad_norm": 0.369140625, "learning_rate": 1.2807124330161188e-05, "loss": 0.5181, "step": 346 }, { "epoch": 0.8427443837279903, "grad_norm": 0.36328125, "learning_rate": 1.2769692738431279e-05, "loss": 0.5191, "step": 347 }, { "epoch": 0.8451730418943534, "grad_norm": 0.357421875, "learning_rate": 1.2732219068157335e-05, "loss": 0.499, "step": 348 }, { "epoch": 0.8476017000607164, "grad_norm": 0.3828125, "learning_rate": 1.2694703888657915e-05, "loss": 0.5205, "step": 349 }, { "epoch": 0.8500303582270795, "grad_norm": 0.5390625, "learning_rate": 1.2657147769882215e-05, "loss": 0.5799, "step": 350 }, { "epoch": 0.8524590163934426, "grad_norm": 0.361328125, "learning_rate": 1.261955128240139e-05, "loss": 0.5102, "step": 351 }, { "epoch": 0.8548876745598057, "grad_norm": 0.36328125, "learning_rate": 1.2581914997399899e-05, "loss": 0.514, "step": 352 }, { "epoch": 0.8573163327261688, "grad_norm": 0.369140625, "learning_rate": 1.2544239486666831e-05, "loss": 0.5168, "step": 353 }, { "epoch": 0.8597449908925319, "grad_norm": 0.392578125, "learning_rate": 1.2506525322587207e-05, "loss": 0.5138, "step": 354 }, { "epoch": 0.8621736490588949, "grad_norm": 0.53125, "learning_rate": 1.2468773078133286e-05, "loss": 0.563, "step": 355 }, { "epoch": 0.864602307225258, "grad_norm": 0.365234375, "learning_rate": 1.2430983326855873e-05, "loss": 0.5064, "step": 356 }, { "epoch": 0.8670309653916212, "grad_norm": 0.369140625, "learning_rate": 1.2393156642875579e-05, "loss": 0.5148, "step": 357 }, { "epoch": 0.8694596235579842, "grad_norm": 0.380859375, "learning_rate": 1.2355293600874132e-05, "loss": 0.5147, "step": 358 }, { "epoch": 0.8718882817243473, "grad_norm": 0.376953125, "learning_rate": 1.2317394776085614e-05, "loss": 0.5164, "step": 359 }, { "epoch": 0.8743169398907104, "grad_norm": 0.37109375, "learning_rate": 1.2279460744287755e-05, "loss": 0.5109, "step": 360 }, { "epoch": 0.8767455980570734, "grad_norm": 0.375, "learning_rate": 1.2241492081793145e-05, "loss": 0.5184, "step": 361 }, { "epoch": 0.8791742562234366, "grad_norm": 0.56640625, "learning_rate": 1.220348936544052e-05, "loss": 0.5627, "step": 362 }, { "epoch": 0.8816029143897997, "grad_norm": 0.361328125, "learning_rate": 1.2165453172585964e-05, "loss": 0.5149, "step": 363 }, { "epoch": 0.8840315725561627, "grad_norm": 0.388671875, "learning_rate": 1.2127384081094167e-05, "loss": 0.5109, "step": 364 }, { "epoch": 0.8840315725561627, "eval_loss": 0.5158221125602722, "eval_runtime": 97.2582, "eval_samples_per_second": 30.846, "eval_steps_per_second": 3.856, "step": 364 }, { "epoch": 0.8864602307225258, "grad_norm": 0.37890625, "learning_rate": 1.2089282669329625e-05, "loss": 0.4993, "step": 365 }, { "epoch": 0.8888888888888888, "grad_norm": 0.373046875, "learning_rate": 1.205114951614785e-05, "loss": 0.5187, "step": 366 }, { "epoch": 0.891317547055252, "grad_norm": 0.357421875, "learning_rate": 1.2012985200886602e-05, "loss": 0.5088, "step": 367 }, { "epoch": 0.8937462052216151, "grad_norm": 0.380859375, "learning_rate": 1.197479030335706e-05, "loss": 0.5167, "step": 368 }, { "epoch": 0.8961748633879781, "grad_norm": 0.384765625, "learning_rate": 1.1936565403835027e-05, "loss": 0.5138, "step": 369 }, { "epoch": 0.8986035215543412, "grad_norm": 0.369140625, "learning_rate": 1.1898311083052113e-05, "loss": 0.5062, "step": 370 }, { "epoch": 0.9010321797207043, "grad_norm": 0.361328125, "learning_rate": 1.1860027922186908e-05, "loss": 0.5122, "step": 371 }, { "epoch": 0.9034608378870674, "grad_norm": 0.361328125, "learning_rate": 1.1821716502856154e-05, "loss": 0.5108, "step": 372 }, { "epoch": 0.9058894960534305, "grad_norm": 0.373046875, "learning_rate": 1.1783377407105907e-05, "loss": 0.5212, "step": 373 }, { "epoch": 0.9083181542197936, "grad_norm": 0.3828125, "learning_rate": 1.1745011217402709e-05, "loss": 0.5079, "step": 374 }, { "epoch": 0.9107468123861566, "grad_norm": 0.380859375, "learning_rate": 1.1706618516624712e-05, "loss": 0.5105, "step": 375 }, { "epoch": 0.9131754705525197, "grad_norm": 0.36328125, "learning_rate": 1.1668199888052844e-05, "loss": 0.5123, "step": 376 }, { "epoch": 0.9156041287188829, "grad_norm": 0.365234375, "learning_rate": 1.1629755915361947e-05, "loss": 0.5125, "step": 377 }, { "epoch": 0.9180327868852459, "grad_norm": 0.373046875, "learning_rate": 1.159128718261189e-05, "loss": 0.5021, "step": 378 }, { "epoch": 0.920461445051609, "grad_norm": 0.359375, "learning_rate": 1.1552794274238723e-05, "loss": 0.5158, "step": 379 }, { "epoch": 0.922890103217972, "grad_norm": 0.361328125, "learning_rate": 1.1514277775045768e-05, "loss": 0.5064, "step": 380 }, { "epoch": 0.9253187613843351, "grad_norm": 0.369140625, "learning_rate": 1.1475738270194767e-05, "loss": 0.512, "step": 381 }, { "epoch": 0.9277474195506983, "grad_norm": 0.365234375, "learning_rate": 1.1437176345196967e-05, "loss": 0.5236, "step": 382 }, { "epoch": 0.9301760777170613, "grad_norm": 0.361328125, "learning_rate": 1.1398592585904234e-05, "loss": 0.5152, "step": 383 }, { "epoch": 0.9326047358834244, "grad_norm": 0.35546875, "learning_rate": 1.135998757850015e-05, "loss": 0.522, "step": 384 }, { "epoch": 0.9350333940497875, "grad_norm": 0.392578125, "learning_rate": 1.1321361909491108e-05, "loss": 0.5159, "step": 385 }, { "epoch": 0.9374620522161505, "grad_norm": 0.369140625, "learning_rate": 1.128271616569741e-05, "loss": 0.5042, "step": 386 }, { "epoch": 0.9398907103825137, "grad_norm": 0.361328125, "learning_rate": 1.1244050934244333e-05, "loss": 0.5161, "step": 387 }, { "epoch": 0.9423193685488768, "grad_norm": 0.357421875, "learning_rate": 1.1205366802553231e-05, "loss": 0.5094, "step": 388 }, { "epoch": 0.9447480267152398, "grad_norm": 0.365234375, "learning_rate": 1.1166664358332595e-05, "loss": 0.5165, "step": 389 }, { "epoch": 0.9471766848816029, "grad_norm": 0.36328125, "learning_rate": 1.1127944189569122e-05, "loss": 0.5148, "step": 390 }, { "epoch": 0.9471766848816029, "eval_loss": 0.5134184956550598, "eval_runtime": 97.3787, "eval_samples_per_second": 30.808, "eval_steps_per_second": 3.851, "step": 390 }, { "epoch": 0.949605343047966, "grad_norm": 0.375, "learning_rate": 1.1089206884518802e-05, "loss": 0.52, "step": 391 }, { "epoch": 0.9520340012143291, "grad_norm": 0.361328125, "learning_rate": 1.1050453031697958e-05, "loss": 0.5141, "step": 392 }, { "epoch": 0.9544626593806922, "grad_norm": 0.36328125, "learning_rate": 1.1011683219874324e-05, "loss": 0.5114, "step": 393 }, { "epoch": 0.9568913175470553, "grad_norm": 0.35546875, "learning_rate": 1.0972898038058077e-05, "loss": 0.5128, "step": 394 }, { "epoch": 0.9593199757134183, "grad_norm": 0.36328125, "learning_rate": 1.093409807549292e-05, "loss": 0.5107, "step": 395 }, { "epoch": 0.9617486338797814, "grad_norm": 0.74609375, "learning_rate": 1.0895283921647098e-05, "loss": 0.5607, "step": 396 }, { "epoch": 0.9641772920461446, "grad_norm": 0.36328125, "learning_rate": 1.085645616620446e-05, "loss": 0.5203, "step": 397 }, { "epoch": 0.9666059502125076, "grad_norm": 0.359375, "learning_rate": 1.0817615399055513e-05, "loss": 0.511, "step": 398 }, { "epoch": 0.9690346083788707, "grad_norm": 0.365234375, "learning_rate": 1.0778762210288416e-05, "loss": 0.5017, "step": 399 }, { "epoch": 0.9714632665452337, "grad_norm": 0.359375, "learning_rate": 1.0739897190180066e-05, "loss": 0.5149, "step": 400 }, { "epoch": 0.9738919247115968, "grad_norm": 0.57421875, "learning_rate": 1.0701020929187096e-05, "loss": 0.5721, "step": 401 }, { "epoch": 0.97632058287796, "grad_norm": 0.36328125, "learning_rate": 1.0662134017936924e-05, "loss": 0.5081, "step": 402 }, { "epoch": 0.978749241044323, "grad_norm": 0.54296875, "learning_rate": 1.0623237047218771e-05, "loss": 0.5709, "step": 403 }, { "epoch": 0.9811778992106861, "grad_norm": 0.357421875, "learning_rate": 1.0584330607974673e-05, "loss": 0.5015, "step": 404 }, { "epoch": 0.9836065573770492, "grad_norm": 0.359375, "learning_rate": 1.054541529129054e-05, "loss": 0.5167, "step": 405 }, { "epoch": 0.9860352155434122, "grad_norm": 0.53515625, "learning_rate": 1.0506491688387128e-05, "loss": 0.5619, "step": 406 }, { "epoch": 0.9884638737097754, "grad_norm": 0.365234375, "learning_rate": 1.04675603906111e-05, "loss": 0.5261, "step": 407 }, { "epoch": 0.9908925318761385, "grad_norm": 0.359375, "learning_rate": 1.0428621989426016e-05, "loss": 0.4998, "step": 408 }, { "epoch": 0.9933211900425015, "grad_norm": 0.37109375, "learning_rate": 1.0389677076403351e-05, "loss": 0.5051, "step": 409 }, { "epoch": 0.9957498482088646, "grad_norm": 0.5390625, "learning_rate": 1.0350726243213519e-05, "loss": 0.569, "step": 410 }, { "epoch": 0.9981785063752276, "grad_norm": 0.361328125, "learning_rate": 1.0311770081616864e-05, "loss": 0.514, "step": 411 }, { "epoch": 1.0006071645415908, "grad_norm": 0.359375, "learning_rate": 1.0272809183454701e-05, "loss": 0.5084, "step": 412 }, { "epoch": 1.0030358227079539, "grad_norm": 0.51171875, "learning_rate": 1.0233844140640287e-05, "loss": 0.5605, "step": 413 }, { "epoch": 1.005464480874317, "grad_norm": 0.361328125, "learning_rate": 1.0194875545149854e-05, "loss": 0.507, "step": 414 }, { "epoch": 1.00789313904068, "grad_norm": 0.36328125, "learning_rate": 1.015590398901361e-05, "loss": 0.5133, "step": 415 }, { "epoch": 1.010321797207043, "grad_norm": 0.37109375, "learning_rate": 1.0116930064306736e-05, "loss": 0.5121, "step": 416 }, { "epoch": 1.010321797207043, "eval_loss": 0.5115101933479309, "eval_runtime": 96.8252, "eval_samples_per_second": 30.984, "eval_steps_per_second": 3.873, "step": 416 }, { "epoch": 1.0127504553734061, "grad_norm": 0.357421875, "learning_rate": 1.0077954363140407e-05, "loss": 0.5109, "step": 417 }, { "epoch": 1.0151791135397692, "grad_norm": 0.353515625, "learning_rate": 1.0038977477652779e-05, "loss": 0.4991, "step": 418 }, { "epoch": 1.002428658166363, "grad_norm": 0.3984375, "learning_rate": 1e-05, "loss": 0.4774, "step": 419 }, { "epoch": 1.0048573163327261, "grad_norm": 0.384765625, "learning_rate": 9.961022522347226e-06, "loss": 0.475, "step": 420 }, { "epoch": 1.0072859744990892, "grad_norm": 0.365234375, "learning_rate": 9.922045636859596e-06, "loss": 0.4863, "step": 421 }, { "epoch": 1.0097146326654522, "grad_norm": 0.373046875, "learning_rate": 9.883069935693267e-06, "loss": 0.4837, "step": 422 }, { "epoch": 1.0121432908318153, "grad_norm": 0.38671875, "learning_rate": 9.844096010986392e-06, "loss": 0.479, "step": 423 }, { "epoch": 1.0145719489981786, "grad_norm": 0.380859375, "learning_rate": 9.80512445485015e-06, "loss": 0.4849, "step": 424 }, { "epoch": 1.0170006071645417, "grad_norm": 0.3828125, "learning_rate": 9.766155859359718e-06, "loss": 0.4765, "step": 425 }, { "epoch": 1.0194292653309047, "grad_norm": 0.361328125, "learning_rate": 9.7271908165453e-06, "loss": 0.4773, "step": 426 }, { "epoch": 1.0218579234972678, "grad_norm": 0.64453125, "learning_rate": 9.688229918383138e-06, "loss": 0.5238, "step": 427 }, { "epoch": 1.0242865816636308, "grad_norm": 0.359375, "learning_rate": 9.649273756786486e-06, "loss": 0.483, "step": 428 }, { "epoch": 1.026715239829994, "grad_norm": 0.361328125, "learning_rate": 9.610322923596652e-06, "loss": 0.4718, "step": 429 }, { "epoch": 1.029143897996357, "grad_norm": 0.546875, "learning_rate": 9.57137801057399e-06, "loss": 0.5207, "step": 430 }, { "epoch": 1.03157255616272, "grad_norm": 0.35546875, "learning_rate": 9.532439609388901e-06, "loss": 0.4787, "step": 431 }, { "epoch": 1.034001214329083, "grad_norm": 0.3515625, "learning_rate": 9.493508311612874e-06, "loss": 0.4768, "step": 432 }, { "epoch": 1.0364298724954462, "grad_norm": 0.359375, "learning_rate": 9.454584708709462e-06, "loss": 0.484, "step": 433 }, { "epoch": 1.0388585306618094, "grad_norm": 0.361328125, "learning_rate": 9.415669392025329e-06, "loss": 0.4812, "step": 434 }, { "epoch": 1.0412871888281725, "grad_norm": 0.353515625, "learning_rate": 9.376762952781234e-06, "loss": 0.475, "step": 435 }, { "epoch": 1.0437158469945356, "grad_norm": 0.35546875, "learning_rate": 9.337865982063076e-06, "loss": 0.4726, "step": 436 }, { "epoch": 1.0461445051608986, "grad_norm": 0.361328125, "learning_rate": 9.298979070812908e-06, "loss": 0.473, "step": 437 }, { "epoch": 1.0485731633272617, "grad_norm": 0.353515625, "learning_rate": 9.260102809819939e-06, "loss": 0.4739, "step": 438 }, { "epoch": 1.0510018214936248, "grad_norm": 0.361328125, "learning_rate": 9.221237789711587e-06, "loss": 0.4916, "step": 439 }, { "epoch": 1.0534304796599878, "grad_norm": 0.35546875, "learning_rate": 9.182384600944494e-06, "loss": 0.4823, "step": 440 }, { "epoch": 1.0558591378263509, "grad_norm": 0.349609375, "learning_rate": 9.143543833795539e-06, "loss": 0.4737, "step": 441 }, { "epoch": 1.058287795992714, "grad_norm": 0.361328125, "learning_rate": 9.104716078352906e-06, "loss": 0.4788, "step": 442 }, { "epoch": 1.058287795992714, "eval_loss": 0.5110214352607727, "eval_runtime": 96.9242, "eval_samples_per_second": 30.952, "eval_steps_per_second": 3.869, "step": 442 }, { "epoch": 1.060716454159077, "grad_norm": 0.365234375, "learning_rate": 9.065901924507085e-06, "loss": 0.4775, "step": 443 }, { "epoch": 1.0631451123254403, "grad_norm": 0.35546875, "learning_rate": 9.027101961941925e-06, "loss": 0.4735, "step": 444 }, { "epoch": 1.0655737704918034, "grad_norm": 0.353515625, "learning_rate": 8.98831678012568e-06, "loss": 0.4803, "step": 445 }, { "epoch": 1.0680024286581664, "grad_norm": 0.353515625, "learning_rate": 8.949546968302042e-06, "loss": 0.4767, "step": 446 }, { "epoch": 1.0704310868245295, "grad_norm": 0.36328125, "learning_rate": 8.910793115481201e-06, "loss": 0.4765, "step": 447 }, { "epoch": 1.0728597449908925, "grad_norm": 0.357421875, "learning_rate": 8.872055810430881e-06, "loss": 0.4789, "step": 448 }, { "epoch": 1.0752884031572556, "grad_norm": 0.671875, "learning_rate": 8.833335641667408e-06, "loss": 0.5243, "step": 449 }, { "epoch": 1.0777170613236187, "grad_norm": 0.3515625, "learning_rate": 8.79463319744677e-06, "loss": 0.4769, "step": 450 }, { "epoch": 1.0801457194899817, "grad_norm": 0.359375, "learning_rate": 8.755949065755668e-06, "loss": 0.4774, "step": 451 }, { "epoch": 1.0825743776563448, "grad_norm": 0.373046875, "learning_rate": 8.717283834302593e-06, "loss": 0.4792, "step": 452 }, { "epoch": 1.0850030358227079, "grad_norm": 0.353515625, "learning_rate": 8.678638090508897e-06, "loss": 0.4768, "step": 453 }, { "epoch": 1.0874316939890711, "grad_norm": 0.35546875, "learning_rate": 8.640012421499856e-06, "loss": 0.4738, "step": 454 }, { "epoch": 1.0898603521554342, "grad_norm": 0.5625, "learning_rate": 8.601407414095771e-06, "loss": 0.5251, "step": 455 }, { "epoch": 1.0922890103217973, "grad_norm": 0.35546875, "learning_rate": 8.562823654803035e-06, "loss": 0.4847, "step": 456 }, { "epoch": 1.0947176684881603, "grad_norm": 0.359375, "learning_rate": 8.524261729805235e-06, "loss": 0.4815, "step": 457 }, { "epoch": 1.0971463266545234, "grad_norm": 0.7421875, "learning_rate": 8.485722224954237e-06, "loss": 0.5586, "step": 458 }, { "epoch": 1.0995749848208864, "grad_norm": 0.353515625, "learning_rate": 8.44720572576128e-06, "loss": 0.4716, "step": 459 }, { "epoch": 1.1020036429872495, "grad_norm": 0.35546875, "learning_rate": 8.408712817388113e-06, "loss": 0.4782, "step": 460 }, { "epoch": 1.1044323011536126, "grad_norm": 0.50390625, "learning_rate": 8.370244084638055e-06, "loss": 0.5251, "step": 461 }, { "epoch": 1.1068609593199756, "grad_norm": 0.53515625, "learning_rate": 8.331800111947158e-06, "loss": 0.5125, "step": 462 }, { "epoch": 1.1092896174863387, "grad_norm": 0.5234375, "learning_rate": 8.293381483375293e-06, "loss": 0.5175, "step": 463 }, { "epoch": 1.111718275652702, "grad_norm": 0.5234375, "learning_rate": 8.254988782597295e-06, "loss": 0.514, "step": 464 }, { "epoch": 1.114146933819065, "grad_norm": 0.35546875, "learning_rate": 8.216622592894097e-06, "loss": 0.477, "step": 465 }, { "epoch": 1.116575591985428, "grad_norm": 0.375, "learning_rate": 8.178283497143851e-06, "loss": 0.4873, "step": 466 }, { "epoch": 1.1190042501517912, "grad_norm": 0.361328125, "learning_rate": 8.139972077813093e-06, "loss": 0.4805, "step": 467 }, { "epoch": 1.1214329083181542, "grad_norm": 0.359375, "learning_rate": 8.10168891694789e-06, "loss": 0.4738, "step": 468 }, { "epoch": 1.1214329083181542, "eval_loss": 0.5099829435348511, "eval_runtime": 99.9835, "eval_samples_per_second": 30.005, "eval_steps_per_second": 3.751, "step": 468 }, { "epoch": 1.1238615664845173, "grad_norm": 0.35546875, "learning_rate": 8.063434596164974e-06, "loss": 0.471, "step": 469 }, { "epoch": 1.1262902246508804, "grad_norm": 0.357421875, "learning_rate": 8.025209696642942e-06, "loss": 0.4781, "step": 470 }, { "epoch": 1.1287188828172434, "grad_norm": 0.353515625, "learning_rate": 7.987014799113398e-06, "loss": 0.4806, "step": 471 }, { "epoch": 1.1311475409836065, "grad_norm": 0.35546875, "learning_rate": 7.948850483852153e-06, "loss": 0.4737, "step": 472 }, { "epoch": 1.1335761991499695, "grad_norm": 0.6640625, "learning_rate": 7.91071733067038e-06, "loss": 0.5262, "step": 473 }, { "epoch": 1.1360048573163328, "grad_norm": 0.365234375, "learning_rate": 7.872615918905833e-06, "loss": 0.4892, "step": 474 }, { "epoch": 1.138433515482696, "grad_norm": 0.353515625, "learning_rate": 7.83454682741404e-06, "loss": 0.4825, "step": 475 }, { "epoch": 1.140862173649059, "grad_norm": 0.353515625, "learning_rate": 7.796510634559487e-06, "loss": 0.4708, "step": 476 }, { "epoch": 1.143290831815422, "grad_norm": 0.359375, "learning_rate": 7.758507918206859e-06, "loss": 0.474, "step": 477 }, { "epoch": 1.145719489981785, "grad_norm": 0.357421875, "learning_rate": 7.720539255712252e-06, "loss": 0.4705, "step": 478 }, { "epoch": 1.1481481481481481, "grad_norm": 0.3515625, "learning_rate": 7.682605223914386e-06, "loss": 0.4735, "step": 479 }, { "epoch": 1.1505768063145112, "grad_norm": 0.3515625, "learning_rate": 7.644706399125871e-06, "loss": 0.4696, "step": 480 }, { "epoch": 1.1530054644808743, "grad_norm": 0.365234375, "learning_rate": 7.606843357124426e-06, "loss": 0.4953, "step": 481 }, { "epoch": 1.1554341226472373, "grad_norm": 0.3515625, "learning_rate": 7.569016673144132e-06, "loss": 0.4749, "step": 482 }, { "epoch": 1.1578627808136004, "grad_norm": 0.353515625, "learning_rate": 7.531226921866715e-06, "loss": 0.4755, "step": 483 }, { "epoch": 1.1602914389799635, "grad_norm": 0.353515625, "learning_rate": 7.493474677412795e-06, "loss": 0.4661, "step": 484 }, { "epoch": 1.1627200971463267, "grad_norm": 0.63671875, "learning_rate": 7.455760513333172e-06, "loss": 0.5152, "step": 485 }, { "epoch": 1.1651487553126898, "grad_norm": 0.35546875, "learning_rate": 7.418085002600104e-06, "loss": 0.4787, "step": 486 }, { "epoch": 1.1675774134790529, "grad_norm": 0.353515625, "learning_rate": 7.3804487175986135e-06, "loss": 0.4718, "step": 487 }, { "epoch": 1.170006071645416, "grad_norm": 0.3515625, "learning_rate": 7.3428522301177894e-06, "loss": 0.4728, "step": 488 }, { "epoch": 1.172434729811779, "grad_norm": 0.353515625, "learning_rate": 7.305296111342086e-06, "loss": 0.4771, "step": 489 }, { "epoch": 1.174863387978142, "grad_norm": 0.353515625, "learning_rate": 7.267780931842667e-06, "loss": 0.479, "step": 490 }, { "epoch": 1.177292046144505, "grad_norm": 0.34765625, "learning_rate": 7.230307261568725e-06, "loss": 0.468, "step": 491 }, { "epoch": 1.1797207043108682, "grad_norm": 0.353515625, "learning_rate": 7.192875669838815e-06, "loss": 0.475, "step": 492 }, { "epoch": 1.1821493624772312, "grad_norm": 0.353515625, "learning_rate": 7.155486725332224e-06, "loss": 0.4683, "step": 493 }, { "epoch": 1.1845780206435945, "grad_norm": 0.353515625, "learning_rate": 7.118140996080313e-06, "loss": 0.4818, "step": 494 }, { "epoch": 1.1845780206435945, "eval_loss": 0.5090214610099792, "eval_runtime": 100.5205, "eval_samples_per_second": 29.845, "eval_steps_per_second": 3.731, "step": 494 }, { "epoch": 1.1870066788099576, "grad_norm": 0.48828125, "learning_rate": 7.080839049457908e-06, "loss": 0.513, "step": 495 }, { "epoch": 1.1894353369763206, "grad_norm": 0.359375, "learning_rate": 7.043581452174653e-06, "loss": 0.4799, "step": 496 }, { "epoch": 1.1918639951426837, "grad_norm": 0.5859375, "learning_rate": 7.006368770266421e-06, "loss": 0.5165, "step": 497 }, { "epoch": 1.1942926533090468, "grad_norm": 0.349609375, "learning_rate": 6.9692015690867135e-06, "loss": 0.4774, "step": 498 }, { "epoch": 1.1967213114754098, "grad_norm": 0.3515625, "learning_rate": 6.932080413298055e-06, "loss": 0.4723, "step": 499 }, { "epoch": 1.199149969641773, "grad_norm": 0.357421875, "learning_rate": 6.895005866863439e-06, "loss": 0.4679, "step": 500 }, { "epoch": 1.201578627808136, "grad_norm": 0.353515625, "learning_rate": 6.857978493037734e-06, "loss": 0.4769, "step": 501 }, { "epoch": 1.204007285974499, "grad_norm": 0.357421875, "learning_rate": 6.820998854359144e-06, "loss": 0.4752, "step": 502 }, { "epoch": 1.206435944140862, "grad_norm": 0.35546875, "learning_rate": 6.784067512640666e-06, "loss": 0.4781, "step": 503 }, { "epoch": 1.2088646023072251, "grad_norm": 0.35546875, "learning_rate": 6.7471850289615246e-06, "loss": 0.4705, "step": 504 }, { "epoch": 1.2112932604735884, "grad_norm": 0.7109375, "learning_rate": 6.710351963658692e-06, "loss": 0.5441, "step": 505 }, { "epoch": 1.2137219186399515, "grad_norm": 0.35546875, "learning_rate": 6.67356887631834e-06, "loss": 0.4712, "step": 506 }, { "epoch": 1.2161505768063146, "grad_norm": 0.357421875, "learning_rate": 6.636836325767342e-06, "loss": 0.4824, "step": 507 }, { "epoch": 1.2185792349726776, "grad_norm": 0.357421875, "learning_rate": 6.600154870064812e-06, "loss": 0.4772, "step": 508 }, { "epoch": 1.2210078931390407, "grad_norm": 0.349609375, "learning_rate": 6.563525066493588e-06, "loss": 0.4641, "step": 509 }, { "epoch": 1.2234365513054037, "grad_norm": 0.35546875, "learning_rate": 6.526947471551799e-06, "loss": 0.4711, "step": 510 }, { "epoch": 1.2258652094717668, "grad_norm": 0.34765625, "learning_rate": 6.490422640944378e-06, "loss": 0.4702, "step": 511 }, { "epoch": 1.2282938676381299, "grad_norm": 0.353515625, "learning_rate": 6.453951129574644e-06, "loss": 0.4849, "step": 512 }, { "epoch": 1.230722525804493, "grad_norm": 0.515625, "learning_rate": 6.41753349153587e-06, "loss": 0.5051, "step": 513 }, { "epoch": 1.2331511839708562, "grad_norm": 0.3515625, "learning_rate": 6.3811702801028465e-06, "loss": 0.4701, "step": 514 }, { "epoch": 1.2355798421372193, "grad_norm": 0.3515625, "learning_rate": 6.344862047723495e-06, "loss": 0.4765, "step": 515 }, { "epoch": 1.2380085003035823, "grad_norm": 0.3515625, "learning_rate": 6.30860934601047e-06, "loss": 0.4827, "step": 516 }, { "epoch": 1.2404371584699454, "grad_norm": 0.35546875, "learning_rate": 6.272412725732767e-06, "loss": 0.4787, "step": 517 }, { "epoch": 1.2428658166363085, "grad_norm": 0.353515625, "learning_rate": 6.236272736807378e-06, "loss": 0.4825, "step": 518 }, { "epoch": 1.2452944748026715, "grad_norm": 0.357421875, "learning_rate": 6.200189928290916e-06, "loss": 0.4799, "step": 519 }, { "epoch": 1.2477231329690346, "grad_norm": 0.349609375, "learning_rate": 6.1641648483712755e-06, "loss": 0.4719, "step": 520 }, { "epoch": 1.2477231329690346, "eval_loss": 0.5082234740257263, "eval_runtime": 98.6416, "eval_samples_per_second": 30.413, "eval_steps_per_second": 3.802, "step": 520 }, { "epoch": 1.2501517911353976, "grad_norm": 0.357421875, "learning_rate": 6.128198044359322e-06, "loss": 0.4689, "step": 521 }, { "epoch": 1.2525804493017607, "grad_norm": 0.353515625, "learning_rate": 6.09229006268055e-06, "loss": 0.4821, "step": 522 }, { "epoch": 1.255009107468124, "grad_norm": 0.359375, "learning_rate": 6.056441448866817e-06, "loss": 0.4793, "step": 523 }, { "epoch": 1.2574377656344868, "grad_norm": 0.3515625, "learning_rate": 6.020652747548008e-06, "loss": 0.4761, "step": 524 }, { "epoch": 1.2598664238008501, "grad_norm": 0.357421875, "learning_rate": 5.984924502443807e-06, "loss": 0.482, "step": 525 }, { "epoch": 1.2622950819672132, "grad_norm": 0.345703125, "learning_rate": 5.949257256355415e-06, "loss": 0.4674, "step": 526 }, { "epoch": 1.2647237401335762, "grad_norm": 0.34765625, "learning_rate": 5.913651551157295e-06, "loss": 0.4733, "step": 527 }, { "epoch": 1.2671523982999393, "grad_norm": 0.349609375, "learning_rate": 5.878107927788962e-06, "loss": 0.4742, "step": 528 }, { "epoch": 1.2695810564663024, "grad_norm": 0.349609375, "learning_rate": 5.84262692624675e-06, "loss": 0.476, "step": 529 }, { "epoch": 1.2720097146326654, "grad_norm": 0.3515625, "learning_rate": 5.8072090855756e-06, "loss": 0.4698, "step": 530 }, { "epoch": 1.2744383727990285, "grad_norm": 0.349609375, "learning_rate": 5.7718549438609085e-06, "loss": 0.4737, "step": 531 }, { "epoch": 1.2768670309653916, "grad_norm": 0.3515625, "learning_rate": 5.736565038220289e-06, "loss": 0.4787, "step": 532 }, { "epoch": 1.2792956891317546, "grad_norm": 0.3515625, "learning_rate": 5.701339904795486e-06, "loss": 0.4673, "step": 533 }, { "epoch": 1.281724347298118, "grad_norm": 0.353515625, "learning_rate": 5.666180078744169e-06, "loss": 0.4786, "step": 534 }, { "epoch": 1.2841530054644807, "grad_norm": 0.3515625, "learning_rate": 5.6310860942318235e-06, "loss": 0.4766, "step": 535 }, { "epoch": 1.286581663630844, "grad_norm": 0.35546875, "learning_rate": 5.5960584844236565e-06, "loss": 0.4744, "step": 536 }, { "epoch": 1.289010321797207, "grad_norm": 0.349609375, "learning_rate": 5.561097781476463e-06, "loss": 0.4706, "step": 537 }, { "epoch": 1.2914389799635702, "grad_norm": 0.35546875, "learning_rate": 5.5262045165305615e-06, "loss": 0.474, "step": 538 }, { "epoch": 1.2938676381299332, "grad_norm": 0.353515625, "learning_rate": 5.491379219701718e-06, "loss": 0.4737, "step": 539 }, { "epoch": 1.2962962962962963, "grad_norm": 0.349609375, "learning_rate": 5.456622420073084e-06, "loss": 0.4797, "step": 540 }, { "epoch": 1.2987249544626593, "grad_norm": 0.345703125, "learning_rate": 5.421934645687185e-06, "loss": 0.4779, "step": 541 }, { "epoch": 1.3011536126290224, "grad_norm": 0.349609375, "learning_rate": 5.387316423537869e-06, "loss": 0.476, "step": 542 }, { "epoch": 1.3035822707953855, "grad_norm": 0.353515625, "learning_rate": 5.352768279562315e-06, "loss": 0.4792, "step": 543 }, { "epoch": 1.3060109289617485, "grad_norm": 0.51171875, "learning_rate": 5.318290738633041e-06, "loss": 0.5148, "step": 544 }, { "epoch": 1.3084395871281118, "grad_norm": 0.3515625, "learning_rate": 5.283884324549924e-06, "loss": 0.4741, "step": 545 }, { "epoch": 1.3108682452944749, "grad_norm": 0.3515625, "learning_rate": 5.249549560032252e-06, "loss": 0.4643, "step": 546 }, { "epoch": 1.3108682452944749, "eval_loss": 0.5077295899391174, "eval_runtime": 96.7746, "eval_samples_per_second": 31.0, "eval_steps_per_second": 3.875, "step": 546 }, { "epoch": 1.313296903460838, "grad_norm": 0.353515625, "learning_rate": 5.215286966710774e-06, "loss": 0.4723, "step": 547 }, { "epoch": 1.315725561627201, "grad_norm": 0.353515625, "learning_rate": 5.18109706511978e-06, "loss": 0.4812, "step": 548 }, { "epoch": 1.318154219793564, "grad_norm": 0.359375, "learning_rate": 5.146980374689192e-06, "loss": 0.4683, "step": 549 }, { "epoch": 1.3205828779599271, "grad_norm": 0.34765625, "learning_rate": 5.112937413736667e-06, "loss": 0.4731, "step": 550 }, { "epoch": 1.3230115361262902, "grad_norm": 0.34765625, "learning_rate": 5.078968699459736e-06, "loss": 0.4687, "step": 551 }, { "epoch": 1.3254401942926533, "grad_norm": 0.3515625, "learning_rate": 5.045074747927927e-06, "loss": 0.4781, "step": 552 }, { "epoch": 1.3278688524590163, "grad_norm": 0.353515625, "learning_rate": 5.011256074074945e-06, "loss": 0.4764, "step": 553 }, { "epoch": 1.3302975106253796, "grad_norm": 0.345703125, "learning_rate": 4.977513191690834e-06, "loss": 0.4628, "step": 554 }, { "epoch": 1.3327261687917424, "grad_norm": 0.349609375, "learning_rate": 4.943846613414172e-06, "loss": 0.4751, "step": 555 }, { "epoch": 1.3351548269581057, "grad_norm": 0.34765625, "learning_rate": 4.910256850724306e-06, "loss": 0.4742, "step": 556 }, { "epoch": 1.3375834851244688, "grad_norm": 0.345703125, "learning_rate": 4.8767444139335365e-06, "loss": 0.4653, "step": 557 }, { "epoch": 1.3400121432908318, "grad_norm": 0.3515625, "learning_rate": 4.843309812179405e-06, "loss": 0.4779, "step": 558 }, { "epoch": 1.342440801457195, "grad_norm": 0.35546875, "learning_rate": 4.809953553416954e-06, "loss": 0.4845, "step": 559 }, { "epoch": 1.344869459623558, "grad_norm": 0.35546875, "learning_rate": 4.776676144410973e-06, "loss": 0.4687, "step": 560 }, { "epoch": 1.347298117789921, "grad_norm": 0.349609375, "learning_rate": 4.743478090728356e-06, "loss": 0.4819, "step": 561 }, { "epoch": 1.349726775956284, "grad_norm": 0.349609375, "learning_rate": 4.710359896730379e-06, "loss": 0.4757, "step": 562 }, { "epoch": 1.3521554341226472, "grad_norm": 0.3515625, "learning_rate": 4.677322065565039e-06, "loss": 0.4692, "step": 563 }, { "epoch": 1.3545840922890102, "grad_norm": 0.349609375, "learning_rate": 4.644365099159443e-06, "loss": 0.4787, "step": 564 }, { "epoch": 1.3570127504553735, "grad_norm": 0.50390625, "learning_rate": 4.611489498212145e-06, "loss": 0.5029, "step": 565 }, { "epoch": 1.3594414086217366, "grad_norm": 0.34765625, "learning_rate": 4.57869576218556e-06, "loss": 0.473, "step": 566 }, { "epoch": 1.3618700667880996, "grad_norm": 0.35546875, "learning_rate": 4.545984389298371e-06, "loss": 0.4751, "step": 567 }, { "epoch": 1.3642987249544627, "grad_norm": 0.35546875, "learning_rate": 4.5133558765179576e-06, "loss": 0.4757, "step": 568 }, { "epoch": 1.3667273831208258, "grad_norm": 0.34765625, "learning_rate": 4.480810719552848e-06, "loss": 0.4691, "step": 569 }, { "epoch": 1.3691560412871888, "grad_norm": 0.35546875, "learning_rate": 4.4483494128451885e-06, "loss": 0.477, "step": 570 }, { "epoch": 1.3715846994535519, "grad_norm": 0.3515625, "learning_rate": 4.4159724495632295e-06, "loss": 0.4775, "step": 571 }, { "epoch": 1.374013357619915, "grad_norm": 0.349609375, "learning_rate": 4.383680321593836e-06, "loss": 0.4783, "step": 572 }, { "epoch": 1.374013357619915, "eval_loss": 0.5073318481445312, "eval_runtime": 103.2698, "eval_samples_per_second": 29.05, "eval_steps_per_second": 3.631, "step": 572 }, { "epoch": 1.376442015786278, "grad_norm": 0.3515625, "learning_rate": 4.35147351953501e-06, "loss": 0.4735, "step": 573 }, { "epoch": 1.3788706739526413, "grad_norm": 0.357421875, "learning_rate": 4.319352532688444e-06, "loss": 0.4667, "step": 574 }, { "epoch": 1.3812993321190041, "grad_norm": 0.357421875, "learning_rate": 4.287317849052075e-06, "loss": 0.4788, "step": 575 }, { "epoch": 1.3837279902853674, "grad_norm": 0.349609375, "learning_rate": 4.255369955312698e-06, "loss": 0.474, "step": 576 }, { "epoch": 1.3861566484517305, "grad_norm": 0.349609375, "learning_rate": 4.223509336838528e-06, "loss": 0.4688, "step": 577 }, { "epoch": 1.3885853066180935, "grad_norm": 0.353515625, "learning_rate": 4.191736477671864e-06, "loss": 0.4688, "step": 578 }, { "epoch": 1.3910139647844566, "grad_norm": 0.3515625, "learning_rate": 4.160051860521731e-06, "loss": 0.4659, "step": 579 }, { "epoch": 1.3934426229508197, "grad_norm": 0.35546875, "learning_rate": 4.128455966756512e-06, "loss": 0.4759, "step": 580 }, { "epoch": 1.3958712811171827, "grad_norm": 0.35546875, "learning_rate": 4.096949276396694e-06, "loss": 0.4779, "step": 581 }, { "epoch": 1.3982999392835458, "grad_norm": 0.3515625, "learning_rate": 4.065532268107507e-06, "loss": 0.4776, "step": 582 }, { "epoch": 1.4007285974499089, "grad_norm": 0.3515625, "learning_rate": 4.034205419191709e-06, "loss": 0.4749, "step": 583 }, { "epoch": 1.403157255616272, "grad_norm": 0.353515625, "learning_rate": 4.002969205582314e-06, "loss": 0.4791, "step": 584 }, { "epoch": 1.4055859137826352, "grad_norm": 0.35546875, "learning_rate": 3.971824101835341e-06, "loss": 0.4723, "step": 585 }, { "epoch": 1.4080145719489983, "grad_norm": 0.349609375, "learning_rate": 3.940770581122634e-06, "loss": 0.4803, "step": 586 }, { "epoch": 1.4104432301153613, "grad_norm": 0.349609375, "learning_rate": 3.909809115224674e-06, "loss": 0.4667, "step": 587 }, { "epoch": 1.4128718882817244, "grad_norm": 0.357421875, "learning_rate": 3.878940174523371e-06, "loss": 0.4795, "step": 588 }, { "epoch": 1.4153005464480874, "grad_norm": 0.341796875, "learning_rate": 3.848164227994976e-06, "loss": 0.4631, "step": 589 }, { "epoch": 1.4177292046144505, "grad_norm": 0.361328125, "learning_rate": 3.8174817432029125e-06, "loss": 0.4728, "step": 590 }, { "epoch": 1.4201578627808136, "grad_norm": 0.345703125, "learning_rate": 3.7868931862906756e-06, "loss": 0.4658, "step": 591 }, { "epoch": 1.4225865209471766, "grad_norm": 0.353515625, "learning_rate": 3.7563990219747857e-06, "loss": 0.4841, "step": 592 }, { "epoch": 1.4250151791135397, "grad_norm": 0.3515625, "learning_rate": 3.725999713537689e-06, "loss": 0.4763, "step": 593 }, { "epoch": 1.427443837279903, "grad_norm": 0.3515625, "learning_rate": 3.695695722820737e-06, "loss": 0.4804, "step": 594 }, { "epoch": 1.4298724954462658, "grad_norm": 0.345703125, "learning_rate": 3.6654875102171683e-06, "loss": 0.4687, "step": 595 }, { "epoch": 1.432301153612629, "grad_norm": 0.34765625, "learning_rate": 3.635375534665111e-06, "loss": 0.464, "step": 596 }, { "epoch": 1.4347298117789922, "grad_norm": 0.34765625, "learning_rate": 3.605360253640614e-06, "loss": 0.4735, "step": 597 }, { "epoch": 1.4371584699453552, "grad_norm": 0.34765625, "learning_rate": 3.5754421231506953e-06, "loss": 0.4782, "step": 598 }, { "epoch": 1.4371584699453552, "eval_loss": 0.5070293545722961, "eval_runtime": 107.3386, "eval_samples_per_second": 27.949, "eval_steps_per_second": 3.494, "step": 598 }, { "epoch": 1.4395871281117183, "grad_norm": 0.353515625, "learning_rate": 3.545621597726412e-06, "loss": 0.4721, "step": 599 }, { "epoch": 1.4420157862780814, "grad_norm": 0.357421875, "learning_rate": 3.5158991304159572e-06, "loss": 0.4755, "step": 600 }, { "epoch": 1.4444444444444444, "grad_norm": 0.353515625, "learning_rate": 3.48627517277778e-06, "loss": 0.4827, "step": 601 }, { "epoch": 1.4468731026108075, "grad_norm": 0.34765625, "learning_rate": 3.4567501748737153e-06, "loss": 0.4693, "step": 602 }, { "epoch": 1.4493017607771705, "grad_norm": 0.349609375, "learning_rate": 3.427324585262156e-06, "loss": 0.468, "step": 603 }, { "epoch": 1.4517304189435336, "grad_norm": 0.34375, "learning_rate": 3.3979988509912443e-06, "loss": 0.4715, "step": 604 }, { "epoch": 1.454159077109897, "grad_norm": 0.353515625, "learning_rate": 3.3687734175920505e-06, "loss": 0.4844, "step": 605 }, { "epoch": 1.4565877352762597, "grad_norm": 0.353515625, "learning_rate": 3.339648729071836e-06, "loss": 0.4731, "step": 606 }, { "epoch": 1.459016393442623, "grad_norm": 0.34765625, "learning_rate": 3.310625227907307e-06, "loss": 0.4744, "step": 607 }, { "epoch": 1.461445051608986, "grad_norm": 0.353515625, "learning_rate": 3.281703355037854e-06, "loss": 0.4771, "step": 608 }, { "epoch": 1.4638737097753491, "grad_norm": 0.34765625, "learning_rate": 3.2528835498589085e-06, "loss": 0.471, "step": 609 }, { "epoch": 1.4663023679417122, "grad_norm": 0.35546875, "learning_rate": 3.2241662502152236e-06, "loss": 0.4773, "step": 610 }, { "epoch": 1.4687310261080753, "grad_norm": 0.349609375, "learning_rate": 3.195551892394234e-06, "loss": 0.4772, "step": 611 }, { "epoch": 1.4711596842744383, "grad_norm": 0.35546875, "learning_rate": 3.1670409111194454e-06, "loss": 0.4707, "step": 612 }, { "epoch": 1.4735883424408014, "grad_norm": 0.349609375, "learning_rate": 3.138633739543805e-06, "loss": 0.4759, "step": 613 }, { "epoch": 1.4760170006071647, "grad_norm": 0.345703125, "learning_rate": 3.110330809243134e-06, "loss": 0.4693, "step": 614 }, { "epoch": 1.4784456587735275, "grad_norm": 0.34375, "learning_rate": 3.082132550209571e-06, "loss": 0.4666, "step": 615 }, { "epoch": 1.4808743169398908, "grad_norm": 0.349609375, "learning_rate": 3.054039390845035e-06, "loss": 0.4731, "step": 616 }, { "epoch": 1.4833029751062539, "grad_norm": 0.34765625, "learning_rate": 3.0260517579547166e-06, "loss": 0.4782, "step": 617 }, { "epoch": 1.485731633272617, "grad_norm": 0.5390625, "learning_rate": 2.998170076740601e-06, "loss": 0.5016, "step": 618 }, { "epoch": 1.48816029143898, "grad_norm": 0.5078125, "learning_rate": 2.9703947707949974e-06, "loss": 0.5092, "step": 619 }, { "epoch": 1.490588949605343, "grad_norm": 0.353515625, "learning_rate": 2.9427262620941142e-06, "loss": 0.4768, "step": 620 }, { "epoch": 1.4930176077717061, "grad_norm": 0.349609375, "learning_rate": 2.915164970991642e-06, "loss": 0.4699, "step": 621 }, { "epoch": 1.4954462659380692, "grad_norm": 0.34765625, "learning_rate": 2.8877113162123637e-06, "loss": 0.4729, "step": 622 }, { "epoch": 1.4978749241044322, "grad_norm": 0.34765625, "learning_rate": 2.8603657148458053e-06, "loss": 0.4698, "step": 623 }, { "epoch": 1.5003035822707953, "grad_norm": 0.34765625, "learning_rate": 2.833128582339887e-06, "loss": 0.4812, "step": 624 }, { "epoch": 1.5003035822707953, "eval_loss": 0.5068376660346985, "eval_runtime": 97.0195, "eval_samples_per_second": 30.922, "eval_steps_per_second": 3.865, "step": 624 }, { "epoch": 1.5027322404371586, "grad_norm": 0.34765625, "learning_rate": 2.806000332494617e-06, "loss": 0.4651, "step": 625 }, { "epoch": 1.5051608986035214, "grad_norm": 0.345703125, "learning_rate": 2.778981377455806e-06, "loss": 0.4681, "step": 626 }, { "epoch": 1.5075895567698847, "grad_norm": 0.34765625, "learning_rate": 2.7520721277088023e-06, "loss": 0.4747, "step": 627 }, { "epoch": 1.5100182149362478, "grad_norm": 0.353515625, "learning_rate": 2.7252729920722564e-06, "loss": 0.4736, "step": 628 }, { "epoch": 1.5124468731026108, "grad_norm": 0.53125, "learning_rate": 2.698584377691913e-06, "loss": 0.5096, "step": 629 }, { "epoch": 1.514875531268974, "grad_norm": 0.349609375, "learning_rate": 2.6720066900344212e-06, "loss": 0.4703, "step": 630 }, { "epoch": 1.517304189435337, "grad_norm": 0.3515625, "learning_rate": 2.6455403328811736e-06, "loss": 0.4765, "step": 631 }, { "epoch": 1.5197328476017, "grad_norm": 0.349609375, "learning_rate": 2.6191857083221873e-06, "loss": 0.4819, "step": 632 }, { "epoch": 1.522161505768063, "grad_norm": 0.34765625, "learning_rate": 2.5929432167499658e-06, "loss": 0.4673, "step": 633 }, { "epoch": 1.5245901639344264, "grad_norm": 0.34765625, "learning_rate": 2.5668132568534377e-06, "loss": 0.4748, "step": 634 }, { "epoch": 1.5270188221007892, "grad_norm": 0.345703125, "learning_rate": 2.540796225611907e-06, "loss": 0.4674, "step": 635 }, { "epoch": 1.5294474802671525, "grad_norm": 0.546875, "learning_rate": 2.514892518288988e-06, "loss": 0.5083, "step": 636 }, { "epoch": 1.5318761384335153, "grad_norm": 0.51953125, "learning_rate": 2.4891025284266436e-06, "loss": 0.5049, "step": 637 }, { "epoch": 1.5343047965998786, "grad_norm": 0.345703125, "learning_rate": 2.463426647839173e-06, "loss": 0.4701, "step": 638 }, { "epoch": 1.5367334547662417, "grad_norm": 0.349609375, "learning_rate": 2.4378652666072646e-06, "loss": 0.4715, "step": 639 }, { "epoch": 1.5391621129326047, "grad_norm": 0.498046875, "learning_rate": 2.4124187730720916e-06, "loss": 0.5031, "step": 640 }, { "epoch": 1.5415907710989678, "grad_norm": 0.345703125, "learning_rate": 2.387087553829386e-06, "loss": 0.4734, "step": 641 }, { "epoch": 1.5440194292653309, "grad_norm": 0.349609375, "learning_rate": 2.361871993723579e-06, "loss": 0.4649, "step": 642 }, { "epoch": 1.5464480874316942, "grad_norm": 0.5, "learning_rate": 2.3367724758419495e-06, "loss": 0.5191, "step": 643 }, { "epoch": 1.548876745598057, "grad_norm": 0.349609375, "learning_rate": 2.3117893815088067e-06, "loss": 0.4755, "step": 644 }, { "epoch": 1.5513054037644203, "grad_norm": 0.353515625, "learning_rate": 2.2869230902796934e-06, "loss": 0.4805, "step": 645 }, { "epoch": 1.5537340619307831, "grad_norm": 0.3515625, "learning_rate": 2.2621739799356244e-06, "loss": 0.4807, "step": 646 }, { "epoch": 1.5561627200971464, "grad_norm": 0.349609375, "learning_rate": 2.2375424264773447e-06, "loss": 0.4818, "step": 647 }, { "epoch": 1.5585913782635095, "grad_norm": 0.353515625, "learning_rate": 2.2130288041196135e-06, "loss": 0.4773, "step": 648 }, { "epoch": 1.5610200364298725, "grad_norm": 0.34375, "learning_rate": 2.188633485285525e-06, "loss": 0.4696, "step": 649 }, { "epoch": 1.5634486945962356, "grad_norm": 0.34765625, "learning_rate": 2.1643568406008476e-06, "loss": 0.4679, "step": 650 }, { "epoch": 1.5634486945962356, "eval_loss": 0.5066995620727539, "eval_runtime": 98.2878, "eval_samples_per_second": 30.523, "eval_steps_per_second": 3.815, "step": 650 }, { "epoch": 1.5658773527625987, "grad_norm": 0.34765625, "learning_rate": 2.1401992388883888e-06, "loss": 0.4672, "step": 651 }, { "epoch": 1.5683060109289617, "grad_norm": 0.33984375, "learning_rate": 2.1161610471624084e-06, "loss": 0.4629, "step": 652 }, { "epoch": 1.5707346690953248, "grad_norm": 0.345703125, "learning_rate": 2.092242630623016e-06, "loss": 0.4701, "step": 653 }, { "epoch": 1.573163327261688, "grad_norm": 0.349609375, "learning_rate": 2.0684443526506415e-06, "loss": 0.4767, "step": 654 }, { "epoch": 1.575591985428051, "grad_norm": 0.349609375, "learning_rate": 2.0447665748005206e-06, "loss": 0.4677, "step": 655 }, { "epoch": 1.5780206435944142, "grad_norm": 0.48046875, "learning_rate": 2.021209656797174e-06, "loss": 0.5038, "step": 656 }, { "epoch": 1.580449301760777, "grad_norm": 0.349609375, "learning_rate": 1.9977739565289743e-06, "loss": 0.4732, "step": 657 }, { "epoch": 1.5828779599271403, "grad_norm": 0.345703125, "learning_rate": 1.974459830042691e-06, "loss": 0.4743, "step": 658 }, { "epoch": 1.5853066180935034, "grad_norm": 0.34375, "learning_rate": 1.951267631538072e-06, "loss": 0.4686, "step": 659 }, { "epoch": 1.5877352762598664, "grad_norm": 0.59375, "learning_rate": 1.928197713362495e-06, "loss": 0.5074, "step": 660 }, { "epoch": 1.5901639344262295, "grad_norm": 0.34765625, "learning_rate": 1.9052504260055838e-06, "loss": 0.4701, "step": 661 }, { "epoch": 1.5925925925925926, "grad_norm": 0.34765625, "learning_rate": 1.8824261180938875e-06, "loss": 0.4757, "step": 662 }, { "epoch": 1.5950212507589556, "grad_norm": 0.345703125, "learning_rate": 1.8597251363856061e-06, "loss": 0.4754, "step": 663 }, { "epoch": 1.5974499089253187, "grad_norm": 0.345703125, "learning_rate": 1.8371478257652908e-06, "loss": 0.4718, "step": 664 }, { "epoch": 1.599878567091682, "grad_norm": 0.349609375, "learning_rate": 1.8146945292386343e-06, "loss": 0.4765, "step": 665 }, { "epoch": 1.6023072252580448, "grad_norm": 0.349609375, "learning_rate": 1.7923655879272395e-06, "loss": 0.4822, "step": 666 }, { "epoch": 1.604735883424408, "grad_norm": 0.35546875, "learning_rate": 1.7701613410634367e-06, "loss": 0.4802, "step": 667 }, { "epoch": 1.607164541590771, "grad_norm": 0.349609375, "learning_rate": 1.7480821259851488e-06, "loss": 0.4741, "step": 668 }, { "epoch": 1.6095931997571342, "grad_norm": 0.349609375, "learning_rate": 1.7261282781307486e-06, "loss": 0.4686, "step": 669 }, { "epoch": 1.6120218579234973, "grad_norm": 0.349609375, "learning_rate": 1.7043001310339646e-06, "loss": 0.4672, "step": 670 }, { "epoch": 1.6144505160898603, "grad_norm": 0.34765625, "learning_rate": 1.6825980163188204e-06, "loss": 0.4727, "step": 671 }, { "epoch": 1.6168791742562234, "grad_norm": 0.34765625, "learning_rate": 1.661022263694594e-06, "loss": 0.4805, "step": 672 }, { "epoch": 1.6193078324225865, "grad_norm": 0.34765625, "learning_rate": 1.6395732009508058e-06, "loss": 0.469, "step": 673 }, { "epoch": 1.6217364905889498, "grad_norm": 0.349609375, "learning_rate": 1.6182511539522427e-06, "loss": 0.4747, "step": 674 }, { "epoch": 1.6241651487553126, "grad_norm": 0.34375, "learning_rate": 1.5970564466340022e-06, "loss": 0.4635, "step": 675 }, { "epoch": 1.6265938069216759, "grad_norm": 0.349609375, "learning_rate": 1.5759894009965793e-06, "loss": 0.4725, "step": 676 }, { "epoch": 1.6265938069216759, "eval_loss": 0.5065969824790955, "eval_runtime": 97.0109, "eval_samples_per_second": 30.924, "eval_steps_per_second": 3.866, "step": 676 }, { "epoch": 1.6290224650880387, "grad_norm": 0.349609375, "learning_rate": 1.5550503371009652e-06, "loss": 0.4762, "step": 677 }, { "epoch": 1.631451123254402, "grad_norm": 0.34375, "learning_rate": 1.5342395730637904e-06, "loss": 0.4738, "step": 678 }, { "epoch": 1.633879781420765, "grad_norm": 0.349609375, "learning_rate": 1.5135574250524898e-06, "loss": 0.4787, "step": 679 }, { "epoch": 1.6363084395871281, "grad_norm": 0.345703125, "learning_rate": 1.4930042072805062e-06, "loss": 0.4681, "step": 680 }, { "epoch": 1.6387370977534912, "grad_norm": 0.34765625, "learning_rate": 1.4725802320024985e-06, "loss": 0.4772, "step": 681 }, { "epoch": 1.6411657559198543, "grad_norm": 0.349609375, "learning_rate": 1.452285809509617e-06, "loss": 0.4753, "step": 682 }, { "epoch": 1.6435944140862173, "grad_norm": 0.3515625, "learning_rate": 1.432121248124786e-06, "loss": 0.4793, "step": 683 }, { "epoch": 1.6460230722525804, "grad_norm": 0.3515625, "learning_rate": 1.4120868541980026e-06, "loss": 0.4766, "step": 684 }, { "epoch": 1.6484517304189437, "grad_norm": 0.3515625, "learning_rate": 1.39218293210171e-06, "loss": 0.4742, "step": 685 }, { "epoch": 1.6508803885853065, "grad_norm": 0.349609375, "learning_rate": 1.372409784226152e-06, "loss": 0.485, "step": 686 }, { "epoch": 1.6533090467516698, "grad_norm": 0.3515625, "learning_rate": 1.3527677109747784e-06, "loss": 0.476, "step": 687 }, { "epoch": 1.6557377049180326, "grad_norm": 0.349609375, "learning_rate": 1.333257010759702e-06, "loss": 0.4773, "step": 688 }, { "epoch": 1.658166363084396, "grad_norm": 0.349609375, "learning_rate": 1.3138779799971446e-06, "loss": 0.4772, "step": 689 }, { "epoch": 1.660595021250759, "grad_norm": 0.349609375, "learning_rate": 1.294630913102939e-06, "loss": 0.478, "step": 690 }, { "epoch": 1.663023679417122, "grad_norm": 0.345703125, "learning_rate": 1.2755161024880602e-06, "loss": 0.472, "step": 691 }, { "epoch": 1.665452337583485, "grad_norm": 0.34765625, "learning_rate": 1.2565338385541792e-06, "loss": 0.4716, "step": 692 }, { "epoch": 1.6678809957498482, "grad_norm": 0.34375, "learning_rate": 1.2376844096892526e-06, "loss": 0.4646, "step": 693 }, { "epoch": 1.6703096539162114, "grad_norm": 0.34765625, "learning_rate": 1.2189681022631405e-06, "loss": 0.4743, "step": 694 }, { "epoch": 1.6727383120825743, "grad_norm": 0.34375, "learning_rate": 1.2003852006232564e-06, "loss": 0.4727, "step": 695 }, { "epoch": 1.6751669702489376, "grad_norm": 0.34375, "learning_rate": 1.181935987090247e-06, "loss": 0.463, "step": 696 }, { "epoch": 1.6775956284153004, "grad_norm": 0.349609375, "learning_rate": 1.1636207419537038e-06, "loss": 0.4799, "step": 697 }, { "epoch": 1.6800242865816637, "grad_norm": 0.3515625, "learning_rate": 1.1454397434679022e-06, "loss": 0.4795, "step": 698 }, { "epoch": 1.6824529447480268, "grad_norm": 0.3515625, "learning_rate": 1.1273932678475764e-06, "loss": 0.4748, "step": 699 }, { "epoch": 1.6848816029143898, "grad_norm": 0.6171875, "learning_rate": 1.1094815892637256e-06, "loss": 0.5055, "step": 700 }, { "epoch": 1.6873102610807529, "grad_norm": 0.34765625, "learning_rate": 1.0917049798394408e-06, "loss": 0.4721, "step": 701 }, { "epoch": 1.689738919247116, "grad_norm": 0.345703125, "learning_rate": 1.0740637096457773e-06, "loss": 0.4645, "step": 702 }, { "epoch": 1.689738919247116, "eval_loss": 0.5065945386886597, "eval_runtime": 97.1015, "eval_samples_per_second": 30.896, "eval_steps_per_second": 3.862, "step": 702 }, { "epoch": 1.692167577413479, "grad_norm": 0.353515625, "learning_rate": 1.0565580466976566e-06, "loss": 0.4757, "step": 703 }, { "epoch": 1.694596235579842, "grad_norm": 0.3515625, "learning_rate": 1.0391882569497758e-06, "loss": 0.475, "step": 704 }, { "epoch": 1.6970248937462054, "grad_norm": 0.345703125, "learning_rate": 1.0219546042925842e-06, "loss": 0.4777, "step": 705 }, { "epoch": 1.6994535519125682, "grad_norm": 0.34765625, "learning_rate": 1.0048573505482728e-06, "loss": 0.4712, "step": 706 }, { "epoch": 1.7018822100789315, "grad_norm": 0.54296875, "learning_rate": 9.878967554667862e-07, "loss": 0.5034, "step": 707 }, { "epoch": 1.7043108682452943, "grad_norm": 0.34765625, "learning_rate": 9.710730767218913e-07, "loss": 0.469, "step": 708 }, { "epoch": 1.7067395264116576, "grad_norm": 0.34375, "learning_rate": 9.54386569907244e-07, "loss": 0.4712, "step": 709 }, { "epoch": 1.7091681845780207, "grad_norm": 0.34765625, "learning_rate": 9.378374885325225e-07, "loss": 0.4754, "step": 710 }, { "epoch": 1.7115968427443837, "grad_norm": 0.345703125, "learning_rate": 9.214260840195732e-07, "loss": 0.4796, "step": 711 }, { "epoch": 1.7140255009107468, "grad_norm": 0.345703125, "learning_rate": 9.051526056985737e-07, "loss": 0.467, "step": 712 }, { "epoch": 1.7164541590771099, "grad_norm": 0.349609375, "learning_rate": 8.890173008042768e-07, "loss": 0.4749, "step": 713 }, { "epoch": 1.7188828172434731, "grad_norm": 0.482421875, "learning_rate": 8.730204144722232e-07, "loss": 0.5046, "step": 714 }, { "epoch": 1.721311475409836, "grad_norm": 0.34765625, "learning_rate": 8.571621897350312e-07, "loss": 0.4781, "step": 715 }, { "epoch": 1.7237401335761993, "grad_norm": 0.34765625, "learning_rate": 8.414428675187114e-07, "loss": 0.4611, "step": 716 }, { "epoch": 1.726168791742562, "grad_norm": 0.34375, "learning_rate": 8.258626866389897e-07, "loss": 0.4659, "step": 717 }, { "epoch": 1.7285974499089254, "grad_norm": 0.34765625, "learning_rate": 8.10421883797694e-07, "loss": 0.467, "step": 718 }, { "epoch": 1.7310261080752884, "grad_norm": 0.34375, "learning_rate": 7.951206935791478e-07, "loss": 0.4678, "step": 719 }, { "epoch": 1.7334547662416515, "grad_norm": 0.345703125, "learning_rate": 7.799593484466139e-07, "loss": 0.4771, "step": 720 }, { "epoch": 1.7358834244080146, "grad_norm": 0.34765625, "learning_rate": 7.649380787387561e-07, "loss": 0.4725, "step": 721 }, { "epoch": 1.7383120825743776, "grad_norm": 0.34765625, "learning_rate": 7.500571126661449e-07, "loss": 0.4732, "step": 722 }, { "epoch": 1.7407407407407407, "grad_norm": 0.34375, "learning_rate": 7.35316676307789e-07, "loss": 0.4716, "step": 723 }, { "epoch": 1.7431693989071038, "grad_norm": 0.3515625, "learning_rate": 7.207169936076974e-07, "loss": 0.4721, "step": 724 }, { "epoch": 1.745598057073467, "grad_norm": 0.34375, "learning_rate": 7.06258286371484e-07, "loss": 0.4726, "step": 725 }, { "epoch": 1.7480267152398299, "grad_norm": 0.61328125, "learning_rate": 6.919407742629891e-07, "loss": 0.5167, "step": 726 }, { "epoch": 1.7504553734061932, "grad_norm": 0.3515625, "learning_rate": 6.77764674800947e-07, "loss": 0.4826, "step": 727 }, { "epoch": 1.752884031572556, "grad_norm": 0.349609375, "learning_rate": 6.637302033556891e-07, "loss": 0.4792, "step": 728 }, { "epoch": 1.752884031572556, "eval_loss": 0.5065528750419617, "eval_runtime": 97.2896, "eval_samples_per_second": 30.836, "eval_steps_per_second": 3.854, "step": 728 }, { "epoch": 1.7553126897389193, "grad_norm": 0.34765625, "learning_rate": 6.498375731458529e-07, "loss": 0.4687, "step": 729 }, { "epoch": 1.7577413479052824, "grad_norm": 0.349609375, "learning_rate": 6.360869952351568e-07, "loss": 0.4841, "step": 730 }, { "epoch": 1.7601700060716454, "grad_norm": 0.34765625, "learning_rate": 6.22478678529197e-07, "loss": 0.4773, "step": 731 }, { "epoch": 1.7625986642380085, "grad_norm": 0.3515625, "learning_rate": 6.090128297722564e-07, "loss": 0.476, "step": 732 }, { "epoch": 1.7650273224043715, "grad_norm": 0.349609375, "learning_rate": 5.956896535441803e-07, "loss": 0.4749, "step": 733 }, { "epoch": 1.7674559805707348, "grad_norm": 0.34765625, "learning_rate": 5.825093522572666e-07, "loss": 0.4828, "step": 734 }, { "epoch": 1.7698846387370977, "grad_norm": 0.34375, "learning_rate": 5.694721261531732e-07, "loss": 0.4682, "step": 735 }, { "epoch": 1.772313296903461, "grad_norm": 0.345703125, "learning_rate": 5.565781732999043e-07, "loss": 0.4733, "step": 736 }, { "epoch": 1.7747419550698238, "grad_norm": 0.3515625, "learning_rate": 5.438276895887761e-07, "loss": 0.4767, "step": 737 }, { "epoch": 1.777170613236187, "grad_norm": 0.35546875, "learning_rate": 5.312208687314502e-07, "loss": 0.4758, "step": 738 }, { "epoch": 1.7795992714025501, "grad_norm": 0.3515625, "learning_rate": 5.187579022569977e-07, "loss": 0.4839, "step": 739 }, { "epoch": 1.7820279295689132, "grad_norm": 0.470703125, "learning_rate": 5.064389795089764e-07, "loss": 0.5067, "step": 740 }, { "epoch": 1.7844565877352763, "grad_norm": 0.5078125, "learning_rate": 4.942642876425641e-07, "loss": 0.5085, "step": 741 }, { "epoch": 1.7868852459016393, "grad_norm": 0.34765625, "learning_rate": 4.822340116217116e-07, "loss": 0.4757, "step": 742 }, { "epoch": 1.7893139040680024, "grad_norm": 0.349609375, "learning_rate": 4.703483342163262e-07, "loss": 0.4792, "step": 743 }, { "epoch": 1.7917425622343655, "grad_norm": 0.349609375, "learning_rate": 4.5860743599951186e-07, "loss": 0.4667, "step": 744 }, { "epoch": 1.7941712204007287, "grad_norm": 0.34765625, "learning_rate": 4.470114953448079e-07, "loss": 0.4772, "step": 745 }, { "epoch": 1.7965998785670916, "grad_norm": 0.349609375, "learning_rate": 4.3556068842348865e-07, "loss": 0.4801, "step": 746 }, { "epoch": 1.7990285367334549, "grad_norm": 0.34375, "learning_rate": 4.2425518920188536e-07, "loss": 0.4718, "step": 747 }, { "epoch": 1.8014571948998177, "grad_norm": 0.349609375, "learning_rate": 4.1309516943874196e-07, "loss": 0.4731, "step": 748 }, { "epoch": 1.803885853066181, "grad_norm": 0.349609375, "learning_rate": 4.0208079868260696e-07, "loss": 0.4812, "step": 749 }, { "epoch": 1.806314511232544, "grad_norm": 0.345703125, "learning_rate": 3.9121224426925675e-07, "loss": 0.4739, "step": 750 }, { "epoch": 1.8087431693989071, "grad_norm": 0.349609375, "learning_rate": 3.8048967131915414e-07, "loss": 0.4755, "step": 751 }, { "epoch": 1.8111718275652702, "grad_norm": 0.349609375, "learning_rate": 3.699132427349383e-07, "loss": 0.4749, "step": 752 }, { "epoch": 1.8136004857316332, "grad_norm": 0.345703125, "learning_rate": 3.594831191989523e-07, "loss": 0.4737, "step": 753 }, { "epoch": 1.8160291438979965, "grad_norm": 0.34765625, "learning_rate": 3.49199459170797e-07, "loss": 0.4689, "step": 754 }, { "epoch": 1.8160291438979965, "eval_loss": 0.506585955619812, "eval_runtime": 101.0452, "eval_samples_per_second": 29.69, "eval_steps_per_second": 3.711, "step": 754 }, { "epoch": 1.8184578020643594, "grad_norm": 0.34765625, "learning_rate": 3.3906241888493005e-07, "loss": 0.4732, "step": 755 }, { "epoch": 1.8208864602307226, "grad_norm": 0.357421875, "learning_rate": 3.2907215234829205e-07, "loss": 0.4814, "step": 756 }, { "epoch": 1.8233151183970855, "grad_norm": 0.34765625, "learning_rate": 3.1922881133795827e-07, "loss": 0.4705, "step": 757 }, { "epoch": 1.8257437765634488, "grad_norm": 0.34765625, "learning_rate": 3.095325453988385e-07, "loss": 0.4727, "step": 758 }, { "epoch": 1.8281724347298116, "grad_norm": 0.34375, "learning_rate": 2.999835018414143e-07, "loss": 0.4698, "step": 759 }, { "epoch": 1.830601092896175, "grad_norm": 0.34765625, "learning_rate": 2.905818257394799e-07, "loss": 0.478, "step": 760 }, { "epoch": 1.833029751062538, "grad_norm": 0.345703125, "learning_rate": 2.8132765992795797e-07, "loss": 0.4695, "step": 761 }, { "epoch": 1.835458409228901, "grad_norm": 0.345703125, "learning_rate": 2.722211450007206e-07, "loss": 0.4722, "step": 762 }, { "epoch": 1.837887067395264, "grad_norm": 0.345703125, "learning_rate": 2.632624193084499e-07, "loss": 0.4632, "step": 763 }, { "epoch": 1.8403157255616271, "grad_norm": 0.34765625, "learning_rate": 2.544516189565482e-07, "loss": 0.4781, "step": 764 }, { "epoch": 1.8427443837279904, "grad_norm": 0.345703125, "learning_rate": 2.4578887780305704e-07, "loss": 0.4755, "step": 765 }, { "epoch": 1.8451730418943533, "grad_norm": 0.34765625, "learning_rate": 2.3727432745663025e-07, "loss": 0.4761, "step": 766 }, { "epoch": 1.8476017000607166, "grad_norm": 0.349609375, "learning_rate": 2.2890809727453612e-07, "loss": 0.4747, "step": 767 }, { "epoch": 1.8500303582270794, "grad_norm": 0.34765625, "learning_rate": 2.2069031436068643e-07, "loss": 0.4728, "step": 768 }, { "epoch": 1.8524590163934427, "grad_norm": 0.349609375, "learning_rate": 2.1262110356371047e-07, "loss": 0.4824, "step": 769 }, { "epoch": 1.8548876745598057, "grad_norm": 0.345703125, "learning_rate": 2.0470058747505516e-07, "loss": 0.4683, "step": 770 }, { "epoch": 1.8573163327261688, "grad_norm": 0.3515625, "learning_rate": 1.969288864271246e-07, "loss": 0.4866, "step": 771 }, { "epoch": 1.8597449908925319, "grad_norm": 0.3515625, "learning_rate": 1.8930611849145131e-07, "loss": 0.4797, "step": 772 }, { "epoch": 1.862173649058895, "grad_norm": 0.345703125, "learning_rate": 1.8183239947690112e-07, "loss": 0.4676, "step": 773 }, { "epoch": 1.864602307225258, "grad_norm": 0.34765625, "learning_rate": 1.7450784292791456e-07, "loss": 0.4668, "step": 774 }, { "epoch": 1.867030965391621, "grad_norm": 0.34375, "learning_rate": 1.6733256012278486e-07, "loss": 0.4742, "step": 775 }, { "epoch": 1.8694596235579843, "grad_norm": 0.34765625, "learning_rate": 1.603066600719605e-07, "loss": 0.4728, "step": 776 }, { "epoch": 1.8718882817243472, "grad_norm": 0.34765625, "learning_rate": 1.5343024951639752e-07, "loss": 0.47, "step": 777 }, { "epoch": 1.8743169398907105, "grad_norm": 0.34765625, "learning_rate": 1.467034329259287e-07, "loss": 0.4656, "step": 778 }, { "epoch": 1.8767455980570733, "grad_norm": 0.3515625, "learning_rate": 1.4012631249768592e-07, "loss": 0.4858, "step": 779 }, { "epoch": 1.8791742562234366, "grad_norm": 0.34765625, "learning_rate": 1.336989881545403e-07, "loss": 0.4646, "step": 780 }, { "epoch": 1.8791742562234366, "eval_loss": 0.5065886974334717, "eval_runtime": 100.9737, "eval_samples_per_second": 29.711, "eval_steps_per_second": 3.714, "step": 780 }, { "epoch": 1.8816029143897997, "grad_norm": 0.349609375, "learning_rate": 1.2742155754358553e-07, "loss": 0.4823, "step": 781 }, { "epoch": 1.8840315725561627, "grad_norm": 0.357421875, "learning_rate": 1.2129411603465924e-07, "loss": 0.4806, "step": 782 }, { "epoch": 1.8864602307225258, "grad_norm": 0.3515625, "learning_rate": 1.1531675671888621e-07, "loss": 0.4909, "step": 783 }, { "epoch": 1.8888888888888888, "grad_norm": 0.345703125, "learning_rate": 1.0948957040727071e-07, "loss": 0.4798, "step": 784 }, { "epoch": 1.8913175470552521, "grad_norm": 0.34375, "learning_rate": 1.0381264562931426e-07, "loss": 0.4667, "step": 785 }, { "epoch": 1.893746205221615, "grad_norm": 0.345703125, "learning_rate": 9.828606863166779e-08, "loss": 0.4703, "step": 786 }, { "epoch": 1.8961748633879782, "grad_norm": 0.349609375, "learning_rate": 9.290992337682936e-08, "loss": 0.4799, "step": 787 }, { "epoch": 1.898603521554341, "grad_norm": 0.35546875, "learning_rate": 8.768429154185853e-08, "loss": 0.478, "step": 788 }, { "epoch": 1.9010321797207044, "grad_norm": 0.349609375, "learning_rate": 8.260925251714514e-08, "loss": 0.4779, "step": 789 }, { "epoch": 1.9034608378870674, "grad_norm": 0.3515625, "learning_rate": 7.768488340519464e-08, "loss": 0.4801, "step": 790 }, { "epoch": 1.9058894960534305, "grad_norm": 0.345703125, "learning_rate": 7.291125901946027e-08, "loss": 0.4716, "step": 791 }, { "epoch": 1.9083181542197936, "grad_norm": 0.349609375, "learning_rate": 6.828845188321054e-08, "loss": 0.4739, "step": 792 }, { "epoch": 1.9107468123861566, "grad_norm": 0.345703125, "learning_rate": 6.381653222842011e-08, "loss": 0.4673, "step": 793 }, { "epoch": 1.9131754705525197, "grad_norm": 0.353515625, "learning_rate": 5.949556799470846e-08, "loss": 0.4853, "step": 794 }, { "epoch": 1.9156041287188827, "grad_norm": 0.515625, "learning_rate": 5.532562482830406e-08, "loss": 0.5203, "step": 795 }, { "epoch": 1.918032786885246, "grad_norm": 0.345703125, "learning_rate": 5.1306766081048456e-08, "loss": 0.4728, "step": 796 }, { "epoch": 1.9204614450516089, "grad_norm": 0.64453125, "learning_rate": 4.743905280943595e-08, "loss": 0.5187, "step": 797 }, { "epoch": 1.9228901032179722, "grad_norm": 0.349609375, "learning_rate": 4.3722543773681016e-08, "loss": 0.4797, "step": 798 }, { "epoch": 1.925318761384335, "grad_norm": 0.345703125, "learning_rate": 4.0157295436830116e-08, "loss": 0.4678, "step": 799 }, { "epoch": 1.9277474195506983, "grad_norm": 0.52734375, "learning_rate": 3.674336196390238e-08, "loss": 0.5106, "step": 800 }, { "epoch": 1.9301760777170613, "grad_norm": 0.349609375, "learning_rate": 3.3480795221066955e-08, "loss": 0.4749, "step": 801 }, { "epoch": 1.9326047358834244, "grad_norm": 0.34765625, "learning_rate": 3.036964477485249e-08, "loss": 0.4735, "step": 802 }, { "epoch": 1.9350333940497875, "grad_norm": 0.353515625, "learning_rate": 2.7409957891397775e-08, "loss": 0.476, "step": 803 }, { "epoch": 1.9374620522161505, "grad_norm": 0.345703125, "learning_rate": 2.4601779535733394e-08, "loss": 0.4695, "step": 804 }, { "epoch": 1.9398907103825138, "grad_norm": 0.34765625, "learning_rate": 2.1945152371094512e-08, "loss": 0.4808, "step": 805 }, { "epoch": 1.9423193685488767, "grad_norm": 0.349609375, "learning_rate": 1.944011675827695e-08, "loss": 0.4719, "step": 806 }, { "epoch": 1.9423193685488767, "eval_loss": 0.5065895318984985, "eval_runtime": 96.8435, "eval_samples_per_second": 30.978, "eval_steps_per_second": 3.872, "step": 806 }, { "epoch": 1.94474802671524, "grad_norm": 0.349609375, "learning_rate": 1.7086710755024327e-08, "loss": 0.4724, "step": 807 }, { "epoch": 1.9471766848816028, "grad_norm": 0.34765625, "learning_rate": 1.4884970115444097e-08, "loss": 0.4684, "step": 808 }, { "epoch": 1.949605343047966, "grad_norm": 0.3515625, "learning_rate": 1.2834928289472415e-08, "loss": 0.4773, "step": 809 }, { "epoch": 1.9520340012143291, "grad_norm": 0.349609375, "learning_rate": 1.0936616422358992e-08, "loss": 0.4767, "step": 810 }, { "epoch": 1.9544626593806922, "grad_norm": 0.34765625, "learning_rate": 9.190063354198586e-09, "loss": 0.4771, "step": 811 }, { "epoch": 1.9568913175470553, "grad_norm": 0.34375, "learning_rate": 7.595295619490239e-09, "loss": 0.4729, "step": 812 }, { "epoch": 1.9593199757134183, "grad_norm": 0.3515625, "learning_rate": 6.152337446736489e-09, "loss": 0.4754, "step": 813 }, { "epoch": 1.9617486338797814, "grad_norm": 0.353515625, "learning_rate": 4.861210758071444e-09, "loss": 0.4906, "step": 814 }, { "epoch": 1.9641772920461444, "grad_norm": 0.345703125, "learning_rate": 3.7219351689310455e-09, "loss": 0.4767, "step": 815 }, { "epoch": 1.9666059502125077, "grad_norm": 0.349609375, "learning_rate": 2.734527987755531e-09, "loss": 0.4862, "step": 816 }, { "epoch": 1.9690346083788706, "grad_norm": 0.345703125, "learning_rate": 1.899004215722977e-09, "loss": 0.4682, "step": 817 }, { "epoch": 1.9714632665452339, "grad_norm": 0.3515625, "learning_rate": 1.2153765465250378e-09, "loss": 0.4798, "step": 818 }, { "epoch": 1.9738919247115967, "grad_norm": 0.34765625, "learning_rate": 6.836553661715429e-10, "loss": 0.4743, "step": 819 }, { "epoch": 1.97632058287796, "grad_norm": 0.353515625, "learning_rate": 3.038487528350675e-10, "loss": 0.4736, "step": 820 }, { "epoch": 1.978749241044323, "grad_norm": 0.341796875, "learning_rate": 7.596247672325696e-11, "loss": 0.4709, "step": 821 }, { "epoch": 1.981177899210686, "grad_norm": 0.345703125, "learning_rate": 0.0, "loss": 0.4669, "step": 822 } ], "logging_steps": 1, "max_steps": 822, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 411, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8283124614508839e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }