{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9975062344139651, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033250207813798837, "grad_norm": 1.1512730121612549, "learning_rate": 2e-05, "loss": 2.3647, "step": 1 }, { "epoch": 0.006650041562759767, "grad_norm": 1.1141036748886108, "learning_rate": 4e-05, "loss": 2.2794, "step": 2 }, { "epoch": 0.00997506234413965, "grad_norm": 0.9610893726348877, "learning_rate": 6e-05, "loss": 2.2596, "step": 3 }, { "epoch": 0.013300083125519535, "grad_norm": 1.1339858770370483, "learning_rate": 8e-05, "loss": 2.3787, "step": 4 }, { "epoch": 0.01662510390689942, "grad_norm": 0.8878076672554016, "learning_rate": 0.0001, "loss": 2.3961, "step": 5 }, { "epoch": 0.0199501246882793, "grad_norm": 0.829910397529602, "learning_rate": 0.00012, "loss": 2.1948, "step": 6 }, { "epoch": 0.023275145469659187, "grad_norm": 0.9420105814933777, "learning_rate": 0.00014, "loss": 2.4329, "step": 7 }, { "epoch": 0.02660016625103907, "grad_norm": 0.8519226908683777, "learning_rate": 0.00016, "loss": 2.3078, "step": 8 }, { "epoch": 0.029925187032418952, "grad_norm": 0.7587653994560242, "learning_rate": 0.00018, "loss": 1.9353, "step": 9 }, { "epoch": 0.03325020781379884, "grad_norm": 0.9927352666854858, "learning_rate": 0.0002, "loss": 1.9429, "step": 10 }, { "epoch": 0.03657522859517872, "grad_norm": 0.9643892049789429, "learning_rate": 0.00019999413227831132, "loss": 2.0925, "step": 11 }, { "epoch": 0.0399002493765586, "grad_norm": 0.941749632358551, "learning_rate": 0.00019997652980184843, "loss": 1.8099, "step": 12 }, { "epoch": 0.043225270157938485, "grad_norm": 0.5177962779998779, "learning_rate": 0.00019994719463633997, "loss": 1.6693, "step": 13 }, { "epoch": 0.046550290939318374, "grad_norm": 0.56168133020401, "learning_rate": 0.0001999061302243977, "loss": 1.9593, "step": 14 }, { "epoch": 0.04987531172069826, "grad_norm": 0.5392152070999146, "learning_rate": 0.00019985334138511237, "loss": 1.6836, "step": 15 }, { "epoch": 0.05320033250207814, "grad_norm": 0.5796711444854736, "learning_rate": 0.00019978883431348845, "loss": 1.7744, "step": 16 }, { "epoch": 0.05652535328345802, "grad_norm": 0.5629785060882568, "learning_rate": 0.0001997126165797167, "loss": 2.0442, "step": 17 }, { "epoch": 0.059850374064837904, "grad_norm": 0.48991289734840393, "learning_rate": 0.00019962469712828614, "loss": 1.679, "step": 18 }, { "epoch": 0.06317539484621779, "grad_norm": 0.47867172956466675, "learning_rate": 0.0001995250862769342, "loss": 1.6641, "step": 19 }, { "epoch": 0.06650041562759768, "grad_norm": 0.49752330780029297, "learning_rate": 0.00019941379571543596, "loss": 1.5331, "step": 20 }, { "epoch": 0.06982543640897755, "grad_norm": 0.49927300214767456, "learning_rate": 0.00019929083850423225, "loss": 1.5704, "step": 21 }, { "epoch": 0.07315045719035744, "grad_norm": 0.5634847283363342, "learning_rate": 0.00019915622907289694, "loss": 1.9051, "step": 22 }, { "epoch": 0.07647547797173733, "grad_norm": 0.5214512944221497, "learning_rate": 0.00019900998321844367, "loss": 1.756, "step": 23 }, { "epoch": 0.0798004987531172, "grad_norm": 0.46316221356391907, "learning_rate": 0.00019885211810347184, "loss": 1.6153, "step": 24 }, { "epoch": 0.0831255195344971, "grad_norm": 0.45869576930999756, "learning_rate": 0.00019868265225415265, "loss": 1.8899, "step": 25 }, { "epoch": 0.08645054031587697, "grad_norm": 0.4824669063091278, "learning_rate": 0.00019850160555805486, "loss": 1.8861, "step": 26 }, { "epoch": 0.08977556109725686, "grad_norm": 0.509224534034729, "learning_rate": 0.000198308999261811, "loss": 1.8507, "step": 27 }, { "epoch": 0.09310058187863675, "grad_norm": 0.4441746771335602, "learning_rate": 0.00019810485596862392, "loss": 1.7326, "step": 28 }, { "epoch": 0.09642560266001662, "grad_norm": 0.4595758318901062, "learning_rate": 0.00019788919963561422, "loss": 1.8283, "step": 29 }, { "epoch": 0.09975062344139651, "grad_norm": 0.5222824215888977, "learning_rate": 0.00019766205557100868, "loss": 1.5678, "step": 30 }, { "epoch": 0.10307564422277639, "grad_norm": 0.43890196084976196, "learning_rate": 0.00019742345043117045, "loss": 1.5899, "step": 31 }, { "epoch": 0.10640066500415628, "grad_norm": 0.4542831778526306, "learning_rate": 0.00019717341221747056, "loss": 1.6733, "step": 32 }, { "epoch": 0.10972568578553615, "grad_norm": 0.43134549260139465, "learning_rate": 0.00019691197027300205, "loss": 1.7386, "step": 33 }, { "epoch": 0.11305070656691604, "grad_norm": 0.44071701169013977, "learning_rate": 0.00019663915527913625, "loss": 1.7685, "step": 34 }, { "epoch": 0.11637572734829593, "grad_norm": 0.4880881607532501, "learning_rate": 0.0001963549992519223, "loss": 1.8461, "step": 35 }, { "epoch": 0.11970074812967581, "grad_norm": 0.40884578227996826, "learning_rate": 0.00019605953553832988, "loss": 1.5538, "step": 36 }, { "epoch": 0.1230257689110557, "grad_norm": 0.39413318037986755, "learning_rate": 0.00019575279881233577, "loss": 1.4222, "step": 37 }, { "epoch": 0.12635078969243557, "grad_norm": 0.44478997588157654, "learning_rate": 0.00019543482507085482, "loss": 1.7247, "step": 38 }, { "epoch": 0.12967581047381546, "grad_norm": 0.4295913875102997, "learning_rate": 0.00019510565162951537, "loss": 1.5788, "step": 39 }, { "epoch": 0.13300083125519535, "grad_norm": 0.47360050678253174, "learning_rate": 0.00019476531711828027, "loss": 1.7429, "step": 40 }, { "epoch": 0.13632585203657524, "grad_norm": 0.483909547328949, "learning_rate": 0.00019441386147691335, "loss": 1.6674, "step": 41 }, { "epoch": 0.1396508728179551, "grad_norm": 0.47071558237075806, "learning_rate": 0.0001940513259502924, "loss": 1.8229, "step": 42 }, { "epoch": 0.142975893599335, "grad_norm": 0.43929168581962585, "learning_rate": 0.0001936777530835689, "loss": 1.6562, "step": 43 }, { "epoch": 0.14630091438071488, "grad_norm": 0.4329998791217804, "learning_rate": 0.0001932931867171751, "loss": 1.5274, "step": 44 }, { "epoch": 0.14962593516209477, "grad_norm": 0.44375908374786377, "learning_rate": 0.00019289767198167916, "loss": 1.7084, "step": 45 }, { "epoch": 0.15295095594347466, "grad_norm": 0.48119276762008667, "learning_rate": 0.0001924912552924889, "loss": 1.7645, "step": 46 }, { "epoch": 0.15627597672485452, "grad_norm": 0.4040566384792328, "learning_rate": 0.00019207398434440478, "loss": 1.5925, "step": 47 }, { "epoch": 0.1596009975062344, "grad_norm": 0.4708506464958191, "learning_rate": 0.00019164590810602262, "loss": 1.8461, "step": 48 }, { "epoch": 0.1629260182876143, "grad_norm": 0.431772381067276, "learning_rate": 0.000191207076813987, "loss": 1.5356, "step": 49 }, { "epoch": 0.1662510390689942, "grad_norm": 0.4952054023742676, "learning_rate": 0.00019075754196709572, "loss": 1.8034, "step": 50 }, { "epoch": 0.16957605985037408, "grad_norm": 0.43522897362709045, "learning_rate": 0.00019029735632025618, "loss": 1.6717, "step": 51 }, { "epoch": 0.17290108063175394, "grad_norm": 0.46861544251441956, "learning_rate": 0.00018982657387829445, "loss": 1.766, "step": 52 }, { "epoch": 0.17622610141313383, "grad_norm": 0.44363775849342346, "learning_rate": 0.00018934524988961738, "loss": 1.5169, "step": 53 }, { "epoch": 0.17955112219451372, "grad_norm": 0.41366782784461975, "learning_rate": 0.00018885344083972914, "loss": 1.6495, "step": 54 }, { "epoch": 0.1828761429758936, "grad_norm": 0.4273390769958496, "learning_rate": 0.0001883512044446023, "loss": 1.5952, "step": 55 }, { "epoch": 0.1862011637572735, "grad_norm": 0.4389772117137909, "learning_rate": 0.00018783859964390464, "loss": 1.7003, "step": 56 }, { "epoch": 0.18952618453865336, "grad_norm": 0.480125367641449, "learning_rate": 0.0001873156865940823, "loss": 1.6503, "step": 57 }, { "epoch": 0.19285120532003325, "grad_norm": 0.48973348736763, "learning_rate": 0.00018678252666130013, "loss": 1.737, "step": 58 }, { "epoch": 0.19617622610141314, "grad_norm": 0.4558335840702057, "learning_rate": 0.0001862391824142402, "loss": 1.571, "step": 59 }, { "epoch": 0.19950124688279303, "grad_norm": 0.45777326822280884, "learning_rate": 0.00018568571761675893, "loss": 1.6462, "step": 60 }, { "epoch": 0.2028262676641729, "grad_norm": 0.4185212254524231, "learning_rate": 0.00018512219722040425, "loss": 1.5729, "step": 61 }, { "epoch": 0.20615128844555278, "grad_norm": 0.4137243330478668, "learning_rate": 0.0001845486873567932, "loss": 1.675, "step": 62 }, { "epoch": 0.20947630922693267, "grad_norm": 0.42468297481536865, "learning_rate": 0.00018396525532985108, "loss": 1.4519, "step": 63 }, { "epoch": 0.21280133000831256, "grad_norm": 0.46751776337623596, "learning_rate": 0.00018337196960791302, "loss": 1.7264, "step": 64 }, { "epoch": 0.21612635078969245, "grad_norm": 0.47722429037094116, "learning_rate": 0.00018276889981568906, "loss": 1.5392, "step": 65 }, { "epoch": 0.2194513715710723, "grad_norm": 0.4753107726573944, "learning_rate": 0.00018215611672609317, "loss": 1.5328, "step": 66 }, { "epoch": 0.2227763923524522, "grad_norm": 0.4401816129684448, "learning_rate": 0.00018153369225193782, "loss": 1.4793, "step": 67 }, { "epoch": 0.22610141313383209, "grad_norm": 0.4473712146282196, "learning_rate": 0.00018090169943749476, "loss": 1.5596, "step": 68 }, { "epoch": 0.22942643391521197, "grad_norm": 0.45505204796791077, "learning_rate": 0.00018026021244992287, "loss": 1.7437, "step": 69 }, { "epoch": 0.23275145469659186, "grad_norm": 0.44190192222595215, "learning_rate": 0.00017960930657056438, "loss": 1.7401, "step": 70 }, { "epoch": 0.23607647547797173, "grad_norm": 0.501592218875885, "learning_rate": 0.0001789490581861102, "loss": 1.7464, "step": 71 }, { "epoch": 0.23940149625935161, "grad_norm": 0.43836328387260437, "learning_rate": 0.00017827954477963557, "loss": 1.7451, "step": 72 }, { "epoch": 0.2427265170407315, "grad_norm": 0.611949622631073, "learning_rate": 0.0001776008449215073, "loss": 1.6921, "step": 73 }, { "epoch": 0.2460515378221114, "grad_norm": 0.46015432476997375, "learning_rate": 0.0001769130382601629, "loss": 1.7985, "step": 74 }, { "epoch": 0.24937655860349128, "grad_norm": 0.44316309690475464, "learning_rate": 0.00017621620551276366, "loss": 1.7806, "step": 75 }, { "epoch": 0.25270157938487114, "grad_norm": 0.4749353229999542, "learning_rate": 0.00017551042845572208, "loss": 1.7349, "step": 76 }, { "epoch": 0.25602660016625106, "grad_norm": 0.4712280333042145, "learning_rate": 0.00017479578991510506, "loss": 1.4129, "step": 77 }, { "epoch": 0.2593516209476309, "grad_norm": 0.44466859102249146, "learning_rate": 0.00017407237375691392, "loss": 1.6819, "step": 78 }, { "epoch": 0.2626766417290108, "grad_norm": 0.42531418800354004, "learning_rate": 0.00017334026487724225, "loss": 1.6154, "step": 79 }, { "epoch": 0.2660016625103907, "grad_norm": 0.4512370228767395, "learning_rate": 0.0001725995491923131, "loss": 1.6736, "step": 80 }, { "epoch": 0.26932668329177056, "grad_norm": 0.4131537079811096, "learning_rate": 0.00017185031362839626, "loss": 1.5468, "step": 81 }, { "epoch": 0.2726517040731505, "grad_norm": 0.47616103291511536, "learning_rate": 0.00017109264611160708, "loss": 1.523, "step": 82 }, { "epoch": 0.27597672485453034, "grad_norm": 0.4459686279296875, "learning_rate": 0.000170326635557588, "loss": 1.8612, "step": 83 }, { "epoch": 0.2793017456359102, "grad_norm": 0.4500899612903595, "learning_rate": 0.00016955237186107387, "loss": 1.643, "step": 84 }, { "epoch": 0.2826267664172901, "grad_norm": 0.44385287165641785, "learning_rate": 0.00016876994588534234, "loss": 1.3833, "step": 85 }, { "epoch": 0.28595178719867, "grad_norm": 0.4063577950000763, "learning_rate": 0.0001679794494515508, "loss": 1.3494, "step": 86 }, { "epoch": 0.2892768079800499, "grad_norm": 0.43013447523117065, "learning_rate": 0.00016718097532796063, "loss": 1.5205, "step": 87 }, { "epoch": 0.29260182876142976, "grad_norm": 0.46770158410072327, "learning_rate": 0.00016637461721905045, "loss": 1.6897, "step": 88 }, { "epoch": 0.2959268495428096, "grad_norm": 0.4841039478778839, "learning_rate": 0.00016556046975451963, "loss": 1.5793, "step": 89 }, { "epoch": 0.29925187032418954, "grad_norm": 0.48426705598831177, "learning_rate": 0.00016473862847818277, "loss": 1.6988, "step": 90 }, { "epoch": 0.3025768911055694, "grad_norm": 0.5768110752105713, "learning_rate": 0.0001639091898367576, "loss": 1.7846, "step": 91 }, { "epoch": 0.3059019118869493, "grad_norm": 0.446196049451828, "learning_rate": 0.00016307225116854622, "loss": 1.7882, "step": 92 }, { "epoch": 0.3092269326683292, "grad_norm": 0.4034564793109894, "learning_rate": 0.00016222791069201207, "loss": 1.6616, "step": 93 }, { "epoch": 0.31255195344970904, "grad_norm": 0.424376517534256, "learning_rate": 0.00016137626749425377, "loss": 1.5353, "step": 94 }, { "epoch": 0.31587697423108896, "grad_norm": 0.45510077476501465, "learning_rate": 0.00016051742151937655, "loss": 1.7947, "step": 95 }, { "epoch": 0.3192019950124688, "grad_norm": 0.4815070331096649, "learning_rate": 0.00015965147355676343, "loss": 1.581, "step": 96 }, { "epoch": 0.32252701579384874, "grad_norm": 0.4505084156990051, "learning_rate": 0.00015877852522924732, "loss": 1.6186, "step": 97 }, { "epoch": 0.3258520365752286, "grad_norm": 0.4437141418457031, "learning_rate": 0.0001578986789811849, "loss": 1.6509, "step": 98 }, { "epoch": 0.32917705735660846, "grad_norm": 0.4133874475955963, "learning_rate": 0.00015701203806643433, "loss": 1.7992, "step": 99 }, { "epoch": 0.3325020781379884, "grad_norm": 0.4500593841075897, "learning_rate": 0.00015611870653623825, "loss": 1.6654, "step": 100 }, { "epoch": 0.33582709891936824, "grad_norm": 0.4359726309776306, "learning_rate": 0.00015521878922701246, "loss": 1.6461, "step": 101 }, { "epoch": 0.33915211970074816, "grad_norm": 0.40108025074005127, "learning_rate": 0.00015431239174804328, "loss": 1.5237, "step": 102 }, { "epoch": 0.342477140482128, "grad_norm": 0.43869125843048096, "learning_rate": 0.00015339962046909364, "loss": 1.6909, "step": 103 }, { "epoch": 0.3458021612635079, "grad_norm": 0.42006051540374756, "learning_rate": 0.00015248058250792008, "loss": 1.5046, "step": 104 }, { "epoch": 0.3491271820448878, "grad_norm": 0.38756394386291504, "learning_rate": 0.00015155538571770218, "loss": 1.3747, "step": 105 }, { "epoch": 0.35245220282626766, "grad_norm": 0.47784286737442017, "learning_rate": 0.0001506241386743854, "loss": 1.673, "step": 106 }, { "epoch": 0.3557772236076476, "grad_norm": 0.4587322175502777, "learning_rate": 0.00014968695066393923, "loss": 1.7987, "step": 107 }, { "epoch": 0.35910224438902744, "grad_norm": 0.42091092467308044, "learning_rate": 0.00014874393166953192, "loss": 1.5309, "step": 108 }, { "epoch": 0.3624272651704073, "grad_norm": 0.47224530577659607, "learning_rate": 0.00014779519235862365, "loss": 1.7268, "step": 109 }, { "epoch": 0.3657522859517872, "grad_norm": 0.44596192240715027, "learning_rate": 0.00014684084406997903, "loss": 1.7108, "step": 110 }, { "epoch": 0.3690773067331671, "grad_norm": 0.4590005874633789, "learning_rate": 0.0001458809988006011, "loss": 1.638, "step": 111 }, { "epoch": 0.372402327514547, "grad_norm": 0.43627721071243286, "learning_rate": 0.00014491576919258792, "loss": 1.6721, "step": 112 }, { "epoch": 0.37572734829592686, "grad_norm": 0.41456034779548645, "learning_rate": 0.00014394526851991364, "loss": 1.6863, "step": 113 }, { "epoch": 0.3790523690773067, "grad_norm": 0.4247894883155823, "learning_rate": 0.0001429696106751352, "loss": 1.5659, "step": 114 }, { "epoch": 0.38237738985868663, "grad_norm": 0.4657272696495056, "learning_rate": 0.00014198891015602646, "loss": 1.4086, "step": 115 }, { "epoch": 0.3857024106400665, "grad_norm": 0.4860394597053528, "learning_rate": 0.0001410032820521416, "loss": 1.4603, "step": 116 }, { "epoch": 0.38902743142144636, "grad_norm": 0.41849544644355774, "learning_rate": 0.00014001284203130868, "loss": 1.3991, "step": 117 }, { "epoch": 0.3923524522028263, "grad_norm": 0.4544629752635956, "learning_rate": 0.00013901770632605547, "loss": 1.8028, "step": 118 }, { "epoch": 0.39567747298420614, "grad_norm": 0.5051787495613098, "learning_rate": 0.0001380179917199692, "loss": 1.8854, "step": 119 }, { "epoch": 0.39900249376558605, "grad_norm": 0.41150030493736267, "learning_rate": 0.00013701381553399145, "loss": 1.6686, "step": 120 }, { "epoch": 0.4023275145469659, "grad_norm": 0.4593510925769806, "learning_rate": 0.0001360052956126499, "loss": 1.5844, "step": 121 }, { "epoch": 0.4056525353283458, "grad_norm": 0.42087090015411377, "learning_rate": 0.00013499255031022885, "loss": 1.4865, "step": 122 }, { "epoch": 0.4089775561097257, "grad_norm": 0.4708739221096039, "learning_rate": 0.00013397569847687984, "loss": 1.7089, "step": 123 }, { "epoch": 0.41230257689110555, "grad_norm": 0.4878352880477905, "learning_rate": 0.00013295485944467405, "loss": 1.8006, "step": 124 }, { "epoch": 0.41562759767248547, "grad_norm": 0.43254002928733826, "learning_rate": 0.000131930153013598, "loss": 1.6949, "step": 125 }, { "epoch": 0.41895261845386533, "grad_norm": 0.47519850730895996, "learning_rate": 0.00013090169943749476, "loss": 1.7601, "step": 126 }, { "epoch": 0.4222776392352452, "grad_norm": 0.4135800898075104, "learning_rate": 0.00012986961940995138, "loss": 1.5955, "step": 127 }, { "epoch": 0.4256026600166251, "grad_norm": 0.46267929673194885, "learning_rate": 0.0001288340340501351, "loss": 1.8398, "step": 128 }, { "epoch": 0.428927680798005, "grad_norm": 0.43891721963882446, "learning_rate": 0.00012779506488857945, "loss": 1.4741, "step": 129 }, { "epoch": 0.4322527015793849, "grad_norm": 0.4456429183483124, "learning_rate": 0.00012675283385292212, "loss": 1.7454, "step": 130 }, { "epoch": 0.43557772236076475, "grad_norm": 0.4604743719100952, "learning_rate": 0.00012570746325359607, "loss": 1.8192, "step": 131 }, { "epoch": 0.4389027431421446, "grad_norm": 0.46728062629699707, "learning_rate": 0.00012465907576947622, "loss": 1.7551, "step": 132 }, { "epoch": 0.44222776392352453, "grad_norm": 0.436298668384552, "learning_rate": 0.000123607794433482, "loss": 1.6592, "step": 133 }, { "epoch": 0.4455527847049044, "grad_norm": 0.39828214049339294, "learning_rate": 0.00012255374261813944, "loss": 1.4603, "step": 134 }, { "epoch": 0.4488778054862843, "grad_norm": 0.4469813406467438, "learning_rate": 0.00012149704402110243, "loss": 1.6449, "step": 135 }, { "epoch": 0.45220282626766417, "grad_norm": 0.4820503294467926, "learning_rate": 0.0001204378226506365, "loss": 1.8473, "step": 136 }, { "epoch": 0.45552784704904403, "grad_norm": 0.49072131514549255, "learning_rate": 0.00011937620281106585, "loss": 1.6843, "step": 137 }, { "epoch": 0.45885286783042395, "grad_norm": 0.48773854970932007, "learning_rate": 0.00011831230908818563, "loss": 1.625, "step": 138 }, { "epoch": 0.4621778886118038, "grad_norm": 0.4438723623752594, "learning_rate": 0.00011724626633464127, "loss": 1.7558, "step": 139 }, { "epoch": 0.46550290939318373, "grad_norm": 0.4389275014400482, "learning_rate": 0.0001161781996552765, "loss": 1.4621, "step": 140 }, { "epoch": 0.4688279301745636, "grad_norm": 0.4611305296421051, "learning_rate": 0.00011510823439245169, "loss": 1.59, "step": 141 }, { "epoch": 0.47215295095594345, "grad_norm": 0.43601059913635254, "learning_rate": 0.00011403649611133444, "loss": 1.7462, "step": 142 }, { "epoch": 0.47547797173732337, "grad_norm": 0.41201236844062805, "learning_rate": 0.00011296311058516389, "loss": 1.5341, "step": 143 }, { "epoch": 0.47880299251870323, "grad_norm": 0.46523982286453247, "learning_rate": 0.00011188820378049065, "loss": 1.6327, "step": 144 }, { "epoch": 0.48212801330008315, "grad_norm": 0.42490893602371216, "learning_rate": 0.00011081190184239419, "loss": 1.6178, "step": 145 }, { "epoch": 0.485453034081463, "grad_norm": 0.42238375544548035, "learning_rate": 0.00010973433107967902, "loss": 1.534, "step": 146 }, { "epoch": 0.48877805486284287, "grad_norm": 0.48569226264953613, "learning_rate": 0.00010865561795005177, "loss": 1.5332, "step": 147 }, { "epoch": 0.4921030756442228, "grad_norm": 0.4933275878429413, "learning_rate": 0.00010757588904528106, "loss": 1.5928, "step": 148 }, { "epoch": 0.49542809642560265, "grad_norm": 0.4781058728694916, "learning_rate": 0.00010649527107634108, "loss": 1.6578, "step": 149 }, { "epoch": 0.49875311720698257, "grad_norm": 0.4651820659637451, "learning_rate": 0.00010541389085854176, "loss": 1.6884, "step": 150 }, { "epoch": 0.5020781379883624, "grad_norm": 0.4429711103439331, "learning_rate": 0.00010433187529664623, "loss": 1.6723, "step": 151 }, { "epoch": 0.5054031587697423, "grad_norm": 0.4521614611148834, "learning_rate": 0.00010324935136997806, "loss": 1.6269, "step": 152 }, { "epoch": 0.5087281795511222, "grad_norm": 0.4930736720561981, "learning_rate": 0.00010216644611751975, "loss": 1.7933, "step": 153 }, { "epoch": 0.5120532003325021, "grad_norm": 0.4855606257915497, "learning_rate": 0.000101083286623004, "loss": 1.6702, "step": 154 }, { "epoch": 0.515378221113882, "grad_norm": 0.4960128366947174, "learning_rate": 0.0001, "loss": 1.7428, "step": 155 }, { "epoch": 0.5187032418952618, "grad_norm": 0.42107459902763367, "learning_rate": 9.891671337699602e-05, "loss": 1.6235, "step": 156 }, { "epoch": 0.5220282626766417, "grad_norm": 0.4479861855506897, "learning_rate": 9.783355388248027e-05, "loss": 1.5158, "step": 157 }, { "epoch": 0.5253532834580216, "grad_norm": 0.4954458177089691, "learning_rate": 9.675064863002196e-05, "loss": 1.6743, "step": 158 }, { "epoch": 0.5286783042394015, "grad_norm": 0.5591014623641968, "learning_rate": 9.56681247033538e-05, "loss": 1.9691, "step": 159 }, { "epoch": 0.5320033250207814, "grad_norm": 0.46626871824264526, "learning_rate": 9.458610914145826e-05, "loss": 1.5621, "step": 160 }, { "epoch": 0.5353283458021613, "grad_norm": 0.4377134144306183, "learning_rate": 9.350472892365892e-05, "loss": 1.5524, "step": 161 }, { "epoch": 0.5386533665835411, "grad_norm": 0.3984418511390686, "learning_rate": 9.242411095471897e-05, "loss": 1.6454, "step": 162 }, { "epoch": 0.541978387364921, "grad_norm": 0.42802637815475464, "learning_rate": 9.134438204994824e-05, "loss": 1.4036, "step": 163 }, { "epoch": 0.545303408146301, "grad_norm": 0.4567003846168518, "learning_rate": 9.026566892032105e-05, "loss": 1.6606, "step": 164 }, { "epoch": 0.5486284289276808, "grad_norm": 0.45452797412872314, "learning_rate": 8.918809815760585e-05, "loss": 1.8219, "step": 165 }, { "epoch": 0.5519534497090607, "grad_norm": 0.4367886781692505, "learning_rate": 8.811179621950936e-05, "loss": 1.5962, "step": 166 }, { "epoch": 0.5552784704904405, "grad_norm": 0.4670146703720093, "learning_rate": 8.703688941483616e-05, "loss": 1.6382, "step": 167 }, { "epoch": 0.5586034912718204, "grad_norm": 0.5069778561592102, "learning_rate": 8.596350388866558e-05, "loss": 1.7067, "step": 168 }, { "epoch": 0.5619285120532004, "grad_norm": 0.4080033302307129, "learning_rate": 8.489176560754834e-05, "loss": 1.4192, "step": 169 }, { "epoch": 0.5652535328345802, "grad_norm": 0.491526335477829, "learning_rate": 8.382180034472353e-05, "loss": 1.8687, "step": 170 }, { "epoch": 0.5685785536159601, "grad_norm": 0.5429246425628662, "learning_rate": 8.275373366535877e-05, "loss": 1.776, "step": 171 }, { "epoch": 0.57190357439734, "grad_norm": 0.4131667912006378, "learning_rate": 8.168769091181438e-05, "loss": 1.3345, "step": 172 }, { "epoch": 0.5752285951787198, "grad_norm": 0.5055519342422485, "learning_rate": 8.062379718893417e-05, "loss": 1.7716, "step": 173 }, { "epoch": 0.5785536159600998, "grad_norm": 0.4675292670726776, "learning_rate": 7.956217734936353e-05, "loss": 1.5941, "step": 174 }, { "epoch": 0.5818786367414797, "grad_norm": 0.5096448659896851, "learning_rate": 7.85029559788976e-05, "loss": 1.9376, "step": 175 }, { "epoch": 0.5852036575228595, "grad_norm": 0.4687637686729431, "learning_rate": 7.744625738186059e-05, "loss": 1.7242, "step": 176 }, { "epoch": 0.5885286783042394, "grad_norm": 0.437148779630661, "learning_rate": 7.639220556651799e-05, "loss": 1.4993, "step": 177 }, { "epoch": 0.5918536990856192, "grad_norm": 0.44125625491142273, "learning_rate": 7.534092423052381e-05, "loss": 1.5076, "step": 178 }, { "epoch": 0.5951787198669992, "grad_norm": 0.4794883131980896, "learning_rate": 7.42925367464039e-05, "loss": 1.6401, "step": 179 }, { "epoch": 0.5985037406483791, "grad_norm": 0.42347967624664307, "learning_rate": 7.324716614707793e-05, "loss": 1.444, "step": 180 }, { "epoch": 0.6018287614297589, "grad_norm": 0.4843563437461853, "learning_rate": 7.220493511142059e-05, "loss": 1.7117, "step": 181 }, { "epoch": 0.6051537822111388, "grad_norm": 0.48885542154312134, "learning_rate": 7.116596594986494e-05, "loss": 1.6799, "step": 182 }, { "epoch": 0.6084788029925187, "grad_norm": 0.48835498094558716, "learning_rate": 7.013038059004866e-05, "loss": 1.7308, "step": 183 }, { "epoch": 0.6118038237738986, "grad_norm": 0.38506001234054565, "learning_rate": 6.909830056250527e-05, "loss": 1.5766, "step": 184 }, { "epoch": 0.6151288445552785, "grad_norm": 0.5520392656326294, "learning_rate": 6.806984698640202e-05, "loss": 1.5418, "step": 185 }, { "epoch": 0.6184538653366584, "grad_norm": 0.4401935935020447, "learning_rate": 6.704514055532597e-05, "loss": 1.7715, "step": 186 }, { "epoch": 0.6217788861180382, "grad_norm": 0.4164566695690155, "learning_rate": 6.602430152312017e-05, "loss": 1.4711, "step": 187 }, { "epoch": 0.6251039068994181, "grad_norm": 0.4750818610191345, "learning_rate": 6.500744968977116e-05, "loss": 1.374, "step": 188 }, { "epoch": 0.628428927680798, "grad_norm": 0.5478043556213379, "learning_rate": 6.399470438735014e-05, "loss": 1.7294, "step": 189 }, { "epoch": 0.6317539484621779, "grad_norm": 0.4560893476009369, "learning_rate": 6.298618446600856e-05, "loss": 1.8216, "step": 190 }, { "epoch": 0.6350789692435578, "grad_norm": 0.49942511320114136, "learning_rate": 6.19820082800308e-05, "loss": 1.6108, "step": 191 }, { "epoch": 0.6384039900249376, "grad_norm": 0.3901759088039398, "learning_rate": 6.0982293673944544e-05, "loss": 1.4635, "step": 192 }, { "epoch": 0.6417290108063175, "grad_norm": 0.45033466815948486, "learning_rate": 5.9987157968691344e-05, "loss": 1.5153, "step": 193 }, { "epoch": 0.6450540315876975, "grad_norm": 0.44514134526252747, "learning_rate": 5.899671794785839e-05, "loss": 1.6015, "step": 194 }, { "epoch": 0.6483790523690773, "grad_norm": 0.42773956060409546, "learning_rate": 5.801108984397354e-05, "loss": 1.6624, "step": 195 }, { "epoch": 0.6517040731504572, "grad_norm": 0.42323529720306396, "learning_rate": 5.703038932486484e-05, "loss": 1.642, "step": 196 }, { "epoch": 0.6550290939318371, "grad_norm": 0.4852340519428253, "learning_rate": 5.605473148008638e-05, "loss": 1.5533, "step": 197 }, { "epoch": 0.6583541147132169, "grad_norm": 0.46353092789649963, "learning_rate": 5.5084230807412126e-05, "loss": 1.5137, "step": 198 }, { "epoch": 0.6616791354945969, "grad_norm": 0.5486162304878235, "learning_rate": 5.411900119939895e-05, "loss": 1.5682, "step": 199 }, { "epoch": 0.6650041562759768, "grad_norm": 0.4136289656162262, "learning_rate": 5.3159155930021e-05, "loss": 1.5902, "step": 200 }, { "epoch": 0.6683291770573566, "grad_norm": 0.457292765378952, "learning_rate": 5.2204807641376354e-05, "loss": 1.6669, "step": 201 }, { "epoch": 0.6716541978387365, "grad_norm": 0.4368407726287842, "learning_rate": 5.12560683304681e-05, "loss": 1.7747, "step": 202 }, { "epoch": 0.6749792186201163, "grad_norm": 0.4596605598926544, "learning_rate": 5.03130493360608e-05, "loss": 1.5868, "step": 203 }, { "epoch": 0.6783042394014963, "grad_norm": 0.437491238117218, "learning_rate": 4.9375861325614606e-05, "loss": 1.7614, "step": 204 }, { "epoch": 0.6816292601828762, "grad_norm": 0.47249388694763184, "learning_rate": 4.844461428229782e-05, "loss": 1.582, "step": 205 }, { "epoch": 0.684954280964256, "grad_norm": 0.44100067019462585, "learning_rate": 4.751941749207995e-05, "loss": 1.6814, "step": 206 }, { "epoch": 0.6882793017456359, "grad_norm": 0.5000886917114258, "learning_rate": 4.660037953090639e-05, "loss": 1.6634, "step": 207 }, { "epoch": 0.6916043225270158, "grad_norm": 0.4667086899280548, "learning_rate": 4.5687608251956714e-05, "loss": 1.7767, "step": 208 }, { "epoch": 0.6949293433083957, "grad_norm": 0.4677750766277313, "learning_rate": 4.4781210772987514e-05, "loss": 1.785, "step": 209 }, { "epoch": 0.6982543640897756, "grad_norm": 0.40729814767837524, "learning_rate": 4.388129346376178e-05, "loss": 1.5742, "step": 210 }, { "epoch": 0.7015793848711555, "grad_norm": 0.4622965157032013, "learning_rate": 4.298796193356566e-05, "loss": 1.755, "step": 211 }, { "epoch": 0.7049044056525353, "grad_norm": 0.42128920555114746, "learning_rate": 4.210132101881516e-05, "loss": 1.359, "step": 212 }, { "epoch": 0.7082294264339152, "grad_norm": 0.4670293927192688, "learning_rate": 4.12214747707527e-05, "loss": 1.8743, "step": 213 }, { "epoch": 0.7115544472152951, "grad_norm": 0.474398136138916, "learning_rate": 4.034852644323661e-05, "loss": 1.6977, "step": 214 }, { "epoch": 0.714879467996675, "grad_norm": 0.5026089549064636, "learning_rate": 3.948257848062351e-05, "loss": 1.566, "step": 215 }, { "epoch": 0.7182044887780549, "grad_norm": 0.40603697299957275, "learning_rate": 3.862373250574626e-05, "loss": 1.3894, "step": 216 }, { "epoch": 0.7215295095594347, "grad_norm": 0.4771779179573059, "learning_rate": 3.7772089307987936e-05, "loss": 1.6296, "step": 217 }, { "epoch": 0.7248545303408146, "grad_norm": 0.44347891211509705, "learning_rate": 3.6927748831453836e-05, "loss": 1.6663, "step": 218 }, { "epoch": 0.7281795511221946, "grad_norm": 0.435149610042572, "learning_rate": 3.609081016324243e-05, "loss": 1.6662, "step": 219 }, { "epoch": 0.7315045719035744, "grad_norm": 0.453782856464386, "learning_rate": 3.5261371521817244e-05, "loss": 1.7286, "step": 220 }, { "epoch": 0.7348295926849543, "grad_norm": 0.42496827244758606, "learning_rate": 3.44395302454804e-05, "loss": 1.7376, "step": 221 }, { "epoch": 0.7381546134663342, "grad_norm": 0.45447835326194763, "learning_rate": 3.3625382780949574e-05, "loss": 1.5055, "step": 222 }, { "epoch": 0.741479634247714, "grad_norm": 0.5035948157310486, "learning_rate": 3.28190246720394e-05, "loss": 1.866, "step": 223 }, { "epoch": 0.744804655029094, "grad_norm": 0.47680604457855225, "learning_rate": 3.202055054844921e-05, "loss": 1.9692, "step": 224 }, { "epoch": 0.7481296758104738, "grad_norm": 0.45373663306236267, "learning_rate": 3.123005411465766e-05, "loss": 1.6879, "step": 225 }, { "epoch": 0.7514546965918537, "grad_norm": 0.49925628304481506, "learning_rate": 3.0447628138926156e-05, "loss": 1.5313, "step": 226 }, { "epoch": 0.7547797173732336, "grad_norm": 0.4820810556411743, "learning_rate": 2.9673364442412e-05, "loss": 1.6259, "step": 227 }, { "epoch": 0.7581047381546134, "grad_norm": 0.5111257433891296, "learning_rate": 2.890735388839295e-05, "loss": 1.6068, "step": 228 }, { "epoch": 0.7614297589359933, "grad_norm": 0.3893967568874359, "learning_rate": 2.8149686371603767e-05, "loss": 1.5461, "step": 229 }, { "epoch": 0.7647547797173733, "grad_norm": 0.42585450410842896, "learning_rate": 2.7400450807686938e-05, "loss": 1.4092, "step": 230 }, { "epoch": 0.7680798004987531, "grad_norm": 0.5068459510803223, "learning_rate": 2.665973512275778e-05, "loss": 1.8426, "step": 231 }, { "epoch": 0.771404821280133, "grad_norm": 0.44372087717056274, "learning_rate": 2.59276262430861e-05, "loss": 1.5669, "step": 232 }, { "epoch": 0.7747298420615129, "grad_norm": 0.4483433663845062, "learning_rate": 2.520421008489494e-05, "loss": 1.508, "step": 233 }, { "epoch": 0.7780548628428927, "grad_norm": 0.4225240647792816, "learning_rate": 2.4489571544277945e-05, "loss": 1.4963, "step": 234 }, { "epoch": 0.7813798836242727, "grad_norm": 0.4540765583515167, "learning_rate": 2.3783794487236365e-05, "loss": 1.7699, "step": 235 }, { "epoch": 0.7847049044056525, "grad_norm": 0.5303469896316528, "learning_rate": 2.308696173983711e-05, "loss": 1.7887, "step": 236 }, { "epoch": 0.7880299251870324, "grad_norm": 0.4368319809436798, "learning_rate": 2.2399155078492694e-05, "loss": 1.6762, "step": 237 }, { "epoch": 0.7913549459684123, "grad_norm": 0.41934987902641296, "learning_rate": 2.1720455220364444e-05, "loss": 1.6372, "step": 238 }, { "epoch": 0.7946799667497921, "grad_norm": 0.4291558861732483, "learning_rate": 2.1050941813889836e-05, "loss": 1.6668, "step": 239 }, { "epoch": 0.7980049875311721, "grad_norm": 0.414044052362442, "learning_rate": 2.0390693429435627e-05, "loss": 1.6885, "step": 240 }, { "epoch": 0.801330008312552, "grad_norm": 0.4342755377292633, "learning_rate": 1.9739787550077116e-05, "loss": 1.5082, "step": 241 }, { "epoch": 0.8046550290939318, "grad_norm": 0.45343807339668274, "learning_rate": 1.9098300562505266e-05, "loss": 1.5635, "step": 242 }, { "epoch": 0.8079800498753117, "grad_norm": 0.4498422145843506, "learning_rate": 1.8466307748062205e-05, "loss": 1.6047, "step": 243 }, { "epoch": 0.8113050706566916, "grad_norm": 0.4087926149368286, "learning_rate": 1.784388327390687e-05, "loss": 1.3402, "step": 244 }, { "epoch": 0.8146300914380715, "grad_norm": 0.42908143997192383, "learning_rate": 1.7231100184310956e-05, "loss": 1.5664, "step": 245 }, { "epoch": 0.8179551122194514, "grad_norm": 0.4820065200328827, "learning_rate": 1.6628030392087e-05, "loss": 1.7218, "step": 246 }, { "epoch": 0.8212801330008312, "grad_norm": 0.4803646206855774, "learning_rate": 1.6034744670148972e-05, "loss": 1.837, "step": 247 }, { "epoch": 0.8246051537822111, "grad_norm": 0.4350035786628723, "learning_rate": 1.5451312643206827e-05, "loss": 1.5924, "step": 248 }, { "epoch": 0.827930174563591, "grad_norm": 0.49933725595474243, "learning_rate": 1.4877802779595762e-05, "loss": 1.6023, "step": 249 }, { "epoch": 0.8312551953449709, "grad_norm": 0.44506213068962097, "learning_rate": 1.4314282383241096e-05, "loss": 1.4533, "step": 250 }, { "epoch": 0.8345802161263508, "grad_norm": 0.46771377325057983, "learning_rate": 1.376081758575981e-05, "loss": 1.7391, "step": 251 }, { "epoch": 0.8379052369077307, "grad_norm": 0.44328737258911133, "learning_rate": 1.3217473338699859e-05, "loss": 1.6868, "step": 252 }, { "epoch": 0.8412302576891105, "grad_norm": 0.4481683373451233, "learning_rate": 1.2684313405917703e-05, "loss": 1.4394, "step": 253 }, { "epoch": 0.8445552784704904, "grad_norm": 0.452848881483078, "learning_rate": 1.2161400356095375e-05, "loss": 1.6657, "step": 254 }, { "epoch": 0.8478802992518704, "grad_norm": 0.42388778924942017, "learning_rate": 1.1648795555397719e-05, "loss": 1.459, "step": 255 }, { "epoch": 0.8512053200332502, "grad_norm": 0.43063634634017944, "learning_rate": 1.1146559160270875e-05, "loss": 1.6652, "step": 256 }, { "epoch": 0.8545303408146301, "grad_norm": 0.40587228536605835, "learning_rate": 1.0654750110382628e-05, "loss": 1.5131, "step": 257 }, { "epoch": 0.85785536159601, "grad_norm": 0.4573078751564026, "learning_rate": 1.0173426121705576e-05, "loss": 1.6047, "step": 258 }, { "epoch": 0.8611803823773898, "grad_norm": 0.4255686104297638, "learning_rate": 9.702643679743817e-06, "loss": 1.493, "step": 259 }, { "epoch": 0.8645054031587698, "grad_norm": 0.48064589500427246, "learning_rate": 9.242458032904311e-06, "loss": 1.6691, "step": 260 }, { "epoch": 0.8678304239401496, "grad_norm": 0.4468303620815277, "learning_rate": 8.792923186013024e-06, "loss": 1.5707, "step": 261 }, { "epoch": 0.8711554447215295, "grad_norm": 0.4417254328727722, "learning_rate": 8.354091893977401e-06, "loss": 1.5591, "step": 262 }, { "epoch": 0.8744804655029094, "grad_norm": 0.42065221071243286, "learning_rate": 7.926015655595254e-06, "loss": 1.5657, "step": 263 }, { "epoch": 0.8778054862842892, "grad_norm": 0.3902848958969116, "learning_rate": 7.508744707511117e-06, "loss": 1.5445, "step": 264 }, { "epoch": 0.8811305070656692, "grad_norm": 0.41993579268455505, "learning_rate": 7.102328018320858e-06, "loss": 1.4065, "step": 265 }, { "epoch": 0.8844555278470491, "grad_norm": 0.4170606732368469, "learning_rate": 6.70681328282492e-06, "loss": 1.5117, "step": 266 }, { "epoch": 0.8877805486284289, "grad_norm": 0.513680636882782, "learning_rate": 6.322246916431107e-06, "loss": 1.9662, "step": 267 }, { "epoch": 0.8911055694098088, "grad_norm": 0.43288302421569824, "learning_rate": 5.948674049707603e-06, "loss": 1.6208, "step": 268 }, { "epoch": 0.8944305901911886, "grad_norm": 0.38253968954086304, "learning_rate": 5.58613852308667e-06, "loss": 1.5302, "step": 269 }, { "epoch": 0.8977556109725686, "grad_norm": 0.4266990125179291, "learning_rate": 5.2346828817197655e-06, "loss": 1.6815, "step": 270 }, { "epoch": 0.9010806317539485, "grad_norm": 0.561107873916626, "learning_rate": 4.8943483704846475e-06, "loss": 1.6608, "step": 271 }, { "epoch": 0.9044056525353283, "grad_norm": 0.41741943359375, "learning_rate": 4.565174929145188e-06, "loss": 1.2898, "step": 272 }, { "epoch": 0.9077306733167082, "grad_norm": 0.39722350239753723, "learning_rate": 4.247201187664218e-06, "loss": 1.585, "step": 273 }, { "epoch": 0.9110556940980881, "grad_norm": 0.41877254843711853, "learning_rate": 3.940464461670135e-06, "loss": 1.605, "step": 274 }, { "epoch": 0.914380714879468, "grad_norm": 0.5125119090080261, "learning_rate": 3.6450007480777093e-06, "loss": 1.5922, "step": 275 }, { "epoch": 0.9177057356608479, "grad_norm": 0.43189626932144165, "learning_rate": 3.360844720863765e-06, "loss": 1.559, "step": 276 }, { "epoch": 0.9210307564422278, "grad_norm": 0.44040048122406006, "learning_rate": 3.0880297269979653e-06, "loss": 1.67, "step": 277 }, { "epoch": 0.9243557772236076, "grad_norm": 0.5034830570220947, "learning_rate": 2.826587782529444e-06, "loss": 1.9225, "step": 278 }, { "epoch": 0.9276807980049875, "grad_norm": 0.4373987019062042, "learning_rate": 2.576549568829578e-06, "loss": 1.7428, "step": 279 }, { "epoch": 0.9310058187863675, "grad_norm": 0.45258763432502747, "learning_rate": 2.3379444289913342e-06, "loss": 1.5951, "step": 280 }, { "epoch": 0.9343308395677473, "grad_norm": 0.4411347210407257, "learning_rate": 2.110800364385812e-06, "loss": 1.5906, "step": 281 }, { "epoch": 0.9376558603491272, "grad_norm": 0.4530499577522278, "learning_rate": 1.8951440313760837e-06, "loss": 1.4591, "step": 282 }, { "epoch": 0.940980881130507, "grad_norm": 0.484295129776001, "learning_rate": 1.6910007381890081e-06, "loss": 1.6808, "step": 283 }, { "epoch": 0.9443059019118869, "grad_norm": 0.43871116638183594, "learning_rate": 1.4983944419451613e-06, "loss": 1.5378, "step": 284 }, { "epoch": 0.9476309226932669, "grad_norm": 0.4450673460960388, "learning_rate": 1.317347745847386e-06, "loss": 1.6353, "step": 285 }, { "epoch": 0.9509559434746467, "grad_norm": 0.4276074171066284, "learning_rate": 1.1478818965281911e-06, "loss": 1.5403, "step": 286 }, { "epoch": 0.9542809642560266, "grad_norm": 0.46902570128440857, "learning_rate": 9.900167815563465e-07, "loss": 1.5077, "step": 287 }, { "epoch": 0.9576059850374065, "grad_norm": 0.42395344376564026, "learning_rate": 8.437709271030603e-07, "loss": 1.4276, "step": 288 }, { "epoch": 0.9609310058187863, "grad_norm": 0.45644354820251465, "learning_rate": 7.091614957677517e-07, "loss": 1.6846, "step": 289 }, { "epoch": 0.9642560266001663, "grad_norm": 0.5007442831993103, "learning_rate": 5.862042845640403e-07, "loss": 1.8228, "step": 290 }, { "epoch": 0.9675810473815462, "grad_norm": 0.4374752640724182, "learning_rate": 4.7491372306580627e-07, "loss": 1.6439, "step": 291 }, { "epoch": 0.970906068162926, "grad_norm": 0.42748919129371643, "learning_rate": 3.7530287171387843e-07, "loss": 1.5968, "step": 292 }, { "epoch": 0.9742310889443059, "grad_norm": 0.3897336721420288, "learning_rate": 2.873834202833159e-07, "loss": 1.5657, "step": 293 }, { "epoch": 0.9775561097256857, "grad_norm": 0.5000090599060059, "learning_rate": 2.1116568651156076e-07, "loss": 1.6331, "step": 294 }, { "epoch": 0.9808811305070657, "grad_norm": 0.4116688668727875, "learning_rate": 1.4665861488761813e-07, "loss": 1.349, "step": 295 }, { "epoch": 0.9842061512884456, "grad_norm": 0.45309168100357056, "learning_rate": 9.386977560232879e-08, "loss": 1.5287, "step": 296 }, { "epoch": 0.9875311720698254, "grad_norm": 0.4364264905452728, "learning_rate": 5.2805363660046734e-08, "loss": 1.5738, "step": 297 }, { "epoch": 0.9908561928512053, "grad_norm": 0.4273874759674072, "learning_rate": 2.347019815158724e-08, "loss": 1.585, "step": 298 }, { "epoch": 0.9941812136325852, "grad_norm": 0.42349839210510254, "learning_rate": 5.867721688690431e-09, "loss": 1.6562, "step": 299 }, { "epoch": 0.9975062344139651, "grad_norm": 0.46075183153152466, "learning_rate": 0.0, "loss": 1.592, "step": 300 }, { "epoch": 0.9975062344139651, "eval_loss": 1.6082537174224854, "eval_runtime": 15.2561, "eval_samples_per_second": 33.233, "eval_steps_per_second": 4.195, "step": 300 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1990397395992576.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }