{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9975062344139651,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0033250207813798837,
      "grad_norm": 1.1512730121612549,
      "learning_rate": 2e-05,
      "loss": 2.3647,
      "step": 1
    },
    {
      "epoch": 0.006650041562759767,
      "grad_norm": 1.1141036748886108,
      "learning_rate": 4e-05,
      "loss": 2.2794,
      "step": 2
    },
    {
      "epoch": 0.00997506234413965,
      "grad_norm": 0.9610893726348877,
      "learning_rate": 6e-05,
      "loss": 2.2596,
      "step": 3
    },
    {
      "epoch": 0.013300083125519535,
      "grad_norm": 1.1339858770370483,
      "learning_rate": 8e-05,
      "loss": 2.3787,
      "step": 4
    },
    {
      "epoch": 0.01662510390689942,
      "grad_norm": 0.8878076672554016,
      "learning_rate": 0.0001,
      "loss": 2.3961,
      "step": 5
    },
    {
      "epoch": 0.0199501246882793,
      "grad_norm": 0.829910397529602,
      "learning_rate": 0.00012,
      "loss": 2.1948,
      "step": 6
    },
    {
      "epoch": 0.023275145469659187,
      "grad_norm": 0.9420105814933777,
      "learning_rate": 0.00014,
      "loss": 2.4329,
      "step": 7
    },
    {
      "epoch": 0.02660016625103907,
      "grad_norm": 0.8519226908683777,
      "learning_rate": 0.00016,
      "loss": 2.3078,
      "step": 8
    },
    {
      "epoch": 0.029925187032418952,
      "grad_norm": 0.7587653994560242,
      "learning_rate": 0.00018,
      "loss": 1.9353,
      "step": 9
    },
    {
      "epoch": 0.03325020781379884,
      "grad_norm": 0.9927352666854858,
      "learning_rate": 0.0002,
      "loss": 1.9429,
      "step": 10
    },
    {
      "epoch": 0.03657522859517872,
      "grad_norm": 0.9643892049789429,
      "learning_rate": 0.00019999413227831132,
      "loss": 2.0925,
      "step": 11
    },
    {
      "epoch": 0.0399002493765586,
      "grad_norm": 0.941749632358551,
      "learning_rate": 0.00019997652980184843,
      "loss": 1.8099,
      "step": 12
    },
    {
      "epoch": 0.043225270157938485,
      "grad_norm": 0.5177962779998779,
      "learning_rate": 0.00019994719463633997,
      "loss": 1.6693,
      "step": 13
    },
    {
      "epoch": 0.046550290939318374,
      "grad_norm": 0.56168133020401,
      "learning_rate": 0.0001999061302243977,
      "loss": 1.9593,
      "step": 14
    },
    {
      "epoch": 0.04987531172069826,
      "grad_norm": 0.5392152070999146,
      "learning_rate": 0.00019985334138511237,
      "loss": 1.6836,
      "step": 15
    },
    {
      "epoch": 0.05320033250207814,
      "grad_norm": 0.5796711444854736,
      "learning_rate": 0.00019978883431348845,
      "loss": 1.7744,
      "step": 16
    },
    {
      "epoch": 0.05652535328345802,
      "grad_norm": 0.5629785060882568,
      "learning_rate": 0.0001997126165797167,
      "loss": 2.0442,
      "step": 17
    },
    {
      "epoch": 0.059850374064837904,
      "grad_norm": 0.48991289734840393,
      "learning_rate": 0.00019962469712828614,
      "loss": 1.679,
      "step": 18
    },
    {
      "epoch": 0.06317539484621779,
      "grad_norm": 0.47867172956466675,
      "learning_rate": 0.0001995250862769342,
      "loss": 1.6641,
      "step": 19
    },
    {
      "epoch": 0.06650041562759768,
      "grad_norm": 0.49752330780029297,
      "learning_rate": 0.00019941379571543596,
      "loss": 1.5331,
      "step": 20
    },
    {
      "epoch": 0.06982543640897755,
      "grad_norm": 0.49927300214767456,
      "learning_rate": 0.00019929083850423225,
      "loss": 1.5704,
      "step": 21
    },
    {
      "epoch": 0.07315045719035744,
      "grad_norm": 0.5634847283363342,
      "learning_rate": 0.00019915622907289694,
      "loss": 1.9051,
      "step": 22
    },
    {
      "epoch": 0.07647547797173733,
      "grad_norm": 0.5214512944221497,
      "learning_rate": 0.00019900998321844367,
      "loss": 1.756,
      "step": 23
    },
    {
      "epoch": 0.0798004987531172,
      "grad_norm": 0.46316221356391907,
      "learning_rate": 0.00019885211810347184,
      "loss": 1.6153,
      "step": 24
    },
    {
      "epoch": 0.0831255195344971,
      "grad_norm": 0.45869576930999756,
      "learning_rate": 0.00019868265225415265,
      "loss": 1.8899,
      "step": 25
    },
    {
      "epoch": 0.08645054031587697,
      "grad_norm": 0.4824669063091278,
      "learning_rate": 0.00019850160555805486,
      "loss": 1.8861,
      "step": 26
    },
    {
      "epoch": 0.08977556109725686,
      "grad_norm": 0.509224534034729,
      "learning_rate": 0.000198308999261811,
      "loss": 1.8507,
      "step": 27
    },
    {
      "epoch": 0.09310058187863675,
      "grad_norm": 0.4441746771335602,
      "learning_rate": 0.00019810485596862392,
      "loss": 1.7326,
      "step": 28
    },
    {
      "epoch": 0.09642560266001662,
      "grad_norm": 0.4595758318901062,
      "learning_rate": 0.00019788919963561422,
      "loss": 1.8283,
      "step": 29
    },
    {
      "epoch": 0.09975062344139651,
      "grad_norm": 0.5222824215888977,
      "learning_rate": 0.00019766205557100868,
      "loss": 1.5678,
      "step": 30
    },
    {
      "epoch": 0.10307564422277639,
      "grad_norm": 0.43890196084976196,
      "learning_rate": 0.00019742345043117045,
      "loss": 1.5899,
      "step": 31
    },
    {
      "epoch": 0.10640066500415628,
      "grad_norm": 0.4542831778526306,
      "learning_rate": 0.00019717341221747056,
      "loss": 1.6733,
      "step": 32
    },
    {
      "epoch": 0.10972568578553615,
      "grad_norm": 0.43134549260139465,
      "learning_rate": 0.00019691197027300205,
      "loss": 1.7386,
      "step": 33
    },
    {
      "epoch": 0.11305070656691604,
      "grad_norm": 0.44071701169013977,
      "learning_rate": 0.00019663915527913625,
      "loss": 1.7685,
      "step": 34
    },
    {
      "epoch": 0.11637572734829593,
      "grad_norm": 0.4880881607532501,
      "learning_rate": 0.0001963549992519223,
      "loss": 1.8461,
      "step": 35
    },
    {
      "epoch": 0.11970074812967581,
      "grad_norm": 0.40884578227996826,
      "learning_rate": 0.00019605953553832988,
      "loss": 1.5538,
      "step": 36
    },
    {
      "epoch": 0.1230257689110557,
      "grad_norm": 0.39413318037986755,
      "learning_rate": 0.00019575279881233577,
      "loss": 1.4222,
      "step": 37
    },
    {
      "epoch": 0.12635078969243557,
      "grad_norm": 0.44478997588157654,
      "learning_rate": 0.00019543482507085482,
      "loss": 1.7247,
      "step": 38
    },
    {
      "epoch": 0.12967581047381546,
      "grad_norm": 0.4295913875102997,
      "learning_rate": 0.00019510565162951537,
      "loss": 1.5788,
      "step": 39
    },
    {
      "epoch": 0.13300083125519535,
      "grad_norm": 0.47360050678253174,
      "learning_rate": 0.00019476531711828027,
      "loss": 1.7429,
      "step": 40
    },
    {
      "epoch": 0.13632585203657524,
      "grad_norm": 0.483909547328949,
      "learning_rate": 0.00019441386147691335,
      "loss": 1.6674,
      "step": 41
    },
    {
      "epoch": 0.1396508728179551,
      "grad_norm": 0.47071558237075806,
      "learning_rate": 0.0001940513259502924,
      "loss": 1.8229,
      "step": 42
    },
    {
      "epoch": 0.142975893599335,
      "grad_norm": 0.43929168581962585,
      "learning_rate": 0.0001936777530835689,
      "loss": 1.6562,
      "step": 43
    },
    {
      "epoch": 0.14630091438071488,
      "grad_norm": 0.4329998791217804,
      "learning_rate": 0.0001932931867171751,
      "loss": 1.5274,
      "step": 44
    },
    {
      "epoch": 0.14962593516209477,
      "grad_norm": 0.44375908374786377,
      "learning_rate": 0.00019289767198167916,
      "loss": 1.7084,
      "step": 45
    },
    {
      "epoch": 0.15295095594347466,
      "grad_norm": 0.48119276762008667,
      "learning_rate": 0.0001924912552924889,
      "loss": 1.7645,
      "step": 46
    },
    {
      "epoch": 0.15627597672485452,
      "grad_norm": 0.4040566384792328,
      "learning_rate": 0.00019207398434440478,
      "loss": 1.5925,
      "step": 47
    },
    {
      "epoch": 0.1596009975062344,
      "grad_norm": 0.4708506464958191,
      "learning_rate": 0.00019164590810602262,
      "loss": 1.8461,
      "step": 48
    },
    {
      "epoch": 0.1629260182876143,
      "grad_norm": 0.431772381067276,
      "learning_rate": 0.000191207076813987,
      "loss": 1.5356,
      "step": 49
    },
    {
      "epoch": 0.1662510390689942,
      "grad_norm": 0.4952054023742676,
      "learning_rate": 0.00019075754196709572,
      "loss": 1.8034,
      "step": 50
    },
    {
      "epoch": 0.16957605985037408,
      "grad_norm": 0.43522897362709045,
      "learning_rate": 0.00019029735632025618,
      "loss": 1.6717,
      "step": 51
    },
    {
      "epoch": 0.17290108063175394,
      "grad_norm": 0.46861544251441956,
      "learning_rate": 0.00018982657387829445,
      "loss": 1.766,
      "step": 52
    },
    {
      "epoch": 0.17622610141313383,
      "grad_norm": 0.44363775849342346,
      "learning_rate": 0.00018934524988961738,
      "loss": 1.5169,
      "step": 53
    },
    {
      "epoch": 0.17955112219451372,
      "grad_norm": 0.41366782784461975,
      "learning_rate": 0.00018885344083972914,
      "loss": 1.6495,
      "step": 54
    },
    {
      "epoch": 0.1828761429758936,
      "grad_norm": 0.4273390769958496,
      "learning_rate": 0.0001883512044446023,
      "loss": 1.5952,
      "step": 55
    },
    {
      "epoch": 0.1862011637572735,
      "grad_norm": 0.4389772117137909,
      "learning_rate": 0.00018783859964390464,
      "loss": 1.7003,
      "step": 56
    },
    {
      "epoch": 0.18952618453865336,
      "grad_norm": 0.480125367641449,
      "learning_rate": 0.0001873156865940823,
      "loss": 1.6503,
      "step": 57
    },
    {
      "epoch": 0.19285120532003325,
      "grad_norm": 0.48973348736763,
      "learning_rate": 0.00018678252666130013,
      "loss": 1.737,
      "step": 58
    },
    {
      "epoch": 0.19617622610141314,
      "grad_norm": 0.4558335840702057,
      "learning_rate": 0.0001862391824142402,
      "loss": 1.571,
      "step": 59
    },
    {
      "epoch": 0.19950124688279303,
      "grad_norm": 0.45777326822280884,
      "learning_rate": 0.00018568571761675893,
      "loss": 1.6462,
      "step": 60
    },
    {
      "epoch": 0.2028262676641729,
      "grad_norm": 0.4185212254524231,
      "learning_rate": 0.00018512219722040425,
      "loss": 1.5729,
      "step": 61
    },
    {
      "epoch": 0.20615128844555278,
      "grad_norm": 0.4137243330478668,
      "learning_rate": 0.0001845486873567932,
      "loss": 1.675,
      "step": 62
    },
    {
      "epoch": 0.20947630922693267,
      "grad_norm": 0.42468297481536865,
      "learning_rate": 0.00018396525532985108,
      "loss": 1.4519,
      "step": 63
    },
    {
      "epoch": 0.21280133000831256,
      "grad_norm": 0.46751776337623596,
      "learning_rate": 0.00018337196960791302,
      "loss": 1.7264,
      "step": 64
    },
    {
      "epoch": 0.21612635078969245,
      "grad_norm": 0.47722429037094116,
      "learning_rate": 0.00018276889981568906,
      "loss": 1.5392,
      "step": 65
    },
    {
      "epoch": 0.2194513715710723,
      "grad_norm": 0.4753107726573944,
      "learning_rate": 0.00018215611672609317,
      "loss": 1.5328,
      "step": 66
    },
    {
      "epoch": 0.2227763923524522,
      "grad_norm": 0.4401816129684448,
      "learning_rate": 0.00018153369225193782,
      "loss": 1.4793,
      "step": 67
    },
    {
      "epoch": 0.22610141313383209,
      "grad_norm": 0.4473712146282196,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.5596,
      "step": 68
    },
    {
      "epoch": 0.22942643391521197,
      "grad_norm": 0.45505204796791077,
      "learning_rate": 0.00018026021244992287,
      "loss": 1.7437,
      "step": 69
    },
    {
      "epoch": 0.23275145469659186,
      "grad_norm": 0.44190192222595215,
      "learning_rate": 0.00017960930657056438,
      "loss": 1.7401,
      "step": 70
    },
    {
      "epoch": 0.23607647547797173,
      "grad_norm": 0.501592218875885,
      "learning_rate": 0.0001789490581861102,
      "loss": 1.7464,
      "step": 71
    },
    {
      "epoch": 0.23940149625935161,
      "grad_norm": 0.43836328387260437,
      "learning_rate": 0.00017827954477963557,
      "loss": 1.7451,
      "step": 72
    },
    {
      "epoch": 0.2427265170407315,
      "grad_norm": 0.611949622631073,
      "learning_rate": 0.0001776008449215073,
      "loss": 1.6921,
      "step": 73
    },
    {
      "epoch": 0.2460515378221114,
      "grad_norm": 0.46015432476997375,
      "learning_rate": 0.0001769130382601629,
      "loss": 1.7985,
      "step": 74
    },
    {
      "epoch": 0.24937655860349128,
      "grad_norm": 0.44316309690475464,
      "learning_rate": 0.00017621620551276366,
      "loss": 1.7806,
      "step": 75
    },
    {
      "epoch": 0.25270157938487114,
      "grad_norm": 0.4749353229999542,
      "learning_rate": 0.00017551042845572208,
      "loss": 1.7349,
      "step": 76
    },
    {
      "epoch": 0.25602660016625106,
      "grad_norm": 0.4712280333042145,
      "learning_rate": 0.00017479578991510506,
      "loss": 1.4129,
      "step": 77
    },
    {
      "epoch": 0.2593516209476309,
      "grad_norm": 0.44466859102249146,
      "learning_rate": 0.00017407237375691392,
      "loss": 1.6819,
      "step": 78
    },
    {
      "epoch": 0.2626766417290108,
      "grad_norm": 0.42531418800354004,
      "learning_rate": 0.00017334026487724225,
      "loss": 1.6154,
      "step": 79
    },
    {
      "epoch": 0.2660016625103907,
      "grad_norm": 0.4512370228767395,
      "learning_rate": 0.0001725995491923131,
      "loss": 1.6736,
      "step": 80
    },
    {
      "epoch": 0.26932668329177056,
      "grad_norm": 0.4131537079811096,
      "learning_rate": 0.00017185031362839626,
      "loss": 1.5468,
      "step": 81
    },
    {
      "epoch": 0.2726517040731505,
      "grad_norm": 0.47616103291511536,
      "learning_rate": 0.00017109264611160708,
      "loss": 1.523,
      "step": 82
    },
    {
      "epoch": 0.27597672485453034,
      "grad_norm": 0.4459686279296875,
      "learning_rate": 0.000170326635557588,
      "loss": 1.8612,
      "step": 83
    },
    {
      "epoch": 0.2793017456359102,
      "grad_norm": 0.4500899612903595,
      "learning_rate": 0.00016955237186107387,
      "loss": 1.643,
      "step": 84
    },
    {
      "epoch": 0.2826267664172901,
      "grad_norm": 0.44385287165641785,
      "learning_rate": 0.00016876994588534234,
      "loss": 1.3833,
      "step": 85
    },
    {
      "epoch": 0.28595178719867,
      "grad_norm": 0.4063577950000763,
      "learning_rate": 0.0001679794494515508,
      "loss": 1.3494,
      "step": 86
    },
    {
      "epoch": 0.2892768079800499,
      "grad_norm": 0.43013447523117065,
      "learning_rate": 0.00016718097532796063,
      "loss": 1.5205,
      "step": 87
    },
    {
      "epoch": 0.29260182876142976,
      "grad_norm": 0.46770158410072327,
      "learning_rate": 0.00016637461721905045,
      "loss": 1.6897,
      "step": 88
    },
    {
      "epoch": 0.2959268495428096,
      "grad_norm": 0.4841039478778839,
      "learning_rate": 0.00016556046975451963,
      "loss": 1.5793,
      "step": 89
    },
    {
      "epoch": 0.29925187032418954,
      "grad_norm": 0.48426705598831177,
      "learning_rate": 0.00016473862847818277,
      "loss": 1.6988,
      "step": 90
    },
    {
      "epoch": 0.3025768911055694,
      "grad_norm": 0.5768110752105713,
      "learning_rate": 0.0001639091898367576,
      "loss": 1.7846,
      "step": 91
    },
    {
      "epoch": 0.3059019118869493,
      "grad_norm": 0.446196049451828,
      "learning_rate": 0.00016307225116854622,
      "loss": 1.7882,
      "step": 92
    },
    {
      "epoch": 0.3092269326683292,
      "grad_norm": 0.4034564793109894,
      "learning_rate": 0.00016222791069201207,
      "loss": 1.6616,
      "step": 93
    },
    {
      "epoch": 0.31255195344970904,
      "grad_norm": 0.424376517534256,
      "learning_rate": 0.00016137626749425377,
      "loss": 1.5353,
      "step": 94
    },
    {
      "epoch": 0.31587697423108896,
      "grad_norm": 0.45510077476501465,
      "learning_rate": 0.00016051742151937655,
      "loss": 1.7947,
      "step": 95
    },
    {
      "epoch": 0.3192019950124688,
      "grad_norm": 0.4815070331096649,
      "learning_rate": 0.00015965147355676343,
      "loss": 1.581,
      "step": 96
    },
    {
      "epoch": 0.32252701579384874,
      "grad_norm": 0.4505084156990051,
      "learning_rate": 0.00015877852522924732,
      "loss": 1.6186,
      "step": 97
    },
    {
      "epoch": 0.3258520365752286,
      "grad_norm": 0.4437141418457031,
      "learning_rate": 0.0001578986789811849,
      "loss": 1.6509,
      "step": 98
    },
    {
      "epoch": 0.32917705735660846,
      "grad_norm": 0.4133874475955963,
      "learning_rate": 0.00015701203806643433,
      "loss": 1.7992,
      "step": 99
    },
    {
      "epoch": 0.3325020781379884,
      "grad_norm": 0.4500593841075897,
      "learning_rate": 0.00015611870653623825,
      "loss": 1.6654,
      "step": 100
    },
    {
      "epoch": 0.33582709891936824,
      "grad_norm": 0.4359726309776306,
      "learning_rate": 0.00015521878922701246,
      "loss": 1.6461,
      "step": 101
    },
    {
      "epoch": 0.33915211970074816,
      "grad_norm": 0.40108025074005127,
      "learning_rate": 0.00015431239174804328,
      "loss": 1.5237,
      "step": 102
    },
    {
      "epoch": 0.342477140482128,
      "grad_norm": 0.43869125843048096,
      "learning_rate": 0.00015339962046909364,
      "loss": 1.6909,
      "step": 103
    },
    {
      "epoch": 0.3458021612635079,
      "grad_norm": 0.42006051540374756,
      "learning_rate": 0.00015248058250792008,
      "loss": 1.5046,
      "step": 104
    },
    {
      "epoch": 0.3491271820448878,
      "grad_norm": 0.38756394386291504,
      "learning_rate": 0.00015155538571770218,
      "loss": 1.3747,
      "step": 105
    },
    {
      "epoch": 0.35245220282626766,
      "grad_norm": 0.47784286737442017,
      "learning_rate": 0.0001506241386743854,
      "loss": 1.673,
      "step": 106
    },
    {
      "epoch": 0.3557772236076476,
      "grad_norm": 0.4587322175502777,
      "learning_rate": 0.00014968695066393923,
      "loss": 1.7987,
      "step": 107
    },
    {
      "epoch": 0.35910224438902744,
      "grad_norm": 0.42091092467308044,
      "learning_rate": 0.00014874393166953192,
      "loss": 1.5309,
      "step": 108
    },
    {
      "epoch": 0.3624272651704073,
      "grad_norm": 0.47224530577659607,
      "learning_rate": 0.00014779519235862365,
      "loss": 1.7268,
      "step": 109
    },
    {
      "epoch": 0.3657522859517872,
      "grad_norm": 0.44596192240715027,
      "learning_rate": 0.00014684084406997903,
      "loss": 1.7108,
      "step": 110
    },
    {
      "epoch": 0.3690773067331671,
      "grad_norm": 0.4590005874633789,
      "learning_rate": 0.0001458809988006011,
      "loss": 1.638,
      "step": 111
    },
    {
      "epoch": 0.372402327514547,
      "grad_norm": 0.43627721071243286,
      "learning_rate": 0.00014491576919258792,
      "loss": 1.6721,
      "step": 112
    },
    {
      "epoch": 0.37572734829592686,
      "grad_norm": 0.41456034779548645,
      "learning_rate": 0.00014394526851991364,
      "loss": 1.6863,
      "step": 113
    },
    {
      "epoch": 0.3790523690773067,
      "grad_norm": 0.4247894883155823,
      "learning_rate": 0.0001429696106751352,
      "loss": 1.5659,
      "step": 114
    },
    {
      "epoch": 0.38237738985868663,
      "grad_norm": 0.4657272696495056,
      "learning_rate": 0.00014198891015602646,
      "loss": 1.4086,
      "step": 115
    },
    {
      "epoch": 0.3857024106400665,
      "grad_norm": 0.4860394597053528,
      "learning_rate": 0.0001410032820521416,
      "loss": 1.4603,
      "step": 116
    },
    {
      "epoch": 0.38902743142144636,
      "grad_norm": 0.41849544644355774,
      "learning_rate": 0.00014001284203130868,
      "loss": 1.3991,
      "step": 117
    },
    {
      "epoch": 0.3923524522028263,
      "grad_norm": 0.4544629752635956,
      "learning_rate": 0.00013901770632605547,
      "loss": 1.8028,
      "step": 118
    },
    {
      "epoch": 0.39567747298420614,
      "grad_norm": 0.5051787495613098,
      "learning_rate": 0.0001380179917199692,
      "loss": 1.8854,
      "step": 119
    },
    {
      "epoch": 0.39900249376558605,
      "grad_norm": 0.41150030493736267,
      "learning_rate": 0.00013701381553399145,
      "loss": 1.6686,
      "step": 120
    },
    {
      "epoch": 0.4023275145469659,
      "grad_norm": 0.4593510925769806,
      "learning_rate": 0.0001360052956126499,
      "loss": 1.5844,
      "step": 121
    },
    {
      "epoch": 0.4056525353283458,
      "grad_norm": 0.42087090015411377,
      "learning_rate": 0.00013499255031022885,
      "loss": 1.4865,
      "step": 122
    },
    {
      "epoch": 0.4089775561097257,
      "grad_norm": 0.4708739221096039,
      "learning_rate": 0.00013397569847687984,
      "loss": 1.7089,
      "step": 123
    },
    {
      "epoch": 0.41230257689110555,
      "grad_norm": 0.4878352880477905,
      "learning_rate": 0.00013295485944467405,
      "loss": 1.8006,
      "step": 124
    },
    {
      "epoch": 0.41562759767248547,
      "grad_norm": 0.43254002928733826,
      "learning_rate": 0.000131930153013598,
      "loss": 1.6949,
      "step": 125
    },
    {
      "epoch": 0.41895261845386533,
      "grad_norm": 0.47519850730895996,
      "learning_rate": 0.00013090169943749476,
      "loss": 1.7601,
      "step": 126
    },
    {
      "epoch": 0.4222776392352452,
      "grad_norm": 0.4135800898075104,
      "learning_rate": 0.00012986961940995138,
      "loss": 1.5955,
      "step": 127
    },
    {
      "epoch": 0.4256026600166251,
      "grad_norm": 0.46267929673194885,
      "learning_rate": 0.0001288340340501351,
      "loss": 1.8398,
      "step": 128
    },
    {
      "epoch": 0.428927680798005,
      "grad_norm": 0.43891721963882446,
      "learning_rate": 0.00012779506488857945,
      "loss": 1.4741,
      "step": 129
    },
    {
      "epoch": 0.4322527015793849,
      "grad_norm": 0.4456429183483124,
      "learning_rate": 0.00012675283385292212,
      "loss": 1.7454,
      "step": 130
    },
    {
      "epoch": 0.43557772236076475,
      "grad_norm": 0.4604743719100952,
      "learning_rate": 0.00012570746325359607,
      "loss": 1.8192,
      "step": 131
    },
    {
      "epoch": 0.4389027431421446,
      "grad_norm": 0.46728062629699707,
      "learning_rate": 0.00012465907576947622,
      "loss": 1.7551,
      "step": 132
    },
    {
      "epoch": 0.44222776392352453,
      "grad_norm": 0.436298668384552,
      "learning_rate": 0.000123607794433482,
      "loss": 1.6592,
      "step": 133
    },
    {
      "epoch": 0.4455527847049044,
      "grad_norm": 0.39828214049339294,
      "learning_rate": 0.00012255374261813944,
      "loss": 1.4603,
      "step": 134
    },
    {
      "epoch": 0.4488778054862843,
      "grad_norm": 0.4469813406467438,
      "learning_rate": 0.00012149704402110243,
      "loss": 1.6449,
      "step": 135
    },
    {
      "epoch": 0.45220282626766417,
      "grad_norm": 0.4820503294467926,
      "learning_rate": 0.0001204378226506365,
      "loss": 1.8473,
      "step": 136
    },
    {
      "epoch": 0.45552784704904403,
      "grad_norm": 0.49072131514549255,
      "learning_rate": 0.00011937620281106585,
      "loss": 1.6843,
      "step": 137
    },
    {
      "epoch": 0.45885286783042395,
      "grad_norm": 0.48773854970932007,
      "learning_rate": 0.00011831230908818563,
      "loss": 1.625,
      "step": 138
    },
    {
      "epoch": 0.4621778886118038,
      "grad_norm": 0.4438723623752594,
      "learning_rate": 0.00011724626633464127,
      "loss": 1.7558,
      "step": 139
    },
    {
      "epoch": 0.46550290939318373,
      "grad_norm": 0.4389275014400482,
      "learning_rate": 0.0001161781996552765,
      "loss": 1.4621,
      "step": 140
    },
    {
      "epoch": 0.4688279301745636,
      "grad_norm": 0.4611305296421051,
      "learning_rate": 0.00011510823439245169,
      "loss": 1.59,
      "step": 141
    },
    {
      "epoch": 0.47215295095594345,
      "grad_norm": 0.43601059913635254,
      "learning_rate": 0.00011403649611133444,
      "loss": 1.7462,
      "step": 142
    },
    {
      "epoch": 0.47547797173732337,
      "grad_norm": 0.41201236844062805,
      "learning_rate": 0.00011296311058516389,
      "loss": 1.5341,
      "step": 143
    },
    {
      "epoch": 0.47880299251870323,
      "grad_norm": 0.46523982286453247,
      "learning_rate": 0.00011188820378049065,
      "loss": 1.6327,
      "step": 144
    },
    {
      "epoch": 0.48212801330008315,
      "grad_norm": 0.42490893602371216,
      "learning_rate": 0.00011081190184239419,
      "loss": 1.6178,
      "step": 145
    },
    {
      "epoch": 0.485453034081463,
      "grad_norm": 0.42238375544548035,
      "learning_rate": 0.00010973433107967902,
      "loss": 1.534,
      "step": 146
    },
    {
      "epoch": 0.48877805486284287,
      "grad_norm": 0.48569226264953613,
      "learning_rate": 0.00010865561795005177,
      "loss": 1.5332,
      "step": 147
    },
    {
      "epoch": 0.4921030756442228,
      "grad_norm": 0.4933275878429413,
      "learning_rate": 0.00010757588904528106,
      "loss": 1.5928,
      "step": 148
    },
    {
      "epoch": 0.49542809642560265,
      "grad_norm": 0.4781058728694916,
      "learning_rate": 0.00010649527107634108,
      "loss": 1.6578,
      "step": 149
    },
    {
      "epoch": 0.49875311720698257,
      "grad_norm": 0.4651820659637451,
      "learning_rate": 0.00010541389085854176,
      "loss": 1.6884,
      "step": 150
    },
    {
      "epoch": 0.5020781379883624,
      "grad_norm": 0.4429711103439331,
      "learning_rate": 0.00010433187529664623,
      "loss": 1.6723,
      "step": 151
    },
    {
      "epoch": 0.5054031587697423,
      "grad_norm": 0.4521614611148834,
      "learning_rate": 0.00010324935136997806,
      "loss": 1.6269,
      "step": 152
    },
    {
      "epoch": 0.5087281795511222,
      "grad_norm": 0.4930736720561981,
      "learning_rate": 0.00010216644611751975,
      "loss": 1.7933,
      "step": 153
    },
    {
      "epoch": 0.5120532003325021,
      "grad_norm": 0.4855606257915497,
      "learning_rate": 0.000101083286623004,
      "loss": 1.6702,
      "step": 154
    },
    {
      "epoch": 0.515378221113882,
      "grad_norm": 0.4960128366947174,
      "learning_rate": 0.0001,
      "loss": 1.7428,
      "step": 155
    },
    {
      "epoch": 0.5187032418952618,
      "grad_norm": 0.42107459902763367,
      "learning_rate": 9.891671337699602e-05,
      "loss": 1.6235,
      "step": 156
    },
    {
      "epoch": 0.5220282626766417,
      "grad_norm": 0.4479861855506897,
      "learning_rate": 9.783355388248027e-05,
      "loss": 1.5158,
      "step": 157
    },
    {
      "epoch": 0.5253532834580216,
      "grad_norm": 0.4954458177089691,
      "learning_rate": 9.675064863002196e-05,
      "loss": 1.6743,
      "step": 158
    },
    {
      "epoch": 0.5286783042394015,
      "grad_norm": 0.5591014623641968,
      "learning_rate": 9.56681247033538e-05,
      "loss": 1.9691,
      "step": 159
    },
    {
      "epoch": 0.5320033250207814,
      "grad_norm": 0.46626871824264526,
      "learning_rate": 9.458610914145826e-05,
      "loss": 1.5621,
      "step": 160
    },
    {
      "epoch": 0.5353283458021613,
      "grad_norm": 0.4377134144306183,
      "learning_rate": 9.350472892365892e-05,
      "loss": 1.5524,
      "step": 161
    },
    {
      "epoch": 0.5386533665835411,
      "grad_norm": 0.3984418511390686,
      "learning_rate": 9.242411095471897e-05,
      "loss": 1.6454,
      "step": 162
    },
    {
      "epoch": 0.541978387364921,
      "grad_norm": 0.42802637815475464,
      "learning_rate": 9.134438204994824e-05,
      "loss": 1.4036,
      "step": 163
    },
    {
      "epoch": 0.545303408146301,
      "grad_norm": 0.4567003846168518,
      "learning_rate": 9.026566892032105e-05,
      "loss": 1.6606,
      "step": 164
    },
    {
      "epoch": 0.5486284289276808,
      "grad_norm": 0.45452797412872314,
      "learning_rate": 8.918809815760585e-05,
      "loss": 1.8219,
      "step": 165
    },
    {
      "epoch": 0.5519534497090607,
      "grad_norm": 0.4367886781692505,
      "learning_rate": 8.811179621950936e-05,
      "loss": 1.5962,
      "step": 166
    },
    {
      "epoch": 0.5552784704904405,
      "grad_norm": 0.4670146703720093,
      "learning_rate": 8.703688941483616e-05,
      "loss": 1.6382,
      "step": 167
    },
    {
      "epoch": 0.5586034912718204,
      "grad_norm": 0.5069778561592102,
      "learning_rate": 8.596350388866558e-05,
      "loss": 1.7067,
      "step": 168
    },
    {
      "epoch": 0.5619285120532004,
      "grad_norm": 0.4080033302307129,
      "learning_rate": 8.489176560754834e-05,
      "loss": 1.4192,
      "step": 169
    },
    {
      "epoch": 0.5652535328345802,
      "grad_norm": 0.491526335477829,
      "learning_rate": 8.382180034472353e-05,
      "loss": 1.8687,
      "step": 170
    },
    {
      "epoch": 0.5685785536159601,
      "grad_norm": 0.5429246425628662,
      "learning_rate": 8.275373366535877e-05,
      "loss": 1.776,
      "step": 171
    },
    {
      "epoch": 0.57190357439734,
      "grad_norm": 0.4131667912006378,
      "learning_rate": 8.168769091181438e-05,
      "loss": 1.3345,
      "step": 172
    },
    {
      "epoch": 0.5752285951787198,
      "grad_norm": 0.5055519342422485,
      "learning_rate": 8.062379718893417e-05,
      "loss": 1.7716,
      "step": 173
    },
    {
      "epoch": 0.5785536159600998,
      "grad_norm": 0.4675292670726776,
      "learning_rate": 7.956217734936353e-05,
      "loss": 1.5941,
      "step": 174
    },
    {
      "epoch": 0.5818786367414797,
      "grad_norm": 0.5096448659896851,
      "learning_rate": 7.85029559788976e-05,
      "loss": 1.9376,
      "step": 175
    },
    {
      "epoch": 0.5852036575228595,
      "grad_norm": 0.4687637686729431,
      "learning_rate": 7.744625738186059e-05,
      "loss": 1.7242,
      "step": 176
    },
    {
      "epoch": 0.5885286783042394,
      "grad_norm": 0.437148779630661,
      "learning_rate": 7.639220556651799e-05,
      "loss": 1.4993,
      "step": 177
    },
    {
      "epoch": 0.5918536990856192,
      "grad_norm": 0.44125625491142273,
      "learning_rate": 7.534092423052381e-05,
      "loss": 1.5076,
      "step": 178
    },
    {
      "epoch": 0.5951787198669992,
      "grad_norm": 0.4794883131980896,
      "learning_rate": 7.42925367464039e-05,
      "loss": 1.6401,
      "step": 179
    },
    {
      "epoch": 0.5985037406483791,
      "grad_norm": 0.42347967624664307,
      "learning_rate": 7.324716614707793e-05,
      "loss": 1.444,
      "step": 180
    },
    {
      "epoch": 0.6018287614297589,
      "grad_norm": 0.4843563437461853,
      "learning_rate": 7.220493511142059e-05,
      "loss": 1.7117,
      "step": 181
    },
    {
      "epoch": 0.6051537822111388,
      "grad_norm": 0.48885542154312134,
      "learning_rate": 7.116596594986494e-05,
      "loss": 1.6799,
      "step": 182
    },
    {
      "epoch": 0.6084788029925187,
      "grad_norm": 0.48835498094558716,
      "learning_rate": 7.013038059004866e-05,
      "loss": 1.7308,
      "step": 183
    },
    {
      "epoch": 0.6118038237738986,
      "grad_norm": 0.38506001234054565,
      "learning_rate": 6.909830056250527e-05,
      "loss": 1.5766,
      "step": 184
    },
    {
      "epoch": 0.6151288445552785,
      "grad_norm": 0.5520392656326294,
      "learning_rate": 6.806984698640202e-05,
      "loss": 1.5418,
      "step": 185
    },
    {
      "epoch": 0.6184538653366584,
      "grad_norm": 0.4401935935020447,
      "learning_rate": 6.704514055532597e-05,
      "loss": 1.7715,
      "step": 186
    },
    {
      "epoch": 0.6217788861180382,
      "grad_norm": 0.4164566695690155,
      "learning_rate": 6.602430152312017e-05,
      "loss": 1.4711,
      "step": 187
    },
    {
      "epoch": 0.6251039068994181,
      "grad_norm": 0.4750818610191345,
      "learning_rate": 6.500744968977116e-05,
      "loss": 1.374,
      "step": 188
    },
    {
      "epoch": 0.628428927680798,
      "grad_norm": 0.5478043556213379,
      "learning_rate": 6.399470438735014e-05,
      "loss": 1.7294,
      "step": 189
    },
    {
      "epoch": 0.6317539484621779,
      "grad_norm": 0.4560893476009369,
      "learning_rate": 6.298618446600856e-05,
      "loss": 1.8216,
      "step": 190
    },
    {
      "epoch": 0.6350789692435578,
      "grad_norm": 0.49942511320114136,
      "learning_rate": 6.19820082800308e-05,
      "loss": 1.6108,
      "step": 191
    },
    {
      "epoch": 0.6384039900249376,
      "grad_norm": 0.3901759088039398,
      "learning_rate": 6.0982293673944544e-05,
      "loss": 1.4635,
      "step": 192
    },
    {
      "epoch": 0.6417290108063175,
      "grad_norm": 0.45033466815948486,
      "learning_rate": 5.9987157968691344e-05,
      "loss": 1.5153,
      "step": 193
    },
    {
      "epoch": 0.6450540315876975,
      "grad_norm": 0.44514134526252747,
      "learning_rate": 5.899671794785839e-05,
      "loss": 1.6015,
      "step": 194
    },
    {
      "epoch": 0.6483790523690773,
      "grad_norm": 0.42773956060409546,
      "learning_rate": 5.801108984397354e-05,
      "loss": 1.6624,
      "step": 195
    },
    {
      "epoch": 0.6517040731504572,
      "grad_norm": 0.42323529720306396,
      "learning_rate": 5.703038932486484e-05,
      "loss": 1.642,
      "step": 196
    },
    {
      "epoch": 0.6550290939318371,
      "grad_norm": 0.4852340519428253,
      "learning_rate": 5.605473148008638e-05,
      "loss": 1.5533,
      "step": 197
    },
    {
      "epoch": 0.6583541147132169,
      "grad_norm": 0.46353092789649963,
      "learning_rate": 5.5084230807412126e-05,
      "loss": 1.5137,
      "step": 198
    },
    {
      "epoch": 0.6616791354945969,
      "grad_norm": 0.5486162304878235,
      "learning_rate": 5.411900119939895e-05,
      "loss": 1.5682,
      "step": 199
    },
    {
      "epoch": 0.6650041562759768,
      "grad_norm": 0.4136289656162262,
      "learning_rate": 5.3159155930021e-05,
      "loss": 1.5902,
      "step": 200
    },
    {
      "epoch": 0.6683291770573566,
      "grad_norm": 0.457292765378952,
      "learning_rate": 5.2204807641376354e-05,
      "loss": 1.6669,
      "step": 201
    },
    {
      "epoch": 0.6716541978387365,
      "grad_norm": 0.4368407726287842,
      "learning_rate": 5.12560683304681e-05,
      "loss": 1.7747,
      "step": 202
    },
    {
      "epoch": 0.6749792186201163,
      "grad_norm": 0.4596605598926544,
      "learning_rate": 5.03130493360608e-05,
      "loss": 1.5868,
      "step": 203
    },
    {
      "epoch": 0.6783042394014963,
      "grad_norm": 0.437491238117218,
      "learning_rate": 4.9375861325614606e-05,
      "loss": 1.7614,
      "step": 204
    },
    {
      "epoch": 0.6816292601828762,
      "grad_norm": 0.47249388694763184,
      "learning_rate": 4.844461428229782e-05,
      "loss": 1.582,
      "step": 205
    },
    {
      "epoch": 0.684954280964256,
      "grad_norm": 0.44100067019462585,
      "learning_rate": 4.751941749207995e-05,
      "loss": 1.6814,
      "step": 206
    },
    {
      "epoch": 0.6882793017456359,
      "grad_norm": 0.5000886917114258,
      "learning_rate": 4.660037953090639e-05,
      "loss": 1.6634,
      "step": 207
    },
    {
      "epoch": 0.6916043225270158,
      "grad_norm": 0.4667086899280548,
      "learning_rate": 4.5687608251956714e-05,
      "loss": 1.7767,
      "step": 208
    },
    {
      "epoch": 0.6949293433083957,
      "grad_norm": 0.4677750766277313,
      "learning_rate": 4.4781210772987514e-05,
      "loss": 1.785,
      "step": 209
    },
    {
      "epoch": 0.6982543640897756,
      "grad_norm": 0.40729814767837524,
      "learning_rate": 4.388129346376178e-05,
      "loss": 1.5742,
      "step": 210
    },
    {
      "epoch": 0.7015793848711555,
      "grad_norm": 0.4622965157032013,
      "learning_rate": 4.298796193356566e-05,
      "loss": 1.755,
      "step": 211
    },
    {
      "epoch": 0.7049044056525353,
      "grad_norm": 0.42128920555114746,
      "learning_rate": 4.210132101881516e-05,
      "loss": 1.359,
      "step": 212
    },
    {
      "epoch": 0.7082294264339152,
      "grad_norm": 0.4670293927192688,
      "learning_rate": 4.12214747707527e-05,
      "loss": 1.8743,
      "step": 213
    },
    {
      "epoch": 0.7115544472152951,
      "grad_norm": 0.474398136138916,
      "learning_rate": 4.034852644323661e-05,
      "loss": 1.6977,
      "step": 214
    },
    {
      "epoch": 0.714879467996675,
      "grad_norm": 0.5026089549064636,
      "learning_rate": 3.948257848062351e-05,
      "loss": 1.566,
      "step": 215
    },
    {
      "epoch": 0.7182044887780549,
      "grad_norm": 0.40603697299957275,
      "learning_rate": 3.862373250574626e-05,
      "loss": 1.3894,
      "step": 216
    },
    {
      "epoch": 0.7215295095594347,
      "grad_norm": 0.4771779179573059,
      "learning_rate": 3.7772089307987936e-05,
      "loss": 1.6296,
      "step": 217
    },
    {
      "epoch": 0.7248545303408146,
      "grad_norm": 0.44347891211509705,
      "learning_rate": 3.6927748831453836e-05,
      "loss": 1.6663,
      "step": 218
    },
    {
      "epoch": 0.7281795511221946,
      "grad_norm": 0.435149610042572,
      "learning_rate": 3.609081016324243e-05,
      "loss": 1.6662,
      "step": 219
    },
    {
      "epoch": 0.7315045719035744,
      "grad_norm": 0.453782856464386,
      "learning_rate": 3.5261371521817244e-05,
      "loss": 1.7286,
      "step": 220
    },
    {
      "epoch": 0.7348295926849543,
      "grad_norm": 0.42496827244758606,
      "learning_rate": 3.44395302454804e-05,
      "loss": 1.7376,
      "step": 221
    },
    {
      "epoch": 0.7381546134663342,
      "grad_norm": 0.45447835326194763,
      "learning_rate": 3.3625382780949574e-05,
      "loss": 1.5055,
      "step": 222
    },
    {
      "epoch": 0.741479634247714,
      "grad_norm": 0.5035948157310486,
      "learning_rate": 3.28190246720394e-05,
      "loss": 1.866,
      "step": 223
    },
    {
      "epoch": 0.744804655029094,
      "grad_norm": 0.47680604457855225,
      "learning_rate": 3.202055054844921e-05,
      "loss": 1.9692,
      "step": 224
    },
    {
      "epoch": 0.7481296758104738,
      "grad_norm": 0.45373663306236267,
      "learning_rate": 3.123005411465766e-05,
      "loss": 1.6879,
      "step": 225
    },
    {
      "epoch": 0.7514546965918537,
      "grad_norm": 0.49925628304481506,
      "learning_rate": 3.0447628138926156e-05,
      "loss": 1.5313,
      "step": 226
    },
    {
      "epoch": 0.7547797173732336,
      "grad_norm": 0.4820810556411743,
      "learning_rate": 2.9673364442412e-05,
      "loss": 1.6259,
      "step": 227
    },
    {
      "epoch": 0.7581047381546134,
      "grad_norm": 0.5111257433891296,
      "learning_rate": 2.890735388839295e-05,
      "loss": 1.6068,
      "step": 228
    },
    {
      "epoch": 0.7614297589359933,
      "grad_norm": 0.3893967568874359,
      "learning_rate": 2.8149686371603767e-05,
      "loss": 1.5461,
      "step": 229
    },
    {
      "epoch": 0.7647547797173733,
      "grad_norm": 0.42585450410842896,
      "learning_rate": 2.7400450807686938e-05,
      "loss": 1.4092,
      "step": 230
    },
    {
      "epoch": 0.7680798004987531,
      "grad_norm": 0.5068459510803223,
      "learning_rate": 2.665973512275778e-05,
      "loss": 1.8426,
      "step": 231
    },
    {
      "epoch": 0.771404821280133,
      "grad_norm": 0.44372087717056274,
      "learning_rate": 2.59276262430861e-05,
      "loss": 1.5669,
      "step": 232
    },
    {
      "epoch": 0.7747298420615129,
      "grad_norm": 0.4483433663845062,
      "learning_rate": 2.520421008489494e-05,
      "loss": 1.508,
      "step": 233
    },
    {
      "epoch": 0.7780548628428927,
      "grad_norm": 0.4225240647792816,
      "learning_rate": 2.4489571544277945e-05,
      "loss": 1.4963,
      "step": 234
    },
    {
      "epoch": 0.7813798836242727,
      "grad_norm": 0.4540765583515167,
      "learning_rate": 2.3783794487236365e-05,
      "loss": 1.7699,
      "step": 235
    },
    {
      "epoch": 0.7847049044056525,
      "grad_norm": 0.5303469896316528,
      "learning_rate": 2.308696173983711e-05,
      "loss": 1.7887,
      "step": 236
    },
    {
      "epoch": 0.7880299251870324,
      "grad_norm": 0.4368319809436798,
      "learning_rate": 2.2399155078492694e-05,
      "loss": 1.6762,
      "step": 237
    },
    {
      "epoch": 0.7913549459684123,
      "grad_norm": 0.41934987902641296,
      "learning_rate": 2.1720455220364444e-05,
      "loss": 1.6372,
      "step": 238
    },
    {
      "epoch": 0.7946799667497921,
      "grad_norm": 0.4291558861732483,
      "learning_rate": 2.1050941813889836e-05,
      "loss": 1.6668,
      "step": 239
    },
    {
      "epoch": 0.7980049875311721,
      "grad_norm": 0.414044052362442,
      "learning_rate": 2.0390693429435627e-05,
      "loss": 1.6885,
      "step": 240
    },
    {
      "epoch": 0.801330008312552,
      "grad_norm": 0.4342755377292633,
      "learning_rate": 1.9739787550077116e-05,
      "loss": 1.5082,
      "step": 241
    },
    {
      "epoch": 0.8046550290939318,
      "grad_norm": 0.45343807339668274,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 1.5635,
      "step": 242
    },
    {
      "epoch": 0.8079800498753117,
      "grad_norm": 0.4498422145843506,
      "learning_rate": 1.8466307748062205e-05,
      "loss": 1.6047,
      "step": 243
    },
    {
      "epoch": 0.8113050706566916,
      "grad_norm": 0.4087926149368286,
      "learning_rate": 1.784388327390687e-05,
      "loss": 1.3402,
      "step": 244
    },
    {
      "epoch": 0.8146300914380715,
      "grad_norm": 0.42908143997192383,
      "learning_rate": 1.7231100184310956e-05,
      "loss": 1.5664,
      "step": 245
    },
    {
      "epoch": 0.8179551122194514,
      "grad_norm": 0.4820065200328827,
      "learning_rate": 1.6628030392087e-05,
      "loss": 1.7218,
      "step": 246
    },
    {
      "epoch": 0.8212801330008312,
      "grad_norm": 0.4803646206855774,
      "learning_rate": 1.6034744670148972e-05,
      "loss": 1.837,
      "step": 247
    },
    {
      "epoch": 0.8246051537822111,
      "grad_norm": 0.4350035786628723,
      "learning_rate": 1.5451312643206827e-05,
      "loss": 1.5924,
      "step": 248
    },
    {
      "epoch": 0.827930174563591,
      "grad_norm": 0.49933725595474243,
      "learning_rate": 1.4877802779595762e-05,
      "loss": 1.6023,
      "step": 249
    },
    {
      "epoch": 0.8312551953449709,
      "grad_norm": 0.44506213068962097,
      "learning_rate": 1.4314282383241096e-05,
      "loss": 1.4533,
      "step": 250
    },
    {
      "epoch": 0.8345802161263508,
      "grad_norm": 0.46771377325057983,
      "learning_rate": 1.376081758575981e-05,
      "loss": 1.7391,
      "step": 251
    },
    {
      "epoch": 0.8379052369077307,
      "grad_norm": 0.44328737258911133,
      "learning_rate": 1.3217473338699859e-05,
      "loss": 1.6868,
      "step": 252
    },
    {
      "epoch": 0.8412302576891105,
      "grad_norm": 0.4481683373451233,
      "learning_rate": 1.2684313405917703e-05,
      "loss": 1.4394,
      "step": 253
    },
    {
      "epoch": 0.8445552784704904,
      "grad_norm": 0.452848881483078,
      "learning_rate": 1.2161400356095375e-05,
      "loss": 1.6657,
      "step": 254
    },
    {
      "epoch": 0.8478802992518704,
      "grad_norm": 0.42388778924942017,
      "learning_rate": 1.1648795555397719e-05,
      "loss": 1.459,
      "step": 255
    },
    {
      "epoch": 0.8512053200332502,
      "grad_norm": 0.43063634634017944,
      "learning_rate": 1.1146559160270875e-05,
      "loss": 1.6652,
      "step": 256
    },
    {
      "epoch": 0.8545303408146301,
      "grad_norm": 0.40587228536605835,
      "learning_rate": 1.0654750110382628e-05,
      "loss": 1.5131,
      "step": 257
    },
    {
      "epoch": 0.85785536159601,
      "grad_norm": 0.4573078751564026,
      "learning_rate": 1.0173426121705576e-05,
      "loss": 1.6047,
      "step": 258
    },
    {
      "epoch": 0.8611803823773898,
      "grad_norm": 0.4255686104297638,
      "learning_rate": 9.702643679743817e-06,
      "loss": 1.493,
      "step": 259
    },
    {
      "epoch": 0.8645054031587698,
      "grad_norm": 0.48064589500427246,
      "learning_rate": 9.242458032904311e-06,
      "loss": 1.6691,
      "step": 260
    },
    {
      "epoch": 0.8678304239401496,
      "grad_norm": 0.4468303620815277,
      "learning_rate": 8.792923186013024e-06,
      "loss": 1.5707,
      "step": 261
    },
    {
      "epoch": 0.8711554447215295,
      "grad_norm": 0.4417254328727722,
      "learning_rate": 8.354091893977401e-06,
      "loss": 1.5591,
      "step": 262
    },
    {
      "epoch": 0.8744804655029094,
      "grad_norm": 0.42065221071243286,
      "learning_rate": 7.926015655595254e-06,
      "loss": 1.5657,
      "step": 263
    },
    {
      "epoch": 0.8778054862842892,
      "grad_norm": 0.3902848958969116,
      "learning_rate": 7.508744707511117e-06,
      "loss": 1.5445,
      "step": 264
    },
    {
      "epoch": 0.8811305070656692,
      "grad_norm": 0.41993579268455505,
      "learning_rate": 7.102328018320858e-06,
      "loss": 1.4065,
      "step": 265
    },
    {
      "epoch": 0.8844555278470491,
      "grad_norm": 0.4170606732368469,
      "learning_rate": 6.70681328282492e-06,
      "loss": 1.5117,
      "step": 266
    },
    {
      "epoch": 0.8877805486284289,
      "grad_norm": 0.513680636882782,
      "learning_rate": 6.322246916431107e-06,
      "loss": 1.9662,
      "step": 267
    },
    {
      "epoch": 0.8911055694098088,
      "grad_norm": 0.43288302421569824,
      "learning_rate": 5.948674049707603e-06,
      "loss": 1.6208,
      "step": 268
    },
    {
      "epoch": 0.8944305901911886,
      "grad_norm": 0.38253968954086304,
      "learning_rate": 5.58613852308667e-06,
      "loss": 1.5302,
      "step": 269
    },
    {
      "epoch": 0.8977556109725686,
      "grad_norm": 0.4266990125179291,
      "learning_rate": 5.2346828817197655e-06,
      "loss": 1.6815,
      "step": 270
    },
    {
      "epoch": 0.9010806317539485,
      "grad_norm": 0.561107873916626,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 1.6608,
      "step": 271
    },
    {
      "epoch": 0.9044056525353283,
      "grad_norm": 0.41741943359375,
      "learning_rate": 4.565174929145188e-06,
      "loss": 1.2898,
      "step": 272
    },
    {
      "epoch": 0.9077306733167082,
      "grad_norm": 0.39722350239753723,
      "learning_rate": 4.247201187664218e-06,
      "loss": 1.585,
      "step": 273
    },
    {
      "epoch": 0.9110556940980881,
      "grad_norm": 0.41877254843711853,
      "learning_rate": 3.940464461670135e-06,
      "loss": 1.605,
      "step": 274
    },
    {
      "epoch": 0.914380714879468,
      "grad_norm": 0.5125119090080261,
      "learning_rate": 3.6450007480777093e-06,
      "loss": 1.5922,
      "step": 275
    },
    {
      "epoch": 0.9177057356608479,
      "grad_norm": 0.43189626932144165,
      "learning_rate": 3.360844720863765e-06,
      "loss": 1.559,
      "step": 276
    },
    {
      "epoch": 0.9210307564422278,
      "grad_norm": 0.44040048122406006,
      "learning_rate": 3.0880297269979653e-06,
      "loss": 1.67,
      "step": 277
    },
    {
      "epoch": 0.9243557772236076,
      "grad_norm": 0.5034830570220947,
      "learning_rate": 2.826587782529444e-06,
      "loss": 1.9225,
      "step": 278
    },
    {
      "epoch": 0.9276807980049875,
      "grad_norm": 0.4373987019062042,
      "learning_rate": 2.576549568829578e-06,
      "loss": 1.7428,
      "step": 279
    },
    {
      "epoch": 0.9310058187863675,
      "grad_norm": 0.45258763432502747,
      "learning_rate": 2.3379444289913342e-06,
      "loss": 1.5951,
      "step": 280
    },
    {
      "epoch": 0.9343308395677473,
      "grad_norm": 0.4411347210407257,
      "learning_rate": 2.110800364385812e-06,
      "loss": 1.5906,
      "step": 281
    },
    {
      "epoch": 0.9376558603491272,
      "grad_norm": 0.4530499577522278,
      "learning_rate": 1.8951440313760837e-06,
      "loss": 1.4591,
      "step": 282
    },
    {
      "epoch": 0.940980881130507,
      "grad_norm": 0.484295129776001,
      "learning_rate": 1.6910007381890081e-06,
      "loss": 1.6808,
      "step": 283
    },
    {
      "epoch": 0.9443059019118869,
      "grad_norm": 0.43871116638183594,
      "learning_rate": 1.4983944419451613e-06,
      "loss": 1.5378,
      "step": 284
    },
    {
      "epoch": 0.9476309226932669,
      "grad_norm": 0.4450673460960388,
      "learning_rate": 1.317347745847386e-06,
      "loss": 1.6353,
      "step": 285
    },
    {
      "epoch": 0.9509559434746467,
      "grad_norm": 0.4276074171066284,
      "learning_rate": 1.1478818965281911e-06,
      "loss": 1.5403,
      "step": 286
    },
    {
      "epoch": 0.9542809642560266,
      "grad_norm": 0.46902570128440857,
      "learning_rate": 9.900167815563465e-07,
      "loss": 1.5077,
      "step": 287
    },
    {
      "epoch": 0.9576059850374065,
      "grad_norm": 0.42395344376564026,
      "learning_rate": 8.437709271030603e-07,
      "loss": 1.4276,
      "step": 288
    },
    {
      "epoch": 0.9609310058187863,
      "grad_norm": 0.45644354820251465,
      "learning_rate": 7.091614957677517e-07,
      "loss": 1.6846,
      "step": 289
    },
    {
      "epoch": 0.9642560266001663,
      "grad_norm": 0.5007442831993103,
      "learning_rate": 5.862042845640403e-07,
      "loss": 1.8228,
      "step": 290
    },
    {
      "epoch": 0.9675810473815462,
      "grad_norm": 0.4374752640724182,
      "learning_rate": 4.7491372306580627e-07,
      "loss": 1.6439,
      "step": 291
    },
    {
      "epoch": 0.970906068162926,
      "grad_norm": 0.42748919129371643,
      "learning_rate": 3.7530287171387843e-07,
      "loss": 1.5968,
      "step": 292
    },
    {
      "epoch": 0.9742310889443059,
      "grad_norm": 0.3897336721420288,
      "learning_rate": 2.873834202833159e-07,
      "loss": 1.5657,
      "step": 293
    },
    {
      "epoch": 0.9775561097256857,
      "grad_norm": 0.5000090599060059,
      "learning_rate": 2.1116568651156076e-07,
      "loss": 1.6331,
      "step": 294
    },
    {
      "epoch": 0.9808811305070657,
      "grad_norm": 0.4116688668727875,
      "learning_rate": 1.4665861488761813e-07,
      "loss": 1.349,
      "step": 295
    },
    {
      "epoch": 0.9842061512884456,
      "grad_norm": 0.45309168100357056,
      "learning_rate": 9.386977560232879e-08,
      "loss": 1.5287,
      "step": 296
    },
    {
      "epoch": 0.9875311720698254,
      "grad_norm": 0.4364264905452728,
      "learning_rate": 5.2805363660046734e-08,
      "loss": 1.5738,
      "step": 297
    },
    {
      "epoch": 0.9908561928512053,
      "grad_norm": 0.4273874759674072,
      "learning_rate": 2.347019815158724e-08,
      "loss": 1.585,
      "step": 298
    },
    {
      "epoch": 0.9941812136325852,
      "grad_norm": 0.42349839210510254,
      "learning_rate": 5.867721688690431e-09,
      "loss": 1.6562,
      "step": 299
    },
    {
      "epoch": 0.9975062344139651,
      "grad_norm": 0.46075183153152466,
      "learning_rate": 0.0,
      "loss": 1.592,
      "step": 300
    },
    {
      "epoch": 0.9975062344139651,
      "eval_loss": 1.6082537174224854,
      "eval_runtime": 15.2561,
      "eval_samples_per_second": 33.233,
      "eval_steps_per_second": 4.195,
      "step": 300
    }
  ],
  "logging_steps": 1,
  "max_steps": 300,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1990397395992576.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}