{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.4773519163763065,
  "eval_steps": 500,
  "global_step": 180,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013937282229965157,
      "grad_norm": 93.75749969482422,
      "learning_rate": 2.2e-06,
      "loss": 1.2675,
      "step": 1
    },
    {
      "epoch": 0.027874564459930314,
      "grad_norm": 66.60814666748047,
      "learning_rate": 4.4e-06,
      "loss": 1.3582,
      "step": 2
    },
    {
      "epoch": 0.041811846689895474,
      "grad_norm": 77.9828109741211,
      "learning_rate": 6.5999999999999995e-06,
      "loss": 1.2567,
      "step": 3
    },
    {
      "epoch": 0.05574912891986063,
      "grad_norm": 17.5104923248291,
      "learning_rate": 8.8e-06,
      "loss": 1.1877,
      "step": 4
    },
    {
      "epoch": 0.06968641114982578,
      "grad_norm": 10.045647621154785,
      "learning_rate": 1.1e-05,
      "loss": 1.1663,
      "step": 5
    },
    {
      "epoch": 0.08362369337979095,
      "grad_norm": 12.031865119934082,
      "learning_rate": 1.0999372667896238e-05,
      "loss": 1.1769,
      "step": 6
    },
    {
      "epoch": 0.0975609756097561,
      "grad_norm": 6.24857234954834,
      "learning_rate": 1.0997490814692433e-05,
      "loss": 1.1703,
      "step": 7
    },
    {
      "epoch": 0.11149825783972125,
      "grad_norm": 4.496493816375732,
      "learning_rate": 1.0994354869678378e-05,
      "loss": 1.1608,
      "step": 8
    },
    {
      "epoch": 0.1254355400696864,
      "grad_norm": 4.9181036949157715,
      "learning_rate": 1.0989965548228246e-05,
      "loss": 1.1165,
      "step": 9
    },
    {
      "epoch": 0.13937282229965156,
      "grad_norm": 3.872187614440918,
      "learning_rate": 1.0984323851637407e-05,
      "loss": 1.1308,
      "step": 10
    },
    {
      "epoch": 0.15331010452961671,
      "grad_norm": 4.341373920440674,
      "learning_rate": 1.0977431066894e-05,
      "loss": 1.1245,
      "step": 11
    },
    {
      "epoch": 0.1672473867595819,
      "grad_norm": 3.867882013320923,
      "learning_rate": 1.0969288766385357e-05,
      "loss": 1.1486,
      "step": 12
    },
    {
      "epoch": 0.18118466898954705,
      "grad_norm": 3.2795205116271973,
      "learning_rate": 1.0959898807539295e-05,
      "loss": 1.123,
      "step": 13
    },
    {
      "epoch": 0.1951219512195122,
      "grad_norm": 3.429868698120117,
      "learning_rate": 1.0949263332400415e-05,
      "loss": 1.1169,
      "step": 14
    },
    {
      "epoch": 0.20905923344947736,
      "grad_norm": 3.0912277698516846,
      "learning_rate": 1.0937384767141438e-05,
      "loss": 1.0832,
      "step": 15
    },
    {
      "epoch": 0.2229965156794425,
      "grad_norm": 4.55407190322876,
      "learning_rate": 1.0924265821509758e-05,
      "loss": 1.1018,
      "step": 16
    },
    {
      "epoch": 0.23693379790940766,
      "grad_norm": 3.4495749473571777,
      "learning_rate": 1.090990948820929e-05,
      "loss": 1.1186,
      "step": 17
    },
    {
      "epoch": 0.2508710801393728,
      "grad_norm": 6.985757827758789,
      "learning_rate": 1.0894319042217767e-05,
      "loss": 1.1135,
      "step": 18
    },
    {
      "epoch": 0.26480836236933797,
      "grad_norm": 3.7354323863983154,
      "learning_rate": 1.0877498040039657e-05,
      "loss": 1.1064,
      "step": 19
    },
    {
      "epoch": 0.2787456445993031,
      "grad_norm": 4.375406265258789,
      "learning_rate": 1.0859450318894847e-05,
      "loss": 1.0767,
      "step": 20
    },
    {
      "epoch": 0.2926829268292683,
      "grad_norm": 4.374971389770508,
      "learning_rate": 1.0840179995843286e-05,
      "loss": 1.1072,
      "step": 21
    },
    {
      "epoch": 0.30662020905923343,
      "grad_norm": 3.129504442214966,
      "learning_rate": 1.0819691466845815e-05,
      "loss": 1.0414,
      "step": 22
    },
    {
      "epoch": 0.3205574912891986,
      "grad_norm": 4.991695404052734,
      "learning_rate": 1.079798940576134e-05,
      "loss": 1.0773,
      "step": 23
    },
    {
      "epoch": 0.3344947735191638,
      "grad_norm": 3.570375919342041,
      "learning_rate": 1.0775078763280638e-05,
      "loss": 1.0709,
      "step": 24
    },
    {
      "epoch": 0.34843205574912894,
      "grad_norm": 7.718966007232666,
      "learning_rate": 1.0750964765797001e-05,
      "loss": 1.1019,
      "step": 25
    },
    {
      "epoch": 0.3623693379790941,
      "grad_norm": 3.459160327911377,
      "learning_rate": 1.072565291421398e-05,
      "loss": 1.0505,
      "step": 26
    },
    {
      "epoch": 0.37630662020905925,
      "grad_norm": 5.686498641967773,
      "learning_rate": 1.069914898269052e-05,
      "loss": 1.0897,
      "step": 27
    },
    {
      "epoch": 0.3902439024390244,
      "grad_norm": 3.622288465499878,
      "learning_rate": 1.067145901732376e-05,
      "loss": 1.0817,
      "step": 28
    },
    {
      "epoch": 0.40418118466898956,
      "grad_norm": 6.713829040527344,
      "learning_rate": 1.0642589334769783e-05,
      "loss": 1.0597,
      "step": 29
    },
    {
      "epoch": 0.4181184668989547,
      "grad_norm": 3.2670044898986816,
      "learning_rate": 1.061254652080265e-05,
      "loss": 1.0631,
      "step": 30
    },
    {
      "epoch": 0.43205574912891986,
      "grad_norm": 3.0448334217071533,
      "learning_rate": 1.0581337428812077e-05,
      "loss": 1.0697,
      "step": 31
    },
    {
      "epoch": 0.445993031358885,
      "grad_norm": 4.583492755889893,
      "learning_rate": 1.0548969178239997e-05,
      "loss": 1.0564,
      "step": 32
    },
    {
      "epoch": 0.45993031358885017,
      "grad_norm": 3.7002651691436768,
      "learning_rate": 1.0515449152956496e-05,
      "loss": 1.0852,
      "step": 33
    },
    {
      "epoch": 0.4738675958188153,
      "grad_norm": 3.1550400257110596,
      "learning_rate": 1.0480784999575381e-05,
      "loss": 1.0576,
      "step": 34
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 3.1954734325408936,
      "learning_rate": 1.0444984625709842e-05,
      "loss": 1.0965,
      "step": 35
    },
    {
      "epoch": 0.5017421602787456,
      "grad_norm": 2.8396148681640625,
      "learning_rate": 1.0408056198168555e-05,
      "loss": 1.0401,
      "step": 36
    },
    {
      "epoch": 0.5156794425087108,
      "grad_norm": 10.67453384399414,
      "learning_rate": 1.0370008141092654e-05,
      "loss": 1.0909,
      "step": 37
    },
    {
      "epoch": 0.5296167247386759,
      "grad_norm": 3.554422378540039,
      "learning_rate": 1.0330849134034033e-05,
      "loss": 1.0534,
      "step": 38
    },
    {
      "epoch": 0.5435540069686411,
      "grad_norm": 3.5049169063568115,
      "learning_rate": 1.0290588109975334e-05,
      "loss": 1.0366,
      "step": 39
    },
    {
      "epoch": 0.5574912891986062,
      "grad_norm": 3.9955601692199707,
      "learning_rate": 1.024923425329217e-05,
      "loss": 1.0264,
      "step": 40
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 3.2370924949645996,
      "learning_rate": 1.0206796997657961e-05,
      "loss": 1.0282,
      "step": 41
    },
    {
      "epoch": 0.5853658536585366,
      "grad_norm": 3.0653462409973145,
      "learning_rate": 1.0163286023891926e-05,
      "loss": 1.066,
      "step": 42
    },
    {
      "epoch": 0.5993031358885017,
      "grad_norm": 2.9653961658477783,
      "learning_rate": 1.011871125775069e-05,
      "loss": 1.068,
      "step": 43
    },
    {
      "epoch": 0.6132404181184669,
      "grad_norm": 2.899329423904419,
      "learning_rate": 1.0073082867664e-05,
      "loss": 1.0075,
      "step": 44
    },
    {
      "epoch": 0.627177700348432,
      "grad_norm": 3.231595277786255,
      "learning_rate": 1.002641126241511e-05,
      "loss": 1.0579,
      "step": 45
    },
    {
      "epoch": 0.6411149825783972,
      "grad_norm": 3.7711715698242188,
      "learning_rate": 9.978707088766316e-06,
      "loss": 1.0511,
      "step": 46
    },
    {
      "epoch": 0.6550522648083623,
      "grad_norm": 3.1697182655334473,
      "learning_rate": 9.929981229030202e-06,
      "loss": 1.0785,
      "step": 47
    },
    {
      "epoch": 0.6689895470383276,
      "grad_norm": 2.8038201332092285,
      "learning_rate": 9.88024479858717e-06,
      "loss": 1.0389,
      "step": 48
    },
    {
      "epoch": 0.6829268292682927,
      "grad_norm": 3.2184159755706787,
      "learning_rate": 9.829509143349775e-06,
      "loss": 1.0625,
      "step": 49
    },
    {
      "epoch": 0.6968641114982579,
      "grad_norm": 2.9576430320739746,
      "learning_rate": 9.77778583717451e-06,
      "loss": 1.0217,
      "step": 50
    },
    {
      "epoch": 0.710801393728223,
      "grad_norm": 2.918567657470703,
      "learning_rate": 9.725086679221542e-06,
      "loss": 1.0106,
      "step": 51
    },
    {
      "epoch": 0.7247386759581882,
      "grad_norm": 2.8690834045410156,
      "learning_rate": 9.671423691263104e-06,
      "loss": 1.0427,
      "step": 52
    },
    {
      "epoch": 0.7386759581881533,
      "grad_norm": 2.8049697875976562,
      "learning_rate": 9.616809114941055e-06,
      "loss": 1.0553,
      "step": 53
    },
    {
      "epoch": 0.7526132404181185,
      "grad_norm": 3.5655879974365234,
      "learning_rate": 9.561255408974332e-06,
      "loss": 1.0308,
      "step": 54
    },
    {
      "epoch": 0.7665505226480837,
      "grad_norm": 3.0086355209350586,
      "learning_rate": 9.504775246316836e-06,
      "loss": 1.0625,
      "step": 55
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 3.2106897830963135,
      "learning_rate": 9.447381511266482e-06,
      "loss": 1.0582,
      "step": 56
    },
    {
      "epoch": 0.794425087108014,
      "grad_norm": 3.013434410095215,
      "learning_rate": 9.38908729652601e-06,
      "loss": 1.0501,
      "step": 57
    },
    {
      "epoch": 0.8083623693379791,
      "grad_norm": 3.194031000137329,
      "learning_rate": 9.32990590021629e-06,
      "loss": 1.0385,
      "step": 58
    },
    {
      "epoch": 0.8222996515679443,
      "grad_norm": 2.9916939735412598,
      "learning_rate": 9.269850822842717e-06,
      "loss": 0.9978,
      "step": 59
    },
    {
      "epoch": 0.8362369337979094,
      "grad_norm": 2.90720796585083,
      "learning_rate": 9.208935764215487e-06,
      "loss": 1.038,
      "step": 60
    },
    {
      "epoch": 0.8501742160278746,
      "grad_norm": 2.983062505722046,
      "learning_rate": 9.147174620324374e-06,
      "loss": 1.0137,
      "step": 61
    },
    {
      "epoch": 0.8641114982578397,
      "grad_norm": 3.021130084991455,
      "learning_rate": 9.084581480168767e-06,
      "loss": 1.0388,
      "step": 62
    },
    {
      "epoch": 0.8780487804878049,
      "grad_norm": 3.831815242767334,
      "learning_rate": 9.021170622543684e-06,
      "loss": 1.0482,
      "step": 63
    },
    {
      "epoch": 0.89198606271777,
      "grad_norm": 3.5826456546783447,
      "learning_rate": 8.956956512782476e-06,
      "loss": 1.0494,
      "step": 64
    },
    {
      "epoch": 0.9059233449477352,
      "grad_norm": 3.445178747177124,
      "learning_rate": 8.891953799456987e-06,
      "loss": 0.9976,
      "step": 65
    },
    {
      "epoch": 0.9198606271777003,
      "grad_norm": 2.8443377017974854,
      "learning_rate": 8.826177311035906e-06,
      "loss": 1.0646,
      "step": 66
    },
    {
      "epoch": 0.9337979094076655,
      "grad_norm": 2.8210973739624023,
      "learning_rate": 8.759642052502092e-06,
      "loss": 1.0602,
      "step": 67
    },
    {
      "epoch": 0.9477351916376306,
      "grad_norm": 2.9282021522521973,
      "learning_rate": 8.692363201929623e-06,
      "loss": 1.041,
      "step": 68
    },
    {
      "epoch": 0.9616724738675958,
      "grad_norm": 2.916203260421753,
      "learning_rate": 8.624356107021355e-06,
      "loss": 1.035,
      "step": 69
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 2.7474899291992188,
      "learning_rate": 8.555636281607811e-06,
      "loss": 1.0443,
      "step": 70
    },
    {
      "epoch": 0.9895470383275261,
      "grad_norm": 3.2409560680389404,
      "learning_rate": 8.486219402108133e-06,
      "loss": 1.0502,
      "step": 71
    },
    {
      "epoch": 1.0034843205574913,
      "grad_norm": 3.2006499767303467,
      "learning_rate": 8.416121303953973e-06,
      "loss": 1.0337,
      "step": 72
    },
    {
      "epoch": 1.0017421602787457,
      "grad_norm": 3.021101713180542,
      "learning_rate": 8.345357977977113e-06,
      "loss": 1.0164,
      "step": 73
    },
    {
      "epoch": 1.0156794425087108,
      "grad_norm": 3.0675883293151855,
      "learning_rate": 8.273945566761604e-06,
      "loss": 0.9739,
      "step": 74
    },
    {
      "epoch": 1.029616724738676,
      "grad_norm": 2.925316095352173,
      "learning_rate": 8.201900360961325e-06,
      "loss": 0.9607,
      "step": 75
    },
    {
      "epoch": 1.043554006968641,
      "grad_norm": 3.2823657989501953,
      "learning_rate": 8.12923879558374e-06,
      "loss": 0.9313,
      "step": 76
    },
    {
      "epoch": 1.0574912891986064,
      "grad_norm": 2.809762477874756,
      "learning_rate": 8.055977446240727e-06,
      "loss": 0.9682,
      "step": 77
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 3.0586483478546143,
      "learning_rate": 7.982133025367346e-06,
      "loss": 0.9674,
      "step": 78
    },
    {
      "epoch": 1.0853658536585367,
      "grad_norm": 2.9541120529174805,
      "learning_rate": 7.907722378409371e-06,
      "loss": 0.9691,
      "step": 79
    },
    {
      "epoch": 1.0993031358885017,
      "grad_norm": 2.965197801589966,
      "learning_rate": 7.83276247998052e-06,
      "loss": 0.9537,
      "step": 80
    },
    {
      "epoch": 1.113240418118467,
      "grad_norm": 3.096423625946045,
      "learning_rate": 7.757270429990162e-06,
      "loss": 0.9631,
      "step": 81
    },
    {
      "epoch": 1.127177700348432,
      "grad_norm": 3.0303800106048584,
      "learning_rate": 7.681263449742493e-06,
      "loss": 0.9774,
      "step": 82
    },
    {
      "epoch": 1.1411149825783973,
      "grad_norm": 3.2755138874053955,
      "learning_rate": 7.604758878007994e-06,
      "loss": 0.9589,
      "step": 83
    },
    {
      "epoch": 1.1550522648083623,
      "grad_norm": 3.0583481788635254,
      "learning_rate": 7.527774167068094e-06,
      "loss": 0.9313,
      "step": 84
    },
    {
      "epoch": 1.1689895470383276,
      "grad_norm": 2.8140718936920166,
      "learning_rate": 7.4503268787339455e-06,
      "loss": 0.9137,
      "step": 85
    },
    {
      "epoch": 1.1829268292682926,
      "grad_norm": 3.0150392055511475,
      "learning_rate": 7.372434680340213e-06,
      "loss": 0.9324,
      "step": 86
    },
    {
      "epoch": 1.1968641114982579,
      "grad_norm": 3.1816203594207764,
      "learning_rate": 7.294115340714782e-06,
      "loss": 0.9835,
      "step": 87
    },
    {
      "epoch": 1.210801393728223,
      "grad_norm": 3.011570930480957,
      "learning_rate": 7.215386726125319e-06,
      "loss": 0.9395,
      "step": 88
    },
    {
      "epoch": 1.2247386759581882,
      "grad_norm": 4.090625286102295,
      "learning_rate": 7.1362667962036166e-06,
      "loss": 0.9682,
      "step": 89
    },
    {
      "epoch": 1.2386759581881532,
      "grad_norm": 3.290282964706421,
      "learning_rate": 7.056773599848612e-06,
      "loss": 0.9487,
      "step": 90
    },
    {
      "epoch": 1.2526132404181185,
      "grad_norm": 3.567591667175293,
      "learning_rate": 6.976925271109072e-06,
      "loss": 0.9459,
      "step": 91
    },
    {
      "epoch": 1.2665505226480835,
      "grad_norm": 2.817159414291382,
      "learning_rate": 6.8967400250468335e-06,
      "loss": 0.968,
      "step": 92
    },
    {
      "epoch": 1.2804878048780488,
      "grad_norm": 3.8883535861968994,
      "learning_rate": 6.816236153581568e-06,
      "loss": 0.938,
      "step": 93
    },
    {
      "epoch": 1.294425087108014,
      "grad_norm": 2.920037269592285,
      "learning_rate": 6.735432021318023e-06,
      "loss": 0.9401,
      "step": 94
    },
    {
      "epoch": 1.3083623693379791,
      "grad_norm": 2.851327896118164,
      "learning_rate": 6.654346061356661e-06,
      "loss": 0.9636,
      "step": 95
    },
    {
      "epoch": 1.3222996515679442,
      "grad_norm": 3.1404778957366943,
      "learning_rate": 6.572996771088706e-06,
      "loss": 0.9665,
      "step": 96
    },
    {
      "epoch": 1.3362369337979094,
      "grad_norm": 4.205505847930908,
      "learning_rate": 6.491402707976482e-06,
      "loss": 0.945,
      "step": 97
    },
    {
      "epoch": 1.3501742160278747,
      "grad_norm": 2.9891207218170166,
      "learning_rate": 6.409582485320087e-06,
      "loss": 0.9554,
      "step": 98
    },
    {
      "epoch": 1.3641114982578397,
      "grad_norm": 2.9016053676605225,
      "learning_rate": 6.327554768011307e-06,
      "loss": 0.9613,
      "step": 99
    },
    {
      "epoch": 1.3780487804878048,
      "grad_norm": 2.954751968383789,
      "learning_rate": 6.245338268275765e-06,
      "loss": 0.9358,
      "step": 100
    },
    {
      "epoch": 1.39198606271777,
      "grad_norm": 3.2333548069000244,
      "learning_rate": 6.162951741404276e-06,
      "loss": 0.9573,
      "step": 101
    },
    {
      "epoch": 1.4059233449477353,
      "grad_norm": 3.4226996898651123,
      "learning_rate": 6.080413981474379e-06,
      "loss": 0.9294,
      "step": 102
    },
    {
      "epoch": 1.4198606271777003,
      "grad_norm": 3.169379711151123,
      "learning_rate": 5.9977438170630085e-06,
      "loss": 0.9195,
      "step": 103
    },
    {
      "epoch": 1.4337979094076654,
      "grad_norm": 3.147132396697998,
      "learning_rate": 5.914960106951313e-06,
      "loss": 0.9524,
      "step": 104
    },
    {
      "epoch": 1.4477351916376306,
      "grad_norm": 2.7678744792938232,
      "learning_rate": 5.832081735822573e-06,
      "loss": 0.9234,
      "step": 105
    },
    {
      "epoch": 1.461672473867596,
      "grad_norm": 2.8116793632507324,
      "learning_rate": 5.749127609954215e-06,
      "loss": 0.9619,
      "step": 106
    },
    {
      "epoch": 1.475609756097561,
      "grad_norm": 3.6426944732666016,
      "learning_rate": 5.666116652904889e-06,
      "loss": 0.9435,
      "step": 107
    },
    {
      "epoch": 1.489547038327526,
      "grad_norm": 4.532324314117432,
      "learning_rate": 5.5830678011976225e-06,
      "loss": 0.9538,
      "step": 108
    },
    {
      "epoch": 1.5034843205574913,
      "grad_norm": 3.7293591499328613,
      "learning_rate": 5.5e-06,
      "loss": 0.9595,
      "step": 109
    },
    {
      "epoch": 1.5174216027874565,
      "grad_norm": 2.8792057037353516,
      "learning_rate": 5.416932198802378e-06,
      "loss": 0.9498,
      "step": 110
    },
    {
      "epoch": 1.5313588850174216,
      "grad_norm": 2.9074580669403076,
      "learning_rate": 5.333883347095112e-06,
      "loss": 0.9572,
      "step": 111
    },
    {
      "epoch": 1.5452961672473866,
      "grad_norm": 4.807355880737305,
      "learning_rate": 5.250872390045787e-06,
      "loss": 0.9527,
      "step": 112
    },
    {
      "epoch": 1.5592334494773519,
      "grad_norm": 4.229637622833252,
      "learning_rate": 5.167918264177426e-06,
      "loss": 0.9275,
      "step": 113
    },
    {
      "epoch": 1.5731707317073171,
      "grad_norm": 3.07944393157959,
      "learning_rate": 5.085039893048687e-06,
      "loss": 0.9565,
      "step": 114
    },
    {
      "epoch": 1.5871080139372822,
      "grad_norm": 2.8650782108306885,
      "learning_rate": 5.002256182936992e-06,
      "loss": 0.9493,
      "step": 115
    },
    {
      "epoch": 1.6010452961672472,
      "grad_norm": 2.928194284439087,
      "learning_rate": 4.919586018525621e-06,
      "loss": 0.9513,
      "step": 116
    },
    {
      "epoch": 1.6149825783972127,
      "grad_norm": 3.433330535888672,
      "learning_rate": 4.837048258595723e-06,
      "loss": 0.978,
      "step": 117
    },
    {
      "epoch": 1.6289198606271778,
      "grad_norm": 2.8581085205078125,
      "learning_rate": 4.754661731724237e-06,
      "loss": 0.9366,
      "step": 118
    },
    {
      "epoch": 1.6428571428571428,
      "grad_norm": 2.897559404373169,
      "learning_rate": 4.672445231988693e-06,
      "loss": 0.9641,
      "step": 119
    },
    {
      "epoch": 1.656794425087108,
      "grad_norm": 2.8979365825653076,
      "learning_rate": 4.590417514679912e-06,
      "loss": 0.9316,
      "step": 120
    },
    {
      "epoch": 1.6707317073170733,
      "grad_norm": 3.090183734893799,
      "learning_rate": 4.508597292023518e-06,
      "loss": 0.9545,
      "step": 121
    },
    {
      "epoch": 1.6846689895470384,
      "grad_norm": 2.9215645790100098,
      "learning_rate": 4.427003228911295e-06,
      "loss": 0.9169,
      "step": 122
    },
    {
      "epoch": 1.6986062717770034,
      "grad_norm": 4.066997528076172,
      "learning_rate": 4.345653938643339e-06,
      "loss": 0.9357,
      "step": 123
    },
    {
      "epoch": 1.7125435540069687,
      "grad_norm": 3.2360036373138428,
      "learning_rate": 4.2645679786819796e-06,
      "loss": 0.9205,
      "step": 124
    },
    {
      "epoch": 1.726480836236934,
      "grad_norm": 2.890538215637207,
      "learning_rate": 4.1837638464184334e-06,
      "loss": 0.9355,
      "step": 125
    },
    {
      "epoch": 1.740418118466899,
      "grad_norm": 3.4630892276763916,
      "learning_rate": 4.103259974953166e-06,
      "loss": 0.9263,
      "step": 126
    },
    {
      "epoch": 1.754355400696864,
      "grad_norm": 3.326503276824951,
      "learning_rate": 4.023074728890927e-06,
      "loss": 0.9154,
      "step": 127
    },
    {
      "epoch": 1.7682926829268293,
      "grad_norm": 3.6348044872283936,
      "learning_rate": 3.943226400151388e-06,
      "loss": 0.944,
      "step": 128
    },
    {
      "epoch": 1.7822299651567945,
      "grad_norm": 3.0831995010375977,
      "learning_rate": 3.863733203796385e-06,
      "loss": 0.9457,
      "step": 129
    },
    {
      "epoch": 1.7961672473867596,
      "grad_norm": 3.1765310764312744,
      "learning_rate": 3.784613273874681e-06,
      "loss": 0.9329,
      "step": 130
    },
    {
      "epoch": 1.8101045296167246,
      "grad_norm": 2.767540693283081,
      "learning_rate": 3.70588465928522e-06,
      "loss": 0.9285,
      "step": 131
    },
    {
      "epoch": 1.82404181184669,
      "grad_norm": 2.718041181564331,
      "learning_rate": 3.6275653196597856e-06,
      "loss": 0.9767,
      "step": 132
    },
    {
      "epoch": 1.8379790940766552,
      "grad_norm": 4.337693691253662,
      "learning_rate": 3.5496731212660538e-06,
      "loss": 0.9351,
      "step": 133
    },
    {
      "epoch": 1.8519163763066202,
      "grad_norm": 3.106288194656372,
      "learning_rate": 3.472225832931907e-06,
      "loss": 0.9554,
      "step": 134
    },
    {
      "epoch": 1.8658536585365852,
      "grad_norm": 3.2049343585968018,
      "learning_rate": 3.3952411219920066e-06,
      "loss": 0.9601,
      "step": 135
    },
    {
      "epoch": 1.8797909407665505,
      "grad_norm": 4.705850601196289,
      "learning_rate": 3.318736550257507e-06,
      "loss": 0.9595,
      "step": 136
    },
    {
      "epoch": 1.8937282229965158,
      "grad_norm": 3.335909128189087,
      "learning_rate": 3.2427295700098385e-06,
      "loss": 0.9889,
      "step": 137
    },
    {
      "epoch": 1.9076655052264808,
      "grad_norm": 2.938302993774414,
      "learning_rate": 3.1672375200194797e-06,
      "loss": 0.9306,
      "step": 138
    },
    {
      "epoch": 1.9216027874564459,
      "grad_norm": 3.15813946723938,
      "learning_rate": 3.092277621590627e-06,
      "loss": 0.9332,
      "step": 139
    },
    {
      "epoch": 1.9355400696864111,
      "grad_norm": 2.869309902191162,
      "learning_rate": 3.0178669746326567e-06,
      "loss": 0.9496,
      "step": 140
    },
    {
      "epoch": 1.9494773519163764,
      "grad_norm": 2.8482539653778076,
      "learning_rate": 2.9440225537592728e-06,
      "loss": 0.9406,
      "step": 141
    },
    {
      "epoch": 1.9634146341463414,
      "grad_norm": 3.2523257732391357,
      "learning_rate": 2.8707612044162595e-06,
      "loss": 0.9377,
      "step": 142
    },
    {
      "epoch": 1.9773519163763065,
      "grad_norm": 3.42722487449646,
      "learning_rate": 2.7980996390386755e-06,
      "loss": 0.9622,
      "step": 143
    },
    {
      "epoch": 1.9912891986062717,
      "grad_norm": 3.1534297466278076,
      "learning_rate": 2.7260544332383964e-06,
      "loss": 0.9219,
      "step": 144
    },
    {
      "epoch": 2.005226480836237,
      "grad_norm": 2.9020955562591553,
      "learning_rate": 2.654642022022887e-06,
      "loss": 0.9233,
      "step": 145
    },
    {
      "epoch": 2.0034843205574915,
      "grad_norm": 6.1046247482299805,
      "learning_rate": 2.5838786960460267e-06,
      "loss": 0.9226,
      "step": 146
    },
    {
      "epoch": 2.0174216027874565,
      "grad_norm": 2.791771411895752,
      "learning_rate": 2.513780597891867e-06,
      "loss": 0.8854,
      "step": 147
    },
    {
      "epoch": 2.0313588850174216,
      "grad_norm": 4.039772033691406,
      "learning_rate": 2.444363718392189e-06,
      "loss": 0.9067,
      "step": 148
    },
    {
      "epoch": 2.0452961672473866,
      "grad_norm": 3.059945821762085,
      "learning_rate": 2.3756438929786434e-06,
      "loss": 0.872,
      "step": 149
    },
    {
      "epoch": 2.059233449477352,
      "grad_norm": 3.355379819869995,
      "learning_rate": 2.3076367980703774e-06,
      "loss": 0.8897,
      "step": 150
    },
    {
      "epoch": 2.073170731707317,
      "grad_norm": 3.4992244243621826,
      "learning_rate": 2.240357947497908e-06,
      "loss": 0.9425,
      "step": 151
    },
    {
      "epoch": 2.087108013937282,
      "grad_norm": 3.829111099243164,
      "learning_rate": 2.173822688964094e-06,
      "loss": 0.9141,
      "step": 152
    },
    {
      "epoch": 2.1010452961672472,
      "grad_norm": 2.9267561435699463,
      "learning_rate": 2.108046200543013e-06,
      "loss": 0.9238,
      "step": 153
    },
    {
      "epoch": 2.1149825783972127,
      "grad_norm": 3.307797908782959,
      "learning_rate": 2.0430434872175245e-06,
      "loss": 0.9014,
      "step": 154
    },
    {
      "epoch": 2.1289198606271778,
      "grad_norm": 3.0536694526672363,
      "learning_rate": 1.9788293774563163e-06,
      "loss": 0.8967,
      "step": 155
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 3.2300519943237305,
      "learning_rate": 1.9154185198312327e-06,
      "loss": 0.8817,
      "step": 156
    },
    {
      "epoch": 2.156794425087108,
      "grad_norm": 3.410097599029541,
      "learning_rate": 1.8528253796756277e-06,
      "loss": 0.9151,
      "step": 157
    },
    {
      "epoch": 2.1707317073170733,
      "grad_norm": 3.1198341846466064,
      "learning_rate": 1.7910642357845122e-06,
      "loss": 0.9359,
      "step": 158
    },
    {
      "epoch": 2.1846689895470384,
      "grad_norm": 3.0908079147338867,
      "learning_rate": 1.7301491771572808e-06,
      "loss": 0.8697,
      "step": 159
    },
    {
      "epoch": 2.1986062717770034,
      "grad_norm": 45.06842803955078,
      "learning_rate": 1.67009409978371e-06,
      "loss": 0.8753,
      "step": 160
    },
    {
      "epoch": 2.2125435540069684,
      "grad_norm": 3.0929031372070312,
      "learning_rate": 1.610912703473989e-06,
      "loss": 0.9032,
      "step": 161
    },
    {
      "epoch": 2.226480836236934,
      "grad_norm": 3.174290180206299,
      "learning_rate": 1.5526184887335188e-06,
      "loss": 0.9066,
      "step": 162
    },
    {
      "epoch": 2.240418118466899,
      "grad_norm": 3.2078001499176025,
      "learning_rate": 1.4952247536831645e-06,
      "loss": 0.8793,
      "step": 163
    },
    {
      "epoch": 2.254355400696864,
      "grad_norm": 3.8318631649017334,
      "learning_rate": 1.438744591025668e-06,
      "loss": 0.8881,
      "step": 164
    },
    {
      "epoch": 2.2682926829268295,
      "grad_norm": 3.460146188735962,
      "learning_rate": 1.3831908850589433e-06,
      "loss": 0.9087,
      "step": 165
    },
    {
      "epoch": 2.2822299651567945,
      "grad_norm": 3.1915969848632812,
      "learning_rate": 1.3285763087368974e-06,
      "loss": 0.9007,
      "step": 166
    },
    {
      "epoch": 2.2961672473867596,
      "grad_norm": 3.0315561294555664,
      "learning_rate": 1.2749133207784575e-06,
      "loss": 0.879,
      "step": 167
    },
    {
      "epoch": 2.3101045296167246,
      "grad_norm": 4.681447505950928,
      "learning_rate": 1.2222141628254902e-06,
      "loss": 0.9268,
      "step": 168
    },
    {
      "epoch": 2.3240418118466897,
      "grad_norm": 2.946652889251709,
      "learning_rate": 1.1704908566502246e-06,
      "loss": 0.8952,
      "step": 169
    },
    {
      "epoch": 2.337979094076655,
      "grad_norm": 2.944952964782715,
      "learning_rate": 1.1197552014128314e-06,
      "loss": 0.8807,
      "step": 170
    },
    {
      "epoch": 2.35191637630662,
      "grad_norm": 3.2808735370635986,
      "learning_rate": 1.0700187709697969e-06,
      "loss": 0.8722,
      "step": 171
    },
    {
      "epoch": 2.3658536585365852,
      "grad_norm": 3.887355327606201,
      "learning_rate": 1.0212929112336848e-06,
      "loss": 0.8863,
      "step": 172
    },
    {
      "epoch": 2.3797909407665507,
      "grad_norm": 3.0687787532806396,
      "learning_rate": 9.7358873758489e-07,
      "loss": 0.9093,
      "step": 173
    },
    {
      "epoch": 2.3937282229965158,
      "grad_norm": 2.890662431716919,
      "learning_rate": 9.269171323360006e-07,
      "loss": 0.8987,
      "step": 174
    },
    {
      "epoch": 2.407665505226481,
      "grad_norm": 3.5224292278289795,
      "learning_rate": 8.812887422493117e-07,
      "loss": 0.9008,
      "step": 175
    },
    {
      "epoch": 2.421602787456446,
      "grad_norm": 7.938294410705566,
      "learning_rate": 8.367139761080734e-07,
      "loss": 0.8774,
      "step": 176
    },
    {
      "epoch": 2.435540069686411,
      "grad_norm": 3.240544557571411,
      "learning_rate": 7.932030023420393e-07,
      "loss": 0.9178,
      "step": 177
    },
    {
      "epoch": 2.4494773519163764,
      "grad_norm": 3.17809796333313,
      "learning_rate": 7.507657467078292e-07,
      "loss": 0.8881,
      "step": 178
    },
    {
      "epoch": 2.4634146341463414,
      "grad_norm": 3.075671911239624,
      "learning_rate": 7.094118900246642e-07,
      "loss": 0.94,
      "step": 179
    },
    {
      "epoch": 2.4773519163763065,
      "grad_norm": 3.6002347469329834,
      "learning_rate": 6.691508659659682e-07,
      "loss": 0.8895,
      "step": 180
    }
  ],
  "logging_steps": 1,
  "max_steps": 213,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 36,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.164757741828047e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}