|
{ |
|
"best_metric": NaN, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-50", |
|
"epoch": 0.02817298211015636, |
|
"eval_steps": 50, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0001408649105507818, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0100000000000002e-05, |
|
"loss": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0001408649105507818, |
|
"eval_loss": NaN, |
|
"eval_runtime": 995.0245, |
|
"eval_samples_per_second": 12.017, |
|
"eval_steps_per_second": 3.005, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0002817298211015636, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0200000000000003e-05, |
|
"loss": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0004225947316523454, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0299999999999998e-05, |
|
"loss": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0005634596422031273, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.0400000000000006e-05, |
|
"loss": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.000704324552753909, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0008451894633046908, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.0599999999999996e-05, |
|
"loss": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0009860543738554725, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.07e-05, |
|
"loss": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0011269192844062545, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.080000000000001e-05, |
|
"loss": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0012677841949570363, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.09e-05, |
|
"loss": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.001408649105507818, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.000101, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0015495140160585999, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010099309690211968, |
|
"loss": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0016903789266093816, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010097238949571676, |
|
"loss": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0018312438371601634, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010093788344198939, |
|
"loss": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.001972108747710945, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010088958817454812, |
|
"loss": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.002112973658261727, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010082751689683683, |
|
"loss": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.002253838568812509, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010075168657852308, |
|
"loss": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0023947034793632906, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010066211795085874, |
|
"loss": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0025355683899140726, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010055883550101226, |
|
"loss": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.002676433300464854, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010044186746537416, |
|
"loss": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.002817298211015636, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010031124582183748, |
|
"loss": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0029581631215664177, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010016700628105531, |
|
"loss": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0030990280321171997, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010000918827667787, |
|
"loss": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0032398929426679813, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.983783495457178e-05, |
|
"loss": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0033807578532187633, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.96529931610243e-05, |
|
"loss": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.003521622763769545, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.945471342993618e-05, |
|
"loss": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.003662487674320327, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.92430499690061e-05, |
|
"loss": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0038033525848711084, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.901806064491084e-05, |
|
"loss": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00394421749542189, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.877980696748506e-05, |
|
"loss": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0040850824059726724, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.852835407290526e-05, |
|
"loss": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.004225947316523454, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.826377070588204e-05, |
|
"loss": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.004366812227074236, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.798612920086614e-05, |
|
"loss": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.004507677137625018, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.769550546227278e-05, |
|
"loss": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0046485420481758, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.739197894373021e-05, |
|
"loss": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.004789406958726581, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.707563262635793e-05, |
|
"loss": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.004930271869277363, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.67465529960804e-05, |
|
"loss": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.005071136779828145, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.640483001998271e-05, |
|
"loss": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.005212001690378927, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.605055712171443e-05, |
|
"loss": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.005352866600929708, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.568383115594856e-05, |
|
"loss": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.00549373151148049, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.53047523819024e-05, |
|
"loss": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.005634596422031272, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.491342443592769e-05, |
|
"loss": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.005775461332582054, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.45099543031775e-05, |
|
"loss": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0059163262431328354, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.40944522883575e-05, |
|
"loss": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.006057191153683617, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.366703198556972e-05, |
|
"loss": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0061980560642343994, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.322781024725723e-05, |
|
"loss": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.006338920974785181, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.27769071522577e-05, |
|
"loss": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.006479785885335963, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.231444597297502e-05, |
|
"loss": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.006620650795886745, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.184055314167797e-05, |
|
"loss": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.006761515706437527, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.135535821593484e-05, |
|
"loss": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.006902380616988308, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.08589938431937e-05, |
|
"loss": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.00704324552753909, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.035159572451788e-05, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00704324552753909, |
|
"eval_loss": NaN, |
|
"eval_runtime": 839.8948, |
|
"eval_samples_per_second": 14.236, |
|
"eval_steps_per_second": 3.56, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.007184110438089872, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.983330257748669e-05, |
|
"loss": 0.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.007324975348640654, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.930425609827138e-05, |
|
"loss": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.007465840259191435, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.876460092289691e-05, |
|
"loss": 0.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.007606705169742217, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.821448458769978e-05, |
|
"loss": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.007747570080292999, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.765405748899315e-05, |
|
"loss": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.00788843499084378, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.708347284195e-05, |
|
"loss": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.008029299901394563, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.650288663871555e-05, |
|
"loss": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.008170164811945345, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.591245760576067e-05, |
|
"loss": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.008311029722496126, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.531234716048757e-05, |
|
"loss": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.008451894633046908, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.470271936709994e-05, |
|
"loss": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00859275954359769, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.408374089174933e-05, |
|
"loss": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.008733624454148471, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.345558095697051e-05, |
|
"loss": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.008874489364699253, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.281841129541749e-05, |
|
"loss": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.009015354275250036, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.217240610291362e-05, |
|
"loss": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.009156219185800818, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.151774199082823e-05, |
|
"loss": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0092970840963516, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.085459793779277e-05, |
|
"loss": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.00943794900690238, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.018315524076989e-05, |
|
"loss": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.009578813917453162, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.950359746548865e-05, |
|
"loss": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.009719678828003944, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.881611039625947e-05, |
|
"loss": 0.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.009860543738554725, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.812088198518258e-05, |
|
"loss": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.010001408649105507, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.741810230076368e-05, |
|
"loss": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.01014227355965629, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.670796347595137e-05, |
|
"loss": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.010283138470207072, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.599065965560962e-05, |
|
"loss": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.010424003380757853, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.526638694344066e-05, |
|
"loss": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.010564868291308635, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.453534334837223e-05, |
|
"loss": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.010705733201859417, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.379772873042374e-05, |
|
"loss": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.010846598112410198, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.305374474606674e-05, |
|
"loss": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.01098746302296098, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.230359479309389e-05, |
|
"loss": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.011128327933511763, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.154748395501217e-05, |
|
"loss": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.011269192844062545, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.078561894497497e-05, |
|
"loss": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.011410057754613326, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.001820804926883e-05, |
|
"loss": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.011550922665164108, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.924546107037015e-05, |
|
"loss": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.01169178757571489, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.846758926958709e-05, |
|
"loss": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.011832652486265671, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.768480530930298e-05, |
|
"loss": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.011973517396816452, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.689732319483653e-05, |
|
"loss": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.012114382307367234, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.610535821593485e-05, |
|
"loss": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.012255247217918017, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.530912688791548e-05, |
|
"loss": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.012396112128468799, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.450884689247316e-05, |
|
"loss": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.01253697703901958, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.37047370181679e-05, |
|
"loss": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.012677841949570362, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.289701710061036e-05, |
|
"loss": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.012818706860121144, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.208590796236096e-05, |
|
"loss": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.012959571770671925, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.127163135255923e-05, |
|
"loss": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.013100436681222707, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.045440988629975e-05, |
|
"loss": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.01324130159177349, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.9634466983771556e-05, |
|
"loss": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.013382166502324272, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.881202680917707e-05, |
|
"loss": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.013523031412875053, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.7987314209448023e-05, |
|
"loss": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.013663896323425835, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.716055465277449e-05, |
|
"loss": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.013804761233976616, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.633197416696411e-05, |
|
"loss": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.013945626144527398, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5501799277648376e-05, |
|
"loss": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.01408649105507818, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.467025694635279e-05, |
|
"loss": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01408649105507818, |
|
"eval_loss": NaN, |
|
"eval_runtime": 673.3728, |
|
"eval_samples_per_second": 17.757, |
|
"eval_steps_per_second": 4.44, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.014227355965628961, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.383757450844782e-05, |
|
"loss": 0.0, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.014368220876179744, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.300397961099773e-05, |
|
"loss": 0.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.014509085786730526, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.216970015052406e-05, |
|
"loss": 0.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.014649950697281307, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.133496421070111e-05, |
|
"loss": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.014790815607832089, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.01493168051838287, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.96650357892989e-05, |
|
"loss": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.015072545428933652, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.8830299849475936e-05, |
|
"loss": 0.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.015213410339484434, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.799602038900227e-05, |
|
"loss": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.015354275250035217, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.716242549155218e-05, |
|
"loss": 0.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.015495140160585999, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.632974305364722e-05, |
|
"loss": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01563600507113678, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.549820072235163e-05, |
|
"loss": 0.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.01577686998168756, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.4668025833035906e-05, |
|
"loss": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.015917734892238345, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.383944534722552e-05, |
|
"loss": 0.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.016058599802789127, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.301268579055198e-05, |
|
"loss": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.016199464713339908, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.218797319082293e-05, |
|
"loss": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.01634032962389069, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.1365533016228466e-05, |
|
"loss": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.01648119453444147, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.0545590113700254e-05, |
|
"loss": 0.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.016622059444992253, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.972836864744079e-05, |
|
"loss": 0.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.016762924355543034, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.891409203763905e-05, |
|
"loss": 0.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.016903789266093816, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.810298289938965e-05, |
|
"loss": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.017044654176644598, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.72952629818321e-05, |
|
"loss": 0.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.01718551908719538, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.649115310752686e-05, |
|
"loss": 0.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.01732638399774616, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5690873112084536e-05, |
|
"loss": 0.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.017467248908296942, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.489464178406516e-05, |
|
"loss": 0.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.017608113818847724, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.410267680516349e-05, |
|
"loss": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.017748978729398505, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.3315194690697024e-05, |
|
"loss": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.017889843639949287, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.253241073041291e-05, |
|
"loss": 0.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.018030708550500072, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.175453892962985e-05, |
|
"loss": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.018171573461050854, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.098179195073118e-05, |
|
"loss": 0.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.018312438371601635, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0214381055025054e-05, |
|
"loss": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.018453303282152417, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.9452516044987844e-05, |
|
"loss": 0.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.0185941681927032, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.8696405206906116e-05, |
|
"loss": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.01873503310325398, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7946255253933275e-05, |
|
"loss": 0.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.01887589801380476, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7202271269576275e-05, |
|
"loss": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.019016762924355543, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6464656651627787e-05, |
|
"loss": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.019157627834906325, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5733613056559357e-05, |
|
"loss": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.019298492745457106, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5009340344390407e-05, |
|
"loss": 0.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.019439357656007888, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.4292036524048648e-05, |
|
"loss": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.01958022256655867, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3581897699236327e-05, |
|
"loss": 0.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.01972108747710945, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.287911801481745e-05, |
|
"loss": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.019861952387660232, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.2183889603740534e-05, |
|
"loss": 0.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.020002817298211014, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.149640253451135e-05, |
|
"loss": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.0201436822087618, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0816844759230112e-05, |
|
"loss": 0.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.02028454711931258, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0145402062207232e-05, |
|
"loss": 0.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.020425412029863362, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9482258009171774e-05, |
|
"loss": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.020566276940414144, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.882759389708638e-05, |
|
"loss": 0.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.020707141850964925, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.818158870458251e-05, |
|
"loss": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.020848006761515707, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.754441904302948e-05, |
|
"loss": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.02098887167206649, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.691625910825066e-05, |
|
"loss": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.02112973658261727, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6297280632900087e-05, |
|
"loss": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02112973658261727, |
|
"eval_loss": NaN, |
|
"eval_runtime": 1071.658, |
|
"eval_samples_per_second": 11.157, |
|
"eval_steps_per_second": 2.79, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02127060149316805, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5687652839512427e-05, |
|
"loss": 0.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.021411466403718833, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5087542394239326e-05, |
|
"loss": 0.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.021552331314269615, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.449711336128445e-05, |
|
"loss": 0.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.021693196224820396, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3916527158050007e-05, |
|
"loss": 0.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.021834061135371178, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3345942511006854e-05, |
|
"loss": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.02197492604592196, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2785515412300245e-05, |
|
"loss": 0.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.02211579095647274, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2235399077103106e-05, |
|
"loss": 0.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.022256655867023526, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1695743901728631e-05, |
|
"loss": 0.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.022397520777574308, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1166697422513329e-05, |
|
"loss": 0.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.02253838568812509, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.064840427548213e-05, |
|
"loss": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.02267925059867587, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0141006156806303e-05, |
|
"loss": 0.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.022820115509226652, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.64464178406516e-06, |
|
"loss": 0.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.022960980419777434, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.159446858322036e-06, |
|
"loss": 0.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.023101845330328215, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.685554027024989e-06, |
|
"loss": 0.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.023242710240878997, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.22309284774231e-06, |
|
"loss": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.02338357515142978, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.772189752742756e-06, |
|
"loss": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.02352444006198056, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.332968014430274e-06, |
|
"loss": 0.0, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.023665304972531342, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.905547711642518e-06, |
|
"loss": 0.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.023806169883082123, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.490045696822492e-06, |
|
"loss": 0.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.023947034793632905, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.086575564072307e-06, |
|
"loss": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.024087899704183686, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6952476180976035e-06, |
|
"loss": 0.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.024228764614734468, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.316168844051445e-06, |
|
"loss": 0.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.024369629525285253, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.949442878285576e-06, |
|
"loss": 0.0, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.024510494435836035, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.5951699800172935e-06, |
|
"loss": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.024651359346386816, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.253447003919596e-06, |
|
"loss": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.024792224256937598, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.924367373642071e-06, |
|
"loss": 0.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.02493308916748838, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.6080210562697984e-06, |
|
"loss": 0.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.02507395407803916, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.3044945377272327e-06, |
|
"loss": 0.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.025214818988589943, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.013870799133861e-06, |
|
"loss": 0.0, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.025355683899140724, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.736229294117951e-06, |
|
"loss": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.025496548809691506, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.4716459270947466e-06, |
|
"loss": 0.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.025637413720242287, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.220193032514939e-06, |
|
"loss": 0.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.02577827863079307, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9819393550891686e-06, |
|
"loss": 0.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.02591914354134385, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.7569500309938975e-06, |
|
"loss": 0.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.026060008451894632, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5452865700638161e-06, |
|
"loss": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.026200873362445413, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.34700683897571e-06, |
|
"loss": 0.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.026341738272996195, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.162165045428237e-06, |
|
"loss": 0.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.02648260318354698, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.908117233221274e-07, |
|
"loss": 0.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.02662346809409776, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.329937189446904e-07, |
|
"loss": 0.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.026764333004648543, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.887541781625227e-07, |
|
"loss": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.026905197915199325, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.581325346258412e-07, |
|
"loss": 0.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.027046062825750106, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.411644989877527e-07, |
|
"loss": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.027186927736300888, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.378820491412738e-07, |
|
"loss": 0.0, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.02732779264685167, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.483134214769235e-07, |
|
"loss": 0.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.02746865755740245, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.7248310316317272e-07, |
|
"loss": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.027609522467953233, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.10411825451886e-07, |
|
"loss": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.027750387378504014, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.211655801061078e-08, |
|
"loss": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.027891252289054796, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.761050428323453e-08, |
|
"loss": 0.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.028032117199605577, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.90309788031529e-09, |
|
"loss": 0.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.02817298211015636, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02817298211015636, |
|
"eval_loss": NaN, |
|
"eval_runtime": 1347.7679, |
|
"eval_samples_per_second": 8.872, |
|
"eval_steps_per_second": 2.218, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 3 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.287627183469363e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|