|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9974985568597268, |
|
"eval_steps": 500, |
|
"global_step": 324, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.3905354112155957, |
|
"learning_rate": 1e-05, |
|
"loss": 1.5601, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.2069311238141744, |
|
"learning_rate": 2e-05, |
|
"loss": 1.5425, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.329876685543184, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5518, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.3279617694098214, |
|
"learning_rate": 4e-05, |
|
"loss": 1.4658, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.3546782512787179, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4736, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.6365619709572907, |
|
"learning_rate": 6e-05, |
|
"loss": 1.4756, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.2081088328094727, |
|
"learning_rate": 7e-05, |
|
"loss": 1.3752, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.0574546767027917, |
|
"learning_rate": 8e-05, |
|
"loss": 1.439, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.0012858470947548, |
|
"learning_rate": 9e-05, |
|
"loss": 1.4502, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.0136463608511321, |
|
"learning_rate": 0.0001, |
|
"loss": 1.396, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9854986957079499, |
|
"learning_rate": 9.999749748415981e-05, |
|
"loss": 1.4634, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9239702875144448, |
|
"learning_rate": 9.998999018714263e-05, |
|
"loss": 1.4185, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9233186885946617, |
|
"learning_rate": 9.997747886043367e-05, |
|
"loss": 1.3691, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8728080646228755, |
|
"learning_rate": 9.995996475642466e-05, |
|
"loss": 1.3569, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8307436189120798, |
|
"learning_rate": 9.99374496282885e-05, |
|
"loss": 1.3667, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7939681765489442, |
|
"learning_rate": 9.990993572980378e-05, |
|
"loss": 1.3589, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7980526587202514, |
|
"learning_rate": 9.987742581512918e-05, |
|
"loss": 1.395, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7207026961586376, |
|
"learning_rate": 9.983992313852774e-05, |
|
"loss": 1.3887, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7871310923443939, |
|
"learning_rate": 9.979743145404119e-05, |
|
"loss": 1.3062, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7246807835643384, |
|
"learning_rate": 9.974995501511404e-05, |
|
"loss": 1.4028, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7206361737972257, |
|
"learning_rate": 9.969749857416789e-05, |
|
"loss": 1.3398, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6960105391155793, |
|
"learning_rate": 9.964006738212575e-05, |
|
"loss": 1.3469, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7204455050781057, |
|
"learning_rate": 9.957766718788633e-05, |
|
"loss": 1.3765, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5812596677574942, |
|
"learning_rate": 9.951030423774859e-05, |
|
"loss": 1.3379, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.675904894861765, |
|
"learning_rate": 9.943798527478651e-05, |
|
"loss": 1.3789, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6664199931252878, |
|
"learning_rate": 9.936071753817415e-05, |
|
"loss": 1.3535, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.7030363325377104, |
|
"learning_rate": 9.927850876246088e-05, |
|
"loss": 1.218, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6185910243439269, |
|
"learning_rate": 9.919136717679722e-05, |
|
"loss": 1.3501, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6270496500828271, |
|
"learning_rate": 9.909930150411113e-05, |
|
"loss": 1.3628, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6021577621428597, |
|
"learning_rate": 9.900232096023477e-05, |
|
"loss": 1.3428, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6760900210464318, |
|
"learning_rate": 9.890043525298203e-05, |
|
"loss": 1.3079, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5940959503401227, |
|
"learning_rate": 9.879365458117678e-05, |
|
"loss": 1.291, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5427094594745479, |
|
"learning_rate": 9.868198963363189e-05, |
|
"loss": 1.2893, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5958301777472343, |
|
"learning_rate": 9.856545158807938e-05, |
|
"loss": 1.2363, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5850214722817281, |
|
"learning_rate": 9.844405211005146e-05, |
|
"loss": 1.3154, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6172288827652985, |
|
"learning_rate": 9.831780335171279e-05, |
|
"loss": 1.3101, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5728085013088785, |
|
"learning_rate": 9.818671795064404e-05, |
|
"loss": 1.2817, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5979780148525169, |
|
"learning_rate": 9.805080902857699e-05, |
|
"loss": 1.2493, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.537951216091522, |
|
"learning_rate": 9.791009019008078e-05, |
|
"loss": 1.2454, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5879388223555754, |
|
"learning_rate": 9.776457552120033e-05, |
|
"loss": 1.2744, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5828144715679795, |
|
"learning_rate": 9.761427958804621e-05, |
|
"loss": 1.2354, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5640545987289305, |
|
"learning_rate": 9.745921743533653e-05, |
|
"loss": 1.3105, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6077402891626962, |
|
"learning_rate": 9.729940458489104e-05, |
|
"loss": 1.3188, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5649165442230879, |
|
"learning_rate": 9.713485703407731e-05, |
|
"loss": 1.2756, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5751090024359722, |
|
"learning_rate": 9.696559125420948e-05, |
|
"loss": 1.2336, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5415492855376868, |
|
"learning_rate": 9.679162418889931e-05, |
|
"loss": 1.2424, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5742691889803935, |
|
"learning_rate": 9.66129732523603e-05, |
|
"loss": 1.2363, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5763001671560306, |
|
"learning_rate": 9.642965632766436e-05, |
|
"loss": 1.2661, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5940077836695673, |
|
"learning_rate": 9.624169176495184e-05, |
|
"loss": 1.2734, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6005566973905004, |
|
"learning_rate": 9.604909837959455e-05, |
|
"loss": 1.2078, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5429670344508561, |
|
"learning_rate": 9.585189545031238e-05, |
|
"loss": 1.2246, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5871060174846529, |
|
"learning_rate": 9.565010271724352e-05, |
|
"loss": 1.3501, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5907025473048154, |
|
"learning_rate": 9.54437403799684e-05, |
|
"loss": 1.2729, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6516780485358238, |
|
"learning_rate": 9.523282909548773e-05, |
|
"loss": 1.2881, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5670548168352986, |
|
"learning_rate": 9.50173899761547e-05, |
|
"loss": 1.2056, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5390303783246289, |
|
"learning_rate": 9.47974445875617e-05, |
|
"loss": 1.2144, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5646076772019145, |
|
"learning_rate": 9.457301494638147e-05, |
|
"loss": 1.2212, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5612167041328082, |
|
"learning_rate": 9.434412351816328e-05, |
|
"loss": 1.2717, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5263144027788276, |
|
"learning_rate": 9.411079321508414e-05, |
|
"loss": 1.2112, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5356079649939604, |
|
"learning_rate": 9.387304739365523e-05, |
|
"loss": 1.2234, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5615087563747105, |
|
"learning_rate": 9.36309098523839e-05, |
|
"loss": 1.2996, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5314023973665841, |
|
"learning_rate": 9.338440482939146e-05, |
|
"loss": 1.1797, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5539257915730327, |
|
"learning_rate": 9.31335569999869e-05, |
|
"loss": 1.2395, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5648098416860655, |
|
"learning_rate": 9.287839147419686e-05, |
|
"loss": 1.2156, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5865707594533118, |
|
"learning_rate": 9.261893379425218e-05, |
|
"loss": 1.2683, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5440049015077518, |
|
"learning_rate": 9.2355209932031e-05, |
|
"loss": 1.1853, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5440912007358205, |
|
"learning_rate": 9.208724628645902e-05, |
|
"loss": 1.207, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5587040510710671, |
|
"learning_rate": 9.181506968086697e-05, |
|
"loss": 1.1768, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5192024047252568, |
|
"learning_rate": 9.153870736030548e-05, |
|
"loss": 1.2427, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.528230045301502, |
|
"learning_rate": 9.125818698881798e-05, |
|
"loss": 1.2168, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5562754356167662, |
|
"learning_rate": 9.097353664667138e-05, |
|
"loss": 1.2793, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5415713371356312, |
|
"learning_rate": 9.068478482754532e-05, |
|
"loss": 1.2334, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5683526535348254, |
|
"learning_rate": 9.03919604356798e-05, |
|
"loss": 1.2534, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5396091696393609, |
|
"learning_rate": 9.0095092782982e-05, |
|
"loss": 1.2722, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.513589724983156, |
|
"learning_rate": 8.979421158609206e-05, |
|
"loss": 1.2083, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5376666654796496, |
|
"learning_rate": 8.948934696340843e-05, |
|
"loss": 1.1931, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5368664737517322, |
|
"learning_rate": 8.918052943207298e-05, |
|
"loss": 1.2202, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5263413266320822, |
|
"learning_rate": 8.886778990491631e-05, |
|
"loss": 1.1553, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5604541523029579, |
|
"learning_rate": 8.85511596873632e-05, |
|
"loss": 1.2964, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5427504321680312, |
|
"learning_rate": 8.823067047429907e-05, |
|
"loss": 1.2896, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5474275345159585, |
|
"learning_rate": 8.790635434689721e-05, |
|
"loss": 1.2612, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5269654905955425, |
|
"learning_rate": 8.757824376940746e-05, |
|
"loss": 1.1821, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.518380839571753, |
|
"learning_rate": 8.724637158590652e-05, |
|
"loss": 1.2393, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5158889921158187, |
|
"learning_rate": 8.691077101701024e-05, |
|
"loss": 1.2754, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4935757204910804, |
|
"learning_rate": 8.65714756565482e-05, |
|
"loss": 1.2173, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5653934280740491, |
|
"learning_rate": 8.622851946820095e-05, |
|
"loss": 1.2253, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5250362872573285, |
|
"learning_rate": 8.588193678210026e-05, |
|
"loss": 1.1707, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5446362423647451, |
|
"learning_rate": 8.553176229139261e-05, |
|
"loss": 1.1589, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5142920460404157, |
|
"learning_rate": 8.517803104876639e-05, |
|
"loss": 1.1948, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5317755291367837, |
|
"learning_rate": 8.482077846294308e-05, |
|
"loss": 1.1812, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5147982642702206, |
|
"learning_rate": 8.446004029513294e-05, |
|
"loss": 1.2266, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5096116915413985, |
|
"learning_rate": 8.409585265545509e-05, |
|
"loss": 1.179, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.49910238964612424, |
|
"learning_rate": 8.372825199932304e-05, |
|
"loss": 1.2485, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5507929652198581, |
|
"learning_rate": 8.335727512379534e-05, |
|
"loss": 1.2388, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5063227228339493, |
|
"learning_rate": 8.298295916389234e-05, |
|
"loss": 1.2415, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.48774944558422, |
|
"learning_rate": 8.260534158887876e-05, |
|
"loss": 1.1301, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5131731261481731, |
|
"learning_rate": 8.222446019851314e-05, |
|
"loss": 1.2158, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.49068550228239954, |
|
"learning_rate": 8.184035311926396e-05, |
|
"loss": 1.1968, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5288695654598018, |
|
"learning_rate": 8.145305880049328e-05, |
|
"loss": 1.2637, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5479388159051014, |
|
"learning_rate": 8.106261601060772e-05, |
|
"loss": 1.3218, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5621161336370604, |
|
"learning_rate": 8.066906383317801e-05, |
|
"loss": 1.1729, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4924846556483893, |
|
"learning_rate": 8.027244166302642e-05, |
|
"loss": 1.1875, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5037791405351364, |
|
"learning_rate": 7.987278920228349e-05, |
|
"loss": 1.2539, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.522728960991421, |
|
"learning_rate": 7.947014645641379e-05, |
|
"loss": 1.2217, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5182027559256257, |
|
"learning_rate": 7.906455373021129e-05, |
|
"loss": 1.2188, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5539472403444413, |
|
"learning_rate": 7.865605162376486e-05, |
|
"loss": 1.1509, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5439005321857965, |
|
"learning_rate": 7.824468102839419e-05, |
|
"loss": 1.251, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5093593984665626, |
|
"learning_rate": 7.783048312255653e-05, |
|
"loss": 1.229, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5145253165856254, |
|
"learning_rate": 7.741349936772469e-05, |
|
"loss": 1.1824, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5702243863746732, |
|
"learning_rate": 7.699377150423672e-05, |
|
"loss": 1.1582, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5451183103162889, |
|
"learning_rate": 7.65713415471177e-05, |
|
"loss": 1.2122, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5252426501201418, |
|
"learning_rate": 7.614625178187402e-05, |
|
"loss": 1.1833, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.48749296422821287, |
|
"learning_rate": 7.571854476026048e-05, |
|
"loss": 1.2705, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.502002191255964, |
|
"learning_rate": 7.528826329602099e-05, |
|
"loss": 1.2188, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.48918503246260797, |
|
"learning_rate": 7.485545046060271e-05, |
|
"loss": 1.1997, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5000600132443745, |
|
"learning_rate": 7.442014957884472e-05, |
|
"loss": 1.2051, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5463372631998543, |
|
"learning_rate": 7.398240422464109e-05, |
|
"loss": 1.2214, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.4961344121724833, |
|
"learning_rate": 7.354225821657914e-05, |
|
"loss": 1.208, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5899278757904003, |
|
"learning_rate": 7.309975561355312e-05, |
|
"loss": 1.146, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5023010444635025, |
|
"learning_rate": 7.265494071035401e-05, |
|
"loss": 1.1509, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5098342526132932, |
|
"learning_rate": 7.220785803323544e-05, |
|
"loss": 1.1743, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5181510224338065, |
|
"learning_rate": 7.175855233545668e-05, |
|
"loss": 1.1807, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5126009147492929, |
|
"learning_rate": 7.130706859280274e-05, |
|
"loss": 1.1875, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5694155609516423, |
|
"learning_rate": 7.085345199908235e-05, |
|
"loss": 1.1428, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5320176914945207, |
|
"learning_rate": 7.03977479616039e-05, |
|
"loss": 1.219, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5263190337396886, |
|
"learning_rate": 6.994000209663036e-05, |
|
"loss": 1.1709, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5043242059465939, |
|
"learning_rate": 6.948026022481279e-05, |
|
"loss": 1.1968, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5580562389957314, |
|
"learning_rate": 6.901856836660386e-05, |
|
"loss": 1.1494, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.495782343949626, |
|
"learning_rate": 6.855497273765112e-05, |
|
"loss": 1.2119, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.49510379613102157, |
|
"learning_rate": 6.808951974417078e-05, |
|
"loss": 1.2063, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5536322418977505, |
|
"learning_rate": 6.762225597830237e-05, |
|
"loss": 1.2617, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5668665640341709, |
|
"learning_rate": 6.715322821344494e-05, |
|
"loss": 1.1443, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5083973254687169, |
|
"learning_rate": 6.668248339957491e-05, |
|
"loss": 1.2588, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5188609075674584, |
|
"learning_rate": 6.621006865854644e-05, |
|
"loss": 1.1726, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5083154005353754, |
|
"learning_rate": 6.573603127937442e-05, |
|
"loss": 1.1953, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.544802137981313, |
|
"learning_rate": 6.526041871350086e-05, |
|
"loss": 1.1853, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.4692918589408204, |
|
"learning_rate": 6.478327857004495e-05, |
|
"loss": 1.1267, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.504019443454899, |
|
"learning_rate": 6.43046586110374e-05, |
|
"loss": 1.2085, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.5071221863228176, |
|
"learning_rate": 6.382460674663932e-05, |
|
"loss": 1.2026, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.5146798822767533, |
|
"learning_rate": 6.334317103034652e-05, |
|
"loss": 1.1177, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.4996775842843673, |
|
"learning_rate": 6.286039965417925e-05, |
|
"loss": 1.1711, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5288823443570614, |
|
"learning_rate": 6.237634094385813e-05, |
|
"loss": 1.1199, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5214584824734313, |
|
"learning_rate": 6.18910433539668e-05, |
|
"loss": 1.1199, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5287174552858058, |
|
"learning_rate": 6.140455546310148e-05, |
|
"loss": 1.2134, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5254574455344191, |
|
"learning_rate": 6.0916925969008275e-05, |
|
"loss": 1.2578, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5189177716169284, |
|
"learning_rate": 6.042820368370854e-05, |
|
"loss": 1.2236, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5680190447979175, |
|
"learning_rate": 5.993843752861266e-05, |
|
"loss": 1.1443, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.471841513545942, |
|
"learning_rate": 5.944767652962309e-05, |
|
"loss": 1.1733, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.5131643851734395, |
|
"learning_rate": 5.895596981222678e-05, |
|
"loss": 1.1592, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.5069414300839418, |
|
"learning_rate": 5.8463366596577706e-05, |
|
"loss": 1.2537, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.5239372199223642, |
|
"learning_rate": 5.796991619256985e-05, |
|
"loss": 1.1743, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.5387326952860074, |
|
"learning_rate": 5.747566799490132e-05, |
|
"loss": 1.2075, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.5224076525892895, |
|
"learning_rate": 5.6980671478129853e-05, |
|
"loss": 1.2515, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.5609147216869792, |
|
"learning_rate": 5.648497619172042e-05, |
|
"loss": 1.1836, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.5465488763290158, |
|
"learning_rate": 5.5988631755085264e-05, |
|
"loss": 1.1433, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.5079660248232113, |
|
"learning_rate": 5.549168785261698e-05, |
|
"loss": 1.1812, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.5717154092705009, |
|
"learning_rate": 5.499419422871506e-05, |
|
"loss": 1.2068, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5301493442832448, |
|
"learning_rate": 5.4496200682806495e-05, |
|
"loss": 1.2273, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5098890110687092, |
|
"learning_rate": 5.399775706436076e-05, |
|
"loss": 1.2134, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5365690403371399, |
|
"learning_rate": 5.3498913267899864e-05, |
|
"loss": 1.2051, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5002287628666306, |
|
"learning_rate": 5.299971922800391e-05, |
|
"loss": 1.1255, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5276270904967929, |
|
"learning_rate": 5.250022491431259e-05, |
|
"loss": 1.2124, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5224515486361312, |
|
"learning_rate": 5.200048032652318e-05, |
|
"loss": 1.2559, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.49316666866194575, |
|
"learning_rate": 5.150053548938557e-05, |
|
"loss": 1.1421, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5388196993705506, |
|
"learning_rate": 5.100044044769472e-05, |
|
"loss": 1.2017, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.4944161877205336, |
|
"learning_rate": 5.0500245261281175e-05, |
|
"loss": 1.1838, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5075949586376647, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2698, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.5288349964384436, |
|
"learning_rate": 4.949975473871884e-05, |
|
"loss": 1.2051, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.5298564213314307, |
|
"learning_rate": 4.899955955230529e-05, |
|
"loss": 1.1921, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.5006832164442894, |
|
"learning_rate": 4.849946451061443e-05, |
|
"loss": 1.0999, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5180365116427443, |
|
"learning_rate": 4.799951967347683e-05, |
|
"loss": 1.1248, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5085456414296031, |
|
"learning_rate": 4.749977508568742e-05, |
|
"loss": 1.0989, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5062569741436713, |
|
"learning_rate": 4.7000280771996104e-05, |
|
"loss": 1.1851, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.49915526983252295, |
|
"learning_rate": 4.650108673210015e-05, |
|
"loss": 1.1565, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.530737295373366, |
|
"learning_rate": 4.6002242935639254e-05, |
|
"loss": 1.1367, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.5354173274123866, |
|
"learning_rate": 4.550379931719351e-05, |
|
"loss": 1.2231, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.4910072525351625, |
|
"learning_rate": 4.500580577128495e-05, |
|
"loss": 1.2354, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5227332856552718, |
|
"learning_rate": 4.4508312147383036e-05, |
|
"loss": 1.2178, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5288368214749951, |
|
"learning_rate": 4.4011368244914755e-05, |
|
"loss": 1.1731, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5044952842565356, |
|
"learning_rate": 4.3515023808279586e-05, |
|
"loss": 1.1287, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5325342803705855, |
|
"learning_rate": 4.301932852187016e-05, |
|
"loss": 1.2129, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5415040662560654, |
|
"learning_rate": 4.252433200509869e-05, |
|
"loss": 1.3086, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.51763854212227, |
|
"learning_rate": 4.203008380743016e-05, |
|
"loss": 1.1216, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5422430582487087, |
|
"learning_rate": 4.1536633403422306e-05, |
|
"loss": 1.1111, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5022269460427503, |
|
"learning_rate": 4.104403018777323e-05, |
|
"loss": 1.1963, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5631689643987351, |
|
"learning_rate": 4.0552323470376916e-05, |
|
"loss": 1.261, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.5083595167881586, |
|
"learning_rate": 4.006156247138736e-05, |
|
"loss": 1.2529, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.4982995201328196, |
|
"learning_rate": 3.9571796316291476e-05, |
|
"loss": 1.1938, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.5142407248745989, |
|
"learning_rate": 3.908307403099174e-05, |
|
"loss": 1.231, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.49845909814685674, |
|
"learning_rate": 3.859544453689853e-05, |
|
"loss": 1.1174, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5044393346608876, |
|
"learning_rate": 3.810895664603321e-05, |
|
"loss": 1.1626, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5028906716759755, |
|
"learning_rate": 3.762365905614187e-05, |
|
"loss": 1.1248, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5052094751619233, |
|
"learning_rate": 3.713960034582077e-05, |
|
"loss": 1.1782, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.4927933528160678, |
|
"learning_rate": 3.665682896965349e-05, |
|
"loss": 1.1411, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5194837061389329, |
|
"learning_rate": 3.61753932533607e-05, |
|
"loss": 1.1194, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5003507430931593, |
|
"learning_rate": 3.5695341388962614e-05, |
|
"loss": 1.1438, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.4952909851844929, |
|
"learning_rate": 3.521672142995506e-05, |
|
"loss": 1.1274, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.515058338420701, |
|
"learning_rate": 3.473958128649915e-05, |
|
"loss": 1.2173, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.4649441022469789, |
|
"learning_rate": 3.4263968720625594e-05, |
|
"loss": 1.0901, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5257495392411118, |
|
"learning_rate": 3.378993134145356e-05, |
|
"loss": 1.2124, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.4932641665110047, |
|
"learning_rate": 3.33175166004251e-05, |
|
"loss": 1.2024, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.47986279797474973, |
|
"learning_rate": 3.284677178655507e-05, |
|
"loss": 1.199, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5192526802216362, |
|
"learning_rate": 3.2377744021697644e-05, |
|
"loss": 1.1348, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.49075397683891514, |
|
"learning_rate": 3.1910480255829237e-05, |
|
"loss": 1.209, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.5308583566356849, |
|
"learning_rate": 3.1445027262348894e-05, |
|
"loss": 1.1843, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.4754800140640908, |
|
"learning_rate": 3.098143163339615e-05, |
|
"loss": 1.0713, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.48744339587851926, |
|
"learning_rate": 3.051973977518723e-05, |
|
"loss": 1.1589, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.538169296667757, |
|
"learning_rate": 3.0059997903369656e-05, |
|
"loss": 1.1636, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5174008006678488, |
|
"learning_rate": 2.9602252038396093e-05, |
|
"loss": 1.0088, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5423637270068683, |
|
"learning_rate": 2.914654800091768e-05, |
|
"loss": 1.2637, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.49532340168296196, |
|
"learning_rate": 2.8692931407197275e-05, |
|
"loss": 1.1211, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5223772948936137, |
|
"learning_rate": 2.824144766454333e-05, |
|
"loss": 1.1392, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.5301272448221841, |
|
"learning_rate": 2.7792141966764568e-05, |
|
"loss": 1.1663, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.5241453393589025, |
|
"learning_rate": 2.7345059289646008e-05, |
|
"loss": 1.1375, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.4946190439071369, |
|
"learning_rate": 2.6900244386446904e-05, |
|
"loss": 1.1843, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.4894503682723616, |
|
"learning_rate": 2.6457741783420886e-05, |
|
"loss": 1.1492, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.4852212167430057, |
|
"learning_rate": 2.6017595775358926e-05, |
|
"loss": 1.1592, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5560815990000127, |
|
"learning_rate": 2.5579850421155293e-05, |
|
"loss": 1.1484, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5035735515547093, |
|
"learning_rate": 2.514454953939731e-05, |
|
"loss": 1.1381, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5057189594248047, |
|
"learning_rate": 2.4711736703979018e-05, |
|
"loss": 1.2036, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5116012961964514, |
|
"learning_rate": 2.428145523973952e-05, |
|
"loss": 1.2212, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.48593835956939924, |
|
"learning_rate": 2.3853748218126e-05, |
|
"loss": 1.1505, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5014137656412536, |
|
"learning_rate": 2.342865845288232e-05, |
|
"loss": 1.2246, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5223070185696177, |
|
"learning_rate": 2.3006228495763295e-05, |
|
"loss": 1.158, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5624122056478932, |
|
"learning_rate": 2.258650063227533e-05, |
|
"loss": 1.0566, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5471967527021497, |
|
"learning_rate": 2.2169516877443485e-05, |
|
"loss": 1.1277, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.48337168913421796, |
|
"learning_rate": 2.1755318971605826e-05, |
|
"loss": 1.1956, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5284626857313452, |
|
"learning_rate": 2.1343948376235144e-05, |
|
"loss": 1.196, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5837213719579207, |
|
"learning_rate": 2.0935446269788717e-05, |
|
"loss": 1.0967, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5165785735037434, |
|
"learning_rate": 2.052985354358622e-05, |
|
"loss": 1.2036, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5536196580249753, |
|
"learning_rate": 2.0127210797716524e-05, |
|
"loss": 1.1572, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.4991948055619459, |
|
"learning_rate": 1.9727558336973595e-05, |
|
"loss": 1.2112, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5271303549814744, |
|
"learning_rate": 1.933093616682201e-05, |
|
"loss": 1.0928, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.4755907436318383, |
|
"learning_rate": 1.8937383989392294e-05, |
|
"loss": 1.198, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5071149662683123, |
|
"learning_rate": 1.854694119950675e-05, |
|
"loss": 1.1602, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.541369836697391, |
|
"learning_rate": 1.8159646880736036e-05, |
|
"loss": 1.1282, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.5083789491431415, |
|
"learning_rate": 1.7775539801486867e-05, |
|
"loss": 1.0872, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.5324743232408313, |
|
"learning_rate": 1.739465841112125e-05, |
|
"loss": 1.2222, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.48387142611649014, |
|
"learning_rate": 1.701704083610768e-05, |
|
"loss": 1.158, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5141063576503123, |
|
"learning_rate": 1.664272487620466e-05, |
|
"loss": 1.1162, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5408323267729829, |
|
"learning_rate": 1.6271748000676983e-05, |
|
"loss": 1.02, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.49627165191053163, |
|
"learning_rate": 1.590414734454493e-05, |
|
"loss": 1.0848, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.5376377474797022, |
|
"learning_rate": 1.5539959704867085e-05, |
|
"loss": 1.2019, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.49371506434647033, |
|
"learning_rate": 1.517922153705692e-05, |
|
"loss": 1.2151, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.523592301439241, |
|
"learning_rate": 1.4821968951233638e-05, |
|
"loss": 1.231, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5074105674850246, |
|
"learning_rate": 1.4468237708607397e-05, |
|
"loss": 1.1519, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5066768606133761, |
|
"learning_rate": 1.4118063217899746e-05, |
|
"loss": 1.1492, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5116682098711993, |
|
"learning_rate": 1.3771480531799052e-05, |
|
"loss": 1.1477, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5352752339451914, |
|
"learning_rate": 1.342852434345181e-05, |
|
"loss": 1.0964, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5418770700535751, |
|
"learning_rate": 1.308922898298977e-05, |
|
"loss": 1.0803, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5157514248366463, |
|
"learning_rate": 1.2753628414093489e-05, |
|
"loss": 1.1614, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5351720244073139, |
|
"learning_rate": 1.2421756230592534e-05, |
|
"loss": 1.2224, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5214450914678166, |
|
"learning_rate": 1.2093645653102786e-05, |
|
"loss": 1.2302, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5409585268429792, |
|
"learning_rate": 1.1769329525700935e-05, |
|
"loss": 1.2104, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5220300809084372, |
|
"learning_rate": 1.144884031263681e-05, |
|
"loss": 1.1047, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5376568328092315, |
|
"learning_rate": 1.1132210095083694e-05, |
|
"loss": 1.2002, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5414025185087213, |
|
"learning_rate": 1.081947056792702e-05, |
|
"loss": 1.1833, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.49383703911170834, |
|
"learning_rate": 1.0510653036591583e-05, |
|
"loss": 1.199, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4916712820340113, |
|
"learning_rate": 1.0205788413907952e-05, |
|
"loss": 1.1292, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5168351602558782, |
|
"learning_rate": 9.904907217018e-06, |
|
"loss": 1.2712, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4856958512300068, |
|
"learning_rate": 9.608039564320209e-06, |
|
"loss": 1.1277, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5106017400451819, |
|
"learning_rate": 9.31521517245469e-06, |
|
"loss": 1.1108, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5036900746833531, |
|
"learning_rate": 9.026463353328613e-06, |
|
"loss": 1.1748, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5033379794982934, |
|
"learning_rate": 8.741813011182014e-06, |
|
"loss": 1.1521, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5332335505700774, |
|
"learning_rate": 8.461292639694518e-06, |
|
"loss": 1.0955, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5125213937729919, |
|
"learning_rate": 8.18493031913305e-06, |
|
"loss": 1.1089, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5148264028992737, |
|
"learning_rate": 7.912753713540988e-06, |
|
"loss": 1.177, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5229543225675894, |
|
"learning_rate": 7.644790067969005e-06, |
|
"loss": 1.1646, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.541964343800654, |
|
"learning_rate": 7.381066205747822e-06, |
|
"loss": 1.1187, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5266917236869202, |
|
"learning_rate": 7.1216085258031414e-06, |
|
"loss": 1.0901, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.49725116144459375, |
|
"learning_rate": 6.866443000013117e-06, |
|
"loss": 1.1526, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5012370969649222, |
|
"learning_rate": 6.6155951706085405e-06, |
|
"loss": 1.1689, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5129519814796674, |
|
"learning_rate": 6.369090147616103e-06, |
|
"loss": 1.1348, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.4926628244202811, |
|
"learning_rate": 6.1269526063447765e-06, |
|
"loss": 1.1538, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5420926972187566, |
|
"learning_rate": 5.889206784915862e-06, |
|
"loss": 1.1484, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.4950562048422779, |
|
"learning_rate": 5.6558764818367195e-06, |
|
"loss": 1.2126, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.536783110803186, |
|
"learning_rate": 5.426985053618544e-06, |
|
"loss": 1.1301, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5016583572718184, |
|
"learning_rate": 5.2025554124383095e-06, |
|
"loss": 1.1582, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5263293678305013, |
|
"learning_rate": 4.9826100238453135e-06, |
|
"loss": 1.1577, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5272007649322749, |
|
"learning_rate": 4.767170904512292e-06, |
|
"loss": 1.167, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5063050835599103, |
|
"learning_rate": 4.556259620031617e-06, |
|
"loss": 1.1846, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4910298010180289, |
|
"learning_rate": 4.349897282756487e-06, |
|
"loss": 1.1357, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5379014923521229, |
|
"learning_rate": 4.148104549687626e-06, |
|
"loss": 1.0798, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4865878079832316, |
|
"learning_rate": 3.95090162040545e-06, |
|
"loss": 1.1123, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5282174531059731, |
|
"learning_rate": 3.758308235048158e-06, |
|
"loss": 1.1538, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5010967709298223, |
|
"learning_rate": 3.570343672335641e-06, |
|
"loss": 1.1392, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5189711142846094, |
|
"learning_rate": 3.38702674763971e-06, |
|
"loss": 1.1851, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.49664252429083205, |
|
"learning_rate": 3.2083758111006945e-06, |
|
"loss": 1.11, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.4859267407564821, |
|
"learning_rate": 3.0344087457905346e-06, |
|
"loss": 1.1719, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5151543902141822, |
|
"learning_rate": 2.86514296592269e-06, |
|
"loss": 1.2102, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5256996348656279, |
|
"learning_rate": 2.7005954151089695e-06, |
|
"loss": 1.1709, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5102631445476464, |
|
"learning_rate": 2.54078256466348e-06, |
|
"loss": 1.2034, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5093343267813872, |
|
"learning_rate": 2.3857204119538014e-06, |
|
"loss": 1.0969, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5024898891939928, |
|
"learning_rate": 2.2354244787996748e-06, |
|
"loss": 1.1289, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5279824056783474, |
|
"learning_rate": 2.0899098099192273e-06, |
|
"loss": 1.1672, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5379712149279697, |
|
"learning_rate": 1.9491909714230204e-06, |
|
"loss": 1.1858, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5436100119596193, |
|
"learning_rate": 1.8132820493559521e-06, |
|
"loss": 1.1201, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5015550141769547, |
|
"learning_rate": 1.6821966482872264e-06, |
|
"loss": 1.1797, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5119620808666944, |
|
"learning_rate": 1.5559478899485447e-06, |
|
"loss": 1.1493, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.4977128949907143, |
|
"learning_rate": 1.434548411920622e-06, |
|
"loss": 1.1821, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.509981327731476, |
|
"learning_rate": 1.3180103663681165e-06, |
|
"loss": 1.1377, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5029662223988268, |
|
"learning_rate": 1.206345418823235e-06, |
|
"loss": 1.1526, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5519080967379805, |
|
"learning_rate": 1.099564747017967e-06, |
|
"loss": 1.1282, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5223711089586693, |
|
"learning_rate": 9.976790397652315e-07, |
|
"loss": 1.0811, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.47043707607640073, |
|
"learning_rate": 9.006984958888742e-07, |
|
"loss": 1.1685, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5254608837175258, |
|
"learning_rate": 8.086328232027873e-07, |
|
"loss": 1.0779, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5081931596861774, |
|
"learning_rate": 7.214912375391291e-07, |
|
"loss": 1.1371, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.4922428385525549, |
|
"learning_rate": 6.392824618258519e-07, |
|
"loss": 1.2107, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5274751815037492, |
|
"learning_rate": 5.620147252134889e-07, |
|
"loss": 1.1548, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.4984402486182386, |
|
"learning_rate": 4.896957622514298e-07, |
|
"loss": 1.1189, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.503914938251447, |
|
"learning_rate": 4.2233281211368493e-07, |
|
"loss": 1.1362, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5313082041540377, |
|
"learning_rate": 3.599326178742535e-07, |
|
"loss": 1.2146, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5301442913189116, |
|
"learning_rate": 3.025014258321135e-07, |
|
"loss": 1.1699, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5176192540816547, |
|
"learning_rate": 2.500449848859776e-07, |
|
"loss": 1.2048, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5034214543341652, |
|
"learning_rate": 2.0256854595881447e-07, |
|
"loss": 1.1338, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.4795708437379455, |
|
"learning_rate": 1.6007686147225254e-07, |
|
"loss": 1.1533, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5053834258977199, |
|
"learning_rate": 1.2257418487082727e-07, |
|
"loss": 1.1648, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5276919175530018, |
|
"learning_rate": 9.006427019622176e-08, |
|
"loss": 1.2034, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5495649945557277, |
|
"learning_rate": 6.255037171150612e-08, |
|
"loss": 1.1985, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.4946180370647359, |
|
"learning_rate": 4.0035243575342605e-08, |
|
"loss": 1.1738, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.47883695876558374, |
|
"learning_rate": 2.2521139566328285e-08, |
|
"loss": 1.1804, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5584783758937163, |
|
"learning_rate": 1.0009812857370016e-08, |
|
"loss": 1.1968, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.4815115437532202, |
|
"learning_rate": 2.502515840197006e-09, |
|
"loss": 1.1697, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5515369695100554, |
|
"learning_rate": 0.0, |
|
"loss": 1.1912, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 324, |
|
"total_flos": 2.9498071531598643e+18, |
|
"train_loss": 1.2066333912037037, |
|
"train_runtime": 17698.3629, |
|
"train_samples_per_second": 4.697, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 324, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"total_flos": 2.9498071531598643e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|