{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.031110004977600796,
  "eval_steps": 125,
  "global_step": 125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00024888003982080636,
      "grad_norm": 5.4410481452941895,
      "learning_rate": 2e-05,
      "loss": 2.9282,
      "step": 1
    },
    {
      "epoch": 0.00024888003982080636,
      "eval_loss": 3.5097618103027344,
      "eval_runtime": 47.433,
      "eval_samples_per_second": 13.83,
      "eval_steps_per_second": 6.915,
      "step": 1
    },
    {
      "epoch": 0.0004977600796416127,
      "grad_norm": 5.435488224029541,
      "learning_rate": 4e-05,
      "loss": 3.0917,
      "step": 2
    },
    {
      "epoch": 0.0007466401194624191,
      "grad_norm": 5.29222297668457,
      "learning_rate": 6e-05,
      "loss": 2.9705,
      "step": 3
    },
    {
      "epoch": 0.0009955201592832255,
      "grad_norm": 3.5290043354034424,
      "learning_rate": 8e-05,
      "loss": 2.8213,
      "step": 4
    },
    {
      "epoch": 0.0012444001991040318,
      "grad_norm": 1.9220443964004517,
      "learning_rate": 0.0001,
      "loss": 3.7124,
      "step": 5
    },
    {
      "epoch": 0.0014932802389248383,
      "grad_norm": 1.0480749607086182,
      "learning_rate": 0.00012,
      "loss": 3.2383,
      "step": 6
    },
    {
      "epoch": 0.0017421602787456446,
      "grad_norm": 2.3310468196868896,
      "learning_rate": 0.00014,
      "loss": 3.2751,
      "step": 7
    },
    {
      "epoch": 0.001991040318566451,
      "grad_norm": 0.622734546661377,
      "learning_rate": 0.00016,
      "loss": 2.9144,
      "step": 8
    },
    {
      "epoch": 0.002239920358387257,
      "grad_norm": 1.5959340333938599,
      "learning_rate": 0.00018,
      "loss": 3.4992,
      "step": 9
    },
    {
      "epoch": 0.0024888003982080635,
      "grad_norm": 2.5492312908172607,
      "learning_rate": 0.0002,
      "loss": 3.7241,
      "step": 10
    },
    {
      "epoch": 0.0027376804380288703,
      "grad_norm": 2.9819629192352295,
      "learning_rate": 0.0001999979446958366,
      "loss": 2.5479,
      "step": 11
    },
    {
      "epoch": 0.0029865604778496766,
      "grad_norm": 2.515784978866577,
      "learning_rate": 0.00019999177886783194,
      "loss": 2.9924,
      "step": 12
    },
    {
      "epoch": 0.003235440517670483,
      "grad_norm": 0.9300045371055603,
      "learning_rate": 0.00019998150276943902,
      "loss": 2.7006,
      "step": 13
    },
    {
      "epoch": 0.003484320557491289,
      "grad_norm": 3.067373275756836,
      "learning_rate": 0.000199967116823068,
      "loss": 2.735,
      "step": 14
    },
    {
      "epoch": 0.0037332005973120955,
      "grad_norm": 3.160754919052124,
      "learning_rate": 0.0001999486216200688,
      "loss": 2.8188,
      "step": 15
    },
    {
      "epoch": 0.003982080637132902,
      "grad_norm": 3.5078177452087402,
      "learning_rate": 0.00019992601792070679,
      "loss": 2.4545,
      "step": 16
    },
    {
      "epoch": 0.004230960676953708,
      "grad_norm": 1.3515621423721313,
      "learning_rate": 0.00019989930665413147,
      "loss": 2.4931,
      "step": 17
    },
    {
      "epoch": 0.004479840716774514,
      "grad_norm": 1.023354172706604,
      "learning_rate": 0.00019986848891833845,
      "loss": 2.6162,
      "step": 18
    },
    {
      "epoch": 0.004728720756595321,
      "grad_norm": 0.6757822036743164,
      "learning_rate": 0.0001998335659801241,
      "loss": 2.3384,
      "step": 19
    },
    {
      "epoch": 0.004977600796416127,
      "grad_norm": 2.4865520000457764,
      "learning_rate": 0.00019979453927503364,
      "loss": 2.0874,
      "step": 20
    },
    {
      "epoch": 0.005226480836236934,
      "grad_norm": 0.50606369972229,
      "learning_rate": 0.00019975141040730207,
      "loss": 2.2065,
      "step": 21
    },
    {
      "epoch": 0.0054753608760577405,
      "grad_norm": 0.5787423253059387,
      "learning_rate": 0.0001997041811497882,
      "loss": 2.1371,
      "step": 22
    },
    {
      "epoch": 0.005724240915878547,
      "grad_norm": 2.7240867614746094,
      "learning_rate": 0.00019965285344390184,
      "loss": 2.3011,
      "step": 23
    },
    {
      "epoch": 0.005973120955699353,
      "grad_norm": 0.5894359350204468,
      "learning_rate": 0.00019959742939952392,
      "loss": 2.2565,
      "step": 24
    },
    {
      "epoch": 0.0062220009955201595,
      "grad_norm": 0.47996145486831665,
      "learning_rate": 0.00019953791129491983,
      "loss": 2.1724,
      "step": 25
    },
    {
      "epoch": 0.006470881035340966,
      "grad_norm": 0.43773430585861206,
      "learning_rate": 0.00019947430157664576,
      "loss": 2.21,
      "step": 26
    },
    {
      "epoch": 0.006719761075161772,
      "grad_norm": 0.4364480972290039,
      "learning_rate": 0.00019940660285944803,
      "loss": 1.9883,
      "step": 27
    },
    {
      "epoch": 0.006968641114982578,
      "grad_norm": 0.42618849873542786,
      "learning_rate": 0.00019933481792615583,
      "loss": 1.8854,
      "step": 28
    },
    {
      "epoch": 0.007217521154803385,
      "grad_norm": 0.9404358267784119,
      "learning_rate": 0.0001992589497275665,
      "loss": 1.9978,
      "step": 29
    },
    {
      "epoch": 0.007466401194624191,
      "grad_norm": 0.4599241614341736,
      "learning_rate": 0.0001991790013823246,
      "loss": 2.1981,
      "step": 30
    },
    {
      "epoch": 0.007715281234444997,
      "grad_norm": 0.5835628509521484,
      "learning_rate": 0.00019909497617679348,
      "loss": 2.0446,
      "step": 31
    },
    {
      "epoch": 0.007964161274265804,
      "grad_norm": 1.6969475746154785,
      "learning_rate": 0.0001990068775649202,
      "loss": 1.7206,
      "step": 32
    },
    {
      "epoch": 0.00821304131408661,
      "grad_norm": 1.1236441135406494,
      "learning_rate": 0.00019891470916809362,
      "loss": 2.3924,
      "step": 33
    },
    {
      "epoch": 0.008461921353907416,
      "grad_norm": 0.8025091886520386,
      "learning_rate": 0.00019881847477499557,
      "loss": 2.0218,
      "step": 34
    },
    {
      "epoch": 0.008710801393728223,
      "grad_norm": 1.1834402084350586,
      "learning_rate": 0.00019871817834144504,
      "loss": 2.3439,
      "step": 35
    },
    {
      "epoch": 0.008959681433549029,
      "grad_norm": 1.910337209701538,
      "learning_rate": 0.0001986138239902355,
      "loss": 2.174,
      "step": 36
    },
    {
      "epoch": 0.009208561473369835,
      "grad_norm": 2.081226348876953,
      "learning_rate": 0.0001985054160109657,
      "loss": 1.9232,
      "step": 37
    },
    {
      "epoch": 0.009457441513190641,
      "grad_norm": 0.8203554749488831,
      "learning_rate": 0.00019839295885986296,
      "loss": 2.3514,
      "step": 38
    },
    {
      "epoch": 0.009706321553011448,
      "grad_norm": 0.6460116505622864,
      "learning_rate": 0.0001982764571596004,
      "loss": 2.0705,
      "step": 39
    },
    {
      "epoch": 0.009955201592832254,
      "grad_norm": 0.7928256988525391,
      "learning_rate": 0.00019815591569910654,
      "loss": 2.2336,
      "step": 40
    },
    {
      "epoch": 0.01020408163265306,
      "grad_norm": 0.5046447515487671,
      "learning_rate": 0.00019803133943336874,
      "loss": 2.3007,
      "step": 41
    },
    {
      "epoch": 0.010452961672473868,
      "grad_norm": 0.7800955772399902,
      "learning_rate": 0.0001979027334832293,
      "loss": 1.7504,
      "step": 42
    },
    {
      "epoch": 0.010701841712294675,
      "grad_norm": 0.4248393476009369,
      "learning_rate": 0.00019777010313517518,
      "loss": 2.1283,
      "step": 43
    },
    {
      "epoch": 0.010950721752115481,
      "grad_norm": 0.4310763478279114,
      "learning_rate": 0.00019763345384112043,
      "loss": 2.1454,
      "step": 44
    },
    {
      "epoch": 0.011199601791936287,
      "grad_norm": 0.3514373302459717,
      "learning_rate": 0.00019749279121818235,
      "loss": 2.0302,
      "step": 45
    },
    {
      "epoch": 0.011448481831757094,
      "grad_norm": 0.4506613612174988,
      "learning_rate": 0.00019734812104845047,
      "loss": 2.3444,
      "step": 46
    },
    {
      "epoch": 0.0116973618715779,
      "grad_norm": 0.823046088218689,
      "learning_rate": 0.00019719944927874881,
      "loss": 2.1397,
      "step": 47
    },
    {
      "epoch": 0.011946241911398706,
      "grad_norm": 0.6662668585777283,
      "learning_rate": 0.0001970467820203915,
      "loss": 2.2086,
      "step": 48
    },
    {
      "epoch": 0.012195121951219513,
      "grad_norm": 1.1788537502288818,
      "learning_rate": 0.00019689012554893154,
      "loss": 1.9532,
      "step": 49
    },
    {
      "epoch": 0.012444001991040319,
      "grad_norm": 0.41714179515838623,
      "learning_rate": 0.00019672948630390294,
      "loss": 1.994,
      "step": 50
    },
    {
      "epoch": 0.012692882030861125,
      "grad_norm": 0.40183207392692566,
      "learning_rate": 0.00019656487088855592,
      "loss": 1.986,
      "step": 51
    },
    {
      "epoch": 0.012941762070681932,
      "grad_norm": 0.6568073034286499,
      "learning_rate": 0.00019639628606958533,
      "loss": 1.9911,
      "step": 52
    },
    {
      "epoch": 0.013190642110502738,
      "grad_norm": 0.40880608558654785,
      "learning_rate": 0.0001962237387768529,
      "loss": 2.2154,
      "step": 53
    },
    {
      "epoch": 0.013439522150323544,
      "grad_norm": 0.3884672522544861,
      "learning_rate": 0.00019604723610310194,
      "loss": 1.9765,
      "step": 54
    },
    {
      "epoch": 0.01368840219014435,
      "grad_norm": 0.3780982196331024,
      "learning_rate": 0.00019586678530366606,
      "loss": 2.0202,
      "step": 55
    },
    {
      "epoch": 0.013937282229965157,
      "grad_norm": 0.4995989203453064,
      "learning_rate": 0.00019568239379617088,
      "loss": 1.8901,
      "step": 56
    },
    {
      "epoch": 0.014186162269785963,
      "grad_norm": 0.3196570575237274,
      "learning_rate": 0.00019549406916022905,
      "loss": 2.3905,
      "step": 57
    },
    {
      "epoch": 0.01443504230960677,
      "grad_norm": 0.3686681389808655,
      "learning_rate": 0.00019530181913712872,
      "loss": 2.184,
      "step": 58
    },
    {
      "epoch": 0.014683922349427576,
      "grad_norm": 0.3581784963607788,
      "learning_rate": 0.00019510565162951537,
      "loss": 1.8453,
      "step": 59
    },
    {
      "epoch": 0.014932802389248382,
      "grad_norm": 0.3524768054485321,
      "learning_rate": 0.00019490557470106686,
      "loss": 2.0638,
      "step": 60
    },
    {
      "epoch": 0.015181682429069188,
      "grad_norm": 0.344062864780426,
      "learning_rate": 0.00019470159657616215,
      "loss": 2.0319,
      "step": 61
    },
    {
      "epoch": 0.015430562468889995,
      "grad_norm": 0.4052053391933441,
      "learning_rate": 0.00019449372563954293,
      "loss": 1.8948,
      "step": 62
    },
    {
      "epoch": 0.0156794425087108,
      "grad_norm": 0.40913650393486023,
      "learning_rate": 0.0001942819704359693,
      "loss": 2.2626,
      "step": 63
    },
    {
      "epoch": 0.015928322548531607,
      "grad_norm": 0.3126090168952942,
      "learning_rate": 0.00019406633966986828,
      "loss": 2.1733,
      "step": 64
    },
    {
      "epoch": 0.016177202588352414,
      "grad_norm": 0.35215505957603455,
      "learning_rate": 0.00019384684220497605,
      "loss": 2.2469,
      "step": 65
    },
    {
      "epoch": 0.01642608262817322,
      "grad_norm": 0.36992332339286804,
      "learning_rate": 0.00019362348706397373,
      "loss": 2.32,
      "step": 66
    },
    {
      "epoch": 0.016674962667994026,
      "grad_norm": 0.3458121418952942,
      "learning_rate": 0.00019339628342811632,
      "loss": 2.0615,
      "step": 67
    },
    {
      "epoch": 0.016923842707814832,
      "grad_norm": 0.33729124069213867,
      "learning_rate": 0.0001931652406368554,
      "loss": 2.1723,
      "step": 68
    },
    {
      "epoch": 0.01717272274763564,
      "grad_norm": 0.46333789825439453,
      "learning_rate": 0.0001929303681874552,
      "loss": 2.2158,
      "step": 69
    },
    {
      "epoch": 0.017421602787456445,
      "grad_norm": 0.3130255341529846,
      "learning_rate": 0.0001926916757346022,
      "loss": 1.991,
      "step": 70
    },
    {
      "epoch": 0.01767048282727725,
      "grad_norm": 0.3570137917995453,
      "learning_rate": 0.00019244917309000817,
      "loss": 1.922,
      "step": 71
    },
    {
      "epoch": 0.017919362867098058,
      "grad_norm": 0.27171969413757324,
      "learning_rate": 0.00019220287022200707,
      "loss": 2.2815,
      "step": 72
    },
    {
      "epoch": 0.018168242906918864,
      "grad_norm": 0.30303627252578735,
      "learning_rate": 0.0001919527772551451,
      "loss": 2.09,
      "step": 73
    },
    {
      "epoch": 0.01841712294673967,
      "grad_norm": 0.45809829235076904,
      "learning_rate": 0.00019169890446976454,
      "loss": 2.0446,
      "step": 74
    },
    {
      "epoch": 0.018666002986560477,
      "grad_norm": 0.29333075881004333,
      "learning_rate": 0.00019144126230158127,
      "loss": 1.9728,
      "step": 75
    },
    {
      "epoch": 0.018914883026381283,
      "grad_norm": 0.2959883511066437,
      "learning_rate": 0.0001911798613412557,
      "loss": 2.0945,
      "step": 76
    },
    {
      "epoch": 0.01916376306620209,
      "grad_norm": 0.31434130668640137,
      "learning_rate": 0.0001909147123339575,
      "loss": 2.1703,
      "step": 77
    },
    {
      "epoch": 0.019412643106022896,
      "grad_norm": 0.3976629972457886,
      "learning_rate": 0.0001906458261789238,
      "loss": 2.1109,
      "step": 78
    },
    {
      "epoch": 0.019661523145843702,
      "grad_norm": 0.678793728351593,
      "learning_rate": 0.00019037321392901136,
      "loss": 1.9291,
      "step": 79
    },
    {
      "epoch": 0.019910403185664508,
      "grad_norm": 0.46012192964553833,
      "learning_rate": 0.0001900968867902419,
      "loss": 2.265,
      "step": 80
    },
    {
      "epoch": 0.020159283225485315,
      "grad_norm": 0.3698313534259796,
      "learning_rate": 0.0001898168561213419,
      "loss": 2.1672,
      "step": 81
    },
    {
      "epoch": 0.02040816326530612,
      "grad_norm": 0.3445495069026947,
      "learning_rate": 0.0001895331334332753,
      "loss": 2.0265,
      "step": 82
    },
    {
      "epoch": 0.020657043305126927,
      "grad_norm": 0.38758572936058044,
      "learning_rate": 0.0001892457303887706,
      "loss": 2.1402,
      "step": 83
    },
    {
      "epoch": 0.020905923344947737,
      "grad_norm": 0.40104663372039795,
      "learning_rate": 0.0001889546588018412,
      "loss": 2.2048,
      "step": 84
    },
    {
      "epoch": 0.021154803384768543,
      "grad_norm": 0.42981839179992676,
      "learning_rate": 0.00018865993063730004,
      "loss": 2.2485,
      "step": 85
    },
    {
      "epoch": 0.02140368342458935,
      "grad_norm": 0.3159562647342682,
      "learning_rate": 0.00018836155801026753,
      "loss": 1.8041,
      "step": 86
    },
    {
      "epoch": 0.021652563464410156,
      "grad_norm": 0.415228009223938,
      "learning_rate": 0.0001880595531856738,
      "loss": 2.08,
      "step": 87
    },
    {
      "epoch": 0.021901443504230962,
      "grad_norm": 0.31004348397254944,
      "learning_rate": 0.00018775392857775432,
      "loss": 2.0714,
      "step": 88
    },
    {
      "epoch": 0.02215032354405177,
      "grad_norm": 0.3166860342025757,
      "learning_rate": 0.00018744469674953956,
      "loss": 2.1457,
      "step": 89
    },
    {
      "epoch": 0.022399203583872575,
      "grad_norm": 0.3657294511795044,
      "learning_rate": 0.00018713187041233896,
      "loss": 2.1152,
      "step": 90
    },
    {
      "epoch": 0.02264808362369338,
      "grad_norm": 0.3469568192958832,
      "learning_rate": 0.00018681546242521786,
      "loss": 1.9196,
      "step": 91
    },
    {
      "epoch": 0.022896963663514187,
      "grad_norm": 0.3438771665096283,
      "learning_rate": 0.00018649548579446936,
      "loss": 2.2341,
      "step": 92
    },
    {
      "epoch": 0.023145843703334994,
      "grad_norm": 0.30898717045783997,
      "learning_rate": 0.0001861719536730795,
      "loss": 2.0978,
      "step": 93
    },
    {
      "epoch": 0.0233947237431558,
      "grad_norm": 0.27930477261543274,
      "learning_rate": 0.00018584487936018661,
      "loss": 2.0048,
      "step": 94
    },
    {
      "epoch": 0.023643603782976606,
      "grad_norm": 0.32163354754447937,
      "learning_rate": 0.00018551427630053463,
      "loss": 1.9489,
      "step": 95
    },
    {
      "epoch": 0.023892483822797413,
      "grad_norm": 0.3902042806148529,
      "learning_rate": 0.00018518015808392045,
      "loss": 1.9324,
      "step": 96
    },
    {
      "epoch": 0.02414136386261822,
      "grad_norm": 0.35879242420196533,
      "learning_rate": 0.00018484253844463526,
      "loss": 2.0225,
      "step": 97
    },
    {
      "epoch": 0.024390243902439025,
      "grad_norm": 0.33365774154663086,
      "learning_rate": 0.00018450143126090015,
      "loss": 2.0397,
      "step": 98
    },
    {
      "epoch": 0.02463912394225983,
      "grad_norm": 0.35122770071029663,
      "learning_rate": 0.00018415685055429533,
      "loss": 2.03,
      "step": 99
    },
    {
      "epoch": 0.024888003982080638,
      "grad_norm": 0.29391220211982727,
      "learning_rate": 0.00018380881048918405,
      "loss": 2.119,
      "step": 100
    },
    {
      "epoch": 0.025136884021901444,
      "grad_norm": 0.4452991187572479,
      "learning_rate": 0.00018345732537213027,
      "loss": 1.9258,
      "step": 101
    },
    {
      "epoch": 0.02538576406172225,
      "grad_norm": 0.38186201453208923,
      "learning_rate": 0.00018310240965131041,
      "loss": 1.6995,
      "step": 102
    },
    {
      "epoch": 0.025634644101543057,
      "grad_norm": 0.3281259834766388,
      "learning_rate": 0.00018274407791591966,
      "loss": 1.7411,
      "step": 103
    },
    {
      "epoch": 0.025883524141363863,
      "grad_norm": 0.29078221321105957,
      "learning_rate": 0.00018238234489557215,
      "loss": 1.8857,
      "step": 104
    },
    {
      "epoch": 0.02613240418118467,
      "grad_norm": 0.2946663200855255,
      "learning_rate": 0.0001820172254596956,
      "loss": 1.8642,
      "step": 105
    },
    {
      "epoch": 0.026381284221005476,
      "grad_norm": 0.38488832116127014,
      "learning_rate": 0.00018164873461691986,
      "loss": 2.0094,
      "step": 106
    },
    {
      "epoch": 0.026630164260826282,
      "grad_norm": 0.33220160007476807,
      "learning_rate": 0.00018127688751446027,
      "loss": 2.1743,
      "step": 107
    },
    {
      "epoch": 0.02687904430064709,
      "grad_norm": 0.3243928849697113,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.8538,
      "step": 108
    },
    {
      "epoch": 0.027127924340467895,
      "grad_norm": 0.31402042508125305,
      "learning_rate": 0.0001805231858085356,
      "loss": 2.0906,
      "step": 109
    },
    {
      "epoch": 0.0273768043802887,
      "grad_norm": 0.46022218465805054,
      "learning_rate": 0.00018014136218679567,
      "loss": 2.1055,
      "step": 110
    },
    {
      "epoch": 0.027625684420109507,
      "grad_norm": 0.3270494341850281,
      "learning_rate": 0.00017975624426754848,
      "loss": 1.6856,
      "step": 111
    },
    {
      "epoch": 0.027874564459930314,
      "grad_norm": 0.31129342317581177,
      "learning_rate": 0.00017936784788148328,
      "loss": 2.2389,
      "step": 112
    },
    {
      "epoch": 0.02812344449975112,
      "grad_norm": 0.3763848543167114,
      "learning_rate": 0.00017897618899405423,
      "loss": 2.2251,
      "step": 113
    },
    {
      "epoch": 0.028372324539571926,
      "grad_norm": 0.3961619436740875,
      "learning_rate": 0.00017858128370482426,
      "loss": 2.1523,
      "step": 114
    },
    {
      "epoch": 0.028621204579392732,
      "grad_norm": 0.32675299048423767,
      "learning_rate": 0.000178183148246803,
      "loss": 1.8199,
      "step": 115
    },
    {
      "epoch": 0.02887008461921354,
      "grad_norm": 0.39736902713775635,
      "learning_rate": 0.00017778179898577973,
      "loss": 2.0062,
      "step": 116
    },
    {
      "epoch": 0.029118964659034345,
      "grad_norm": 0.3664799928665161,
      "learning_rate": 0.00017737725241965069,
      "loss": 2.2775,
      "step": 117
    },
    {
      "epoch": 0.02936784469885515,
      "grad_norm": 0.3666561543941498,
      "learning_rate": 0.00017696952517774062,
      "loss": 1.9005,
      "step": 118
    },
    {
      "epoch": 0.029616724738675958,
      "grad_norm": 0.2838401794433594,
      "learning_rate": 0.00017655863402011947,
      "loss": 2.0504,
      "step": 119
    },
    {
      "epoch": 0.029865604778496764,
      "grad_norm": 0.3173494338989258,
      "learning_rate": 0.00017614459583691346,
      "loss": 1.9387,
      "step": 120
    },
    {
      "epoch": 0.03011448481831757,
      "grad_norm": 0.31382298469543457,
      "learning_rate": 0.00017572742764761055,
      "loss": 2.0618,
      "step": 121
    },
    {
      "epoch": 0.030363364858138377,
      "grad_norm": 0.3865104019641876,
      "learning_rate": 0.00017530714660036112,
      "loss": 2.0401,
      "step": 122
    },
    {
      "epoch": 0.030612244897959183,
      "grad_norm": 0.39169177412986755,
      "learning_rate": 0.00017488376997127283,
      "loss": 1.7495,
      "step": 123
    },
    {
      "epoch": 0.03086112493777999,
      "grad_norm": 0.3588005602359772,
      "learning_rate": 0.0001744573151637007,
      "loss": 2.028,
      "step": 124
    },
    {
      "epoch": 0.031110004977600796,
      "grad_norm": 0.46878084540367126,
      "learning_rate": 0.00017402779970753155,
      "loss": 1.6867,
      "step": 125
    },
    {
      "epoch": 0.031110004977600796,
      "eval_loss": 2.049610137939453,
      "eval_runtime": 47.161,
      "eval_samples_per_second": 13.91,
      "eval_steps_per_second": 6.955,
      "step": 125
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 125,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6511065563136000.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}