{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.031110004977600796, "eval_steps": 125, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024888003982080636, "grad_norm": 5.4410481452941895, "learning_rate": 2e-05, "loss": 2.9282, "step": 1 }, { "epoch": 0.00024888003982080636, "eval_loss": 3.5097618103027344, "eval_runtime": 47.433, "eval_samples_per_second": 13.83, "eval_steps_per_second": 6.915, "step": 1 }, { "epoch": 0.0004977600796416127, "grad_norm": 5.435488224029541, "learning_rate": 4e-05, "loss": 3.0917, "step": 2 }, { "epoch": 0.0007466401194624191, "grad_norm": 5.29222297668457, "learning_rate": 6e-05, "loss": 2.9705, "step": 3 }, { "epoch": 0.0009955201592832255, "grad_norm": 3.5290043354034424, "learning_rate": 8e-05, "loss": 2.8213, "step": 4 }, { "epoch": 0.0012444001991040318, "grad_norm": 1.9220443964004517, "learning_rate": 0.0001, "loss": 3.7124, "step": 5 }, { "epoch": 0.0014932802389248383, "grad_norm": 1.0480749607086182, "learning_rate": 0.00012, "loss": 3.2383, "step": 6 }, { "epoch": 0.0017421602787456446, "grad_norm": 2.3310468196868896, "learning_rate": 0.00014, "loss": 3.2751, "step": 7 }, { "epoch": 0.001991040318566451, "grad_norm": 0.622734546661377, "learning_rate": 0.00016, "loss": 2.9144, "step": 8 }, { "epoch": 0.002239920358387257, "grad_norm": 1.5959340333938599, "learning_rate": 0.00018, "loss": 3.4992, "step": 9 }, { "epoch": 0.0024888003982080635, "grad_norm": 2.5492312908172607, "learning_rate": 0.0002, "loss": 3.7241, "step": 10 }, { "epoch": 0.0027376804380288703, "grad_norm": 2.9819629192352295, "learning_rate": 0.0001999979446958366, "loss": 2.5479, "step": 11 }, { "epoch": 0.0029865604778496766, "grad_norm": 2.515784978866577, "learning_rate": 0.00019999177886783194, "loss": 2.9924, "step": 12 }, { "epoch": 0.003235440517670483, "grad_norm": 0.9300045371055603, "learning_rate": 0.00019998150276943902, "loss": 2.7006, "step": 13 }, { "epoch": 0.003484320557491289, "grad_norm": 3.067373275756836, "learning_rate": 0.000199967116823068, "loss": 2.735, "step": 14 }, { "epoch": 0.0037332005973120955, "grad_norm": 3.160754919052124, "learning_rate": 0.0001999486216200688, "loss": 2.8188, "step": 15 }, { "epoch": 0.003982080637132902, "grad_norm": 3.5078177452087402, "learning_rate": 0.00019992601792070679, "loss": 2.4545, "step": 16 }, { "epoch": 0.004230960676953708, "grad_norm": 1.3515621423721313, "learning_rate": 0.00019989930665413147, "loss": 2.4931, "step": 17 }, { "epoch": 0.004479840716774514, "grad_norm": 1.023354172706604, "learning_rate": 0.00019986848891833845, "loss": 2.6162, "step": 18 }, { "epoch": 0.004728720756595321, "grad_norm": 0.6757822036743164, "learning_rate": 0.0001998335659801241, "loss": 2.3384, "step": 19 }, { "epoch": 0.004977600796416127, "grad_norm": 2.4865520000457764, "learning_rate": 0.00019979453927503364, "loss": 2.0874, "step": 20 }, { "epoch": 0.005226480836236934, "grad_norm": 0.50606369972229, "learning_rate": 0.00019975141040730207, "loss": 2.2065, "step": 21 }, { "epoch": 0.0054753608760577405, "grad_norm": 0.5787423253059387, "learning_rate": 0.0001997041811497882, "loss": 2.1371, "step": 22 }, { "epoch": 0.005724240915878547, "grad_norm": 2.7240867614746094, "learning_rate": 0.00019965285344390184, "loss": 2.3011, "step": 23 }, { "epoch": 0.005973120955699353, "grad_norm": 0.5894359350204468, "learning_rate": 0.00019959742939952392, "loss": 2.2565, "step": 24 }, { "epoch": 0.0062220009955201595, "grad_norm": 0.47996145486831665, "learning_rate": 0.00019953791129491983, "loss": 2.1724, "step": 25 }, { "epoch": 0.006470881035340966, "grad_norm": 0.43773430585861206, "learning_rate": 0.00019947430157664576, "loss": 2.21, "step": 26 }, { "epoch": 0.006719761075161772, "grad_norm": 0.4364480972290039, "learning_rate": 0.00019940660285944803, "loss": 1.9883, "step": 27 }, { "epoch": 0.006968641114982578, "grad_norm": 0.42618849873542786, "learning_rate": 0.00019933481792615583, "loss": 1.8854, "step": 28 }, { "epoch": 0.007217521154803385, "grad_norm": 0.9404358267784119, "learning_rate": 0.0001992589497275665, "loss": 1.9978, "step": 29 }, { "epoch": 0.007466401194624191, "grad_norm": 0.4599241614341736, "learning_rate": 0.0001991790013823246, "loss": 2.1981, "step": 30 }, { "epoch": 0.007715281234444997, "grad_norm": 0.5835628509521484, "learning_rate": 0.00019909497617679348, "loss": 2.0446, "step": 31 }, { "epoch": 0.007964161274265804, "grad_norm": 1.6969475746154785, "learning_rate": 0.0001990068775649202, "loss": 1.7206, "step": 32 }, { "epoch": 0.00821304131408661, "grad_norm": 1.1236441135406494, "learning_rate": 0.00019891470916809362, "loss": 2.3924, "step": 33 }, { "epoch": 0.008461921353907416, "grad_norm": 0.8025091886520386, "learning_rate": 0.00019881847477499557, "loss": 2.0218, "step": 34 }, { "epoch": 0.008710801393728223, "grad_norm": 1.1834402084350586, "learning_rate": 0.00019871817834144504, "loss": 2.3439, "step": 35 }, { "epoch": 0.008959681433549029, "grad_norm": 1.910337209701538, "learning_rate": 0.0001986138239902355, "loss": 2.174, "step": 36 }, { "epoch": 0.009208561473369835, "grad_norm": 2.081226348876953, "learning_rate": 0.0001985054160109657, "loss": 1.9232, "step": 37 }, { "epoch": 0.009457441513190641, "grad_norm": 0.8203554749488831, "learning_rate": 0.00019839295885986296, "loss": 2.3514, "step": 38 }, { "epoch": 0.009706321553011448, "grad_norm": 0.6460116505622864, "learning_rate": 0.0001982764571596004, "loss": 2.0705, "step": 39 }, { "epoch": 0.009955201592832254, "grad_norm": 0.7928256988525391, "learning_rate": 0.00019815591569910654, "loss": 2.2336, "step": 40 }, { "epoch": 0.01020408163265306, "grad_norm": 0.5046447515487671, "learning_rate": 0.00019803133943336874, "loss": 2.3007, "step": 41 }, { "epoch": 0.010452961672473868, "grad_norm": 0.7800955772399902, "learning_rate": 0.0001979027334832293, "loss": 1.7504, "step": 42 }, { "epoch": 0.010701841712294675, "grad_norm": 0.4248393476009369, "learning_rate": 0.00019777010313517518, "loss": 2.1283, "step": 43 }, { "epoch": 0.010950721752115481, "grad_norm": 0.4310763478279114, "learning_rate": 0.00019763345384112043, "loss": 2.1454, "step": 44 }, { "epoch": 0.011199601791936287, "grad_norm": 0.3514373302459717, "learning_rate": 0.00019749279121818235, "loss": 2.0302, "step": 45 }, { "epoch": 0.011448481831757094, "grad_norm": 0.4506613612174988, "learning_rate": 0.00019734812104845047, "loss": 2.3444, "step": 46 }, { "epoch": 0.0116973618715779, "grad_norm": 0.823046088218689, "learning_rate": 0.00019719944927874881, "loss": 2.1397, "step": 47 }, { "epoch": 0.011946241911398706, "grad_norm": 0.6662668585777283, "learning_rate": 0.0001970467820203915, "loss": 2.2086, "step": 48 }, { "epoch": 0.012195121951219513, "grad_norm": 1.1788537502288818, "learning_rate": 0.00019689012554893154, "loss": 1.9532, "step": 49 }, { "epoch": 0.012444001991040319, "grad_norm": 0.41714179515838623, "learning_rate": 0.00019672948630390294, "loss": 1.994, "step": 50 }, { "epoch": 0.012692882030861125, "grad_norm": 0.40183207392692566, "learning_rate": 0.00019656487088855592, "loss": 1.986, "step": 51 }, { "epoch": 0.012941762070681932, "grad_norm": 0.6568073034286499, "learning_rate": 0.00019639628606958533, "loss": 1.9911, "step": 52 }, { "epoch": 0.013190642110502738, "grad_norm": 0.40880608558654785, "learning_rate": 0.0001962237387768529, "loss": 2.2154, "step": 53 }, { "epoch": 0.013439522150323544, "grad_norm": 0.3884672522544861, "learning_rate": 0.00019604723610310194, "loss": 1.9765, "step": 54 }, { "epoch": 0.01368840219014435, "grad_norm": 0.3780982196331024, "learning_rate": 0.00019586678530366606, "loss": 2.0202, "step": 55 }, { "epoch": 0.013937282229965157, "grad_norm": 0.4995989203453064, "learning_rate": 0.00019568239379617088, "loss": 1.8901, "step": 56 }, { "epoch": 0.014186162269785963, "grad_norm": 0.3196570575237274, "learning_rate": 0.00019549406916022905, "loss": 2.3905, "step": 57 }, { "epoch": 0.01443504230960677, "grad_norm": 0.3686681389808655, "learning_rate": 0.00019530181913712872, "loss": 2.184, "step": 58 }, { "epoch": 0.014683922349427576, "grad_norm": 0.3581784963607788, "learning_rate": 0.00019510565162951537, "loss": 1.8453, "step": 59 }, { "epoch": 0.014932802389248382, "grad_norm": 0.3524768054485321, "learning_rate": 0.00019490557470106686, "loss": 2.0638, "step": 60 }, { "epoch": 0.015181682429069188, "grad_norm": 0.344062864780426, "learning_rate": 0.00019470159657616215, "loss": 2.0319, "step": 61 }, { "epoch": 0.015430562468889995, "grad_norm": 0.4052053391933441, "learning_rate": 0.00019449372563954293, "loss": 1.8948, "step": 62 }, { "epoch": 0.0156794425087108, "grad_norm": 0.40913650393486023, "learning_rate": 0.0001942819704359693, "loss": 2.2626, "step": 63 }, { "epoch": 0.015928322548531607, "grad_norm": 0.3126090168952942, "learning_rate": 0.00019406633966986828, "loss": 2.1733, "step": 64 }, { "epoch": 0.016177202588352414, "grad_norm": 0.35215505957603455, "learning_rate": 0.00019384684220497605, "loss": 2.2469, "step": 65 }, { "epoch": 0.01642608262817322, "grad_norm": 0.36992332339286804, "learning_rate": 0.00019362348706397373, "loss": 2.32, "step": 66 }, { "epoch": 0.016674962667994026, "grad_norm": 0.3458121418952942, "learning_rate": 0.00019339628342811632, "loss": 2.0615, "step": 67 }, { "epoch": 0.016923842707814832, "grad_norm": 0.33729124069213867, "learning_rate": 0.0001931652406368554, "loss": 2.1723, "step": 68 }, { "epoch": 0.01717272274763564, "grad_norm": 0.46333789825439453, "learning_rate": 0.0001929303681874552, "loss": 2.2158, "step": 69 }, { "epoch": 0.017421602787456445, "grad_norm": 0.3130255341529846, "learning_rate": 0.0001926916757346022, "loss": 1.991, "step": 70 }, { "epoch": 0.01767048282727725, "grad_norm": 0.3570137917995453, "learning_rate": 0.00019244917309000817, "loss": 1.922, "step": 71 }, { "epoch": 0.017919362867098058, "grad_norm": 0.27171969413757324, "learning_rate": 0.00019220287022200707, "loss": 2.2815, "step": 72 }, { "epoch": 0.018168242906918864, "grad_norm": 0.30303627252578735, "learning_rate": 0.0001919527772551451, "loss": 2.09, "step": 73 }, { "epoch": 0.01841712294673967, "grad_norm": 0.45809829235076904, "learning_rate": 0.00019169890446976454, "loss": 2.0446, "step": 74 }, { "epoch": 0.018666002986560477, "grad_norm": 0.29333075881004333, "learning_rate": 0.00019144126230158127, "loss": 1.9728, "step": 75 }, { "epoch": 0.018914883026381283, "grad_norm": 0.2959883511066437, "learning_rate": 0.0001911798613412557, "loss": 2.0945, "step": 76 }, { "epoch": 0.01916376306620209, "grad_norm": 0.31434130668640137, "learning_rate": 0.0001909147123339575, "loss": 2.1703, "step": 77 }, { "epoch": 0.019412643106022896, "grad_norm": 0.3976629972457886, "learning_rate": 0.0001906458261789238, "loss": 2.1109, "step": 78 }, { "epoch": 0.019661523145843702, "grad_norm": 0.678793728351593, "learning_rate": 0.00019037321392901136, "loss": 1.9291, "step": 79 }, { "epoch": 0.019910403185664508, "grad_norm": 0.46012192964553833, "learning_rate": 0.0001900968867902419, "loss": 2.265, "step": 80 }, { "epoch": 0.020159283225485315, "grad_norm": 0.3698313534259796, "learning_rate": 0.0001898168561213419, "loss": 2.1672, "step": 81 }, { "epoch": 0.02040816326530612, "grad_norm": 0.3445495069026947, "learning_rate": 0.0001895331334332753, "loss": 2.0265, "step": 82 }, { "epoch": 0.020657043305126927, "grad_norm": 0.38758572936058044, "learning_rate": 0.0001892457303887706, "loss": 2.1402, "step": 83 }, { "epoch": 0.020905923344947737, "grad_norm": 0.40104663372039795, "learning_rate": 0.0001889546588018412, "loss": 2.2048, "step": 84 }, { "epoch": 0.021154803384768543, "grad_norm": 0.42981839179992676, "learning_rate": 0.00018865993063730004, "loss": 2.2485, "step": 85 }, { "epoch": 0.02140368342458935, "grad_norm": 0.3159562647342682, "learning_rate": 0.00018836155801026753, "loss": 1.8041, "step": 86 }, { "epoch": 0.021652563464410156, "grad_norm": 0.415228009223938, "learning_rate": 0.0001880595531856738, "loss": 2.08, "step": 87 }, { "epoch": 0.021901443504230962, "grad_norm": 0.31004348397254944, "learning_rate": 0.00018775392857775432, "loss": 2.0714, "step": 88 }, { "epoch": 0.02215032354405177, "grad_norm": 0.3166860342025757, "learning_rate": 0.00018744469674953956, "loss": 2.1457, "step": 89 }, { "epoch": 0.022399203583872575, "grad_norm": 0.3657294511795044, "learning_rate": 0.00018713187041233896, "loss": 2.1152, "step": 90 }, { "epoch": 0.02264808362369338, "grad_norm": 0.3469568192958832, "learning_rate": 0.00018681546242521786, "loss": 1.9196, "step": 91 }, { "epoch": 0.022896963663514187, "grad_norm": 0.3438771665096283, "learning_rate": 0.00018649548579446936, "loss": 2.2341, "step": 92 }, { "epoch": 0.023145843703334994, "grad_norm": 0.30898717045783997, "learning_rate": 0.0001861719536730795, "loss": 2.0978, "step": 93 }, { "epoch": 0.0233947237431558, "grad_norm": 0.27930477261543274, "learning_rate": 0.00018584487936018661, "loss": 2.0048, "step": 94 }, { "epoch": 0.023643603782976606, "grad_norm": 0.32163354754447937, "learning_rate": 0.00018551427630053463, "loss": 1.9489, "step": 95 }, { "epoch": 0.023892483822797413, "grad_norm": 0.3902042806148529, "learning_rate": 0.00018518015808392045, "loss": 1.9324, "step": 96 }, { "epoch": 0.02414136386261822, "grad_norm": 0.35879242420196533, "learning_rate": 0.00018484253844463526, "loss": 2.0225, "step": 97 }, { "epoch": 0.024390243902439025, "grad_norm": 0.33365774154663086, "learning_rate": 0.00018450143126090015, "loss": 2.0397, "step": 98 }, { "epoch": 0.02463912394225983, "grad_norm": 0.35122770071029663, "learning_rate": 0.00018415685055429533, "loss": 2.03, "step": 99 }, { "epoch": 0.024888003982080638, "grad_norm": 0.29391220211982727, "learning_rate": 0.00018380881048918405, "loss": 2.119, "step": 100 }, { "epoch": 0.025136884021901444, "grad_norm": 0.4452991187572479, "learning_rate": 0.00018345732537213027, "loss": 1.9258, "step": 101 }, { "epoch": 0.02538576406172225, "grad_norm": 0.38186201453208923, "learning_rate": 0.00018310240965131041, "loss": 1.6995, "step": 102 }, { "epoch": 0.025634644101543057, "grad_norm": 0.3281259834766388, "learning_rate": 0.00018274407791591966, "loss": 1.7411, "step": 103 }, { "epoch": 0.025883524141363863, "grad_norm": 0.29078221321105957, "learning_rate": 0.00018238234489557215, "loss": 1.8857, "step": 104 }, { "epoch": 0.02613240418118467, "grad_norm": 0.2946663200855255, "learning_rate": 0.0001820172254596956, "loss": 1.8642, "step": 105 }, { "epoch": 0.026381284221005476, "grad_norm": 0.38488832116127014, "learning_rate": 0.00018164873461691986, "loss": 2.0094, "step": 106 }, { "epoch": 0.026630164260826282, "grad_norm": 0.33220160007476807, "learning_rate": 0.00018127688751446027, "loss": 2.1743, "step": 107 }, { "epoch": 0.02687904430064709, "grad_norm": 0.3243928849697113, "learning_rate": 0.00018090169943749476, "loss": 1.8538, "step": 108 }, { "epoch": 0.027127924340467895, "grad_norm": 0.31402042508125305, "learning_rate": 0.0001805231858085356, "loss": 2.0906, "step": 109 }, { "epoch": 0.0273768043802887, "grad_norm": 0.46022218465805054, "learning_rate": 0.00018014136218679567, "loss": 2.1055, "step": 110 }, { "epoch": 0.027625684420109507, "grad_norm": 0.3270494341850281, "learning_rate": 0.00017975624426754848, "loss": 1.6856, "step": 111 }, { "epoch": 0.027874564459930314, "grad_norm": 0.31129342317581177, "learning_rate": 0.00017936784788148328, "loss": 2.2389, "step": 112 }, { "epoch": 0.02812344449975112, "grad_norm": 0.3763848543167114, "learning_rate": 0.00017897618899405423, "loss": 2.2251, "step": 113 }, { "epoch": 0.028372324539571926, "grad_norm": 0.3961619436740875, "learning_rate": 0.00017858128370482426, "loss": 2.1523, "step": 114 }, { "epoch": 0.028621204579392732, "grad_norm": 0.32675299048423767, "learning_rate": 0.000178183148246803, "loss": 1.8199, "step": 115 }, { "epoch": 0.02887008461921354, "grad_norm": 0.39736902713775635, "learning_rate": 0.00017778179898577973, "loss": 2.0062, "step": 116 }, { "epoch": 0.029118964659034345, "grad_norm": 0.3664799928665161, "learning_rate": 0.00017737725241965069, "loss": 2.2775, "step": 117 }, { "epoch": 0.02936784469885515, "grad_norm": 0.3666561543941498, "learning_rate": 0.00017696952517774062, "loss": 1.9005, "step": 118 }, { "epoch": 0.029616724738675958, "grad_norm": 0.2838401794433594, "learning_rate": 0.00017655863402011947, "loss": 2.0504, "step": 119 }, { "epoch": 0.029865604778496764, "grad_norm": 0.3173494338989258, "learning_rate": 0.00017614459583691346, "loss": 1.9387, "step": 120 }, { "epoch": 0.03011448481831757, "grad_norm": 0.31382298469543457, "learning_rate": 0.00017572742764761055, "loss": 2.0618, "step": 121 }, { "epoch": 0.030363364858138377, "grad_norm": 0.3865104019641876, "learning_rate": 0.00017530714660036112, "loss": 2.0401, "step": 122 }, { "epoch": 0.030612244897959183, "grad_norm": 0.39169177412986755, "learning_rate": 0.00017488376997127283, "loss": 1.7495, "step": 123 }, { "epoch": 0.03086112493777999, "grad_norm": 0.3588005602359772, "learning_rate": 0.0001744573151637007, "loss": 2.028, "step": 124 }, { "epoch": 0.031110004977600796, "grad_norm": 0.46878084540367126, "learning_rate": 0.00017402779970753155, "loss": 1.6867, "step": 125 }, { "epoch": 0.031110004977600796, "eval_loss": 2.049610137939453, "eval_runtime": 47.161, "eval_samples_per_second": 13.91, "eval_steps_per_second": 6.955, "step": 125 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6511065563136000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }