{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.031110004977600796,
"eval_steps": 125,
"global_step": 125,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00024888003982080636,
"grad_norm": 5.4410481452941895,
"learning_rate": 2e-05,
"loss": 2.9282,
"step": 1
},
{
"epoch": 0.00024888003982080636,
"eval_loss": 3.5097618103027344,
"eval_runtime": 47.433,
"eval_samples_per_second": 13.83,
"eval_steps_per_second": 6.915,
"step": 1
},
{
"epoch": 0.0004977600796416127,
"grad_norm": 5.435488224029541,
"learning_rate": 4e-05,
"loss": 3.0917,
"step": 2
},
{
"epoch": 0.0007466401194624191,
"grad_norm": 5.29222297668457,
"learning_rate": 6e-05,
"loss": 2.9705,
"step": 3
},
{
"epoch": 0.0009955201592832255,
"grad_norm": 3.5290043354034424,
"learning_rate": 8e-05,
"loss": 2.8213,
"step": 4
},
{
"epoch": 0.0012444001991040318,
"grad_norm": 1.9220443964004517,
"learning_rate": 0.0001,
"loss": 3.7124,
"step": 5
},
{
"epoch": 0.0014932802389248383,
"grad_norm": 1.0480749607086182,
"learning_rate": 0.00012,
"loss": 3.2383,
"step": 6
},
{
"epoch": 0.0017421602787456446,
"grad_norm": 2.3310468196868896,
"learning_rate": 0.00014,
"loss": 3.2751,
"step": 7
},
{
"epoch": 0.001991040318566451,
"grad_norm": 0.622734546661377,
"learning_rate": 0.00016,
"loss": 2.9144,
"step": 8
},
{
"epoch": 0.002239920358387257,
"grad_norm": 1.5959340333938599,
"learning_rate": 0.00018,
"loss": 3.4992,
"step": 9
},
{
"epoch": 0.0024888003982080635,
"grad_norm": 2.5492312908172607,
"learning_rate": 0.0002,
"loss": 3.7241,
"step": 10
},
{
"epoch": 0.0027376804380288703,
"grad_norm": 2.9819629192352295,
"learning_rate": 0.0001999979446958366,
"loss": 2.5479,
"step": 11
},
{
"epoch": 0.0029865604778496766,
"grad_norm": 2.515784978866577,
"learning_rate": 0.00019999177886783194,
"loss": 2.9924,
"step": 12
},
{
"epoch": 0.003235440517670483,
"grad_norm": 0.9300045371055603,
"learning_rate": 0.00019998150276943902,
"loss": 2.7006,
"step": 13
},
{
"epoch": 0.003484320557491289,
"grad_norm": 3.067373275756836,
"learning_rate": 0.000199967116823068,
"loss": 2.735,
"step": 14
},
{
"epoch": 0.0037332005973120955,
"grad_norm": 3.160754919052124,
"learning_rate": 0.0001999486216200688,
"loss": 2.8188,
"step": 15
},
{
"epoch": 0.003982080637132902,
"grad_norm": 3.5078177452087402,
"learning_rate": 0.00019992601792070679,
"loss": 2.4545,
"step": 16
},
{
"epoch": 0.004230960676953708,
"grad_norm": 1.3515621423721313,
"learning_rate": 0.00019989930665413147,
"loss": 2.4931,
"step": 17
},
{
"epoch": 0.004479840716774514,
"grad_norm": 1.023354172706604,
"learning_rate": 0.00019986848891833845,
"loss": 2.6162,
"step": 18
},
{
"epoch": 0.004728720756595321,
"grad_norm": 0.6757822036743164,
"learning_rate": 0.0001998335659801241,
"loss": 2.3384,
"step": 19
},
{
"epoch": 0.004977600796416127,
"grad_norm": 2.4865520000457764,
"learning_rate": 0.00019979453927503364,
"loss": 2.0874,
"step": 20
},
{
"epoch": 0.005226480836236934,
"grad_norm": 0.50606369972229,
"learning_rate": 0.00019975141040730207,
"loss": 2.2065,
"step": 21
},
{
"epoch": 0.0054753608760577405,
"grad_norm": 0.5787423253059387,
"learning_rate": 0.0001997041811497882,
"loss": 2.1371,
"step": 22
},
{
"epoch": 0.005724240915878547,
"grad_norm": 2.7240867614746094,
"learning_rate": 0.00019965285344390184,
"loss": 2.3011,
"step": 23
},
{
"epoch": 0.005973120955699353,
"grad_norm": 0.5894359350204468,
"learning_rate": 0.00019959742939952392,
"loss": 2.2565,
"step": 24
},
{
"epoch": 0.0062220009955201595,
"grad_norm": 0.47996145486831665,
"learning_rate": 0.00019953791129491983,
"loss": 2.1724,
"step": 25
},
{
"epoch": 0.006470881035340966,
"grad_norm": 0.43773430585861206,
"learning_rate": 0.00019947430157664576,
"loss": 2.21,
"step": 26
},
{
"epoch": 0.006719761075161772,
"grad_norm": 0.4364480972290039,
"learning_rate": 0.00019940660285944803,
"loss": 1.9883,
"step": 27
},
{
"epoch": 0.006968641114982578,
"grad_norm": 0.42618849873542786,
"learning_rate": 0.00019933481792615583,
"loss": 1.8854,
"step": 28
},
{
"epoch": 0.007217521154803385,
"grad_norm": 0.9404358267784119,
"learning_rate": 0.0001992589497275665,
"loss": 1.9978,
"step": 29
},
{
"epoch": 0.007466401194624191,
"grad_norm": 0.4599241614341736,
"learning_rate": 0.0001991790013823246,
"loss": 2.1981,
"step": 30
},
{
"epoch": 0.007715281234444997,
"grad_norm": 0.5835628509521484,
"learning_rate": 0.00019909497617679348,
"loss": 2.0446,
"step": 31
},
{
"epoch": 0.007964161274265804,
"grad_norm": 1.6969475746154785,
"learning_rate": 0.0001990068775649202,
"loss": 1.7206,
"step": 32
},
{
"epoch": 0.00821304131408661,
"grad_norm": 1.1236441135406494,
"learning_rate": 0.00019891470916809362,
"loss": 2.3924,
"step": 33
},
{
"epoch": 0.008461921353907416,
"grad_norm": 0.8025091886520386,
"learning_rate": 0.00019881847477499557,
"loss": 2.0218,
"step": 34
},
{
"epoch": 0.008710801393728223,
"grad_norm": 1.1834402084350586,
"learning_rate": 0.00019871817834144504,
"loss": 2.3439,
"step": 35
},
{
"epoch": 0.008959681433549029,
"grad_norm": 1.910337209701538,
"learning_rate": 0.0001986138239902355,
"loss": 2.174,
"step": 36
},
{
"epoch": 0.009208561473369835,
"grad_norm": 2.081226348876953,
"learning_rate": 0.0001985054160109657,
"loss": 1.9232,
"step": 37
},
{
"epoch": 0.009457441513190641,
"grad_norm": 0.8203554749488831,
"learning_rate": 0.00019839295885986296,
"loss": 2.3514,
"step": 38
},
{
"epoch": 0.009706321553011448,
"grad_norm": 0.6460116505622864,
"learning_rate": 0.0001982764571596004,
"loss": 2.0705,
"step": 39
},
{
"epoch": 0.009955201592832254,
"grad_norm": 0.7928256988525391,
"learning_rate": 0.00019815591569910654,
"loss": 2.2336,
"step": 40
},
{
"epoch": 0.01020408163265306,
"grad_norm": 0.5046447515487671,
"learning_rate": 0.00019803133943336874,
"loss": 2.3007,
"step": 41
},
{
"epoch": 0.010452961672473868,
"grad_norm": 0.7800955772399902,
"learning_rate": 0.0001979027334832293,
"loss": 1.7504,
"step": 42
},
{
"epoch": 0.010701841712294675,
"grad_norm": 0.4248393476009369,
"learning_rate": 0.00019777010313517518,
"loss": 2.1283,
"step": 43
},
{
"epoch": 0.010950721752115481,
"grad_norm": 0.4310763478279114,
"learning_rate": 0.00019763345384112043,
"loss": 2.1454,
"step": 44
},
{
"epoch": 0.011199601791936287,
"grad_norm": 0.3514373302459717,
"learning_rate": 0.00019749279121818235,
"loss": 2.0302,
"step": 45
},
{
"epoch": 0.011448481831757094,
"grad_norm": 0.4506613612174988,
"learning_rate": 0.00019734812104845047,
"loss": 2.3444,
"step": 46
},
{
"epoch": 0.0116973618715779,
"grad_norm": 0.823046088218689,
"learning_rate": 0.00019719944927874881,
"loss": 2.1397,
"step": 47
},
{
"epoch": 0.011946241911398706,
"grad_norm": 0.6662668585777283,
"learning_rate": 0.0001970467820203915,
"loss": 2.2086,
"step": 48
},
{
"epoch": 0.012195121951219513,
"grad_norm": 1.1788537502288818,
"learning_rate": 0.00019689012554893154,
"loss": 1.9532,
"step": 49
},
{
"epoch": 0.012444001991040319,
"grad_norm": 0.41714179515838623,
"learning_rate": 0.00019672948630390294,
"loss": 1.994,
"step": 50
},
{
"epoch": 0.012692882030861125,
"grad_norm": 0.40183207392692566,
"learning_rate": 0.00019656487088855592,
"loss": 1.986,
"step": 51
},
{
"epoch": 0.012941762070681932,
"grad_norm": 0.6568073034286499,
"learning_rate": 0.00019639628606958533,
"loss": 1.9911,
"step": 52
},
{
"epoch": 0.013190642110502738,
"grad_norm": 0.40880608558654785,
"learning_rate": 0.0001962237387768529,
"loss": 2.2154,
"step": 53
},
{
"epoch": 0.013439522150323544,
"grad_norm": 0.3884672522544861,
"learning_rate": 0.00019604723610310194,
"loss": 1.9765,
"step": 54
},
{
"epoch": 0.01368840219014435,
"grad_norm": 0.3780982196331024,
"learning_rate": 0.00019586678530366606,
"loss": 2.0202,
"step": 55
},
{
"epoch": 0.013937282229965157,
"grad_norm": 0.4995989203453064,
"learning_rate": 0.00019568239379617088,
"loss": 1.8901,
"step": 56
},
{
"epoch": 0.014186162269785963,
"grad_norm": 0.3196570575237274,
"learning_rate": 0.00019549406916022905,
"loss": 2.3905,
"step": 57
},
{
"epoch": 0.01443504230960677,
"grad_norm": 0.3686681389808655,
"learning_rate": 0.00019530181913712872,
"loss": 2.184,
"step": 58
},
{
"epoch": 0.014683922349427576,
"grad_norm": 0.3581784963607788,
"learning_rate": 0.00019510565162951537,
"loss": 1.8453,
"step": 59
},
{
"epoch": 0.014932802389248382,
"grad_norm": 0.3524768054485321,
"learning_rate": 0.00019490557470106686,
"loss": 2.0638,
"step": 60
},
{
"epoch": 0.015181682429069188,
"grad_norm": 0.344062864780426,
"learning_rate": 0.00019470159657616215,
"loss": 2.0319,
"step": 61
},
{
"epoch": 0.015430562468889995,
"grad_norm": 0.4052053391933441,
"learning_rate": 0.00019449372563954293,
"loss": 1.8948,
"step": 62
},
{
"epoch": 0.0156794425087108,
"grad_norm": 0.40913650393486023,
"learning_rate": 0.0001942819704359693,
"loss": 2.2626,
"step": 63
},
{
"epoch": 0.015928322548531607,
"grad_norm": 0.3126090168952942,
"learning_rate": 0.00019406633966986828,
"loss": 2.1733,
"step": 64
},
{
"epoch": 0.016177202588352414,
"grad_norm": 0.35215505957603455,
"learning_rate": 0.00019384684220497605,
"loss": 2.2469,
"step": 65
},
{
"epoch": 0.01642608262817322,
"grad_norm": 0.36992332339286804,
"learning_rate": 0.00019362348706397373,
"loss": 2.32,
"step": 66
},
{
"epoch": 0.016674962667994026,
"grad_norm": 0.3458121418952942,
"learning_rate": 0.00019339628342811632,
"loss": 2.0615,
"step": 67
},
{
"epoch": 0.016923842707814832,
"grad_norm": 0.33729124069213867,
"learning_rate": 0.0001931652406368554,
"loss": 2.1723,
"step": 68
},
{
"epoch": 0.01717272274763564,
"grad_norm": 0.46333789825439453,
"learning_rate": 0.0001929303681874552,
"loss": 2.2158,
"step": 69
},
{
"epoch": 0.017421602787456445,
"grad_norm": 0.3130255341529846,
"learning_rate": 0.0001926916757346022,
"loss": 1.991,
"step": 70
},
{
"epoch": 0.01767048282727725,
"grad_norm": 0.3570137917995453,
"learning_rate": 0.00019244917309000817,
"loss": 1.922,
"step": 71
},
{
"epoch": 0.017919362867098058,
"grad_norm": 0.27171969413757324,
"learning_rate": 0.00019220287022200707,
"loss": 2.2815,
"step": 72
},
{
"epoch": 0.018168242906918864,
"grad_norm": 0.30303627252578735,
"learning_rate": 0.0001919527772551451,
"loss": 2.09,
"step": 73
},
{
"epoch": 0.01841712294673967,
"grad_norm": 0.45809829235076904,
"learning_rate": 0.00019169890446976454,
"loss": 2.0446,
"step": 74
},
{
"epoch": 0.018666002986560477,
"grad_norm": 0.29333075881004333,
"learning_rate": 0.00019144126230158127,
"loss": 1.9728,
"step": 75
},
{
"epoch": 0.018914883026381283,
"grad_norm": 0.2959883511066437,
"learning_rate": 0.0001911798613412557,
"loss": 2.0945,
"step": 76
},
{
"epoch": 0.01916376306620209,
"grad_norm": 0.31434130668640137,
"learning_rate": 0.0001909147123339575,
"loss": 2.1703,
"step": 77
},
{
"epoch": 0.019412643106022896,
"grad_norm": 0.3976629972457886,
"learning_rate": 0.0001906458261789238,
"loss": 2.1109,
"step": 78
},
{
"epoch": 0.019661523145843702,
"grad_norm": 0.678793728351593,
"learning_rate": 0.00019037321392901136,
"loss": 1.9291,
"step": 79
},
{
"epoch": 0.019910403185664508,
"grad_norm": 0.46012192964553833,
"learning_rate": 0.0001900968867902419,
"loss": 2.265,
"step": 80
},
{
"epoch": 0.020159283225485315,
"grad_norm": 0.3698313534259796,
"learning_rate": 0.0001898168561213419,
"loss": 2.1672,
"step": 81
},
{
"epoch": 0.02040816326530612,
"grad_norm": 0.3445495069026947,
"learning_rate": 0.0001895331334332753,
"loss": 2.0265,
"step": 82
},
{
"epoch": 0.020657043305126927,
"grad_norm": 0.38758572936058044,
"learning_rate": 0.0001892457303887706,
"loss": 2.1402,
"step": 83
},
{
"epoch": 0.020905923344947737,
"grad_norm": 0.40104663372039795,
"learning_rate": 0.0001889546588018412,
"loss": 2.2048,
"step": 84
},
{
"epoch": 0.021154803384768543,
"grad_norm": 0.42981839179992676,
"learning_rate": 0.00018865993063730004,
"loss": 2.2485,
"step": 85
},
{
"epoch": 0.02140368342458935,
"grad_norm": 0.3159562647342682,
"learning_rate": 0.00018836155801026753,
"loss": 1.8041,
"step": 86
},
{
"epoch": 0.021652563464410156,
"grad_norm": 0.415228009223938,
"learning_rate": 0.0001880595531856738,
"loss": 2.08,
"step": 87
},
{
"epoch": 0.021901443504230962,
"grad_norm": 0.31004348397254944,
"learning_rate": 0.00018775392857775432,
"loss": 2.0714,
"step": 88
},
{
"epoch": 0.02215032354405177,
"grad_norm": 0.3166860342025757,
"learning_rate": 0.00018744469674953956,
"loss": 2.1457,
"step": 89
},
{
"epoch": 0.022399203583872575,
"grad_norm": 0.3657294511795044,
"learning_rate": 0.00018713187041233896,
"loss": 2.1152,
"step": 90
},
{
"epoch": 0.02264808362369338,
"grad_norm": 0.3469568192958832,
"learning_rate": 0.00018681546242521786,
"loss": 1.9196,
"step": 91
},
{
"epoch": 0.022896963663514187,
"grad_norm": 0.3438771665096283,
"learning_rate": 0.00018649548579446936,
"loss": 2.2341,
"step": 92
},
{
"epoch": 0.023145843703334994,
"grad_norm": 0.30898717045783997,
"learning_rate": 0.0001861719536730795,
"loss": 2.0978,
"step": 93
},
{
"epoch": 0.0233947237431558,
"grad_norm": 0.27930477261543274,
"learning_rate": 0.00018584487936018661,
"loss": 2.0048,
"step": 94
},
{
"epoch": 0.023643603782976606,
"grad_norm": 0.32163354754447937,
"learning_rate": 0.00018551427630053463,
"loss": 1.9489,
"step": 95
},
{
"epoch": 0.023892483822797413,
"grad_norm": 0.3902042806148529,
"learning_rate": 0.00018518015808392045,
"loss": 1.9324,
"step": 96
},
{
"epoch": 0.02414136386261822,
"grad_norm": 0.35879242420196533,
"learning_rate": 0.00018484253844463526,
"loss": 2.0225,
"step": 97
},
{
"epoch": 0.024390243902439025,
"grad_norm": 0.33365774154663086,
"learning_rate": 0.00018450143126090015,
"loss": 2.0397,
"step": 98
},
{
"epoch": 0.02463912394225983,
"grad_norm": 0.35122770071029663,
"learning_rate": 0.00018415685055429533,
"loss": 2.03,
"step": 99
},
{
"epoch": 0.024888003982080638,
"grad_norm": 0.29391220211982727,
"learning_rate": 0.00018380881048918405,
"loss": 2.119,
"step": 100
},
{
"epoch": 0.025136884021901444,
"grad_norm": 0.4452991187572479,
"learning_rate": 0.00018345732537213027,
"loss": 1.9258,
"step": 101
},
{
"epoch": 0.02538576406172225,
"grad_norm": 0.38186201453208923,
"learning_rate": 0.00018310240965131041,
"loss": 1.6995,
"step": 102
},
{
"epoch": 0.025634644101543057,
"grad_norm": 0.3281259834766388,
"learning_rate": 0.00018274407791591966,
"loss": 1.7411,
"step": 103
},
{
"epoch": 0.025883524141363863,
"grad_norm": 0.29078221321105957,
"learning_rate": 0.00018238234489557215,
"loss": 1.8857,
"step": 104
},
{
"epoch": 0.02613240418118467,
"grad_norm": 0.2946663200855255,
"learning_rate": 0.0001820172254596956,
"loss": 1.8642,
"step": 105
},
{
"epoch": 0.026381284221005476,
"grad_norm": 0.38488832116127014,
"learning_rate": 0.00018164873461691986,
"loss": 2.0094,
"step": 106
},
{
"epoch": 0.026630164260826282,
"grad_norm": 0.33220160007476807,
"learning_rate": 0.00018127688751446027,
"loss": 2.1743,
"step": 107
},
{
"epoch": 0.02687904430064709,
"grad_norm": 0.3243928849697113,
"learning_rate": 0.00018090169943749476,
"loss": 1.8538,
"step": 108
},
{
"epoch": 0.027127924340467895,
"grad_norm": 0.31402042508125305,
"learning_rate": 0.0001805231858085356,
"loss": 2.0906,
"step": 109
},
{
"epoch": 0.0273768043802887,
"grad_norm": 0.46022218465805054,
"learning_rate": 0.00018014136218679567,
"loss": 2.1055,
"step": 110
},
{
"epoch": 0.027625684420109507,
"grad_norm": 0.3270494341850281,
"learning_rate": 0.00017975624426754848,
"loss": 1.6856,
"step": 111
},
{
"epoch": 0.027874564459930314,
"grad_norm": 0.31129342317581177,
"learning_rate": 0.00017936784788148328,
"loss": 2.2389,
"step": 112
},
{
"epoch": 0.02812344449975112,
"grad_norm": 0.3763848543167114,
"learning_rate": 0.00017897618899405423,
"loss": 2.2251,
"step": 113
},
{
"epoch": 0.028372324539571926,
"grad_norm": 0.3961619436740875,
"learning_rate": 0.00017858128370482426,
"loss": 2.1523,
"step": 114
},
{
"epoch": 0.028621204579392732,
"grad_norm": 0.32675299048423767,
"learning_rate": 0.000178183148246803,
"loss": 1.8199,
"step": 115
},
{
"epoch": 0.02887008461921354,
"grad_norm": 0.39736902713775635,
"learning_rate": 0.00017778179898577973,
"loss": 2.0062,
"step": 116
},
{
"epoch": 0.029118964659034345,
"grad_norm": 0.3664799928665161,
"learning_rate": 0.00017737725241965069,
"loss": 2.2775,
"step": 117
},
{
"epoch": 0.02936784469885515,
"grad_norm": 0.3666561543941498,
"learning_rate": 0.00017696952517774062,
"loss": 1.9005,
"step": 118
},
{
"epoch": 0.029616724738675958,
"grad_norm": 0.2838401794433594,
"learning_rate": 0.00017655863402011947,
"loss": 2.0504,
"step": 119
},
{
"epoch": 0.029865604778496764,
"grad_norm": 0.3173494338989258,
"learning_rate": 0.00017614459583691346,
"loss": 1.9387,
"step": 120
},
{
"epoch": 0.03011448481831757,
"grad_norm": 0.31382298469543457,
"learning_rate": 0.00017572742764761055,
"loss": 2.0618,
"step": 121
},
{
"epoch": 0.030363364858138377,
"grad_norm": 0.3865104019641876,
"learning_rate": 0.00017530714660036112,
"loss": 2.0401,
"step": 122
},
{
"epoch": 0.030612244897959183,
"grad_norm": 0.39169177412986755,
"learning_rate": 0.00017488376997127283,
"loss": 1.7495,
"step": 123
},
{
"epoch": 0.03086112493777999,
"grad_norm": 0.3588005602359772,
"learning_rate": 0.0001744573151637007,
"loss": 2.028,
"step": 124
},
{
"epoch": 0.031110004977600796,
"grad_norm": 0.46878084540367126,
"learning_rate": 0.00017402779970753155,
"loss": 1.6867,
"step": 125
},
{
"epoch": 0.031110004977600796,
"eval_loss": 2.049610137939453,
"eval_runtime": 47.161,
"eval_samples_per_second": 13.91,
"eval_steps_per_second": 6.955,
"step": 125
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 125,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6511065563136000.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}