{
"best_metric": 1.1385369300842285,
"best_model_checkpoint": "miner_id_24/checkpoint-50",
"epoch": 3.004016064257028,
"eval_steps": 50,
"global_step": 187,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01606425702811245,
"grad_norm": 8.358396530151367,
"learning_rate": 1e-05,
"loss": 10.6499,
"step": 1
},
{
"epoch": 0.01606425702811245,
"eval_loss": 2.549973487854004,
"eval_runtime": 5.012,
"eval_samples_per_second": 20.95,
"eval_steps_per_second": 5.387,
"step": 1
},
{
"epoch": 0.0321285140562249,
"grad_norm": 8.561280250549316,
"learning_rate": 2e-05,
"loss": 10.1041,
"step": 2
},
{
"epoch": 0.04819277108433735,
"grad_norm": 7.631792068481445,
"learning_rate": 3e-05,
"loss": 9.1649,
"step": 3
},
{
"epoch": 0.0642570281124498,
"grad_norm": 8.794489860534668,
"learning_rate": 4e-05,
"loss": 10.3223,
"step": 4
},
{
"epoch": 0.08032128514056225,
"grad_norm": 8.187786102294922,
"learning_rate": 5e-05,
"loss": 8.9209,
"step": 5
},
{
"epoch": 0.0963855421686747,
"grad_norm": 9.42801284790039,
"learning_rate": 6e-05,
"loss": 8.9984,
"step": 6
},
{
"epoch": 0.11244979919678715,
"grad_norm": 8.824488639831543,
"learning_rate": 7e-05,
"loss": 9.4921,
"step": 7
},
{
"epoch": 0.1285140562248996,
"grad_norm": 7.4875640869140625,
"learning_rate": 8e-05,
"loss": 7.2293,
"step": 8
},
{
"epoch": 0.14457831325301204,
"grad_norm": 5.650693416595459,
"learning_rate": 9e-05,
"loss": 6.9636,
"step": 9
},
{
"epoch": 0.1606425702811245,
"grad_norm": 4.513960838317871,
"learning_rate": 0.0001,
"loss": 6.1581,
"step": 10
},
{
"epoch": 0.17670682730923695,
"grad_norm": 4.6525983810424805,
"learning_rate": 9.99921244331919e-05,
"loss": 5.6187,
"step": 11
},
{
"epoch": 0.1927710843373494,
"grad_norm": 4.981886863708496,
"learning_rate": 9.996850021374968e-05,
"loss": 5.8963,
"step": 12
},
{
"epoch": 0.20883534136546184,
"grad_norm": 4.619255065917969,
"learning_rate": 9.99291347838381e-05,
"loss": 6.3916,
"step": 13
},
{
"epoch": 0.2248995983935743,
"grad_norm": 42.78565979003906,
"learning_rate": 9.987404054446008e-05,
"loss": 5.2134,
"step": 14
},
{
"epoch": 0.24096385542168675,
"grad_norm": 4.6155290603637695,
"learning_rate": 9.980323485155013e-05,
"loss": 3.8579,
"step": 15
},
{
"epoch": 0.2570281124497992,
"grad_norm": 4.810256004333496,
"learning_rate": 9.971674001050686e-05,
"loss": 6.0338,
"step": 16
},
{
"epoch": 0.27309236947791166,
"grad_norm": 4.097535610198975,
"learning_rate": 9.961458326916624e-05,
"loss": 5.6122,
"step": 17
},
{
"epoch": 0.2891566265060241,
"grad_norm": 4.028859615325928,
"learning_rate": 9.94967968092179e-05,
"loss": 5.251,
"step": 18
},
{
"epoch": 0.30522088353413657,
"grad_norm": 3.8736844062805176,
"learning_rate": 9.936341773606723e-05,
"loss": 5.9792,
"step": 19
},
{
"epoch": 0.321285140562249,
"grad_norm": 3.783942937850952,
"learning_rate": 9.921448806714631e-05,
"loss": 5.6756,
"step": 20
},
{
"epoch": 0.3373493975903614,
"grad_norm": 3.129595994949341,
"learning_rate": 9.905005471867739e-05,
"loss": 4.8383,
"step": 21
},
{
"epoch": 0.3534136546184739,
"grad_norm": 3.2784130573272705,
"learning_rate": 9.887016949089333e-05,
"loss": 4.6347,
"step": 22
},
{
"epoch": 0.36947791164658633,
"grad_norm": 3.4912328720092773,
"learning_rate": 9.867488905171933e-05,
"loss": 4.8285,
"step": 23
},
{
"epoch": 0.3855421686746988,
"grad_norm": 4.122783184051514,
"learning_rate": 9.846427491892118e-05,
"loss": 4.9993,
"step": 24
},
{
"epoch": 0.40160642570281124,
"grad_norm": 3.454385757446289,
"learning_rate": 9.82383934407258e-05,
"loss": 4.7978,
"step": 25
},
{
"epoch": 0.41767068273092367,
"grad_norm": 3.439769744873047,
"learning_rate": 9.79973157749201e-05,
"loss": 4.8947,
"step": 26
},
{
"epoch": 0.43373493975903615,
"grad_norm": 3.4278244972229004,
"learning_rate": 9.77411178664346e-05,
"loss": 4.5172,
"step": 27
},
{
"epoch": 0.4497991967871486,
"grad_norm": 3.5005717277526855,
"learning_rate": 9.746988042341906e-05,
"loss": 4.7726,
"step": 28
},
{
"epoch": 0.46586345381526106,
"grad_norm": 3.3018925189971924,
"learning_rate": 9.718368889181764e-05,
"loss": 4.3067,
"step": 29
},
{
"epoch": 0.4819277108433735,
"grad_norm": 3.5539093017578125,
"learning_rate": 9.68826334284514e-05,
"loss": 3.7747,
"step": 30
},
{
"epoch": 0.4979919678714859,
"grad_norm": 3.4804933071136475,
"learning_rate": 9.656680887261693e-05,
"loss": 5.1528,
"step": 31
},
{
"epoch": 0.5140562248995983,
"grad_norm": 3.2824409008026123,
"learning_rate": 9.62363147162098e-05,
"loss": 4.9029,
"step": 32
},
{
"epoch": 0.5301204819277109,
"grad_norm": 3.1446473598480225,
"learning_rate": 9.589125507238233e-05,
"loss": 4.8181,
"step": 33
},
{
"epoch": 0.5461847389558233,
"grad_norm": 3.476330041885376,
"learning_rate": 9.553173864274567e-05,
"loss": 5.1613,
"step": 34
},
{
"epoch": 0.5622489959839357,
"grad_norm": 3.2551815509796143,
"learning_rate": 9.515787868312619e-05,
"loss": 4.7212,
"step": 35
},
{
"epoch": 0.5783132530120482,
"grad_norm": 3.1978161334991455,
"learning_rate": 9.476979296788747e-05,
"loss": 4.7767,
"step": 36
},
{
"epoch": 0.5943775100401606,
"grad_norm": 2.728731632232666,
"learning_rate": 9.436760375282859e-05,
"loss": 4.4009,
"step": 37
},
{
"epoch": 0.6104417670682731,
"grad_norm": 2.96553373336792,
"learning_rate": 9.395143773667088e-05,
"loss": 5.6028,
"step": 38
},
{
"epoch": 0.6265060240963856,
"grad_norm": 3.6579477787017822,
"learning_rate": 9.352142602114486e-05,
"loss": 4.9442,
"step": 39
},
{
"epoch": 0.642570281124498,
"grad_norm": 2.903290271759033,
"learning_rate": 9.30777040696903e-05,
"loss": 4.5713,
"step": 40
},
{
"epoch": 0.6586345381526104,
"grad_norm": 3.4129068851470947,
"learning_rate": 9.262041166478214e-05,
"loss": 4.6581,
"step": 41
},
{
"epoch": 0.6746987951807228,
"grad_norm": 2.9026341438293457,
"learning_rate": 9.214969286389576e-05,
"loss": 3.8248,
"step": 42
},
{
"epoch": 0.6907630522088354,
"grad_norm": 2.8055315017700195,
"learning_rate": 9.166569595412575e-05,
"loss": 3.7736,
"step": 43
},
{
"epoch": 0.7068273092369478,
"grad_norm": 3.1325647830963135,
"learning_rate": 9.116857340547202e-05,
"loss": 4.526,
"step": 44
},
{
"epoch": 0.7228915662650602,
"grad_norm": 3.3222157955169678,
"learning_rate": 9.065848182280833e-05,
"loss": 3.4443,
"step": 45
},
{
"epoch": 0.7389558232931727,
"grad_norm": 9.711248397827148,
"learning_rate": 9.013558189654819e-05,
"loss": 4.5834,
"step": 46
},
{
"epoch": 0.7550200803212851,
"grad_norm": 3.783061981201172,
"learning_rate": 8.96000383520237e-05,
"loss": 4.9376,
"step": 47
},
{
"epoch": 0.7710843373493976,
"grad_norm": 3.666478157043457,
"learning_rate": 8.905201989759341e-05,
"loss": 4.9693,
"step": 48
},
{
"epoch": 0.7871485943775101,
"grad_norm": 2.7563154697418213,
"learning_rate": 8.849169917149531e-05,
"loss": 4.0881,
"step": 49
},
{
"epoch": 0.8032128514056225,
"grad_norm": 3.036672353744507,
"learning_rate": 8.791925268746193e-05,
"loss": 4.4524,
"step": 50
},
{
"epoch": 0.8032128514056225,
"eval_loss": 1.1385369300842285,
"eval_runtime": 5.1628,
"eval_samples_per_second": 20.338,
"eval_steps_per_second": 5.23,
"step": 50
},
{
"epoch": 0.8192771084337349,
"grad_norm": 2.9426279067993164,
"learning_rate": 8.73348607791144e-05,
"loss": 4.6989,
"step": 51
},
{
"epoch": 0.8353413654618473,
"grad_norm": 3.0048654079437256,
"learning_rate": 8.673870754315336e-05,
"loss": 4.2927,
"step": 52
},
{
"epoch": 0.8514056224899599,
"grad_norm": 3.0875372886657715,
"learning_rate": 8.613098078136437e-05,
"loss": 4.8515,
"step": 53
},
{
"epoch": 0.8674698795180723,
"grad_norm": 3.7591512203216553,
"learning_rate": 8.551187194145592e-05,
"loss": 4.032,
"step": 54
},
{
"epoch": 0.8835341365461847,
"grad_norm": 2.896423816680908,
"learning_rate": 8.488157605674925e-05,
"loss": 4.0854,
"step": 55
},
{
"epoch": 0.8995983935742972,
"grad_norm": 2.782850503921509,
"learning_rate": 8.424029168473829e-05,
"loss": 4.6205,
"step": 56
},
{
"epoch": 0.9156626506024096,
"grad_norm": 2.9316649436950684,
"learning_rate": 8.358822084453965e-05,
"loss": 3.9744,
"step": 57
},
{
"epoch": 0.9317269076305221,
"grad_norm": 2.7768986225128174,
"learning_rate": 8.292556895325194e-05,
"loss": 4.2567,
"step": 58
},
{
"epoch": 0.9477911646586346,
"grad_norm": 3.6404924392700195,
"learning_rate": 8.225254476124478e-05,
"loss": 4.2149,
"step": 59
},
{
"epoch": 0.963855421686747,
"grad_norm": 3.508265972137451,
"learning_rate": 8.156936028639767e-05,
"loss": 3.9642,
"step": 60
},
{
"epoch": 0.9799196787148594,
"grad_norm": 3.133049726486206,
"learning_rate": 8.08762307473096e-05,
"loss": 4.7128,
"step": 61
},
{
"epoch": 0.9959839357429718,
"grad_norm": 3.1079959869384766,
"learning_rate": 8.01733744955002e-05,
"loss": 4.2072,
"step": 62
},
{
"epoch": 1.0120481927710843,
"grad_norm": 5.682511806488037,
"learning_rate": 7.946101294662418e-05,
"loss": 5.271,
"step": 63
},
{
"epoch": 1.0281124497991967,
"grad_norm": 2.7712900638580322,
"learning_rate": 7.873937051072035e-05,
"loss": 4.2853,
"step": 64
},
{
"epoch": 1.0441767068273093,
"grad_norm": 2.630441188812256,
"learning_rate": 7.80086745215173e-05,
"loss": 3.7277,
"step": 65
},
{
"epoch": 1.0602409638554218,
"grad_norm": 2.798489570617676,
"learning_rate": 7.726915516481824e-05,
"loss": 4.2442,
"step": 66
},
{
"epoch": 1.0763052208835342,
"grad_norm": 2.5500569343566895,
"learning_rate": 7.652104540598712e-05,
"loss": 3.378,
"step": 67
},
{
"epoch": 1.0923694779116466,
"grad_norm": 2.5407848358154297,
"learning_rate": 7.57645809165594e-05,
"loss": 4.2423,
"step": 68
},
{
"epoch": 1.108433734939759,
"grad_norm": 2.9551448822021484,
"learning_rate": 7.500000000000001e-05,
"loss": 4.8987,
"step": 69
},
{
"epoch": 1.1244979919678715,
"grad_norm": 2.51187801361084,
"learning_rate": 7.422754351663252e-05,
"loss": 3.8333,
"step": 70
},
{
"epoch": 1.140562248995984,
"grad_norm": 2.5731523036956787,
"learning_rate": 7.344745480776257e-05,
"loss": 3.8188,
"step": 71
},
{
"epoch": 1.1566265060240963,
"grad_norm": 2.586987018585205,
"learning_rate": 7.265997961901987e-05,
"loss": 3.5334,
"step": 72
},
{
"epoch": 1.1726907630522088,
"grad_norm": 2.6580755710601807,
"learning_rate": 7.186536602294278e-05,
"loss": 3.6089,
"step": 73
},
{
"epoch": 1.1887550200803212,
"grad_norm": 2.7322309017181396,
"learning_rate": 7.106386434082979e-05,
"loss": 3.5912,
"step": 74
},
{
"epoch": 1.2048192771084336,
"grad_norm": 2.6883673667907715,
"learning_rate": 7.025572706388268e-05,
"loss": 3.4219,
"step": 75
},
{
"epoch": 1.2208835341365463,
"grad_norm": 3.3504085540771484,
"learning_rate": 6.944120877366604e-05,
"loss": 3.5701,
"step": 76
},
{
"epoch": 1.2369477911646587,
"grad_norm": 2.829219102859497,
"learning_rate": 6.86205660619083e-05,
"loss": 2.6253,
"step": 77
},
{
"epoch": 1.2530120481927711,
"grad_norm": 3.903244733810425,
"learning_rate": 6.779405744966954e-05,
"loss": 3.8675,
"step": 78
},
{
"epoch": 1.2690763052208835,
"grad_norm": 3.95448637008667,
"learning_rate": 6.696194330590151e-05,
"loss": 4.0529,
"step": 79
},
{
"epoch": 1.285140562248996,
"grad_norm": 2.8472986221313477,
"learning_rate": 6.612448576542545e-05,
"loss": 3.1193,
"step": 80
},
{
"epoch": 1.3012048192771084,
"grad_norm": 3.7642226219177246,
"learning_rate": 6.528194864635369e-05,
"loss": 3.6889,
"step": 81
},
{
"epoch": 1.3172690763052208,
"grad_norm": 3.163255214691162,
"learning_rate": 6.443459736698105e-05,
"loss": 3.8981,
"step": 82
},
{
"epoch": 1.3333333333333333,
"grad_norm": 3.7267022132873535,
"learning_rate": 6.358269886217194e-05,
"loss": 3.7673,
"step": 83
},
{
"epoch": 1.3493975903614457,
"grad_norm": 3.128511428833008,
"learning_rate": 6.272652149926988e-05,
"loss": 3.1023,
"step": 84
},
{
"epoch": 1.3654618473895583,
"grad_norm": 3.0362024307250977,
"learning_rate": 6.186633499355576e-05,
"loss": 3.2074,
"step": 85
},
{
"epoch": 1.3815261044176708,
"grad_norm": 3.2987911701202393,
"learning_rate": 6.100241032328124e-05,
"loss": 3.4199,
"step": 86
},
{
"epoch": 1.3975903614457832,
"grad_norm": 3.5199403762817383,
"learning_rate": 6.013501964430468e-05,
"loss": 3.5126,
"step": 87
},
{
"epoch": 1.4136546184738956,
"grad_norm": 3.5599286556243896,
"learning_rate": 5.9264436204355724e-05,
"loss": 3.6378,
"step": 88
},
{
"epoch": 1.429718875502008,
"grad_norm": 3.366896152496338,
"learning_rate": 5.839093425695609e-05,
"loss": 3.1565,
"step": 89
},
{
"epoch": 1.4457831325301205,
"grad_norm": 3.9521143436431885,
"learning_rate": 5.751478897502352e-05,
"loss": 3.5207,
"step": 90
},
{
"epoch": 1.461847389558233,
"grad_norm": 3.8751003742218018,
"learning_rate": 5.6636276364186105e-05,
"loss": 3.0394,
"step": 91
},
{
"epoch": 1.4779116465863453,
"grad_norm": 4.066254615783691,
"learning_rate": 5.5755673175834145e-05,
"loss": 3.2113,
"step": 92
},
{
"epoch": 1.4939759036144578,
"grad_norm": 4.048666954040527,
"learning_rate": 5.487325681993733e-05,
"loss": 3.5889,
"step": 93
},
{
"epoch": 1.5100401606425704,
"grad_norm": 4.316978931427002,
"learning_rate": 5.3989305277654156e-05,
"loss": 3.9679,
"step": 94
},
{
"epoch": 1.5261044176706826,
"grad_norm": 3.883434534072876,
"learning_rate": 5.31040970137617e-05,
"loss": 3.3537,
"step": 95
},
{
"epoch": 1.5421686746987953,
"grad_norm": 4.3469672203063965,
"learning_rate": 5.221791088893282e-05,
"loss": 4.7528,
"step": 96
},
{
"epoch": 1.5582329317269075,
"grad_norm": 3.930917739868164,
"learning_rate": 5.133102607188874e-05,
"loss": 3.8965,
"step": 97
},
{
"epoch": 1.5742971887550201,
"grad_norm": 3.873133420944214,
"learning_rate": 5.044372195145455e-05,
"loss": 3.3711,
"step": 98
},
{
"epoch": 1.5903614457831325,
"grad_norm": 3.6565146446228027,
"learning_rate": 4.955627804854545e-05,
"loss": 3.6828,
"step": 99
},
{
"epoch": 1.606425702811245,
"grad_norm": 3.9164047241210938,
"learning_rate": 4.866897392811126e-05,
"loss": 4.1462,
"step": 100
},
{
"epoch": 1.606425702811245,
"eval_loss": 1.1504853963851929,
"eval_runtime": 5.1386,
"eval_samples_per_second": 20.434,
"eval_steps_per_second": 5.254,
"step": 100
},
{
"epoch": 1.6224899598393574,
"grad_norm": 4.047707557678223,
"learning_rate": 4.7782089111067176e-05,
"loss": 3.7656,
"step": 101
},
{
"epoch": 1.6385542168674698,
"grad_norm": 3.6224374771118164,
"learning_rate": 4.6895902986238304e-05,
"loss": 3.7808,
"step": 102
},
{
"epoch": 1.6546184738955825,
"grad_norm": 3.4380745887756348,
"learning_rate": 4.601069472234584e-05,
"loss": 2.9976,
"step": 103
},
{
"epoch": 1.6706827309236947,
"grad_norm": 3.690455436706543,
"learning_rate": 4.512674318006268e-05,
"loss": 2.969,
"step": 104
},
{
"epoch": 1.6867469879518073,
"grad_norm": 3.4021284580230713,
"learning_rate": 4.424432682416585e-05,
"loss": 3.0345,
"step": 105
},
{
"epoch": 1.7028112449799195,
"grad_norm": 3.821765422821045,
"learning_rate": 4.336372363581391e-05,
"loss": 3.0546,
"step": 106
},
{
"epoch": 1.7188755020080322,
"grad_norm": 3.7544493675231934,
"learning_rate": 4.2485211024976496e-05,
"loss": 2.6392,
"step": 107
},
{
"epoch": 1.7349397590361446,
"grad_norm": 3.898599863052368,
"learning_rate": 4.160906574304392e-05,
"loss": 3.675,
"step": 108
},
{
"epoch": 1.751004016064257,
"grad_norm": 3.801405429840088,
"learning_rate": 4.0735563795644294e-05,
"loss": 3.8661,
"step": 109
},
{
"epoch": 1.7670682730923695,
"grad_norm": 4.313454627990723,
"learning_rate": 3.986498035569532e-05,
"loss": 4.2648,
"step": 110
},
{
"epoch": 1.783132530120482,
"grad_norm": 3.6857495307922363,
"learning_rate": 3.899758967671878e-05,
"loss": 3.2822,
"step": 111
},
{
"epoch": 1.7991967871485943,
"grad_norm": 3.5725905895233154,
"learning_rate": 3.8133665006444255e-05,
"loss": 3.4181,
"step": 112
},
{
"epoch": 1.8152610441767068,
"grad_norm": 3.708075761795044,
"learning_rate": 3.727347850073012e-05,
"loss": 3.0178,
"step": 113
},
{
"epoch": 1.8313253012048194,
"grad_norm": 4.1876220703125,
"learning_rate": 3.641730113782807e-05,
"loss": 3.5986,
"step": 114
},
{
"epoch": 1.8473895582329316,
"grad_norm": 3.863704204559326,
"learning_rate": 3.556540263301896e-05,
"loss": 3.5571,
"step": 115
},
{
"epoch": 1.8634538152610443,
"grad_norm": 3.584493398666382,
"learning_rate": 3.47180513536463e-05,
"loss": 3.3766,
"step": 116
},
{
"epoch": 1.8795180722891565,
"grad_norm": 3.944830894470215,
"learning_rate": 3.3875514234574556e-05,
"loss": 3.5299,
"step": 117
},
{
"epoch": 1.895582329317269,
"grad_norm": 3.59722900390625,
"learning_rate": 3.303805669409848e-05,
"loss": 3.1769,
"step": 118
},
{
"epoch": 1.9116465863453815,
"grad_norm": 3.9105989933013916,
"learning_rate": 3.2205942550330456e-05,
"loss": 3.3873,
"step": 119
},
{
"epoch": 1.927710843373494,
"grad_norm": 3.794104814529419,
"learning_rate": 3.1379433938091696e-05,
"loss": 3.3131,
"step": 120
},
{
"epoch": 1.9437751004016064,
"grad_norm": 3.642608165740967,
"learning_rate": 3.055879122633397e-05,
"loss": 2.8884,
"step": 121
},
{
"epoch": 1.9598393574297188,
"grad_norm": 4.019187927246094,
"learning_rate": 2.9744272936117323e-05,
"loss": 2.927,
"step": 122
},
{
"epoch": 1.9759036144578315,
"grad_norm": 4.073461532592773,
"learning_rate": 2.8936135659170216e-05,
"loss": 3.2049,
"step": 123
},
{
"epoch": 1.9919678714859437,
"grad_norm": 3.866515874862671,
"learning_rate": 2.8134633977057235e-05,
"loss": 3.0155,
"step": 124
},
{
"epoch": 2.0080321285140563,
"grad_norm": 6.784224033355713,
"learning_rate": 2.7340020380980146e-05,
"loss": 2.7591,
"step": 125
},
{
"epoch": 2.0240963855421685,
"grad_norm": 3.676565408706665,
"learning_rate": 2.655254519223746e-05,
"loss": 3.2366,
"step": 126
},
{
"epoch": 2.040160642570281,
"grad_norm": 3.3340706825256348,
"learning_rate": 2.5772456483367497e-05,
"loss": 2.8017,
"step": 127
},
{
"epoch": 2.0562248995983934,
"grad_norm": 3.5175349712371826,
"learning_rate": 2.500000000000001e-05,
"loss": 2.9206,
"step": 128
},
{
"epoch": 2.072289156626506,
"grad_norm": 3.3527610301971436,
"learning_rate": 2.4235419083440613e-05,
"loss": 2.5938,
"step": 129
},
{
"epoch": 2.0883534136546187,
"grad_norm": 3.2494187355041504,
"learning_rate": 2.347895459401288e-05,
"loss": 2.2693,
"step": 130
},
{
"epoch": 2.104417670682731,
"grad_norm": 4.090683460235596,
"learning_rate": 2.2730844835181757e-05,
"loss": 2.9655,
"step": 131
},
{
"epoch": 2.1204819277108435,
"grad_norm": 3.746830701828003,
"learning_rate": 2.1991325478482694e-05,
"loss": 2.6929,
"step": 132
},
{
"epoch": 2.1365461847389557,
"grad_norm": 3.2926580905914307,
"learning_rate": 2.126062948927966e-05,
"loss": 2.395,
"step": 133
},
{
"epoch": 2.1526104417670684,
"grad_norm": 4.051050662994385,
"learning_rate": 2.053898705337583e-05,
"loss": 2.8629,
"step": 134
},
{
"epoch": 2.1686746987951806,
"grad_norm": 3.782912254333496,
"learning_rate": 1.9826625504499806e-05,
"loss": 2.7832,
"step": 135
},
{
"epoch": 2.1847389558232932,
"grad_norm": 3.7877721786499023,
"learning_rate": 1.912376925269041e-05,
"loss": 1.9438,
"step": 136
},
{
"epoch": 2.2008032128514055,
"grad_norm": 3.5854597091674805,
"learning_rate": 1.8430639713602316e-05,
"loss": 1.9395,
"step": 137
},
{
"epoch": 2.216867469879518,
"grad_norm": 4.118504524230957,
"learning_rate": 1.7747455238755223e-05,
"loss": 2.7666,
"step": 138
},
{
"epoch": 2.2329317269076308,
"grad_norm": 4.0326924324035645,
"learning_rate": 1.7074431046748075e-05,
"loss": 1.8387,
"step": 139
},
{
"epoch": 2.248995983935743,
"grad_norm": 4.268743991851807,
"learning_rate": 1.641177915546036e-05,
"loss": 2.5107,
"step": 140
},
{
"epoch": 2.2650602409638556,
"grad_norm": 4.364184856414795,
"learning_rate": 1.5759708315261722e-05,
"loss": 2.4838,
"step": 141
},
{
"epoch": 2.281124497991968,
"grad_norm": 4.482365131378174,
"learning_rate": 1.5118423943250771e-05,
"loss": 2.5102,
"step": 142
},
{
"epoch": 2.2971887550200805,
"grad_norm": 4.34283447265625,
"learning_rate": 1.4488128058544098e-05,
"loss": 2.5356,
"step": 143
},
{
"epoch": 2.3132530120481927,
"grad_norm": 4.787532329559326,
"learning_rate": 1.3869019218635642e-05,
"loss": 3.0075,
"step": 144
},
{
"epoch": 2.3293172690763053,
"grad_norm": 4.763615608215332,
"learning_rate": 1.3261292456846647e-05,
"loss": 2.6659,
"step": 145
},
{
"epoch": 2.3453815261044175,
"grad_norm": 4.660789489746094,
"learning_rate": 1.2665139220885613e-05,
"loss": 2.6328,
"step": 146
},
{
"epoch": 2.36144578313253,
"grad_norm": 4.864875793457031,
"learning_rate": 1.2080747312538083e-05,
"loss": 2.6357,
"step": 147
},
{
"epoch": 2.3775100401606424,
"grad_norm": 5.13457727432251,
"learning_rate": 1.150830082850468e-05,
"loss": 2.601,
"step": 148
},
{
"epoch": 2.393574297188755,
"grad_norm": 4.843163967132568,
"learning_rate": 1.0947980102406596e-05,
"loss": 2.1365,
"step": 149
},
{
"epoch": 2.4096385542168672,
"grad_norm": 4.978252410888672,
"learning_rate": 1.0399961647976314e-05,
"loss": 2.4167,
"step": 150
},
{
"epoch": 2.4096385542168672,
"eval_loss": 1.3296469449996948,
"eval_runtime": 5.1411,
"eval_samples_per_second": 20.423,
"eval_steps_per_second": 5.252,
"step": 150
},
{
"epoch": 2.42570281124498,
"grad_norm": 5.148977279663086,
"learning_rate": 9.864418103451828e-06,
"loss": 2.2449,
"step": 151
},
{
"epoch": 2.4417670682730925,
"grad_norm": 4.7471466064453125,
"learning_rate": 9.34151817719166e-06,
"loss": 1.7333,
"step": 152
},
{
"epoch": 2.4578313253012047,
"grad_norm": 5.056032180786133,
"learning_rate": 8.831426594527975e-06,
"loss": 1.7489,
"step": 153
},
{
"epoch": 2.4738955823293174,
"grad_norm": 5.261003017425537,
"learning_rate": 8.334304045874247e-06,
"loss": 1.6044,
"step": 154
},
{
"epoch": 2.4899598393574296,
"grad_norm": 5.535795211791992,
"learning_rate": 7.850307136104247e-06,
"loss": 2.2367,
"step": 155
},
{
"epoch": 2.5060240963855422,
"grad_norm": 5.856411933898926,
"learning_rate": 7.379588335217874e-06,
"loss": 3.0989,
"step": 156
},
{
"epoch": 2.522088353413655,
"grad_norm": 5.084694862365723,
"learning_rate": 6.922295930309691e-06,
"loss": 2.7171,
"step": 157
},
{
"epoch": 2.538152610441767,
"grad_norm": 6.212282657623291,
"learning_rate": 6.478573978855146e-06,
"loss": 2.8528,
"step": 158
},
{
"epoch": 2.5542168674698793,
"grad_norm": 4.591296195983887,
"learning_rate": 6.048562263329138e-06,
"loss": 2.0066,
"step": 159
},
{
"epoch": 2.570281124497992,
"grad_norm": 5.471847057342529,
"learning_rate": 5.6323962471714286e-06,
"loss": 2.6712,
"step": 160
},
{
"epoch": 2.5863453815261046,
"grad_norm": 6.344399929046631,
"learning_rate": 5.23020703211255e-06,
"loss": 2.3738,
"step": 161
},
{
"epoch": 2.602409638554217,
"grad_norm": 4.858112335205078,
"learning_rate": 4.842121316873821e-06,
"loss": 2.2417,
"step": 162
},
{
"epoch": 2.6184738955823295,
"grad_norm": 5.603279113769531,
"learning_rate": 4.468261357254339e-06,
"loss": 2.6957,
"step": 163
},
{
"epoch": 2.6345381526104417,
"grad_norm": 5.545329570770264,
"learning_rate": 4.108744927617669e-06,
"loss": 2.2776,
"step": 164
},
{
"epoch": 2.6506024096385543,
"grad_norm": 4.7836222648620605,
"learning_rate": 3.763685283790208e-06,
"loss": 2.0821,
"step": 165
},
{
"epoch": 2.6666666666666665,
"grad_norm": 4.492027282714844,
"learning_rate": 3.4331911273830784e-06,
"loss": 2.031,
"step": 166
},
{
"epoch": 2.682730923694779,
"grad_norm": 6.321730136871338,
"learning_rate": 3.117366571548608e-06,
"loss": 2.5016,
"step": 167
},
{
"epoch": 2.6987951807228914,
"grad_norm": 8.664602279663086,
"learning_rate": 2.816311108182368e-06,
"loss": 2.1106,
"step": 168
},
{
"epoch": 2.714859437751004,
"grad_norm": 5.606003284454346,
"learning_rate": 2.530119576580936e-06,
"loss": 2.185,
"step": 169
},
{
"epoch": 2.7309236947791167,
"grad_norm": 5.336299896240234,
"learning_rate": 2.258882133565404e-06,
"loss": 2.0902,
"step": 170
},
{
"epoch": 2.746987951807229,
"grad_norm": 5.943612575531006,
"learning_rate": 2.0026842250799038e-06,
"loss": 3.1001,
"step": 171
},
{
"epoch": 2.7630522088353415,
"grad_norm": 5.235433101654053,
"learning_rate": 1.7616065592742038e-06,
"loss": 2.7069,
"step": 172
},
{
"epoch": 2.7791164658634537,
"grad_norm": 4.886165142059326,
"learning_rate": 1.5357250810788314e-06,
"loss": 2.3469,
"step": 173
},
{
"epoch": 2.7951807228915664,
"grad_norm": 5.185632228851318,
"learning_rate": 1.3251109482806666e-06,
"loss": 2.7937,
"step": 174
},
{
"epoch": 2.8112449799196786,
"grad_norm": 5.091357231140137,
"learning_rate": 1.1298305091066664e-06,
"loss": 2.5354,
"step": 175
},
{
"epoch": 2.8273092369477912,
"grad_norm": 4.6926774978637695,
"learning_rate": 9.499452813226284e-07,
"loss": 2.2616,
"step": 176
},
{
"epoch": 2.8433734939759034,
"grad_norm": 4.782034397125244,
"learning_rate": 7.855119328537109e-07,
"loss": 2.0711,
"step": 177
},
{
"epoch": 2.859437751004016,
"grad_norm": 5.071019172668457,
"learning_rate": 6.365822639327723e-07,
"loss": 2.1635,
"step": 178
},
{
"epoch": 2.8755020080321287,
"grad_norm": 4.6255974769592285,
"learning_rate": 5.032031907821089e-07,
"loss": 2.1709,
"step": 179
},
{
"epoch": 2.891566265060241,
"grad_norm": 5.228384494781494,
"learning_rate": 3.854167308337708e-07,
"loss": 2.1617,
"step": 180
},
{
"epoch": 2.907630522088353,
"grad_norm": 5.127053260803223,
"learning_rate": 2.8325998949314536e-07,
"loss": 2.1208,
"step": 181
},
{
"epoch": 2.923694779116466,
"grad_norm": 4.4402055740356445,
"learning_rate": 1.9676514844987337e-07,
"loss": 1.9813,
"step": 182
},
{
"epoch": 2.9397590361445785,
"grad_norm": 6.084360122680664,
"learning_rate": 1.2595945553992573e-07,
"loss": 2.1314,
"step": 183
},
{
"epoch": 2.9558232931726907,
"grad_norm": 4.875538349151611,
"learning_rate": 7.086521616190279e-08,
"loss": 1.9403,
"step": 184
},
{
"epoch": 2.9718875502008033,
"grad_norm": 4.680423736572266,
"learning_rate": 3.149978625032191e-08,
"loss": 1.7237,
"step": 185
},
{
"epoch": 2.9879518072289155,
"grad_norm": 5.582780838012695,
"learning_rate": 7.875566808107637e-09,
"loss": 2.8136,
"step": 186
},
{
"epoch": 3.004016064257028,
"grad_norm": 5.710657596588135,
"learning_rate": 0.0,
"loss": 1.9331,
"step": 187
}
],
"logging_steps": 1,
"max_steps": 187,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4076236595304858e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}