llava_ov_mid2llava_cot / trainer_state.json
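The JSON below is the Hugging Face Trainer state recorded during fine-tuning: per-step loss, learning rate, and gradient norm under `log_history`. A minimal sketch (not part of the original file) of how such a log could be loaded and plotted, assuming it is saved locally as `trainer_state.json`; the field names (`log_history`, `step`, `loss`, `learning_rate`) are taken from the entries below:

```python
# Illustrative sketch: load a Hugging Face trainer_state.json and plot the
# logged training loss and learning-rate schedule against the optimizer step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries, if any, lack it).
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()
```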
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 770,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 10.01884195009205,
"learning_rate": 4.1666666666666667e-07,
"loss": 1.6675,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 10.180633447078613,
"learning_rate": 8.333333333333333e-07,
"loss": 1.7678,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 10.303211452992821,
"learning_rate": 1.25e-06,
"loss": 1.7284,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 9.910461388084395,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.7352,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 9.326250707724988,
"learning_rate": 2.0833333333333334e-06,
"loss": 1.7242,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 7.913664539477789,
"learning_rate": 2.5e-06,
"loss": 1.6469,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 7.078423538808147,
"learning_rate": 2.916666666666667e-06,
"loss": 1.6132,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 5.384402059977251,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.4678,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 4.3577482532491025,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3556,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 4.174369624886316,
"learning_rate": 4.166666666666667e-06,
"loss": 1.4044,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 3.8639134882240826,
"learning_rate": 4.583333333333333e-06,
"loss": 1.3128,
"step": 11
},
{
"epoch": 0.02,
"grad_norm": 3.9821742334186547,
"learning_rate": 5e-06,
"loss": 1.3084,
"step": 12
},
{
"epoch": 0.02,
"grad_norm": 3.8428940928189363,
"learning_rate": 5.416666666666667e-06,
"loss": 1.287,
"step": 13
},
{
"epoch": 0.02,
"grad_norm": 2.770103811111949,
"learning_rate": 5.833333333333334e-06,
"loss": 1.1749,
"step": 14
},
{
"epoch": 0.02,
"grad_norm": 3.2520397890717034,
"learning_rate": 6.25e-06,
"loss": 1.1662,
"step": 15
},
{
"epoch": 0.02,
"grad_norm": 2.3752009512247585,
"learning_rate": 6.666666666666667e-06,
"loss": 1.1161,
"step": 16
},
{
"epoch": 0.02,
"grad_norm": 2.2116579911789667,
"learning_rate": 7.083333333333335e-06,
"loss": 1.1,
"step": 17
},
{
"epoch": 0.02,
"grad_norm": 2.0963580548505436,
"learning_rate": 7.500000000000001e-06,
"loss": 1.0793,
"step": 18
},
{
"epoch": 0.02,
"grad_norm": 1.651378799679108,
"learning_rate": 7.916666666666667e-06,
"loss": 1.0386,
"step": 19
},
{
"epoch": 0.03,
"grad_norm": 1.5740210080816908,
"learning_rate": 8.333333333333334e-06,
"loss": 1.0234,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 1.777625330317658,
"learning_rate": 8.750000000000001e-06,
"loss": 1.0418,
"step": 21
},
{
"epoch": 0.03,
"grad_norm": 1.5402956873406757,
"learning_rate": 9.166666666666666e-06,
"loss": 0.9971,
"step": 22
},
{
"epoch": 0.03,
"grad_norm": 1.4503947750874269,
"learning_rate": 9.583333333333335e-06,
"loss": 1.0418,
"step": 23
},
{
"epoch": 0.03,
"grad_norm": 1.4809567732326383,
"learning_rate": 1e-05,
"loss": 0.9896,
"step": 24
},
{
"epoch": 0.03,
"grad_norm": 1.347063159372048,
"learning_rate": 9.999955663494783e-06,
"loss": 0.9821,
"step": 25
},
{
"epoch": 0.03,
"grad_norm": 1.337982719973813,
"learning_rate": 9.999822654765424e-06,
"loss": 0.984,
"step": 26
},
{
"epoch": 0.04,
"grad_norm": 1.2598040340641647,
"learning_rate": 9.999600976170775e-06,
"loss": 0.9564,
"step": 27
},
{
"epoch": 0.04,
"grad_norm": 1.5283239907844695,
"learning_rate": 9.999290631642222e-06,
"loss": 0.9315,
"step": 28
},
{
"epoch": 0.04,
"grad_norm": 1.3107099251584715,
"learning_rate": 9.9988916266836e-06,
"loss": 0.9524,
"step": 29
},
{
"epoch": 0.04,
"grad_norm": 1.2841120681980969,
"learning_rate": 9.998403968371104e-06,
"loss": 0.9801,
"step": 30
},
{
"epoch": 0.04,
"grad_norm": 1.2833311749613852,
"learning_rate": 9.997827665353159e-06,
"loss": 0.9564,
"step": 31
},
{
"epoch": 0.04,
"grad_norm": 1.326424018708689,
"learning_rate": 9.997162727850271e-06,
"loss": 0.9359,
"step": 32
},
{
"epoch": 0.04,
"grad_norm": 1.4100755530546323,
"learning_rate": 9.996409167654843e-06,
"loss": 0.9462,
"step": 33
},
{
"epoch": 0.04,
"grad_norm": 1.3058423401625958,
"learning_rate": 9.995566998130962e-06,
"loss": 0.9495,
"step": 34
},
{
"epoch": 0.05,
"grad_norm": 1.3957855779834178,
"learning_rate": 9.99463623421417e-06,
"loss": 0.9394,
"step": 35
},
{
"epoch": 0.05,
"grad_norm": 1.2590639321281085,
"learning_rate": 9.993616892411198e-06,
"loss": 0.9165,
"step": 36
},
{
"epoch": 0.05,
"grad_norm": 1.2489518258284393,
"learning_rate": 9.992508990799665e-06,
"loss": 0.9682,
"step": 37
},
{
"epoch": 0.05,
"grad_norm": 1.3114899180647628,
"learning_rate": 9.991312549027762e-06,
"loss": 0.9939,
"step": 38
},
{
"epoch": 0.05,
"grad_norm": 1.2902990615583814,
"learning_rate": 9.990027588313916e-06,
"loss": 0.935,
"step": 39
},
{
"epoch": 0.05,
"grad_norm": 1.334306124290386,
"learning_rate": 9.988654131446385e-06,
"loss": 0.9489,
"step": 40
},
{
"epoch": 0.05,
"grad_norm": 1.214618974357902,
"learning_rate": 9.987192202782886e-06,
"loss": 0.9122,
"step": 41
},
{
"epoch": 0.05,
"grad_norm": 1.2918632529579752,
"learning_rate": 9.98564182825014e-06,
"loss": 0.9633,
"step": 42
},
{
"epoch": 0.06,
"grad_norm": 1.3534377529573218,
"learning_rate": 9.984003035343422e-06,
"loss": 0.9306,
"step": 43
},
{
"epoch": 0.06,
"grad_norm": 1.3761114385394022,
"learning_rate": 9.982275853126073e-06,
"loss": 0.9354,
"step": 44
},
{
"epoch": 0.06,
"grad_norm": 1.7160524235309491,
"learning_rate": 9.980460312228981e-06,
"loss": 0.9524,
"step": 45
},
{
"epoch": 0.06,
"grad_norm": 1.4535654188609335,
"learning_rate": 9.978556444850043e-06,
"loss": 0.9126,
"step": 46
},
{
"epoch": 0.06,
"grad_norm": 1.329692670990971,
"learning_rate": 9.97656428475359e-06,
"loss": 0.8982,
"step": 47
},
{
"epoch": 0.06,
"grad_norm": 1.2204169092183164,
"learning_rate": 9.974483867269787e-06,
"loss": 0.8878,
"step": 48
},
{
"epoch": 0.06,
"grad_norm": 1.3914134893180399,
"learning_rate": 9.97231522929401e-06,
"loss": 0.8933,
"step": 49
},
{
"epoch": 0.06,
"grad_norm": 1.314449142464296,
"learning_rate": 9.97005840928619e-06,
"loss": 0.9163,
"step": 50
},
{
"epoch": 0.07,
"grad_norm": 1.2684644507071798,
"learning_rate": 9.967713447270134e-06,
"loss": 0.9036,
"step": 51
},
{
"epoch": 0.07,
"grad_norm": 1.1289718531785145,
"learning_rate": 9.965280384832809e-06,
"loss": 0.8844,
"step": 52
},
{
"epoch": 0.07,
"grad_norm": 1.2737395334908646,
"learning_rate": 9.962759265123611e-06,
"loss": 0.8624,
"step": 53
},
{
"epoch": 0.07,
"grad_norm": 1.3627730655756511,
"learning_rate": 9.960150132853592e-06,
"loss": 0.8977,
"step": 54
},
{
"epoch": 0.07,
"grad_norm": 1.2442014917910598,
"learning_rate": 9.957453034294677e-06,
"loss": 0.9067,
"step": 55
},
{
"epoch": 0.07,
"grad_norm": 1.4810477924919117,
"learning_rate": 9.954668017278834e-06,
"loss": 0.9119,
"step": 56
},
{
"epoch": 0.07,
"grad_norm": 1.3799563005304054,
"learning_rate": 9.951795131197233e-06,
"loss": 0.9261,
"step": 57
},
{
"epoch": 0.08,
"grad_norm": 1.414006096549083,
"learning_rate": 9.948834426999363e-06,
"loss": 0.9121,
"step": 58
},
{
"epoch": 0.08,
"grad_norm": 1.6428491735539237,
"learning_rate": 9.945785957192138e-06,
"loss": 0.9428,
"step": 59
},
{
"epoch": 0.08,
"grad_norm": 1.2653571709570268,
"learning_rate": 9.942649775838955e-06,
"loss": 0.8767,
"step": 60
},
{
"epoch": 0.08,
"grad_norm": 1.15765431948816,
"learning_rate": 9.939425938558744e-06,
"loss": 0.9034,
"step": 61
},
{
"epoch": 0.08,
"grad_norm": 1.1798179797077757,
"learning_rate": 9.936114502524974e-06,
"loss": 0.9168,
"step": 62
},
{
"epoch": 0.08,
"grad_norm": 1.17972963935367,
"learning_rate": 9.932715526464646e-06,
"loss": 0.8591,
"step": 63
},
{
"epoch": 0.08,
"grad_norm": 1.1938866521236933,
"learning_rate": 9.929229070657251e-06,
"loss": 0.9049,
"step": 64
},
{
"epoch": 0.08,
"grad_norm": 1.2396855840030339,
"learning_rate": 9.925655196933692e-06,
"loss": 0.9578,
"step": 65
},
{
"epoch": 0.09,
"grad_norm": 1.266928905031081,
"learning_rate": 9.921993968675198e-06,
"loss": 0.9097,
"step": 66
},
{
"epoch": 0.09,
"grad_norm": 1.164862535741009,
"learning_rate": 9.918245450812196e-06,
"loss": 0.9182,
"step": 67
},
{
"epoch": 0.09,
"grad_norm": 1.3760807748353976,
"learning_rate": 9.914409709823158e-06,
"loss": 0.9183,
"step": 68
},
{
"epoch": 0.09,
"grad_norm": 1.204551307828834,
"learning_rate": 9.910486813733427e-06,
"loss": 0.909,
"step": 69
},
{
"epoch": 0.09,
"grad_norm": 1.3492227169803832,
"learning_rate": 9.906476832114e-06,
"loss": 0.8767,
"step": 70
},
{
"epoch": 0.09,
"grad_norm": 1.2599025510474737,
"learning_rate": 9.902379836080308e-06,
"loss": 0.9017,
"step": 71
},
{
"epoch": 0.09,
"grad_norm": 1.2291538586506283,
"learning_rate": 9.898195898290944e-06,
"loss": 0.879,
"step": 72
},
{
"epoch": 0.09,
"grad_norm": 1.3486283669610692,
"learning_rate": 9.893925092946379e-06,
"loss": 0.904,
"step": 73
},
{
"epoch": 0.1,
"grad_norm": 1.2322670763852388,
"learning_rate": 9.889567495787651e-06,
"loss": 0.9129,
"step": 74
},
{
"epoch": 0.1,
"grad_norm": 1.6502741500539375,
"learning_rate": 9.885123184095007e-06,
"loss": 0.893,
"step": 75
},
{
"epoch": 0.1,
"grad_norm": 1.2779919969142226,
"learning_rate": 9.880592236686548e-06,
"loss": 0.9129,
"step": 76
},
{
"epoch": 0.1,
"grad_norm": 1.221028204978688,
"learning_rate": 9.875974733916822e-06,
"loss": 0.8834,
"step": 77
},
{
"epoch": 0.1,
"grad_norm": 1.185083112243534,
"learning_rate": 9.871270757675406e-06,
"loss": 0.9237,
"step": 78
},
{
"epoch": 0.1,
"grad_norm": 1.2932265161475407,
"learning_rate": 9.866480391385446e-06,
"loss": 0.8421,
"step": 79
},
{
"epoch": 0.1,
"grad_norm": 1.4143826220607103,
"learning_rate": 9.861603720002182e-06,
"loss": 0.8825,
"step": 80
},
{
"epoch": 0.11,
"grad_norm": 1.3149520714145249,
"learning_rate": 9.856640830011437e-06,
"loss": 0.8686,
"step": 81
},
{
"epoch": 0.11,
"grad_norm": 1.2757323469959327,
"learning_rate": 9.851591809428096e-06,
"loss": 0.9248,
"step": 82
},
{
"epoch": 0.11,
"grad_norm": 1.2619145491525985,
"learning_rate": 9.846456747794526e-06,
"loss": 0.9045,
"step": 83
},
{
"epoch": 0.11,
"grad_norm": 1.2192461206205478,
"learning_rate": 9.841235736179002e-06,
"loss": 0.9009,
"step": 84
},
{
"epoch": 0.11,
"grad_norm": 1.2543181200558176,
"learning_rate": 9.83592886717409e-06,
"loss": 0.8777,
"step": 85
},
{
"epoch": 0.11,
"grad_norm": 1.2385868215899136,
"learning_rate": 9.830536234894996e-06,
"loss": 0.9023,
"step": 86
},
{
"epoch": 0.11,
"grad_norm": 1.2899363487484155,
"learning_rate": 9.825057934977912e-06,
"loss": 0.9033,
"step": 87
},
{
"epoch": 0.11,
"grad_norm": 1.4391428437617175,
"learning_rate": 9.819494064578305e-06,
"loss": 0.8457,
"step": 88
},
{
"epoch": 0.12,
"grad_norm": 1.3299900370660194,
"learning_rate": 9.813844722369204e-06,
"loss": 0.8632,
"step": 89
},
{
"epoch": 0.12,
"grad_norm": 1.174244071731833,
"learning_rate": 9.808110008539441e-06,
"loss": 0.8913,
"step": 90
},
{
"epoch": 0.12,
"grad_norm": 1.266275549208671,
"learning_rate": 9.80229002479189e-06,
"loss": 0.8955,
"step": 91
},
{
"epoch": 0.12,
"grad_norm": 1.296325653451658,
"learning_rate": 9.796384874341643e-06,
"loss": 0.8731,
"step": 92
},
{
"epoch": 0.12,
"grad_norm": 1.264882203381404,
"learning_rate": 9.790394661914194e-06,
"loss": 0.8788,
"step": 93
},
{
"epoch": 0.12,
"grad_norm": 1.0978252424854067,
"learning_rate": 9.784319493743576e-06,
"loss": 0.8415,
"step": 94
},
{
"epoch": 0.12,
"grad_norm": 1.2829083314019198,
"learning_rate": 9.778159477570483e-06,
"loss": 0.9018,
"step": 95
},
{
"epoch": 0.12,
"grad_norm": 1.2586291926807105,
"learning_rate": 9.771914722640345e-06,
"loss": 0.9072,
"step": 96
},
{
"epoch": 0.13,
"grad_norm": 1.1796963405145942,
"learning_rate": 9.76558533970141e-06,
"loss": 0.8726,
"step": 97
},
{
"epoch": 0.13,
"grad_norm": 1.306469482808402,
"learning_rate": 9.759171441002766e-06,
"loss": 0.9025,
"step": 98
},
{
"epoch": 0.13,
"grad_norm": 1.2313395789612747,
"learning_rate": 9.75267314029235e-06,
"loss": 0.8555,
"step": 99
},
{
"epoch": 0.13,
"grad_norm": 1.3784125721886025,
"learning_rate": 9.746090552814944e-06,
"loss": 0.8959,
"step": 100
},
{
"epoch": 0.13,
"grad_norm": 1.3345787723484401,
"learning_rate": 9.739423795310115e-06,
"loss": 0.8818,
"step": 101
},
{
"epoch": 0.13,
"grad_norm": 1.2453323910999627,
"learning_rate": 9.732672986010157e-06,
"loss": 0.9028,
"step": 102
},
{
"epoch": 0.13,
"grad_norm": 1.2850929631258812,
"learning_rate": 9.725838244637982e-06,
"loss": 0.8962,
"step": 103
},
{
"epoch": 0.14,
"grad_norm": 1.257323570554658,
"learning_rate": 9.718919692405014e-06,
"loss": 0.8679,
"step": 104
},
{
"epoch": 0.14,
"grad_norm": 1.1908601963221823,
"learning_rate": 9.711917452009021e-06,
"loss": 0.9098,
"step": 105
},
{
"epoch": 0.14,
"grad_norm": 1.4435199482611327,
"learning_rate": 9.704831647631951e-06,
"loss": 0.8695,
"step": 106
},
{
"epoch": 0.14,
"grad_norm": 1.2562296479791273,
"learning_rate": 9.697662404937724e-06,
"loss": 0.9202,
"step": 107
},
{
"epoch": 0.14,
"grad_norm": 1.2710317483643663,
"learning_rate": 9.690409851070009e-06,
"loss": 0.9095,
"step": 108
},
{
"epoch": 0.14,
"grad_norm": 1.0364322803097805,
"learning_rate": 9.68307411464996e-06,
"loss": 0.8897,
"step": 109
},
{
"epoch": 0.14,
"grad_norm": 1.7483739299613759,
"learning_rate": 9.675655325773943e-06,
"loss": 0.872,
"step": 110
},
{
"epoch": 0.14,
"grad_norm": 1.2668959871346073,
"learning_rate": 9.66815361601123e-06,
"loss": 0.905,
"step": 111
},
{
"epoch": 0.15,
"grad_norm": 1.1308633342373222,
"learning_rate": 9.660569118401656e-06,
"loss": 0.9043,
"step": 112
},
{
"epoch": 0.15,
"grad_norm": 1.272143699055615,
"learning_rate": 9.65290196745327e-06,
"loss": 0.8669,
"step": 113
},
{
"epoch": 0.15,
"grad_norm": 1.5486854453771295,
"learning_rate": 9.64515229913994e-06,
"loss": 0.8908,
"step": 114
},
{
"epoch": 0.15,
"grad_norm": 1.3773680680248115,
"learning_rate": 9.637320250898953e-06,
"loss": 0.8752,
"step": 115
},
{
"epoch": 0.15,
"grad_norm": 1.4237537841409098,
"learning_rate": 9.629405961628568e-06,
"loss": 0.9257,
"step": 116
},
{
"epoch": 0.15,
"grad_norm": 1.162885378975861,
"learning_rate": 9.621409571685555e-06,
"loss": 0.8581,
"step": 117
},
{
"epoch": 0.15,
"grad_norm": 1.4262660718320415,
"learning_rate": 9.61333122288271e-06,
"loss": 0.8929,
"step": 118
},
{
"epoch": 0.15,
"grad_norm": 1.3321707519752775,
"learning_rate": 9.605171058486329e-06,
"loss": 0.8715,
"step": 119
},
{
"epoch": 0.16,
"grad_norm": 1.220589676110642,
"learning_rate": 9.596929223213685e-06,
"loss": 0.9275,
"step": 120
},
{
"epoch": 0.16,
"grad_norm": 1.8226230069534708,
"learning_rate": 9.588605863230447e-06,
"loss": 0.8913,
"step": 121
},
{
"epoch": 0.16,
"grad_norm": 1.1686577144464307,
"learning_rate": 9.58020112614809e-06,
"loss": 0.8661,
"step": 122
},
{
"epoch": 0.16,
"grad_norm": 1.4726816440981407,
"learning_rate": 9.571715161021285e-06,
"loss": 0.8741,
"step": 123
},
{
"epoch": 0.16,
"grad_norm": 1.1893010742558143,
"learning_rate": 9.563148118345242e-06,
"loss": 0.8963,
"step": 124
},
{
"epoch": 0.16,
"grad_norm": 1.1341800604331242,
"learning_rate": 9.55450015005306e-06,
"loss": 0.8872,
"step": 125
},
{
"epoch": 0.16,
"grad_norm": 1.0555002605396062,
"learning_rate": 9.545771409513012e-06,
"loss": 0.8417,
"step": 126
},
{
"epoch": 0.16,
"grad_norm": 1.2311976830107538,
"learning_rate": 9.536962051525837e-06,
"loss": 0.8598,
"step": 127
},
{
"epoch": 0.17,
"grad_norm": 1.226898171357091,
"learning_rate": 9.528072232321996e-06,
"loss": 0.8893,
"step": 128
},
{
"epoch": 0.17,
"grad_norm": 1.2603976581878822,
"learning_rate": 9.519102109558893e-06,
"loss": 0.8824,
"step": 129
},
{
"epoch": 0.17,
"grad_norm": 1.188060037369501,
"learning_rate": 9.510051842318089e-06,
"loss": 0.8809,
"step": 130
},
{
"epoch": 0.17,
"grad_norm": 1.2452630639374425,
"learning_rate": 9.50092159110247e-06,
"loss": 0.8778,
"step": 131
},
{
"epoch": 0.17,
"grad_norm": 1.344515264849714,
"learning_rate": 9.49171151783341e-06,
"loss": 0.8657,
"step": 132
},
{
"epoch": 0.17,
"grad_norm": 1.2874467819022641,
"learning_rate": 9.48242178584789e-06,
"loss": 0.8662,
"step": 133
},
{
"epoch": 0.17,
"grad_norm": 1.1720764172369453,
"learning_rate": 9.473052559895615e-06,
"loss": 0.8398,
"step": 134
},
{
"epoch": 0.18,
"grad_norm": 1.2724997102438147,
"learning_rate": 9.463604006136076e-06,
"loss": 0.8691,
"step": 135
},
{
"epoch": 0.18,
"grad_norm": 1.2157718055799869,
"learning_rate": 9.454076292135615e-06,
"loss": 0.8966,
"step": 136
},
{
"epoch": 0.18,
"grad_norm": 1.2041736734636745,
"learning_rate": 9.44446958686445e-06,
"loss": 0.8315,
"step": 137
},
{
"epoch": 0.18,
"grad_norm": 1.2873000337357716,
"learning_rate": 9.434784060693671e-06,
"loss": 0.8387,
"step": 138
},
{
"epoch": 0.18,
"grad_norm": 1.1238231039137265,
"learning_rate": 9.425019885392238e-06,
"loss": 0.9066,
"step": 139
},
{
"epoch": 0.18,
"grad_norm": 1.3605721992080366,
"learning_rate": 9.41517723412391e-06,
"loss": 0.9199,
"step": 140
},
{
"epoch": 0.18,
"grad_norm": 1.2105376013486355,
"learning_rate": 9.405256281444192e-06,
"loss": 0.8621,
"step": 141
},
{
"epoch": 0.18,
"grad_norm": 1.1508108761585434,
"learning_rate": 9.395257203297232e-06,
"loss": 0.8725,
"step": 142
},
{
"epoch": 0.19,
"grad_norm": 1.3578283923073486,
"learning_rate": 9.385180177012703e-06,
"loss": 0.9158,
"step": 143
},
{
"epoch": 0.19,
"grad_norm": 1.1901249895249055,
"learning_rate": 9.375025381302656e-06,
"loss": 0.8794,
"step": 144
},
{
"epoch": 0.19,
"grad_norm": 1.2932679790426476,
"learning_rate": 9.36479299625835e-06,
"loss": 0.8719,
"step": 145
},
{
"epoch": 0.19,
"grad_norm": 1.1658450769719235,
"learning_rate": 9.354483203347066e-06,
"loss": 0.9041,
"step": 146
},
{
"epoch": 0.19,
"grad_norm": 1.1074333464226818,
"learning_rate": 9.344096185408875e-06,
"loss": 0.9061,
"step": 147
},
{
"epoch": 0.19,
"grad_norm": 1.3904132378009597,
"learning_rate": 9.333632126653412e-06,
"loss": 0.8168,
"step": 148
},
{
"epoch": 0.19,
"grad_norm": 1.3016771798542626,
"learning_rate": 9.323091212656589e-06,
"loss": 0.9129,
"step": 149
},
{
"epoch": 0.19,
"grad_norm": 1.3787628224004695,
"learning_rate": 9.312473630357326e-06,
"loss": 0.8934,
"step": 150
},
{
"epoch": 0.2,
"grad_norm": 1.225566612536322,
"learning_rate": 9.301779568054219e-06,
"loss": 0.8483,
"step": 151
},
{
"epoch": 0.2,
"grad_norm": 1.2125198119803513,
"learning_rate": 9.291009215402204e-06,
"loss": 0.8858,
"step": 152
},
{
"epoch": 0.2,
"grad_norm": 1.1519000629696743,
"learning_rate": 9.280162763409207e-06,
"loss": 0.8435,
"step": 153
},
{
"epoch": 0.2,
"grad_norm": 1.1552307028646323,
"learning_rate": 9.269240404432732e-06,
"loss": 0.852,
"step": 154
},
{
"epoch": 0.2,
"grad_norm": 1.2118496637645362,
"learning_rate": 9.258242332176473e-06,
"loss": 0.8951,
"step": 155
},
{
"epoch": 0.2,
"grad_norm": 1.3963015983530243,
"learning_rate": 9.247168741686863e-06,
"loss": 0.8546,
"step": 156
},
{
"epoch": 0.2,
"grad_norm": 1.309312245402273,
"learning_rate": 9.236019829349623e-06,
"loss": 0.8902,
"step": 157
},
{
"epoch": 0.21,
"grad_norm": 1.3032791306057538,
"learning_rate": 9.224795792886276e-06,
"loss": 0.8645,
"step": 158
},
{
"epoch": 0.21,
"grad_norm": 1.2500412045624514,
"learning_rate": 9.213496831350647e-06,
"loss": 0.8514,
"step": 159
},
{
"epoch": 0.21,
"grad_norm": 1.502113914941289,
"learning_rate": 9.202123145125318e-06,
"loss": 0.8812,
"step": 160
},
{
"epoch": 0.21,
"grad_norm": 1.179178003711897,
"learning_rate": 9.190674935918092e-06,
"loss": 0.8585,
"step": 161
},
{
"epoch": 0.21,
"grad_norm": 1.273869159733753,
"learning_rate": 9.1791524067584e-06,
"loss": 0.8649,
"step": 162
},
{
"epoch": 0.21,
"grad_norm": 1.2314877134912634,
"learning_rate": 9.167555761993716e-06,
"loss": 0.8649,
"step": 163
},
{
"epoch": 0.21,
"grad_norm": 1.3622037711073158,
"learning_rate": 9.155885207285919e-06,
"loss": 0.8668,
"step": 164
},
{
"epoch": 0.21,
"grad_norm": 1.2546049124533816,
"learning_rate": 9.14414094960765e-06,
"loss": 0.8182,
"step": 165
},
{
"epoch": 0.22,
"grad_norm": 2.113434892192804,
"learning_rate": 9.132323197238649e-06,
"loss": 0.859,
"step": 166
},
{
"epoch": 0.22,
"grad_norm": 1.7423410683870517,
"learning_rate": 9.120432159762051e-06,
"loss": 0.9227,
"step": 167
},
{
"epoch": 0.22,
"grad_norm": 1.1368796798921579,
"learning_rate": 9.108468048060675e-06,
"loss": 0.8546,
"step": 168
},
{
"epoch": 0.22,
"grad_norm": 1.224122669035051,
"learning_rate": 9.096431074313278e-06,
"loss": 0.8319,
"step": 169
},
{
"epoch": 0.22,
"grad_norm": 1.3637642569977657,
"learning_rate": 9.084321451990804e-06,
"loss": 0.884,
"step": 170
},
{
"epoch": 0.22,
"grad_norm": 1.3239374587315518,
"learning_rate": 9.072139395852582e-06,
"loss": 0.8418,
"step": 171
},
{
"epoch": 0.22,
"grad_norm": 1.18034938438751,
"learning_rate": 9.059885121942533e-06,
"loss": 0.8471,
"step": 172
},
{
"epoch": 0.22,
"grad_norm": 1.2432620846129294,
"learning_rate": 9.04755884758533e-06,
"loss": 0.895,
"step": 173
},
{
"epoch": 0.23,
"grad_norm": 1.2124450178376394,
"learning_rate": 9.03516079138254e-06,
"loss": 0.8576,
"step": 174
},
{
"epoch": 0.23,
"grad_norm": 1.2905752914519677,
"learning_rate": 9.022691173208759e-06,
"loss": 0.836,
"step": 175
},
{
"epoch": 0.23,
"grad_norm": 1.1768633931424846,
"learning_rate": 9.010150214207704e-06,
"loss": 0.8324,
"step": 176
},
{
"epoch": 0.23,
"grad_norm": 1.3781750954365992,
"learning_rate": 8.997538136788291e-06,
"loss": 0.8426,
"step": 177
},
{
"epoch": 0.23,
"grad_norm": 1.230640779663414,
"learning_rate": 8.984855164620694e-06,
"loss": 0.8679,
"step": 178
},
{
"epoch": 0.23,
"grad_norm": 1.2255727503119238,
"learning_rate": 8.97210152263238e-06,
"loss": 0.85,
"step": 179
},
{
"epoch": 0.23,
"grad_norm": 1.3100217977587998,
"learning_rate": 8.959277437004114e-06,
"loss": 0.89,
"step": 180
},
{
"epoch": 0.24,
"grad_norm": 1.3085937284787819,
"learning_rate": 8.94638313516595e-06,
"loss": 0.8748,
"step": 181
},
{
"epoch": 0.24,
"grad_norm": 1.10287922354063,
"learning_rate": 8.933418845793202e-06,
"loss": 0.8553,
"step": 182
},
{
"epoch": 0.24,
"grad_norm": 1.27133597219518,
"learning_rate": 8.920384798802384e-06,
"loss": 0.8757,
"step": 183
},
{
"epoch": 0.24,
"grad_norm": 1.6520106528813114,
"learning_rate": 8.907281225347134e-06,
"loss": 0.8242,
"step": 184
},
{
"epoch": 0.24,
"grad_norm": 1.2525684796940382,
"learning_rate": 8.894108357814107e-06,
"loss": 0.8834,
"step": 185
},
{
"epoch": 0.24,
"grad_norm": 1.2578480714177394,
"learning_rate": 8.880866429818873e-06,
"loss": 0.8633,
"step": 186
},
{
"epoch": 0.24,
"grad_norm": 1.723729927706631,
"learning_rate": 8.867555676201753e-06,
"loss": 0.8565,
"step": 187
},
{
"epoch": 0.24,
"grad_norm": 1.5654752498240772,
"learning_rate": 8.85417633302367e-06,
"loss": 0.875,
"step": 188
},
{
"epoch": 0.25,
"grad_norm": 1.1660030377875041,
"learning_rate": 8.840728637561947e-06,
"loss": 0.8172,
"step": 189
},
{
"epoch": 0.25,
"grad_norm": 1.304332990745047,
"learning_rate": 8.827212828306111e-06,
"loss": 0.8593,
"step": 190
},
{
"epoch": 0.25,
"grad_norm": 1.5873586326273417,
"learning_rate": 8.813629144953666e-06,
"loss": 0.8656,
"step": 191
},
{
"epoch": 0.25,
"grad_norm": 1.173378431318532,
"learning_rate": 8.799977828405826e-06,
"loss": 0.8444,
"step": 192
},
{
"epoch": 0.25,
"grad_norm": 1.3871104053810464,
"learning_rate": 8.786259120763263e-06,
"loss": 0.8551,
"step": 193
},
{
"epoch": 0.25,
"grad_norm": 1.2616067480138016,
"learning_rate": 8.772473265321794e-06,
"loss": 0.8798,
"step": 194
},
{
"epoch": 0.25,
"grad_norm": 1.180462258223769,
"learning_rate": 8.758620506568084e-06,
"loss": 0.8514,
"step": 195
},
{
"epoch": 0.25,
"grad_norm": 1.1530272269450472,
"learning_rate": 8.74470109017529e-06,
"loss": 0.8726,
"step": 196
},
{
"epoch": 0.26,
"grad_norm": 1.1563346325065118,
"learning_rate": 8.730715262998733e-06,
"loss": 0.8617,
"step": 197
},
{
"epoch": 0.26,
"grad_norm": 1.2336725498438685,
"learning_rate": 8.716663273071484e-06,
"loss": 0.814,
"step": 198
},
{
"epoch": 0.26,
"grad_norm": 1.2605889115541364,
"learning_rate": 8.702545369599997e-06,
"loss": 0.8588,
"step": 199
},
{
"epoch": 0.26,
"grad_norm": 1.19906305613824,
"learning_rate": 8.688361802959673e-06,
"loss": 0.8849,
"step": 200
},
{
"epoch": 0.26,
"grad_norm": 1.1538706074366336,
"learning_rate": 8.674112824690419e-06,
"loss": 0.8267,
"step": 201
},
{
"epoch": 0.26,
"grad_norm": 1.169788765403587,
"learning_rate": 8.659798687492199e-06,
"loss": 0.8593,
"step": 202
},
{
"epoch": 0.26,
"grad_norm": 1.3244594230863784,
"learning_rate": 8.645419645220538e-06,
"loss": 0.8348,
"step": 203
},
{
"epoch": 0.26,
"grad_norm": 1.1732992626263374,
"learning_rate": 8.630975952882027e-06,
"loss": 0.8246,
"step": 204
},
{
"epoch": 0.27,
"grad_norm": 1.297363781740773,
"learning_rate": 8.616467866629808e-06,
"loss": 0.835,
"step": 205
},
{
"epoch": 0.27,
"grad_norm": 1.7609709518413195,
"learning_rate": 8.601895643759014e-06,
"loss": 0.8755,
"step": 206
},
{
"epoch": 0.27,
"grad_norm": 1.5862887040904983,
"learning_rate": 8.58725954270222e-06,
"loss": 0.8726,
"step": 207
},
{
"epoch": 0.27,
"grad_norm": 1.1935398277338376,
"learning_rate": 8.572559823024853e-06,
"loss": 0.866,
"step": 208
},
{
"epoch": 0.27,
"grad_norm": 1.1834450572696433,
"learning_rate": 8.557796745420592e-06,
"loss": 0.8614,
"step": 209
},
{
"epoch": 0.27,
"grad_norm": 1.1878168847959716,
"learning_rate": 8.542970571706748e-06,
"loss": 0.8799,
"step": 210
},
{
"epoch": 0.27,
"grad_norm": 1.354522490073717,
"learning_rate": 8.528081564819608e-06,
"loss": 0.8531,
"step": 211
},
{
"epoch": 0.28,
"grad_norm": 1.317765104330031,
"learning_rate": 8.513129988809787e-06,
"loss": 0.8459,
"step": 212
},
{
"epoch": 0.28,
"grad_norm": 1.3118174417979898,
"learning_rate": 8.498116108837533e-06,
"loss": 0.8922,
"step": 213
},
{
"epoch": 0.28,
"grad_norm": 1.1131070658330877,
"learning_rate": 8.483040191168037e-06,
"loss": 0.8812,
"step": 214
},
{
"epoch": 0.28,
"grad_norm": 1.2336386228496021,
"learning_rate": 8.467902503166698e-06,
"loss": 0.8282,
"step": 215
},
{
"epoch": 0.28,
"grad_norm": 1.4586312090220346,
"learning_rate": 8.45270331329439e-06,
"loss": 0.8635,
"step": 216
},
{
"epoch": 0.28,
"grad_norm": 1.3656966584287829,
"learning_rate": 8.437442891102696e-06,
"loss": 0.8877,
"step": 217
},
{
"epoch": 0.28,
"grad_norm": 1.3937380322780935,
"learning_rate": 8.42212150722913e-06,
"loss": 0.8282,
"step": 218
},
{
"epoch": 0.28,
"grad_norm": 1.22224963815494,
"learning_rate": 8.406739433392343e-06,
"loss": 0.8424,
"step": 219
},
{
"epoch": 0.29,
"grad_norm": 1.6065974227695905,
"learning_rate": 8.391296942387293e-06,
"loss": 0.8572,
"step": 220
},
{
"epoch": 0.29,
"grad_norm": 1.3023080745688278,
"learning_rate": 8.37579430808041e-06,
"loss": 0.8362,
"step": 221
},
{
"epoch": 0.29,
"grad_norm": 1.2324977420758008,
"learning_rate": 8.360231805404745e-06,
"loss": 0.8589,
"step": 222
},
{
"epoch": 0.29,
"grad_norm": 1.4254937288107534,
"learning_rate": 8.344609710355092e-06,
"loss": 0.8644,
"step": 223
},
{
"epoch": 0.29,
"grad_norm": 1.2762189341727412,
"learning_rate": 8.32892829998309e-06,
"loss": 0.8759,
"step": 224
},
{
"epoch": 0.29,
"grad_norm": 1.212162649007418,
"learning_rate": 8.313187852392314e-06,
"loss": 0.8318,
"step": 225
},
{
"epoch": 0.29,
"grad_norm": 1.2879599902194216,
"learning_rate": 8.297388646733335e-06,
"loss": 0.8668,
"step": 226
},
{
"epoch": 0.29,
"grad_norm": 1.2284347722023181,
"learning_rate": 8.281530963198782e-06,
"loss": 0.8455,
"step": 227
},
{
"epoch": 0.3,
"grad_norm": 1.3556686197816876,
"learning_rate": 8.26561508301836e-06,
"loss": 0.8212,
"step": 228
},
{
"epoch": 0.3,
"grad_norm": 1.391929292166319,
"learning_rate": 8.249641288453872e-06,
"loss": 0.8788,
"step": 229
},
{
"epoch": 0.3,
"grad_norm": 3.4586892421492013,
"learning_rate": 8.23360986279421e-06,
"loss": 0.8261,
"step": 230
},
{
"epoch": 0.3,
"grad_norm": 1.2170746219562474,
"learning_rate": 8.217521090350326e-06,
"loss": 0.8421,
"step": 231
},
{
"epoch": 0.3,
"grad_norm": 1.218271614680763,
"learning_rate": 8.201375256450198e-06,
"loss": 0.883,
"step": 232
},
{
"epoch": 0.3,
"grad_norm": 1.4799501574669076,
"learning_rate": 8.185172647433766e-06,
"loss": 0.87,
"step": 233
},
{
"epoch": 0.3,
"grad_norm": 1.400079191714797,
"learning_rate": 8.168913550647855e-06,
"loss": 0.8373,
"step": 234
},
{
"epoch": 0.31,
"grad_norm": 1.1669396201944626,
"learning_rate": 8.152598254441076e-06,
"loss": 0.847,
"step": 235
},
{
"epoch": 0.31,
"grad_norm": 1.195621110624864,
"learning_rate": 8.136227048158716e-06,
"loss": 0.8601,
"step": 236
},
{
"epoch": 0.31,
"grad_norm": 1.2953519903155755,
"learning_rate": 8.1198002221376e-06,
"loss": 0.8441,
"step": 237
},
{
"epoch": 0.31,
"grad_norm": 1.6326150827119306,
"learning_rate": 8.103318067700957e-06,
"loss": 0.8448,
"step": 238
},
{
"epoch": 0.31,
"grad_norm": 1.0961196527359565,
"learning_rate": 8.086780877153233e-06,
"loss": 0.8268,
"step": 239
},
{
"epoch": 0.31,
"grad_norm": 1.4247422383868384,
"learning_rate": 8.070188943774921e-06,
"loss": 0.8115,
"step": 240
},
{
"epoch": 0.31,
"grad_norm": 1.2240799976807206,
"learning_rate": 8.053542561817364e-06,
"loss": 0.8047,
"step": 241
},
{
"epoch": 0.31,
"grad_norm": 1.1148459295674251,
"learning_rate": 8.036842026497515e-06,
"loss": 0.7947,
"step": 242
},
{
"epoch": 0.32,
"grad_norm": 1.3046439821028708,
"learning_rate": 8.020087633992729e-06,
"loss": 0.8596,
"step": 243
},
{
"epoch": 0.32,
"grad_norm": 1.3923522847308203,
"learning_rate": 8.003279681435483e-06,
"loss": 0.8815,
"step": 244
},
{
"epoch": 0.32,
"grad_norm": 1.279395243287966,
"learning_rate": 7.986418466908133e-06,
"loss": 0.8218,
"step": 245
},
{
"epoch": 0.32,
"grad_norm": 1.305938895131756,
"learning_rate": 7.969504289437607e-06,
"loss": 0.8653,
"step": 246
},
{
"epoch": 0.32,
"grad_norm": 1.2194222921731876,
"learning_rate": 7.952537448990114e-06,
"loss": 0.8413,
"step": 247
},
{
"epoch": 0.32,
"grad_norm": 1.3454506997775046,
"learning_rate": 7.935518246465815e-06,
"loss": 0.8556,
"step": 248
},
{
"epoch": 0.32,
"grad_norm": 1.2952291235408084,
"learning_rate": 7.918446983693498e-06,
"loss": 0.869,
"step": 249
},
{
"epoch": 0.32,
"grad_norm": 1.2459907150930951,
"learning_rate": 7.901323963425213e-06,
"loss": 0.8427,
"step": 250
},
{
"epoch": 0.33,
"grad_norm": 1.2147661517452935,
"learning_rate": 7.884149489330912e-06,
"loss": 0.832,
"step": 251
},
{
"epoch": 0.33,
"grad_norm": 1.1668831211471047,
"learning_rate": 7.866923865993057e-06,
"loss": 0.8734,
"step": 252
},
{
"epoch": 0.33,
"grad_norm": 1.4995150707097251,
"learning_rate": 7.849647398901227e-06,
"loss": 0.8809,
"step": 253
},
{
"epoch": 0.33,
"grad_norm": 1.1424611915270306,
"learning_rate": 7.832320394446688e-06,
"loss": 0.8384,
"step": 254
},
{
"epoch": 0.33,
"grad_norm": 1.2621218740072504,
"learning_rate": 7.814943159916974e-06,
"loss": 0.8465,
"step": 255
},
{
"epoch": 0.33,
"grad_norm": 1.273110955180023,
"learning_rate": 7.797516003490421e-06,
"loss": 0.8253,
"step": 256
},
{
"epoch": 0.33,
"grad_norm": 1.3313009954548312,
"learning_rate": 7.780039234230714e-06,
"loss": 0.8794,
"step": 257
},
{
"epoch": 0.34,
"grad_norm": 1.5759780161169947,
"learning_rate": 7.762513162081402e-06,
"loss": 0.8649,
"step": 258
},
{
"epoch": 0.34,
"grad_norm": 1.812796559030521,
"learning_rate": 7.7449380978604e-06,
"loss": 0.8065,
"step": 259
},
{
"epoch": 0.34,
"grad_norm": 1.3596453509876505,
"learning_rate": 7.727314353254482e-06,
"loss": 0.8655,
"step": 260
},
{
"epoch": 0.34,
"grad_norm": 1.1898419559244204,
"learning_rate": 7.709642240813742e-06,
"loss": 0.8415,
"step": 261
},
{
"epoch": 0.34,
"grad_norm": 1.3500754898160217,
"learning_rate": 7.691922073946063e-06,
"loss": 0.853,
"step": 262
},
{
"epoch": 0.34,
"grad_norm": 1.1504817003231094,
"learning_rate": 7.674154166911553e-06,
"loss": 0.8793,
"step": 263
},
{
"epoch": 0.34,
"grad_norm": 1.2590688573293491,
"learning_rate": 7.656338834816976e-06,
"loss": 0.8715,
"step": 264
},
{
"epoch": 0.34,
"grad_norm": 1.2651292489923993,
"learning_rate": 7.638476393610155e-06,
"loss": 0.8388,
"step": 265
},
{
"epoch": 0.35,
"grad_norm": 1.3571956680408448,
"learning_rate": 7.620567160074377e-06,
"loss": 0.8849,
"step": 266
},
{
"epoch": 0.35,
"grad_norm": 1.552153053502718,
"learning_rate": 7.602611451822775e-06,
"loss": 0.8586,
"step": 267
},
{
"epoch": 0.35,
"grad_norm": 1.5020758017980491,
"learning_rate": 7.584609587292686e-06,
"loss": 0.8817,
"step": 268
},
{
"epoch": 0.35,
"grad_norm": 1.329746891781287,
"learning_rate": 7.566561885740019e-06,
"loss": 0.8723,
"step": 269
},
{
"epoch": 0.35,
"grad_norm": 1.1578631093841143,
"learning_rate": 7.548468667233576e-06,
"loss": 0.8455,
"step": 270
},
{
"epoch": 0.35,
"grad_norm": 1.1032924408612441,
"learning_rate": 7.5303302526493894e-06,
"loss": 0.8342,
"step": 271
},
{
"epoch": 0.35,
"grad_norm": 1.564083216357106,
"learning_rate": 7.512146963665023e-06,
"loss": 0.8263,
"step": 272
},
{
"epoch": 0.35,
"grad_norm": 1.2052297883957035,
"learning_rate": 7.493919122753873e-06,
"loss": 0.8385,
"step": 273
},
{
"epoch": 0.36,
"grad_norm": 1.1808734641861955,
"learning_rate": 7.475647053179444e-06,
"loss": 0.8514,
"step": 274
},
{
"epoch": 0.36,
"grad_norm": 1.401160272277566,
"learning_rate": 7.457331078989619e-06,
"loss": 0.8467,
"step": 275
},
{
"epoch": 0.36,
"grad_norm": 1.1227772209522688,
"learning_rate": 7.438971525010914e-06,
"loss": 0.8692,
"step": 276
},
{
"epoch": 0.36,
"grad_norm": 1.370076448447391,
"learning_rate": 7.420568716842711e-06,
"loss": 0.8432,
"step": 277
},
{
"epoch": 0.36,
"grad_norm": 1.244630228546123,
"learning_rate": 7.402122980851491e-06,
"loss": 0.8583,
"step": 278
},
{
"epoch": 0.36,
"grad_norm": 1.27669216892998,
"learning_rate": 7.383634644165041e-06,
"loss": 0.8712,
"step": 279
},
{
"epoch": 0.36,
"grad_norm": 1.553980990671672,
"learning_rate": 7.365104034666657e-06,
"loss": 0.8197,
"step": 280
},
{
"epoch": 0.36,
"grad_norm": 1.187131952435461,
"learning_rate": 7.346531480989325e-06,
"loss": 0.8434,
"step": 281
},
{
"epoch": 0.37,
"grad_norm": 1.8707182730243352,
"learning_rate": 7.327917312509893e-06,
"loss": 0.847,
"step": 282
},
{
"epoch": 0.37,
"grad_norm": 1.4802799795263466,
"learning_rate": 7.309261859343233e-06,
"loss": 0.8184,
"step": 283
},
{
"epoch": 0.37,
"grad_norm": 1.2194638672966402,
"learning_rate": 7.290565452336382e-06,
"loss": 0.8264,
"step": 284
},
{
"epoch": 0.37,
"grad_norm": 1.228307425661477,
"learning_rate": 7.27182842306268e-06,
"loss": 0.8445,
"step": 285
},
{
"epoch": 0.37,
"grad_norm": 1.330130601732375,
"learning_rate": 7.253051103815887e-06,
"loss": 0.8487,
"step": 286
},
{
"epoch": 0.37,
"grad_norm": 1.2351793793938697,
"learning_rate": 7.234233827604285e-06,
"loss": 0.8315,
"step": 287
},
{
"epoch": 0.37,
"grad_norm": 1.4996469832250112,
"learning_rate": 7.215376928144783e-06,
"loss": 0.8522,
"step": 288
},
{
"epoch": 0.38,
"grad_norm": 1.2552008111918165,
"learning_rate": 7.196480739856988e-06,
"loss": 0.8163,
"step": 289
},
{
"epoch": 0.38,
"grad_norm": 1.502543788757623,
"learning_rate": 7.177545597857279e-06,
"loss": 0.8441,
"step": 290
},
{
"epoch": 0.38,
"grad_norm": 1.494129590625939,
"learning_rate": 7.158571837952867e-06,
"loss": 0.8256,
"step": 291
},
{
"epoch": 0.38,
"grad_norm": 1.1732463644816806,
"learning_rate": 7.139559796635833e-06,
"loss": 0.8545,
"step": 292
},
{
"epoch": 0.38,
"grad_norm": 1.2940044086310112,
"learning_rate": 7.120509811077164e-06,
"loss": 0.8436,
"step": 293
},
{
"epoch": 0.38,
"grad_norm": 2.587490092215707,
"learning_rate": 7.101422219120774e-06,
"loss": 0.8492,
"step": 294
},
{
"epoch": 0.38,
"grad_norm": 1.2525502879285224,
"learning_rate": 7.082297359277513e-06,
"loss": 0.8355,
"step": 295
},
{
"epoch": 0.38,
"grad_norm": 1.2154301868677129,
"learning_rate": 7.0631355707191575e-06,
"loss": 0.864,
"step": 296
},
{
"epoch": 0.39,
"grad_norm": 1.509297691003082,
"learning_rate": 7.043937193272405e-06,
"loss": 0.8535,
"step": 297
},
{
"epoch": 0.39,
"grad_norm": 1.560282341886913,
"learning_rate": 7.024702567412839e-06,
"loss": 0.8415,
"step": 298
},
{
"epoch": 0.39,
"grad_norm": 1.215819914432597,
"learning_rate": 7.0054320342588954e-06,
"loss": 0.8307,
"step": 299
},
{
"epoch": 0.39,
"grad_norm": 1.4363490411881552,
"learning_rate": 6.986125935565813e-06,
"loss": 0.8635,
"step": 300
},
{
"epoch": 0.39,
"grad_norm": 1.252680452931007,
"learning_rate": 6.966784613719568e-06,
"loss": 0.8187,
"step": 301
},
{
"epoch": 0.39,
"grad_norm": 1.147759930914122,
"learning_rate": 6.94740841173081e-06,
"loss": 0.855,
"step": 302
},
{
"epoch": 0.39,
"grad_norm": 1.2481611069144203,
"learning_rate": 6.927997673228766e-06,
"loss": 0.88,
"step": 303
},
{
"epoch": 0.39,
"grad_norm": 1.1605598358791287,
"learning_rate": 6.908552742455167e-06,
"loss": 0.8238,
"step": 304
},
{
"epoch": 0.4,
"grad_norm": 1.150740940595073,
"learning_rate": 6.889073964258116e-06,
"loss": 0.8416,
"step": 305
},
{
"epoch": 0.4,
"grad_norm": 1.1875419366249447,
"learning_rate": 6.869561684085998e-06,
"loss": 0.861,
"step": 306
},
{
"epoch": 0.4,
"grad_norm": 1.117161313240673,
"learning_rate": 6.850016247981335e-06,
"loss": 0.8187,
"step": 307
},
{
"epoch": 0.4,
"grad_norm": 1.178563333637316,
"learning_rate": 6.83043800257466e-06,
"loss": 0.8637,
"step": 308
},
{
"epoch": 0.4,
"grad_norm": 1.4846187498958823,
"learning_rate": 6.810827295078365e-06,
"loss": 0.8084,
"step": 309
},
{
"epoch": 0.4,
"grad_norm": 1.2242229357089285,
"learning_rate": 6.791184473280542e-06,
"loss": 0.8452,
"step": 310
},
{
"epoch": 0.4,
"grad_norm": 1.3028754268878384,
"learning_rate": 6.771509885538823e-06,
"loss": 0.8158,
"step": 311
},
{
"epoch": 0.41,
"grad_norm": 1.1123018120100558,
"learning_rate": 6.7518038807741915e-06,
"loss": 0.8729,
"step": 312
},
{
"epoch": 0.41,
"grad_norm": 1.1932793058105855,
"learning_rate": 6.7320668084648e-06,
"loss": 0.8522,
"step": 313
},
{
"epoch": 0.41,
"grad_norm": 1.1640197426308538,
"learning_rate": 6.712299018639772e-06,
"loss": 0.8811,
"step": 314
},
{
"epoch": 0.41,
"grad_norm": 1.2756920346871423,
"learning_rate": 6.692500861872996e-06,
"loss": 0.8499,
"step": 315
},
{
"epoch": 0.41,
"grad_norm": 1.2063462879314655,
"learning_rate": 6.672672689276902e-06,
"loss": 0.8401,
"step": 316
},
{
"epoch": 0.41,
"grad_norm": 1.4598555490832712,
"learning_rate": 6.652814852496242e-06,
"loss": 0.8271,
"step": 317
},
{
"epoch": 0.41,
"grad_norm": 1.0938343281591207,
"learning_rate": 6.6329277037018505e-06,
"loss": 0.8206,
"step": 318
},
{
"epoch": 0.41,
"grad_norm": 1.225705583990496,
"learning_rate": 6.6130115955843975e-06,
"loss": 0.862,
"step": 319
},
{
"epoch": 0.42,
"grad_norm": 1.2180868955960955,
"learning_rate": 6.593066881348133e-06,
"loss": 0.8253,
"step": 320
},
{
"epoch": 0.42,
"grad_norm": 1.213674446057375,
"learning_rate": 6.573093914704633e-06,
"loss": 0.833,
"step": 321
},
{
"epoch": 0.42,
"grad_norm": 1.2175598412319608,
"learning_rate": 6.553093049866509e-06,
"loss": 0.863,
"step": 322
},
{
"epoch": 0.42,
"grad_norm": 1.8426677722391969,
"learning_rate": 6.533064641541142e-06,
"loss": 0.8585,
"step": 323
},
{
"epoch": 0.42,
"grad_norm": 1.1805886991608463,
"learning_rate": 6.513009044924384e-06,
"loss": 0.8604,
"step": 324
},
{
"epoch": 0.42,
"grad_norm": 1.1638818327339862,
"learning_rate": 6.492926615694262e-06,
"loss": 0.8624,
"step": 325
},
{
"epoch": 0.42,
"grad_norm": 1.1190056074940464,
"learning_rate": 6.472817710004664e-06,
"loss": 0.8318,
"step": 326
},
{
"epoch": 0.42,
"grad_norm": 1.4703765166977123,
"learning_rate": 6.452682684479032e-06,
"loss": 0.8659,
"step": 327
},
{
"epoch": 0.43,
"grad_norm": 1.1488809794920523,
"learning_rate": 6.432521896204035e-06,
"loss": 0.8133,
"step": 328
},
{
"epoch": 0.43,
"grad_norm": 1.2077971564958,
"learning_rate": 6.412335702723224e-06,
"loss": 0.8488,
"step": 329
},
{
"epoch": 0.43,
"grad_norm": 1.335953923852408,
"learning_rate": 6.392124462030715e-06,
"loss": 0.8209,
"step": 330
},
{
"epoch": 0.43,
"grad_norm": 1.239560657787868,
"learning_rate": 6.371888532564817e-06,
"loss": 0.8582,
"step": 331
},
{
"epoch": 0.43,
"grad_norm": 1.123443909247595,
"learning_rate": 6.351628273201687e-06,
"loss": 0.8522,
"step": 332
},
{
"epoch": 0.43,
"grad_norm": 1.1930390364093206,
"learning_rate": 6.331344043248961e-06,
"loss": 0.8612,
"step": 333
},
{
"epoch": 0.43,
"grad_norm": 1.1651674600359125,
"learning_rate": 6.311036202439388e-06,
"loss": 0.8141,
"step": 334
},
{
"epoch": 0.44,
"grad_norm": 1.827712401238591,
"learning_rate": 6.290705110924442e-06,
"loss": 0.8257,
"step": 335
},
{
"epoch": 0.44,
"grad_norm": 1.1730736711077356,
"learning_rate": 6.270351129267944e-06,
"loss": 0.809,
"step": 336
},
{
"epoch": 0.44,
"grad_norm": 1.2321894607586943,
"learning_rate": 6.249974618439657e-06,
"loss": 0.865,
"step": 337
},
{
"epoch": 0.44,
"grad_norm": 1.2508419001037108,
"learning_rate": 6.229575939808893e-06,
"loss": 0.858,
"step": 338
},
{
"epoch": 0.44,
"grad_norm": 1.1664795356123143,
"learning_rate": 6.209155455138102e-06,
"loss": 0.8473,
"step": 339
},
{
"epoch": 0.44,
"grad_norm": 1.1058463932196927,
"learning_rate": 6.188713526576452e-06,
"loss": 0.827,
"step": 340
},
{
"epoch": 0.44,
"grad_norm": 1.5948496072571947,
"learning_rate": 6.1682505166534134e-06,
"loss": 0.8441,
"step": 341
},
{
"epoch": 0.44,
"grad_norm": 1.123050976281229,
"learning_rate": 6.1477667882723245e-06,
"loss": 0.824,
"step": 342
},
{
"epoch": 0.45,
"grad_norm": 1.1179511468396548,
"learning_rate": 6.127262704703956e-06,
"loss": 0.8116,
"step": 343
},
{
"epoch": 0.45,
"grad_norm": 1.448611313915091,
"learning_rate": 6.106738629580073e-06,
"loss": 0.8133,
"step": 344
},
{
"epoch": 0.45,
"grad_norm": 1.4292078314595598,
"learning_rate": 6.0861949268869814e-06,
"loss": 0.8445,
"step": 345
},
{
"epoch": 0.45,
"grad_norm": 1.1799038394669346,
"learning_rate": 6.065631960959072e-06,
"loss": 0.8298,
"step": 346
},
{
"epoch": 0.45,
"grad_norm": 1.2343005419584467,
"learning_rate": 6.045050096472363e-06,
"loss": 0.874,
"step": 347
},
{
"epoch": 0.45,
"grad_norm": 1.133793942853395,
"learning_rate": 6.024449698438033e-06,
"loss": 0.8373,
"step": 348
},
{
"epoch": 0.45,
"grad_norm": 1.1482697304859235,
"learning_rate": 6.003831132195943e-06,
"loss": 0.8291,
"step": 349
},
{
"epoch": 0.45,
"grad_norm": 1.0714069634839316,
"learning_rate": 5.983194763408161e-06,
"loss": 0.8038,
"step": 350
},
{
"epoch": 0.46,
"grad_norm": 1.189866180029149,
"learning_rate": 5.962540958052478e-06,
"loss": 0.8369,
"step": 351
},
{
"epoch": 0.46,
"grad_norm": 1.151937951000298,
"learning_rate": 5.94187008241591e-06,
"loss": 0.8724,
"step": 352
},
{
"epoch": 0.46,
"grad_norm": 1.1885630504841458,
"learning_rate": 5.921182503088212e-06,
"loss": 0.8363,
"step": 353
},
{
"epoch": 0.46,
"grad_norm": 1.2563198905659214,
"learning_rate": 5.900478586955374e-06,
"loss": 0.8414,
"step": 354
},
{
"epoch": 0.46,
"grad_norm": 1.0903183738957514,
"learning_rate": 5.879758701193108e-06,
"loss": 0.8104,
"step": 355
},
{
"epoch": 0.46,
"grad_norm": 1.3514303801827983,
"learning_rate": 5.8590232132603444e-06,
"loss": 0.8723,
"step": 356
},
{
"epoch": 0.46,
"grad_norm": 1.385355867796163,
"learning_rate": 5.838272490892708e-06,
"loss": 0.8155,
"step": 357
},
{
"epoch": 0.46,
"grad_norm": 1.4230336181646532,
"learning_rate": 5.817506902096007e-06,
"loss": 0.8227,
"step": 358
},
{
"epoch": 0.47,
"grad_norm": 1.292768981531148,
"learning_rate": 5.796726815139695e-06,
"loss": 0.8571,
"step": 359
},
{
"epoch": 0.47,
"grad_norm": 1.2735642058681054,
"learning_rate": 5.7759325985503435e-06,
"loss": 0.8342,
"step": 360
},
{
"epoch": 0.47,
"grad_norm": 1.2086676089354491,
"learning_rate": 5.755124621105111e-06,
"loss": 0.8496,
"step": 361
},
{
"epoch": 0.47,
"grad_norm": 1.244245262090597,
"learning_rate": 5.734303251825198e-06,
"loss": 0.8257,
"step": 362
},
{
"epoch": 0.47,
"grad_norm": 1.0803631521753734,
"learning_rate": 5.713468859969301e-06,
"loss": 0.813,
"step": 363
},
{
"epoch": 0.47,
"grad_norm": 1.1478802532788033,
"learning_rate": 5.6926218150270716e-06,
"loss": 0.8022,
"step": 364
},
{
"epoch": 0.47,
"grad_norm": 0.9961671906693075,
"learning_rate": 5.671762486712557e-06,
"loss": 0.8405,
"step": 365
},
{
"epoch": 0.48,
"grad_norm": 1.1541301819630243,
"learning_rate": 5.650891244957644e-06,
"loss": 0.8289,
"step": 366
},
{
"epoch": 0.48,
"grad_norm": 1.1824673976498992,
"learning_rate": 5.630008459905498e-06,
"loss": 0.8413,
"step": 367
},
{
"epoch": 0.48,
"grad_norm": 1.2250269994788847,
"learning_rate": 5.609114501904006e-06,
"loss": 0.8447,
"step": 368
},
{
"epoch": 0.48,
"grad_norm": 1.16055884464047,
"learning_rate": 5.588209741499196e-06,
"loss": 0.8173,
"step": 369
},
{
"epoch": 0.48,
"grad_norm": 1.1285506194740014,
"learning_rate": 5.567294549428678e-06,
"loss": 0.8435,
"step": 370
},
{
"epoch": 0.48,
"grad_norm": 1.3319483590214511,
"learning_rate": 5.54636929661506e-06,
"loss": 0.8393,
"step": 371
},
{
"epoch": 0.48,
"grad_norm": 1.1399581144803144,
"learning_rate": 5.525434354159374e-06,
"loss": 0.8383,
"step": 372
},
{
"epoch": 0.48,
"grad_norm": 1.2097290183876572,
"learning_rate": 5.504490093334493e-06,
"loss": 0.8489,
"step": 373
},
{
"epoch": 0.49,
"grad_norm": 4.352780644899712,
"learning_rate": 5.48353688557855e-06,
"loss": 0.8643,
"step": 374
},
{
"epoch": 0.49,
"grad_norm": 1.2582400293178824,
"learning_rate": 5.462575102488348e-06,
"loss": 0.805,
"step": 375
},
{
"epoch": 0.49,
"grad_norm": 1.574728499559222,
"learning_rate": 5.441605115812767e-06,
"loss": 0.8594,
"step": 376
},
{
"epoch": 0.49,
"grad_norm": 1.829195542286078,
"learning_rate": 5.420627297446179e-06,
"loss": 0.8765,
"step": 377
},
{
"epoch": 0.49,
"grad_norm": 1.0805397201337004,
"learning_rate": 5.399642019421844e-06,
"loss": 0.8453,
"step": 378
},
{
"epoch": 0.49,
"grad_norm": 1.2382172071093036,
"learning_rate": 5.378649653905316e-06,
"loss": 0.8332,
"step": 379
},
{
"epoch": 0.49,
"grad_norm": 1.1809812345716155,
"learning_rate": 5.357650573187847e-06,
"loss": 0.8254,
"step": 380
},
{
"epoch": 0.49,
"grad_norm": 1.1932829048262574,
"learning_rate": 5.336645149679775e-06,
"loss": 0.8231,
"step": 381
},
{
"epoch": 0.5,
"grad_norm": 1.3006622982723932,
"learning_rate": 5.315633755903931e-06,
"loss": 0.8341,
"step": 382
},
{
"epoch": 0.5,
"grad_norm": 1.1773771127758201,
"learning_rate": 5.294616764489018e-06,
"loss": 0.82,
"step": 383
},
{
"epoch": 0.5,
"grad_norm": 1.120358554988103,
"learning_rate": 5.27359454816302e-06,
"loss": 0.8183,
"step": 384
},
{
"epoch": 0.5,
"grad_norm": 1.1479966317416317,
"learning_rate": 5.252567479746577e-06,
"loss": 0.8504,
"step": 385
},
{
"epoch": 0.5,
"grad_norm": 1.7373144722386622,
"learning_rate": 5.231535932146382e-06,
"loss": 0.8293,
"step": 386
},
{
"epoch": 0.5,
"grad_norm": 1.2159912654625296,
"learning_rate": 5.210500278348561e-06,
"loss": 0.828,
"step": 387
},
{
"epoch": 0.5,
"grad_norm": 1.2134302086400865,
"learning_rate": 5.1894608914120635e-06,
"loss": 0.8645,
"step": 388
},
{
"epoch": 0.51,
"grad_norm": 1.0591258858274246,
"learning_rate": 5.168418144462046e-06,
"loss": 0.8164,
"step": 389
},
{
"epoch": 0.51,
"grad_norm": 1.2186717818024067,
"learning_rate": 5.147372410683252e-06,
"loss": 0.8476,
"step": 390
},
{
"epoch": 0.51,
"grad_norm": 1.1213501657531966,
"learning_rate": 5.126324063313397e-06,
"loss": 0.8663,
"step": 391
},
{
"epoch": 0.51,
"grad_norm": 1.4491273350649847,
"learning_rate": 5.105273475636545e-06,
"loss": 0.8525,
"step": 392
},
{
"epoch": 0.51,
"grad_norm": 1.159514917414318,
"learning_rate": 5.084221020976491e-06,
"loss": 0.8317,
"step": 393
},
{
"epoch": 0.51,
"grad_norm": 1.1877065524083912,
"learning_rate": 5.063167072690144e-06,
"loss": 0.8363,
"step": 394
},
{
"epoch": 0.51,
"grad_norm": 1.0824757433851597,
"learning_rate": 5.042112004160898e-06,
"loss": 0.8384,
"step": 395
},
{
"epoch": 0.51,
"grad_norm": 1.1452248714301483,
"learning_rate": 5.021056188792014e-06,
"loss": 0.8789,
"step": 396
},
{
"epoch": 0.52,
"grad_norm": 1.1364607148991899,
"learning_rate": 5e-06,
"loss": 0.8524,
"step": 397
},
{
"epoch": 0.52,
"grad_norm": 1.1839720849840152,
"learning_rate": 4.978943811207988e-06,
"loss": 0.8741,
"step": 398
},
{
"epoch": 0.52,
"grad_norm": 1.5936600484839722,
"learning_rate": 4.957887995839104e-06,
"loss": 0.8254,
"step": 399
},
{
"epoch": 0.52,
"grad_norm": 1.0926397681862798,
"learning_rate": 4.936832927309858e-06,
"loss": 0.8252,
"step": 400
},
{
"epoch": 0.52,
"grad_norm": 1.0770992635214238,
"learning_rate": 4.915778979023511e-06,
"loss": 0.8048,
"step": 401
},
{
"epoch": 0.52,
"grad_norm": 1.3714502182024384,
"learning_rate": 4.894726524363456e-06,
"loss": 0.8148,
"step": 402
},
{
"epoch": 0.52,
"grad_norm": 1.1328097681910083,
"learning_rate": 4.873675936686604e-06,
"loss": 0.8155,
"step": 403
},
{
"epoch": 0.52,
"grad_norm": 1.1731809825959303,
"learning_rate": 4.852627589316749e-06,
"loss": 0.8593,
"step": 404
},
{
"epoch": 0.53,
"grad_norm": 1.049090459083091,
"learning_rate": 4.831581855537955e-06,
"loss": 0.8239,
"step": 405
},
{
"epoch": 0.53,
"grad_norm": 1.0993193737486686,
"learning_rate": 4.810539108587938e-06,
"loss": 0.8425,
"step": 406
},
{
"epoch": 0.53,
"grad_norm": 1.1941858463970723,
"learning_rate": 4.789499721651441e-06,
"loss": 0.8411,
"step": 407
},
{
"epoch": 0.53,
"grad_norm": 1.1456952108338223,
"learning_rate": 4.76846406785362e-06,
"loss": 0.8074,
"step": 408
},
{
"epoch": 0.53,
"grad_norm": 1.1787915666433677,
"learning_rate": 4.747432520253424e-06,
"loss": 0.8203,
"step": 409
},
{
"epoch": 0.53,
"grad_norm": 1.3051379948424053,
"learning_rate": 4.726405451836982e-06,
"loss": 0.8447,
"step": 410
},
{
"epoch": 0.53,
"grad_norm": 1.1233484298047998,
"learning_rate": 4.705383235510984e-06,
"loss": 0.8301,
"step": 411
},
{
"epoch": 0.54,
"grad_norm": 1.0834579202868906,
"learning_rate": 4.684366244096072e-06,
"loss": 0.8429,
"step": 412
},
{
"epoch": 0.54,
"grad_norm": 1.1507289567364096,
"learning_rate": 4.663354850320226e-06,
"loss": 0.8594,
"step": 413
},
{
"epoch": 0.54,
"grad_norm": 1.1908348320197186,
"learning_rate": 4.642349426812155e-06,
"loss": 0.8214,
"step": 414
},
{
"epoch": 0.54,
"grad_norm": 1.2504470609063638,
"learning_rate": 4.621350346094685e-06,
"loss": 0.8131,
"step": 415
},
{
"epoch": 0.54,
"grad_norm": 2.0624917538169445,
"learning_rate": 4.600357980578158e-06,
"loss": 0.8468,
"step": 416
},
{
"epoch": 0.54,
"grad_norm": 1.3297179381863848,
"learning_rate": 4.579372702553822e-06,
"loss": 0.7982,
"step": 417
},
{
"epoch": 0.54,
"grad_norm": 1.246901494601956,
"learning_rate": 4.558394884187234e-06,
"loss": 0.8227,
"step": 418
},
{
"epoch": 0.54,
"grad_norm": 1.151150781962948,
"learning_rate": 4.537424897511654e-06,
"loss": 0.8338,
"step": 419
},
{
"epoch": 0.55,
"grad_norm": 1.1660907114296764,
"learning_rate": 4.516463114421452e-06,
"loss": 0.8159,
"step": 420
},
{
"epoch": 0.55,
"grad_norm": 1.7766157190258682,
"learning_rate": 4.495509906665508e-06,
"loss": 0.8345,
"step": 421
},
{
"epoch": 0.55,
"grad_norm": 1.1857385105788216,
"learning_rate": 4.474565645840629e-06,
"loss": 0.8233,
"step": 422
},
{
"epoch": 0.55,
"grad_norm": 1.2264446822967827,
"learning_rate": 4.453630703384942e-06,
"loss": 0.8468,
"step": 423
},
{
"epoch": 0.55,
"grad_norm": 1.264976558078766,
"learning_rate": 4.432705450571323e-06,
"loss": 0.8165,
"step": 424
},
{
"epoch": 0.55,
"grad_norm": 1.1222621762765579,
"learning_rate": 4.411790258500805e-06,
"loss": 0.8184,
"step": 425
},
{
"epoch": 0.55,
"grad_norm": 1.2233198012545898,
"learning_rate": 4.390885498095996e-06,
"loss": 0.8601,
"step": 426
},
{
"epoch": 0.55,
"grad_norm": 1.1030451313547371,
"learning_rate": 4.369991540094503e-06,
"loss": 0.8259,
"step": 427
},
{
"epoch": 0.56,
"grad_norm": 1.2243881638199383,
"learning_rate": 4.3491087550423585e-06,
"loss": 0.8308,
"step": 428
},
{
"epoch": 0.56,
"grad_norm": 1.2802454455900687,
"learning_rate": 4.328237513287444e-06,
"loss": 0.8273,
"step": 429
},
{
"epoch": 0.56,
"grad_norm": 1.5883389737605764,
"learning_rate": 4.3073781849729276e-06,
"loss": 0.793,
"step": 430
},
{
"epoch": 0.56,
"grad_norm": 1.151105984490431,
"learning_rate": 4.286531140030699e-06,
"loss": 0.7827,
"step": 431
},
{
"epoch": 0.56,
"grad_norm": 1.2218234503282421,
"learning_rate": 4.265696748174803e-06,
"loss": 0.819,
"step": 432
},
{
"epoch": 0.56,
"grad_norm": 1.140797795358718,
"learning_rate": 4.2448753788948895e-06,
"loss": 0.8087,
"step": 433
},
{
"epoch": 0.56,
"grad_norm": 1.0760664395492803,
"learning_rate": 4.2240674014496565e-06,
"loss": 0.8267,
"step": 434
},
{
"epoch": 0.56,
"grad_norm": 1.1139625369896868,
"learning_rate": 4.203273184860306e-06,
"loss": 0.8008,
"step": 435
},
{
"epoch": 0.57,
"grad_norm": 1.4018290508347282,
"learning_rate": 4.1824930979039926e-06,
"loss": 0.8546,
"step": 436
},
{
"epoch": 0.57,
"grad_norm": 1.4091864309994824,
"learning_rate": 4.161727509107292e-06,
"loss": 0.7943,
"step": 437
},
{
"epoch": 0.57,
"grad_norm": 1.1324871046006824,
"learning_rate": 4.140976786739658e-06,
"loss": 0.7966,
"step": 438
},
{
"epoch": 0.57,
"grad_norm": 1.6224874962550682,
"learning_rate": 4.120241298806893e-06,
"loss": 0.8261,
"step": 439
},
{
"epoch": 0.57,
"grad_norm": 1.272631877145078,
"learning_rate": 4.099521413044627e-06,
"loss": 0.7966,
"step": 440
},
{
"epoch": 0.57,
"grad_norm": 1.1425226366031473,
"learning_rate": 4.078817496911788e-06,
"loss": 0.8261,
"step": 441
},
{
"epoch": 0.57,
"grad_norm": 1.4359985462900144,
"learning_rate": 4.058129917584091e-06,
"loss": 0.8568,
"step": 442
},
{
"epoch": 0.58,
"grad_norm": 1.0753954087608588,
"learning_rate": 4.037459041947523e-06,
"loss": 0.8217,
"step": 443
},
{
"epoch": 0.58,
"grad_norm": 1.2692450418319305,
"learning_rate": 4.016805236591839e-06,
"loss": 0.8673,
"step": 444
},
{
"epoch": 0.58,
"grad_norm": 1.1195139212914398,
"learning_rate": 3.996168867804058e-06,
"loss": 0.7953,
"step": 445
},
{
"epoch": 0.58,
"grad_norm": 1.3678518854634432,
"learning_rate": 3.975550301561968e-06,
"loss": 0.8095,
"step": 446
},
{
"epoch": 0.58,
"grad_norm": 1.1569918654905087,
"learning_rate": 3.9549499035276375e-06,
"loss": 0.8733,
"step": 447
},
{
"epoch": 0.58,
"grad_norm": 1.1854799970605574,
"learning_rate": 3.934368039040929e-06,
"loss": 0.8126,
"step": 448
},
{
"epoch": 0.58,
"grad_norm": 1.3730103333668784,
"learning_rate": 3.9138050731130185e-06,
"loss": 0.8309,
"step": 449
},
{
"epoch": 0.58,
"grad_norm": 1.1140616423192409,
"learning_rate": 3.893261370419927e-06,
"loss": 0.8065,
"step": 450
},
{
"epoch": 0.59,
"grad_norm": 1.4052106203909946,
"learning_rate": 3.872737295296044e-06,
"loss": 0.8248,
"step": 451
},
{
"epoch": 0.59,
"grad_norm": 1.1758315380501903,
"learning_rate": 3.852233211727676e-06,
"loss": 0.8342,
"step": 452
},
{
"epoch": 0.59,
"grad_norm": 1.4482783731512796,
"learning_rate": 3.8317494833465865e-06,
"loss": 0.8264,
"step": 453
},
{
"epoch": 0.59,
"grad_norm": 1.0844929617557844,
"learning_rate": 3.811286473423549e-06,
"loss": 0.8268,
"step": 454
},
{
"epoch": 0.59,
"grad_norm": 1.152076315782049,
"learning_rate": 3.7908445448618992e-06,
"loss": 0.8079,
"step": 455
},
{
"epoch": 0.59,
"grad_norm": 1.1935228824138842,
"learning_rate": 3.7704240601911075e-06,
"loss": 0.8202,
"step": 456
},
{
"epoch": 0.59,
"grad_norm": 1.1813858261394568,
"learning_rate": 3.7500253815603442e-06,
"loss": 0.8646,
"step": 457
},
{
"epoch": 0.59,
"grad_norm": 1.2716301549560993,
"learning_rate": 3.729648870732058e-06,
"loss": 0.8167,
"step": 458
},
{
"epoch": 0.6,
"grad_norm": 1.177945879650482,
"learning_rate": 3.7092948890755577e-06,
"loss": 0.8678,
"step": 459
},
{
"epoch": 0.6,
"grad_norm": 1.214019403562676,
"learning_rate": 3.688963797560615e-06,
"loss": 0.8327,
"step": 460
},
{
"epoch": 0.6,
"grad_norm": 1.636773329857946,
"learning_rate": 3.6686559567510417e-06,
"loss": 0.824,
"step": 461
},
{
"epoch": 0.6,
"grad_norm": 1.0666034783382468,
"learning_rate": 3.648371726798316e-06,
"loss": 0.7909,
"step": 462
},
{
"epoch": 0.6,
"grad_norm": 1.361459612074104,
"learning_rate": 3.6281114674351846e-06,
"loss": 0.8477,
"step": 463
},
{
"epoch": 0.6,
"grad_norm": 1.6122680059960277,
"learning_rate": 3.6078755379692855e-06,
"loss": 0.8425,
"step": 464
},
{
"epoch": 0.6,
"grad_norm": 1.1605817366410531,
"learning_rate": 3.587664297276776e-06,
"loss": 0.8335,
"step": 465
},
{
"epoch": 0.61,
"grad_norm": 1.5046134018346586,
"learning_rate": 3.5674781037959683e-06,
"loss": 0.7833,
"step": 466
},
{
"epoch": 0.61,
"grad_norm": 1.0563278373051415,
"learning_rate": 3.5473173155209694e-06,
"loss": 0.799,
"step": 467
},
{
"epoch": 0.61,
"grad_norm": 1.0755240081794408,
"learning_rate": 3.527182289995339e-06,
"loss": 0.8536,
"step": 468
},
{
"epoch": 0.61,
"grad_norm": 1.1146568468192999,
"learning_rate": 3.5070733843057415e-06,
"loss": 0.8271,
"step": 469
},
{
"epoch": 0.61,
"grad_norm": 1.2145240314146524,
"learning_rate": 3.4869909550756177e-06,
"loss": 0.8215,
"step": 470
},
{
"epoch": 0.61,
"grad_norm": 1.1149256639601721,
"learning_rate": 3.4669353584588606e-06,
"loss": 0.8287,
"step": 471
},
{
"epoch": 0.61,
"grad_norm": 1.2796860456730539,
"learning_rate": 3.4469069501334932e-06,
"loss": 0.8484,
"step": 472
},
{
"epoch": 0.61,
"grad_norm": 1.073005938552458,
"learning_rate": 3.426906085295369e-06,
"loss": 0.8355,
"step": 473
},
{
"epoch": 0.62,
"grad_norm": 1.1930321678421913,
"learning_rate": 3.4069331186518677e-06,
"loss": 0.8197,
"step": 474
},
{
"epoch": 0.62,
"grad_norm": 1.1883434410680984,
"learning_rate": 3.3869884044156054e-06,
"loss": 0.7895,
"step": 475
},
{
"epoch": 0.62,
"grad_norm": 1.3604734593340317,
"learning_rate": 3.3670722962981516e-06,
"loss": 0.8288,
"step": 476
},
{
"epoch": 0.62,
"grad_norm": 1.0748441692901816,
"learning_rate": 3.3471851475037596e-06,
"loss": 0.8449,
"step": 477
},
{
"epoch": 0.62,
"grad_norm": 1.0860864001092179,
"learning_rate": 3.3273273107231007e-06,
"loss": 0.8468,
"step": 478
},
{
"epoch": 0.62,
"grad_norm": 1.1203049509506295,
"learning_rate": 3.3074991381270072e-06,
"loss": 0.7999,
"step": 479
},
{
"epoch": 0.62,
"grad_norm": 1.0833871352844642,
"learning_rate": 3.28770098136023e-06,
"loss": 0.7806,
"step": 480
},
{
"epoch": 0.62,
"grad_norm": 1.143657532263609,
"learning_rate": 3.2679331915352023e-06,
"loss": 0.8364,
"step": 481
},
{
"epoch": 0.63,
"grad_norm": 1.0808130722425977,
"learning_rate": 3.248196119225811e-06,
"loss": 0.8162,
"step": 482
},
{
"epoch": 0.63,
"grad_norm": 1.5790710971517254,
"learning_rate": 3.228490114461178e-06,
"loss": 0.7935,
"step": 483
},
{
"epoch": 0.63,
"grad_norm": 1.2311619644001286,
"learning_rate": 3.2088155267194586e-06,
"loss": 0.7944,
"step": 484
},
{
"epoch": 0.63,
"grad_norm": 1.282202384930966,
"learning_rate": 3.1891727049216375e-06,
"loss": 0.8352,
"step": 485
},
{
"epoch": 0.63,
"grad_norm": 1.4793811130434844,
"learning_rate": 3.169561997425342e-06,
"loss": 0.822,
"step": 486
},
{
"epoch": 0.63,
"grad_norm": 1.1796102209432577,
"learning_rate": 3.1499837520186676e-06,
"loss": 0.8111,
"step": 487
},
{
"epoch": 0.63,
"grad_norm": 1.1580009886459264,
"learning_rate": 3.130438315914005e-06,
"loss": 0.8148,
"step": 488
},
{
"epoch": 0.64,
"grad_norm": 1.0446124399556485,
"learning_rate": 3.110926035741886e-06,
"loss": 0.8328,
"step": 489
},
{
"epoch": 0.64,
"grad_norm": 1.651469788442752,
"learning_rate": 3.091447257544836e-06,
"loss": 0.8243,
"step": 490
},
{
"epoch": 0.64,
"grad_norm": 1.5532921877403698,
"learning_rate": 3.072002326771235e-06,
"loss": 0.8522,
"step": 491
},
{
"epoch": 0.64,
"grad_norm": 1.1116055858154035,
"learning_rate": 3.0525915882691923e-06,
"loss": 0.8214,
"step": 492
},
{
"epoch": 0.64,
"grad_norm": 1.1956196368057803,
"learning_rate": 3.0332153862804324e-06,
"loss": 0.8314,
"step": 493
},
{
"epoch": 0.64,
"grad_norm": 1.1689114541431895,
"learning_rate": 3.0138740644341887e-06,
"loss": 0.8838,
"step": 494
},
{
"epoch": 0.64,
"grad_norm": 1.248229372898906,
"learning_rate": 2.9945679657411054e-06,
"loss": 0.8347,
"step": 495
},
{
"epoch": 0.64,
"grad_norm": 1.1078504742591242,
"learning_rate": 2.9752974325871625e-06,
"loss": 0.8227,
"step": 496
},
{
"epoch": 0.65,
"grad_norm": 1.1900434139705938,
"learning_rate": 2.9560628067275966e-06,
"loss": 0.8188,
"step": 497
},
{
"epoch": 0.65,
"grad_norm": 1.3818403864096889,
"learning_rate": 2.9368644292808433e-06,
"loss": 0.8107,
"step": 498
},
{
"epoch": 0.65,
"grad_norm": 1.149364405276468,
"learning_rate": 2.917702640722488e-06,
"loss": 0.8319,
"step": 499
},
{
"epoch": 0.65,
"grad_norm": 1.1033046148197456,
"learning_rate": 2.898577780879227e-06,
"loss": 0.8056,
"step": 500
},
{
"epoch": 0.65,
"grad_norm": 1.4667019536685615,
"learning_rate": 2.879490188922837e-06,
"loss": 0.8301,
"step": 501
},
{
"epoch": 0.65,
"grad_norm": 1.0975707519773683,
"learning_rate": 2.86044020336417e-06,
"loss": 0.8436,
"step": 502
},
{
"epoch": 0.65,
"grad_norm": 1.124374118696095,
"learning_rate": 2.8414281620471347e-06,
"loss": 0.8468,
"step": 503
},
{
"epoch": 0.65,
"grad_norm": 1.858826654639766,
"learning_rate": 2.8224544021427234e-06,
"loss": 0.8187,
"step": 504
},
{
"epoch": 0.66,
"grad_norm": 1.0707690364920266,
"learning_rate": 2.803519260143014e-06,
"loss": 0.7986,
"step": 505
},
{
"epoch": 0.66,
"grad_norm": 1.4117752904872918,
"learning_rate": 2.784623071855217e-06,
"loss": 0.8525,
"step": 506
},
{
"epoch": 0.66,
"grad_norm": 1.127786410455673,
"learning_rate": 2.765766172395716e-06,
"loss": 0.8042,
"step": 507
},
{
"epoch": 0.66,
"grad_norm": 1.7330464613002825,
"learning_rate": 2.746948896184114e-06,
"loss": 0.8447,
"step": 508
},
{
"epoch": 0.66,
"grad_norm": 1.2899602920949957,
"learning_rate": 2.7281715769373205e-06,
"loss": 0.854,
"step": 509
},
{
"epoch": 0.66,
"grad_norm": 1.1424757403756332,
"learning_rate": 2.7094345476636185e-06,
"loss": 0.8148,
"step": 510
},
{
"epoch": 0.66,
"grad_norm": 1.2199975615104413,
"learning_rate": 2.6907381406567696e-06,
"loss": 0.8014,
"step": 511
},
{
"epoch": 0.66,
"grad_norm": 1.1194583712399984,
"learning_rate": 2.6720826874901083e-06,
"loss": 0.8419,
"step": 512
},
{
"epoch": 0.67,
"grad_norm": 1.16983949626066,
"learning_rate": 2.653468519010677e-06,
"loss": 0.8181,
"step": 513
},
{
"epoch": 0.67,
"grad_norm": 1.1476527120151712,
"learning_rate": 2.634895965333344e-06,
"loss": 0.8038,
"step": 514
},
{
"epoch": 0.67,
"grad_norm": 1.2383972572556945,
"learning_rate": 2.6163653558349613e-06,
"loss": 0.7947,
"step": 515
},
{
"epoch": 0.67,
"grad_norm": 1.294782640008379,
"learning_rate": 2.5978770191485115e-06,
"loss": 0.8118,
"step": 516
},
{
"epoch": 0.67,
"grad_norm": 1.1307407949263424,
"learning_rate": 2.5794312831572897e-06,
"loss": 0.8161,
"step": 517
},
{
"epoch": 0.67,
"grad_norm": 1.1156038483537878,
"learning_rate": 2.561028474989088e-06,
"loss": 0.8175,
"step": 518
},
{
"epoch": 0.67,
"grad_norm": 1.1441747497674815,
"learning_rate": 2.5426689210103813e-06,
"loss": 0.8345,
"step": 519
},
{
"epoch": 0.68,
"grad_norm": 1.1286048632129229,
"learning_rate": 2.5243529468205574e-06,
"loss": 0.8512,
"step": 520
},
{
"epoch": 0.68,
"grad_norm": 1.2143329409471455,
"learning_rate": 2.5060808772461275e-06,
"loss": 0.84,
"step": 521
},
{
"epoch": 0.68,
"grad_norm": 1.2053778551775718,
"learning_rate": 2.487853036334979e-06,
"loss": 0.8246,
"step": 522
},
{
"epoch": 0.68,
"grad_norm": 1.1960048327957544,
"learning_rate": 2.4696697473506122e-06,
"loss": 0.8231,
"step": 523
},
{
"epoch": 0.68,
"grad_norm": 1.295745581171811,
"learning_rate": 2.451531332766426e-06,
"loss": 0.8853,
"step": 524
},
{
"epoch": 0.68,
"grad_norm": 1.3067594332973278,
"learning_rate": 2.433438114259982e-06,
"loss": 0.8309,
"step": 525
},
{
"epoch": 0.68,
"grad_norm": 1.1373281583361006,
"learning_rate": 2.4153904127073137e-06,
"loss": 0.8146,
"step": 526
},
{
"epoch": 0.68,
"grad_norm": 1.1417580445878792,
"learning_rate": 2.397388548177227e-06,
"loss": 0.839,
"step": 527
},
{
"epoch": 0.69,
"grad_norm": 1.5599739904042915,
"learning_rate": 2.3794328399256235e-06,
"loss": 0.8294,
"step": 528
},
{
"epoch": 0.69,
"grad_norm": 1.625491080719815,
"learning_rate": 2.3615236063898474e-06,
"loss": 0.8558,
"step": 529
},
{
"epoch": 0.69,
"grad_norm": 1.1287172439081854,
"learning_rate": 2.343661165183025e-06,
"loss": 0.8196,
"step": 530
},
{
"epoch": 0.69,
"grad_norm": 1.2174944956603801,
"learning_rate": 2.325845833088448e-06,
"loss": 0.8036,
"step": 531
},
{
"epoch": 0.69,
"grad_norm": 1.251400066331298,
"learning_rate": 2.308077926053939e-06,
"loss": 0.8371,
"step": 532
},
{
"epoch": 0.69,
"grad_norm": 1.2121696312359778,
"learning_rate": 2.290357759186261e-06,
"loss": 0.8426,
"step": 533
},
{
"epoch": 0.69,
"grad_norm": 1.0604225747034348,
"learning_rate": 2.27268564674552e-06,
"loss": 0.8188,
"step": 534
},
{
"epoch": 0.69,
"grad_norm": 1.1011428657548785,
"learning_rate": 2.2550619021396e-06,
"loss": 0.8079,
"step": 535
},
{
"epoch": 0.7,
"grad_norm": 1.1723339573000198,
"learning_rate": 2.2374868379185998e-06,
"loss": 0.8178,
"step": 536
},
{
"epoch": 0.7,
"grad_norm": 1.135210308251682,
"learning_rate": 2.2199607657692874e-06,
"loss": 0.8045,
"step": 537
},
{
"epoch": 0.7,
"grad_norm": 1.3722545706665699,
"learning_rate": 2.2024839965095814e-06,
"loss": 0.8314,
"step": 538
},
{
"epoch": 0.7,
"grad_norm": 1.1631275771309266,
"learning_rate": 2.1850568400830268e-06,
"loss": 0.8411,
"step": 539
},
{
"epoch": 0.7,
"grad_norm": 1.0760153562190804,
"learning_rate": 2.1676796055533125e-06,
"loss": 0.8176,
"step": 540
},
{
"epoch": 0.7,
"grad_norm": 1.1177832971628443,
"learning_rate": 2.150352601098774e-06,
"loss": 0.8719,
"step": 541
},
{
"epoch": 0.7,
"grad_norm": 1.3419502743335265,
"learning_rate": 2.133076134006945e-06,
"loss": 0.8166,
"step": 542
},
{
"epoch": 0.71,
"grad_norm": 1.0758424378799882,
"learning_rate": 2.11585051066909e-06,
"loss": 0.7853,
"step": 543
},
{
"epoch": 0.71,
"grad_norm": 1.291711507267418,
"learning_rate": 2.0986760365747883e-06,
"loss": 0.829,
"step": 544
},
{
"epoch": 0.71,
"grad_norm": 1.0798176397290844,
"learning_rate": 2.081553016306504e-06,
"loss": 0.8003,
"step": 545
},
{
"epoch": 0.71,
"grad_norm": 1.1801650428025168,
"learning_rate": 2.0644817535341856e-06,
"loss": 0.8362,
"step": 546
},
{
"epoch": 0.71,
"grad_norm": 1.268664958156847,
"learning_rate": 2.0474625510098883e-06,
"loss": 0.837,
"step": 547
},
{
"epoch": 0.71,
"grad_norm": 1.1501634035936659,
"learning_rate": 2.0304957105623936e-06,
"loss": 0.8105,
"step": 548
},
{
"epoch": 0.71,
"grad_norm": 1.0585458538794812,
"learning_rate": 2.013581533091869e-06,
"loss": 0.8033,
"step": 549
},
{
"epoch": 0.71,
"grad_norm": 1.3468267171455577,
"learning_rate": 1.996720318564518e-06,
"loss": 0.8565,
"step": 550
},
{
"epoch": 0.72,
"grad_norm": 1.1428836719091247,
"learning_rate": 1.9799123660072744e-06,
"loss": 0.8195,
"step": 551
},
{
"epoch": 0.72,
"grad_norm": 1.206897896948396,
"learning_rate": 1.9631579735024854e-06,
"loss": 0.84,
"step": 552
},
{
"epoch": 0.72,
"grad_norm": 1.1891641075077786,
"learning_rate": 1.9464574381826367e-06,
"loss": 0.8356,
"step": 553
},
{
"epoch": 0.72,
"grad_norm": 1.222933588941957,
"learning_rate": 1.9298110562250787e-06,
"loss": 0.8156,
"step": 554
},
{
"epoch": 0.72,
"grad_norm": 1.155842038000571,
"learning_rate": 1.9132191228467685e-06,
"loss": 0.8097,
"step": 555
},
{
"epoch": 0.72,
"grad_norm": 1.140681245819448,
"learning_rate": 1.8966819322990455e-06,
"loss": 0.8128,
"step": 556
},
{
"epoch": 0.72,
"grad_norm": 1.1221328511746198,
"learning_rate": 1.8801997778623998e-06,
"loss": 0.8572,
"step": 557
},
{
"epoch": 0.72,
"grad_norm": 1.2526117843090938,
"learning_rate": 1.8637729518412861e-06,
"loss": 0.7972,
"step": 558
},
{
"epoch": 0.73,
"grad_norm": 1.1590154010447482,
"learning_rate": 1.8474017455589238e-06,
"loss": 0.8268,
"step": 559
},
{
"epoch": 0.73,
"grad_norm": 1.1525228183168728,
"learning_rate": 1.8310864493521453e-06,
"loss": 0.823,
"step": 560
},
{
"epoch": 0.73,
"grad_norm": 1.5610374437152565,
"learning_rate": 1.8148273525662336e-06,
"loss": 0.8313,
"step": 561
},
{
"epoch": 0.73,
"grad_norm": 1.518763002771371,
"learning_rate": 1.7986247435498033e-06,
"loss": 0.8418,
"step": 562
},
{
"epoch": 0.73,
"grad_norm": 1.2059483655034768,
"learning_rate": 1.7824789096496752e-06,
"loss": 0.8304,
"step": 563
},
{
"epoch": 0.73,
"grad_norm": 1.2028996101251008,
"learning_rate": 1.7663901372057907e-06,
"loss": 0.805,
"step": 564
},
{
"epoch": 0.73,
"grad_norm": 1.1237925328355798,
"learning_rate": 1.7503587115461286e-06,
"loss": 0.8279,
"step": 565
},
{
"epoch": 0.74,
"grad_norm": 1.1888955513675719,
"learning_rate": 1.7343849169816396e-06,
"loss": 0.8456,
"step": 566
},
{
"epoch": 0.74,
"grad_norm": 1.196964494587607,
"learning_rate": 1.7184690368012191e-06,
"loss": 0.8181,
"step": 567
},
{
"epoch": 0.74,
"grad_norm": 1.4923991366523424,
"learning_rate": 1.702611353266665e-06,
"loss": 0.8275,
"step": 568
},
{
"epoch": 0.74,
"grad_norm": 1.0896582192802815,
"learning_rate": 1.6868121476076877e-06,
"loss": 0.7931,
"step": 569
},
{
"epoch": 0.74,
"grad_norm": 1.1459479171554634,
"learning_rate": 1.6710717000169098e-06,
"loss": 0.8249,
"step": 570
},
{
"epoch": 0.74,
"grad_norm": 1.2161166441890499,
"learning_rate": 1.6553902896449092e-06,
"loss": 0.8541,
"step": 571
},
{
"epoch": 0.74,
"grad_norm": 1.181061357249287,
"learning_rate": 1.639768194595256e-06,
"loss": 0.7886,
"step": 572
},
{
"epoch": 0.74,
"grad_norm": 1.0881658213398064,
"learning_rate": 1.624205691919591e-06,
"loss": 0.8381,
"step": 573
},
{
"epoch": 0.75,
"grad_norm": 1.1232986701514402,
"learning_rate": 1.6087030576127082e-06,
"loss": 0.8017,
"step": 574
},
{
"epoch": 0.75,
"grad_norm": 1.1128820987957522,
"learning_rate": 1.5932605666076557e-06,
"loss": 0.8363,
"step": 575
},
{
"epoch": 0.75,
"grad_norm": 1.3744563473037221,
"learning_rate": 1.5778784927708695e-06,
"loss": 0.8154,
"step": 576
},
{
"epoch": 0.75,
"grad_norm": 1.120775385580711,
"learning_rate": 1.5625571088973051e-06,
"loss": 0.8199,
"step": 577
},
{
"epoch": 0.75,
"grad_norm": 1.1360877101243405,
"learning_rate": 1.5472966867056122e-06,
"loss": 0.8327,
"step": 578
},
{
"epoch": 0.75,
"grad_norm": 1.1280760221777546,
"learning_rate": 1.5320974968333025e-06,
"loss": 0.832,
"step": 579
},
{
"epoch": 0.75,
"grad_norm": 1.3216185504239597,
"learning_rate": 1.5169598088319642e-06,
"loss": 0.8328,
"step": 580
},
{
"epoch": 0.75,
"grad_norm": 1.1555998427076246,
"learning_rate": 1.5018838911624671e-06,
"loss": 0.7986,
"step": 581
},
{
"epoch": 0.76,
"grad_norm": 1.097188726494774,
"learning_rate": 1.486870011190214e-06,
"loss": 0.8139,
"step": 582
},
{
"epoch": 0.76,
"grad_norm": 1.313359424122802,
"learning_rate": 1.4719184351803927e-06,
"loss": 0.8247,
"step": 583
},
{
"epoch": 0.76,
"grad_norm": 1.2841336725150148,
"learning_rate": 1.457029428293254e-06,
"loss": 0.8214,
"step": 584
},
{
"epoch": 0.76,
"grad_norm": 1.4019331627637832,
"learning_rate": 1.4422032545794096e-06,
"loss": 0.8476,
"step": 585
},
{
"epoch": 0.76,
"grad_norm": 1.2479270352689151,
"learning_rate": 1.4274401769751496e-06,
"loss": 0.8596,
"step": 586
},
{
"epoch": 0.76,
"grad_norm": 1.190584202906868,
"learning_rate": 1.412740457297782e-06,
"loss": 0.7976,
"step": 587
},
{
"epoch": 0.76,
"grad_norm": 1.3358512600414083,
"learning_rate": 1.398104356240988e-06,
"loss": 0.8467,
"step": 588
},
{
"epoch": 0.76,
"grad_norm": 1.1134129745594221,
"learning_rate": 1.383532133370193e-06,
"loss": 0.7994,
"step": 589
},
{
"epoch": 0.77,
"grad_norm": 1.7236513155468485,
"learning_rate": 1.369024047117974e-06,
"loss": 0.7879,
"step": 590
},
{
"epoch": 0.77,
"grad_norm": 1.2979089557824244,
"learning_rate": 1.3545803547794639e-06,
"loss": 0.8403,
"step": 591
},
{
"epoch": 0.77,
"grad_norm": 1.235539117645581,
"learning_rate": 1.3402013125078039e-06,
"loss": 0.8364,
"step": 592
},
{
"epoch": 0.77,
"grad_norm": 1.5366346531355402,
"learning_rate": 1.325887175309582e-06,
"loss": 0.8379,
"step": 593
},
{
"epoch": 0.77,
"grad_norm": 1.121574181414578,
"learning_rate": 1.3116381970403302e-06,
"loss": 0.836,
"step": 594
},
{
"epoch": 0.77,
"grad_norm": 1.340696954512561,
"learning_rate": 1.2974546304000046e-06,
"loss": 0.8027,
"step": 595
},
{
"epoch": 0.77,
"grad_norm": 1.2132502416588156,
"learning_rate": 1.2833367269285168e-06,
"loss": 0.8077,
"step": 596
},
{
"epoch": 0.78,
"grad_norm": 1.1192375791395337,
"learning_rate": 1.2692847370012696e-06,
"loss": 0.8522,
"step": 597
},
{
"epoch": 0.78,
"grad_norm": 1.741500020742988,
"learning_rate": 1.2552989098247092e-06,
"loss": 0.8067,
"step": 598
},
{
"epoch": 0.78,
"grad_norm": 1.1150261698728017,
"learning_rate": 1.241379493431919e-06,
"loss": 0.8529,
"step": 599
},
{
"epoch": 0.78,
"grad_norm": 1.3385533216939478,
"learning_rate": 1.2275267346782067e-06,
"loss": 0.843,
"step": 600
},
{
"epoch": 0.78,
"grad_norm": 1.1806042588679646,
"learning_rate": 1.2137408792367388e-06,
"loss": 0.7897,
"step": 601
},
{
"epoch": 0.78,
"grad_norm": 1.2608515351865308,
"learning_rate": 1.2000221715941746e-06,
"loss": 0.8248,
"step": 602
},
{
"epoch": 0.78,
"grad_norm": 1.1452255358786536,
"learning_rate": 1.1863708550463372e-06,
"loss": 0.8283,
"step": 603
},
{
"epoch": 0.78,
"grad_norm": 1.584053048603632,
"learning_rate": 1.1727871716938904e-06,
"loss": 0.8472,
"step": 604
},
{
"epoch": 0.79,
"grad_norm": 1.16292088995077,
"learning_rate": 1.1592713624380553e-06,
"loss": 0.814,
"step": 605
},
{
"epoch": 0.79,
"grad_norm": 1.0574349634190905,
"learning_rate": 1.1458236669763323e-06,
"loss": 0.8029,
"step": 606
},
{
"epoch": 0.79,
"grad_norm": 1.1504507119536145,
"learning_rate": 1.132444323798247e-06,
"loss": 0.8376,
"step": 607
},
{
"epoch": 0.79,
"grad_norm": 1.1683086518860988,
"learning_rate": 1.1191335701811285e-06,
"loss": 0.8231,
"step": 608
},
{
"epoch": 0.79,
"grad_norm": 1.0085098659895237,
"learning_rate": 1.105891642185894e-06,
"loss": 0.8007,
"step": 609
},
{
"epoch": 0.79,
"grad_norm": 1.2738890651414907,
"learning_rate": 1.0927187746528695e-06,
"loss": 0.7735,
"step": 610
},
{
"epoch": 0.79,
"grad_norm": 1.0889145540805063,
"learning_rate": 1.0796152011976164e-06,
"loss": 0.8529,
"step": 611
},
{
"epoch": 0.79,
"grad_norm": 1.0941937359259284,
"learning_rate": 1.0665811542067988e-06,
"loss": 0.8374,
"step": 612
},
{
"epoch": 0.8,
"grad_norm": 1.1244112767282193,
"learning_rate": 1.0536168648340506e-06,
"loss": 0.8098,
"step": 613
},
{
"epoch": 0.8,
"grad_norm": 1.1652423680488342,
"learning_rate": 1.0407225629958883e-06,
"loss": 0.8586,
"step": 614
},
{
"epoch": 0.8,
"grad_norm": 1.371358727142576,
"learning_rate": 1.0278984773676214e-06,
"loss": 0.8302,
"step": 615
},
{
"epoch": 0.8,
"grad_norm": 1.282130434347346,
"learning_rate": 1.0151448353793064e-06,
"loss": 0.7846,
"step": 616
},
{
"epoch": 0.8,
"grad_norm": 1.0308140323506172,
"learning_rate": 1.0024618632117112e-06,
"loss": 0.8424,
"step": 617
},
{
"epoch": 0.8,
"grad_norm": 1.0307135501463645,
"learning_rate": 9.898497857922978e-07,
"loss": 0.804,
"step": 618
},
{
"epoch": 0.8,
"grad_norm": 1.1411850630578646,
"learning_rate": 9.773088267912423e-07,
"loss": 0.8061,
"step": 619
},
{
"epoch": 0.81,
"grad_norm": 1.0646486743244608,
"learning_rate": 9.648392086174612e-07,
"loss": 0.834,
"step": 620
},
{
"epoch": 0.81,
"grad_norm": 1.3421257146608427,
"learning_rate": 9.524411524146726e-07,
"loss": 0.849,
"step": 621
},
{
"epoch": 0.81,
"grad_norm": 1.094638647544279,
"learning_rate": 9.401148780574682e-07,
"loss": 0.8179,
"step": 622
},
{
"epoch": 0.81,
"grad_norm": 1.1619475819740164,
"learning_rate": 9.278606041474203e-07,
"loss": 0.8457,
"step": 623
},
{
"epoch": 0.81,
"grad_norm": 1.2816836446796047,
"learning_rate": 9.15678548009199e-07,
"loss": 0.8202,
"step": 624
},
{
"epoch": 0.81,
"grad_norm": 1.1426328170657212,
"learning_rate": 9.03568925686723e-07,
"loss": 0.8166,
"step": 625
},
{
"epoch": 0.81,
"grad_norm": 1.1247140360252137,
"learning_rate": 8.915319519393278e-07,
"loss": 0.8326,
"step": 626
},
{
"epoch": 0.81,
"grad_norm": 1.105494579178503,
"learning_rate": 8.795678402379498e-07,
"loss": 0.8282,
"step": 627
},
{
"epoch": 0.82,
"grad_norm": 1.0572570612604646,
"learning_rate": 8.676768027613525e-07,
"loss": 0.812,
"step": 628
},
{
"epoch": 0.82,
"grad_norm": 1.1209163806272,
"learning_rate": 8.558590503923509e-07,
"loss": 0.8326,
"step": 629
},
{
"epoch": 0.82,
"grad_norm": 1.415314790531641,
"learning_rate": 8.441147927140836e-07,
"loss": 0.8396,
"step": 630
},
{
"epoch": 0.82,
"grad_norm": 1.1376087262273429,
"learning_rate": 8.324442380062847e-07,
"loss": 0.8003,
"step": 631
},
{
"epoch": 0.82,
"grad_norm": 1.1053995721453131,
"learning_rate": 8.208475932416005e-07,
"loss": 0.8151,
"step": 632
},
{
"epoch": 0.82,
"grad_norm": 1.24029540386004,
"learning_rate": 8.093250640819095e-07,
"loss": 0.8624,
"step": 633
},
{
"epoch": 0.82,
"grad_norm": 1.2129126237029006,
"learning_rate": 7.978768548746818e-07,
"loss": 0.8647,
"step": 634
},
{
"epoch": 0.82,
"grad_norm": 1.2236973968631748,
"learning_rate": 7.865031686493546e-07,
"loss": 0.8326,
"step": 635
},
{
"epoch": 0.83,
"grad_norm": 1.4420200195634538,
"learning_rate": 7.752042071137239e-07,
"loss": 0.8318,
"step": 636
},
{
"epoch": 0.83,
"grad_norm": 1.152704237069485,
"learning_rate": 7.639801706503791e-07,
"loss": 0.7957,
"step": 637
},
{
"epoch": 0.83,
"grad_norm": 1.1410843484070319,
"learning_rate": 7.528312583131387e-07,
"loss": 0.8563,
"step": 638
},
{
"epoch": 0.83,
"grad_norm": 1.4441417221706796,
"learning_rate": 7.417576678235288e-07,
"loss": 0.8485,
"step": 639
},
{
"epoch": 0.83,
"grad_norm": 1.2782496928533214,
"learning_rate": 7.307595955672686e-07,
"loss": 0.8131,
"step": 640
},
{
"epoch": 0.83,
"grad_norm": 1.3239848225764461,
"learning_rate": 7.198372365907946e-07,
"loss": 0.8278,
"step": 641
},
{
"epoch": 0.83,
"grad_norm": 1.0124273493787945,
"learning_rate": 7.089907845977962e-07,
"loss": 0.8285,
"step": 642
},
{
"epoch": 0.84,
"grad_norm": 1.1984388728115332,
"learning_rate": 6.982204319457831e-07,
"loss": 0.8147,
"step": 643
},
{
"epoch": 0.84,
"grad_norm": 1.7431070656458465,
"learning_rate": 6.875263696426759e-07,
"loss": 0.8392,
"step": 644
},
{
"epoch": 0.84,
"grad_norm": 1.0841381618287178,
"learning_rate": 6.769087873434122e-07,
"loss": 0.7805,
"step": 645
},
{
"epoch": 0.84,
"grad_norm": 1.1363894535495727,
"learning_rate": 6.663678733465905e-07,
"loss": 0.8522,
"step": 646
},
{
"epoch": 0.84,
"grad_norm": 1.149142197303568,
"learning_rate": 6.55903814591125e-07,
"loss": 0.8143,
"step": 647
},
{
"epoch": 0.84,
"grad_norm": 1.2176090443322134,
"learning_rate": 6.455167966529357e-07,
"loss": 0.8031,
"step": 648
},
{
"epoch": 0.84,
"grad_norm": 1.210210988431294,
"learning_rate": 6.352070037416503e-07,
"loss": 0.8527,
"step": 649
},
{
"epoch": 0.84,
"grad_norm": 1.1647607912725104,
"learning_rate": 6.24974618697346e-07,
"loss": 0.8086,
"step": 650
},
{
"epoch": 0.85,
"grad_norm": 1.210283794503598,
"learning_rate": 6.148198229872981e-07,
"loss": 0.7931,
"step": 651
},
{
"epoch": 0.85,
"grad_norm": 1.1449560108431547,
"learning_rate": 6.04742796702768e-07,
"loss": 0.7754,
"step": 652
},
{
"epoch": 0.85,
"grad_norm": 1.229482257223063,
"learning_rate": 5.947437185558091e-07,
"loss": 0.7979,
"step": 653
},
{
"epoch": 0.85,
"grad_norm": 1.1064351563786532,
"learning_rate": 5.848227658760914e-07,
"loss": 0.8302,
"step": 654
},
{
"epoch": 0.85,
"grad_norm": 1.1639624432408997,
"learning_rate": 5.749801146077638e-07,
"loss": 0.783,
"step": 655
},
{
"epoch": 0.85,
"grad_norm": 1.3066438219306855,
"learning_rate": 5.652159393063295e-07,
"loss": 0.8009,
"step": 656
},
{
"epoch": 0.85,
"grad_norm": 1.443570241927174,
"learning_rate": 5.555304131355532e-07,
"loss": 0.8553,
"step": 657
},
{
"epoch": 0.85,
"grad_norm": 1.1154090455850014,
"learning_rate": 5.459237078643864e-07,
"loss": 0.8639,
"step": 658
},
{
"epoch": 0.86,
"grad_norm": 1.3023330347714015,
"learning_rate": 5.363959938639257e-07,
"loss": 0.7804,
"step": 659
},
{
"epoch": 0.86,
"grad_norm": 1.249767283287092,
"learning_rate": 5.269474401043861e-07,
"loss": 0.7951,
"step": 660
},
{
"epoch": 0.86,
"grad_norm": 1.1063372151581325,
"learning_rate": 5.175782141521107e-07,
"loss": 0.8224,
"step": 661
},
{
"epoch": 0.86,
"grad_norm": 1.1100286563923407,
"learning_rate": 5.082884821665918e-07,
"loss": 0.7727,
"step": 662
},
{
"epoch": 0.86,
"grad_norm": 2.3724062097558187,
"learning_rate": 4.990784088975298e-07,
"loss": 0.8162,
"step": 663
},
{
"epoch": 0.86,
"grad_norm": 1.0999505714218432,
"learning_rate": 4.899481576819116e-07,
"loss": 0.7921,
"step": 664
},
{
"epoch": 0.86,
"grad_norm": 1.1869204585037552,
"learning_rate": 4.808978904411066e-07,
"loss": 0.7788,
"step": 665
},
{
"epoch": 0.86,
"grad_norm": 1.203463681829628,
"learning_rate": 4.719277676780054e-07,
"loss": 0.8159,
"step": 666
},
{
"epoch": 0.87,
"grad_norm": 1.0816234617283351,
"learning_rate": 4.630379484741643e-07,
"loss": 0.8381,
"step": 667
},
{
"epoch": 0.87,
"grad_norm": 1.0235382933487907,
"learning_rate": 4.542285904869903e-07,
"loss": 0.82,
"step": 668
},
{
"epoch": 0.87,
"grad_norm": 1.1180943607355467,
"learning_rate": 4.4549984994694095e-07,
"loss": 0.8201,
"step": 669
},
{
"epoch": 0.87,
"grad_norm": 1.0643402324837885,
"learning_rate": 4.3685188165475847e-07,
"loss": 0.8194,
"step": 670
},
{
"epoch": 0.87,
"grad_norm": 1.2761426870344974,
"learning_rate": 4.2828483897871644e-07,
"loss": 0.8491,
"step": 671
},
{
"epoch": 0.87,
"grad_norm": 1.131085874662742,
"learning_rate": 4.197988738519099e-07,
"loss": 0.8208,
"step": 672
},
{
"epoch": 0.87,
"grad_norm": 1.4239166104583856,
"learning_rate": 4.11394136769554e-07,
"loss": 0.8546,
"step": 673
},
{
"epoch": 0.88,
"grad_norm": 1.1422745721220944,
"learning_rate": 4.030707767863151e-07,
"loss": 0.8394,
"step": 674
},
{
"epoch": 0.88,
"grad_norm": 1.163144758918877,
"learning_rate": 3.9482894151367193e-07,
"loss": 0.7996,
"step": 675
},
{
"epoch": 0.88,
"grad_norm": 1.1466246998351644,
"learning_rate": 3.866687771172917e-07,
"loss": 0.842,
"step": 676
},
{
"epoch": 0.88,
"grad_norm": 3.5669370889766294,
"learning_rate": 3.785904283144454e-07,
"loss": 0.8256,
"step": 677
},
{
"epoch": 0.88,
"grad_norm": 1.118821104205034,
"learning_rate": 3.705940383714318e-07,
"loss": 0.8273,
"step": 678
},
{
"epoch": 0.88,
"grad_norm": 1.323490909694076,
"learning_rate": 3.6267974910104696e-07,
"loss": 0.7964,
"step": 679
},
{
"epoch": 0.88,
"grad_norm": 1.115932337714536,
"learning_rate": 3.5484770086006037e-07,
"loss": 0.8155,
"step": 680
},
{
"epoch": 0.88,
"grad_norm": 1.0438668202049772,
"learning_rate": 3.470980325467316e-07,
"loss": 0.7981,
"step": 681
},
{
"epoch": 0.89,
"grad_norm": 1.1725553396037112,
"learning_rate": 3.394308815983455e-07,
"loss": 0.8166,
"step": 682
},
{
"epoch": 0.89,
"grad_norm": 1.0432079376075305,
"learning_rate": 3.318463839887714e-07,
"loss": 0.8048,
"step": 683
},
{
"epoch": 0.89,
"grad_norm": 1.1190181566238873,
"learning_rate": 3.243446742260581e-07,
"loss": 0.8143,
"step": 684
},
{
"epoch": 0.89,
"grad_norm": 1.2027311340026403,
"learning_rate": 3.169258853500423e-07,
"loss": 0.8072,
"step": 685
},
{
"epoch": 0.89,
"grad_norm": 1.0448826480431468,
"learning_rate": 3.095901489299935e-07,
"loss": 0.8156,
"step": 686
},
{
"epoch": 0.89,
"grad_norm": 1.192618042559272,
"learning_rate": 3.0233759506227646e-07,
"loss": 0.8091,
"step": 687
},
{
"epoch": 0.89,
"grad_norm": 1.5187121355543114,
"learning_rate": 2.951683523680504e-07,
"loss": 0.8679,
"step": 688
},
{
"epoch": 0.89,
"grad_norm": 1.0879754257424312,
"learning_rate": 2.8808254799097936e-07,
"loss": 0.7972,
"step": 689
},
{
"epoch": 0.9,
"grad_norm": 1.1613265458170998,
"learning_rate": 2.8108030759498583e-07,
"loss": 0.829,
"step": 690
},
{
"epoch": 0.9,
"grad_norm": 1.1230654665039808,
"learning_rate": 2.7416175536201794e-07,
"loss": 0.8032,
"step": 691
},
{
"epoch": 0.9,
"grad_norm": 1.1769239213597456,
"learning_rate": 2.673270139898443e-07,
"loss": 0.8239,
"step": 692
},
{
"epoch": 0.9,
"grad_norm": 1.0197306575365235,
"learning_rate": 2.605762046898852e-07,
"loss": 0.8078,
"step": 693
},
{
"epoch": 0.9,
"grad_norm": 1.1409475466437597,
"learning_rate": 2.539094471850562e-07,
"loss": 0.8414,
"step": 694
},
{
"epoch": 0.9,
"grad_norm": 1.4106234568201388,
"learning_rate": 2.4732685970765004e-07,
"loss": 0.8255,
"step": 695
},
{
"epoch": 0.9,
"grad_norm": 1.392418029087564,
"learning_rate": 2.408285589972353e-07,
"loss": 0.8616,
"step": 696
},
{
"epoch": 0.91,
"grad_norm": 1.1287766159634611,
"learning_rate": 2.3441466029859027e-07,
"loss": 0.8257,
"step": 697
},
{
"epoch": 0.91,
"grad_norm": 1.176991079235627,
"learning_rate": 2.280852773596548e-07,
"loss": 0.8592,
"step": 698
},
{
"epoch": 0.91,
"grad_norm": 1.159662292058494,
"learning_rate": 2.218405224295178e-07,
"loss": 0.8608,
"step": 699
},
{
"epoch": 0.91,
"grad_norm": 1.1122413124432546,
"learning_rate": 2.1568050625642323e-07,
"loss": 0.8338,
"step": 700
},
{
"epoch": 0.91,
"grad_norm": 1.3908225007282236,
"learning_rate": 2.0960533808580596e-07,
"loss": 0.824,
"step": 701
},
{
"epoch": 0.91,
"grad_norm": 1.0773093703955594,
"learning_rate": 2.0361512565835738e-07,
"loss": 0.8022,
"step": 702
},
{
"epoch": 0.91,
"grad_norm": 1.0665415052562757,
"learning_rate": 1.9770997520810965e-07,
"loss": 0.833,
"step": 703
},
{
"epoch": 0.91,
"grad_norm": 1.1527526893626212,
"learning_rate": 1.918899914605582e-07,
"loss": 0.8078,
"step": 704
},
{
"epoch": 0.92,
"grad_norm": 1.1609049518698369,
"learning_rate": 1.8615527763079678e-07,
"loss": 0.8044,
"step": 705
},
{
"epoch": 0.92,
"grad_norm": 1.4022083725557934,
"learning_rate": 1.8050593542169537e-07,
"loss": 0.8395,
"step": 706
},
{
"epoch": 0.92,
"grad_norm": 1.152545683730929,
"learning_rate": 1.7494206502208787e-07,
"loss": 0.8257,
"step": 707
},
{
"epoch": 0.92,
"grad_norm": 1.1491485626025015,
"learning_rate": 1.6946376510500406e-07,
"loss": 0.8384,
"step": 708
},
{
"epoch": 0.92,
"grad_norm": 1.052754057159298,
"learning_rate": 1.6407113282591204e-07,
"loss": 0.8274,
"step": 709
},
{
"epoch": 0.92,
"grad_norm": 1.0398923442519588,
"learning_rate": 1.5876426382099908e-07,
"loss": 0.8142,
"step": 710
},
{
"epoch": 0.92,
"grad_norm": 1.4521184232963287,
"learning_rate": 1.5354325220547638e-07,
"loss": 0.8346,
"step": 711
},
{
"epoch": 0.92,
"grad_norm": 1.1173628865842729,
"learning_rate": 1.4840819057190591e-07,
"loss": 0.8292,
"step": 712
},
{
"epoch": 0.93,
"grad_norm": 1.0382298520155955,
"learning_rate": 1.433591699885639e-07,
"loss": 0.7851,
"step": 713
},
{
"epoch": 0.93,
"grad_norm": 1.0096877302771539,
"learning_rate": 1.3839627999782056e-07,
"loss": 0.7929,
"step": 714
},
{
"epoch": 0.93,
"grad_norm": 1.0884547484191,
"learning_rate": 1.3351960861455515e-07,
"loss": 0.8375,
"step": 715
},
{
"epoch": 0.93,
"grad_norm": 1.1252011150006394,
"learning_rate": 1.287292423245945e-07,
"loss": 0.7717,
"step": 716
},
{
"epoch": 0.93,
"grad_norm": 1.040686560253859,
"learning_rate": 1.2402526608317812e-07,
"loss": 0.7949,
"step": 717
},
{
"epoch": 0.93,
"grad_norm": 3.0364147961167727,
"learning_rate": 1.1940776331345383e-07,
"loss": 0.7683,
"step": 718
},
{
"epoch": 0.93,
"grad_norm": 1.1201818935745307,
"learning_rate": 1.1487681590499456e-07,
"loss": 0.8266,
"step": 719
},
{
"epoch": 0.94,
"grad_norm": 1.3409144389292686,
"learning_rate": 1.1043250421235107e-07,
"loss": 0.874,
"step": 720
},
{
"epoch": 0.94,
"grad_norm": 1.182209135051885,
"learning_rate": 1.06074907053621e-07,
"loss": 0.8179,
"step": 721
},
{
"epoch": 0.94,
"grad_norm": 1.1131457112516843,
"learning_rate": 1.0180410170905819e-07,
"loss": 0.8328,
"step": 722
},
{
"epoch": 0.94,
"grad_norm": 1.190739874954435,
"learning_rate": 9.762016391969386e-08,
"loss": 0.7798,
"step": 723
},
{
"epoch": 0.94,
"grad_norm": 1.1368913561260203,
"learning_rate": 9.352316788600102e-08,
"loss": 0.8303,
"step": 724
},
{
"epoch": 0.94,
"grad_norm": 1.1850727243568353,
"learning_rate": 8.95131862665749e-08,
"loss": 0.7791,
"step": 725
},
{
"epoch": 0.94,
"grad_norm": 1.1643319143252835,
"learning_rate": 8.559029017684184e-08,
"loss": 0.8276,
"step": 726
},
{
"epoch": 0.94,
"grad_norm": 1.1274946360164755,
"learning_rate": 8.175454918780467e-08,
"loss": 0.8095,
"step": 727
},
{
"epoch": 0.95,
"grad_norm": 1.1648118926590638,
"learning_rate": 7.800603132480322e-08,
"loss": 0.8263,
"step": 728
},
{
"epoch": 0.95,
"grad_norm": 1.1391257495952203,
"learning_rate": 7.434480306630965e-08,
"loss": 0.7894,
"step": 729
},
{
"epoch": 0.95,
"grad_norm": 1.5657383282724364,
"learning_rate": 7.077092934275054e-08,
"loss": 0.8102,
"step": 730
},
{
"epoch": 0.95,
"grad_norm": 1.1438725716115983,
"learning_rate": 6.72844735353545e-08,
"loss": 0.8541,
"step": 731
},
{
"epoch": 0.95,
"grad_norm": 1.3162108137240458,
"learning_rate": 6.388549747502748e-08,
"loss": 0.814,
"step": 732
},
{
"epoch": 0.95,
"grad_norm": 1.2615633941509532,
"learning_rate": 6.057406144125755e-08,
"loss": 0.7947,
"step": 733
},
{
"epoch": 0.95,
"grad_norm": 1.0545752870068084,
"learning_rate": 5.7350224161046294e-08,
"loss": 0.8528,
"step": 734
},
{
"epoch": 0.95,
"grad_norm": 1.1412889461353484,
"learning_rate": 5.421404280786302e-08,
"loss": 0.812,
"step": 735
},
{
"epoch": 0.96,
"grad_norm": 1.2921519350201203,
"learning_rate": 5.116557300063774e-08,
"loss": 0.8166,
"step": 736
},
{
"epoch": 0.96,
"grad_norm": 1.1139933495136198,
"learning_rate": 4.8204868802768645e-08,
"loss": 0.8318,
"step": 737
},
{
"epoch": 0.96,
"grad_norm": 1.0513282814995402,
"learning_rate": 4.5331982721167345e-08,
"loss": 0.836,
"step": 738
},
{
"epoch": 0.96,
"grad_norm": 1.232938382319419,
"learning_rate": 4.254696570532402e-08,
"loss": 0.8203,
"step": 739
},
{
"epoch": 0.96,
"grad_norm": 1.2688797329670713,
"learning_rate": 3.98498671464087e-08,
"loss": 0.838,
"step": 740
},
{
"epoch": 0.96,
"grad_norm": 1.2712704012974276,
"learning_rate": 3.7240734876389796e-08,
"loss": 0.8357,
"step": 741
},
{
"epoch": 0.96,
"grad_norm": 1.2446289864712674,
"learning_rate": 3.47196151671908e-08,
"loss": 0.8294,
"step": 742
},
{
"epoch": 0.96,
"grad_norm": 1.1249447697847985,
"learning_rate": 3.2286552729866585e-08,
"loss": 0.8005,
"step": 743
},
{
"epoch": 0.97,
"grad_norm": 1.1970341383463035,
"learning_rate": 2.9941590713810645e-08,
"loss": 0.8006,
"step": 744
},
{
"epoch": 0.97,
"grad_norm": 1.052001308737665,
"learning_rate": 2.768477070599185e-08,
"loss": 0.8469,
"step": 745
},
{
"epoch": 0.97,
"grad_norm": 1.4637979916072497,
"learning_rate": 2.5516132730215028e-08,
"loss": 0.833,
"step": 746
},
{
"epoch": 0.97,
"grad_norm": 1.1844659936098128,
"learning_rate": 2.3435715246411527e-08,
"loss": 0.8483,
"step": 747
},
{
"epoch": 0.97,
"grad_norm": 1.0850626596964983,
"learning_rate": 2.1443555149957552e-08,
"loss": 0.8224,
"step": 748
},
{
"epoch": 0.97,
"grad_norm": 1.2824700674922327,
"learning_rate": 1.9539687771019666e-08,
"loss": 0.8203,
"step": 749
},
{
"epoch": 0.97,
"grad_norm": 1.1706439545733154,
"learning_rate": 1.772414687392865e-08,
"loss": 0.8267,
"step": 750
},
{
"epoch": 0.98,
"grad_norm": 1.3274049419404885,
"learning_rate": 1.5996964656579405e-08,
"loss": 0.8095,
"step": 751
},
{
"epoch": 0.98,
"grad_norm": 1.1111642410420237,
"learning_rate": 1.4358171749861427e-08,
"loss": 0.7953,
"step": 752
},
{
"epoch": 0.98,
"grad_norm": 1.0865690516156918,
"learning_rate": 1.2807797217114782e-08,
"loss": 0.8062,
"step": 753
},
{
"epoch": 0.98,
"grad_norm": 1.1521753176211016,
"learning_rate": 1.1345868553615525e-08,
"loss": 0.8374,
"step": 754
},
{
"epoch": 0.98,
"grad_norm": 1.463454458386019,
"learning_rate": 9.972411686085537e-09,
"loss": 0.829,
"step": 755
},
{
"epoch": 0.98,
"grad_norm": 1.3936488651540875,
"learning_rate": 8.687450972237332e-09,
"loss": 0.803,
"step": 756
},
{
"epoch": 0.98,
"grad_norm": 1.136244795446509,
"learning_rate": 7.49100920033663e-09,
"loss": 0.8198,
"step": 757
},
{
"epoch": 0.98,
"grad_norm": 1.1217747805128075,
"learning_rate": 6.383107588802673e-09,
"loss": 0.8234,
"step": 758
},
{
"epoch": 0.99,
"grad_norm": 1.0561447394170373,
"learning_rate": 5.363765785829644e-09,
"loss": 0.8227,
"step": 759
},
{
"epoch": 0.99,
"grad_norm": 1.1412749499521242,
"learning_rate": 4.433001869039166e-09,
"loss": 0.8051,
"step": 760
},
{
"epoch": 0.99,
"grad_norm": 1.2442817598179234,
"learning_rate": 3.590832345158335e-09,
"loss": 0.8286,
"step": 761
},
{
"epoch": 0.99,
"grad_norm": 1.9819842833626258,
"learning_rate": 2.8372721497288423e-09,
"loss": 0.8063,
"step": 762
},
{
"epoch": 0.99,
"grad_norm": 1.1106115527855582,
"learning_rate": 2.172334646841079e-09,
"loss": 0.7925,
"step": 763
},
{
"epoch": 0.99,
"grad_norm": 1.060032513959221,
"learning_rate": 1.596031628896544e-09,
"loss": 0.8132,
"step": 764
},
{
"epoch": 0.99,
"grad_norm": 1.1721837091645986,
"learning_rate": 1.1083733164007904e-09,
"loss": 0.8032,
"step": 765
},
{
"epoch": 0.99,
"grad_norm": 1.1718732472907902,
"learning_rate": 7.093683577791277e-10,
"loss": 0.8607,
"step": 766
},
{
"epoch": 1.0,
"grad_norm": 1.1656756317925252,
"learning_rate": 3.99023829225631e-10,
"loss": 0.8347,
"step": 767
},
{
"epoch": 1.0,
"grad_norm": 1.200374279193499,
"learning_rate": 1.7734523457824116e-10,
"loss": 0.8098,
"step": 768
},
{
"epoch": 1.0,
"grad_norm": 1.2441092942236753,
"learning_rate": 4.433650521717958e-11,
"loss": 0.7924,
"step": 769
},
{
"epoch": 1.0,
"grad_norm": 1.0824485212601966,
"learning_rate": 0.0,
"loss": 0.8336,
"step": 770
},
{
"epoch": 1.0,
"step": 770,
"total_flos": 423037538631680.0,
"train_loss": 0.8593585531432907,
"train_runtime": 3300.3249,
"train_samples_per_second": 29.87,
"train_steps_per_second": 0.233
}
],
"logging_steps": 1.0,
"max_steps": 770,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 423037538631680.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}