{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8596001859600186,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004649000464900047,
"grad_norm": 0.3293333649635315,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.2004,
"step": 1
},
{
"epoch": 0.009298000929800094,
"grad_norm": 0.3225855827331543,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.2289,
"step": 2
},
{
"epoch": 0.01394700139470014,
"grad_norm": 0.3228434920310974,
"learning_rate": 5e-05,
"loss": 1.2513,
"step": 3
},
{
"epoch": 0.018596001859600187,
"grad_norm": 0.3422330915927887,
"learning_rate": 6.666666666666667e-05,
"loss": 1.2952,
"step": 4
},
{
"epoch": 0.023245002324500233,
"grad_norm": 0.35845088958740234,
"learning_rate": 8.333333333333334e-05,
"loss": 1.3072,
"step": 5
},
{
"epoch": 0.02789400278940028,
"grad_norm": 0.41468068957328796,
"learning_rate": 0.0001,
"loss": 1.4565,
"step": 6
},
{
"epoch": 0.032543003254300325,
"grad_norm": 0.45462965965270996,
"learning_rate": 0.00011666666666666668,
"loss": 1.4221,
"step": 7
},
{
"epoch": 0.037192003719200374,
"grad_norm": 0.5441383719444275,
"learning_rate": 0.00013333333333333334,
"loss": 1.4777,
"step": 8
},
{
"epoch": 0.04184100418410042,
"grad_norm": 0.7736104726791382,
"learning_rate": 0.00015000000000000001,
"loss": 1.5793,
"step": 9
},
{
"epoch": 0.046490004649000466,
"grad_norm": 1.1128149032592773,
"learning_rate": 0.0001666666666666667,
"loss": 1.7799,
"step": 10
},
{
"epoch": 0.05113900511390051,
"grad_norm": 1.295057773590088,
"learning_rate": 0.00018333333333333334,
"loss": 1.6539,
"step": 11
},
{
"epoch": 0.05578800557880056,
"grad_norm": 2.0307557582855225,
"learning_rate": 0.0002,
"loss": 1.6614,
"step": 12
},
{
"epoch": 0.06043700604370061,
"grad_norm": 1.489993691444397,
"learning_rate": 0.0001999970498341241,
"loss": 1.3891,
"step": 13
},
{
"epoch": 0.06508600650860065,
"grad_norm": 0.45855778455734253,
"learning_rate": 0.0001999881995299069,
"loss": 1.1051,
"step": 14
},
{
"epoch": 0.0697350069735007,
"grad_norm": 0.464190274477005,
"learning_rate": 0.0001999734496675677,
"loss": 1.1558,
"step": 15
},
{
"epoch": 0.07438400743840075,
"grad_norm": 0.47878339886665344,
"learning_rate": 0.00019995280121409636,
"loss": 1.1533,
"step": 16
},
{
"epoch": 0.07903300790330078,
"grad_norm": 0.4506080448627472,
"learning_rate": 0.00019992625552318972,
"loss": 1.2074,
"step": 17
},
{
"epoch": 0.08368200836820083,
"grad_norm": 0.4147086441516876,
"learning_rate": 0.00019989381433516316,
"loss": 1.274,
"step": 18
},
{
"epoch": 0.08833100883310088,
"grad_norm": 0.3981817066669464,
"learning_rate": 0.00019985547977683643,
"loss": 1.2457,
"step": 19
},
{
"epoch": 0.09298000929800093,
"grad_norm": 0.3968053162097931,
"learning_rate": 0.00019981125436139405,
"loss": 1.1961,
"step": 20
},
{
"epoch": 0.09762900976290098,
"grad_norm": 0.5115089416503906,
"learning_rate": 0.00019976114098822073,
"loss": 1.228,
"step": 21
},
{
"epoch": 0.10227801022780102,
"grad_norm": 0.5449497103691101,
"learning_rate": 0.00019970514294271124,
"loss": 1.1062,
"step": 22
},
{
"epoch": 0.10692701069270107,
"grad_norm": 0.5306552052497864,
"learning_rate": 0.00019964326389605496,
"loss": 1.0354,
"step": 23
},
{
"epoch": 0.11157601115760112,
"grad_norm": 0.5531755089759827,
"learning_rate": 0.00019957550790499526,
"loss": 1.0759,
"step": 24
},
{
"epoch": 0.11622501162250116,
"grad_norm": 0.9215659499168396,
"learning_rate": 0.00019950187941156355,
"loss": 1.0,
"step": 25
},
{
"epoch": 0.12087401208740121,
"grad_norm": 0.2715967297554016,
"learning_rate": 0.00019942238324278803,
"loss": 1.0803,
"step": 26
},
{
"epoch": 0.12552301255230125,
"grad_norm": 0.24064235389232635,
"learning_rate": 0.00019933702461037716,
"loss": 1.0892,
"step": 27
},
{
"epoch": 0.1301720130172013,
"grad_norm": 0.18895158171653748,
"learning_rate": 0.00019924580911037827,
"loss": 1.1303,
"step": 28
},
{
"epoch": 0.13482101348210135,
"grad_norm": 0.1712905615568161,
"learning_rate": 0.00019914874272281032,
"loss": 1.1635,
"step": 29
},
{
"epoch": 0.1394700139470014,
"grad_norm": 0.18927285075187683,
"learning_rate": 0.00019904583181127206,
"loss": 1.1706,
"step": 30
},
{
"epoch": 0.14411901441190145,
"grad_norm": 0.2120945006608963,
"learning_rate": 0.0001989370831225248,
"loss": 1.1774,
"step": 31
},
{
"epoch": 0.1487680148768015,
"grad_norm": 0.2811720371246338,
"learning_rate": 0.00019882250378605015,
"loss": 1.1867,
"step": 32
},
{
"epoch": 0.15341701534170155,
"grad_norm": 0.3283132016658783,
"learning_rate": 0.00019870210131358253,
"loss": 1.2136,
"step": 33
},
{
"epoch": 0.15806601580660157,
"grad_norm": 0.3853696286678314,
"learning_rate": 0.0001985758835986167,
"loss": 1.1155,
"step": 34
},
{
"epoch": 0.16271501627150162,
"grad_norm": 0.3992000222206116,
"learning_rate": 0.0001984438589158903,
"loss": 1.071,
"step": 35
},
{
"epoch": 0.16736401673640167,
"grad_norm": 0.3955259621143341,
"learning_rate": 0.0001983060359208415,
"loss": 0.9018,
"step": 36
},
{
"epoch": 0.17201301720130172,
"grad_norm": 0.44543391466140747,
"learning_rate": 0.00019816242364904132,
"loss": 0.8921,
"step": 37
},
{
"epoch": 0.17666201766620176,
"grad_norm": 0.5863587856292725,
"learning_rate": 0.00019801303151560138,
"loss": 1.0336,
"step": 38
},
{
"epoch": 0.18131101813110181,
"grad_norm": 0.13918030261993408,
"learning_rate": 0.0001978578693145566,
"loss": 1.1035,
"step": 39
},
{
"epoch": 0.18596001859600186,
"grad_norm": 0.14183400571346283,
"learning_rate": 0.00019769694721822337,
"loss": 1.103,
"step": 40
},
{
"epoch": 0.1906090190609019,
"grad_norm": 0.11069459468126297,
"learning_rate": 0.00019753027577653213,
"loss": 1.11,
"step": 41
},
{
"epoch": 0.19525801952580196,
"grad_norm": 0.1001875028014183,
"learning_rate": 0.00019735786591633633,
"loss": 1.1537,
"step": 42
},
{
"epoch": 0.199907019990702,
"grad_norm": 0.137644425034523,
"learning_rate": 0.0001971797289406956,
"loss": 1.1678,
"step": 43
},
{
"epoch": 0.20455602045560203,
"grad_norm": 0.13967232406139374,
"learning_rate": 0.00019699587652813503,
"loss": 1.147,
"step": 44
},
{
"epoch": 0.20920502092050208,
"grad_norm": 0.19884178042411804,
"learning_rate": 0.00019680632073187931,
"loss": 1.1424,
"step": 45
},
{
"epoch": 0.21385402138540213,
"grad_norm": 0.22492916882038116,
"learning_rate": 0.00019661107397906275,
"loss": 1.061,
"step": 46
},
{
"epoch": 0.21850302185030218,
"grad_norm": 0.2543758153915405,
"learning_rate": 0.00019641014906991437,
"loss": 0.9985,
"step": 47
},
{
"epoch": 0.22315202231520223,
"grad_norm": 0.3426196873188019,
"learning_rate": 0.00019620355917691884,
"loss": 0.935,
"step": 48
},
{
"epoch": 0.22780102278010228,
"grad_norm": 0.39695194363594055,
"learning_rate": 0.00019599131784395297,
"loss": 0.8601,
"step": 49
},
{
"epoch": 0.23245002324500233,
"grad_norm": 0.762519359588623,
"learning_rate": 0.00019577343898539748,
"loss": 0.9861,
"step": 50
},
{
"epoch": 0.23709902370990238,
"grad_norm": 0.10567212849855423,
"learning_rate": 0.00019554993688522524,
"loss": 1.0594,
"step": 51
},
{
"epoch": 0.24174802417480243,
"grad_norm": 0.11329612880945206,
"learning_rate": 0.00019532082619606436,
"loss": 1.0659,
"step": 52
},
{
"epoch": 0.24639702463970248,
"grad_norm": 0.1084655150771141,
"learning_rate": 0.00019508612193823793,
"loss": 1.0896,
"step": 53
},
{
"epoch": 0.2510460251046025,
"grad_norm": 0.1181301698088646,
"learning_rate": 0.00019484583949877908,
"loss": 1.1282,
"step": 54
},
{
"epoch": 0.2556950255695026,
"grad_norm": 0.14401216804981232,
"learning_rate": 0.00019459999463042237,
"loss": 1.118,
"step": 55
},
{
"epoch": 0.2603440260344026,
"grad_norm": 0.1677497774362564,
"learning_rate": 0.00019434860345057096,
"loss": 1.1425,
"step": 56
},
{
"epoch": 0.2649930264993027,
"grad_norm": 0.20358914136886597,
"learning_rate": 0.00019409168244023987,
"loss": 1.1249,
"step": 57
},
{
"epoch": 0.2696420269642027,
"grad_norm": 0.26533111929893494,
"learning_rate": 0.00019382924844297582,
"loss": 1.1438,
"step": 58
},
{
"epoch": 0.2742910274291027,
"grad_norm": 0.33439338207244873,
"learning_rate": 0.0001935613186637526,
"loss": 1.0949,
"step": 59
},
{
"epoch": 0.2789400278940028,
"grad_norm": 0.4424538016319275,
"learning_rate": 0.0001932879106678434,
"loss": 1.0185,
"step": 60
},
{
"epoch": 0.2835890283589028,
"grad_norm": 0.4415128827095032,
"learning_rate": 0.00019300904237966906,
"loss": 0.9373,
"step": 61
},
{
"epoch": 0.2882380288238029,
"grad_norm": 0.4046485424041748,
"learning_rate": 0.00019272473208162313,
"loss": 0.8202,
"step": 62
},
{
"epoch": 0.2928870292887029,
"grad_norm": 0.324368417263031,
"learning_rate": 0.00019243499841287308,
"loss": 0.9427,
"step": 63
},
{
"epoch": 0.297536029753603,
"grad_norm": 0.07730630785226822,
"learning_rate": 0.00019213986036813863,
"loss": 1.0637,
"step": 64
},
{
"epoch": 0.302185030218503,
"grad_norm": 0.08834154158830643,
"learning_rate": 0.0001918393372964461,
"loss": 1.0867,
"step": 65
},
{
"epoch": 0.3068340306834031,
"grad_norm": 0.09792964160442352,
"learning_rate": 0.00019153344889986023,
"loss": 1.0948,
"step": 66
},
{
"epoch": 0.3114830311483031,
"grad_norm": 0.08694823086261749,
"learning_rate": 0.0001912222152321923,
"loss": 1.1405,
"step": 67
},
{
"epoch": 0.31613203161320313,
"grad_norm": 0.12065356969833374,
"learning_rate": 0.0001909056566976856,
"loss": 1.1431,
"step": 68
},
{
"epoch": 0.3207810320781032,
"grad_norm": 0.1258689910173416,
"learning_rate": 0.00019058379404967757,
"loss": 1.1709,
"step": 69
},
{
"epoch": 0.32543003254300323,
"grad_norm": 0.14431503415107727,
"learning_rate": 0.0001902566483892393,
"loss": 1.0588,
"step": 70
},
{
"epoch": 0.3300790330079033,
"grad_norm": 0.1783875674009323,
"learning_rate": 0.00018992424116379228,
"loss": 1.0258,
"step": 71
},
{
"epoch": 0.33472803347280333,
"grad_norm": 0.23002314567565918,
"learning_rate": 0.00018958659416570212,
"loss": 0.9455,
"step": 72
},
{
"epoch": 0.3393770339377034,
"grad_norm": 0.305070161819458,
"learning_rate": 0.00018924372953084997,
"loss": 0.9194,
"step": 73
},
{
"epoch": 0.34402603440260343,
"grad_norm": 0.3687322437763214,
"learning_rate": 0.0001888956697371813,
"loss": 0.883,
"step": 74
},
{
"epoch": 0.3486750348675035,
"grad_norm": 0.6616573333740234,
"learning_rate": 0.00018854243760323223,
"loss": 0.872,
"step": 75
},
{
"epoch": 0.35332403533240353,
"grad_norm": 0.09560154378414154,
"learning_rate": 0.0001881840562866336,
"loss": 1.0362,
"step": 76
},
{
"epoch": 0.3579730357973036,
"grad_norm": 0.09787497669458389,
"learning_rate": 0.00018782054928259277,
"loss": 1.0836,
"step": 77
},
{
"epoch": 0.36262203626220363,
"grad_norm": 0.12214916199445724,
"learning_rate": 0.0001874519404223533,
"loss": 1.0993,
"step": 78
},
{
"epoch": 0.36727103672710365,
"grad_norm": 0.1271420419216156,
"learning_rate": 0.00018707825387163248,
"loss": 1.1327,
"step": 79
},
{
"epoch": 0.3719200371920037,
"grad_norm": 0.15113487839698792,
"learning_rate": 0.00018669951412903725,
"loss": 1.1284,
"step": 80
},
{
"epoch": 0.37656903765690375,
"grad_norm": 0.1705760657787323,
"learning_rate": 0.00018631574602445792,
"loss": 1.1973,
"step": 81
},
{
"epoch": 0.3812180381218038,
"grad_norm": 0.1751885861158371,
"learning_rate": 0.0001859269747174404,
"loss": 1.1302,
"step": 82
},
{
"epoch": 0.38586703858670385,
"grad_norm": 0.23309065401554108,
"learning_rate": 0.00018553322569553682,
"loss": 1.1091,
"step": 83
},
{
"epoch": 0.3905160390516039,
"grad_norm": 0.30925455689430237,
"learning_rate": 0.0001851345247726344,
"loss": 1.0555,
"step": 84
},
{
"epoch": 0.39516503951650395,
"grad_norm": 0.41445159912109375,
"learning_rate": 0.00018473089808726336,
"loss": 0.9417,
"step": 85
},
{
"epoch": 0.399814039981404,
"grad_norm": 0.3391374945640564,
"learning_rate": 0.00018432237210088307,
"loss": 0.9262,
"step": 86
},
{
"epoch": 0.40446304044630405,
"grad_norm": 0.43099090456962585,
"learning_rate": 0.00018390897359614748,
"loss": 0.77,
"step": 87
},
{
"epoch": 0.40911204091120407,
"grad_norm": 0.3982544243335724,
"learning_rate": 0.00018349072967514896,
"loss": 0.9355,
"step": 88
},
{
"epoch": 0.41376104137610414,
"grad_norm": 0.06400130689144135,
"learning_rate": 0.00018306766775764196,
"loss": 1.0571,
"step": 89
},
{
"epoch": 0.41841004184100417,
"grad_norm": 0.0744890421628952,
"learning_rate": 0.00018263981557924483,
"loss": 1.0519,
"step": 90
},
{
"epoch": 0.42305904230590424,
"grad_norm": 0.08555030077695847,
"learning_rate": 0.00018220720118962205,
"loss": 1.1044,
"step": 91
},
{
"epoch": 0.42770804277080426,
"grad_norm": 0.09533877670764923,
"learning_rate": 0.00018176985295064487,
"loss": 1.1114,
"step": 92
},
{
"epoch": 0.43235704323570434,
"grad_norm": 0.11658069491386414,
"learning_rate": 0.00018132779953453226,
"loss": 1.1064,
"step": 93
},
{
"epoch": 0.43700604370060436,
"grad_norm": 0.13404923677444458,
"learning_rate": 0.00018088106992197091,
"loss": 1.1292,
"step": 94
},
{
"epoch": 0.44165504416550444,
"grad_norm": 0.16842801868915558,
"learning_rate": 0.00018042969340021546,
"loss": 1.1256,
"step": 95
},
{
"epoch": 0.44630404463040446,
"grad_norm": 0.24980179965496063,
"learning_rate": 0.00017997369956116845,
"loss": 1.0524,
"step": 96
},
{
"epoch": 0.4509530450953045,
"grad_norm": 0.27264076471328735,
"learning_rate": 0.00017951311829944014,
"loss": 0.9696,
"step": 97
},
{
"epoch": 0.45560204556020456,
"grad_norm": 0.33832067251205444,
"learning_rate": 0.00017904797981038874,
"loss": 0.9045,
"step": 98
},
{
"epoch": 0.4602510460251046,
"grad_norm": 0.40327176451683044,
"learning_rate": 0.00017857831458814098,
"loss": 0.8434,
"step": 99
},
{
"epoch": 0.46490004649000466,
"grad_norm": 0.7074009776115417,
"learning_rate": 0.00017810415342359257,
"loss": 0.9095,
"step": 100
},
{
"epoch": 0.4695490469549047,
"grad_norm": 0.0739617720246315,
"learning_rate": 0.00017762552740238998,
"loss": 1.0309,
"step": 101
},
{
"epoch": 0.47419804741980476,
"grad_norm": 0.0909004807472229,
"learning_rate": 0.00017714246790289214,
"loss": 1.0933,
"step": 102
},
{
"epoch": 0.4788470478847048,
"grad_norm": 0.0961037203669548,
"learning_rate": 0.0001766550065941136,
"loss": 1.0684,
"step": 103
},
{
"epoch": 0.48349604834960486,
"grad_norm": 0.11135982722043991,
"learning_rate": 0.00017616317543364804,
"loss": 1.118,
"step": 104
},
{
"epoch": 0.4881450488145049,
"grad_norm": 0.11919623613357544,
"learning_rate": 0.00017566700666557346,
"loss": 1.1175,
"step": 105
},
{
"epoch": 0.49279404927940496,
"grad_norm": 0.1320277750492096,
"learning_rate": 0.00017516653281833794,
"loss": 1.113,
"step": 106
},
{
"epoch": 0.497443049744305,
"grad_norm": 0.15232138335704803,
"learning_rate": 0.00017466178670262747,
"loss": 1.1451,
"step": 107
},
{
"epoch": 0.502092050209205,
"grad_norm": 0.17556369304656982,
"learning_rate": 0.00017415280140921463,
"loss": 1.0489,
"step": 108
},
{
"epoch": 0.506741050674105,
"grad_norm": 0.2791363596916199,
"learning_rate": 0.00017363961030678927,
"loss": 1.0767,
"step": 109
},
{
"epoch": 0.5113900511390052,
"grad_norm": 0.39919206500053406,
"learning_rate": 0.00017312224703977094,
"loss": 0.999,
"step": 110
},
{
"epoch": 0.5160390516039052,
"grad_norm": 0.41332653164863586,
"learning_rate": 0.00017260074552610306,
"loss": 0.8625,
"step": 111
},
{
"epoch": 0.5206880520688052,
"grad_norm": 0.41465967893600464,
"learning_rate": 0.00017207513995502939,
"loss": 0.8294,
"step": 112
},
{
"epoch": 0.5253370525337052,
"grad_norm": 0.35807111859321594,
"learning_rate": 0.00017154546478485264,
"loss": 0.9328,
"step": 113
},
{
"epoch": 0.5299860529986054,
"grad_norm": 0.18978600203990936,
"learning_rate": 0.0001710117547406753,
"loss": 1.0327,
"step": 114
},
{
"epoch": 0.5346350534635054,
"grad_norm": 0.16808465123176575,
"learning_rate": 0.00017047404481212314,
"loss": 1.0977,
"step": 115
},
{
"epoch": 0.5392840539284054,
"grad_norm": 0.1754007339477539,
"learning_rate": 0.0001699323702510513,
"loss": 1.063,
"step": 116
},
{
"epoch": 0.5439330543933054,
"grad_norm": 0.15331235527992249,
"learning_rate": 0.0001693867665692333,
"loss": 1.1018,
"step": 117
},
{
"epoch": 0.5485820548582054,
"grad_norm": 0.13346055150032043,
"learning_rate": 0.00016883726953603273,
"loss": 1.1215,
"step": 118
},
{
"epoch": 0.5532310553231056,
"grad_norm": 0.14829131960868835,
"learning_rate": 0.00016828391517605845,
"loss": 1.1484,
"step": 119
},
{
"epoch": 0.5578800557880056,
"grad_norm": 0.14702975749969482,
"learning_rate": 0.0001677267397668026,
"loss": 1.0535,
"step": 120
},
{
"epoch": 0.5625290562529056,
"grad_norm": 0.21014554798603058,
"learning_rate": 0.00016716577983626259,
"loss": 1.1138,
"step": 121
},
{
"epoch": 0.5671780567178056,
"grad_norm": 0.2756856679916382,
"learning_rate": 0.000166601072160546,
"loss": 0.9879,
"step": 122
},
{
"epoch": 0.5718270571827058,
"grad_norm": 0.39934659004211426,
"learning_rate": 0.0001660326537614599,
"loss": 0.9133,
"step": 123
},
{
"epoch": 0.5764760576476058,
"grad_norm": 0.37866297364234924,
"learning_rate": 0.0001654605619040835,
"loss": 0.9116,
"step": 124
},
{
"epoch": 0.5811250581125058,
"grad_norm": 0.479396253824234,
"learning_rate": 0.00016488483409432504,
"loss": 0.8499,
"step": 125
},
{
"epoch": 0.5857740585774058,
"grad_norm": 0.0876205563545227,
"learning_rate": 0.00016430550807646323,
"loss": 1.0378,
"step": 126
},
{
"epoch": 0.5904230590423059,
"grad_norm": 0.09576641768217087,
"learning_rate": 0.00016372262183067247,
"loss": 1.0617,
"step": 127
},
{
"epoch": 0.595072059507206,
"grad_norm": 0.08960004895925522,
"learning_rate": 0.00016313621357053306,
"loss": 1.0774,
"step": 128
},
{
"epoch": 0.599721059972106,
"grad_norm": 0.0907701924443245,
"learning_rate": 0.00016254632174052578,
"loss": 1.0989,
"step": 129
},
{
"epoch": 0.604370060437006,
"grad_norm": 0.09422672539949417,
"learning_rate": 0.00016195298501351177,
"loss": 1.1659,
"step": 130
},
{
"epoch": 0.609019060901906,
"grad_norm": 0.10595980286598206,
"learning_rate": 0.00016135624228819683,
"loss": 1.1642,
"step": 131
},
{
"epoch": 0.6136680613668062,
"grad_norm": 0.12369013577699661,
"learning_rate": 0.00016075613268658157,
"loss": 1.1369,
"step": 132
},
{
"epoch": 0.6183170618317062,
"grad_norm": 0.16726712882518768,
"learning_rate": 0.00016015269555139642,
"loss": 1.0458,
"step": 133
},
{
"epoch": 0.6229660622966062,
"grad_norm": 0.21234917640686035,
"learning_rate": 0.00015954597044352234,
"loss": 1.0013,
"step": 134
},
{
"epoch": 0.6276150627615062,
"grad_norm": 0.28728505969047546,
"learning_rate": 0.00015893599713939728,
"loss": 0.9075,
"step": 135
},
{
"epoch": 0.6322640632264063,
"grad_norm": 0.4090898334980011,
"learning_rate": 0.00015832281562840856,
"loss": 0.9677,
"step": 136
},
{
"epoch": 0.6369130636913064,
"grad_norm": 0.3748779296875,
"learning_rate": 0.000157706466110271,
"loss": 0.7873,
"step": 137
},
{
"epoch": 0.6415620641562064,
"grad_norm": 0.31243249773979187,
"learning_rate": 0.00015708698899239172,
"loss": 0.8241,
"step": 138
},
{
"epoch": 0.6462110646211064,
"grad_norm": 0.08351978659629822,
"learning_rate": 0.00015646442488722074,
"loss": 1.0431,
"step": 139
},
{
"epoch": 0.6508600650860065,
"grad_norm": 0.08479636162519455,
"learning_rate": 0.00015583881460958868,
"loss": 1.0725,
"step": 140
},
{
"epoch": 0.6555090655509066,
"grad_norm": 0.09441729635000229,
"learning_rate": 0.000155210199174031,
"loss": 1.076,
"step": 141
},
{
"epoch": 0.6601580660158066,
"grad_norm": 0.10794027149677277,
"learning_rate": 0.0001545786197920989,
"loss": 1.1112,
"step": 142
},
{
"epoch": 0.6648070664807066,
"grad_norm": 0.11890177428722382,
"learning_rate": 0.00015394411786965776,
"loss": 1.1748,
"step": 143
},
{
"epoch": 0.6694560669456067,
"grad_norm": 0.13239571452140808,
"learning_rate": 0.0001533067350041725,
"loss": 1.1075,
"step": 144
},
{
"epoch": 0.6741050674105067,
"grad_norm": 0.1535806506872177,
"learning_rate": 0.00015266651298198033,
"loss": 1.1259,
"step": 145
},
{
"epoch": 0.6787540678754068,
"grad_norm": 0.19703824818134308,
"learning_rate": 0.00015202349377555166,
"loss": 1.0655,
"step": 146
},
{
"epoch": 0.6834030683403068,
"grad_norm": 0.2627493441104889,
"learning_rate": 0.00015137771954073804,
"loss": 0.9644,
"step": 147
},
{
"epoch": 0.6880520688052069,
"grad_norm": 0.3154362142086029,
"learning_rate": 0.0001507292326140085,
"loss": 0.8241,
"step": 148
},
{
"epoch": 0.6927010692701069,
"grad_norm": 0.3660978078842163,
"learning_rate": 0.0001500780755096743,
"loss": 0.9218,
"step": 149
},
{
"epoch": 0.697350069735007,
"grad_norm": 0.4281309247016907,
"learning_rate": 0.00014942429091710141,
"loss": 0.7471,
"step": 150
},
{
"epoch": 0.701999070199907,
"grad_norm": 0.058116402477025986,
"learning_rate": 0.00014876792169791193,
"loss": 1.0336,
"step": 151
},
{
"epoch": 0.7066480706648071,
"grad_norm": 0.06079982966184616,
"learning_rate": 0.00014810901088317414,
"loss": 1.0446,
"step": 152
},
{
"epoch": 0.7112970711297071,
"grad_norm": 0.06905698031187057,
"learning_rate": 0.00014744760167058137,
"loss": 1.0841,
"step": 153
},
{
"epoch": 0.7159460715946072,
"grad_norm": 0.07570036500692368,
"learning_rate": 0.00014678373742162007,
"loss": 1.0895,
"step": 154
},
{
"epoch": 0.7205950720595072,
"grad_norm": 0.08454253524541855,
"learning_rate": 0.00014611746165872698,
"loss": 1.1083,
"step": 155
},
{
"epoch": 0.7252440725244073,
"grad_norm": 0.10023923218250275,
"learning_rate": 0.00014544881806243583,
"loss": 1.0951,
"step": 156
},
{
"epoch": 0.7298930729893073,
"grad_norm": 0.12155482918024063,
"learning_rate": 0.00014477785046851385,
"loss": 1.092,
"step": 157
},
{
"epoch": 0.7345420734542073,
"grad_norm": 0.15077327191829681,
"learning_rate": 0.00014410460286508762,
"loss": 1.0582,
"step": 158
},
{
"epoch": 0.7391910739191074,
"grad_norm": 0.19081415235996246,
"learning_rate": 0.00014342911938975948,
"loss": 1.0055,
"step": 159
},
{
"epoch": 0.7438400743840075,
"grad_norm": 0.24974983930587769,
"learning_rate": 0.0001427514443267139,
"loss": 0.9043,
"step": 160
},
{
"epoch": 0.7484890748489075,
"grad_norm": 0.33478638529777527,
"learning_rate": 0.00014207162210381404,
"loss": 0.8285,
"step": 161
},
{
"epoch": 0.7531380753138075,
"grad_norm": 0.34924110770225525,
"learning_rate": 0.0001413896972896894,
"loss": 0.774,
"step": 162
},
{
"epoch": 0.7577870757787076,
"grad_norm": 0.2992526888847351,
"learning_rate": 0.00014070571459081366,
"loss": 0.8779,
"step": 163
},
{
"epoch": 0.7624360762436077,
"grad_norm": 0.07906091213226318,
"learning_rate": 0.0001400197188485739,
"loss": 1.047,
"step": 164
},
{
"epoch": 0.7670850767085077,
"grad_norm": 0.09800871461629868,
"learning_rate": 0.00013933175503633068,
"loss": 1.0439,
"step": 165
},
{
"epoch": 0.7717340771734077,
"grad_norm": 0.09857136756181717,
"learning_rate": 0.00013864186825646995,
"loss": 1.0522,
"step": 166
},
{
"epoch": 0.7763830776383077,
"grad_norm": 0.1110193282365799,
"learning_rate": 0.00013795010373744582,
"loss": 1.1126,
"step": 167
},
{
"epoch": 0.7810320781032078,
"grad_norm": 0.11846626549959183,
"learning_rate": 0.00013725650683081556,
"loss": 1.0925,
"step": 168
},
{
"epoch": 0.7856810785681079,
"grad_norm": 0.14214776456356049,
"learning_rate": 0.00013656112300826646,
"loss": 1.1323,
"step": 169
},
{
"epoch": 0.7903300790330079,
"grad_norm": 0.1606011986732483,
"learning_rate": 0.00013586399785863454,
"loss": 1.0505,
"step": 170
},
{
"epoch": 0.7949790794979079,
"grad_norm": 0.18738731741905212,
"learning_rate": 0.000135165177084916,
"loss": 1.0334,
"step": 171
},
{
"epoch": 0.799628079962808,
"grad_norm": 0.23303773999214172,
"learning_rate": 0.0001344647065012709,
"loss": 0.9471,
"step": 172
},
{
"epoch": 0.8042770804277081,
"grad_norm": 0.27448582649230957,
"learning_rate": 0.00013376263203001938,
"loss": 0.8672,
"step": 173
},
{
"epoch": 0.8089260808926081,
"grad_norm": 0.29609808325767517,
"learning_rate": 0.0001330589996986315,
"loss": 0.7936,
"step": 174
},
{
"epoch": 0.8135750813575081,
"grad_norm": 0.43915602564811707,
"learning_rate": 0.00013235385563670934,
"loss": 0.7688,
"step": 175
},
{
"epoch": 0.8182240818224081,
"grad_norm": 0.06484824419021606,
"learning_rate": 0.00013164724607296285,
"loss": 1.0403,
"step": 176
},
{
"epoch": 0.8228730822873083,
"grad_norm": 0.07361859828233719,
"learning_rate": 0.00013093921733217916,
"loss": 1.0539,
"step": 177
},
{
"epoch": 0.8275220827522083,
"grad_norm": 0.08026642352342606,
"learning_rate": 0.00013022981583218565,
"loss": 1.0596,
"step": 178
},
{
"epoch": 0.8321710832171083,
"grad_norm": 0.08283592760562897,
"learning_rate": 0.0001295190880808067,
"loss": 1.0435,
"step": 179
},
{
"epoch": 0.8368200836820083,
"grad_norm": 0.11680889129638672,
"learning_rate": 0.00012880708067281477,
"loss": 1.1464,
"step": 180
},
{
"epoch": 0.8414690841469085,
"grad_norm": 0.10784471035003662,
"learning_rate": 0.00012809384028687553,
"loss": 1.1004,
"step": 181
},
{
"epoch": 0.8461180846118085,
"grad_norm": 0.1224328875541687,
"learning_rate": 0.00012737941368248792,
"loss": 1.0699,
"step": 182
},
{
"epoch": 0.8507670850767085,
"grad_norm": 0.15732775628566742,
"learning_rate": 0.0001266638476969183,
"loss": 1.0579,
"step": 183
},
{
"epoch": 0.8554160855416085,
"grad_norm": 0.18987177312374115,
"learning_rate": 0.00012594718924213008,
"loss": 1.0212,
"step": 184
},
{
"epoch": 0.8600650860065086,
"grad_norm": 0.250615656375885,
"learning_rate": 0.00012522948530170806,
"loss": 0.9817,
"step": 185
},
{
"epoch": 0.8647140864714087,
"grad_norm": 0.28331658244132996,
"learning_rate": 0.00012451078292777837,
"loss": 0.8079,
"step": 186
},
{
"epoch": 0.8693630869363087,
"grad_norm": 0.3439493477344513,
"learning_rate": 0.0001237911292379237,
"loss": 0.7382,
"step": 187
},
{
"epoch": 0.8740120874012087,
"grad_norm": 0.35732191801071167,
"learning_rate": 0.00012307057141209415,
"loss": 0.9792,
"step": 188
},
{
"epoch": 0.8786610878661087,
"grad_norm": 0.10152771323919296,
"learning_rate": 0.0001223491566895144,
"loss": 1.0674,
"step": 189
},
{
"epoch": 0.8833100883310089,
"grad_norm": 0.11470180004835129,
"learning_rate": 0.00012162693236558658,
"loss": 1.0276,
"step": 190
},
{
"epoch": 0.8879590887959089,
"grad_norm": 0.1217978298664093,
"learning_rate": 0.00012090394578878974,
"loss": 1.0734,
"step": 191
},
{
"epoch": 0.8926080892608089,
"grad_norm": 0.11547485738992691,
"learning_rate": 0.0001201802443575756,
"loss": 1.0862,
"step": 192
},
{
"epoch": 0.897257089725709,
"grad_norm": 0.13204635679721832,
"learning_rate": 0.00011945587551726116,
"loss": 1.1112,
"step": 193
},
{
"epoch": 0.901906090190609,
"grad_norm": 0.12314517050981522,
"learning_rate": 0.00011873088675691835,
"loss": 1.1342,
"step": 194
},
{
"epoch": 0.9065550906555091,
"grad_norm": 0.14891035854816437,
"learning_rate": 0.00011800532560626048,
"loss": 1.0975,
"step": 195
},
{
"epoch": 0.9112040911204091,
"grad_norm": 0.17384392023086548,
"learning_rate": 0.0001172792396325264,
"loss": 1.0599,
"step": 196
},
{
"epoch": 0.9158530915853091,
"grad_norm": 0.22434796392917633,
"learning_rate": 0.00011655267643736194,
"loss": 1.0202,
"step": 197
},
{
"epoch": 0.9205020920502092,
"grad_norm": 0.26256677508354187,
"learning_rate": 0.00011582568365369924,
"loss": 0.9057,
"step": 198
},
{
"epoch": 0.9251510925151093,
"grad_norm": 0.30824849009513855,
"learning_rate": 0.00011509830894263387,
"loss": 0.8073,
"step": 199
},
{
"epoch": 0.9298000929800093,
"grad_norm": 0.4767857789993286,
"learning_rate": 0.00011437059999030035,
"loss": 0.806,
"step": 200
},
{
"epoch": 0.9344490934449093,
"grad_norm": 0.05803222209215164,
"learning_rate": 0.00011364260450474575,
"loss": 1.0481,
"step": 201
},
{
"epoch": 0.9390980939098094,
"grad_norm": 0.06460921466350555,
"learning_rate": 0.00011291437021280205,
"loss": 1.0617,
"step": 202
},
{
"epoch": 0.9437470943747094,
"grad_norm": 0.07504323869943619,
"learning_rate": 0.0001121859448569572,
"loss": 1.0995,
"step": 203
},
{
"epoch": 0.9483960948396095,
"grad_norm": 0.07197124511003494,
"learning_rate": 0.00011145737619222516,
"loss": 1.0629,
"step": 204
},
{
"epoch": 0.9530450953045095,
"grad_norm": 0.08467988669872284,
"learning_rate": 0.0001107287119830151,
"loss": 1.0961,
"step": 205
},
{
"epoch": 0.9576940957694096,
"grad_norm": 0.10283592343330383,
"learning_rate": 0.00011000000000000002,
"loss": 1.1351,
"step": 206
},
{
"epoch": 0.9623430962343096,
"grad_norm": 0.11667031794786453,
"learning_rate": 0.00010927128801698494,
"loss": 1.0459,
"step": 207
},
{
"epoch": 0.9669920966992097,
"grad_norm": 0.1426560878753662,
"learning_rate": 0.00010854262380777486,
"loss": 1.0367,
"step": 208
},
{
"epoch": 0.9716410971641097,
"grad_norm": 0.18659153580665588,
"learning_rate": 0.00010781405514304284,
"loss": 0.9566,
"step": 209
},
{
"epoch": 0.9762900976290098,
"grad_norm": 0.2517950236797333,
"learning_rate": 0.000107085629787198,
"loss": 0.9468,
"step": 210
},
{
"epoch": 0.9809390980939098,
"grad_norm": 0.27402400970458984,
"learning_rate": 0.0001063573954952543,
"loss": 0.8093,
"step": 211
},
{
"epoch": 0.9855880985588099,
"grad_norm": 0.339691698551178,
"learning_rate": 0.0001056294000096997,
"loss": 0.7326,
"step": 212
},
{
"epoch": 0.9902370990237099,
"grad_norm": 0.36932969093322754,
"learning_rate": 0.00010490169105736613,
"loss": 0.9435,
"step": 213
},
{
"epoch": 0.99488609948861,
"grad_norm": 0.09353512525558472,
"learning_rate": 0.0001041743163463008,
"loss": 1.1068,
"step": 214
},
{
"epoch": 0.99953509995351,
"grad_norm": 0.249754399061203,
"learning_rate": 0.00010344732356263808,
"loss": 0.8395,
"step": 215
},
{
"epoch": 1.00418410041841,
"grad_norm": 0.0985850915312767,
"learning_rate": 0.00010272076036747365,
"loss": 0.9939,
"step": 216
},
{
"epoch": 1.00883310088331,
"grad_norm": 0.06556614488363266,
"learning_rate": 0.00010199467439373956,
"loss": 1.0809,
"step": 217
},
{
"epoch": 1.01348210134821,
"grad_norm": 0.07044567912817001,
"learning_rate": 0.00010126911324308168,
"loss": 1.0733,
"step": 218
},
{
"epoch": 1.0181311018131103,
"grad_norm": 0.08298351615667343,
"learning_rate": 0.00010054412448273886,
"loss": 1.0562,
"step": 219
},
{
"epoch": 1.0227801022780103,
"grad_norm": 0.09130789339542389,
"learning_rate": 9.981975564242443e-05,
"loss": 1.0767,
"step": 220
},
{
"epoch": 1.0274291027429103,
"grad_norm": 0.10151571035385132,
"learning_rate": 9.909605421121028e-05,
"loss": 1.1066,
"step": 221
},
{
"epoch": 1.0320781032078103,
"grad_norm": 0.11376982927322388,
"learning_rate": 9.837306763441345e-05,
"loss": 1.0909,
"step": 222
},
{
"epoch": 1.0367271036727104,
"grad_norm": 0.14317406713962555,
"learning_rate": 9.765084331048567e-05,
"loss": 1.0539,
"step": 223
},
{
"epoch": 1.0413761041376104,
"grad_norm": 0.183615580201149,
"learning_rate": 9.692942858790591e-05,
"loss": 0.9804,
"step": 224
},
{
"epoch": 1.0460251046025104,
"grad_norm": 0.2267308384180069,
"learning_rate": 9.620887076207632e-05,
"loss": 0.9,
"step": 225
},
{
"epoch": 1.0506741050674104,
"grad_norm": 0.25844672322273254,
"learning_rate": 9.548921707222163e-05,
"loss": 0.7342,
"step": 226
},
{
"epoch": 1.0553231055323105,
"grad_norm": 0.35192665457725525,
"learning_rate": 9.477051469829196e-05,
"loss": 0.7048,
"step": 227
},
{
"epoch": 1.0599721059972107,
"grad_norm": 0.28442806005477905,
"learning_rate": 9.405281075786995e-05,
"loss": 0.7852,
"step": 228
},
{
"epoch": 1.0646211064621107,
"grad_norm": 0.06501670181751251,
"learning_rate": 9.333615230308173e-05,
"loss": 1.0592,
"step": 229
},
{
"epoch": 1.0692701069270107,
"grad_norm": 0.06888816505670547,
"learning_rate": 9.26205863175121e-05,
"loss": 1.0582,
"step": 230
},
{
"epoch": 1.0739191073919108,
"grad_norm": 0.07344524562358856,
"learning_rate": 9.190615971312446e-05,
"loss": 1.0434,
"step": 231
},
{
"epoch": 1.0785681078568108,
"grad_norm": 0.08665701746940613,
"learning_rate": 9.119291932718525e-05,
"loss": 1.0843,
"step": 232
},
{
"epoch": 1.0832171083217108,
"grad_norm": 0.09859387576580048,
"learning_rate": 9.048091191919332e-05,
"loss": 1.0871,
"step": 233
},
{
"epoch": 1.0878661087866108,
"grad_norm": 0.11464710533618927,
"learning_rate": 8.97701841678144e-05,
"loss": 1.0692,
"step": 234
},
{
"epoch": 1.0925151092515109,
"grad_norm": 0.14108416438102722,
"learning_rate": 8.906078266782087e-05,
"loss": 1.0655,
"step": 235
},
{
"epoch": 1.0971641097164109,
"grad_norm": 0.1800997108221054,
"learning_rate": 8.835275392703721e-05,
"loss": 1.0205,
"step": 236
},
{
"epoch": 1.1018131101813111,
"grad_norm": 0.2220492660999298,
"learning_rate": 8.764614436329066e-05,
"loss": 0.9219,
"step": 237
},
{
"epoch": 1.1064621106462111,
"grad_norm": 0.2564367353916168,
"learning_rate": 8.694100030136849e-05,
"loss": 0.7692,
"step": 238
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.2938039004802704,
"learning_rate": 8.623736796998063e-05,
"loss": 0.7331,
"step": 239
},
{
"epoch": 1.1157601115760112,
"grad_norm": 0.37189003825187683,
"learning_rate": 8.553529349872916e-05,
"loss": 0.5737,
"step": 240
},
{
"epoch": 1.1204091120409112,
"grad_norm": 0.21121616661548615,
"learning_rate": 8.4834822915084e-05,
"loss": 1.0279,
"step": 241
},
{
"epoch": 1.1250581125058112,
"grad_norm": 0.08440238237380981,
"learning_rate": 8.413600214136548e-05,
"loss": 1.039,
"step": 242
},
{
"epoch": 1.1297071129707112,
"grad_norm": 0.08833085745573044,
"learning_rate": 8.343887699173356e-05,
"loss": 1.065,
"step": 243
},
{
"epoch": 1.1343561134356113,
"grad_norm": 0.09478267282247543,
"learning_rate": 8.274349316918446e-05,
"loss": 1.0549,
"step": 244
},
{
"epoch": 1.1390051139005113,
"grad_norm": 0.1044965535402298,
"learning_rate": 8.204989626255422e-05,
"loss": 1.1372,
"step": 245
},
{
"epoch": 1.1436541143654115,
"grad_norm": 0.13615119457244873,
"learning_rate": 8.135813174353008e-05,
"loss": 1.1241,
"step": 246
},
{
"epoch": 1.1483031148303116,
"grad_norm": 0.1386057436466217,
"learning_rate": 8.066824496366937e-05,
"loss": 1.1088,
"step": 247
},
{
"epoch": 1.1529521152952116,
"grad_norm": 0.16772480309009552,
"learning_rate": 7.998028115142617e-05,
"loss": 1.0241,
"step": 248
},
{
"epoch": 1.1576011157601116,
"grad_norm": 0.2133951038122177,
"learning_rate": 7.929428540918635e-05,
"loss": 0.9239,
"step": 249
},
{
"epoch": 1.1622501162250116,
"grad_norm": 0.2830871343612671,
"learning_rate": 7.86103027103106e-05,
"loss": 0.8394,
"step": 250
},
{
"epoch": 1.1668991166899116,
"grad_norm": 0.29441362619400024,
"learning_rate": 7.792837789618595e-05,
"loss": 0.7265,
"step": 251
},
{
"epoch": 1.1715481171548117,
"grad_norm": 0.33453473448753357,
"learning_rate": 7.724855567328613e-05,
"loss": 0.6728,
"step": 252
},
{
"epoch": 1.1761971176197117,
"grad_norm": 0.3466179370880127,
"learning_rate": 7.657088061024054e-05,
"loss": 0.7098,
"step": 253
},
{
"epoch": 1.1808461180846117,
"grad_norm": 0.0637335404753685,
"learning_rate": 7.58953971349124e-05,
"loss": 1.0493,
"step": 254
},
{
"epoch": 1.185495118549512,
"grad_norm": 0.07033415138721466,
"learning_rate": 7.522214953148618e-05,
"loss": 1.04,
"step": 255
},
{
"epoch": 1.190144119014412,
"grad_norm": 0.07706312835216522,
"learning_rate": 7.455118193756419e-05,
"loss": 1.0632,
"step": 256
},
{
"epoch": 1.194793119479312,
"grad_norm": 0.09520713984966278,
"learning_rate": 7.388253834127306e-05,
"loss": 1.1191,
"step": 257
},
{
"epoch": 1.199442119944212,
"grad_norm": 0.10701565444469452,
"learning_rate": 7.321626257837996e-05,
"loss": 1.0896,
"step": 258
},
{
"epoch": 1.204091120409112,
"grad_norm": 0.11862656474113464,
"learning_rate": 7.255239832941866e-05,
"loss": 1.0893,
"step": 259
},
{
"epoch": 1.208740120874012,
"grad_norm": 0.1468852460384369,
"learning_rate": 7.189098911682592e-05,
"loss": 1.0755,
"step": 260
},
{
"epoch": 1.213389121338912,
"grad_norm": 0.18458393216133118,
"learning_rate": 7.123207830208806e-05,
"loss": 0.951,
"step": 261
},
{
"epoch": 1.218038121803812,
"grad_norm": 0.22184807062149048,
"learning_rate": 7.05757090828986e-05,
"loss": 0.8779,
"step": 262
},
{
"epoch": 1.2226871222687121,
"grad_norm": 0.3002167344093323,
"learning_rate": 6.992192449032571e-05,
"loss": 0.8529,
"step": 263
},
{
"epoch": 1.2273361227336124,
"grad_norm": 0.3212537169456482,
"learning_rate": 6.927076738599152e-05,
"loss": 0.6355,
"step": 264
},
{
"epoch": 1.2319851231985124,
"grad_norm": 0.3631160259246826,
"learning_rate": 6.862228045926202e-05,
"loss": 0.4947,
"step": 265
},
{
"epoch": 1.2366341236634124,
"grad_norm": 0.19289681315422058,
"learning_rate": 6.797650622444836e-05,
"loss": 0.9966,
"step": 266
},
{
"epoch": 1.2412831241283124,
"grad_norm": 0.07654353976249695,
"learning_rate": 6.733348701801969e-05,
"loss": 1.0453,
"step": 267
},
{
"epoch": 1.2459321245932125,
"grad_norm": 0.08304441720247269,
"learning_rate": 6.669326499582755e-05,
"loss": 1.0742,
"step": 268
},
{
"epoch": 1.2505811250581125,
"grad_norm": 0.0931682139635086,
"learning_rate": 6.605588213034227e-05,
"loss": 1.0973,
"step": 269
},
{
"epoch": 1.2552301255230125,
"grad_norm": 0.10273009538650513,
"learning_rate": 6.542138020790116e-05,
"loss": 1.0871,
"step": 270
},
{
"epoch": 1.2598791259879125,
"grad_norm": 0.1231117695569992,
"learning_rate": 6.478980082596905e-05,
"loss": 1.1266,
"step": 271
},
{
"epoch": 1.2645281264528125,
"grad_norm": 0.13809573650360107,
"learning_rate": 6.416118539041135e-05,
"loss": 1.0663,
"step": 272
},
{
"epoch": 1.2691771269177128,
"grad_norm": 0.17248062789440155,
"learning_rate": 6.353557511277928e-05,
"loss": 1.0427,
"step": 273
},
{
"epoch": 1.2738261273826128,
"grad_norm": 0.22152015566825867,
"learning_rate": 6.291301100760829e-05,
"loss": 0.9226,
"step": 274
},
{
"epoch": 1.2784751278475128,
"grad_norm": 0.2788388729095459,
"learning_rate": 6.229353388972901e-05,
"loss": 0.8773,
"step": 275
},
{
"epoch": 1.2831241283124128,
"grad_norm": 0.30847859382629395,
"learning_rate": 6.167718437159147e-05,
"loss": 0.7012,
"step": 276
},
{
"epoch": 1.2877731287773129,
"grad_norm": 0.3570963442325592,
"learning_rate": 6.106400286060274e-05,
"loss": 0.6076,
"step": 277
},
{
"epoch": 1.292422129242213,
"grad_norm": 0.3086980879306793,
"learning_rate": 6.045402955647769e-05,
"loss": 0.7205,
"step": 278
},
{
"epoch": 1.297071129707113,
"grad_norm": 0.0752180814743042,
"learning_rate": 5.98473044486036e-05,
"loss": 1.0282,
"step": 279
},
{
"epoch": 1.301720130172013,
"grad_norm": 0.07571510970592499,
"learning_rate": 5.924386731341842e-05,
"loss": 1.0712,
"step": 280
},
{
"epoch": 1.306369130636913,
"grad_norm": 0.08388427644968033,
"learning_rate": 5.864375771180317e-05,
"loss": 1.0425,
"step": 281
},
{
"epoch": 1.3110181311018132,
"grad_norm": 0.0906265452504158,
"learning_rate": 5.804701498648828e-05,
"loss": 1.0463,
"step": 282
},
{
"epoch": 1.3156671315667132,
"grad_norm": 0.10397284477949142,
"learning_rate": 5.7453678259474234e-05,
"loss": 1.0913,
"step": 283
},
{
"epoch": 1.3203161320316132,
"grad_norm": 0.12175474315881729,
"learning_rate": 5.686378642946699e-05,
"loss": 1.0896,
"step": 284
},
{
"epoch": 1.3249651324965133,
"grad_norm": 0.14317680895328522,
"learning_rate": 5.627737816932754e-05,
"loss": 1.0503,
"step": 285
},
{
"epoch": 1.3296141329614133,
"grad_norm": 0.19962027668952942,
"learning_rate": 5.569449192353678e-05,
"loss": 1.0168,
"step": 286
},
{
"epoch": 1.3342631334263133,
"grad_norm": 0.253571093082428,
"learning_rate": 5.511516590567499e-05,
"loss": 0.8714,
"step": 287
},
{
"epoch": 1.3389121338912133,
"grad_norm": 0.2986457049846649,
"learning_rate": 5.453943809591654e-05,
"loss": 0.7573,
"step": 288
},
{
"epoch": 1.3435611343561136,
"grad_norm": 0.37284067273139954,
"learning_rate": 5.396734623854012e-05,
"loss": 0.6708,
"step": 289
},
{
"epoch": 1.3482101348210134,
"grad_norm": 0.4108807444572449,
"learning_rate": 5.3398927839453996e-05,
"loss": 0.549,
"step": 290
},
{
"epoch": 1.3528591352859136,
"grad_norm": 0.21761813759803772,
"learning_rate": 5.283422016373745e-05,
"loss": 0.9454,
"step": 291
},
{
"epoch": 1.3575081357508136,
"grad_norm": 0.07017389684915543,
"learning_rate": 5.227326023319743e-05,
"loss": 1.06,
"step": 292
},
{
"epoch": 1.3621571362157137,
"grad_norm": 0.07673922181129456,
"learning_rate": 5.17160848239416e-05,
"loss": 1.061,
"step": 293
},
{
"epoch": 1.3668061366806137,
"grad_norm": 0.08843539655208588,
"learning_rate": 5.1162730463967304e-05,
"loss": 1.0668,
"step": 294
},
{
"epoch": 1.3714551371455137,
"grad_norm": 0.10018595308065414,
"learning_rate": 5.061323343076672e-05,
"loss": 1.0926,
"step": 295
},
{
"epoch": 1.3761041376104137,
"grad_norm": 0.11475757509469986,
"learning_rate": 5.006762974894872e-05,
"loss": 1.0878,
"step": 296
},
{
"epoch": 1.3807531380753137,
"grad_norm": 0.14695028960704803,
"learning_rate": 4.9525955187876885e-05,
"loss": 1.088,
"step": 297
},
{
"epoch": 1.385402138540214,
"grad_norm": 0.18600180745124817,
"learning_rate": 4.898824525932471e-05,
"loss": 1.0019,
"step": 298
},
{
"epoch": 1.3900511390051138,
"grad_norm": 0.21412943303585052,
"learning_rate": 4.845453521514738e-05,
"loss": 0.9161,
"step": 299
},
{
"epoch": 1.394700139470014,
"grad_norm": 0.2769903838634491,
"learning_rate": 4.7924860044970615e-05,
"loss": 0.8767,
"step": 300
},
{
"epoch": 1.399349139934914,
"grad_norm": 0.37584012746810913,
"learning_rate": 4.739925447389698e-05,
"loss": 0.8159,
"step": 301
},
{
"epoch": 1.403998140399814,
"grad_norm": 0.35033103823661804,
"learning_rate": 4.687775296022908e-05,
"loss": 0.5912,
"step": 302
},
{
"epoch": 1.408647140864714,
"grad_norm": 0.3620702624320984,
"learning_rate": 4.6360389693210735e-05,
"loss": 0.7819,
"step": 303
},
{
"epoch": 1.4132961413296141,
"grad_norm": 0.06753652542829514,
"learning_rate": 4.5847198590785394e-05,
"loss": 1.023,
"step": 304
},
{
"epoch": 1.4179451417945141,
"grad_norm": 0.07456682622432709,
"learning_rate": 4.5338213297372534e-05,
"loss": 1.0321,
"step": 305
},
{
"epoch": 1.4225941422594142,
"grad_norm": 0.08220840245485306,
"learning_rate": 4.4833467181662086e-05,
"loss": 1.0518,
"step": 306
},
{
"epoch": 1.4272431427243144,
"grad_norm": 0.09785965085029602,
"learning_rate": 4.4332993334426576e-05,
"loss": 1.0736,
"step": 307
},
{
"epoch": 1.4318921431892142,
"grad_norm": 0.10714032500982285,
"learning_rate": 4.383682456635199e-05,
"loss": 1.0766,
"step": 308
},
{
"epoch": 1.4365411436541144,
"grad_norm": 0.12374605983495712,
"learning_rate": 4.3344993405886425e-05,
"loss": 1.1013,
"step": 309
},
{
"epoch": 1.4411901441190145,
"grad_norm": 0.15557971596717834,
"learning_rate": 4.285753209710786e-05,
"loss": 1.0673,
"step": 310
},
{
"epoch": 1.4458391445839145,
"grad_norm": 0.1834760308265686,
"learning_rate": 4.2374472597610044e-05,
"loss": 0.9611,
"step": 311
},
{
"epoch": 1.4504881450488145,
"grad_norm": 0.24258282780647278,
"learning_rate": 4.1895846576407424e-05,
"loss": 0.9162,
"step": 312
},
{
"epoch": 1.4551371455137145,
"grad_norm": 0.31023555994033813,
"learning_rate": 4.1421685411859046e-05,
"loss": 0.8321,
"step": 313
},
{
"epoch": 1.4597861459786146,
"grad_norm": 0.3393552601337433,
"learning_rate": 4.095202018961125e-05,
"loss": 0.6979,
"step": 314
},
{
"epoch": 1.4644351464435146,
"grad_norm": 0.3873143792152405,
"learning_rate": 4.048688170055989e-05,
"loss": 0.5101,
"step": 315
},
{
"epoch": 1.4690841469084148,
"grad_norm": 0.20240376889705658,
"learning_rate": 4.002630043883159e-05,
"loss": 1.0254,
"step": 316
},
{
"epoch": 1.4737331473733146,
"grad_norm": 0.08021257072687149,
"learning_rate": 3.9570306599784544e-05,
"loss": 1.033,
"step": 317
},
{
"epoch": 1.4783821478382149,
"grad_norm": 0.079825259745121,
"learning_rate": 3.911893007802913e-05,
"loss": 1.0105,
"step": 318
},
{
"epoch": 1.4830311483031149,
"grad_norm": 0.09591138362884521,
"learning_rate": 3.8672200465467765e-05,
"loss": 1.0867,
"step": 319
},
{
"epoch": 1.487680148768015,
"grad_norm": 0.10192592442035675,
"learning_rate": 3.8230147049355147e-05,
"loss": 1.1238,
"step": 320
},
{
"epoch": 1.492329149232915,
"grad_norm": 0.11998689919710159,
"learning_rate": 3.779279881037797e-05,
"loss": 1.099,
"step": 321
},
{
"epoch": 1.496978149697815,
"grad_norm": 0.14268672466278076,
"learning_rate": 3.7360184420755165e-05,
"loss": 1.0832,
"step": 322
},
{
"epoch": 1.501627150162715,
"grad_norm": 0.1779240220785141,
"learning_rate": 3.693233224235806e-05,
"loss": 1.0322,
"step": 323
},
{
"epoch": 1.506276150627615,
"grad_norm": 0.23235994577407837,
"learning_rate": 3.650927032485101e-05,
"loss": 0.987,
"step": 324
},
{
"epoch": 1.5109251510925152,
"grad_norm": 0.2786919176578522,
"learning_rate": 3.609102640385254e-05,
"loss": 0.7974,
"step": 325
},
{
"epoch": 1.515574151557415,
"grad_norm": 0.3645856976509094,
"learning_rate": 3.567762789911693e-05,
"loss": 0.7208,
"step": 326
},
{
"epoch": 1.5202231520223153,
"grad_norm": 0.4197288751602173,
"learning_rate": 3.526910191273665e-05,
"loss": 0.5941,
"step": 327
},
{
"epoch": 1.524872152487215,
"grad_norm": 0.43531715869903564,
"learning_rate": 3.486547522736562e-05,
"loss": 0.8169,
"step": 328
},
{
"epoch": 1.5295211529521153,
"grad_norm": 0.07292664051055908,
"learning_rate": 3.44667743044632e-05,
"loss": 1.026,
"step": 329
},
{
"epoch": 1.5341701534170153,
"grad_norm": 0.07665824145078659,
"learning_rate": 3.407302528255961e-05,
"loss": 1.0711,
"step": 330
},
{
"epoch": 1.5388191538819154,
"grad_norm": 0.09035459905862808,
"learning_rate": 3.36842539755421e-05,
"loss": 1.058,
"step": 331
},
{
"epoch": 1.5434681543468154,
"grad_norm": 0.10302021354436874,
"learning_rate": 3.3300485870962776e-05,
"loss": 1.0717,
"step": 332
},
{
"epoch": 1.5481171548117154,
"grad_norm": 0.11944916844367981,
"learning_rate": 3.292174612836757e-05,
"loss": 1.1738,
"step": 333
},
{
"epoch": 1.5527661552766157,
"grad_norm": 0.1299082338809967,
"learning_rate": 3.254805957764673e-05,
"loss": 1.1078,
"step": 334
},
{
"epoch": 1.5574151557415155,
"grad_norm": 0.15316098928451538,
"learning_rate": 3.217945071740724e-05,
"loss": 1.0111,
"step": 335
},
{
"epoch": 1.5620641562064157,
"grad_norm": 0.19828684628009796,
"learning_rate": 3.1815943713366404e-05,
"loss": 1.0292,
"step": 336
},
{
"epoch": 1.5667131566713157,
"grad_norm": 0.2431751936674118,
"learning_rate": 3.145756239676779e-05,
"loss": 0.867,
"step": 337
},
{
"epoch": 1.5713621571362157,
"grad_norm": 0.3115411698818207,
"learning_rate": 3.110433026281872e-05,
"loss": 0.7307,
"step": 338
},
{
"epoch": 1.5760111576011158,
"grad_norm": 0.37397709488868713,
"learning_rate": 3.075627046915003e-05,
"loss": 0.6894,
"step": 339
},
{
"epoch": 1.5806601580660158,
"grad_norm": 0.4691689610481262,
"learning_rate": 3.041340583429789e-05,
"loss": 0.6568,
"step": 340
},
{
"epoch": 1.5853091585309158,
"grad_norm": 0.27498671412467957,
"learning_rate": 3.0075758836207716e-05,
"loss": 1.0342,
"step": 341
},
{
"epoch": 1.5899581589958158,
"grad_norm": 0.07807187736034393,
"learning_rate": 2.9743351610760716e-05,
"loss": 1.0501,
"step": 342
},
{
"epoch": 1.594607159460716,
"grad_norm": 0.08518065512180328,
"learning_rate": 2.941620595032246e-05,
"loss": 1.0657,
"step": 343
},
{
"epoch": 1.5992561599256159,
"grad_norm": 0.09096917510032654,
"learning_rate": 2.9094343302314432e-05,
"loss": 1.0534,
"step": 344
},
{
"epoch": 1.6039051603905161,
"grad_norm": 0.10106653720140457,
"learning_rate": 2.8777784767807727e-05,
"loss": 1.0949,
"step": 345
},
{
"epoch": 1.6085541608554161,
"grad_norm": 0.11770515143871307,
"learning_rate": 2.846655110013978e-05,
"loss": 1.1013,
"step": 346
},
{
"epoch": 1.6132031613203162,
"grad_norm": 0.14044031500816345,
"learning_rate": 2.816066270355391e-05,
"loss": 1.0657,
"step": 347
},
{
"epoch": 1.6178521617852162,
"grad_norm": 0.1705469787120819,
"learning_rate": 2.78601396318614e-05,
"loss": 1.0108,
"step": 348
},
{
"epoch": 1.6225011622501162,
"grad_norm": 0.23262286186218262,
"learning_rate": 2.7565001587126922e-05,
"loss": 0.9104,
"step": 349
},
{
"epoch": 1.6271501627150162,
"grad_norm": 0.3084715008735657,
"learning_rate": 2.7275267918376912e-05,
"loss": 0.8493,
"step": 350
},
{
"epoch": 1.6317991631799162,
"grad_norm": 0.34869176149368286,
"learning_rate": 2.6990957620330954e-05,
"loss": 0.7103,
"step": 351
},
{
"epoch": 1.6364481636448165,
"grad_norm": 0.40754231810569763,
"learning_rate": 2.6712089332156633e-05,
"loss": 0.606,
"step": 352
},
{
"epoch": 1.6410971641097163,
"grad_norm": 0.38754644989967346,
"learning_rate": 2.6438681336247417e-05,
"loss": 0.7465,
"step": 353
},
{
"epoch": 1.6457461645746165,
"grad_norm": 0.07341364026069641,
"learning_rate": 2.6170751557024197e-05,
"loss": 1.0565,
"step": 354
},
{
"epoch": 1.6503951650395166,
"grad_norm": 0.08013252913951874,
"learning_rate": 2.5908317559760138e-05,
"loss": 1.0146,
"step": 355
},
{
"epoch": 1.6550441655044166,
"grad_norm": 0.09281094372272491,
"learning_rate": 2.5651396549429086e-05,
"loss": 1.0903,
"step": 356
},
{
"epoch": 1.6596931659693166,
"grad_norm": 0.09770162403583527,
"learning_rate": 2.540000536957765e-05,
"loss": 1.1007,
"step": 357
},
{
"epoch": 1.6643421664342166,
"grad_norm": 0.11755497008562088,
"learning_rate": 2.515416050122092e-05,
"loss": 1.1027,
"step": 358
},
{
"epoch": 1.6689911668991166,
"grad_norm": 0.1314418464899063,
"learning_rate": 2.4913878061762094e-05,
"loss": 1.0532,
"step": 359
},
{
"epoch": 1.6736401673640167,
"grad_norm": 0.1723410040140152,
"learning_rate": 2.4679173803935662e-05,
"loss": 1.0447,
"step": 360
},
{
"epoch": 1.678289167828917,
"grad_norm": 0.22374863922595978,
"learning_rate": 2.4450063114774784e-05,
"loss": 1.0271,
"step": 361
},
{
"epoch": 1.6829381682938167,
"grad_norm": 0.2649782598018646,
"learning_rate": 2.4226561014602522e-05,
"loss": 0.8288,
"step": 362
},
{
"epoch": 1.687587168758717,
"grad_norm": 0.3294927477836609,
"learning_rate": 2.400868215604706e-05,
"loss": 0.7618,
"step": 363
},
{
"epoch": 1.692236169223617,
"grad_norm": 0.390989750623703,
"learning_rate": 2.3796440823081167e-05,
"loss": 0.6498,
"step": 364
},
{
"epoch": 1.696885169688517,
"grad_norm": 0.47515323758125305,
"learning_rate": 2.358985093008566e-05,
"loss": 0.6206,
"step": 365
},
{
"epoch": 1.701534170153417,
"grad_norm": 0.18342038989067078,
"learning_rate": 2.3388926020937286e-05,
"loss": 0.9768,
"step": 366
},
{
"epoch": 1.706183170618317,
"grad_norm": 0.08224090933799744,
"learning_rate": 2.3193679268120718e-05,
"loss": 1.0393,
"step": 367
},
{
"epoch": 1.710832171083217,
"grad_norm": 0.08616173267364502,
"learning_rate": 2.3004123471865e-05,
"loss": 1.0021,
"step": 368
},
{
"epoch": 1.715481171548117,
"grad_norm": 0.08873171359300613,
"learning_rate": 2.2820271059304412e-05,
"loss": 1.066,
"step": 369
},
{
"epoch": 1.7201301720130173,
"grad_norm": 0.10707499831914902,
"learning_rate": 2.2642134083663678e-05,
"loss": 1.1172,
"step": 370
},
{
"epoch": 1.7247791724779171,
"grad_norm": 0.1225440576672554,
"learning_rate": 2.2469724223467866e-05,
"loss": 1.0865,
"step": 371
},
{
"epoch": 1.7294281729428174,
"grad_norm": 0.1396765112876892,
"learning_rate": 2.2303052781776664e-05,
"loss": 1.0273,
"step": 372
},
{
"epoch": 1.7340771734077174,
"grad_norm": 0.17938072979450226,
"learning_rate": 2.2142130685443382e-05,
"loss": 1.1046,
"step": 373
},
{
"epoch": 1.7387261738726174,
"grad_norm": 0.2324485331773758,
"learning_rate": 2.198696848439865e-05,
"loss": 0.9628,
"step": 374
},
{
"epoch": 1.7433751743375174,
"grad_norm": 0.2933482527732849,
"learning_rate": 2.1837576350958686e-05,
"loss": 0.8228,
"step": 375
},
{
"epoch": 1.7480241748024175,
"grad_norm": 0.3423949182033539,
"learning_rate": 2.169396407915849e-05,
"loss": 0.7043,
"step": 376
},
{
"epoch": 1.7526731752673175,
"grad_norm": 0.38756224513053894,
"learning_rate": 2.155614108410968e-05,
"loss": 0.6579,
"step": 377
},
{
"epoch": 1.7573221757322175,
"grad_norm": 0.4097736179828644,
"learning_rate": 2.142411640138332e-05,
"loss": 0.7569,
"step": 378
},
{
"epoch": 1.7619711761971177,
"grad_norm": 0.07299873232841492,
"learning_rate": 2.129789868641749e-05,
"loss": 1.0571,
"step": 379
},
{
"epoch": 1.7666201766620175,
"grad_norm": 0.07895659655332565,
"learning_rate": 2.1177496213949837e-05,
"loss": 1.0487,
"step": 380
},
{
"epoch": 1.7712691771269178,
"grad_norm": 0.09031179547309875,
"learning_rate": 2.1062916877475198e-05,
"loss": 1.0911,
"step": 381
},
{
"epoch": 1.7759181775918178,
"grad_norm": 0.09883815795183182,
"learning_rate": 2.0954168188727962e-05,
"loss": 1.0619,
"step": 382
},
{
"epoch": 1.7805671780567178,
"grad_norm": 0.11488756537437439,
"learning_rate": 2.0851257277189703e-05,
"loss": 1.0788,
"step": 383
},
{
"epoch": 1.7852161785216178,
"grad_norm": 0.14409998059272766,
"learning_rate": 2.0754190889621745e-05,
"loss": 1.0902,
"step": 384
},
{
"epoch": 1.7898651789865179,
"grad_norm": 0.1650954782962799,
"learning_rate": 2.0662975389622843e-05,
"loss": 0.999,
"step": 385
},
{
"epoch": 1.794514179451418,
"grad_norm": 0.22679570317268372,
"learning_rate": 2.0577616757212016e-05,
"loss": 0.9647,
"step": 386
},
{
"epoch": 1.799163179916318,
"grad_norm": 0.284229040145874,
"learning_rate": 2.0498120588436466e-05,
"loss": 0.85,
"step": 387
},
{
"epoch": 1.8038121803812182,
"grad_norm": 0.32825493812561035,
"learning_rate": 2.0424492095004746e-05,
"loss": 0.7025,
"step": 388
},
{
"epoch": 1.808461180846118,
"grad_norm": 0.37925466895103455,
"learning_rate": 2.0356736103945047e-05,
"loss": 0.6595,
"step": 389
},
{
"epoch": 1.8131101813110182,
"grad_norm": 0.45006221532821655,
"learning_rate": 2.029485705728876e-05,
"loss": 0.4914,
"step": 390
},
{
"epoch": 1.8177591817759182,
"grad_norm": 0.2308957725763321,
"learning_rate": 2.023885901177926e-05,
"loss": 1.0114,
"step": 391
},
{
"epoch": 1.8224081822408182,
"grad_norm": 0.07487577944993973,
"learning_rate": 2.0188745638605954e-05,
"loss": 1.0175,
"step": 392
},
{
"epoch": 1.8270571827057183,
"grad_norm": 0.07909037917852402,
"learning_rate": 2.014452022316358e-05,
"loss": 1.0365,
"step": 393
},
{
"epoch": 1.8317061831706183,
"grad_norm": 0.09483642131090164,
"learning_rate": 2.010618566483684e-05,
"loss": 1.0723,
"step": 394
},
{
"epoch": 1.8363551836355185,
"grad_norm": 0.1127537190914154,
"learning_rate": 2.00737444768103e-05,
"loss": 1.0905,
"step": 395
},
{
"epoch": 1.8410041841004183,
"grad_norm": 0.12364522367715836,
"learning_rate": 2.0047198785903658e-05,
"loss": 1.1045,
"step": 396
},
{
"epoch": 1.8456531845653186,
"grad_norm": 0.14588101208209991,
"learning_rate": 2.002655033243228e-05,
"loss": 1.0753,
"step": 397
},
{
"epoch": 1.8503021850302184,
"grad_norm": 0.196581169962883,
"learning_rate": 2.0011800470093105e-05,
"loss": 1.0165,
"step": 398
},
{
"epoch": 1.8549511854951186,
"grad_norm": 0.24468840658664703,
"learning_rate": 2.0002950165875934e-05,
"loss": 0.9067,
"step": 399
},
{
"epoch": 1.8596001859600186,
"grad_norm": 0.32478219270706177,
"learning_rate": 2e-05,
"loss": 0.8752,
"step": 400
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.3538115420584673e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}