{
"best_metric": 1.3449122905731201,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 1.1450094161958568,
"eval_steps": 25,
"global_step": 133,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008609093354856066,
"grad_norm": 0.6988075971603394,
"learning_rate": 2e-05,
"loss": 1.7571,
"step": 1
},
{
"epoch": 0.008609093354856066,
"eval_loss": 3.2656900882720947,
"eval_runtime": 1.2466,
"eval_samples_per_second": 40.11,
"eval_steps_per_second": 10.429,
"step": 1
},
{
"epoch": 0.017218186709712133,
"grad_norm": 0.7088323831558228,
"learning_rate": 4e-05,
"loss": 2.0087,
"step": 2
},
{
"epoch": 0.0258272800645682,
"grad_norm": 1.009822130203247,
"learning_rate": 6e-05,
"loss": 2.4702,
"step": 3
},
{
"epoch": 0.034436373419424265,
"grad_norm": 0.9133837819099426,
"learning_rate": 8e-05,
"loss": 2.5742,
"step": 4
},
{
"epoch": 0.04304546677428033,
"grad_norm": 1.0774202346801758,
"learning_rate": 0.0001,
"loss": 2.4651,
"step": 5
},
{
"epoch": 0.0516545601291364,
"grad_norm": 1.940169095993042,
"learning_rate": 9.99864468413292e-05,
"loss": 2.7402,
"step": 6
},
{
"epoch": 0.060263653483992465,
"grad_norm": 2.5114591121673584,
"learning_rate": 9.994579552923277e-05,
"loss": 2.36,
"step": 7
},
{
"epoch": 0.06887274683884853,
"grad_norm": 2.1416642665863037,
"learning_rate": 9.987807055054106e-05,
"loss": 2.1412,
"step": 8
},
{
"epoch": 0.0774818401937046,
"grad_norm": 2.493290424346924,
"learning_rate": 9.978331270024886e-05,
"loss": 2.134,
"step": 9
},
{
"epoch": 0.08609093354856066,
"grad_norm": 2.5377631187438965,
"learning_rate": 9.966157905694196e-05,
"loss": 2.2798,
"step": 10
},
{
"epoch": 0.09470002690341674,
"grad_norm": 5.070113658905029,
"learning_rate": 9.951294294841516e-05,
"loss": 2.0949,
"step": 11
},
{
"epoch": 0.1033091202582728,
"grad_norm": 8.32091999053955,
"learning_rate": 9.933749390750235e-05,
"loss": 2.669,
"step": 12
},
{
"epoch": 0.11191821361312887,
"grad_norm": 10.361486434936523,
"learning_rate": 9.913533761814537e-05,
"loss": 2.7766,
"step": 13
},
{
"epoch": 0.12052730696798493,
"grad_norm": 5.311871528625488,
"learning_rate": 9.890659585173379e-05,
"loss": 1.494,
"step": 14
},
{
"epoch": 0.129136400322841,
"grad_norm": 4.676233291625977,
"learning_rate": 9.865140639375449e-05,
"loss": 1.5863,
"step": 15
},
{
"epoch": 0.13774549367769706,
"grad_norm": 3.6029069423675537,
"learning_rate": 9.83699229607948e-05,
"loss": 1.7019,
"step": 16
},
{
"epoch": 0.14635458703255314,
"grad_norm": 1.6760152578353882,
"learning_rate": 9.80623151079494e-05,
"loss": 1.6638,
"step": 17
},
{
"epoch": 0.1549636803874092,
"grad_norm": 1.4253727197647095,
"learning_rate": 9.772876812668666e-05,
"loss": 1.7801,
"step": 18
},
{
"epoch": 0.16357277374226525,
"grad_norm": 1.3295799493789673,
"learning_rate": 9.736948293323593e-05,
"loss": 1.7038,
"step": 19
},
{
"epoch": 0.17218186709712133,
"grad_norm": 0.7724640965461731,
"learning_rate": 9.698467594756325e-05,
"loss": 1.7746,
"step": 20
},
{
"epoch": 0.1807909604519774,
"grad_norm": 0.8214246034622192,
"learning_rate": 9.657457896300791e-05,
"loss": 1.5971,
"step": 21
},
{
"epoch": 0.18940005380683347,
"grad_norm": 1.1587809324264526,
"learning_rate": 9.613943900665889e-05,
"loss": 1.7559,
"step": 22
},
{
"epoch": 0.19800914716168955,
"grad_norm": 1.8553802967071533,
"learning_rate": 9.567951819055496e-05,
"loss": 1.7612,
"step": 23
},
{
"epoch": 0.2066182405165456,
"grad_norm": 2.004556655883789,
"learning_rate": 9.519509355379818e-05,
"loss": 1.5969,
"step": 24
},
{
"epoch": 0.21522733387140167,
"grad_norm": 5.224137783050537,
"learning_rate": 9.468645689567598e-05,
"loss": 2.0711,
"step": 25
},
{
"epoch": 0.21522733387140167,
"eval_loss": 1.508137822151184,
"eval_runtime": 1.2723,
"eval_samples_per_second": 39.3,
"eval_steps_per_second": 10.218,
"step": 25
},
{
"epoch": 0.22383642722625774,
"grad_norm": 1.341894268989563,
"learning_rate": 9.415391459989203e-05,
"loss": 1.2867,
"step": 26
},
{
"epoch": 0.2324455205811138,
"grad_norm": 2.224653720855713,
"learning_rate": 9.359778745001225e-05,
"loss": 1.2927,
"step": 27
},
{
"epoch": 0.24105461393596986,
"grad_norm": 1.4196522235870361,
"learning_rate": 9.301841043623682e-05,
"loss": 1.3711,
"step": 28
},
{
"epoch": 0.24966370729082593,
"grad_norm": 1.2161178588867188,
"learning_rate": 9.241613255361455e-05,
"loss": 1.4576,
"step": 29
},
{
"epoch": 0.258272800645682,
"grad_norm": 0.8192944526672363,
"learning_rate": 9.179131659182127e-05,
"loss": 1.6426,
"step": 30
},
{
"epoch": 0.2668818940005381,
"grad_norm": 0.6419580578804016,
"learning_rate": 9.114433891662902e-05,
"loss": 1.7142,
"step": 31
},
{
"epoch": 0.2754909873553941,
"grad_norm": 0.41701650619506836,
"learning_rate": 9.047558924319729e-05,
"loss": 1.6853,
"step": 32
},
{
"epoch": 0.2841000807102502,
"grad_norm": 0.4797891080379486,
"learning_rate": 8.978547040132317e-05,
"loss": 1.6622,
"step": 33
},
{
"epoch": 0.29270917406510627,
"grad_norm": 0.737724781036377,
"learning_rate": 8.907439809279181e-05,
"loss": 1.6734,
"step": 34
},
{
"epoch": 0.3013182674199623,
"grad_norm": 1.0522786378860474,
"learning_rate": 8.834280064097317e-05,
"loss": 1.6301,
"step": 35
},
{
"epoch": 0.3099273607748184,
"grad_norm": 1.4976811408996582,
"learning_rate": 8.759111873281603e-05,
"loss": 1.6521,
"step": 36
},
{
"epoch": 0.31853645412967446,
"grad_norm": 3.2661497592926025,
"learning_rate": 8.681980515339464e-05,
"loss": 1.5304,
"step": 37
},
{
"epoch": 0.3271455474845305,
"grad_norm": 2.8426661491394043,
"learning_rate": 8.602932451316802e-05,
"loss": 1.437,
"step": 38
},
{
"epoch": 0.3357546408393866,
"grad_norm": 0.49562180042266846,
"learning_rate": 8.522015296811584e-05,
"loss": 1.2391,
"step": 39
},
{
"epoch": 0.34436373419424265,
"grad_norm": 0.5359828472137451,
"learning_rate": 8.439277793291995e-05,
"loss": 1.2585,
"step": 40
},
{
"epoch": 0.35297282754909876,
"grad_norm": 0.8592618107795715,
"learning_rate": 8.354769778736406e-05,
"loss": 1.3682,
"step": 41
},
{
"epoch": 0.3615819209039548,
"grad_norm": 0.8251994848251343,
"learning_rate": 8.268542157612821e-05,
"loss": 1.548,
"step": 42
},
{
"epoch": 0.37019101425881085,
"grad_norm": 0.9783174991607666,
"learning_rate": 8.180646870215952e-05,
"loss": 1.7041,
"step": 43
},
{
"epoch": 0.37880010761366695,
"grad_norm": 0.894888699054718,
"learning_rate": 8.091136861380305e-05,
"loss": 1.8391,
"step": 44
},
{
"epoch": 0.387409200968523,
"grad_norm": 0.5933730006217957,
"learning_rate": 8.000066048588211e-05,
"loss": 1.6974,
"step": 45
},
{
"epoch": 0.3960182943233791,
"grad_norm": 0.7440256476402283,
"learning_rate": 7.907489289491939e-05,
"loss": 1.6231,
"step": 46
},
{
"epoch": 0.40462738767823514,
"grad_norm": 0.7255629897117615,
"learning_rate": 7.813462348869497e-05,
"loss": 1.6172,
"step": 47
},
{
"epoch": 0.4132364810330912,
"grad_norm": 1.1230436563491821,
"learning_rate": 7.71804186503403e-05,
"loss": 1.5745,
"step": 48
},
{
"epoch": 0.4218455743879473,
"grad_norm": 1.6526938676834106,
"learning_rate": 7.62128531571699e-05,
"loss": 1.2586,
"step": 49
},
{
"epoch": 0.43045466774280333,
"grad_norm": 5.730405330657959,
"learning_rate": 7.523250983445731e-05,
"loss": 1.7199,
"step": 50
},
{
"epoch": 0.43045466774280333,
"eval_loss": 1.389930248260498,
"eval_runtime": 1.2729,
"eval_samples_per_second": 39.28,
"eval_steps_per_second": 10.213,
"step": 50
},
{
"epoch": 0.4390637610976594,
"grad_norm": 0.4678877890110016,
"learning_rate": 7.42399792043627e-05,
"loss": 1.2294,
"step": 51
},
{
"epoch": 0.4476728544525155,
"grad_norm": 0.5002795457839966,
"learning_rate": 7.323585913022454e-05,
"loss": 1.2342,
"step": 52
},
{
"epoch": 0.4562819478073715,
"grad_norm": 0.3534197509288788,
"learning_rate": 7.222075445642904e-05,
"loss": 1.2975,
"step": 53
},
{
"epoch": 0.4648910411622276,
"grad_norm": 0.6102612018585205,
"learning_rate": 7.119527664407447e-05,
"loss": 1.4773,
"step": 54
},
{
"epoch": 0.47350013451708367,
"grad_norm": 0.5064122080802917,
"learning_rate": 7.01600434026499e-05,
"loss": 1.5257,
"step": 55
},
{
"epoch": 0.4821092278719397,
"grad_norm": 0.6477398872375488,
"learning_rate": 6.911567831795013e-05,
"loss": 1.7135,
"step": 56
},
{
"epoch": 0.4907183212267958,
"grad_norm": 1.0539360046386719,
"learning_rate": 6.80628104764508e-05,
"loss": 1.8241,
"step": 57
},
{
"epoch": 0.49932741458165186,
"grad_norm": 0.7702855467796326,
"learning_rate": 6.700207408637044e-05,
"loss": 1.7362,
"step": 58
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.6455403566360474,
"learning_rate": 6.593410809564689e-05,
"loss": 1.5381,
"step": 59
},
{
"epoch": 0.516545601291364,
"grad_norm": 0.6673574447631836,
"learning_rate": 6.485955580705913e-05,
"loss": 1.4796,
"step": 60
},
{
"epoch": 0.5251546946462201,
"grad_norm": 0.8242542743682861,
"learning_rate": 6.377906449072578e-05,
"loss": 1.6654,
"step": 61
},
{
"epoch": 0.5337637880010762,
"grad_norm": 1.4092378616333008,
"learning_rate": 6.269328499421356e-05,
"loss": 1.2351,
"step": 62
},
{
"epoch": 0.5423728813559322,
"grad_norm": 2.419718027114868,
"learning_rate": 6.160287135049127e-05,
"loss": 1.4315,
"step": 63
},
{
"epoch": 0.5509819747107882,
"grad_norm": 0.38671913743019104,
"learning_rate": 6.050848038396473e-05,
"loss": 1.2274,
"step": 64
},
{
"epoch": 0.5595910680656443,
"grad_norm": 0.5623155832290649,
"learning_rate": 5.941077131483025e-05,
"loss": 1.3062,
"step": 65
},
{
"epoch": 0.5682001614205004,
"grad_norm": 0.6458035111427307,
"learning_rate": 5.831040536198504e-05,
"loss": 1.4318,
"step": 66
},
{
"epoch": 0.5768092547753565,
"grad_norm": 0.6504884958267212,
"learning_rate": 5.720804534473382e-05,
"loss": 1.3897,
"step": 67
},
{
"epoch": 0.5854183481302125,
"grad_norm": 0.4223068356513977,
"learning_rate": 5.610435528353106e-05,
"loss": 1.5331,
"step": 68
},
{
"epoch": 0.5940274414850686,
"grad_norm": 0.5046920776367188,
"learning_rate": 5.500000000000001e-05,
"loss": 1.6225,
"step": 69
},
{
"epoch": 0.6026365348399246,
"grad_norm": 0.41651174426078796,
"learning_rate": 5.389564471646895e-05,
"loss": 1.7376,
"step": 70
},
{
"epoch": 0.6112456281947808,
"grad_norm": 0.32924169301986694,
"learning_rate": 5.27919546552662e-05,
"loss": 1.5401,
"step": 71
},
{
"epoch": 0.6198547215496368,
"grad_norm": 0.4280257821083069,
"learning_rate": 5.168959463801497e-05,
"loss": 1.5662,
"step": 72
},
{
"epoch": 0.6284638149044929,
"grad_norm": 0.6656383275985718,
"learning_rate": 5.058922868516978e-05,
"loss": 1.4713,
"step": 73
},
{
"epoch": 0.6370729082593489,
"grad_norm": 0.8646160960197449,
"learning_rate": 4.9491519616035276e-05,
"loss": 1.2566,
"step": 74
},
{
"epoch": 0.645682001614205,
"grad_norm": 2.5206289291381836,
"learning_rate": 4.839712864950873e-05,
"loss": 1.7236,
"step": 75
},
{
"epoch": 0.645682001614205,
"eval_loss": 1.349289894104004,
"eval_runtime": 1.2727,
"eval_samples_per_second": 39.288,
"eval_steps_per_second": 10.215,
"step": 75
},
{
"epoch": 0.654291094969061,
"grad_norm": 0.4360639750957489,
"learning_rate": 4.730671500578645e-05,
"loss": 1.1383,
"step": 76
},
{
"epoch": 0.6629001883239172,
"grad_norm": 0.8731722235679626,
"learning_rate": 4.6220935509274235e-05,
"loss": 1.4032,
"step": 77
},
{
"epoch": 0.6715092816787732,
"grad_norm": 0.7142120003700256,
"learning_rate": 4.5140444192940864e-05,
"loss": 1.1904,
"step": 78
},
{
"epoch": 0.6801183750336293,
"grad_norm": 0.594018280506134,
"learning_rate": 4.406589190435313e-05,
"loss": 1.3872,
"step": 79
},
{
"epoch": 0.6887274683884853,
"grad_norm": 0.6022002696990967,
"learning_rate": 4.2997925913629577e-05,
"loss": 1.5956,
"step": 80
},
{
"epoch": 0.6973365617433414,
"grad_norm": 0.5471949577331543,
"learning_rate": 4.19371895235492e-05,
"loss": 1.6525,
"step": 81
},
{
"epoch": 0.7059456550981975,
"grad_norm": 0.3283829391002655,
"learning_rate": 4.0884321682049884e-05,
"loss": 1.772,
"step": 82
},
{
"epoch": 0.7145547484530536,
"grad_norm": 0.34206530451774597,
"learning_rate": 3.98399565973501e-05,
"loss": 1.6938,
"step": 83
},
{
"epoch": 0.7231638418079096,
"grad_norm": 0.35002151131629944,
"learning_rate": 3.880472335592553e-05,
"loss": 1.418,
"step": 84
},
{
"epoch": 0.7317729351627656,
"grad_norm": 0.7404176592826843,
"learning_rate": 3.777924554357096e-05,
"loss": 1.5774,
"step": 85
},
{
"epoch": 0.7403820285176217,
"grad_norm": 0.8380143046379089,
"learning_rate": 3.676414086977546e-05,
"loss": 1.3188,
"step": 86
},
{
"epoch": 0.7489911218724778,
"grad_norm": 2.1834990978240967,
"learning_rate": 3.576002079563732e-05,
"loss": 1.4621,
"step": 87
},
{
"epoch": 0.7576002152273339,
"grad_norm": 2.3319005966186523,
"learning_rate": 3.4767490165542704e-05,
"loss": 1.5594,
"step": 88
},
{
"epoch": 0.7662093085821899,
"grad_norm": 0.3592979311943054,
"learning_rate": 3.378714684283011e-05,
"loss": 1.1,
"step": 89
},
{
"epoch": 0.774818401937046,
"grad_norm": 0.49761757254600525,
"learning_rate": 3.281958134965972e-05,
"loss": 1.3531,
"step": 90
},
{
"epoch": 0.783427495291902,
"grad_norm": 0.3277381658554077,
"learning_rate": 3.186537651130503e-05,
"loss": 1.3467,
"step": 91
},
{
"epoch": 0.7920365886467582,
"grad_norm": 0.3256728947162628,
"learning_rate": 3.0925107105080636e-05,
"loss": 1.5374,
"step": 92
},
{
"epoch": 0.8006456820016142,
"grad_norm": 0.35263001918792725,
"learning_rate": 2.9999339514117912e-05,
"loss": 1.5367,
"step": 93
},
{
"epoch": 0.8092547753564703,
"grad_norm": 0.3698779344558716,
"learning_rate": 2.9088631386196964e-05,
"loss": 1.7344,
"step": 94
},
{
"epoch": 0.8178638687113263,
"grad_norm": 0.445311576128006,
"learning_rate": 2.8193531297840503e-05,
"loss": 1.7141,
"step": 95
},
{
"epoch": 0.8264729620661824,
"grad_norm": 0.4353031814098358,
"learning_rate": 2.73145784238718e-05,
"loss": 1.5168,
"step": 96
},
{
"epoch": 0.8350820554210385,
"grad_norm": 0.6268022060394287,
"learning_rate": 2.645230221263596e-05,
"loss": 1.4016,
"step": 97
},
{
"epoch": 0.8436911487758946,
"grad_norm": 0.5284622311592102,
"learning_rate": 2.560722206708006e-05,
"loss": 1.5741,
"step": 98
},
{
"epoch": 0.8523002421307506,
"grad_norm": 0.7828362584114075,
"learning_rate": 2.4779847031884175e-05,
"loss": 1.243,
"step": 99
},
{
"epoch": 0.8609093354856067,
"grad_norm": 3.621532678604126,
"learning_rate": 2.397067548683199e-05,
"loss": 1.5976,
"step": 100
},
{
"epoch": 0.8609093354856067,
"eval_loss": 1.3449122905731201,
"eval_runtime": 1.2724,
"eval_samples_per_second": 39.295,
"eval_steps_per_second": 10.217,
"step": 100
},
{
"epoch": 0.8695184288404627,
"grad_norm": 0.3012229800224304,
"learning_rate": 2.3180194846605367e-05,
"loss": 1.176,
"step": 101
},
{
"epoch": 0.8781275221953188,
"grad_norm": 0.4754287004470825,
"learning_rate": 2.2408881267183997e-05,
"loss": 1.1958,
"step": 102
},
{
"epoch": 0.8867366155501749,
"grad_norm": 0.43265655636787415,
"learning_rate": 2.165719935902685e-05,
"loss": 1.3262,
"step": 103
},
{
"epoch": 0.895345708905031,
"grad_norm": 0.5260616540908813,
"learning_rate": 2.09256019072082e-05,
"loss": 1.3721,
"step": 104
},
{
"epoch": 0.903954802259887,
"grad_norm": 0.5602609515190125,
"learning_rate": 2.0214529598676836e-05,
"loss": 1.401,
"step": 105
},
{
"epoch": 0.912563895614743,
"grad_norm": 0.29336562752723694,
"learning_rate": 1.952441075680272e-05,
"loss": 1.6924,
"step": 106
},
{
"epoch": 0.9211729889695991,
"grad_norm": 0.9488304853439331,
"learning_rate": 1.8855661083370986e-05,
"loss": 1.8012,
"step": 107
},
{
"epoch": 0.9297820823244553,
"grad_norm": 0.3932758867740631,
"learning_rate": 1.820868340817874e-05,
"loss": 1.6428,
"step": 108
},
{
"epoch": 0.9383911756793113,
"grad_norm": 0.3379191756248474,
"learning_rate": 1.758386744638546e-05,
"loss": 1.3678,
"step": 109
},
{
"epoch": 0.9470002690341673,
"grad_norm": 0.5376018285751343,
"learning_rate": 1.698158956376318e-05,
"loss": 1.6057,
"step": 110
},
{
"epoch": 0.9556093623890234,
"grad_norm": 0.6705049872398376,
"learning_rate": 1.6402212549987762e-05,
"loss": 1.5497,
"step": 111
},
{
"epoch": 0.9642184557438794,
"grad_norm": 1.5708343982696533,
"learning_rate": 1.584608540010799e-05,
"loss": 1.4589,
"step": 112
},
{
"epoch": 0.9728275490987356,
"grad_norm": 2.8929443359375,
"learning_rate": 1.531354310432403e-05,
"loss": 1.5784,
"step": 113
},
{
"epoch": 0.9814366424535916,
"grad_norm": 0.3657113313674927,
"learning_rate": 1.4804906446201816e-05,
"loss": 1.3912,
"step": 114
},
{
"epoch": 0.9900457358084477,
"grad_norm": 0.3794941306114197,
"learning_rate": 1.4320481809445051e-05,
"loss": 1.5847,
"step": 115
},
{
"epoch": 0.9986548291633037,
"grad_norm": 0.7362991571426392,
"learning_rate": 1.386056099334112e-05,
"loss": 1.399,
"step": 116
},
{
"epoch": 1.0072639225181599,
"grad_norm": 0.8029009103775024,
"learning_rate": 1.3425421036992098e-05,
"loss": 1.2215,
"step": 117
},
{
"epoch": 1.0158730158730158,
"grad_norm": 0.5080808997154236,
"learning_rate": 1.3015324052436753e-05,
"loss": 1.2015,
"step": 118
},
{
"epoch": 1.024482109227872,
"grad_norm": 0.44496291875839233,
"learning_rate": 1.2630517066764069e-05,
"loss": 1.2138,
"step": 119
},
{
"epoch": 1.033091202582728,
"grad_norm": 0.4348479211330414,
"learning_rate": 1.227123187331335e-05,
"loss": 1.2767,
"step": 120
},
{
"epoch": 1.041700295937584,
"grad_norm": 0.37992164492607117,
"learning_rate": 1.1937684892050604e-05,
"loss": 1.5242,
"step": 121
},
{
"epoch": 1.0503093892924402,
"grad_norm": 0.32971861958503723,
"learning_rate": 1.1630077039205209e-05,
"loss": 1.5498,
"step": 122
},
{
"epoch": 1.0589184826472962,
"grad_norm": 0.5224172472953796,
"learning_rate": 1.1348593606245522e-05,
"loss": 1.6984,
"step": 123
},
{
"epoch": 1.0675275760021523,
"grad_norm": 0.43070971965789795,
"learning_rate": 1.109340414826622e-05,
"loss": 1.5932,
"step": 124
},
{
"epoch": 1.0761366693570082,
"grad_norm": 0.4774491786956787,
"learning_rate": 1.0864662381854632e-05,
"loss": 1.4308,
"step": 125
},
{
"epoch": 1.0761366693570082,
"eval_loss": 1.3410676717758179,
"eval_runtime": 1.2741,
"eval_samples_per_second": 39.242,
"eval_steps_per_second": 10.203,
"step": 125
},
{
"epoch": 1.0847457627118644,
"grad_norm": 0.5184400677680969,
"learning_rate": 1.0662506092497646e-05,
"loss": 1.4641,
"step": 126
},
{
"epoch": 1.0933548560667206,
"grad_norm": 0.5525245666503906,
"learning_rate": 1.0487057051584856e-05,
"loss": 1.5545,
"step": 127
},
{
"epoch": 1.1019639494215765,
"grad_norm": 1.609927773475647,
"learning_rate": 1.0338420943058053e-05,
"loss": 1.3439,
"step": 128
},
{
"epoch": 1.1105730427764327,
"grad_norm": 2.2938551902770996,
"learning_rate": 1.0216687299751144e-05,
"loss": 1.4817,
"step": 129
},
{
"epoch": 1.1191821361312886,
"grad_norm": 0.45292142033576965,
"learning_rate": 1.0121929449458941e-05,
"loss": 1.1242,
"step": 130
},
{
"epoch": 1.1277912294861447,
"grad_norm": 0.4423352777957916,
"learning_rate": 1.0054204470767243e-05,
"loss": 1.1672,
"step": 131
},
{
"epoch": 1.136400322841001,
"grad_norm": 0.33851832151412964,
"learning_rate": 1.0013553158670811e-05,
"loss": 1.2433,
"step": 132
},
{
"epoch": 1.1450094161958568,
"grad_norm": 0.3434777855873108,
"learning_rate": 1e-05,
"loss": 1.4094,
"step": 133
}
],
"logging_steps": 1,
"max_steps": 133,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.8256510115053568e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}