{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.643835616438356,
"eval_steps": 500,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010958904109589041,
"grad_norm": 5.003701210021973,
"learning_rate": 0.0002,
"loss": 1.3947,
"step": 1
},
{
"epoch": 0.021917808219178082,
"grad_norm": 10.884932518005371,
"learning_rate": 0.0002,
"loss": 0.657,
"step": 2
},
{
"epoch": 0.03287671232876712,
"grad_norm": 1.7771106958389282,
"learning_rate": 0.0002,
"loss": 1.1432,
"step": 3
},
{
"epoch": 0.043835616438356165,
"grad_norm": 2.750317096710205,
"learning_rate": 0.0002,
"loss": 0.6564,
"step": 4
},
{
"epoch": 0.0547945205479452,
"grad_norm": 1.633827567100525,
"learning_rate": 0.0002,
"loss": 0.9156,
"step": 5
},
{
"epoch": 0.06575342465753424,
"grad_norm": 1.747514009475708,
"learning_rate": 0.0002,
"loss": 0.5935,
"step": 6
},
{
"epoch": 0.07671232876712329,
"grad_norm": 1.3404035568237305,
"learning_rate": 0.0002,
"loss": 0.5328,
"step": 7
},
{
"epoch": 0.08767123287671233,
"grad_norm": 7.1872968673706055,
"learning_rate": 0.0002,
"loss": 1.0489,
"step": 8
},
{
"epoch": 0.09863013698630137,
"grad_norm": 1.8788518905639648,
"learning_rate": 0.0002,
"loss": 0.8711,
"step": 9
},
{
"epoch": 0.1095890410958904,
"grad_norm": 1.554734706878662,
"learning_rate": 0.0002,
"loss": 0.6025,
"step": 10
},
{
"epoch": 0.12054794520547946,
"grad_norm": 1.5181900262832642,
"learning_rate": 0.0002,
"loss": 0.9178,
"step": 11
},
{
"epoch": 0.13150684931506848,
"grad_norm": 1.74048912525177,
"learning_rate": 0.0002,
"loss": 0.8413,
"step": 12
},
{
"epoch": 0.14246575342465753,
"grad_norm": 1.5069341659545898,
"learning_rate": 0.0002,
"loss": 0.7297,
"step": 13
},
{
"epoch": 0.15342465753424658,
"grad_norm": 1.9904654026031494,
"learning_rate": 0.0002,
"loss": 0.909,
"step": 14
},
{
"epoch": 0.1643835616438356,
"grad_norm": 1.329214334487915,
"learning_rate": 0.0002,
"loss": 0.6475,
"step": 15
},
{
"epoch": 0.17534246575342466,
"grad_norm": 3.0678200721740723,
"learning_rate": 0.0002,
"loss": 0.8995,
"step": 16
},
{
"epoch": 0.1863013698630137,
"grad_norm": 1.5679476261138916,
"learning_rate": 0.0002,
"loss": 0.6728,
"step": 17
},
{
"epoch": 0.19726027397260273,
"grad_norm": 1.196855068206787,
"learning_rate": 0.0002,
"loss": 0.4871,
"step": 18
},
{
"epoch": 0.20821917808219179,
"grad_norm": 1.8244729042053223,
"learning_rate": 0.0002,
"loss": 0.5748,
"step": 19
},
{
"epoch": 0.2191780821917808,
"grad_norm": 1.1432626247406006,
"learning_rate": 0.0002,
"loss": 0.5982,
"step": 20
},
{
"epoch": 0.23013698630136986,
"grad_norm": 1.672303557395935,
"learning_rate": 0.0002,
"loss": 0.5639,
"step": 21
},
{
"epoch": 0.2410958904109589,
"grad_norm": 1.5940009355545044,
"learning_rate": 0.0002,
"loss": 0.6198,
"step": 22
},
{
"epoch": 0.25205479452054796,
"grad_norm": 1.710236668586731,
"learning_rate": 0.0002,
"loss": 0.8638,
"step": 23
},
{
"epoch": 0.26301369863013696,
"grad_norm": 1.7786955833435059,
"learning_rate": 0.0002,
"loss": 0.7169,
"step": 24
},
{
"epoch": 0.273972602739726,
"grad_norm": 1.2856895923614502,
"learning_rate": 0.0002,
"loss": 0.5522,
"step": 25
},
{
"epoch": 0.28493150684931506,
"grad_norm": 1.3859294652938843,
"learning_rate": 0.0002,
"loss": 0.6687,
"step": 26
},
{
"epoch": 0.2958904109589041,
"grad_norm": 1.3184758424758911,
"learning_rate": 0.0002,
"loss": 0.6227,
"step": 27
},
{
"epoch": 0.30684931506849317,
"grad_norm": 1.3072282075881958,
"learning_rate": 0.0002,
"loss": 0.4417,
"step": 28
},
{
"epoch": 0.3178082191780822,
"grad_norm": 1.7866010665893555,
"learning_rate": 0.0002,
"loss": 0.7648,
"step": 29
},
{
"epoch": 0.3287671232876712,
"grad_norm": 1.0981870889663696,
"learning_rate": 0.0002,
"loss": 0.6496,
"step": 30
},
{
"epoch": 0.33972602739726027,
"grad_norm": 1.586014747619629,
"learning_rate": 0.0002,
"loss": 0.7389,
"step": 31
},
{
"epoch": 0.3506849315068493,
"grad_norm": 1.4227101802825928,
"learning_rate": 0.0002,
"loss": 0.7764,
"step": 32
},
{
"epoch": 0.36164383561643837,
"grad_norm": 1.0654901266098022,
"learning_rate": 0.0002,
"loss": 0.7015,
"step": 33
},
{
"epoch": 0.3726027397260274,
"grad_norm": 1.22892427444458,
"learning_rate": 0.0002,
"loss": 0.6085,
"step": 34
},
{
"epoch": 0.3835616438356164,
"grad_norm": 1.1209129095077515,
"learning_rate": 0.0002,
"loss": 0.334,
"step": 35
},
{
"epoch": 0.39452054794520547,
"grad_norm": 1.3106253147125244,
"learning_rate": 0.0002,
"loss": 0.6377,
"step": 36
},
{
"epoch": 0.4054794520547945,
"grad_norm": 1.1807057857513428,
"learning_rate": 0.0002,
"loss": 0.4686,
"step": 37
},
{
"epoch": 0.41643835616438357,
"grad_norm": 1.1512413024902344,
"learning_rate": 0.0002,
"loss": 0.5934,
"step": 38
},
{
"epoch": 0.4273972602739726,
"grad_norm": 1.2861087322235107,
"learning_rate": 0.0002,
"loss": 0.4494,
"step": 39
},
{
"epoch": 0.4383561643835616,
"grad_norm": 1.6408014297485352,
"learning_rate": 0.0002,
"loss": 0.8894,
"step": 40
},
{
"epoch": 0.44931506849315067,
"grad_norm": 1.1644452810287476,
"learning_rate": 0.0002,
"loss": 0.9024,
"step": 41
},
{
"epoch": 0.4602739726027397,
"grad_norm": 0.989621639251709,
"learning_rate": 0.0002,
"loss": 0.4873,
"step": 42
},
{
"epoch": 0.4712328767123288,
"grad_norm": 1.2218654155731201,
"learning_rate": 0.0002,
"loss": 0.7415,
"step": 43
},
{
"epoch": 0.4821917808219178,
"grad_norm": 1.2144018411636353,
"learning_rate": 0.0002,
"loss": 0.6073,
"step": 44
},
{
"epoch": 0.4931506849315068,
"grad_norm": 1.5843247175216675,
"learning_rate": 0.0002,
"loss": 0.8353,
"step": 45
},
{
"epoch": 0.5041095890410959,
"grad_norm": 1.3587316274642944,
"learning_rate": 0.0002,
"loss": 0.5154,
"step": 46
},
{
"epoch": 0.5150684931506849,
"grad_norm": 1.173448085784912,
"learning_rate": 0.0002,
"loss": 0.616,
"step": 47
},
{
"epoch": 0.5260273972602739,
"grad_norm": 1.6074247360229492,
"learning_rate": 0.0002,
"loss": 0.8318,
"step": 48
},
{
"epoch": 0.536986301369863,
"grad_norm": 1.0739307403564453,
"learning_rate": 0.0002,
"loss": 0.5982,
"step": 49
},
{
"epoch": 0.547945205479452,
"grad_norm": 1.330855131149292,
"learning_rate": 0.0002,
"loss": 0.5309,
"step": 50
},
{
"epoch": 0.5589041095890411,
"grad_norm": 1.5128343105316162,
"learning_rate": 0.0002,
"loss": 0.5063,
"step": 51
},
{
"epoch": 0.5698630136986301,
"grad_norm": 1.5110679864883423,
"learning_rate": 0.0002,
"loss": 0.5551,
"step": 52
},
{
"epoch": 0.5808219178082191,
"grad_norm": 2.263357639312744,
"learning_rate": 0.0002,
"loss": 0.6387,
"step": 53
},
{
"epoch": 0.5917808219178082,
"grad_norm": 1.3241772651672363,
"learning_rate": 0.0002,
"loss": 0.8482,
"step": 54
},
{
"epoch": 0.6027397260273972,
"grad_norm": 1.246489405632019,
"learning_rate": 0.0002,
"loss": 0.7622,
"step": 55
},
{
"epoch": 0.6136986301369863,
"grad_norm": 1.2963398694992065,
"learning_rate": 0.0002,
"loss": 0.6943,
"step": 56
},
{
"epoch": 0.6246575342465753,
"grad_norm": 1.116220474243164,
"learning_rate": 0.0002,
"loss": 0.6305,
"step": 57
},
{
"epoch": 0.6356164383561644,
"grad_norm": 1.4782965183258057,
"learning_rate": 0.0002,
"loss": 0.7089,
"step": 58
},
{
"epoch": 0.6465753424657534,
"grad_norm": 1.207879662513733,
"learning_rate": 0.0002,
"loss": 0.8837,
"step": 59
},
{
"epoch": 0.6575342465753424,
"grad_norm": 1.0886225700378418,
"learning_rate": 0.0002,
"loss": 0.7521,
"step": 60
},
{
"epoch": 0.6684931506849315,
"grad_norm": 1.1209737062454224,
"learning_rate": 0.0002,
"loss": 0.6905,
"step": 61
},
{
"epoch": 0.6794520547945205,
"grad_norm": 1.732853889465332,
"learning_rate": 0.0002,
"loss": 0.6397,
"step": 62
},
{
"epoch": 0.6904109589041096,
"grad_norm": 1.2688523530960083,
"learning_rate": 0.0002,
"loss": 0.647,
"step": 63
},
{
"epoch": 0.7013698630136986,
"grad_norm": 1.3005374670028687,
"learning_rate": 0.0002,
"loss": 0.6742,
"step": 64
},
{
"epoch": 0.7123287671232876,
"grad_norm": 1.3675568103790283,
"learning_rate": 0.0002,
"loss": 0.8946,
"step": 65
},
{
"epoch": 0.7232876712328767,
"grad_norm": 1.3661890029907227,
"learning_rate": 0.0002,
"loss": 0.6946,
"step": 66
},
{
"epoch": 0.7342465753424657,
"grad_norm": 1.4970860481262207,
"learning_rate": 0.0002,
"loss": 0.6293,
"step": 67
},
{
"epoch": 0.7452054794520548,
"grad_norm": 1.445917010307312,
"learning_rate": 0.0002,
"loss": 0.8058,
"step": 68
},
{
"epoch": 0.7561643835616438,
"grad_norm": 1.6117463111877441,
"learning_rate": 0.0002,
"loss": 0.7998,
"step": 69
},
{
"epoch": 0.7671232876712328,
"grad_norm": 1.6023530960083008,
"learning_rate": 0.0002,
"loss": 0.5355,
"step": 70
},
{
"epoch": 0.7780821917808219,
"grad_norm": 1.4635958671569824,
"learning_rate": 0.0002,
"loss": 0.6999,
"step": 71
},
{
"epoch": 0.7890410958904109,
"grad_norm": 1.4061299562454224,
"learning_rate": 0.0002,
"loss": 0.6554,
"step": 72
},
{
"epoch": 0.8,
"grad_norm": 1.4091109037399292,
"learning_rate": 0.0002,
"loss": 0.6972,
"step": 73
},
{
"epoch": 0.810958904109589,
"grad_norm": 1.3066381216049194,
"learning_rate": 0.0002,
"loss": 0.742,
"step": 74
},
{
"epoch": 0.821917808219178,
"grad_norm": 0.9933669567108154,
"learning_rate": 0.0002,
"loss": 0.6989,
"step": 75
},
{
"epoch": 0.8328767123287671,
"grad_norm": 1.2205321788787842,
"learning_rate": 0.0002,
"loss": 0.6398,
"step": 76
},
{
"epoch": 0.8438356164383561,
"grad_norm": 1.3536911010742188,
"learning_rate": 0.0002,
"loss": 0.5861,
"step": 77
},
{
"epoch": 0.8547945205479452,
"grad_norm": 1.5119093656539917,
"learning_rate": 0.0002,
"loss": 0.9953,
"step": 78
},
{
"epoch": 0.8657534246575342,
"grad_norm": 1.0627142190933228,
"learning_rate": 0.0002,
"loss": 0.4492,
"step": 79
},
{
"epoch": 0.8767123287671232,
"grad_norm": 1.2815035581588745,
"learning_rate": 0.0002,
"loss": 0.7471,
"step": 80
},
{
"epoch": 0.8876712328767123,
"grad_norm": 1.376985788345337,
"learning_rate": 0.0002,
"loss": 0.8526,
"step": 81
},
{
"epoch": 0.8986301369863013,
"grad_norm": 1.3588144779205322,
"learning_rate": 0.0002,
"loss": 0.7122,
"step": 82
},
{
"epoch": 0.9095890410958904,
"grad_norm": 1.378824234008789,
"learning_rate": 0.0002,
"loss": 0.8444,
"step": 83
},
{
"epoch": 0.9205479452054794,
"grad_norm": 1.5447663068771362,
"learning_rate": 0.0002,
"loss": 0.6788,
"step": 84
},
{
"epoch": 0.9315068493150684,
"grad_norm": 1.4500224590301514,
"learning_rate": 0.0002,
"loss": 0.5721,
"step": 85
},
{
"epoch": 0.9424657534246575,
"grad_norm": 1.0830070972442627,
"learning_rate": 0.0002,
"loss": 0.657,
"step": 86
},
{
"epoch": 0.9534246575342465,
"grad_norm": 1.3003672361373901,
"learning_rate": 0.0002,
"loss": 0.4806,
"step": 87
},
{
"epoch": 0.9643835616438357,
"grad_norm": 1.1137444972991943,
"learning_rate": 0.0002,
"loss": 0.7444,
"step": 88
},
{
"epoch": 0.9753424657534246,
"grad_norm": 1.2204691171646118,
"learning_rate": 0.0002,
"loss": 0.7924,
"step": 89
},
{
"epoch": 0.9863013698630136,
"grad_norm": 1.3225165605545044,
"learning_rate": 0.0002,
"loss": 0.7357,
"step": 90
},
{
"epoch": 0.9972602739726028,
"grad_norm": 1.2743207216262817,
"learning_rate": 0.0002,
"loss": 0.6903,
"step": 91
},
{
"epoch": 1.0082191780821919,
"grad_norm": 1.2072831392288208,
"learning_rate": 0.0002,
"loss": 0.4617,
"step": 92
},
{
"epoch": 1.0191780821917809,
"grad_norm": 1.0190479755401611,
"learning_rate": 0.0002,
"loss": 0.4412,
"step": 93
},
{
"epoch": 1.0301369863013699,
"grad_norm": 0.8685715198516846,
"learning_rate": 0.0002,
"loss": 0.4422,
"step": 94
},
{
"epoch": 1.0410958904109588,
"grad_norm": 0.6671916246414185,
"learning_rate": 0.0002,
"loss": 0.2872,
"step": 95
},
{
"epoch": 1.0520547945205478,
"grad_norm": 0.8552739024162292,
"learning_rate": 0.0002,
"loss": 0.2837,
"step": 96
},
{
"epoch": 1.063013698630137,
"grad_norm": 0.8662064075469971,
"learning_rate": 0.0002,
"loss": 0.2549,
"step": 97
},
{
"epoch": 1.073972602739726,
"grad_norm": 1.6159878969192505,
"learning_rate": 0.0002,
"loss": 0.4545,
"step": 98
},
{
"epoch": 1.084931506849315,
"grad_norm": 1.0922621488571167,
"learning_rate": 0.0002,
"loss": 0.2703,
"step": 99
},
{
"epoch": 1.095890410958904,
"grad_norm": 0.9011418223381042,
"learning_rate": 0.0002,
"loss": 0.1948,
"step": 100
},
{
"epoch": 1.106849315068493,
"grad_norm": 1.094281554222107,
"learning_rate": 0.0002,
"loss": 0.2967,
"step": 101
},
{
"epoch": 1.1178082191780823,
"grad_norm": 0.9296566843986511,
"learning_rate": 0.0002,
"loss": 0.2372,
"step": 102
},
{
"epoch": 1.1287671232876713,
"grad_norm": 1.2015409469604492,
"learning_rate": 0.0002,
"loss": 0.2724,
"step": 103
},
{
"epoch": 1.1397260273972603,
"grad_norm": 1.0707019567489624,
"learning_rate": 0.0002,
"loss": 0.242,
"step": 104
},
{
"epoch": 1.1506849315068493,
"grad_norm": 1.381605863571167,
"learning_rate": 0.0002,
"loss": 0.4983,
"step": 105
},
{
"epoch": 1.1616438356164385,
"grad_norm": 1.3150050640106201,
"learning_rate": 0.0002,
"loss": 0.3801,
"step": 106
},
{
"epoch": 1.1726027397260275,
"grad_norm": 1.2527716159820557,
"learning_rate": 0.0002,
"loss": 0.3798,
"step": 107
},
{
"epoch": 1.1835616438356165,
"grad_norm": 1.2365212440490723,
"learning_rate": 0.0002,
"loss": 0.2736,
"step": 108
},
{
"epoch": 1.1945205479452055,
"grad_norm": 1.1183747053146362,
"learning_rate": 0.0002,
"loss": 0.4753,
"step": 109
},
{
"epoch": 1.2054794520547945,
"grad_norm": 0.8566204905509949,
"learning_rate": 0.0002,
"loss": 0.2531,
"step": 110
},
{
"epoch": 1.2164383561643834,
"grad_norm": 1.0663121938705444,
"learning_rate": 0.0002,
"loss": 0.1986,
"step": 111
},
{
"epoch": 1.2273972602739727,
"grad_norm": 1.4607142210006714,
"learning_rate": 0.0002,
"loss": 0.3589,
"step": 112
},
{
"epoch": 1.2383561643835617,
"grad_norm": 0.7903380990028381,
"learning_rate": 0.0002,
"loss": 0.1885,
"step": 113
},
{
"epoch": 1.2493150684931507,
"grad_norm": 1.3529448509216309,
"learning_rate": 0.0002,
"loss": 0.2417,
"step": 114
},
{
"epoch": 1.2602739726027397,
"grad_norm": 1.0445804595947266,
"learning_rate": 0.0002,
"loss": 0.2208,
"step": 115
},
{
"epoch": 1.2712328767123289,
"grad_norm": 1.0864062309265137,
"learning_rate": 0.0002,
"loss": 0.2603,
"step": 116
},
{
"epoch": 1.2821917808219179,
"grad_norm": 1.0503292083740234,
"learning_rate": 0.0002,
"loss": 0.1478,
"step": 117
},
{
"epoch": 1.2931506849315069,
"grad_norm": 1.4396042823791504,
"learning_rate": 0.0002,
"loss": 0.2482,
"step": 118
},
{
"epoch": 1.3041095890410959,
"grad_norm": 1.7265571355819702,
"learning_rate": 0.0002,
"loss": 0.3195,
"step": 119
},
{
"epoch": 1.3150684931506849,
"grad_norm": 1.2890552282333374,
"learning_rate": 0.0002,
"loss": 0.1891,
"step": 120
},
{
"epoch": 1.3260273972602739,
"grad_norm": 1.25291109085083,
"learning_rate": 0.0002,
"loss": 0.272,
"step": 121
},
{
"epoch": 1.336986301369863,
"grad_norm": 1.3044368028640747,
"learning_rate": 0.0002,
"loss": 0.3068,
"step": 122
},
{
"epoch": 1.347945205479452,
"grad_norm": 1.7130950689315796,
"learning_rate": 0.0002,
"loss": 0.5022,
"step": 123
},
{
"epoch": 1.358904109589041,
"grad_norm": 2.3856253623962402,
"learning_rate": 0.0002,
"loss": 0.2692,
"step": 124
},
{
"epoch": 1.36986301369863,
"grad_norm": 1.2418773174285889,
"learning_rate": 0.0002,
"loss": 0.3586,
"step": 125
},
{
"epoch": 1.3808219178082193,
"grad_norm": 1.4788987636566162,
"learning_rate": 0.0002,
"loss": 0.3331,
"step": 126
},
{
"epoch": 1.3917808219178083,
"grad_norm": 0.8837617635726929,
"learning_rate": 0.0002,
"loss": 0.2599,
"step": 127
},
{
"epoch": 1.4027397260273973,
"grad_norm": 1.1440480947494507,
"learning_rate": 0.0002,
"loss": 0.4741,
"step": 128
},
{
"epoch": 1.4136986301369863,
"grad_norm": 0.924139142036438,
"learning_rate": 0.0002,
"loss": 0.3046,
"step": 129
},
{
"epoch": 1.4246575342465753,
"grad_norm": 1.0871144533157349,
"learning_rate": 0.0002,
"loss": 0.2887,
"step": 130
},
{
"epoch": 1.4356164383561643,
"grad_norm": 0.9994255304336548,
"learning_rate": 0.0002,
"loss": 0.2292,
"step": 131
},
{
"epoch": 1.4465753424657535,
"grad_norm": 1.2388752698898315,
"learning_rate": 0.0002,
"loss": 0.2912,
"step": 132
},
{
"epoch": 1.4575342465753425,
"grad_norm": 1.0453673601150513,
"learning_rate": 0.0002,
"loss": 0.2324,
"step": 133
},
{
"epoch": 1.4684931506849315,
"grad_norm": 1.558586597442627,
"learning_rate": 0.0002,
"loss": 0.3854,
"step": 134
},
{
"epoch": 1.4794520547945205,
"grad_norm": 1.2428361177444458,
"learning_rate": 0.0002,
"loss": 0.2,
"step": 135
},
{
"epoch": 1.4904109589041097,
"grad_norm": 1.2706862688064575,
"learning_rate": 0.0002,
"loss": 0.1539,
"step": 136
},
{
"epoch": 1.5013698630136987,
"grad_norm": 1.4815326929092407,
"learning_rate": 0.0002,
"loss": 0.4198,
"step": 137
},
{
"epoch": 1.5123287671232877,
"grad_norm": 1.3065235614776611,
"learning_rate": 0.0002,
"loss": 0.2996,
"step": 138
},
{
"epoch": 1.5232876712328767,
"grad_norm": 1.1650217771530151,
"learning_rate": 0.0002,
"loss": 0.2343,
"step": 139
},
{
"epoch": 1.5342465753424657,
"grad_norm": 2.339799165725708,
"learning_rate": 0.0002,
"loss": 0.3193,
"step": 140
},
{
"epoch": 1.5452054794520547,
"grad_norm": 1.2828121185302734,
"learning_rate": 0.0002,
"loss": 0.3996,
"step": 141
},
{
"epoch": 1.5561643835616439,
"grad_norm": 1.0856819152832031,
"learning_rate": 0.0002,
"loss": 0.3242,
"step": 142
},
{
"epoch": 1.5671232876712329,
"grad_norm": 1.0250024795532227,
"learning_rate": 0.0002,
"loss": 0.23,
"step": 143
},
{
"epoch": 1.5780821917808219,
"grad_norm": 0.9548241496086121,
"learning_rate": 0.0002,
"loss": 0.1995,
"step": 144
},
{
"epoch": 1.589041095890411,
"grad_norm": 0.966123104095459,
"learning_rate": 0.0002,
"loss": 0.3443,
"step": 145
},
{
"epoch": 1.6,
"grad_norm": 1.8860892057418823,
"learning_rate": 0.0002,
"loss": 0.3481,
"step": 146
},
{
"epoch": 1.610958904109589,
"grad_norm": 1.1538076400756836,
"learning_rate": 0.0002,
"loss": 0.2511,
"step": 147
},
{
"epoch": 1.621917808219178,
"grad_norm": 1.4117934703826904,
"learning_rate": 0.0002,
"loss": 0.3807,
"step": 148
},
{
"epoch": 1.632876712328767,
"grad_norm": 1.4486627578735352,
"learning_rate": 0.0002,
"loss": 0.2264,
"step": 149
},
{
"epoch": 1.643835616438356,
"grad_norm": 0.643312931060791,
"learning_rate": 0.0002,
"loss": 0.0966,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 364,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.76287234956329e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}