Training in progress, step 50, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/adapter_config.json +6 -6
last-checkpoint/adapter_model.safetensors +2 -2
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +227 -576
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -16,17 +16,17 @@
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r": 8,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "k_proj",
-    "up_proj",
-    "v_proj",
     "down_proj",
     "gate_proj",
-    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
+  "r": 32,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "down_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
     "gate_proj",
+    "up_proj",
+    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1f2c54b3cff7229bba3a337321576ca3fbedcde46f10b6c700245830c01cb495
-size 80013120

 version https://git-lfs.github.com/spec/v1
+oid sha256:43756c87d3c353cd42eeb6185a59e755a36f9b382d2d1141fdd67b110a8adbc4
+size 319876032

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:79bc646c7471cb3943c0b5456f615d091883e21fcd695a7c0aa6311ff2dd361a
-size 41119636

 version https://git-lfs.github.com/spec/v1
+oid sha256:e2df7d962df4e50580d38c384c9f5830d70e0b87aebfd1ee3eca672bf2f43f95
+size 639908666

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5da8984c55f90689ec5dc6254808c095ed22f24233bafba7be5034f696b9c85
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:58ae2ba232b823e0685960e3c048b41a588f00aea0d1bea73208b1f786b5c9f9
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9299ec7d0989f843c66221f6a5f12c76f22cfda8e3a2897dd9a527db5b37854
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:f48d04a21be75a42496761f8a6d10bd6bbb09a3805770c41b54fa6f987df24ff
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,756 +1,407 @@
 {
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 0.2,
   "eval_steps": 50,
-  "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.002,
-      "grad_norm": 0.3986969590187073,
-      "learning_rate": 0.0001,
-      "loss": 2.7769,
       "step": 1
     },
     {
-      "epoch": 0.002,
       "eval_loss": 3.0125324726104736,
-      "eval_runtime": 4.8013,
-      "eval_samples_per_second": 4.374,
-      "eval_steps_per_second": 4.374,
       "step": 1
     },
     {
-      "epoch": 0.004,
-      "grad_norm": 0.5986809730529785,
-      "learning_rate": 0.0002,
-      "loss": 2.9521,
       "step": 2
     },
     {
-      "epoch": 0.006,
-      "grad_norm": 0.595142662525177,
-      "learning_rate": 0.0003,
-      "loss": 2.955,
       "step": 3
     },
     {
-      "epoch": 0.008,
-      "grad_norm": 0.7013932466506958,
-      "learning_rate": 0.0004,
-      "loss": 2.9037,
       "step": 4
     },
     {
-      "epoch": 0.01,
-      "grad_norm": 1.5847638845443726,
-      "learning_rate": 0.0005,
-      "loss": 2.9706,
       "step": 5
     },
     {
-      "epoch": 0.012,
-      "grad_norm": 1.6309813261032104,
-      "learning_rate": 0.0006,
-      "loss": 2.75,
       "step": 6
     },
     {
-      "epoch": 0.014,
-      "grad_norm": 1.3442208766937256,
-      "learning_rate": 0.0007,
-      "loss": 2.5161,
       "step": 7
     },
     {
-      "epoch": 0.016,
-      "grad_norm": 0.900488018989563,
-      "learning_rate": 0.0008,
-      "loss": 2.2906,
       "step": 8
     },
     {
-      "epoch": 0.018,
-      "grad_norm": 2.340869903564453,
-      "learning_rate": 0.0009000000000000001,
-      "loss": 2.6079,
       "step": 9
     },
     {
-      "epoch": 0.02,
-      "grad_norm": 2.987302303314209,
-      "learning_rate": 0.001,
-      "loss": 2.5506,
       "step": 10
     },
     {
-      "epoch": 0.022,
-      "grad_norm": 1.844685673713684,
-      "learning_rate": 0.0009996954135095479,
-      "loss": 2.7146,
       "step": 11
     },
     {
-      "epoch": 0.024,
-      "grad_norm": 0.9662850499153137,
-      "learning_rate": 0.0009987820251299122,
-      "loss": 2.6323,
       "step": 12
     },
     {
-      "epoch": 0.026,
-      "grad_norm": 3.0721042156219482,
-      "learning_rate": 0.0009972609476841367,
-      "loss": 2.1718,
       "step": 13
     },
     {
-      "epoch": 0.028,
-      "grad_norm": 1.0009405612945557,
-      "learning_rate": 0.0009951340343707852,
-      "loss": 2.6348,
       "step": 14
     },
     {
-      "epoch": 0.03,
-      "grad_norm": 14.435264587402344,
-      "learning_rate": 0.000992403876506104,
-      "loss": 2.5352,
       "step": 15
     },
     {
-      "epoch": 0.032,
-      "grad_norm": 5.060039520263672,
-      "learning_rate": 0.0009890738003669028,
-      "loss": 2.708,
       "step": 16
     },
     {
-      "epoch": 0.034,
-      "grad_norm": 1.6351608037948608,
-      "learning_rate": 0.0009851478631379982,
-      "loss": 2.3905,
       "step": 17
     },
     {
-      "epoch": 0.036,
-      "grad_norm": 2.9582386016845703,
-      "learning_rate": 0.0009806308479691594,
-      "loss": 2.5147,
       "step": 18
     },
     {
-      "epoch": 0.038,
-      "grad_norm": 1.8205921649932861,
-      "learning_rate": 0.0009755282581475768,
-      "loss": 2.766,
       "step": 19
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 1.1158825159072876,
-      "learning_rate": 0.0009698463103929542,
-      "loss": 2.7895,
       "step": 20
     },
     {
-      "epoch": 0.042,
-      "grad_norm": 1.1689060926437378,
-      "learning_rate": 0.0009635919272833937,
-      "loss": 2.6373,
       "step": 21
     },
     {
-      "epoch": 0.044,
-      "grad_norm": 0.8205438256263733,
-      "learning_rate": 0.0009567727288213005,
-      "loss": 2.4038,
       "step": 22
     },
     {
-      "epoch": 0.046,
-      "grad_norm": 1.2794568538665771,
-      "learning_rate": 0.0009493970231495835,
-      "loss": 2.3676,
       "step": 23
     },
     {
-      "epoch": 0.048,
-      "grad_norm": 0.822256863117218,
-      "learning_rate": 0.0009414737964294635,
-      "loss": 2.327,
       "step": 24
     },
     {
-      "epoch": 0.05,
-      "grad_norm": 1.986864447593689,
-      "learning_rate": 0.0009330127018922195,
-      "loss": 2.4431,
       "step": 25
     },
     {
-      "epoch": 0.052,
-      "grad_norm": 3.7959301471710205,
-      "learning_rate": 0.0009240240480782129,
-      "loss": 2.6657,
       "step": 26
     },
     {
-      "epoch": 0.054,
-      "grad_norm": 2.489267587661743,
-      "learning_rate": 0.0009145187862775209,
-      "loss": 2.5005,
       "step": 27
     },
     {
-      "epoch": 0.056,
-      "grad_norm": 2.1583516597747803,
-      "learning_rate": 0.0009045084971874737,
-      "loss": 2.5402,
       "step": 28
     },
     {
-      "epoch": 0.058,
-      "grad_norm": 4.524465084075928,
-      "learning_rate": 0.0008940053768033609,
-      "loss": 2.2461,
       "step": 29
     },
     {
-      "epoch": 0.06,
-      "grad_norm": 1.3595800399780273,
-      "learning_rate": 0.000883022221559489,
-      "loss": 2.331,
       "step": 30
     },
     {
-      "epoch": 0.062,
-      "grad_norm": 0.9844056367874146,
-      "learning_rate": 0.0008715724127386971,
-      "loss": 2.3781,
       "step": 31
     },
     {
-      "epoch": 0.064,
-      "grad_norm": 1.117148518562317,
-      "learning_rate": 0.0008596699001693256,
-      "loss": 2.4258,
       "step": 32
     },
     {
-      "epoch": 0.066,
-      "grad_norm": 0.7900739312171936,
-      "learning_rate": 0.0008473291852294987,
-      "loss": 2.437,
       "step": 33
     },
     {
-      "epoch": 0.068,
-      "grad_norm": 0.8672456741333008,
-      "learning_rate": 0.0008345653031794292,
-      "loss": 2.8025,
       "step": 34
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 0.816504716873169,
-      "learning_rate": 0.0008213938048432696,
-      "loss": 2.5078,
       "step": 35
     },
     {
-      "epoch": 0.072,
-      "grad_norm": 1.0574641227722168,
-      "learning_rate": 0.0008078307376628291,
-      "loss": 2.6408,
       "step": 36
     },
     {
-      "epoch": 0.074,
-      "grad_norm": 0.6753240823745728,
-      "learning_rate": 0.0007938926261462366,
-      "loss": 2.2858,
       "step": 37
     },
     {
-      "epoch": 0.076,
-      "grad_norm": 0.9166250824928284,
-      "learning_rate": 0.0007795964517353734,
-      "loss": 2.7091,
       "step": 38
     },
     {
-      "epoch": 0.078,
-      "grad_norm": 0.9022424221038818,
-      "learning_rate": 0.0007649596321166025,
-      "loss": 2.6459,
       "step": 39
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 0.7723848223686218,
-      "learning_rate": 0.00075,
-      "loss": 2.4329,
       "step": 40
     },
     {
-      "epoch": 0.082,
-      "grad_norm": 0.8669672012329102,
-      "learning_rate": 0.0007347357813929454,
-      "loss": 2.3661,
       "step": 41
     },
     {
-      "epoch": 0.084,
-      "grad_norm": 0.9701873660087585,
-      "learning_rate": 0.0007191855733945387,
-      "loss": 2.6723,
       "step": 42
     },
     {
-      "epoch": 0.086,
-      "grad_norm": 0.8038893342018127,
-      "learning_rate": 0.0007033683215379002,
-      "loss": 2.7652,
       "step": 43
     },
     {
-      "epoch": 0.088,
-      "grad_norm": 0.6812747716903687,
-      "learning_rate": 0.0006873032967079561,
-      "loss": 2.4019,
       "step": 44
     },
     {
-      "epoch": 0.09,
-      "grad_norm": 0.8909493088722229,
-      "learning_rate": 0.0006710100716628344,
-      "loss": 2.349,
       "step": 45
     },
     {
-      "epoch": 0.092,
-      "grad_norm": 0.9887206554412842,
-      "learning_rate": 0.0006545084971874737,
-      "loss": 2.5577,
       "step": 46
     },
     {
-      "epoch": 0.094,
-      "grad_norm": 0.7749077081680298,
-      "learning_rate": 0.0006378186779084996,
-      "loss": 2.2903,
       "step": 47
     },
     {
-      "epoch": 0.096,
-      "grad_norm": 1.0913500785827637,
-      "learning_rate": 0.0006209609477998338,
-      "loss": 2.3697,
       "step": 48
     },
     {
-      "epoch": 0.098,
-      "grad_norm": 0.894119381904602,
-      "learning_rate": 0.0006039558454088796,
-      "loss": 2.5167,
       "step": 49
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 1.159035325050354,
-      "learning_rate": 0.0005868240888334653,
-      "loss": 2.4637,
       "step": 50
     },
     {
-      "epoch": 0.1,
-      "eval_loss": 2.5838444232940674,
-      "eval_runtime": 4.8707,
-      "eval_samples_per_second": 4.311,
-      "eval_steps_per_second": 4.311,
       "step": 50
-    },
-    {
-      "epoch": 0.102,
-      "grad_norm": 0.6844251751899719,
-      "learning_rate": 0.0005695865504800327,
-      "loss": 2.4118,
-      "step": 51
-    },
-    {
-      "epoch": 0.104,
-      "grad_norm": 1.1709848642349243,
-      "learning_rate": 0.0005522642316338268,
-      "loss": 2.444,
-      "step": 52
-    },
-    {
-      "epoch": 0.106,
-      "grad_norm": 0.9435467720031738,
-      "learning_rate": 0.0005348782368720626,
-      "loss": 2.5568,
-      "step": 53
-    },
-    {
-      "epoch": 0.108,
-      "grad_norm": 1.0800719261169434,
-      "learning_rate": 0.0005174497483512506,
-      "loss": 2.5766,
-      "step": 54
-    },
-    {
-      "epoch": 0.11,
-      "grad_norm": 1.001356840133667,
-      "learning_rate": 0.0005,
-      "loss": 2.2205,
-      "step": 55
-    },
-    {
-      "epoch": 0.112,
-      "grad_norm": 1.4582829475402832,
-      "learning_rate": 0.0004825502516487497,
-      "loss": 2.7271,
-      "step": 56
-    },
-    {
-      "epoch": 0.114,
-      "grad_norm": 0.8312236666679382,
-      "learning_rate": 0.00046512176312793734,
-      "loss": 2.3204,
-      "step": 57
-    },
-    {
-      "epoch": 0.116,
-      "grad_norm": 1.2127161026000977,
-      "learning_rate": 0.00044773576836617336,
-      "loss": 2.0169,
-      "step": 58
-    },
-    {
-      "epoch": 0.118,
-      "grad_norm": 1.6428215503692627,
-      "learning_rate": 0.0004304134495199674,
-      "loss": 2.4521,
-      "step": 59
-    },
-    {
-      "epoch": 0.12,
-      "grad_norm": 1.7682443857192993,
-      "learning_rate": 0.00041317591116653486,
-      "loss": 2.6753,
-      "step": 60
-    },
-    {
-      "epoch": 0.122,
-      "grad_norm": 1.0919681787490845,
-      "learning_rate": 0.0003960441545911204,
-      "loss": 2.4022,
-      "step": 61
-    },
-    {
-      "epoch": 0.124,
-      "grad_norm": 2.5304136276245117,
-      "learning_rate": 0.0003790390522001662,
-      "loss": 2.4325,
-      "step": 62
-    },
-    {
-      "epoch": 0.126,
-      "grad_norm": 1.1737953424453735,
-      "learning_rate": 0.00036218132209150044,
-      "loss": 2.2653,
-      "step": 63
-    },
-    {
-      "epoch": 0.128,
-      "grad_norm": 0.7943472862243652,
-      "learning_rate": 0.00034549150281252633,
-      "loss": 2.6079,
-      "step": 64
-    },
-    {
-      "epoch": 0.13,
-      "grad_norm": 1.3269349336624146,
-      "learning_rate": 0.0003289899283371657,
-      "loss": 2.3745,
-      "step": 65
-    },
-    {
-      "epoch": 0.132,
-      "grad_norm": 0.8898394107818604,
-      "learning_rate": 0.00031269670329204396,
-      "loss": 2.3862,
-      "step": 66
-    },
-    {
-      "epoch": 0.134,
-      "grad_norm": 0.8309778571128845,
-      "learning_rate": 0.0002966316784621,
-      "loss": 2.5131,
-      "step": 67
-    },
-    {
-      "epoch": 0.136,
-      "grad_norm": 1.2103646993637085,
-      "learning_rate": 0.00028081442660546124,
-      "loss": 2.5138,
-      "step": 68
-    },
-    {
-      "epoch": 0.138,
-      "grad_norm": 0.9281813502311707,
-      "learning_rate": 0.00026526421860705474,
-      "loss": 2.5798,
-      "step": 69
-    },
-    {
-      "epoch": 0.14,
-      "grad_norm": 0.8275775909423828,
-      "learning_rate": 0.0002500000000000001,
-      "loss": 2.5348,
-      "step": 70
-    },
-    {
-      "epoch": 0.142,
-      "grad_norm": 1.5009329319000244,
-      "learning_rate": 0.0002350403678833976,
-      "loss": 2.5156,
-      "step": 71
-    },
-    {
-      "epoch": 0.144,
-      "grad_norm": 1.4796998500823975,
-      "learning_rate": 0.00022040354826462666,
-      "loss": 2.3567,
-      "step": 72
-    },
-    {
-      "epoch": 0.146,
-      "grad_norm": 0.7437081933021545,
-      "learning_rate": 0.00020610737385376348,
-      "loss": 2.4399,
-      "step": 73
-    },
-    {
-      "epoch": 0.148,
-      "grad_norm": 0.7033576369285583,
-      "learning_rate": 0.00019216926233717085,
-      "loss": 2.3149,
-      "step": 74
-    },
-    {
-      "epoch": 0.15,
-      "grad_norm": 0.9651651978492737,
-      "learning_rate": 0.0001786061951567303,
-      "loss": 2.5816,
-      "step": 75
-    },
-    {
-      "epoch": 0.152,
-      "grad_norm": 1.0059478282928467,
-      "learning_rate": 0.00016543469682057105,
-      "loss": 2.6395,
-      "step": 76
-    },
-    {
-      "epoch": 0.154,
-      "grad_norm": 1.6795697212219238,
-      "learning_rate": 0.00015267081477050133,
-      "loss": 2.3551,
-      "step": 77
-    },
-    {
-      "epoch": 0.156,
-      "grad_norm": 0.7962441444396973,
-      "learning_rate": 0.00014033009983067452,
-      "loss": 2.2151,
-      "step": 78
-    },
-    {
-      "epoch": 0.158,
-      "grad_norm": 0.880089282989502,
-      "learning_rate": 0.00012842758726130281,
-      "loss": 2.4376,
-      "step": 79
-    },
-    {
-      "epoch": 0.16,
-      "grad_norm": 1.0629572868347168,
-      "learning_rate": 0.00011697777844051105,
-      "loss": 2.6063,
-      "step": 80
-    },
-    {
-      "epoch": 0.162,
-      "grad_norm": 0.8691402077674866,
-      "learning_rate": 0.00010599462319663906,
-      "loss": 2.4764,
-      "step": 81
-    },
-    {
-      "epoch": 0.164,
-      "grad_norm": 0.8258126378059387,
-      "learning_rate": 9.549150281252633e-05,
-      "loss": 2.3996,
-      "step": 82
-    },
-    {
-      "epoch": 0.166,
-      "grad_norm": 2.253006935119629,
-      "learning_rate": 8.548121372247918e-05,
-      "loss": 2.7106,
-      "step": 83
-    },
-    {
-      "epoch": 0.168,
-      "grad_norm": 0.9351361393928528,
-      "learning_rate": 7.597595192178702e-05,
-      "loss": 2.3613,
-      "step": 84
-    },
-    {
-      "epoch": 0.17,
-      "grad_norm": 0.8624694347381592,
-      "learning_rate": 6.698729810778065e-05,
-      "loss": 2.4328,
-      "step": 85
-    },
-    {
-      "epoch": 0.172,
-      "grad_norm": 0.6949071884155273,
-      "learning_rate": 5.852620357053651e-05,
-      "loss": 2.4157,
-      "step": 86
-    },
-    {
-      "epoch": 0.174,
-      "grad_norm": 0.7830259203910828,
-      "learning_rate": 5.060297685041659e-05,
-      "loss": 2.2797,
-      "step": 87
-    },
-    {
-      "epoch": 0.176,
-      "grad_norm": 1.3727121353149414,
-      "learning_rate": 4.322727117869951e-05,
-      "loss": 2.6155,
-      "step": 88
-    },
-    {
-      "epoch": 0.178,
-      "grad_norm": 0.6731472611427307,
-      "learning_rate": 3.6408072716606344e-05,
-      "loss": 2.4149,
-      "step": 89
-    },
-    {
-      "epoch": 0.18,
-      "grad_norm": 0.846976101398468,
-      "learning_rate": 3.0153689607045842e-05,
-      "loss": 2.3137,
-      "step": 90
-    },
-    {
-      "epoch": 0.182,
-      "grad_norm": 0.9294453859329224,
-      "learning_rate": 2.4471741852423235e-05,
-      "loss": 2.5798,
-      "step": 91
-    },
-    {
-      "epoch": 0.184,
-      "grad_norm": 0.766918957233429,
-      "learning_rate": 1.9369152030840554e-05,
-      "loss": 2.6766,
-      "step": 92
-    },
-    {
-      "epoch": 0.186,
-      "grad_norm": 1.3079534769058228,
-      "learning_rate": 1.4852136862001764e-05,
-      "loss": 2.6047,
-      "step": 93
-    },
-    {
-      "epoch": 0.188,
-      "grad_norm": 1.1351994276046753,
-      "learning_rate": 1.0926199633097156e-05,
-      "loss": 2.6034,
-      "step": 94
-    },
-    {
-      "epoch": 0.19,
-      "grad_norm": 0.8010856509208679,
-      "learning_rate": 7.59612349389599e-06,
-      "loss": 2.2994,
-      "step": 95
-    },
-    {
-      "epoch": 0.192,
-      "grad_norm": 0.9184717535972595,
-      "learning_rate": 4.865965629214819e-06,
-      "loss": 2.5489,
-      "step": 96
-    },
-    {
-      "epoch": 0.194,
-      "grad_norm": 0.9543655514717102,
-      "learning_rate": 2.739052315863355e-06,
-      "loss": 2.5186,
-      "step": 97
-    },
-    {
-      "epoch": 0.196,
-      "grad_norm": 0.9216803908348083,
-      "learning_rate": 1.2179748700879012e-06,
-      "loss": 2.5627,
-      "step": 98
-    },
-    {
-      "epoch": 0.198,
-      "grad_norm": 0.8810911178588867,
-      "learning_rate": 3.0458649045211895e-07,
-      "loss": 2.6527,
-      "step": 99
-    },
-    {
-      "epoch": 0.2,
-      "grad_norm": 0.7426478266716003,
-      "learning_rate": 0.0,
-      "loss": 2.1737,
-      "step": 100
-    },
-    {
-      "epoch": 0.2,
-      "eval_loss": 2.527949094772339,
-      "eval_runtime": 4.9855,
-      "eval_samples_per_second": 4.212,
-      "eval_steps_per_second": 4.212,
-      "step": 100
     }
   ],
   "logging_steps": 1,
-  "max_steps": 100,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
-  "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
         "should_epoch_stop": false,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 1.62874924204032e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 2.580864906311035,
+  "best_model_checkpoint": "miner_id_24/checkpoint-50",
+  "epoch": 0.05,
   "eval_steps": 50,
+  "global_step": 50,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.001,
+      "grad_norm": 0.2811749279499054,
+      "learning_rate": 2e-05,
+      "loss": 2.6895,
       "step": 1
     },
     {
+      "epoch": 0.001,
       "eval_loss": 3.0125324726104736,
+      "eval_runtime": 4.6936,
+      "eval_samples_per_second": 4.474,
+      "eval_steps_per_second": 4.474,
       "step": 1
     },
     {
+      "epoch": 0.002,
+      "grad_norm": 0.2987586557865143,
+      "learning_rate": 4e-05,
+      "loss": 2.8912,
       "step": 2
     },
     {
+      "epoch": 0.003,
+      "grad_norm": 0.452608197927475,
+      "learning_rate": 6e-05,
+      "loss": 3.4357,
       "step": 3
     },
     {
+      "epoch": 0.004,
+      "grad_norm": 0.38785919547080994,
+      "learning_rate": 8e-05,
+      "loss": 2.5889,
       "step": 4
     },
     {
+      "epoch": 0.005,
+      "grad_norm": 0.38931822776794434,
+      "learning_rate": 0.0001,
+      "loss": 2.7513,
       "step": 5
     },
     {
+      "epoch": 0.006,
+      "grad_norm": 0.516417384147644,
+      "learning_rate": 0.00012,
+      "loss": 3.2128,
       "step": 6
     },
     {
+      "epoch": 0.007,
+      "grad_norm": 0.4206741750240326,
+      "learning_rate": 0.00014,
+      "loss": 2.9368,
       "step": 7
     },
     {
+      "epoch": 0.008,
+      "grad_norm": 0.48171964287757874,
+      "learning_rate": 0.00016,
+      "loss": 2.8618,
       "step": 8
     },
     {
+      "epoch": 0.009,
+      "grad_norm": 0.8544142842292786,
+      "learning_rate": 0.00018,
+      "loss": 3.0312,
       "step": 9
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 0.848558247089386,
+      "learning_rate": 0.0002,
+      "loss": 2.9334,
       "step": 10
     },
     {
+      "epoch": 0.011,
+      "grad_norm": 0.8914313316345215,
+      "learning_rate": 0.00019999996900269505,
+      "loss": 2.7981,
       "step": 11
     },
     {
+      "epoch": 0.012,
+      "grad_norm": 0.6103464365005493,
+      "learning_rate": 0.0001999998760107994,
+      "loss": 2.7247,
       "step": 12
     },
     {
+      "epoch": 0.013,
+      "grad_norm": 0.7618600726127625,
+      "learning_rate": 0.00019999972102437074,
+      "loss": 2.472,
       "step": 13
     },
     {
+      "epoch": 0.014,
+      "grad_norm": 0.6825264692306519,
+      "learning_rate": 0.00019999950404350512,
+      "loss": 2.6008,
       "step": 14
     },
     {
+      "epoch": 0.015,
+      "grad_norm": 0.5940832495689392,
+      "learning_rate": 0.00019999922506833704,
+      "loss": 2.1996,
       "step": 15
     },
     {
+      "epoch": 0.016,
+      "grad_norm": 0.6273623108863831,
+      "learning_rate": 0.00019999888409903948,
+      "loss": 2.3565,
       "step": 16
     },
     {
+      "epoch": 0.017,
+      "grad_norm": 0.7437952160835266,
+      "learning_rate": 0.00019999848113582384,
+      "loss": 2.7232,
       "step": 17
     },
     {
+      "epoch": 0.018,
+      "grad_norm": 0.5971533060073853,
+      "learning_rate": 0.0001999980161789399,
+      "loss": 2.509,
       "step": 18
     },
     {
+      "epoch": 0.019,
+      "grad_norm": 0.5190719962120056,
+      "learning_rate": 0.00019999748922867592,
+      "loss": 2.3535,
       "step": 19
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 0.9244285821914673,
+      "learning_rate": 0.00019999690028535855,
+      "loss": 2.7599,
       "step": 20
     },
     {
+      "epoch": 0.021,
+      "grad_norm": 0.8340674638748169,
+      "learning_rate": 0.00019999624934935296,
+      "loss": 3.0057,
       "step": 21
     },
     {
+      "epoch": 0.022,
+      "grad_norm": 1.0633089542388916,
+      "learning_rate": 0.00019999553642106266,
+      "loss": 2.2808,
       "step": 22
     },
     {
+      "epoch": 0.023,
+      "grad_norm": 4.8767266273498535,
+      "learning_rate": 0.00019999476150092967,
+      "loss": 2.8268,
       "step": 23
     },
     {
+      "epoch": 0.024,
+      "grad_norm": 2.7197344303131104,
+      "learning_rate": 0.00019999392458943432,
+      "loss": 2.6517,
       "step": 24
     },
     {
+      "epoch": 0.025,
+      "grad_norm": 0.9329593777656555,
+      "learning_rate": 0.00019999302568709547,
+      "loss": 2.212,
       "step": 25
     },
     {
+      "epoch": 0.026,
+      "grad_norm": 0.6679103374481201,
+      "learning_rate": 0.00019999206479447045,
+      "loss": 2.0117,
       "step": 26
     },
     {
+      "epoch": 0.027,
+      "grad_norm": 0.5428286790847778,
+      "learning_rate": 0.00019999104191215493,
+      "loss": 2.7582,
       "step": 27
     },
     {
+      "epoch": 0.028,
+      "grad_norm": 0.5552177429199219,
+      "learning_rate": 0.00019998995704078305,
+      "loss": 2.54,
       "step": 28
     },
     {
+      "epoch": 0.029,
+      "grad_norm": 0.5453671216964722,
+      "learning_rate": 0.00019998881018102737,
+      "loss": 2.5358,
       "step": 29
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 0.47653189301490784,
+      "learning_rate": 0.00019998760133359885,
+      "loss": 2.2443,
       "step": 30
     },
     {
+      "epoch": 0.031,
+      "grad_norm": 0.755976140499115,
+      "learning_rate": 0.0001999863304992469,
+      "loss": 2.5519,
       "step": 31
     },
     {
+      "epoch": 0.032,
+      "grad_norm": 0.7680912017822266,
+      "learning_rate": 0.00019998499767875943,
+      "loss": 2.7503,
       "step": 32
     },
     {
+      "epoch": 0.033,
+      "grad_norm": 3.768080472946167,
+      "learning_rate": 0.0001999836028729627,
+      "loss": 2.6051,
       "step": 33
     },
     {
+      "epoch": 0.034,
+      "grad_norm": 0.5304062962532043,
+      "learning_rate": 0.00019998214608272136,
+      "loss": 2.2065,
       "step": 34
     },
     {
+      "epoch": 0.035,
+      "grad_norm": 1.1568998098373413,
+      "learning_rate": 0.00019998062730893862,
+      "loss": 2.444,
       "step": 35
     },
     {
+      "epoch": 0.036,
+      "grad_norm": 0.8356309533119202,
+      "learning_rate": 0.000199979046552556,
+      "loss": 2.5763,
       "step": 36
     },
     {
+      "epoch": 0.037,
+      "grad_norm": 0.5210471749305725,
+      "learning_rate": 0.00019997740381455346,
+      "loss": 2.8545,
       "step": 37
     },
     {
+      "epoch": 0.038,
+      "grad_norm": 1.550714373588562,
+      "learning_rate": 0.00019997569909594947,
+      "loss": 2.6236,
       "step": 38
     },
     {
+      "epoch": 0.039,
+      "grad_norm": 0.6044741868972778,
+      "learning_rate": 0.0001999739323978008,
+      "loss": 2.5349,
       "step": 39
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 0.9703565239906311,
+      "learning_rate": 0.00019997210372120274,
+      "loss": 3.1004,
       "step": 40
     },
     {
+      "epoch": 0.041,
+      "grad_norm": 0.7796650528907776,
+      "learning_rate": 0.000199970213067289,
+      "loss": 2.5757,
       "step": 41
     },
     {
+      "epoch": 0.042,
+      "grad_norm": 0.6824871301651001,
+      "learning_rate": 0.00019996826043723162,
+      "loss": 2.6766,
       "step": 42
     },
     {
+      "epoch": 0.043,
+      "grad_norm": 0.8048773407936096,
+      "learning_rate": 0.00019996624583224114,
+      "loss": 2.3065,
       "step": 43
     },
     {
+      "epoch": 0.044,
+      "grad_norm": 0.5458154082298279,
+      "learning_rate": 0.00019996416925356652,
+      "loss": 2.4336,
       "step": 44
     },
     {
+      "epoch": 0.045,
+      "grad_norm": 0.623190701007843,
+      "learning_rate": 0.00019996203070249516,
+      "loss": 2.3835,
       "step": 45
     },
     {
+      "epoch": 0.046,
+      "grad_norm": 0.5928781032562256,
+      "learning_rate": 0.00019995983018035278,
+      "loss": 2.3408,
       "step": 46
     },
     {
+      "epoch": 0.047,
+      "grad_norm": 0.5790976881980896,
+      "learning_rate": 0.00019995756768850364,
+      "loss": 2.3878,
       "step": 47
     },
     {
+      "epoch": 0.048,
+      "grad_norm": 0.5648425817489624,
+      "learning_rate": 0.00019995524322835034,
+      "loss": 2.2885,
       "step": 48
     },
     {
+      "epoch": 0.049,
+      "grad_norm": 0.526339054107666,
+      "learning_rate": 0.00019995285680133394,
+      "loss": 2.408,
       "step": 49
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 0.6333803534507751,
+      "learning_rate": 0.00019995040840893388,
+      "loss": 2.4391,
       "step": 50
     },
     {
+      "epoch": 0.05,
+      "eval_loss": 2.580864906311035,
+      "eval_runtime": 4.8038,
+      "eval_samples_per_second": 4.372,
+      "eval_steps_per_second": 4.372,
       "step": 50
     }
   ],
   "logging_steps": 1,
+  "max_steps": 4000,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 50,
   "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 2,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
     "TrainerControl": {
       "args": {
         "should_epoch_stop": false,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": false
       },
       "attributes": {}
     }
   },
+  "total_flos": 4108715871436800.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b8d9124138abd44af04b2c60a935bcab4ff5cdb3ea64e57559b87dc3f7e79065
 size 6776

 version https://git-lfs.github.com/spec/v1
+oid sha256:587385ca4b3a6c2778a0b2f3cca66b2116c2d78e99e16d5766eaff5ef6eeb893
 size 6776