Upload 17 files

Browse files

Update model, new version

Files changed (9) hide show

adapter_config.json +4 -4
adapter_model.safetensors +1 -1
model-00001-of-00003.safetensors +1 -1
model-00002-of-00003.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +318 -59
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -23,13 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "v_proj",
-    "up_proj",
-    "q_proj",
     "gate_proj",
-    "k_proj",
     "down_proj",
-    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "o_proj",
+    "k_proj",
     "v_proj",
     "gate_proj",
+    "q_proj",
     "down_proj",
+    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bbfc469fc0f359611284b9a33ffea89a4c2cd108a6411a23c605797eac90d6ef
 size 2118301632

 version https://git-lfs.github.com/spec/v1
+oid sha256:948f6262dafd5b5ddcf25e37fc4e8508aac2f4ec710c58eb57772d4c0305953b
 size 2118301632

model-00001-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e876dadb94eaf20db78c2ca1778580419eacb10c8032d42f5396d446ce18a1c2
 size 1990270808

 version https://git-lfs.github.com/spec/v1
+oid sha256:deef029ef7f888736b60df04b9783af0ff28917986016b4ea5b120b1e62f6890
 size 1990270808

model-00002-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7c2098a54b6104c08f791ffd311b2c8b264f47830750fc2f261e3f9f8efab622
 size 1006719368

 version https://git-lfs.github.com/spec/v1
+oid sha256:9e7bff07f11bee8fc75b40e42ebbe73636112a72e199a310dc6206396b829f5a
 size 1006719368

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:60ec98f4495156924b88229bdbe56f8bb1202e5a8203e76105d5d2b0acc01442
 size 34007674

 version https://git-lfs.github.com/spec/v1
+oid sha256:22e8891df7ee74875e965dcd91e7db8d2130bb7a4758e6a66f1f69a8f7b48bf8
 size 34007674

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e0f3a6090e681048a6c2caa8a2f14f616e5a16d68b188a93348c4482d6393a0c
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:5d156686ae62d2dd6e7c2996e6ec40caf5c5d66591403237deeba81bfda597c8
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3c92fc546447cfc1151596b19e46f7088e39100f9de60871cd8a0f9b1a8d1df9
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:cf3863c8948e41c40f37017b57744f08214b2c38da69d49fe98001649774bc48
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,144 +1,403 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.9993211133740665,
   "eval_steps": 500,
-  "global_step": 184,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.05431093007467753,
-      "grad_norm": 1.5025734901428223,
-      "learning_rate": 0.00019904804439875633,
-      "loss": 3.2617,
       "step": 10
     },
     {
       "epoch": 0.10862186014935506,
-      "grad_norm": 1.5902663469314575,
-      "learning_rate": 0.00019521176659107142,
-      "loss": 1.7882,
       "step": 20
     },
     {
       "epoch": 0.1629327902240326,
-      "grad_norm": 0.7277324795722961,
-      "learning_rate": 0.000188545602565321,
-      "loss": 1.1819,
       "step": 30
     },
     {
       "epoch": 0.2172437202987101,
-      "grad_norm": 0.6814110279083252,
-      "learning_rate": 0.00017924768419510904,
-      "loss": 1.0805,
       "step": 40
     },
     {
       "epoch": 0.27155465037338766,
-      "grad_norm": 0.6621173620223999,
-      "learning_rate": 0.00016759436441447545,
-      "loss": 1.0806,
       "step": 50
     },
     {
       "epoch": 0.3258655804480652,
-      "grad_norm": 0.717393696308136,
-      "learning_rate": 0.00015393200344991995,
-      "loss": 1.0157,
       "step": 60
     },
     {
       "epoch": 0.3801765105227427,
-      "grad_norm": 0.6353682279586792,
-      "learning_rate": 0.0001386666742941419,
-      "loss": 1.0388,
       "step": 70
     },
     {
       "epoch": 0.4344874405974202,
-      "grad_norm": 0.7220941185951233,
-      "learning_rate": 0.00012225209339563145,
-      "loss": 0.9547,
       "step": 80
     },
     {
       "epoch": 0.48879837067209775,
-      "grad_norm": 0.7532466650009155,
-      "learning_rate": 0.00010517613528842097,
-      "loss": 1.0116,
       "step": 90
     },
     {
       "epoch": 0.5431093007467753,
-      "grad_norm": 0.8198474645614624,
-      "learning_rate": 8.79463319744677e-05,
-      "loss": 0.9763,
       "step": 100
     },
     {
       "epoch": 0.5974202308214528,
-      "grad_norm": 0.79359370470047,
-      "learning_rate": 7.107478804634325e-05,
-      "loss": 1.0034,
       "step": 110
     },
     {
       "epoch": 0.6517311608961304,
-      "grad_norm": 1.1620718240737915,
-      "learning_rate": 5.506295990328385e-05,
-      "loss": 0.9125,
       "step": 120
     },
     {
       "epoch": 0.7060420909708078,
-      "grad_norm": 1.5092897415161133,
-      "learning_rate": 4.038675145307747e-05,
-      "loss": 0.8748,
       "step": 130
     },
     {
       "epoch": 0.7603530210454854,
-      "grad_norm": 1.0216153860092163,
-      "learning_rate": 2.7482369285662378e-05,
-      "loss": 0.9131,
       "step": 140
     },
     {
       "epoch": 0.814663951120163,
-      "grad_norm": 1.5323857069015503,
-      "learning_rate": 1.6733357731279377e-05,
-      "loss": 0.8702,
       "step": 150
     },
     {
       "epoch": 0.8689748811948405,
-      "grad_norm": 0.962648332118988,
-      "learning_rate": 8.45919914746337e-06,
-      "loss": 0.8396,
       "step": 160
     },
     {
       "epoch": 0.923285811269518,
-      "grad_norm": 0.7791914939880371,
-      "learning_rate": 2.905818257394799e-06,
-      "loss": 0.8633,
       "step": 170
     },
     {
       "epoch": 0.9775967413441955,
-      "grad_norm": 0.8390964865684509,
-      "learning_rate": 2.382727698752474e-07,
-      "loss": 0.8817,
       "step": 180
     }
   ],
   "logging_steps": 10,
-  "max_steps": 184,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -152,7 +411,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1445464568266752.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 2.988458927359131,
   "eval_steps": 500,
+  "global_step": 552,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.05431093007467753,
+      "grad_norm": 1.4870964288711548,
+      "learning_rate": 0.00019997351589651408,
+      "loss": 3.4965,
       "step": 10
     },
     {
       "epoch": 0.10862186014935506,
+      "grad_norm": 1.784044861793518,
+      "learning_rate": 0.00019967573081342103,
+      "loss": 2.065,
       "step": 20
     },
     {
       "epoch": 0.1629327902240326,
+      "grad_norm": 0.7305468916893005,
+      "learning_rate": 0.00019904804439875633,
+      "loss": 1.2421,
       "step": 30
     },
     {
       "epoch": 0.2172437202987101,
+      "grad_norm": 0.6995559930801392,
+      "learning_rate": 0.00019809253413499565,
+      "loss": 1.093,
       "step": 40
     },
     {
       "epoch": 0.27155465037338766,
+      "grad_norm": 0.6627448201179504,
+      "learning_rate": 0.00019681236251822273,
+      "loss": 1.0856,
       "step": 50
     },
     {
       "epoch": 0.3258655804480652,
+      "grad_norm": 0.7160666584968567,
+      "learning_rate": 0.00019521176659107142,
+      "loss": 1.013,
       "step": 60
     },
     {
       "epoch": 0.3801765105227427,
+      "grad_norm": 0.6306814551353455,
+      "learning_rate": 0.0001932960439191915,
+      "loss": 1.0374,
       "step": 70
     },
     {
       "epoch": 0.4344874405974202,
+      "grad_norm": 0.7758208513259888,
+      "learning_rate": 0.00019107153505765306,
+      "loss": 0.9474,
       "step": 80
     },
     {
       "epoch": 0.48879837067209775,
+      "grad_norm": 1.2394300699234009,
+      "learning_rate": 0.000188545602565321,
+      "loss": 0.9932,
       "step": 90
     },
     {
       "epoch": 0.5431093007467753,
+      "grad_norm": 0.829031229019165,
+      "learning_rate": 0.0001857266066366567,
+      "loss": 0.9204,
       "step": 100
     },
     {
       "epoch": 0.5974202308214528,
+      "grad_norm": 0.7629134654998779,
+      "learning_rate": 0.0001826238774315995,
+      "loss": 0.9457,
       "step": 110
     },
     {
       "epoch": 0.6517311608961304,
+      "grad_norm": 0.8157823085784912,
+      "learning_rate": 0.00017924768419510904,
+      "loss": 0.8539,
       "step": 120
     },
     {
       "epoch": 0.7060420909708078,
+      "grad_norm": 0.7475631237030029,
+      "learning_rate": 0.0001756092012685749,
+      "loss": 0.82,
       "step": 130
     },
     {
       "epoch": 0.7603530210454854,
+      "grad_norm": 0.6592528223991394,
+      "learning_rate": 0.000171720471105587,
+      "loss": 0.8846,
       "step": 140
     },
     {
       "epoch": 0.814663951120163,
+      "grad_norm": 0.6989027857780457,
+      "learning_rate": 0.00016759436441447545,
+      "loss": 0.8367,
       "step": 150
     },
     {
       "epoch": 0.8689748811948405,
+      "grad_norm": 0.7253873348236084,
+      "learning_rate": 0.00016324453755953773,
+      "loss": 0.8068,
       "step": 160
     },
     {
       "epoch": 0.923285811269518,
+      "grad_norm": 0.7640873193740845,
+      "learning_rate": 0.00015868538736194427,
+      "loss": 0.8169,
       "step": 170
     },
     {
       "epoch": 0.9775967413441955,
+      "grad_norm": 0.7669989466667175,
+      "learning_rate": 0.00015393200344991995,
+      "loss": 0.8355,
       "step": 180
+    },
+    {
+      "epoch": 1.0271554650373387,
+      "grad_norm": 0.7532988786697388,
+      "learning_rate": 0.0001490001183159105,
+      "loss": 0.7339,
+      "step": 190
+    },
+    {
+      "epoch": 1.0814663951120163,
+      "grad_norm": 0.7974510192871094,
+      "learning_rate": 0.0001439060552460318,
+      "loss": 0.8186,
+      "step": 200
+    },
+    {
+      "epoch": 1.1357773251866938,
+      "grad_norm": 0.9017219543457031,
+      "learning_rate": 0.0001386666742941419,
+      "loss": 0.775,
+      "step": 210
+    },
+    {
+      "epoch": 1.1900882552613714,
+      "grad_norm": 0.8205109238624573,
+      "learning_rate": 0.00013329931647934883,
+      "loss": 0.7421,
+      "step": 220
+    },
+    {
+      "epoch": 1.2443991853360488,
+      "grad_norm": 0.866692066192627,
+      "learning_rate": 0.0001278217463916453,
+      "loss": 0.7113,
+      "step": 230
+    },
+    {
+      "epoch": 1.2987101154107263,
+      "grad_norm": 0.8832337856292725,
+      "learning_rate": 0.00012225209339563145,
+      "loss": 0.7545,
+      "step": 240
+    },
+    {
+      "epoch": 1.353021045485404,
+      "grad_norm": 1.0796443223953247,
+      "learning_rate": 0.00011660879162692675,
+      "loss": 0.7085,
+      "step": 250
+    },
+    {
+      "epoch": 1.4073319755600815,
+      "grad_norm": 0.9231683015823364,
+      "learning_rate": 0.00011091051897986678,
+      "loss": 0.7168,
+      "step": 260
+    },
+    {
+      "epoch": 1.461642905634759,
+      "grad_norm": 0.8881363272666931,
+      "learning_rate": 0.00010517613528842097,
+      "loss": 0.7606,
+      "step": 270
+    },
+    {
+      "epoch": 1.5159538357094364,
+      "grad_norm": 0.8930597901344299,
+      "learning_rate": 9.942461990493625e-05,
+      "loss": 0.6926,
+      "step": 280
+    },
+    {
+      "epoch": 1.570264765784114,
+      "grad_norm": 1.0270030498504639,
+      "learning_rate": 9.367500888330545e-05,
+      "loss": 0.7571,
+      "step": 290
+    },
+    {
+      "epoch": 1.6245756958587916,
+      "grad_norm": 0.8959159255027771,
+      "learning_rate": 8.79463319744677e-05,
+      "loss": 0.7786,
+      "step": 300
+    },
+    {
+      "epoch": 1.6788866259334692,
+      "grad_norm": 0.8595919013023376,
+      "learning_rate": 8.225754964277018e-05,
+      "loss": 0.6935,
+      "step": 310
+    },
+    {
+      "epoch": 1.7331975560081467,
+      "grad_norm": 0.953175961971283,
+      "learning_rate": 7.662749031165092e-05,
+      "loss": 0.6901,
+      "step": 320
+    },
+    {
+      "epoch": 1.787508486082824,
+      "grad_norm": 0.985431969165802,
+      "learning_rate": 7.107478804634325e-05,
+      "loss": 0.7101,
+      "step": 330
+    },
+    {
+      "epoch": 1.8418194161575017,
+      "grad_norm": 1.0016827583312988,
+      "learning_rate": 6.561782087985681e-05,
+      "loss": 0.707,
+      "step": 340
+    },
+    {
+      "epoch": 1.8961303462321792,
+      "grad_norm": 0.9732582569122314,
+      "learning_rate": 6.02746499863599e-05,
+      "loss": 0.7426,
+      "step": 350
+    },
+    {
+      "epoch": 1.9504412763068566,
+      "grad_norm": 0.9253762364387512,
+      "learning_rate": 5.506295990328385e-05,
+      "loss": 0.7273,
+      "step": 360
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 2.792293071746826,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.7256,
+      "step": 370
+    },
+    {
+      "epoch": 2.0543109300746774,
+      "grad_norm": 0.9254827499389648,
+      "learning_rate": 4.510252738679136e-05,
+      "loss": 0.6432,
+      "step": 380
+    },
+    {
+      "epoch": 2.108621860149355,
+      "grad_norm": 1.0876941680908203,
+      "learning_rate": 4.038675145307747e-05,
+      "loss": 0.6256,
+      "step": 390
+    },
+    {
+      "epoch": 2.1629327902240325,
+      "grad_norm": 0.916249692440033,
+      "learning_rate": 3.5868280218455796e-05,
+      "loss": 0.6442,
+      "step": 400
+    },
+    {
+      "epoch": 2.2172437202987103,
+      "grad_norm": 0.9240853190422058,
+      "learning_rate": 3.1562068674124344e-05,
+      "loss": 0.5883,
+      "step": 410
+    },
+    {
+      "epoch": 2.2715546503733877,
+      "grad_norm": 1.2008038759231567,
+      "learning_rate": 2.7482369285662378e-05,
+      "loss": 0.6987,
+      "step": 420
+    },
+    {
+      "epoch": 2.325865580448065,
+      "grad_norm": 1.2723044157028198,
+      "learning_rate": 2.364268482099218e-05,
+      "loss": 0.708,
+      "step": 430
+    },
+    {
+      "epoch": 2.380176510522743,
+      "grad_norm": 0.9695908427238464,
+      "learning_rate": 2.0055723659649904e-05,
+      "loss": 0.6782,
+      "step": 440
+    },
+    {
+      "epoch": 2.43448744059742,
+      "grad_norm": 1.044391393661499,
+      "learning_rate": 1.6733357731279377e-05,
+      "loss": 0.5803,
+      "step": 450
+    },
+    {
+      "epoch": 2.4887983706720975,
+      "grad_norm": 0.9964624643325806,
+      "learning_rate": 1.368658322256311e-05,
+      "loss": 0.6112,
+      "step": 460
+    },
+    {
+      "epoch": 2.5431093007467753,
+      "grad_norm": 1.004639744758606,
+      "learning_rate": 1.0925484182639467e-05,
+      "loss": 0.6322,
+      "step": 470
+    },
+    {
+      "epoch": 2.5974202308214527,
+      "grad_norm": 1.1456069946289062,
+      "learning_rate": 8.45919914746337e-06,
+      "loss": 0.5633,
+      "step": 480
+    },
+    {
+      "epoch": 2.6517311608961305,
+      "grad_norm": 1.1862763166427612,
+      "learning_rate": 6.2958908935752955e-06,
+      "loss": 0.5859,
+      "step": 490
+    },
+    {
+      "epoch": 2.706042090970808,
+      "grad_norm": 1.1233826875686646,
+      "learning_rate": 4.442719421385922e-06,
+      "loss": 0.6147,
+      "step": 500
+    },
+    {
+      "epoch": 2.7603530210454856,
+      "grad_norm": 1.0159374475479126,
+      "learning_rate": 2.905818257394799e-06,
+      "loss": 0.5829,
+      "step": 510
+    },
+    {
+      "epoch": 2.814663951120163,
+      "grad_norm": 1.053791880607605,
+      "learning_rate": 1.6902741537767609e-06,
+      "loss": 0.5938,
+      "step": 520
+    },
+    {
+      "epoch": 2.8689748811948403,
+      "grad_norm": 1.0928566455841064,
+      "learning_rate": 8.00110252525299e-07,
+      "loss": 0.6136,
+      "step": 530
+    },
+    {
+      "epoch": 2.923285811269518,
+      "grad_norm": 1.1599104404449463,
+      "learning_rate": 2.382727698752474e-07,
+      "loss": 0.6389,
+      "step": 540
+    },
+    {
+      "epoch": 2.9775967413441955,
+      "grad_norm": 1.2020913362503052,
+      "learning_rate": 6.621245075910665e-09,
+      "loss": 0.6719,
+      "step": 550
     }
   ],
   "logging_steps": 10,
+  "max_steps": 552,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 4322859040948224.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0d4a4dcbea433b28b388dcc17c84657d36933b48455101d1610f7855b760e64f
 size 5624

 version https://git-lfs.github.com/spec/v1
+oid sha256:75132bb50a07de61eeb928311e97947df98a7dedcfa8b9670cb9777490844360
 size 5624