Update README.md
Browse files
README.md
CHANGED
@@ -129,7 +129,7 @@ NUM_PROMPT_SAMPLES = 32
|
|
129 |
|
130 |
# Used to skip the first and last layers for the modifications.
|
131 |
SKIP_BEGIN_LAYERS = 1
|
132 |
-
SKIP_END_LAYERS =
|
133 |
|
134 |
# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
|
135 |
LAYER_FRACTION_TO_USE = 0.6
|
@@ -153,8 +153,8 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
153 |
model.requires_grad_(False)
|
154 |
|
155 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
|
156 |
-
|
157 |
layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)
|
|
|
158 |
print("Layer index for refusal direction: " + str(layer_idx))
|
159 |
|
160 |
with open("harmful.txt", "r", encoding="utf-8") as f:
|
@@ -235,14 +235,13 @@ assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SK
|
|
235 |
|
236 |
bar_layers = tqdm(total= (num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc = "Modifying tensors")
|
237 |
|
238 |
-
# Cast any ops performed on CPU up to float32... If you have newer CPU might be able to use bfloat16 for this.
|
239 |
# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
|
240 |
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
|
241 |
assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
tensor_modified =
|
246 |
bar_layers.update(1)
|
247 |
return torch.nn.Parameter(tensor_modified)
|
248 |
|
@@ -250,9 +249,9 @@ def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
|
|
250 |
# NOTE: These tensor names are specific to "llama" and may need changing.
|
251 |
# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
|
252 |
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
|
257 |
lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
|
258 |
)
|
|
|
129 |
|
130 |
# Used to skip the first and last layers for the modifications.
|
131 |
SKIP_BEGIN_LAYERS = 1
|
132 |
+
SKIP_END_LAYERS = 1
|
133 |
|
134 |
# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
|
135 |
LAYER_FRACTION_TO_USE = 0.6
|
|
|
153 |
model.requires_grad_(False)
|
154 |
|
155 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
|
|
|
156 |
layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)
|
157 |
+
|
158 |
print("Layer index for refusal direction: " + str(layer_idx))
|
159 |
|
160 |
with open("harmful.txt", "r", encoding="utf-8") as f:
|
|
|
235 |
|
236 |
bar_layers = tqdm(total= (num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc = "Modifying tensors")
|
237 |
|
|
|
238 |
# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
|
239 |
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
|
240 |
assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
|
241 |
+
tensor_float = tensor_data.to(torch.bfloat16)
|
242 |
+
refusal_dir_float = refusal_dir.to(torch.bfloat16)
|
243 |
+
tensor_float -= scale_factor * torch.matmul(torch.outer(refusal_dir_float, refusal_dir_float), tensor_float)
|
244 |
+
tensor_modified = tensor_float.to(torch.bfloat16)
|
245 |
bar_layers.update(1)
|
246 |
return torch.nn.Parameter(tensor_modified)
|
247 |
|
|
|
249 |
# NOTE: These tensor names are specific to "llama" and may need changing.
|
250 |
# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
|
251 |
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
|
252 |
+
lm_model.layers[layer_idx].self_attn.o_proj.weight = modify_tensor(
|
253 |
+
lm_model.layers[layer_idx].self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
|
254 |
+
)
|
255 |
lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
|
256 |
lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
|
257 |
)
|