T145 committed
Commit 6c4fdd6 · verified · 1 Parent(s): 653401c

Update README.md

Files changed (1)
  1. README.md +9 -10
README.md CHANGED
@@ -129,7 +129,7 @@ NUM_PROMPT_SAMPLES = 32
 
 # Used to skip the first and last layers for the modifications.
 SKIP_BEGIN_LAYERS = 1
-SKIP_END_LAYERS = 2
+SKIP_END_LAYERS = 1
 
 # The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
 LAYER_FRACTION_TO_USE = 0.6
@@ -153,8 +153,8 @@ model = AutoModelForCausalLM.from_pretrained(
 model.requires_grad_(False)
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-
 layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)
+
 print("Layer index for refusal direction: " + str(layer_idx))
 
 with open("harmful.txt", "r", encoding="utf-8") as f:
@@ -235,14 +235,13 @@ assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SK
 
 bar_layers = tqdm(total= (num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc = "Modifying tensors")
 
-# Cast any ops performed on CPU up to float32... If you have newer CPU might be able to use bfloat16 for this.
 # NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
 def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
     assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
-    tensor_float32 = tensor_data.to(torch.float32)
-    refusal_dir_float32 = refusal_dir.to(torch.float32)
-    tensor_float32 -= scale_factor * torch.matmul(torch.outer(refusal_dir_float32, refusal_dir_float32), tensor_float32)
-    tensor_modified = tensor_float32.to(torch.bfloat16)
+    tensor_float = tensor_data.to(torch.bfloat16)
+    refusal_dir_float = refusal_dir.to(torch.bfloat16)
+    tensor_float -= scale_factor * torch.matmul(torch.outer(refusal_dir_float, refusal_dir_float), tensor_float)
+    tensor_modified = tensor_float.to(torch.bfloat16)
     bar_layers.update(1)
     return torch.nn.Parameter(tensor_modified)
 
@@ -250,9 +249,9 @@ def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
 # NOTE: These tensors names are speific to "llama" and may need changing.
 # - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
 for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
-    # lm_model.layers[layer_idx].self_attn.o_proj.weight = modify_tensor(
-    #     lm_model.layers[layer_idx].self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
-    # )
+    lm_model.layers[layer_idx].self_attn.o_proj.weight = modify_tensor(
+        lm_model.layers[layer_idx].self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
+    )
     lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
         lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
     )
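The net effect of the first two hunks is that one more layer at the end of the stack becomes eligible for modification (SKIP_END_LAYERS drops from 2 to 1), while the layer used to harvest the refusal direction is unchanged. A minimal sketch of the layer arithmetic, assuming a hypothetical 32-layer model (the real count comes from `len(model.model.layers)` in the script):

```python
# Sketch of the layer selection implied by the updated settings.
# The 32-layer count is an assumption for illustration only.
num_layers = 32                      # hypothetical; len(model.model.layers) in the script
SKIP_BEGIN_LAYERS = 1
SKIP_END_LAYERS = 1                  # was 2 before this commit
LAYER_FRACTION_TO_USE = 0.6

# Layer whose hidden states feed the refusal_dir calculation.
layer_idx = int(num_layers * LAYER_FRACTION_TO_USE)
print(layer_idx)                     # 19, i.e. floor(32 * 0.6)

# Layers whose weights are modified: the first and last layers are skipped.
modified = list(range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS))
print(modified[0], modified[-1])     # 1 30  (previously 1 29)
```

The tqdm total of `(num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2` matches this: two tensors per eligible layer are modified now that o_proj is included alongside down_proj.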
 
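The modify_tensor change drops the float32 upcast and keeps the projection removal entirely in bfloat16, and the last hunk enables it for the attention output projection (self_attn.o_proj) as well as mlp.down_proj. The operation itself is a rank-1 ablation, W ← W − s·(r rᵀ)W, which for s = 1 and a unit-norm refusal direction r removes the component of the projection's output along r. Below is a self-contained sketch with random tensors standing in for real model weights; the dimensions are toy values, and refusal_dir is assumed to be unit-normalized as in the script.

```python
import torch

# Self-contained sketch of the rank-1 ablation performed by modify_tensor,
# using random tensors instead of model weights. Everything stays in
# bfloat16, as in the updated code.
torch.manual_seed(0)
d_out, d_in = 64, 256                                  # toy dimensions, not real model sizes
W = torch.randn(d_out, d_in, dtype=torch.bfloat16)     # stands in for e.g. down_proj.weight
r = torch.randn(d_out, dtype=torch.bfloat16)
r = r / r.norm()                                       # unit refusal direction

scale_factor = 1.0                                     # 1.0 = full ablation, <1 partial, negative induces
W_mod = W - scale_factor * torch.matmul(torch.outer(r, r), W)

# After full ablation, the modified projection's output should have
# (nearly) no component along r; bfloat16 rounding leaves a small residual.
x = torch.randn(d_in, dtype=torch.bfloat16)
print(torch.dot(r.float(), (W @ x).float()).item())      # typically well away from 0
print(torch.dot(r.float(), (W_mod @ x).float()).item())  # near 0, up to bfloat16 error
```

Because the same refusal_dir is applied to both o_proj and down_proj in every non-skipped layer, each of those writes into the residual stream has its component along the refusal direction scaled by (1 − scale_factor).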