Update README.md
Browse files
README.md
CHANGED
@@ -129,7 +129,7 @@ NUM_PROMPT_SAMPLES = 32
|
|
129 |
|
130 |
# Used to skip the first and last layers for the modifications.
|
131 |
SKIP_BEGIN_LAYERS = 1
|
132 |
-
SKIP_END_LAYERS =
|
133 |
|
134 |
# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
|
135 |
LAYER_FRACTION_TO_USE = 0.6
|
@@ -153,8 +153,8 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
153 |
model.requires_grad_(False)
|
154 |
|
155 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
|
156 |
-
|
157 |
layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)
|
|
|
158 |
print("Layer index for refusal direction: " + str(layer_idx))
|
159 |
|
160 |
with open("harmful.txt", "r", encoding="utf-8") as f:
|
@@ -235,14 +235,13 @@ assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SK
|
|
235 |
|
236 |
bar_layers = tqdm(total= (num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc = "Modifying tensors")
|
237 |
|
238 |
-
# Cast any ops performed on CPU up to float32... If you have newer CPU might be able to use bfloat16 for this.
|
239 |
# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
|
240 |
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
|
241 |
assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
tensor_modified =
|
246 |
bar_layers.update(1)
|
247 |
return torch.nn.Parameter(tensor_modified)
|
248 |
|
@@ -250,9 +249,9 @@ def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
|
|
250 |
# NOTE: These tensor names are specific to "llama" and may need changing.
|
251 |
# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
|
252 |
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
|
257 |
lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
|
258 |
)
|
|
|
129 |
|
130 |
# Used to skip the first and last layers for the modifications.
|
131 |
SKIP_BEGIN_LAYERS = 1
|
132 |
+
SKIP_END_LAYERS = 1
|
133 |
|
134 |
# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
|
135 |
LAYER_FRACTION_TO_USE = 0.6
|
|
|
153 |
model.requires_grad_(False)
|
154 |
|
155 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
|
|
|
156 |
layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)
|
157 |
+
|
158 |
print("Layer index for refusal direction: " + str(layer_idx))
|
159 |
|
160 |
with open("harmful.txt", "r", encoding="utf-8") as f:
|
|
|
235 |
|
236 |
bar_layers = tqdm(total= (num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc = "Modifying tensors")
|
237 |
|
|
|
238 |
# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
|
239 |
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
|
240 |
assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
|
241 |
+
tensor_float = tensor_data.to(torch.bfloat16)
|
242 |
+
refusal_dir_float = refusal_dir.to(torch.bfloat16)
|
243 |
+
tensor_float -= scale_factor * torch.matmul(torch.outer(refusal_dir_float, refusal_dir_float), tensor_float)
|
244 |
+
tensor_modified = tensor_float.to(torch.bfloat16)
|
245 |
bar_layers.update(1)
|
246 |
return torch.nn.Parameter(tensor_modified)
|
247 |
|
|
|
249 |
# NOTE: These tensor names are specific to "llama" and may need changing.
|
250 |
# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
|
251 |
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
|
252 |
+
lm_model.layers[layer_idx].self_attn.o_proj.weight = modify_tensor(
|
253 |
+
lm_model.layers[layer_idx].self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
|
254 |
+
)
|
255 |
lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
|
256 |
lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
|
257 |
)
|