Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: transformers
|
3 |
+
tags:
|
4 |
+
- mergekit
|
5 |
+
- merge
|
6 |
+
- llama-3.1
|
7 |
+
- roleplay
|
8 |
+
- function calling
|
9 |
+
base_model:
|
10 |
+
- arcee-ai/Llama-3.1-SuperNova-Lite
|
11 |
+
- akjindal53244/Llama-3.1-Storm-8B
|
12 |
+
- Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2
|
13 |
+
- unsloth/Meta-Llama-3.1-8B-Instruct
|
14 |
+
---
|
15 |
+
|
16 |
+
# ZEUS 8B 🌩️ V2 - ABLITERATED
|
17 |
+
|
18 |
+
ZEUS 8B V2, abliterated using the following script:
|
19 |
+
|
20 |
+
```python
|
21 |
+
import gc
|
22 |
+
import random
|
23 |
+
|
24 |
+
import torch
|
25 |
+
from tqdm import tqdm
|
26 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
27 |
+
|
28 |
+
MODEL_ID = "T145/ZEUS-8B-V2"
|
29 |
+
|
30 |
+
# More samples can help find the direction better.
|
31 |
+
NUM_PROMPT_SAMPLES = 32
|
32 |
+
|
33 |
+
# Used to skip the first and last layers for the modifications.
|
34 |
+
SKIP_BEGIN_LAYERS = 1
|
35 |
+
SKIP_END_LAYERS = 2
|
36 |
+
|
37 |
+
# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * len(model.model.layers)).
|
38 |
+
LAYER_FRACTION_TO_USE = 0.6
|
39 |
+
|
40 |
+
# Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
|
41 |
+
SCALE_FACTOR = 1.0
|
42 |
+
|
43 |
+
torch.inference_mode()
|
44 |
+
torch.set_default_device("cpu")
|
45 |
+
torch.set_grad_enabled(False)
|
46 |
+
|
47 |
+
# Load the model in 4-bit quantized form, placed automatically (on the GPU if we can).
|
48 |
+
model = AutoModelForCausalLM.from_pretrained(
|
49 |
+
MODEL_ID,
|
50 |
+
trust_remote_code=True,
|
51 |
+
torch_dtype=torch.float16,
|
52 |
+
quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
|
53 |
+
low_cpu_mem_usage=True,
|
54 |
+
device_map='auto'
|
55 |
+
)
|
56 |
+
model.requires_grad_(False)
|
57 |
+
|
58 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
|
59 |
+
|
60 |
+
layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)
|
61 |
+
print("Layer index for refusal direction: " + str(layer_idx))
|
62 |
+
|
63 |
+
with open("harmful.txt", "r", encoding="utf-8") as f:
|
64 |
+
harmful = f.readlines()
|
65 |
+
|
66 |
+
with open("harmless.txt", "r", encoding="utf-8") as f:
|
67 |
+
harmless = f.readlines()
|
68 |
+
|
69 |
+
harmful_instructions = random.sample(harmful, min(NUM_PROMPT_SAMPLES, len(harmful)))
|
70 |
+
harmless_instructions = random.sample(harmless, min(NUM_PROMPT_SAMPLES, len(harmless)))
|
71 |
+
|
72 |
+
harmful_toks = [
|
73 |
+
tokenizer.apply_chat_template(conversation=[{"role": "user", "content": insn}], add_generation_prompt=True, tokenize=False,
|
74 |
+
return_tensors="pt") for insn in harmful_instructions]
|
75 |
+
harmless_toks = [
|
76 |
+
tokenizer.apply_chat_template(conversation=[{"role": "user", "content": insn}], add_generation_prompt=True, tokenize=False,
|
77 |
+
return_tensors="pt") for insn in harmless_instructions]
|
78 |
+
|
79 |
+
bar_generate = tqdm(total = len(harmful_instructions) + len(harmless_instructions), desc = "Generating samples")
|
80 |
+
|
81 |
+
# Only return the final hidden state of the layer we care about, and use 'cpu' to save VRAM.
|
82 |
+
def generate(toks):
    """Return the last-position hidden state at ``layer_idx`` for one prompt.

    Args:
        toks: Prompt text that has already been rendered through the chat
            template (the callers build it with ``tokenize=False``), so it is
            tokenized here.

    Returns:
        The hidden state at ``layer_idx`` for the final input position, moved
        to the CPU so the collected samples do not occupy VRAM.

    Side effects:
        Advances the ``bar_generate`` progress bar by one step.
    """
    # The chat template has already written the special tokens the model
    # expects into the text; letting the tokenizer add its own again would
    # typically duplicate the BOS token at the start of every prompt.
    inputs = tokenizer(
        toks,
        return_tensors="pt",
        padding=True,
        add_special_tokens=False,
    )
    inputs = inputs.to(model.device)

    # A single new token is enough: only the hidden states produced while
    # processing the prompt are needed, not the generated text.
    output = model.generate(
        inputs["input_ids"],
        use_cache=False,
        max_new_tokens=1,
        return_dict_in_generate=True,
        output_hidden_states=True,
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
    )
    bar_generate.update(n=1)

    # hidden_states[0] is the entry for the first generation step; index it by
    # layer, then take the final sequence position (-1).
    return output.hidden_states[0][layer_idx][:, -1, :].to("cpu")
|
96 |
+
|
97 |
+
harmful_hidden = [generate(toks) for toks in harmful_toks]
|
98 |
+
harmless_hidden = [generate(toks) for toks in harmless_toks]
|
99 |
+
|
100 |
+
bar_generate.close()
|
101 |
+
|
102 |
+
harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
|
103 |
+
harmless_mean = torch.stack(harmless_hidden).mean(dim=0)
|
104 |
+
|
105 |
+
refusal_dir = harmful_mean - harmless_mean
|
106 |
+
refusal_dir = refusal_dir.squeeze() / refusal_dir.norm()
|
107 |
+
|
108 |
+
torch.save(refusal_dir, MODEL_ID.replace("/", "_") + "_refusal_dir.pt")
|
109 |
+
|
110 |
+
# Free memory
|
111 |
+
del model
|
112 |
+
gc.collect()
|
113 |
+
torch.cuda.empty_cache()
|
114 |
+
|
115 |
+
# Reload the model in CPU memory with bfloat16 data type
|
116 |
+
model = AutoModelForCausalLM.from_pretrained(
|
117 |
+
MODEL_ID,
|
118 |
+
trust_remote_code=True,
|
119 |
+
torch_dtype=torch.bfloat16,
|
120 |
+
low_cpu_mem_usage=True,
|
121 |
+
device_map='cpu'
|
122 |
+
)
|
123 |
+
model.requires_grad_(False)
|
124 |
+
|
125 |
+
# Make sure it's on the 'cpu' device.
|
126 |
+
if refusal_dir.device != model.device:
|
127 |
+
refusal_dir = refusal_dir.to(model.device)
|
128 |
+
|
129 |
+
# Get the language model component and check it's as expected.
|
130 |
+
lm_model = model.model
|
131 |
+
assert hasattr(lm_model, 'layers'), "The model does not have the expected structure."
|
132 |
+
|
133 |
+
# Check the ranges are valid.
|
134 |
+
num_layers = len(lm_model.layers)
|
135 |
+
assert SKIP_BEGIN_LAYERS >= 0, "SKIP_BEGIN_LAYERS must be >= 0."
|
136 |
+
assert SKIP_END_LAYERS >= 0, "SKIP_END_LAYERS must be >= 0."
|
137 |
+
assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SKIP_END_LAYERS must be < num_layers."
|
138 |
+
|
139 |
+
bar_layers = tqdm(total= (num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc = "Modifying tensors")
|
140 |
+
|
141 |
+
# Cast any ops performed on CPU up to float32... If you have a newer CPU you might be able to use bfloat16 for this.
|
142 |
+
# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
|
143 |
+
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
    """Remove the ``refusal_dir`` component from a weight tensor.

    Computes ``W - scale_factor * (outer(r, r) @ W)`` in float32, where ``W``
    is ``tensor_data`` and ``r`` is ``refusal_dir``.

    Args:
        tensor_data: Weight tensor to modify. It is never written to.
        refusal_dir: 1-D direction vector (``torch.outer`` requires 1-D
            input) whose length matches the first dimension of
            ``tensor_data``.
        scale_factor: Strength of the ablation. Use a negative value to
            "induce" and a positive value below 1 to "ablate" less.

    Returns:
        A new ``torch.nn.Parameter`` holding the modified tensor in bfloat16.

    Raises:
        AssertionError: If ``scale_factor`` is greater than 1.

    Side effects:
        Advances the ``bar_layers`` progress bar by one step.
    """
    assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."

    # Do the arithmetic in float32 rather than in the storage dtype.
    tensor_float32 = tensor_data.to(torch.float32)
    refusal_dir_float32 = refusal_dir.to(torch.float32)
    projector = torch.outer(refusal_dir_float32, refusal_dir_float32)

    # Subtract out of place: ``.to()`` hands back the very same tensor when the
    # dtype already matches, so an in-place ``-=`` would silently overwrite a
    # float32 ``tensor_data`` owned by the caller.
    tensor_modified = tensor_float32 - scale_factor * torch.matmul(projector, tensor_float32)

    bar_layers.update(1)
    return torch.nn.Parameter(tensor_modified.to(torch.bfloat16))
|
151 |
+
|
152 |
+
# Modify 'mlp.down_proj.weight' in each chosen layer (the 'self_attn.o_proj.weight' update is commented out below).
|
153 |
+
# NOTE: These tensor names are specific to "llama" and may need changing.
|
154 |
+
# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
|
155 |
+
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
|
156 |
+
# lm_model.layers[layer_idx].self_attn.o_proj.weight = modify_tensor(
|
157 |
+
# lm_model.layers[layer_idx].self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
|
158 |
+
# )
|
159 |
+
lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
|
160 |
+
lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
|
161 |
+
)
|
162 |
+
|
163 |
+
bar_layers.close()
|
164 |
+
|
165 |
+
print("Saving modified model (with original tokenizer)...")
|
166 |
+
|
167 |
+
FIXED_ID = f"{MODEL_ID}-abliterated"
|
168 |
+
model.save_pretrained(FIXED_ID)
|
169 |
+
tokenizer.save_pretrained(FIXED_ID)
|
170 |
+
```
|
171 |
+
|
172 |
+
## Merge Details
|
173 |
+
### Merge Method
|
174 |
+
|
175 |
+
This model was merged using the [DARE](https://arxiv.org/abs/2311.03099) [TIES](https://arxiv.org/abs/2306.01708) merge method, with [unsloth/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct) as the base.
|
176 |
+
|
177 |
+
### Models Merged
|
178 |
+
|
179 |
+
The following models were included in the merge:
|
180 |
+
* [arcee-ai/Llama-3.1-SuperNova-Lite](https://huggingface.co/arcee-ai/Llama-3.1-SuperNova-Lite)
|
181 |
+
* [akjindal53244/Llama-3.1-Storm-8B](https://huggingface.co/akjindal53244/Llama-3.1-Storm-8B)
|
182 |
+
* [Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2](https://huggingface.co/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2)
|
183 |
+
|
184 |
+
### Configuration
|
185 |
+
|
186 |
+
The following YAML configuration was used to produce this model:
|
187 |
+
|
188 |
+
```yaml
|
189 |
+
base_model: unsloth/Meta-Llama-3.1-8B-Instruct
|
190 |
+
dtype: bfloat16
|
191 |
+
merge_method: dare_ties
|
192 |
+
parameters:
|
193 |
+
int8_mask: 1.0
|
194 |
+
slices:
|
195 |
+
- sources:
|
196 |
+
- layer_range: [0, 32]
|
197 |
+
model: akjindal53244/Llama-3.1-Storm-8B
|
198 |
+
parameters:
|
199 |
+
density: 0.8
|
200 |
+
weight: 0.25
|
201 |
+
- layer_range: [0, 32]
|
202 |
+
model: arcee-ai/Llama-3.1-SuperNova-Lite
|
203 |
+
parameters:
|
204 |
+
density: 0.8
|
205 |
+
weight: 0.33
|
206 |
+
- layer_range: [0, 32]
|
207 |
+
model: Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2
|
208 |
+
parameters:
|
209 |
+
density: 0.8
|
210 |
+
weight: 0.42
|
211 |
+
- layer_range: [0, 32]
|
212 |
+
model: unsloth/Meta-Llama-3.1-8B-Instruct
|
213 |
+
tokenizer_source: base
|
214 |
+
```
|