# One-shot FP8 (W8A8) quantization of DeepSeek-R1-Distill-Qwen-14B with
# llmcompressor, calibrated on UltraChat-200k chat data.
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
import gc

# Free any cached GPU memory and allow TF32 matmuls for faster calibration on Ampere+ GPUs.
torch.cuda.empty_cache()
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Recipe: static, symmetric, per-tensor FP8 quantization of weights and input
# activations for every Linear layer, leaving lm_head unquantized.
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    targets: ["Linear"]
"""

model_stub = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
model_name = model_stub.split("/")[-1]

# Compute a single-GPU device map, offloading layers that do not fit within the
# 18 GiB GPU / 96 GiB CPU memory budget.
device_map = calculate_offload_device_map(
    model_stub,
    reserve_for_hessians=True,
    num_gpus=1,
    torch_dtype=torch.bfloat16,
    max_memory={0: "18GiB", "cpu": "96GiB"}
)

# Load the model in bfloat16, following the computed device map and spilling
# anything that does not fit to the offload folder on disk.
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    offload_folder="offload_folder",
    offload_state_dict=True
)

torch.cuda.empty_cache()
gc.collect()

tokenizer = AutoTokenizer.from_pretrained(model_stub)
if tokenizer.pad_token is None:
    # padding="max_length" below requires a pad token; fall back to EOS if unset
    tokenizer.pad_token = tokenizer.eos_token

output_dir = f"./{model_name}-FP8"

# Calibration settings: number of samples and maximum sequence length used to
# collect activation statistics for the static FP8 scales.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048

# Calibration data: chat conversations from UltraChat-200k.
raw_dataset = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split="train_sft"
)
raw_dataset = raw_dataset.select(range(min(NUM_CALIBRATION_SAMPLES, len(raw_dataset))))

def preprocess_function(examples):
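    """Apply the model's chat template to each conversation, then tokenize to a fixed length."""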
    texts = [tokenizer.apply_chat_template(messages, tokenize=False) 
             for messages in examples["messages"]]

    tokenized = tokenizer(
        texts,
        max_length=MAX_SEQUENCE_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Labels are not needed for one-shot calibration; kept only for completeness.
    tokenized["labels"] = tokenized["input_ids"].clone()

    return tokenized

processed_dataset = raw_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_dataset.column_names,
    desc="Processing dataset",
)

# Run one-shot post-training quantization: calibrate FP8 scales on the
# processed dataset and save the compressed checkpoint to output_dir.
# No fine-tuning / gradient updates are performed.
oneshot(
    model=model,
    output_dir=output_dir,
    dataset=processed_dataset,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=False,
    bf16=True,
    save_compressed=True,
    learning_rate=1e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",
    remove_unused_columns=False,
    push_to_hub=False,
    preprocessing_num_workers=4,
    dataloader_num_workers=2
)
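
# The compressed checkpoint written to output_dir can then be loaded for
# inference. For example, with a vLLM build that supports compressed-tensors
# FP8 checkpoints (usage sketch, not part of this script):
#
#   from vllm import LLM
#   llm = LLM(model=output_dir)
#   print(llm.generate("Hello!")[0].outputs[0].text)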