import gc

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

torch.cuda.empty_cache()
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# FP8 (W8A8 float) recipe: quantize every Linear layer, keep lm_head in full precision.
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    targets: ["Linear"]
"""

model_stub = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
model_name = model_stub.split("/")[-1]

# Split the model between GPU 0 and CPU, reserving headroom for calibration state.
device_map = calculate_offload_device_map(
    model_stub,
    reserve_for_hessians=True,
    num_gpus=1,
    torch_dtype=torch.bfloat16,
    max_memory={0: "18GiB", "cpu": "96GiB"},
)

model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    offload_folder="offload_folder",
    offload_state_dict=True,
)

torch.cuda.empty_cache()
gc.collect()

tokenizer = AutoTokenizer.from_pretrained(model_stub)

output_dir = f"./{model_name}-FP8"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048

# Calibration data: a small slice of UltraChat, rendered with the model's chat template.
raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
raw_dataset = raw_dataset.select(range(min(NUM_CALIBRATION_SAMPLES, len(raw_dataset))))


def preprocess_function(examples):
    texts = [
        tokenizer.apply_chat_template(messages, tokenize=False)
        for messages in examples["messages"]
    ]
    tokenized = tokenizer(
        texts,
        max_length=MAX_SEQUENCE_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized


processed_dataset = raw_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_dataset.column_names,
    desc="Processing dataset",
)

# Run one-shot calibration with the FP8 recipe and save the compressed checkpoint.
oneshot(
    model=model,
    output_dir=output_dir,
    dataset=processed_dataset,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=False,
    bf16=True,
    save_compressed=True,
    learning_rate=1e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",
    remove_unused_columns=False,
    push_to_hub=False,
    preprocessing_num_workers=4,
    dataloader_num_workers=2,
)