from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import torch

# Load the 1k-sample Guanaco dataset, already formatted for the Llama-2 chat template
data_name = "mlabonne/guanaco-llama2-1k"
training_data = load_dataset(data_name, split="train")
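
# Quick sanity check (a sketch, illustration only): each row carries a single
# "text" field holding the full [INST]-formatted prompt/response pair.
print(training_data[0]["text"][:200])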

# Base model: the chat-tuned Llama-2 7B checkpoint mirrored by NousResearch
base_model_name = "NousResearch/Llama-2-7b-chat-hf"

# Llama-2 ships without a pad token, so reuse EOS for padding; right-side
# padding keeps the training targets aligned for causal language modeling.
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"
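
# A minimal sketch of what this padding setup produces (illustration only):
# the shorter sequence is padded on the right with the EOS id.
demo_batch = llama_tokenizer(["Hello", "Hello world"], padding=True)
print(demo_batch["input_ids"])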

# 4-bit NF4 quantization config for QLoRA-style fine-tuning
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load base weights in 4-bit
    bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.float16,  # dequantize to fp16 for compute
    bnb_4bit_use_double_quant=False,       # single quantization pass
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.config.use_cache = False   # the KV cache is only useful at inference time
base_model.config.pretraining_tp = 1  # use standard linear layers rather than emulating pretraining tensor parallelism

'''
Double quantization quantizes the quantization constants themselves in a
second pass, saving roughly 0.4 bits per parameter of memory at a small
computational cost. It is disabled above, but can be switched on via
bnb_4bit_use_double_quant=True.
'''
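
# A minimal sketch of the memory-saving variant (the name quant_config_dq is
# illustrative and not used elsewhere in this script):
quant_config_dq = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,  # quantize the quantization constants too
)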

'''
LoRA-Specific Parameters

Dropout rate (lora_dropout): the probability that elements of the LoRA
activations are zeroed out during training, used to prevent overfitting.

Rank (r): the rank of the low-rank matrices that the weight updates are
factored into. Lower ranks mean fewer trainable parameters and lower memory
use, but may sacrifice quality. The original LoRA paper suggests starting
with a rank of 8; the QLoRA paper used a rank of 64 in its experiments.

lora_alpha: a scaling factor for the low-rank update (the update is scaled
by lora_alpha / r). Higher values give the adapter more influence relative
to the frozen base weights, affecting both quality and training dynamics.
'''
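
# Back-of-the-envelope sketch of what r=8 buys for a single 4096x4096
# Llama-2 projection matrix (the sizes here are illustrative assumptions):
hidden_size = 4096
lora_rank = 8
frozen_params = hidden_size * hidden_size     # ~16.8M weights stay frozen
adapter_params = 2 * hidden_size * lora_rank  # A (4096x8) plus B (8x4096)
print(f"trainable fraction per adapted matrix: {adapter_params / frozen_params:.3%}")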

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
)

train_params = TrainingArguments(
    output_dir="./",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",    # paged optimizer avoids memory spikes
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,                 # -1: train for the full num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,         # batch similar-length samples to cut padding
    lr_scheduler_type="constant",
)
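
# Rough step-count arithmetic (a sketch assuming a single GPU and the
# 1,000-sample dataset loaded above): 1000 / (4 * 1) = 250 optimizer steps
# per epoch, so save_steps=25 checkpoints ten times per epoch.
effective_batch = train_params.per_device_train_batch_size * train_params.gradient_accumulation_steps
print(f"effective batch size: {effective_batch}, steps per epoch: {1000 // effective_batch}")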

fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_config,    # SFTTrainer wraps the model with the LoRA adapter
    dataset_text_field="text",  # dataset column holding the formatted prompts
    tokenizer=llama_tokenizer,
    args=train_params,
)

fine_tuning.train()

# Save the trained LoRA adapter (not the full base model) locally
fine_tuning.save_model("llama_7b_james")

model_name = "llama7b__finetune_sample"
HUGGING_FACE_USER_NAME = "james92"

# Trainer.push_to_hub takes a commit message, not a repo id, so push the
# trained adapter and tokenizer directly (requires `huggingface-cli login`)
fine_tuning.model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}")
llama_tokenizer.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}")

print("Model is saved to the Hugging Face Hub")
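
# A minimal inference sketch (assumptions: a fresh session with enough memory,
# and a transformers version whose pipelines accept PEFT models; the prompt
# follows the Llama-2 [INST] format used by the training data):
reloaded_base = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto",
)
tuned_model = PeftModel.from_pretrained(reloaded_base, "llama_7b_james")
tuned_model.config.use_cache = True  # re-enable the KV cache for generation

generator = pipeline("text-generation", model=tuned_model, tokenizer=llama_tokenizer)
prompt = "<s>[INST] What is low-rank adaptation? [/INST]"
print(generator(prompt, max_new_tokens=128)[0]["generated_text"])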