---
base_model: unsloth/Meta-Llama-3.1-8B-bnb-4bit
language:
- en
license: apache-2.0
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
- sft
---
# Uploaded model
- **Developed by:** vakodiya
- **License:** apache-2.0
- **Finetuned from model:** unsloth/Meta-Llama-3.1-8B-bnb-4bit
This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
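# Quick usage of the uploaded model
A minimal inference sketch using plain `transformers` (no Unsloth required), assuming the merged 16-bit repo pushed at the end of this card; adjust the dtype and device to your hardware:
```
# Minimal inference sketch with plain transformers (assumes the merged 16-bit repo below).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

# Same Instruction/Response prompt template used for training (see below).
prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "### Instruction:\nWhat is a plaint in Indian law?\n### Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```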
# Code to train the model on Google Colab
# Installing required packages
```
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
```
# Importing required modules
```
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
```
# Logging in to Hugging Face with a write access token stored in Colab secrets
```
from huggingface_hub import login
from google.colab import userdata
hf_token = userdata.get('HF_API_KEY')
login(token = hf_token)
```
# Check if a GPU is available
```
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used.")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")
```
# Loading model from Hugging Face
```
max_seq_length = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)
```
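As an optional sanity check (not part of the original notebook), the PEFT-wrapped model returned by `get_peft_model` should report only the LoRA adapter weights as trainable:
```
# Optional sanity check: only the LoRA adapter parameters should be trainable.
model.print_trainable_parameters()
```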
# Loading and formatting the dataset
```
raw_dataset = load_dataset("viber1/indian-law-dataset", split="train[:1000]")
# Define a simple prompt template using only Instruction and Response
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{}
### Response:
{}"""
# EOS token for marking the end of each example
EOS_TOKEN = tokenizer.eos_token
# Function to format prompts with only Instruction and Response
def formatting_prompts_func(examples):
    instructions = examples["Instruction"]
    responses = examples["Response"]
    # Create a formatted text for each example
    texts = []
    for instruction, response in zip(instructions, responses):
        # Format the text with the prompt template and add the EOS token
        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}
# Apply the formatting function to the dataset
dataset = raw_dataset.map(formatting_prompts_func, batched=True)
```
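It can help to print one formatted example to confirm the prompt template and EOS token look as expected (a quick optional check):
```
# Optional: inspect the first formatted training example.
print(dataset[0]["text"])
```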
# Training with SFTTrainer using a small batch size, gradient checkpointing, LoRA and 4-bit quantization
```
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        num_train_epochs=1,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        output_dir="output",
        seed=0,
    ),
)
```
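With `per_device_train_batch_size=1` and `gradient_accumulation_steps=1` the effective batch size is 1. If the GPU has headroom, gradient accumulation is the usual way to raise it without increasing per-step memory; a small illustration of the arithmetic (an optional tweak, not used in the run above):
```
# Optional tweak (not used above): effective batch size = per-device batch * accumulation steps.
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
print(f"Effective batch size: {effective_batch_size}")  # 4
```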
# Show current memory stats
```
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
```
# Start Training
```
trainer_stats = trainer.train()
```
# Show final memory and time stats
```
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
```
# Saving the trained model and pushing it to the Hugging Face Hub
```
# Merge to 16bit
model.save_pretrained_merged("Indian-Law-Llama-3.1-8B", tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B", tokenizer, save_method="merged_16bit", token = hf_token)
```
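Unsloth can also export GGUF files for llama.cpp-style runtimes. A hedged sketch based on Unsloth's saving helpers; `q4_k_m` is just one common quantization choice, and the GGUF repo name here is only a placeholder:
```
# Optional (sketch): export a GGUF quantization for llama.cpp / Ollama style runtimes.
# Method names follow Unsloth's saving helpers; "q4_k_m" is one common quantization choice.
model.save_pretrained_gguf("Indian-Law-Llama-3.1-8B-gguf", tokenizer, quantization_method = "q4_k_m")
# model.push_to_hub_gguf("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B-GGUF", tokenizer,
#                        quantization_method = "q4_k_m", token = hf_token)
```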
# Model usage with streaming response
```
# alpaca_prompt is the same template defined in the training section above
FastLanguageModel.for_inference(model)  # Enable Unsloth's native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "What is the difference between a petition and a plaint in Indian law?",
            "",  # leave the Response field empty so the model generates it
        )
    ],
    return_tensors="pt",
).to("cuda")
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)
```
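If streaming is not needed, the generated ids can instead be decoded in one step:
```
# Non-streaming alternative: generate and decode the full response at once.
output_ids = model.generate(**inputs, max_new_tokens = 128)
print(tokenizer.batch_decode(output_ids, skip_special_tokens = True)[0])
```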