|
--- |
|
base_model: unsloth/Meta-Llama-3.1-8B-bnb-4bit |
|
language: |
|
- en |
|
license: apache-2.0 |
|
tags: |
|
- text-generation-inference |
|
- transformers |
|
- unsloth |
|
- llama |
|
- trl |
|
- sft |
|
--- |
|
|
|
# Uploaded model |
|
|
|
- **Developed by:** vakodiya |
|
- **License:** apache-2.0 |
|
- **Finetuned from model:** unsloth/Meta-Llama-3.1-8B-bnb-4bit
|
|
|
This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
|
|
|
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth) |
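
# Loading the uploaded model (optional)

A minimal sketch for loading the merged model that is pushed to the Hub at the end of this card, using plain `transformers`; `device_map="auto"` and `torch_dtype="auto"` are assumptions, adjust them to your hardware.

```
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged 16-bit model pushed at the end of this card
model_id = "vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype="auto")
```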
|
|
|
|
|
|
|
# Code to train the model on Google Colab
|
|
|
# Installing required packages |
|
``` |
|
%%capture |
|
# Install Unsloth from GitHub (Colab build)
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Pick an xformers version compatible with the installed torch
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"

# Install the remaining training dependencies without pulling in their own dependencies
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
|
``` |
|
# Importing required modules
|
|
|
``` |
|
import torch |
|
from trl import SFTTrainer |
|
from datasets import load_dataset |
|
from transformers import TrainingArguments, TextStreamer |
|
from unsloth.chat_templates import get_chat_template |
|
from unsloth import FastLanguageModel, is_bfloat16_supported |
|
``` |
|
|
|
# Log in to Hugging Face using an access token stored in Colab secrets
|
``` |
|
from huggingface_hub import login |
|
from google.colab import userdata |
|
hf_token = userdata.get('HF_API_KEY') |
|
login(token = hf_token) |
|
``` |
|
|
|
# Check if a GPU is available |
|
|
|
``` |
|
import torch |
|
|
|
if torch.cuda.is_available(): |
|
device = torch.device("cuda") |
|
print("GPU is available and being used.") |
|
else: |
|
device = torch.device("cpu") |
|
print("GPU is not available, using CPU.") |
|
``` |
|
|
|
# Loading model from Hugging Face |
|
|
|
``` |
|
max_seq_length = 1024 |
|
model, tokenizer = FastLanguageModel.from_pretrained( |
|
model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit", |
|
max_seq_length=max_seq_length, |
|
load_in_4bit=True, |
|
dtype=None, |
|
) |
|
model = FastLanguageModel.get_peft_model( |
|
model, |
|
r=16, |
|
lora_alpha=16, |
|
lora_dropout=0, |
|
target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], |
|
use_rslora=True, |
|
use_gradient_checkpointing="unsloth" |
|
) |
|
``` |
|
|
|
# Loading and formatting the dataset
|
|
|
``` |
|
raw_dataset = load_dataset("viber1/indian-law-dataset", split="train[:1000]") |
|
|
|
# Define a simple prompt template using only Instruction and Response |
|
|
|
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request. |
|
|
|
### Instruction: |
|
{} |
|
|
|
### Response: |
|
{}""" |
|
|
|
# EOS token for marking the end of each example |
|
EOS_TOKEN = tokenizer.eos_token |
|
|
|
# Function to format prompts with only Instruction and Response |
|
def formatting_prompts_func(examples):
    instructions = examples["Instruction"]
    responses = examples["Response"]

    # Create a formatted text for each example
    texts = []
    for instruction, response in zip(instructions, responses):
        # Format the text with the prompt template and add the EOS token
        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}
|
|
|
# Apply the formatting function to the dataset |
|
dataset = raw_dataset.map(formatting_prompts_func, batched=True) |
|
``` |
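
As a quick, optional sanity check (not in the original notebook), you can print one formatted example to confirm the template and EOS token were applied:

```
# Optional: inspect one formatted training example
print(dataset[0]["text"])
```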
|
|
|
# Using the trainer with a small batch size, gradient checkpointing, LoRA, and 4-bit quantization
|
|
|
``` |
|
trainer=SFTTrainer( |
|
model=model, |
|
tokenizer=tokenizer, |
|
train_dataset=dataset, |
|
dataset_text_field="text", |
|
max_seq_length=max_seq_length, |
|
dataset_num_proc=2, |
|
packing=True, |
|
args=TrainingArguments( |
|
learning_rate=3e-4, |
|
lr_scheduler_type="linear", |
|
per_device_train_batch_size=1, |
|
gradient_accumulation_steps=1, |
|
gradient_checkpointing=True, |
|
num_train_epochs=1, |
|
fp16=not is_bfloat16_supported(), |
|
bf16=is_bfloat16_supported(), |
|
logging_steps=1, |
|
optim="adamw_8bit", |
|
weight_decay=0.01, |
|
warmup_steps=10, |
|
output_dir="output", |
|
seed=0, |
|
), |
|
) |
|
``` |
|
|
|
# Show current memory stats |
|
``` |
|
gpu_stats = torch.cuda.get_device_properties(0) |
|
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3) |
|
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) |
|
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.") |
|
print(f"{start_gpu_memory} GB of memory reserved.") |
|
``` |
|
|
|
# Start Training |
|
``` |
|
trainer_stats = trainer.train() |
|
``` |
|
|
|
# Show final memory and time stats |
|
``` |
|
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3) |
|
used_memory_for_lora = round(used_memory - start_gpu_memory, 3) |
|
used_percentage = round(used_memory / max_memory * 100, 3)

lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
|
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.") |
|
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.") |
|
print(f"Peak reserved memory = {used_memory} GB.") |
|
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.") |
|
print(f"Peak reserved memory % of max memory = {used_percentage} %.") |
|
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.") |
|
``` |
|
|
|
# Saving the trained model and pushing it to Hugging Face
|
``` |
|
# Merge to 16bit |
|
model.save_pretrained_merged("Indian-Law-Llama-3.1-8B", tokenizer, save_method = "merged_16bit",) |
|
|
|
model.push_to_hub_merged("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B", tokenizer, save_method="merged_16bit", token = hf_token) |
|
``` |
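
As an alternative, if you only want the lightweight LoRA adapter weights rather than the merged 16-bit model, the PEFT model and tokenizer can be pushed directly with the standard `push_to_hub`; the repo name below is hypothetical.

```
# Optional alternative: push only the LoRA adapters (repo name is hypothetical)
model.push_to_hub("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B-LoRA", token=hf_token)
tokenizer.push_to_hub("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B-LoRA", token=hf_token)
```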
|
|
|
|
|
# Model usage with streaming response |
|
|
|
``` |
|
# alpaca_prompt = Copied from above |
|
FastLanguageModel.for_inference(model) # Enable native 2x faster inference |
|
inputs = tokenizer( |
|
[ |
|
alpaca_prompt.format(
    "What is the difference between a petition and a plaint in Indian law?",  # instruction
    "",  # response left blank so the model generates it
)
|
], return_tensors = "pt").to("cuda") |
|
|
|
from transformers import TextStreamer |
|
text_streamer = TextStreamer(tokenizer) |
|
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128) |
|
``` |
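
If you want the full response as a string instead of streaming it to stdout, a minimal variant using the same `inputs`:

```
# Generate without streaming and decode the whole output
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```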