vakodiya
/

Viber-Indian-Law-Unsloth-Llama-3.1-8B

@@ -21,3 +21,188 @@ tags:
 This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
 [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

 This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
 [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
+# Code to train the model on Google Colab:
+# Installing required packages
+```
+%%capture
+!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
+from torch import __version__; from packaging.version import Version as V
+xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
+!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
+```
+# Importing required modules
+```
+import torch
+from trl import SFTTrainer
+from datasets import load_dataset
+from transformers import TrainingArguments, TextStreamer
+from unsloth.chat_templates import get_chat_template
+from unsloth import FastLanguageModel, is_bfloat16_supported
+```
+# Log in to Hugging Face using a write-access token stored in Colab secrets (key: HF_API_KEY)
+```
+from huggingface_hub import login
+from google.colab import userdata
+hf_token = userdata.get('HF_API_KEY')
+login(token = hf_token)
+```
+# Check if a GPU is available
+```
+import torch
+if torch.cuda.is_available():
+  device = torch.device("cuda")
+  print("GPU is available and being used.")
+else:
+  device = torch.device("cpu")
+  print("GPU is not available, using CPU.")
+```
+# Loading model from Hugging Face
+```
+max_seq_length = 1024
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
+    max_seq_length=max_seq_length,
+    load_in_4bit=True,
+    dtype=None,
+)
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=16,
+    lora_alpha=16,
+    lora_dropout=0,
+    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
+    use_rslora=True,
+    use_gradient_checkpointing="unsloth"
+)
+```
+# Loading and formatting the dataset
+```
+raw_dataset = load_dataset("viber1/indian-law-dataset", split="train[:1000]")
+# Define a simple prompt template using only Instruction and Response
+alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
+### Instruction:
+{}
+### Response:
+{}"""
+# EOS token for marking the end of each example
+EOS_TOKEN = tokenizer.eos_token
+# Function to format prompts with only Instruction and Response
+def formatting_prompts_func(examples):
+    Instruction = examples["Instruction"]
+    Response = examples["Response"]
+    # Create a formatted text for each example
+    texts = []
+    for Instruction, Response in zip(Instruction, Response):
+        # Format the text with the prompt template and add the EOS token
+        text = alpaca_prompt.format(Instruction, Response) + EOS_TOKEN
+        texts.append(text)
+    return {"text": texts}
+# Apply the formatting function to the dataset
+dataset = raw_dataset.map(formatting_prompts_func, batched=True)
+```
+# Using Trainer with low batch sizes, Gradient Checkpointing, LoRA and Quantization
+```
+trainer=SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=max_seq_length,
+    dataset_num_proc=2,
+    packing=True,
+    args=TrainingArguments(
+        learning_rate=3e-4,
+        lr_scheduler_type="linear",
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=1,
+        gradient_checkpointing=True,
+        num_train_epochs=1,
+        fp16=not is_bfloat16_supported(),
+        bf16=is_bfloat16_supported(),
+        logging_steps=1,
+        optim="adamw_8bit",
+        weight_decay=0.01,
+        warmup_steps=10,
+        output_dir="output",
+        seed=0,
+    ),
+)
+```
+# Show current memory stats
+```
+gpu_stats = torch.cuda.get_device_properties(0)
+start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+print(f"{start_gpu_memory} GB of memory reserved.")
+```
+# Start Training
+```
+trainer_stats = trainer.train()
+```
+# Show final memory and time stats
+```
+used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+used_percentage = round(used_memory         /max_memory*100, 3)
+lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
+print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
+print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
+print(f"Peak reserved memory = {used_memory} GB.")
+print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
+```
+# Finally, save the trained model and push it to Hugging Face
+```
+# Merge to 16bit
+model.save_pretrained_merged("Indian-Law-Llama-3.1-8B", tokenizer, save_method = "merged_16bit",)
+model.push_to_hub_merged("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B", tokenizer, save_method="merged_16bit", token = hf_token)
+```
+# Model usage with streaming response
+```
+# alpaca_prompt is the same template string defined in the dataset-formatting step above
+FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+inputs = tokenizer(
+[
+   alpaca_prompt.format(
+        "What is the difference between a petition and a plaint in Indian law?",''
+    )
+], return_tensors = "pt").to("cuda")
+from transformers import TextStreamer
+text_streamer = TextStreamer(tokenizer)
+_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)
+```