|
--- |
|
base_model: unsloth/Meta-Llama-3.1-8B-bnb-4bit |
|
language: |
|
- en |
|
license: apache-2.0 |
|
tags: |
|
- text-generation-inference |
|
- transformers |
|
- unsloth |
|
- llama |
|
- trl |
|
- sft |
|
--- |
|
|
|
# Uploaded model |
|
|
|
- **Developed by:** vakodiya |
|
- **License:** apache-2.0 |
|
- **Finetuned from model:** unsloth/Meta-Llama-3.1-8B-bnb-4bit
|
|
|
This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
|
|
|
[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth) |
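
# Loading the uploaded model (optional)

A minimal sketch for loading the merged model that is pushed to the Hub at the end of this card, using plain `transformers`; `device_map="auto"` and `torch_dtype="auto"` are assumptions, adjust them to your hardware.

```
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged 16-bit model pushed at the end of this card
model_id = "vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype="auto")
```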
|
|
|
|
|
|
|
# Code to train the model on Google Colab
|
|
|
# Installing required packages |
|
``` |
|
%%capture |
|
# Install Unsloth from GitHub (Colab build)
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Pick an xformers version compatible with the installed torch
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"

# Install the remaining training dependencies without pulling in their own dependencies
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
|
``` |
|
# Importing required modules
|
|
|
``` |
|
import torch |
|
from trl import SFTTrainer |
|
from datasets import load_dataset |
|
from transformers import TrainingArguments, TextStreamer |
|
from unsloth.chat_templates import get_chat_template |
|
from unsloth import FastLanguageModel, is_bfloat16_supported |
|
``` |
|
|
|
# Log in to Hugging Face using an access token stored in Colab secrets
|
``` |
|
from huggingface_hub import login |
|
from google.colab import userdata |
|
hf_token = userdata.get('HF_API_KEY') |
|
login(token = hf_token) |
|
``` |
|
|
|
# Check if a GPU is available |
|
|
|
``` |
|
import torch |
|
|
|
if torch.cuda.is_available(): |
|
device = torch.device("cuda") |
|
print("GPU is available and being used.") |
|
else: |
|
device = torch.device("cpu") |
|
print("GPU is not available, using CPU.") |
|
``` |
|
|
|
# Loading model from Hugging Face |
|
|
|
``` |
|
max_seq_length = 1024 |
|
model, tokenizer = FastLanguageModel.from_pretrained( |
|
model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit", |
|
max_seq_length=max_seq_length, |
|
load_in_4bit=True, |
|
dtype=None, |
|
) |
|
model = FastLanguageModel.get_peft_model( |
|
model, |
|
r=16, |
|
lora_alpha=16, |
|
lora_dropout=0, |
|
target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], |
|
use_rslora=True, |
|
use_gradient_checkpointing="unsloth" |
|
) |
|
``` |
|
|
|
# Loading and formatting the dataset
|
|
|
``` |
|
raw_dataset = load_dataset("viber1/indian-law-dataset", split="train[:1000]") |
|
|
|
# Define a simple prompt template using only Instruction and Response |
|
|
|
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request. |
|
|
|
### Instruction: |
|
{} |
|
|
|
### Response: |
|
{}""" |
|
|
|
# EOS token for marking the end of each example |
|
EOS_TOKEN = tokenizer.eos_token |
|
|
|
# Function to format prompts with only Instruction and Response |
|
def formatting_prompts_func(examples):
    instructions = examples["Instruction"]
    responses = examples["Response"]

    # Create a formatted text for each example
    texts = []
    for instruction, response in zip(instructions, responses):
        # Format the text with the prompt template and add the EOS token
        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}
|
|
|
# Apply the formatting function to the dataset |
|
dataset = raw_dataset.map(formatting_prompts_func, batched=True) |
|
``` |
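
As a quick, optional sanity check (not in the original notebook), you can print one formatted example to confirm the template and EOS token were applied:

```
# Optional: inspect one formatted training example
print(dataset[0]["text"])
```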
|
|
|
# Using the trainer with a small batch size, gradient checkpointing, LoRA, and 4-bit quantization
|
|
|
``` |
|
trainer=SFTTrainer( |
|
model=model, |
|
tokenizer=tokenizer, |
|
train_dataset=dataset, |
|
dataset_text_field="text", |
|
max_seq_length=max_seq_length, |
|
dataset_num_proc=2, |
|
packing=True, |
|
args=TrainingArguments( |
|
learning_rate=3e-4, |
|
lr_scheduler_type="linear", |
|
per_device_train_batch_size=1, |
|
gradient_accumulation_steps=1, |
|
gradient_checkpointing=True, |
|
num_train_epochs=1, |
|
fp16=not is_bfloat16_supported(), |
|
bf16=is_bfloat16_supported(), |
|
logging_steps=1, |
|
optim="adamw_8bit", |
|
weight_decay=0.01, |
|
warmup_steps=10, |
|
output_dir="output", |
|
seed=0, |
|
), |
|
) |
|
``` |
|
|
|
# Show current memory stats |
|
``` |
|
gpu_stats = torch.cuda.get_device_properties(0) |
|
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3) |
|
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3) |
|
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.") |
|
print(f"{start_gpu_memory} GB of memory reserved.") |
|
``` |
|
|
|
# Start Training |
|
``` |
|
trainer_stats = trainer.train() |
|
``` |
|
|
|
# Show final memory and time stats |
|
``` |
|
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3) |
|
used_memory_for_lora = round(used_memory - start_gpu_memory, 3) |
|
used_percentage = round(used_memory / max_memory * 100, 3)

lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
|
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.") |
|
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.") |
|
print(f"Peak reserved memory = {used_memory} GB.") |
|
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.") |
|
print(f"Peak reserved memory % of max memory = {used_percentage} %.") |
|
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.") |
|
``` |
|
|
|
# Saving the trained model and pushing it to Hugging Face
|
``` |
|
# Merge to 16bit |
|
model.save_pretrained_merged("Indian-Law-Llama-3.1-8B", tokenizer, save_method = "merged_16bit",) |
|
|
|
model.push_to_hub_merged("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B", tokenizer, save_method="merged_16bit", token = hf_token) |
|
``` |
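
As an alternative, if you only want the lightweight LoRA adapter weights rather than the merged 16-bit model, the PEFT model and tokenizer can be pushed directly with the standard `push_to_hub`; the repo name below is hypothetical.

```
# Optional alternative: push only the LoRA adapters (repo name is hypothetical)
model.push_to_hub("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B-LoRA", token=hf_token)
tokenizer.push_to_hub("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B-LoRA", token=hf_token)
```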
|
|
|
|
|
# Model usage with streaming response |
|
|
|
``` |
|
# alpaca_prompt = Copied from above |
|
FastLanguageModel.for_inference(model) # Enable native 2x faster inference |
|
inputs = tokenizer( |
|
[ |
|
alpaca_prompt.format(
    "What is the difference between a petition and a plaint in Indian law?",  # instruction
    "",  # response left blank so the model generates it
)
|
], return_tensors = "pt").to("cuda") |
|
|
|
from transformers import TextStreamer |
|
text_streamer = TextStreamer(tokenizer) |
|
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128) |
|
``` |
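
If you want the full response as a string instead of streaming it to stdout, a minimal variant using the same `inputs`:

```
# Generate without streaming and decode the whole output
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```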