This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.

[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

# Code To Train the Model on Google Colab

# Installing required packages

```
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Pick the xformers build that matches the installed PyTorch version
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
```

# Importing required modules

```
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
```

# Log in to Hugging Face using an access token stored in Colab secrets

```
from huggingface_hub import login
from google.colab import userdata

# Read the token from the Colab secrets manager and authenticate
hf_token = userdata.get('HF_API_KEY')
login(token=hf_token)
```

# Check if a GPU is available

```
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used.")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")
```

# Loading the model from Hugging Face

```
max_seq_length = 1024

# Load the 4-bit quantized Llama 3.1 8B base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Attach LoRA adapters so only a small fraction of the weights is trained
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)
```
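
To see how lightweight the LoRA setup is, you can count trainable versus total parameters. This is a minimal sketch that only assumes the `model` object created above:

```
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} of {total_params:,} "
      f"({100 * trainable_params / total_params:.2f}%)")
```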

# Loading and formatting the dataset

```
raw_dataset = load_dataset("viber1/indian-law-dataset", split="train[:1000]")

# Define a simple prompt template using only Instruction and Response
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

# EOS token for marking the end of each example
EOS_TOKEN = tokenizer.eos_token

# Function to format prompts with only Instruction and Response
def formatting_prompts_func(examples):
    instructions = examples["Instruction"]
    responses = examples["Response"]

    # Create a formatted text for each example
    texts = []
    for instruction, response in zip(instructions, responses):
        # Fill the prompt template and add the EOS token
        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}

# Apply the formatting function to the dataset
dataset = raw_dataset.map(formatting_prompts_func, batched=True)
```
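
Before training, it is worth printing one processed example to confirm the prompt template and EOS token look right. A quick sanity check on the `dataset` built above:

```
# Inspect the first formatted training example
print(dataset[0]["text"])
```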

# Using Trainer with low batch sizes, gradient checkpointing, LoRA and quantization

```
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        num_train_epochs=1,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        output_dir="output",
        seed=0,
    ),
)
```

# Show current memory stats

```
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
```

# Start Training

```
trainer_stats = trainer.train()
```

# Show final memory and time stats

```
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
```

# Finally, saving the trained model and pushing it to Hugging Face

```
# Merge the LoRA adapters into the base weights and save in 16-bit
model.save_pretrained_merged("Indian-Law-Llama-3.1-8B", tokenizer, save_method="merged_16bit")

# Push the merged model to the Hugging Face Hub
model.push_to_hub_merged("vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B", tokenizer, save_method="merged_16bit", token=hf_token)
```
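
Once pushed, the merged checkpoint can be loaded back like any other Hugging Face model. A minimal sketch, assuming the repository name above and a GPU runtime:

```
from unsloth import FastLanguageModel

# Reload the published model, quantizing on load so it fits a free Colab GPU
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="vakodiya/Viber-Indian-Law-Unsloth-Llama-3.1-8B",
    max_seq_length=1024,
    load_in_4bit=True,
    dtype=None,
)
```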

# Model usage with streaming response

```
# alpaca_prompt is the prompt template defined above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "What is the difference between a petition and a plaint in Indian law?",
            "",  # leave the Response slot empty so the model fills it in
        )
    ],
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
```
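
To capture the answer as a string instead of streaming it to the console, one option (reusing the same `inputs` as above) is:

```
outputs = model.generate(**inputs, max_new_tokens=128)
# Drop the prompt tokens and decode only the newly generated part
generated = outputs[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(generated, skip_special_tokens=True))
```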