Borcherding
committed on
Upload app.py
app.py
CHANGED
@@ -1,15 +1,13 @@
 import gradio as gr
 from huggingface_hub import HfApi
-from unsloth import FastLanguageModel
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from unsloth.chat_templates import get_chat_template, train_on_responses_only
+
 from trl import SFTTrainer
-from transformers import TrainingArguments, TrainerCallback
-from unsloth import is_bfloat16_supported
+from transformers import TrainingArguments, TrainerCallback, DataCollatorForSeq2Seq
 import torch
 from datasets import load_dataset
-import logging
-from io import StringIO
 import time
-import asyncio
 import psutil
 import platform
 import os
@@ -41,6 +39,11 @@ memory = psutil.virtual_memory()
 
 # Dropdown options
 model_options = [
+    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
+    "unsloth/Llama-3.2-1B-bnb-4bit",
+    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
+    "unsloth/Llama-3.2-3B-bnb-4bit",
+    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
     "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
     "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
     "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
@@ -98,21 +101,40 @@ class PrinterCallback(TrainerCallback):
         self.progress(self.step/60, desc=f"Training {self.step}/60")
         #print("**Step ", state.global_step)
 
+
 
 
 def formatting_prompts_func(examples, prompt):
-
+    global tokenizer
     instructions = examples["instruction"]
     inputs = examples["input"]
     outputs = examples["output"]
+
     texts = []
     for instruction, input, output in zip(instructions, inputs, outputs):
-
-
+        conversation = [
+            {
+                "role": "system",
+                "content": instruction + tokenizer.eos_token
+            },
+            {
+                "role": "user",
+                "content": input + tokenizer.eos_token
+            },
+            {
+                "role": "assistant",
+                "content": output + tokenizer.eos_token
+            }
+        ]
+        text = tokenizer.apply_chat_template(
+            conversation, tokenize=False, add_generation_prompt=False
+        )
+
         texts.append(text)
-
+
+    return { "text" : texts }
 
-def load_model(initial_model_name, load_in_4bit, max_sequence_length):
+def load_model(initial_model_name, load_in_4bit, max_sequence_length, hub_token):
     global model, tokenizer, max_seq_length
     dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
     max_seq_length = max_sequence_length
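Reviewer note, not part of the commit: the reworked formatting_prompts_func renders each record through the tokenizer's Llama-3.1 chat template instead of a plain prompt string. A minimal sketch of the same flow, assuming tokenizer has already been wrapped by get_chat_template(tokenizer, chat_template="llama-3.1"); the record values are placeholders:

# Sketch only; `tokenizer` is assumed to carry the "llama-3.1" chat template already.
conversation = [
    {"role": "system", "content": "Continue the sequence." + tokenizer.eos_token},
    {"role": "user", "content": "1, 1, 2, 3, 5, 8" + tokenizer.eos_token},
    {"role": "assistant", "content": "13, 21, 34" + tokenizer.eos_token},
]
text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
# `text` is one string with <|start_header_id|>...<|end_header_id|> turn markers;
# the batched map() call collects these strings into the returned {"text": texts} column.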
@@ -121,7 +143,11 @@ def load_model(initial_model_name, load_in_4bit, max_sequence_length):
         max_seq_length = max_sequence_length,
         dtype = dtype,
         load_in_4bit = load_in_4bit,
-
+        token = f"{hub_token}", # use one if using gated models like meta-llama/Llama-2-7b-hf
+    )
+    tokenizer = get_chat_template(
+        tokenizer,
+        chat_template="llama-3.1",
     )
     return f"Model {initial_model_name} loaded, using {max_sequence_length} as max sequence length.", gr.update(visible=True, interactive=True), gr.update(interactive=True),gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
 
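Reviewer note, not part of the commit: load_model now threads hub_token into FastLanguageModel.from_pretrained (needed only for gated checkpoints) and rebinds the tokenizer with the llama-3.1 chat template. A standalone sketch with placeholder values:

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",  # any entry from model_options
    max_seq_length=2048,   # placeholder
    dtype=None,            # auto-detect
    load_in_4bit=True,
    token="hf_...",        # placeholder; only required for gated repos
)
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")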
@@ -129,6 +155,7 @@ def load_data(dataset_name, data_template_style, data_template):
     global dataset
     dataset = load_dataset(dataset_name, split = "train")
     dataset = dataset.map(lambda examples: formatting_prompts_func(examples, data_template), batched=True)
+
     return f"Data loaded {len(dataset)} records loaded.", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
 
 def inference(prompt, input_text):
@@ -150,6 +177,7 @@ def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, ggu
     global model, tokenizer
 
     quants = []
+    current_quant = 0
 
     if gguf_custom:
         gguf_custom_value = gguf_custom_value
@@ -165,15 +193,35 @@ def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, ggu
         quants.append("q4_k_m")
 
     if merge_16bit:
-
+        progress(current_quant/len(quants), desc=f"Pushing model merged 16bit {model_name} to HuggingFace Hub")
+        model.save_pretrained_merged(
+            "model",
+            tokenizer,
+            save_method="merged_16bit",
+        )
+        if push_to_hub:
+            model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_16bit", token=hub_token)
+
     elif merge_4bit:
-
+        progress(current_quant/len(quants), desc=f"Pushing model merged 4bit {model_name} to HuggingFace Hub")
+        model.save_pretrained_merged(
+            "model",
+            tokenizer,
+            save_method="merged_4bit",
+        )
+        if push_to_hub:
+            model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_4bit", token=hub_token)
+
     elif just_lora:
-
-
-
+        progress(current_quant/len(quants), desc=f"Pushing model merged lora {model_name} to HuggingFace Hub")
+        model.save_pretrained_merged(
+            "model",
+            tokenizer,
+            save_method="lora",
+        )
+        if push_to_hub:
+            model.push_to_hub_merged(hub_model_name, tokenizer, save_method="lora", token=hub_token)
 
-    #model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")
     if push_to_hub:
         current_quant = 0
         for q in quants:
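Reviewer note, not part of the commit: the three new branches differ only in the save_method passed to Unsloth's save_pretrained_merged / push_to_hub_merged ("merged_16bit", "merged_4bit", or "lora"). A minimal sketch of one branch, with placeholder repo id and token:

model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
if push_to_hub:
    model.push_to_hub_merged("your-username/your-model", tokenizer,
                             save_method="merged_16bit", token="hf_...")  # placeholders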
@@ -298,8 +346,9 @@ with gr.Blocks(title="Unsloth fine-tuning") as demo:
         model = model,
         tokenizer = tokenizer,
         train_dataset = dataset,
-        dataset_text_field
-        max_seq_length
+        dataset_text_field="text",
+        max_seq_length=max_seq_length,
+        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
         dataset_num_proc = 2,
         packing = False, # Can make training 5x faster for short sequences.
         callbacks = [PrinterCallback(progress)],
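Reviewer note, an assumption rather than something stated in the commit: DataCollatorForSeq2Seq is used here because it pads labels together with input_ids and attention_mask, which the response-only masking added below relies on. Minimal construction:

from transformers import DataCollatorForSeq2Seq
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)  # pads input_ids, attention_mask, and labels to a common length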
@@ -320,6 +369,11 @@ with gr.Blocks(title="Unsloth fine-tuning") as demo:
             output_dir = output_dir
         ),
     )
+    trainer = train_on_responses_only(
+        trainer,
+        instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
+        response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
+    )
     trainer.train()
     progress(1, desc="Training completed")
     time.sleep(1)
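Reviewer note, not part of the commit: train_on_responses_only wraps the existing SFTTrainer so that only assistant turns contribute to the loss; the instruction_part / response_part strings must match the llama-3.1 header markers emitted by the chat template. Usage sketch, assuming trainer is the SFTTrainer configured above:

from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)
trainer.train()  # system/user tokens are label-masked; only responses are trained on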
@@ -381,6 +435,6 @@ Continue the fibonnaci sequence.
 
     inference_button = gr.Button("Inference", visible=True, interactive=True)
     inference_button.click(inference, inputs=[data_template, input_text], outputs=[output_text, inference_button])
-    load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
+    load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length, hub_token], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
 
 demo.launch()