Borcherding committed
Commit 169f166 · verified · 1 Parent(s): 14810d0

Upload app.py

Files changed (1)
  1. app.py +456 -440
app.py CHANGED
@@ -1,440 +1,456 @@
- import gradio as gr
- from huggingface_hub import HfApi
- from unsloth import FastLanguageModel, is_bfloat16_supported
- from unsloth.chat_templates import get_chat_template, train_on_responses_only
-
- from trl import SFTTrainer
- from transformers import TrainingArguments, TrainerCallback, DataCollatorForSeq2Seq
- import torch
- from datasets import load_dataset
- import time
- import psutil
- import platform
- import os
-
- hf_user = None
- try:
-     hfApi = HfApi()
-     hf_user = hfApi.whoami()["name"]
- except Exception as e:
-     hf_user = "not logged in"
-
- def get_human_readable_size(size, decimal_places=2):
-     for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
-         if size < 1024.0:
-             break
-         size /= 1024.0
-     return f"{size:.{decimal_places}f} {unit}"
-
-
- # get cpu stats
- disk_stats = psutil.disk_usage('.')
- print(get_human_readable_size(disk_stats.total))
- cpu_info = platform.processor()
- print(cpu_info)
- os_info = platform.platform()
- print(os_info)
-
- memory = psutil.virtual_memory()
-
- # Dropdown options
- model_options = [
-     "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
-     "unsloth/Llama-3.2-1B-bnb-4bit",
-     "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
-     "unsloth/Llama-3.2-3B-bnb-4bit",
-     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
-     "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
-     "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
-     "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
-     "unsloth/llama-3-8b-Instruct-bnb-4bit",
-     "unsloth/llama-3-70b-bnb-4bit",
-     "unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster!
-     "unsloth/Phi-3-medium-4k-instruct",
-     "unsloth/mistral-7b-bnb-4bit",
-     "unsloth/gemma-2-9b-bnb-4bit",
-     "unsloth/gemma-2-9b-bnb-4bit-instruct",
-     "unsloth/gemma-2-27b-bnb-4bit", # Gemma 2x faster!
-     "unsloth/gemma-2-27b-bnb-4bit-instruct", # Gemma 2x faster!
-     "unsloth/Qwen2-1.5B-bnb-4bit",
-     "unsloth/Qwen2-1.5B-bnb-4bit-instruct",
-     "unsloth/Qwen2-7B-bnb-4bit",
-     "unsloth/Qwen2-7B-bnb-4bit-instruct",
-     "unsloth/Qwen2-72B-bnb-4bit",
-     "unsloth/Qwen2-72B-bnb-4bit-instruct",
-     "unsloth/yi-6b-bnb-4bit",
-     "unsloth/yi-34b-bnb-4bit",
- ]
- gpu_stats = torch.cuda.get_device_properties(0)
- start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
- max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
-
- running_on_hf = False
- if os.getenv("SYSTEM", None) == "spaces":
-     running_on_hf = True
-
- system_info = f"""\
- - **System:** {os_info}
- - **CPU:** {cpu_info} **Memory:** {get_human_readable_size(memory.free)} free of {get_human_readable_size(memory.total)}
- - **GPU:** {gpu_stats.name} ({max_memory} GB)
- - **Disk:** {get_human_readable_size(disk_stats.free)} free of {get_human_readable_size(disk_stats.total)}
- - **Hugging Face:** {running_on_hf}
- """
-
- model=None
- tokenizer = None
- dataset = None
- max_seq_length = 2048
-
- class PrinterCallback(TrainerCallback):
-     step = 0
-     def __init__(self, progress):
-         self.progress = progress
-     def on_log(self, args, state, control, logs=None, **kwargs):
-         _ = logs.pop("total_flos", None)
-         if state.is_local_process_zero:
-             #print(logs)
-             pass
-     def on_step_end(self, args, state, control, **kwargs):
-         if state.is_local_process_zero:
-             self.step = state.global_step
-             self.progress(self.step/60, desc=f"Training {self.step}/60")
-             #print("**Step ", state.global_step)
-
-
-
-
- def formatting_prompts_func(examples, prompt):
-     global tokenizer
-     instructions = examples["instruction"]
-     inputs = examples["input"]
-     outputs = examples["output"]
-
-     texts = []
-     for instruction, input, output in zip(instructions, inputs, outputs):
-         conversation = [
-             {
-                 "role": "system",
-                 "content": instruction + tokenizer.eos_token
-             },
-             {
-                 "role": "user",
-                 "content": input + tokenizer.eos_token
-             },
-             {
-                 "role": "assistant",
-                 "content": output + tokenizer.eos_token
-             }
-         ]
-         text = tokenizer.apply_chat_template(
-             conversation, tokenize=False, add_generation_prompt=False
-         )
-
-         texts.append(text)
-
-     return { "text" : texts }
-
- def load_model(initial_model_name, load_in_4bit, max_sequence_length, hub_token):
-     global model, tokenizer, max_seq_length
-     dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-     max_seq_length = max_sequence_length
-     model, tokenizer = FastLanguageModel.from_pretrained(
-         model_name = initial_model_name,
-         max_seq_length = max_sequence_length,
-         dtype = dtype,
-         load_in_4bit = load_in_4bit,
-         token = f"{hub_token}", # use one if using gated models like meta-llama/Llama-2-7b-hf
-     )
-     tokenizer = get_chat_template(
-         tokenizer,
-         chat_template="llama-3.1",
-     )
-     return f"Model {initial_model_name} loaded, using {max_sequence_length} as max sequence length.", gr.update(visible=True, interactive=True), gr.update(interactive=True),gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
-
- def load_data(dataset_name, data_template_style, data_template):
-     global dataset
-     dataset = load_dataset(dataset_name, split = "train")
-     dataset = dataset.map(lambda examples: formatting_prompts_func(examples, data_template), batched=True)
-
-     return f"Data loaded {len(dataset)} records loaded.", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
-
- def inference(prompt, input_text):
-     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
-     inputs = tokenizer(
-     [
-         prompt.format(
-             "Continue the fibonnaci sequence.", # instruction
-             "1, 1, 2, 3, 5, 8", # input
-             "", # output - leave this blank for generation!
-         )
-     ], return_tensors = "pt").to("cuda")
-
-     outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
-     result = tokenizer.batch_decode(outputs)
-     return result[0], gr.update(visible=True, interactive=True)
-
- def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub, progress=gr.Progress()):
-     global model, tokenizer
-
-     quants = []
-     current_quant = 0
-
-     if gguf_custom:
-         gguf_custom_value = gguf_custom_value
-         quants.append(gguf_custom_value)
-     else:
-         gguf_custom_value = None
-
-     if gguf_16bit:
-         quants.append("f16")
-     if gguf_8bit:
-         quants.append("q8_0")
-     if gguf_4bit:
-         quants.append("q4_k_m")
-
-     if merge_16bit:
-         progress(current_quant/len(quants), desc=f"Pushing model merged 16bit {model_name} to HuggingFace Hub")
-         model.save_pretrained_merged(
-             "model",
-             tokenizer,
-             save_method="merged_16bit",
-         )
-         if push_to_hub:
-             model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_16bit", token=hub_token)
-
-     elif merge_4bit:
-         progress(current_quant/len(quants), desc=f"Pushing model merged 4bit {model_name} to HuggingFace Hub")
-         model.save_pretrained_merged(
-             "model",
-             tokenizer,
-             save_method="merged_4bit",
-         )
-         if push_to_hub:
-             model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_4bit", token=hub_token)
-
-     elif just_lora:
-         progress(current_quant/len(quants), desc=f"Pushing model merged lora {model_name} to HuggingFace Hub")
-         model.save_pretrained_merged(
-             "model",
-             tokenizer,
-             save_method="lora",
-         )
-         if push_to_hub:
-             model.push_to_hub_merged(hub_model_name, tokenizer, save_method="lora", token=hub_token)
-
-     if push_to_hub:
-         current_quant = 0
-         for q in quants:
-             progress(current_quant/len(quants), desc=f"Pushing model {model_name} with {q} to HuggingFace Hub")
-             model.push_to_hub_gguf(hub_model_name, tokenizer, quantization_method=q, token=hub_token)
-             current_quant += 1
-     return "Model saved", gr.update(visible=True, interactive=True)
-
- def username(profile: gr.OAuthProfile | None):
-     hf_user = profile["name"] if profile else "not logged in"
-     return hf_user
-
- # Create the Gradio interface
- with gr.Blocks(title="Unsloth fine-tuning") as demo:
-     if (running_on_hf):
-         gr.LoginButton()
-     # logged_user = gr.Markdown(f"**User:** {hf_user}")
-     #demo.load(username, inputs=None, outputs=logged_user)
-     with gr.Row():
-         with gr.Column(scale=0.5):
-             gr.Image("unsloth.png", width="300px", interactive=False, show_download_button=False, show_label=False, show_share_button=False)
-         with gr.Column(min_width="550px", scale=1):
-             gr.Markdown(system_info)
-         with gr.Column(min_width="250px", scale=0.3):
-             gr.Markdown(f"**Links:**\n\n* [Unsloth Hub](https://huggingface.co/unsloth)\n\n* [Unsloth Docs](http://docs.unsloth.com/)\n\n* [Unsloth GitHub](https://github.com/unslothai/unsloth)")
-     with gr.Tab("Base Model Parameters"):
-
-         with gr.Row():
-             initial_model_name = gr.Dropdown(choices=model_options, label="Select Base Model", allow_custom_value=True)
-             load_in_4bit = gr.Checkbox(label="Load 4bit model", value=True)
-
-         gr.Markdown("### Target Model Parameters")
-         with gr.Row():
-             max_sequence_length = gr.Slider(minimum=128, value=512, step=64, maximum=128*1024, interactive=True, label="Max Sequence Length")
-         load_btn = gr.Button("Load")
-         output = gr.Textbox(label="Model Load Status", value="Model not loaded", interactive=False)
-         gr.Markdown("---")
-
-     with gr.Tab("Data Preparation"):
-         with gr.Row():
-             dataset_name = gr.Textbox(label="Dataset Name", value="yahma/alpaca-cleaned")
-             data_template_style = gr.Dropdown(label="Template", choices=["alpaca","custom"], value="alpaca", allow_custom_value=True)
-         with gr.Row():
-             data_template = gr.TextArea(label="Data Template", value="""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
- ### Instruction:
- {}
-
- ### Input:
- {}
-
- ### Response:
- {}""")
-         gr.Markdown("---")
-         output_load_data = gr.Textbox(label="Data Load Status", value="Data not loaded", interactive=False)
-         load_data_btn = gr.Button("Load Dataset", interactive=True)
-         load_data_btn.click(load_data, inputs=[dataset_name, data_template_style, data_template], outputs=[output_load_data, load_data_btn])
-
-     with gr.Tab("Fine-Tuning"):
-         gr.Markdown("""### Fine-Tuned Model Parameters""")
-         with gr.Row():
-             model_name = gr.Textbox(label="Model Name", value=initial_model_name.value, interactive=True)
-
-         gr.Markdown("""### Lora Parameters""")
-
-         with gr.Row():
-             lora_r = gr.Number(label="R", value=16, interactive=True)
-             lora_alpha = gr.Number(label="Lora Alpha", value=16, interactive=True)
-             lora_dropout = gr.Number(label="Lora Dropout", value=0.1, interactive=True)
-
-         gr.Markdown("---")
-         gr.Markdown("""### Training Parameters""")
-         with gr.Row():
-             with gr.Column():
-                 with gr.Row():
-                     per_device_train_batch_size = gr.Number(label="Per Device Train Batch Size", value=2, interactive=True)
-                     warmup_steps = gr.Number(label="Warmup Steps", value=5, interactive=True)
-                     max_steps = gr.Number(label="Max Steps", value=60, interactive=True)
-                     gradient_accumulation_steps = gr.Number(label="Gradient Accumulation Steps", value=4, interactive=True)
-                 with gr.Row():
-                     logging_steps = gr.Number(label="Logging Steps", value=1, interactive=True)
-                     log_to_tensorboard = gr.Checkbox(label="Log to Tensorboard", value=True, interactive=True)
-
-             with gr.Row():
-                 # optim = gr.Dropdown(choices=["adamw_8bit", "adamw", "sgd"], label="Optimizer", value="adamw_8bit")
-                 learning_rate = gr.Number(label="Learning Rate", value=2e-4, interactive=True)
-
-                 # with gr.Row():
-                 weight_decay = gr.Number(label="Weight Decay", value=0.01, interactive=True)
-                 # lr_scheduler_type = gr.Dropdown(choices=["linear", "cosine", "constant"], label="LR Scheduler Type", value="linear")
-         gr.Markdown("---")
-
-         with gr.Row():
-             seed = gr.Number(label="Seed", value=3407, interactive=True)
-             output_dir = gr.Textbox(label="Output Directory", value="outputs", interactive=True)
-         gr.Markdown("---")
-
-         train_output = gr.Textbox(label="Training Status", value="Model not trained", interactive=False)
-         train_btn = gr.Button("Train", visible=True)
-
-         def train_model(model_name: str, lora_r: int, lora_alpha: int, lora_dropout: float, per_device_train_batch_size: int, warmup_steps: int, max_steps: int,
-                         gradient_accumulation_steps: int, logging_steps: int, log_to_tensorboard: bool, learning_rate, weight_decay, seed: int, output_dir, progress= gr.Progress()):
-             global model, tokenizer
-             print(f"$$$ Training model {model_name} with {lora_r} R, {lora_alpha} alpha, {lora_dropout} dropout, {per_device_train_batch_size} per device train batch size, {warmup_steps} warmup steps, {max_steps} max steps, {gradient_accumulation_steps} gradient accumulation steps, {logging_steps} logging steps, {log_to_tensorboard} log to tensorboard, {learning_rate} learning rate, {weight_decay} weight decay, {seed} seed, {output_dir} output dir")
-             iseed = seed
-             model = FastLanguageModel.get_peft_model(
-                 model,
-                 r = lora_r,
-                 target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
-                                   "gate_proj", "up_proj", "down_proj",],
-                 lora_alpha = lora_alpha,
-                 lora_dropout = lora_dropout,
-                 bias = "none",
-                 use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
-                 random_state=iseed,
-                 use_rslora = False, # We support rank stabilized LoRA
-                 loftq_config = None, # And LoftQ
-             )
-             progress(0.0, desc="Loading Trainer")
-             time.sleep(1)
-             trainer = SFTTrainer(
-                 model = model,
-                 tokenizer = tokenizer,
-                 train_dataset = dataset,
-                 dataset_text_field="text",
-                 max_seq_length=max_seq_length,
-                 data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
-                 dataset_num_proc = 2,
-                 packing = False, # Can make training 5x faster for short sequences.
-                 callbacks = [PrinterCallback(progress)],
-                 args = TrainingArguments(
-                     per_device_train_batch_size = per_device_train_batch_size,
-                     gradient_accumulation_steps = gradient_accumulation_steps,
-                     warmup_steps = warmup_steps,
-                     max_steps = 60, # Set num_train_epochs = 1 for full training runs
-                     learning_rate = learning_rate,
-                     fp16 = not is_bfloat16_supported(),
-                     bf16 = is_bfloat16_supported(),
-                     logging_steps = logging_steps,
-                     optim = "adamw_8bit",
-                     weight_decay = weight_decay,
-                     lr_scheduler_type = "linear",
-                     seed = iseed,
-                     report_to="tensorboard" if log_to_tensorboard else None,
-                     output_dir = output_dir
-                 ),
-             )
-             trainer = train_on_responses_only(
-                 trainer,
-                 instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
-                 response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
-             )
-             trainer.train()
-             progress(1, desc="Training completed")
-             time.sleep(1)
-             return "Model trained 100%",gr.update(visible=True, interactive=False), gr.update(visible=True, interactive=True), gr.update(interactive=True)
-
-
-         train_btn.click(train_model, inputs=[model_name, lora_r, lora_alpha, lora_dropout, per_device_train_batch_size, warmup_steps, max_steps, gradient_accumulation_steps, logging_steps, log_to_tensorboard, learning_rate, weight_decay, seed, output_dir], outputs=[train_output, train_btn])
-
-     with gr.Tab("Save & Push Options"):
-
-         with gr.Row():
-             gr.Markdown("### Merging Options")
-             with gr.Column():
-                 merge_16bit = gr.Checkbox(label="Merge to 16bit", value=False, interactive=True)
-                 merge_4bit = gr.Checkbox(label="Merge to 4bit", value=False, interactive=True)
-                 just_lora = gr.Checkbox(label="Just LoRA Adapter", value=False, interactive=True)
-         gr.Markdown("---")
-
-         with gr.Row():
-             gr.Markdown("### GGUF Options")
-             with gr.Column():
-                 gguf_16bit = gr.Checkbox(label="Quantize to f16", value=False, interactive=True)
-                 gguf_8bit = gr.Checkbox(label="Quantize to 8bit (Q8_0)", value=False, interactive=True)
-                 gguf_4bit = gr.Checkbox(label="Quantize to 4bit (q4_k_m)", value=False, interactive=True)
-             with gr.Column():
-                 gguf_custom = gr.Checkbox(label="Custom", value=False, interactive=True)
-                 gguf_custom_value = gr.Textbox(label="", value="Q5_K", interactive=True)
-         gr.Markdown("---")
-
-         with gr.Row():
-             gr.Markdown("### Hugging Face Hub Options")
-             push_to_hub = gr.Checkbox(label="Push to Hub", value=False, interactive=True)
-             with gr.Column():
-                 hub_model_name = gr.Textbox(label="Hub Model Name", value=f"username/model_name", interactive=True)
-                 hub_token = gr.Textbox(label="Hub Token", interactive=True, type="password")
-         gr.Markdown("---")
-
-         # with gr.Row():
-         #     gr.Markdown("### Ollama options")
-         #     with gr.Column():
-         #         ollama_create_local = gr.Checkbox(label="Create in Ollama (local)", value=False, interactive=True)
-         #         ollama_push_to_hub = gr.Checkbox(label="Push to Ollama", value=False, interactive=True)
-         #     with gr.Column():
-         #         ollama_model_name = gr.Textbox(label="Ollama Model Name", value="user/model_name")
-         #         ollama_pub_key = gr.Button("Ollama Pub Key")
-         save_output = gr.Markdown("---")
-         save_button = gr.Button("Save Model", visible=True, interactive=True)
-         save_button.click(save_model, inputs=[model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub], outputs=[save_output, save_button])
-
-     with gr.Tab("Inference"):
-         with gr.Row():
-             input_text = gr.Textbox(label="Input Text", lines=4, value="""\
- Continue the fibonnaci sequence.
- # instruction
- 1, 1, 2, 3, 5, 8
- # input
- """, interactive=True)
-             output_text = gr.Textbox(label="Output Text", lines=4, value="", interactive=False)
-
-         inference_button = gr.Button("Inference", visible=True, interactive=True)
-         inference_button.click(inference, inputs=[data_template, input_text], outputs=[output_text, inference_button])
-     load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length, hub_token], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
-
- demo.launch()
+ import gradio as gr
+ from huggingface_hub import HfApi
+ from unsloth import FastLanguageModel, is_bfloat16_supported
+ from unsloth.chat_templates import get_chat_template, train_on_responses_only
+
+ from trl import SFTTrainer
+ from transformers import TrainingArguments, TrainerCallback, DataCollatorForSeq2Seq
+ import torch
+ from datasets import load_dataset
+ import time
+ import psutil
+ import platform
+ import os
+
+ hf_user = None
+ try:
+     hfApi = HfApi()
+     hf_user = hfApi.whoami()["name"]
+ except Exception as e:
+     hf_user = "not logged in"
+
+ def get_human_readable_size(size, decimal_places=2):
+     for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
+         if size < 1024.0:
+             break
+         size /= 1024.0
+     return f"{size:.{decimal_places}f} {unit}"
+
+
+ # get cpu stats
+ disk_stats = psutil.disk_usage('.')
+ print(get_human_readable_size(disk_stats.total))
+ cpu_info = platform.processor()
+ print(cpu_info)
+ os_info = platform.platform()
+ print(os_info)
+
+ memory = psutil.virtual_memory()
+
+ # Dropdown options
+ model_options = [
+     "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
+     "unsloth/Llama-3.2-1B-bnb-4bit",
+     "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
+     "unsloth/Llama-3.2-3B-bnb-4bit",
+     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
+     "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
+     "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
+     "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
+     "unsloth/llama-3-8b-Instruct-bnb-4bit",
+     "unsloth/llama-3-70b-bnb-4bit",
+     "unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster!
+     "unsloth/Phi-3-medium-4k-instruct",
+     "unsloth/mistral-7b-bnb-4bit",
+     "unsloth/gemma-2-9b-bnb-4bit",
+     "unsloth/gemma-2-9b-bnb-4bit-instruct",
+     "unsloth/gemma-2-27b-bnb-4bit", # Gemma 2x faster!
+     "unsloth/gemma-2-27b-bnb-4bit-instruct", # Gemma 2x faster!
+     "unsloth/Qwen2-1.5B-bnb-4bit",
+     "unsloth/Qwen2-1.5B-bnb-4bit-instruct",
+     "unsloth/Qwen2-7B-bnb-4bit",
+     "unsloth/Qwen2-7B-bnb-4bit-instruct",
+     "unsloth/Qwen2-72B-bnb-4bit",
+     "unsloth/Qwen2-72B-bnb-4bit-instruct",
+     "unsloth/yi-6b-bnb-4bit",
+     "unsloth/yi-34b-bnb-4bit",
+ ]
+ gpu_stats = torch.cuda.get_device_properties(0)
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+
+ running_on_hf = False
+ if os.getenv("SYSTEM", None) == "spaces":
+     running_on_hf = True
+
+ system_info = f"""\
+ - **System:** {os_info}
+ - **CPU:** {cpu_info} **Memory:** {get_human_readable_size(memory.free)} free of {get_human_readable_size(memory.total)}
+ - **GPU:** {gpu_stats.name} ({max_memory} GB)
+ - **Disk:** {get_human_readable_size(disk_stats.free)} free of {get_human_readable_size(disk_stats.total)}
+ - **Hugging Face:** {running_on_hf}
+ """
+
+ model=None
+ tokenizer = None
+ dataset = None
+ max_seq_length = 2048
+
+ class PrinterCallback(TrainerCallback):
+     step = 0
+     def __init__(self, progress):
+         self.progress = progress
+     def on_log(self, args, state, control, logs=None, **kwargs):
+         _ = logs.pop("total_flos", None)
+         if state.is_local_process_zero:
+             #print(logs)
+             pass
+     def on_step_end(self, args, state, control, **kwargs):
+         if state.is_local_process_zero:
+             self.step = state.global_step
+             self.progress(self.step/60, desc=f"Training {self.step}/60")
+             #print("**Step ", state.global_step)
+
+
+
+
+ def formatting_prompts_func(examples, prompt):
+     global tokenizer
+     instructions = examples["instruction"]
+     inputs = examples["input"]
+     outputs = examples["output"]
+
+     texts = []
+     for instruction, input, output in zip(instructions, inputs, outputs):
+         conversation = [
+             {
+                 "role": "system",
+                 "content": instruction + tokenizer.eos_token
+             },
+             {
+                 "role": "user",
+                 "content": input + tokenizer.eos_token
+             },
+             {
+                 "role": "assistant",
+                 "content": output + tokenizer.eos_token
+             }
+         ]
+         text = tokenizer.apply_chat_template(
+             conversation, tokenize=False, add_generation_prompt=False
+         )
+
+         texts.append(text)
+
+     return { "text" : texts }
+
+ def load_model(initial_model_name, load_in_4bit, max_sequence_length, hub_token):
+     global model, tokenizer, max_seq_length
+     dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+     max_seq_length = max_sequence_length
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name = initial_model_name,
+         max_seq_length = max_sequence_length,
+         dtype = dtype,
+         load_in_4bit = load_in_4bit,
+         token = f"{hub_token}", # use one if using gated models like meta-llama/Llama-2-7b-hf
+     )
+     tokenizer = get_chat_template(
+         tokenizer,
+         chat_template="llama-3.1",
+     )
+     return f"Model {initial_model_name} loaded, using {max_sequence_length} as max sequence length.", gr.update(visible=True, interactive=True), gr.update(interactive=True),gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
+
+ def load_data(dataset_name, data_template_style, data_template):
+     global dataset
+     dataset = load_dataset(dataset_name, split = "train")
+     dataset = dataset.map(lambda examples: formatting_prompts_func(examples, data_template), batched=True)
+
+     return f"Data loaded {len(dataset)} records loaded.", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
+
+ def inference(prompt, input_text):
+     FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+     inputs = tokenizer(
+     [
+         prompt.format(
+             "Continue the fibonnaci sequence.", # instruction
+             "1, 1, 2, 3, 5, 8", # input
+             "", # output - leave this blank for generation!
+         )
+     ], return_tensors = "pt").to("cuda")
+
+     outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
+     result = tokenizer.batch_decode(outputs)
+     return result[0], gr.update(visible=True, interactive=True)
+
+ def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub, progress=gr.Progress()):
+     global model, tokenizer
+
+     print("Starting save_model function")
+     print(f"Model name: {model_name}")
+     print(f"Hub model name: {hub_model_name}")
+     print(f"GGUF 16bit: {gguf_16bit}, GGUF 8bit: {gguf_8bit}, GGUF 4bit: {gguf_4bit}")
+     print(f"Merge 16bit: {merge_16bit}, Merge 4bit: {merge_4bit}, Just LoRA: {just_lora}")
+     print(f"Push to hub: {push_to_hub}")
+
+     quants = []
+     current_quant = 0
+
+     if gguf_custom:
+         gguf_custom_value = gguf_custom_value
+         quants.append(gguf_custom_value)
+         print(f"Custom GGUF value: {gguf_custom_value}")
+     else:
+         gguf_custom_value = None
+
+     if gguf_16bit:
+         quants.append("f16")
+     if gguf_8bit:
+         quants.append("q8_0")
+     if gguf_4bit:
+         quants.append("q4_k_m")
+
+     if merge_16bit:
+         print("Merging model to 16bit")
+         progress(current_quant/len(quants), desc=f"Pushing model merged 16bit {model_name} to HuggingFace Hub")
+         model.save_pretrained_merged(
+             "model",
+             tokenizer,
+             save_method="merged_16bit",
+         )
+         if push_to_hub:
+             print("Pushing merged 16bit model to HuggingFace Hub")
+             model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_16bit", token=hub_token)
+
+     elif merge_4bit:
+         print("Merging model to 4bit")
+         progress(current_quant/len(quants), desc=f"Pushing model merged 4bit {model_name} to HuggingFace Hub")
+         model.save_pretrained_merged(
+             "model",
+             tokenizer,
+             save_method="merged_4bit_forced",
+         )
+         if push_to_hub:
+             print("Pushing merged 4bit model to HuggingFace Hub")
+             model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_4bit_forced", token=hub_token)
+
+     elif just_lora:
+         print("Saving just LoRA")
+         progress(current_quant/len(quants), desc=f"Pushing model merged lora {model_name} to HuggingFace Hub")
+         model.save_pretrained_merged(
+             "model",
+             tokenizer,
+             save_method="lora",
+         )
+         if push_to_hub:
+             print("Pushing LoRA model to HuggingFace Hub")
+             model.push_to_hub_merged(hub_model_name, tokenizer, save_method="lora", token=hub_token)
+
+     if push_to_hub:
+         current_quant = 0
+         for q in quants:
+             print(f"Pushing model with quantization {q} to HuggingFace Hub")
+             progress(current_quant/len(quants), desc=f"Pushing model {model_name} with {q} to HuggingFace Hub")
+             model.push_to_hub_gguf(hub_model_name, tokenizer, quantization_method=q, token=hub_token)
+             current_quant += 1
+     print("Model saved successfully")
+     return "Model saved", gr.update(visible=True, interactive=True)
+
+ def username(profile: gr.OAuthProfile | None):
+     hf_user = profile["name"] if profile else "not logged in"
+     return hf_user
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Unsloth fine-tuning") as demo:
+     if (running_on_hf):
+         gr.LoginButton()
+     # logged_user = gr.Markdown(f"**User:** {hf_user}")
+     #demo.load(username, inputs=None, outputs=logged_user)
+     with gr.Row():
+         with gr.Column(scale=0.5):
+             gr.Image("unsloth.png", width="300px", interactive=False, show_download_button=False, show_label=False, show_share_button=False)
+         with gr.Column(min_width="550px", scale=1):
+             gr.Markdown(system_info)
+         with gr.Column(min_width="250px", scale=0.3):
+             gr.Markdown(f"**Links:**\n\n* [Unsloth Hub](https://huggingface.co/unsloth)\n\n* [Unsloth Docs](http://docs.unsloth.com/)\n\n* [Unsloth GitHub](https://github.com/unslothai/unsloth)")
+     with gr.Tab("Base Model Parameters"):
+
+         with gr.Row():
+             initial_model_name = gr.Dropdown(choices=model_options, label="Select Base Model", allow_custom_value=True)
+             load_in_4bit = gr.Checkbox(label="Load 4bit model", value=True)
+
+         gr.Markdown("### Target Model Parameters")
+         with gr.Row():
+             max_sequence_length = gr.Slider(minimum=128, value=512, step=64, maximum=128*1024, interactive=True, label="Max Sequence Length")
+         load_btn = gr.Button("Load")
+         output = gr.Textbox(label="Model Load Status", value="Model not loaded", interactive=False)
+         gr.Markdown("---")
+
+     with gr.Tab("Data Preparation"):
+         with gr.Row():
+             dataset_name = gr.Textbox(label="Dataset Name", value="yahma/alpaca-cleaned")
+             data_template_style = gr.Dropdown(label="Template", choices=["alpaca","custom"], value="alpaca", allow_custom_value=True)
+         with gr.Row():
+             data_template = gr.TextArea(label="Data Template", value="""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {}
+
+ ### Input:
+ {}
+
+ ### Response:
+ {}""")
+         gr.Markdown("---")
+         output_load_data = gr.Textbox(label="Data Load Status", value="Data not loaded", interactive=False)
+         load_data_btn = gr.Button("Load Dataset", interactive=True)
+         load_data_btn.click(load_data, inputs=[dataset_name, data_template_style, data_template], outputs=[output_load_data, load_data_btn])
+
+     with gr.Tab("Fine-Tuning"):
+         gr.Markdown("""### Fine-Tuned Model Parameters""")
+         with gr.Row():
+             model_name = gr.Textbox(label="Model Name", value=initial_model_name.value, interactive=True)
+
+         gr.Markdown("""### Lora Parameters""")
+
+         with gr.Row():
+             lora_r = gr.Number(label="R", value=16, interactive=True)
+             lora_alpha = gr.Number(label="Lora Alpha", value=16, interactive=True)
+             lora_dropout = gr.Number(label="Lora Dropout", value=0.1, interactive=True)
+
+         gr.Markdown("---")
+         gr.Markdown("""### Training Parameters""")
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     per_device_train_batch_size = gr.Number(label="Per Device Train Batch Size", value=2, interactive=True)
+                     warmup_steps = gr.Number(label="Warmup Steps", value=5, interactive=True)
+                     max_steps = gr.Number(label="Max Steps", value=60, interactive=True)
+                     gradient_accumulation_steps = gr.Number(label="Gradient Accumulation Steps", value=4, interactive=True)
+                 with gr.Row():
+                     logging_steps = gr.Number(label="Logging Steps", value=1, interactive=True)
+                     log_to_tensorboard = gr.Checkbox(label="Log to Tensorboard", value=True, interactive=True)
+
+             with gr.Row():
+                 # optim = gr.Dropdown(choices=["adamw_8bit", "adamw", "sgd"], label="Optimizer", value="adamw_8bit")
+                 learning_rate = gr.Number(label="Learning Rate", value=2e-4, interactive=True)
+
+                 # with gr.Row():
+                 weight_decay = gr.Number(label="Weight Decay", value=0.01, interactive=True)
+                 # lr_scheduler_type = gr.Dropdown(choices=["linear", "cosine", "constant"], label="LR Scheduler Type", value="linear")
+         gr.Markdown("---")
+
+         with gr.Row():
+             seed = gr.Number(label="Seed", value=3407, interactive=True)
+             output_dir = gr.Textbox(label="Output Directory", value="outputs", interactive=True)
+         gr.Markdown("---")
+
+         train_output = gr.Textbox(label="Training Status", value="Model not trained", interactive=False)
+         train_btn = gr.Button("Train", visible=True)
+
+         def train_model(model_name: str, lora_r: int, lora_alpha: int, lora_dropout: float, per_device_train_batch_size: int, warmup_steps: int, max_steps: int,
+                         gradient_accumulation_steps: int, logging_steps: int, log_to_tensorboard: bool, learning_rate, weight_decay, seed: int, output_dir, progress= gr.Progress()):
+             global model, tokenizer
+             print(f"$$$ Training model {model_name} with {lora_r} R, {lora_alpha} alpha, {lora_dropout} dropout, {per_device_train_batch_size} per device train batch size, {warmup_steps} warmup steps, {max_steps} max steps, {gradient_accumulation_steps} gradient accumulation steps, {logging_steps} logging steps, {log_to_tensorboard} log to tensorboard, {learning_rate} learning rate, {weight_decay} weight decay, {seed} seed, {output_dir} output dir")
+             iseed = seed
+             model = FastLanguageModel.get_peft_model(
+                 model,
+                 r = lora_r,
+                 target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                                   "gate_proj", "up_proj", "down_proj",],
+                 lora_alpha = lora_alpha,
+                 lora_dropout = lora_dropout,
+                 bias = "none",
+                 use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+                 random_state=iseed,
+                 use_rslora = False, # We support rank stabilized LoRA
+                 loftq_config = None, # And LoftQ
+             )
+             progress(0.0, desc="Loading Trainer")
+             time.sleep(1)
+             trainer = SFTTrainer(
+                 model = model,
+                 tokenizer = tokenizer,
+                 train_dataset = dataset,
+                 dataset_text_field="text",
+                 max_seq_length=max_seq_length,
+                 data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
+                 dataset_num_proc = 2,
+                 packing = False, # Can make training 5x faster for short sequences.
+                 callbacks = [PrinterCallback(progress)],
+                 args = TrainingArguments(
+                     per_device_train_batch_size = per_device_train_batch_size,
+                     gradient_accumulation_steps = gradient_accumulation_steps,
+                     warmup_steps = warmup_steps,
+                     max_steps = 60, # Set num_train_epochs = 1 for full training runs
+                     learning_rate = learning_rate,
+                     fp16 = not is_bfloat16_supported(),
+                     bf16 = is_bfloat16_supported(),
+                     logging_steps = logging_steps,
+                     optim = "adamw_8bit",
+                     weight_decay = weight_decay,
+                     lr_scheduler_type = "linear",
+                     seed = iseed,
+                     report_to="tensorboard" if log_to_tensorboard else None,
+                     output_dir = output_dir
+                 ),
+             )
+             trainer = train_on_responses_only(
+                 trainer,
+                 instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
+                 response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
+             )
+             trainer.train()
+             progress(1, desc="Training completed")
+             time.sleep(1)
+             return "Model trained 100%",gr.update(visible=True, interactive=False), gr.update(visible=True, interactive=True), gr.update(interactive=True)
+
+
+         train_btn.click(train_model, inputs=[model_name, lora_r, lora_alpha, lora_dropout, per_device_train_batch_size, warmup_steps, max_steps, gradient_accumulation_steps, logging_steps, log_to_tensorboard, learning_rate, weight_decay, seed, output_dir], outputs=[train_output, train_btn])
+
+     with gr.Tab("Save & Push Options"):
+
+         with gr.Row():
+             gr.Markdown("### Merging Options")
+             with gr.Column():
+                 merge_16bit = gr.Checkbox(label="Merge to 16bit", value=False, interactive=True)
+                 merge_4bit = gr.Checkbox(label="Merge to 4bit", value=False, interactive=True)
+                 just_lora = gr.Checkbox(label="Just LoRA Adapter", value=False, interactive=True)
+         gr.Markdown("---")
+
+         with gr.Row():
+             gr.Markdown("### GGUF Options")
+             with gr.Column():
+                 gguf_16bit = gr.Checkbox(label="Quantize to f16", value=False, interactive=True)
+                 gguf_8bit = gr.Checkbox(label="Quantize to 8bit (Q8_0)", value=False, interactive=True)
+                 gguf_4bit = gr.Checkbox(label="Quantize to 4bit (q4_k_m)", value=False, interactive=True)
+             with gr.Column():
+                 gguf_custom = gr.Checkbox(label="Custom", value=False, interactive=True)
+                 gguf_custom_value = gr.Textbox(label="", value="Q5_K", interactive=True)
+         gr.Markdown("---")
+
+         with gr.Row():
+             gr.Markdown("### Hugging Face Hub Options")
+             push_to_hub = gr.Checkbox(label="Push to Hub", value=False, interactive=True)
+             with gr.Column():
+                 hub_model_name = gr.Textbox(label="Hub Model Name", value=f"username/model_name", interactive=True)
+                 hub_token = gr.Textbox(label="Hub Token", interactive=True, type="password")
+         gr.Markdown("---")
+
+         # with gr.Row():
+         #     gr.Markdown("### Ollama options")
+         #     with gr.Column():
+         #         ollama_create_local = gr.Checkbox(label="Create in Ollama (local)", value=False, interactive=True)
+         #         ollama_push_to_hub = gr.Checkbox(label="Push to Ollama", value=False, interactive=True)
+         #     with gr.Column():
+         #         ollama_model_name = gr.Textbox(label="Ollama Model Name", value="user/model_name")
+         #         ollama_pub_key = gr.Button("Ollama Pub Key")
+         save_output = gr.Markdown("---")
+         save_button = gr.Button("Save Model", visible=True, interactive=True)
+         save_button.click(save_model, inputs=[model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub], outputs=[save_output, save_button])
+
+     with gr.Tab("Inference"):
+         with gr.Row():
+             input_text = gr.Textbox(label="Input Text", lines=4, value="""\
+ Continue the fibonnaci sequence.
+ # instruction
+ 1, 1, 2, 3, 5, 8
+ # input
+ """, interactive=True)
+             output_text = gr.Textbox(label="Output Text", lines=4, value="", interactive=False)
+
+         inference_button = gr.Button("Inference", visible=True, interactive=True)
+         inference_button.click(inference, inputs=[data_template, input_text], outputs=[output_text, inference_button])
+     load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length, hub_token], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
+
+ demo.launch()