Borcherding committed on
Commit 1316578 · verified · 1 Parent(s): 169f166

Update app.py

Files changed (1)
  1. app.py +468 -455
app.py CHANGED
@@ -1,456 +1,469 @@
1
- import gradio as gr
2
- from huggingface_hub import HfApi
3
- from unsloth import FastLanguageModel, is_bfloat16_supported
4
- from unsloth.chat_templates import get_chat_template, train_on_responses_only
5
-
6
- from trl import SFTTrainer
7
- from transformers import TrainingArguments, TrainerCallback, DataCollatorForSeq2Seq
8
- import torch
9
- from datasets import load_dataset
10
- import time
11
- import psutil
12
- import platform
13
- import os
14
-
15
- hf_user = None
16
- try:
17
- hfApi = HfApi()
18
- hf_user = hfApi.whoami()["name"]
19
- except Exception as e:
20
- hf_user = "not logged in"
21
-
22
- def get_human_readable_size(size, decimal_places=2):
23
- for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
24
- if size < 1024.0:
25
- break
26
- size /= 1024.0
27
- return f"{size:.{decimal_places}f} {unit}"
28
-
29
-
30
- # get cpu stats
31
- disk_stats = psutil.disk_usage('.')
32
- print(get_human_readable_size(disk_stats.total))
33
- cpu_info = platform.processor()
34
- print(cpu_info)
35
- os_info = platform.platform()
36
- print(os_info)
37
-
38
- memory = psutil.virtual_memory()
39
-
40
- # Dropdown options
41
- model_options = [
42
- "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
43
- "unsloth/Llama-3.2-1B-bnb-4bit",
44
- "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
45
- "unsloth/Llama-3.2-3B-bnb-4bit",
46
- "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
47
- "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
48
- "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
49
- "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
50
- "unsloth/llama-3-8b-Instruct-bnb-4bit",
51
- "unsloth/llama-3-70b-bnb-4bit",
52
- "unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster!
53
- "unsloth/Phi-3-medium-4k-instruct",
54
- "unsloth/mistral-7b-bnb-4bit",
55
- "unsloth/gemma-2-9b-bnb-4bit",
56
- "unsloth/gemma-2-9b-bnb-4bit-instruct",
57
- "unsloth/gemma-2-27b-bnb-4bit", # Gemma 2x faster!
58
- "unsloth/gemma-2-27b-bnb-4bit-instruct", # Gemma 2x faster!
59
- "unsloth/Qwen2-1.5B-bnb-4bit",
60
- "unsloth/Qwen2-1.5B-bnb-4bit-instruct",
61
- "unsloth/Qwen2-7B-bnb-4bit",
62
- "unsloth/Qwen2-7B-bnb-4bit-instruct",
63
- "unsloth/Qwen2-72B-bnb-4bit",
64
- "unsloth/Qwen2-72B-bnb-4bit-instruct",
65
- "unsloth/yi-6b-bnb-4bit",
66
- "unsloth/yi-34b-bnb-4bit",
67
- ]
68
- gpu_stats = torch.cuda.get_device_properties(0)
69
- start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
70
- max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
71
-
72
- running_on_hf = False
73
- if os.getenv("SYSTEM", None) == "spaces":
74
- running_on_hf = True
75
-
76
- system_info = f"""\
77
- - **System:** {os_info}
78
- - **CPU:** {cpu_info} **Memory:** {get_human_readable_size(memory.free)} free of {get_human_readable_size(memory.total)}
79
- - **GPU:** {gpu_stats.name} ({max_memory} GB)
80
- - **Disk:** {get_human_readable_size(disk_stats.free)} free of {get_human_readable_size(disk_stats.total)}
81
- - **Hugging Face:** {running_on_hf}
82
- """
83
-
84
- model=None
85
- tokenizer = None
86
- dataset = None
87
- max_seq_length = 2048
88
-
89
- class PrinterCallback(TrainerCallback):
90
- step = 0
91
- def __init__(self, progress):
92
- self.progress = progress
93
- def on_log(self, args, state, control, logs=None, **kwargs):
94
- _ = logs.pop("total_flos", None)
95
- if state.is_local_process_zero:
96
- #print(logs)
97
- pass
98
- def on_step_end(self, args, state, control, **kwargs):
99
- if state.is_local_process_zero:
100
- self.step = state.global_step
101
- self.progress(self.step/60, desc=f"Training {self.step}/60")
102
- #print("**Step ", state.global_step)
103
-
104
-
105
-
106
-
107
- def formatting_prompts_func(examples, prompt):
108
- global tokenizer
109
- instructions = examples["instruction"]
110
- inputs = examples["input"]
111
- outputs = examples["output"]
112
-
113
- texts = []
114
- for instruction, input, output in zip(instructions, inputs, outputs):
115
- conversation = [
116
- {
117
- "role": "system",
118
- "content": instruction + tokenizer.eos_token
119
- },
120
- {
121
- "role": "user",
122
- "content": input + tokenizer.eos_token
123
- },
124
- {
125
- "role": "assistant",
126
- "content": output + tokenizer.eos_token
127
- }
128
- ]
129
- text = tokenizer.apply_chat_template(
130
- conversation, tokenize=False, add_generation_prompt=False
131
- )
132
-
133
- texts.append(text)
134
-
135
- return { "text" : texts }
136
-
137
- def load_model(initial_model_name, load_in_4bit, max_sequence_length, hub_token):
138
- global model, tokenizer, max_seq_length
139
- dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
140
- max_seq_length = max_sequence_length
141
- model, tokenizer = FastLanguageModel.from_pretrained(
142
- model_name = initial_model_name,
143
- max_seq_length = max_sequence_length,
144
- dtype = dtype,
145
- load_in_4bit = load_in_4bit,
146
- token = f"{hub_token}", # use one if using gated models like meta-llama/Llama-2-7b-hf
147
- )
148
- tokenizer = get_chat_template(
149
- tokenizer,
150
- chat_template="llama-3.1",
151
- )
152
- return f"Model {initial_model_name} loaded, using {max_sequence_length} as max sequence length.", gr.update(visible=True, interactive=True), gr.update(interactive=True),gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
153
-
154
- def load_data(dataset_name, data_template_style, data_template):
155
- global dataset
156
- dataset = load_dataset(dataset_name, split = "train")
157
- dataset = dataset.map(lambda examples: formatting_prompts_func(examples, data_template), batched=True)
158
-
159
- return f"Data loaded {len(dataset)} records loaded.", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
160
-
161
- def inference(prompt, input_text):
162
- FastLanguageModel.for_inference(model) # Enable native 2x faster inference
163
- inputs = tokenizer(
164
- [
165
- prompt.format(
166
- "Continue the fibonnaci sequence.", # instruction
167
- "1, 1, 2, 3, 5, 8", # input
168
- "", # output - leave this blank for generation!
169
- )
170
- ], return_tensors = "pt").to("cuda")
171
-
172
- outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
173
- result = tokenizer.batch_decode(outputs)
174
- return result[0], gr.update(visible=True, interactive=True)
175
-
176
- def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub, progress=gr.Progress()):
177
- global model, tokenizer
178
-
179
- print("Starting save_model function")
180
- print(f"Model name: {model_name}")
181
- print(f"Hub model name: {hub_model_name}")
182
- print(f"GGUF 16bit: {gguf_16bit}, GGUF 8bit: {gguf_8bit}, GGUF 4bit: {gguf_4bit}")
183
- print(f"Merge 16bit: {merge_16bit}, Merge 4bit: {merge_4bit}, Just LoRA: {just_lora}")
184
- print(f"Push to hub: {push_to_hub}")
185
-
186
- quants = []
187
- current_quant = 0
188
-
189
- if gguf_custom:
190
- gguf_custom_value = gguf_custom_value
191
- quants.append(gguf_custom_value)
192
- print(f"Custom GGUF value: {gguf_custom_value}")
193
- else:
194
- gguf_custom_value = None
195
-
196
- if gguf_16bit:
197
- quants.append("f16")
198
- if gguf_8bit:
199
- quants.append("q8_0")
200
- if gguf_4bit:
201
- quants.append("q4_k_m")
202
-
203
- if merge_16bit:
204
- print("Merging model to 16bit")
205
- progress(current_quant/len(quants), desc=f"Pushing model merged 16bit {model_name} to HuggingFace Hub")
206
- model.save_pretrained_merged(
207
- "model",
208
- tokenizer,
209
- save_method="merged_16bit",
210
- )
211
- if push_to_hub:
212
- print("Pushing merged 16bit model to HuggingFace Hub")
213
- model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_16bit", token=hub_token)
214
-
215
- elif merge_4bit:
216
- print("Merging model to 4bit")
217
- progress(current_quant/len(quants), desc=f"Pushing model merged 4bit {model_name} to HuggingFace Hub")
218
- model.save_pretrained_merged(
219
- "model",
220
- tokenizer,
221
- save_method="merged_4bit_forced",
222
- )
223
- if push_to_hub:
224
- print("Pushing merged 4bit model to HuggingFace Hub")
225
- model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_4bit_forced", token=hub_token)
226
-
227
- elif just_lora:
228
- print("Saving just LoRA")
229
- progress(current_quant/len(quants), desc=f"Pushing model merged lora {model_name} to HuggingFace Hub")
230
- model.save_pretrained_merged(
231
- "model",
232
- tokenizer,
233
- save_method="lora",
234
- )
235
- if push_to_hub:
236
- print("Pushing LoRA model to HuggingFace Hub")
237
- model.push_to_hub_merged(hub_model_name, tokenizer, save_method="lora", token=hub_token)
238
-
239
- if push_to_hub:
240
- current_quant = 0
241
- for q in quants:
242
- print(f"Pushing model with quantization {q} to HuggingFace Hub")
243
- progress(current_quant/len(quants), desc=f"Pushing model {model_name} with {q} to HuggingFace Hub")
244
- model.push_to_hub_gguf(hub_model_name, tokenizer, quantization_method=q, token=hub_token)
245
- current_quant += 1
246
- print("Model saved successfully")
247
- return "Model saved", gr.update(visible=True, interactive=True)
248
-
249
- def username(profile: gr.OAuthProfile | None):
250
- hf_user = profile["name"] if profile else "not logged in"
251
- return hf_user
252
-
253
- # Create the Gradio interface
254
- with gr.Blocks(title="Unsloth fine-tuning") as demo:
255
- if (running_on_hf):
256
- gr.LoginButton()
257
- # logged_user = gr.Markdown(f"**User:** {hf_user}")
258
- #demo.load(username, inputs=None, outputs=logged_user)
259
- with gr.Row():
260
- with gr.Column(scale=0.5):
261
- gr.Image("unsloth.png", width="300px", interactive=False, show_download_button=False, show_label=False, show_share_button=False)
262
- with gr.Column(min_width="550px", scale=1):
263
- gr.Markdown(system_info)
264
- with gr.Column(min_width="250px", scale=0.3):
265
- gr.Markdown(f"**Links:**\n\n* [Unsloth Hub](https://huggingface.co/unsloth)\n\n* [Unsloth Docs](http://docs.unsloth.com/)\n\n* [Unsloth GitHub](https://github.com/unslothai/unsloth)")
266
- with gr.Tab("Base Model Parameters"):
267
-
268
- with gr.Row():
269
- initial_model_name = gr.Dropdown(choices=model_options, label="Select Base Model", allow_custom_value=True)
270
- load_in_4bit = gr.Checkbox(label="Load 4bit model", value=True)
271
-
272
- gr.Markdown("### Target Model Parameters")
273
- with gr.Row():
274
- max_sequence_length = gr.Slider(minimum=128, value=512, step=64, maximum=128*1024, interactive=True, label="Max Sequence Length")
275
- load_btn = gr.Button("Load")
276
- output = gr.Textbox(label="Model Load Status", value="Model not loaded", interactive=False)
277
- gr.Markdown("---")
278
-
279
- with gr.Tab("Data Preparation"):
280
- with gr.Row():
281
- dataset_name = gr.Textbox(label="Dataset Name", value="yahma/alpaca-cleaned")
282
- data_template_style = gr.Dropdown(label="Template", choices=["alpaca","custom"], value="alpaca", allow_custom_value=True)
283
- with gr.Row():
284
- data_template = gr.TextArea(label="Data Template", value="""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
285
-
286
- ### Instruction:
287
- {}
288
-
289
- ### Input:
290
- {}
291
-
292
- ### Response:
293
- {}""")
294
- gr.Markdown("---")
295
- output_load_data = gr.Textbox(label="Data Load Status", value="Data not loaded", interactive=False)
296
- load_data_btn = gr.Button("Load Dataset", interactive=True)
297
- load_data_btn.click(load_data, inputs=[dataset_name, data_template_style, data_template], outputs=[output_load_data, load_data_btn])
298
-
299
- with gr.Tab("Fine-Tuning"):
300
- gr.Markdown("""### Fine-Tuned Model Parameters""")
301
- with gr.Row():
302
- model_name = gr.Textbox(label="Model Name", value=initial_model_name.value, interactive=True)
303
-
304
- gr.Markdown("""### Lora Parameters""")
305
-
306
- with gr.Row():
307
- lora_r = gr.Number(label="R", value=16, interactive=True)
308
- lora_alpha = gr.Number(label="Lora Alpha", value=16, interactive=True)
309
- lora_dropout = gr.Number(label="Lora Dropout", value=0.1, interactive=True)
310
-
311
- gr.Markdown("---")
312
- gr.Markdown("""### Training Parameters""")
313
- with gr.Row():
314
- with gr.Column():
315
- with gr.Row():
316
- per_device_train_batch_size = gr.Number(label="Per Device Train Batch Size", value=2, interactive=True)
317
- warmup_steps = gr.Number(label="Warmup Steps", value=5, interactive=True)
318
- max_steps = gr.Number(label="Max Steps", value=60, interactive=True)
319
- gradient_accumulation_steps = gr.Number(label="Gradient Accumulation Steps", value=4, interactive=True)
320
- with gr.Row():
321
- logging_steps = gr.Number(label="Logging Steps", value=1, interactive=True)
322
- log_to_tensorboard = gr.Checkbox(label="Log to Tensorboard", value=True, interactive=True)
323
-
324
- with gr.Row():
325
- # optim = gr.Dropdown(choices=["adamw_8bit", "adamw", "sgd"], label="Optimizer", value="adamw_8bit")
326
- learning_rate = gr.Number(label="Learning Rate", value=2e-4, interactive=True)
327
-
328
- # with gr.Row():
329
- weight_decay = gr.Number(label="Weight Decay", value=0.01, interactive=True)
330
- # lr_scheduler_type = gr.Dropdown(choices=["linear", "cosine", "constant"], label="LR Scheduler Type", value="linear")
331
- gr.Markdown("---")
332
-
333
- with gr.Row():
334
- seed = gr.Number(label="Seed", value=3407, interactive=True)
335
- output_dir = gr.Textbox(label="Output Directory", value="outputs", interactive=True)
336
- gr.Markdown("---")
337
-
338
- train_output = gr.Textbox(label="Training Status", value="Model not trained", interactive=False)
339
- train_btn = gr.Button("Train", visible=True)
340
-
341
- def train_model(model_name: str, lora_r: int, lora_alpha: int, lora_dropout: float, per_device_train_batch_size: int, warmup_steps: int, max_steps: int,
342
- gradient_accumulation_steps: int, logging_steps: int, log_to_tensorboard: bool, learning_rate, weight_decay, seed: int, output_dir, progress= gr.Progress()):
343
- global model, tokenizer
344
- print(f"$$$ Training model {model_name} with {lora_r} R, {lora_alpha} alpha, {lora_dropout} dropout, {per_device_train_batch_size} per device train batch size, {warmup_steps} warmup steps, {max_steps} max steps, {gradient_accumulation_steps} gradient accumulation steps, {logging_steps} logging steps, {log_to_tensorboard} log to tensorboard, {learning_rate} learning rate, {weight_decay} weight decay, {seed} seed, {output_dir} output dir")
345
- iseed = seed
346
- model = FastLanguageModel.get_peft_model(
347
- model,
348
- r = lora_r,
349
- target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
350
- "gate_proj", "up_proj", "down_proj",],
351
- lora_alpha = lora_alpha,
352
- lora_dropout = lora_dropout,
353
- bias = "none",
354
- use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
355
- random_state=iseed,
356
- use_rslora = False, # We support rank stabilized LoRA
357
- loftq_config = None, # And LoftQ
358
- )
359
- progress(0.0, desc="Loading Trainer")
360
- time.sleep(1)
361
- trainer = SFTTrainer(
362
- model = model,
363
- tokenizer = tokenizer,
364
- train_dataset = dataset,
365
- dataset_text_field="text",
366
- max_seq_length=max_seq_length,
367
- data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
368
- dataset_num_proc = 2,
369
- packing = False, # Can make training 5x faster for short sequences.
370
- callbacks = [PrinterCallback(progress)],
371
- args = TrainingArguments(
372
- per_device_train_batch_size = per_device_train_batch_size,
373
- gradient_accumulation_steps = gradient_accumulation_steps,
374
- warmup_steps = warmup_steps,
375
- max_steps = 60, # Set num_train_epochs = 1 for full training runs
376
- learning_rate = learning_rate,
377
- fp16 = not is_bfloat16_supported(),
378
- bf16 = is_bfloat16_supported(),
379
- logging_steps = logging_steps,
380
- optim = "adamw_8bit",
381
- weight_decay = weight_decay,
382
- lr_scheduler_type = "linear",
383
- seed = iseed,
384
- report_to="tensorboard" if log_to_tensorboard else None,
385
- output_dir = output_dir
386
- ),
387
- )
388
- trainer = train_on_responses_only(
389
- trainer,
390
- instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
391
- response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
392
- )
393
- trainer.train()
394
- progress(1, desc="Training completed")
395
- time.sleep(1)
396
- return "Model trained 100%",gr.update(visible=True, interactive=False), gr.update(visible=True, interactive=True), gr.update(interactive=True)
397
-
398
-
399
- train_btn.click(train_model, inputs=[model_name, lora_r, lora_alpha, lora_dropout, per_device_train_batch_size, warmup_steps, max_steps, gradient_accumulation_steps, logging_steps, log_to_tensorboard, learning_rate, weight_decay, seed, output_dir], outputs=[train_output, train_btn])
400
-
401
- with gr.Tab("Save & Push Options"):
402
-
403
- with gr.Row():
404
- gr.Markdown("### Merging Options")
405
- with gr.Column():
406
- merge_16bit = gr.Checkbox(label="Merge to 16bit", value=False, interactive=True)
407
- merge_4bit = gr.Checkbox(label="Merge to 4bit", value=False, interactive=True)
408
- just_lora = gr.Checkbox(label="Just LoRA Adapter", value=False, interactive=True)
409
- gr.Markdown("---")
410
-
411
- with gr.Row():
412
- gr.Markdown("### GGUF Options")
413
- with gr.Column():
414
- gguf_16bit = gr.Checkbox(label="Quantize to f16", value=False, interactive=True)
415
- gguf_8bit = gr.Checkbox(label="Quantize to 8bit (Q8_0)", value=False, interactive=True)
416
- gguf_4bit = gr.Checkbox(label="Quantize to 4bit (q4_k_m)", value=False, interactive=True)
417
- with gr.Column():
418
- gguf_custom = gr.Checkbox(label="Custom", value=False, interactive=True)
419
- gguf_custom_value = gr.Textbox(label="", value="Q5_K", interactive=True)
420
- gr.Markdown("---")
421
-
422
- with gr.Row():
423
- gr.Markdown("### Hugging Face Hub Options")
424
- push_to_hub = gr.Checkbox(label="Push to Hub", value=False, interactive=True)
425
- with gr.Column():
426
- hub_model_name = gr.Textbox(label="Hub Model Name", value=f"username/model_name", interactive=True)
427
- hub_token = gr.Textbox(label="Hub Token", interactive=True, type="password")
428
- gr.Markdown("---")
429
-
430
- # with gr.Row():
431
- # gr.Markdown("### Ollama options")
432
- # with gr.Column():
433
- # ollama_create_local = gr.Checkbox(label="Create in Ollama (local)", value=False, interactive=True)
434
- # ollama_push_to_hub = gr.Checkbox(label="Push to Ollama", value=False, interactive=True)
435
- # with gr.Column():
436
- # ollama_model_name = gr.Textbox(label="Ollama Model Name", value="user/model_name")
437
- # ollama_pub_key = gr.Button("Ollama Pub Key")
438
- save_output = gr.Markdown("---")
439
- save_button = gr.Button("Save Model", visible=True, interactive=True)
440
- save_button.click(save_model, inputs=[model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub], outputs=[save_output, save_button])
441
-
442
- with gr.Tab("Inference"):
443
- with gr.Row():
444
- input_text = gr.Textbox(label="Input Text", lines=4, value="""\
445
- Continue the fibonnaci sequence.
446
- # instruction
447
- 1, 1, 2, 3, 5, 8
448
- # input
449
- """, interactive=True)
450
- output_text = gr.Textbox(label="Output Text", lines=4, value="", interactive=False)
451
-
452
- inference_button = gr.Button("Inference", visible=True, interactive=True)
453
- inference_button.click(inference, inputs=[data_template, input_text], outputs=[output_text, inference_button])
454
- load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length, hub_token], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
455
-
456
  demo.launch()
 
1
+ import gradio as gr
2
+ from huggingface_hub import HfApi
3
+ from unsloth import FastLanguageModel, is_bfloat16_supported
4
+ from unsloth.chat_templates import get_chat_template, train_on_responses_only
5
+
6
+ from trl import SFTTrainer
7
+ from transformers import TrainingArguments, TrainerCallback, DataCollatorForSeq2Seq
8
+ import torch
9
+ from datasets import load_dataset
10
+ import time
11
+ import psutil
12
+ import platform
13
+ import os
14
+ import logging
15
+ from pprint import pprint
16
+
17
+ # Configure logging
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+ hf_user = None
22
+ try:
23
+ hfApi = HfApi()
24
+ hf_user = hfApi.whoami()["name"]
25
+ except Exception as e:
26
+ hf_user = "not logged in"
27
+
28
+ def get_human_readable_size(size, decimal_places=2):
29
+ for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
30
+ if size < 1024.0:
31
+ break
32
+ size /= 1024.0
33
+ return f"{size:.{decimal_places}f} {unit}"
34
+
35
+ # get cpu stats
36
+ disk_stats = psutil.disk_usage('.')
37
+ print(get_human_readable_size(disk_stats.total))
38
+ cpu_info = platform.processor()
39
+ print(cpu_info)
40
+ os_info = platform.platform()
41
+ print(os_info)
42
+
43
+ memory = psutil.virtual_memory()
44
+
45
+ # Dropdown options
46
+ model_options = [
47
+ "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
48
+ "unsloth/Llama-3.2-1B-bnb-4bit",
49
+ "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
50
+ "unsloth/Llama-3.2-3B-bnb-4bit",
51
+ "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
52
+ "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
53
+ "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
54
+ "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
55
+ "unsloth/llama-3-8b-Instruct-bnb-4bit",
56
+ "unsloth/llama-3-70b-bnb-4bit",
57
+ "unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster!
58
+ "unsloth/Phi-3-medium-4k-instruct",
59
+ "unsloth/mistral-7b-bnb-4bit",
60
+ "unsloth/gemma-2-9b-bnb-4bit",
61
+ "unsloth/gemma-2-9b-bnb-4bit-instruct",
62
+ "unsloth/gemma-2-27b-bnb-4bit", # Gemma 2x faster!
63
+ "unsloth/gemma-2-27b-bnb-4bit-instruct", # Gemma 2x faster!
64
+ "unsloth/Qwen2-1.5B-bnb-4bit",
65
+ "unsloth/Qwen2-1.5B-bnb-4bit-instruct",
66
+ "unsloth/Qwen2-7B-bnb-4bit",
67
+ "unsloth/Qwen2-7B-bnb-4bit-instruct",
68
+ "unsloth/Qwen2-72B-bnb-4bit",
69
+ "unsloth/Qwen2-72B-bnb-4bit-instruct",
70
+ "unsloth/yi-6b-bnb-4bit",
71
+ "unsloth/yi-34b-bnb-4bit",
72
+ ]
73
+ gpu_stats = torch.cuda.get_device_properties(0)
74
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
75
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
76
+
77
+ running_on_hf = False
78
+ if os.getenv("SYSTEM", None) == "spaces":
79
+ running_on_hf = True
80
+
81
+ system_info = f"""\
82
+ - **System:** {os_info}
83
+ - **CPU:** {cpu_info} **Memory:** {get_human_readable_size(memory.free)} free of {get_human_readable_size(memory.total)}
84
+ - **GPU:** {gpu_stats.name} ({max_memory} GB)
85
+ - **Disk:** {get_human_readable_size(disk_stats.free)} free of {get_human_readable_size(disk_stats.total)}
86
+ - **Hugging Face:** {running_on_hf}
87
+ """
88
+
89
+ model=None
90
+ tokenizer = None
91
+ dataset = None
92
+ max_seq_length = 2048
93
+
94
+ class PrinterCallback(TrainerCallback):
95
+ step = 0
96
+ def __init__(self, progress):
97
+ self.progress = progress
98
+ def on_log(self, args, state, control, logs=None, **kwargs):
99
+ _ = logs.pop("total_flos", None)
100
+ if state.is_local_process_zero:
101
+ #print(logs)
102
+ pass
103
+ def on_step_end(self, args, state, control, **kwargs):
104
+ if state.is_local_process_zero:
105
+ self.step = state.global_step
106
+ self.progress(self.step/60, desc=f"Training {self.step}/60")
107
+ #print("**Step ", state.global_step)
108
+
109
+ def formatting_prompts_func(examples_data, data_template):
110
+ texts = []
111
+ for example in examples_data:
112
+ instruction = example['instruction']
113
+ user_input = example['input']
114
+ assistant_output = example['output']
115
+
116
+ conversation = [
117
+ {
118
+ "role": "system",
119
+ "content": instruction + tokenizer.eos_token
120
+ },
121
+ {
122
+ "role": "user",
123
+ "content": user_input + tokenizer.eos_token
124
+ },
125
+ {
126
+ "role": "assistant",
127
+ "content": assistant_output + tokenizer.eos_token
128
+ }
129
+ ]
130
+ text = tokenizer.apply_chat_template(
131
+ conversation, tokenize=False, add_generation_prompt=False
132
+ )
133
+ texts.append(text)
134
+
135
+ return { "text" : texts }
136
+
137
+ def load_model(initial_model_name, load_in_4bit, max_sequence_length, hub_token):
138
+ global model, tokenizer, max_seq_length
139
+ dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
140
+ max_seq_length = max_sequence_length
141
+ model, tokenizer = FastLanguageModel.from_pretrained(
142
+ model_name = initial_model_name,
143
+ max_seq_length = max_sequence_length,
144
+ dtype = dtype,
145
+ load_in_4bit = load_in_4bit,
146
+ token = f"{hub_token}", # use one if using gated models like meta-llama/Llama-2-7b-hf
147
+ )
148
+ tokenizer = get_chat_template(
149
+ tokenizer,
150
+ chat_template="llama-3.1",
151
+ )
152
+ return f"Model {initial_model_name} loaded, using {max_sequence_length} as max sequence length.", gr.update(visible=True, interactive=True), gr.update(interactive=True),gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
153
+
154
+ def load_data(dataset_name, data_template_style, data_template):
155
+ global dataset
156
+ try:
157
+ dataset = load_dataset(dataset_name, split="train")
158
+ logger.info("Dataset loaded successfully.")
159
+ pprint(dataset)
160
+ except Exception as e:
161
+ logger.error(f"Error loading dataset: {e}")
162
+ return f"Error loading dataset: {e}", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
163
+
164
+ try:
165
+ dataset = dataset.map(lambda examples: formatting_prompts_func(examples, data_template), batched=True)
166
+ logger.info("Dataset mapped successfully.")
167
+ pprint(dataset)
168
+ except Exception as e:
169
+ logger.error(f"Error mapping dataset: {e}")
170
+ return f"Error mapping dataset: {e}", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
171
+
172
+ return f"Data loaded {len(dataset)} records loaded.", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
173
+
174
+ def inference(prompt, input_text):
175
+ FastLanguageModel.for_inference(model) # Enable native 2x faster inference
176
+ inputs = tokenizer(
177
+ [
178
+ prompt.format(
179
+ "Continue the fibonnaci sequence.", # instruction
180
+ "1, 1, 2, 3, 5, 8", # input
181
+ "", # output - leave this blank for generation!
182
+ )
183
+ ], return_tensors = "pt").to("cuda")
184
+
185
+ outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
186
+ result = tokenizer.batch_decode(outputs)
187
+ return result[0], gr.update(visible=True, interactive=True)
188
+
189
+ def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub, progress=gr.Progress()):
190
+ global model, tokenizer
191
+
192
+ print("Starting save_model function")
193
+ print(f"Model name: {model_name}")
194
+ print(f"Hub model name: {hub_model_name}")
195
+ print(f"GGUF 16bit: {gguf_16bit}, GGUF 8bit: {gguf_8bit}, GGUF 4bit: {gguf_4bit}")
196
+ print(f"Merge 16bit: {merge_16bit}, Merge 4bit: {merge_4bit}, Just LoRA: {just_lora}")
197
+ print(f"Push to hub: {push_to_hub}")
198
+
199
+ quants = []
200
+ current_quant = 0
201
+
202
+ if gguf_custom:
203
+ gguf_custom_value = gguf_custom_value
204
+ quants.append(gguf_custom_value)
205
+ print(f"Custom GGUF value: {gguf_custom_value}")
206
+ else:
207
+ gguf_custom_value = None
208
+
209
+ if gguf_16bit:
210
+ quants.append("f16")
211
+ if gguf_8bit:
212
+ quants.append("q8_0")
213
+ if gguf_4bit:
214
+ quants.append("q4_k_m")
215
+
216
+ if merge_16bit:
217
+ print("Merging model to 16bit")
218
+ progress(current_quant/len(quants), desc=f"Pushing model merged 16bit {model_name} to HuggingFace Hub")
219
+ model.save_pretrained_merged(
220
+ "model",
221
+ tokenizer,
222
+ save_method="merged_16bit",
223
+ )
224
+ if push_to_hub:
225
+ print("Pushing merged 16bit model to HuggingFace Hub")
226
+ model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_16bit", token=hub_token)
227
+
228
+ elif merge_4bit:
229
+ print("Merging model to 4bit")
230
+ progress(current_quant/len(quants), desc=f"Pushing model merged 4bit {model_name} to HuggingFace Hub")
231
+ model.save_pretrained_merged(
232
+ "model",
233
+ tokenizer,
234
+ save_method="merged_4bit_forced",
235
+ )
236
+ if push_to_hub:
237
+ print("Pushing merged 4bit model to HuggingFace Hub")
238
+ model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_4bit_forced", token=hub_token)
239
+
240
+ elif just_lora:
241
+ print("Saving just LoRA")
242
+ progress(current_quant/len(quants), desc=f"Pushing model merged lora {model_name} to HuggingFace Hub")
243
+ model.save_pretrained_merged(
244
+ "model",
245
+ tokenizer,
246
+ save_method="lora",
247
+ )
248
+ if push_to_hub:
249
+ print("Pushing LoRA model to HuggingFace Hub")
250
+ model.push_to_hub_merged(hub_model_name, tokenizer, save_method="lora", token=hub_token)
251
+
252
+ if push_to_hub:
253
+ current_quant = 0
254
+ for q in quants:
255
+ print(f"Pushing model with quantization {q} to HuggingFace Hub")
256
+ progress(current_quant/len(quants), desc=f"Pushing model {model_name} with {q} to HuggingFace Hub")
257
+ model.push_to_hub_gguf(hub_model_name, tokenizer, quantization_method=q, token=hub_token)
258
+ current_quant += 1
259
+ print("Model saved successfully")
260
+ return "Model saved", gr.update(visible=True, interactive=True)
261
+
262
+ def username(profile: gr.OAuthProfile | None):
263
+ hf_user = profile["name"] if profile else "not logged in"
264
+ return hf_user
265
+
266
+ # Create the Gradio interface
267
+ with gr.Blocks(title="Unsloth fine-tuning") as demo:
268
+ if (running_on_hf):
269
+ gr.LoginButton()
270
+ # logged_user = gr.Markdown(f"**User:** {hf_user}")
271
+ #demo.load(username, inputs=None, outputs=logged_user)
272
+ with gr.Row():
273
+ with gr.Column(scale=0.5):
274
+ gr.Image("unsloth.png", width="300px", interactive=False, show_download_button=False, show_label=False, show_share_button=False)
275
+ with gr.Column(min_width="550px", scale=1):
276
+ gr.Markdown(system_info)
277
+ with gr.Column(min_width="250px", scale=0.3):
278
+ gr.Markdown(f"**Links:**\n\n* [Unsloth Hub](https://huggingface.co/unsloth)\n\n* [Unsloth Docs](http://docs.unsloth.com/)\n\n* [Unsloth GitHub](https://github.com/unslothai/unsloth)")
279
+ with gr.Tab("Base Model Parameters"):
280
+
281
+ with gr.Row():
282
+ initial_model_name = gr.Dropdown(choices=model_options, label="Select Base Model", allow_custom_value=True)
283
+ load_in_4bit = gr.Checkbox(label="Load 4bit model", value=True)
284
+
285
+ gr.Markdown("### Target Model Parameters")
286
+ with gr.Row():
287
+ max_sequence_length = gr.Slider(minimum=128, value=512, step=64, maximum=128*1024, interactive=True, label="Max Sequence Length")
288
+ load_btn = gr.Button("Load")
289
+ output = gr.Textbox(label="Model Load Status", value="Model not loaded", interactive=False)
290
+ gr.Markdown("---")
291
+
292
+ with gr.Tab("Data Preparation"):
293
+ with gr.Row():
294
+ dataset_name = gr.Textbox(label="Dataset Name", value="yahma/alpaca-cleaned")
295
+ data_template_style = gr.Dropdown(label="Template", choices=["alpaca","custom"], value="alpaca", allow_custom_value=True)
296
+ with gr.Row():
297
+ data_template = gr.TextArea(label="Data Template", value="""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
298
+
299
+ ### Instruction:
300
+ {}
301
+
302
+ ### Input:
303
+ {}
304
+
305
+ ### Response:
306
+ {}""")
307
+ gr.Markdown("---")
308
+ output_load_data = gr.Textbox(label="Data Load Status", value="Data not loaded", interactive=False)
309
+ load_data_btn = gr.Button("Load Dataset", interactive=True)
310
+ load_data_btn.click(load_data, inputs=[dataset_name, data_template_style, data_template], outputs=[output_load_data, load_data_btn])
311
+
312
+ with gr.Tab("Fine-Tuning"):
313
+ gr.Markdown("""### Fine-Tuned Model Parameters""")
314
+ with gr.Row():
315
+ model_name = gr.Textbox(label="Model Name", value=initial_model_name.value, interactive=True)
316
+
317
+ gr.Markdown("""### Lora Parameters""")
318
+
319
+ with gr.Row():
320
+ lora_r = gr.Number(label="R", value=16, interactive=True)
321
+ lora_alpha = gr.Number(label="Lora Alpha", value=16, interactive=True)
322
+ lora_dropout = gr.Number(label="Lora Dropout", value=0.1, interactive=True)
323
+
324
+ gr.Markdown("---")
325
+ gr.Markdown("""### Training Parameters""")
326
+ with gr.Row():
327
+ with gr.Column():
328
+ with gr.Row():
329
+ per_device_train_batch_size = gr.Number(label="Per Device Train Batch Size", value=2, interactive=True)
330
+ warmup_steps = gr.Number(label="Warmup Steps", value=5, interactive=True)
331
+ max_steps = gr.Number(label="Max Steps", value=60, interactive=True)
332
+ gradient_accumulation_steps = gr.Number(label="Gradient Accumulation Steps", value=4, interactive=True)
333
+ with gr.Row():
334
+ logging_steps = gr.Number(label="Logging Steps", value=1, interactive=True)
335
+ log_to_tensorboard = gr.Checkbox(label="Log to Tensorboard", value=True, interactive=True)
336
+
337
+ with gr.Row():
338
+ # optim = gr.Dropdown(choices=["adamw_8bit", "adamw", "sgd"], label="Optimizer", value="adamw_8bit")
339
+ learning_rate = gr.Number(label="Learning Rate", value=2e-4, interactive=True)
340
+
341
+ # with gr.Row():
342
+ weight_decay = gr.Number(label="Weight Decay", value=0.01, interactive=True)
343
+ # lr_scheduler_type = gr.Dropdown(choices=["linear", "cosine", "constant"], label="LR Scheduler Type", value="linear")
344
+ gr.Markdown("---")
345
+
346
+ with gr.Row():
347
+ seed = gr.Number(label="Seed", value=3407, interactive=True)
348
+ output_dir = gr.Textbox(label="Output Directory", value="outputs", interactive=True)
349
+ gr.Markdown("---")
350
+
351
+ train_output = gr.Textbox(label="Training Status", value="Model not trained", interactive=False)
352
+ train_btn = gr.Button("Train", visible=True)
353
+
354
+ def train_model(model_name: str, lora_r: int, lora_alpha: int, lora_dropout: float, per_device_train_batch_size: int, warmup_steps: int, max_steps: int,
355
+ gradient_accumulation_steps: int, logging_steps: int, log_to_tensorboard: bool, learning_rate, weight_decay, seed: int, output_dir, progress= gr.Progress()):
356
+ global model, tokenizer
357
+ print(f"$$$ Training model {model_name} with {lora_r} R, {lora_alpha} alpha, {lora_dropout} dropout, {per_device_train_batch_size} per device train batch size, {warmup_steps} warmup steps, {max_steps} max steps, {gradient_accumulation_steps} gradient accumulation steps, {logging_steps} logging steps, {log_to_tensorboard} log to tensorboard, {learning_rate} learning rate, {weight_decay} weight decay, {seed} seed, {output_dir} output dir")
358
+ iseed = seed
359
+ model = FastLanguageModel.get_peft_model(
360
+ model,
361
+ r = lora_r,
362
+ target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
363
+ "gate_proj", "up_proj", "down_proj",],
364
+ lora_alpha = lora_alpha,
365
+ lora_dropout = lora_dropout,
366
+ bias = "none",
367
+ use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
368
+ random_state=iseed,
369
+ use_rslora = False, # We support rank stabilized LoRA
370
+ loftq_config = None, # And LoftQ
371
+ )
372
+ progress(0.0, desc="Loading Trainer")
373
+ time.sleep(1)
374
+ trainer = SFTTrainer(
375
+ model = model,
376
+ tokenizer = tokenizer,
377
+ train_dataset = dataset,
378
+ dataset_text_field="text",
379
+ max_seq_length=max_seq_length,
380
+ data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
381
+ dataset_num_proc = 2,
382
+ packing = False, # Can make training 5x faster for short sequences.
383
+ callbacks = [PrinterCallback(progress)],
384
+ args = TrainingArguments(
385
+ per_device_train_batch_size = per_device_train_batch_size,
386
+ gradient_accumulation_steps = gradient_accumulation_steps,
387
+ warmup_steps = warmup_steps,
388
+ max_steps = 60, # Set num_train_epochs = 1 for full training runs
389
+ learning_rate = learning_rate,
390
+ fp16 = not is_bfloat16_supported(),
391
+ bf16 = is_bfloat16_supported(),
392
+ logging_steps = logging_steps,
393
+ optim = "adamw_8bit",
394
+ weight_decay = weight_decay,
395
+ lr_scheduler_type = "linear",
396
+ seed = iseed,
397
+ report_to="tensorboard" if log_to_tensorboard else None,
398
+ output_dir = output_dir
399
+ ),
400
+ )
401
+ trainer = train_on_responses_only(
402
+ trainer,
403
+ instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
404
+ response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
405
+ )
406
+ trainer.train()
407
+ progress(1, desc="Training completed")
408
+ time.sleep(1)
409
+ return "Model trained 100%",gr.update(visible=True, interactive=False), gr.update(visible=True, interactive=True), gr.update(interactive=True)
410
+
411
+
412
+ train_btn.click(train_model, inputs=[model_name, lora_r, lora_alpha, lora_dropout, per_device_train_batch_size, warmup_steps, max_steps, gradient_accumulation_steps, logging_steps, log_to_tensorboard, learning_rate, weight_decay, seed, output_dir], outputs=[train_output, train_btn])
413
+
414
+ with gr.Tab("Save & Push Options"):
415
+
416
+ with gr.Row():
417
+ gr.Markdown("### Merging Options")
418
+ with gr.Column():
419
+ merge_16bit = gr.Checkbox(label="Merge to 16bit", value=False, interactive=True)
420
+ merge_4bit = gr.Checkbox(label="Merge to 4bit", value=False, interactive=True)
421
+ just_lora = gr.Checkbox(label="Just LoRA Adapter", value=False, interactive=True)
422
+ gr.Markdown("---")
423
+
424
+ with gr.Row():
425
+ gr.Markdown("### GGUF Options")
426
+ with gr.Column():
427
+ gguf_16bit = gr.Checkbox(label="Quantize to f16", value=False, interactive=True)
428
+ gguf_8bit = gr.Checkbox(label="Quantize to 8bit (Q8_0)", value=False, interactive=True)
429
+ gguf_4bit = gr.Checkbox(label="Quantize to 4bit (q4_k_m)", value=False, interactive=True)
430
+ with gr.Column():
431
+ gguf_custom = gr.Checkbox(label="Custom", value=False, interactive=True)
432
+ gguf_custom_value = gr.Textbox(label="", value="Q5_K", interactive=True)
433
+ gr.Markdown("---")
434
+
435
+ with gr.Row():
436
+ gr.Markdown("### Hugging Face Hub Options")
437
+ push_to_hub = gr.Checkbox(label="Push to Hub", value=False, interactive=True)
438
+ with gr.Column():
439
+ hub_model_name = gr.Textbox(label="Hub Model Name", value=f"username/model_name", interactive=True)
440
+ hub_token = gr.Textbox(label="Hub Token", interactive=True, type="password")
441
+ gr.Markdown("---")
442
+
443
+ # with gr.Row():
444
+ # gr.Markdown("### Ollama options")
445
+ # with gr.Column():
446
+ # ollama_create_local = gr.Checkbox(label="Create in Ollama (local)", value=False, interactive=True)
447
+ # ollama_push_to_hub = gr.Checkbox(label="Push to Ollama", value=False, interactive=True)
448
+ # with gr.Column():
449
+ # ollama_model_name = gr.Textbox(label="Ollama Model Name", value="user/model_name")
450
+ # ollama_pub_key = gr.Button("Ollama Pub Key")
451
+ save_output = gr.Markdown("---")
452
+ save_button = gr.Button("Save Model", visible=True, interactive=True)
453
+ save_button.click(save_model, inputs=[model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, gguf_4bit, gguf_custom, gguf_custom_value, merge_16bit, merge_4bit, just_lora, push_to_hub], outputs=[save_output, save_button])
454
+
455
+ with gr.Tab("Inference"):
456
+ with gr.Row():
457
+ input_text = gr.Textbox(label="Input Text", lines=4, value="""\
458
+ Continue the fibonnaci sequence.
459
+ # instruction
460
+ 1, 1, 2, 3, 5, 8
461
+ # input
462
+ """, interactive=True)
463
+ output_text = gr.Textbox(label="Output Text", lines=4, value="", interactive=False)
464
+
465
+ inference_button = gr.Button("Inference", visible=True, interactive=True)
466
+ inference_button.click(inference, inputs=[data_template, input_text], outputs=[output_text, inference_button])
467
+ load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length, hub_token], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
468
+
469
  demo.launch()