Borcherding committed on
Commit f6c2bf0 · verified · 1 Parent(s): c6b7433

Upload app.py

Files changed (1)
  app.py +75 -21
app.py CHANGED
@@ -1,15 +1,13 @@
 import gradio as gr
 from huggingface_hub import HfApi
-from unsloth import FastLanguageModel
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from unsloth.chat_templates import get_chat_template, train_on_responses_only
+
 from trl import SFTTrainer
-from transformers import TrainingArguments, TrainerCallback
-from unsloth import is_bfloat16_supported
+from transformers import TrainingArguments, TrainerCallback, DataCollatorForSeq2Seq
 import torch
 from datasets import load_dataset
-import logging
-from io import StringIO
 import time
-import asyncio
 import psutil
 import platform
 import os
@@ -41,6 +39,11 @@ memory = psutil.virtual_memory()
 
 # Dropdown options
 model_options = [
+    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
+    "unsloth/Llama-3.2-1B-bnb-4bit",
+    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
+    "unsloth/Llama-3.2-3B-bnb-4bit",
+    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
     "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
     "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
     "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
@@ -98,21 +101,40 @@ class PrinterCallback(TrainerCallback):
             self.progress(self.step/60, desc=f"Training {self.step}/60")
         #print("**Step ", state.global_step)
 
+
 
 
 def formatting_prompts_func(examples, prompt):
-    EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
+    global tokenizer
     instructions = examples["instruction"]
     inputs = examples["input"]
     outputs = examples["output"]
+
     texts = []
     for instruction, input, output in zip(instructions, inputs, outputs):
-        # Must add EOS_TOKEN, otherwise your generation will go on forever!
-        text = prompt.format(instruction, input, output) + EOS_TOKEN
+        conversation = [
+            {
+                "role": "system",
+                "content": instruction + tokenizer.eos_token
+            },
+            {
+                "role": "user",
+                "content": input + tokenizer.eos_token
+            },
+            {
+                "role": "assistant",
+                "content": output + tokenizer.eos_token
+            }
+        ]
+        text = tokenizer.apply_chat_template(
+            conversation, tokenize=False, add_generation_prompt=False
+        )
+
         texts.append(text)
-    return { "text" : texts, }
+
+    return { "text" : texts }
 
-def load_model(initial_model_name, load_in_4bit, max_sequence_length):
+def load_model(initial_model_name, load_in_4bit, max_sequence_length, hub_token):
     global model, tokenizer, max_seq_length
     dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
     max_seq_length = max_sequence_length
@@ -121,7 +143,11 @@ def load_model(initial_model_name, load_in_4bit, max_sequence_length):
         max_seq_length = max_sequence_length,
        dtype = dtype,
         load_in_4bit = load_in_4bit,
-        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+        token = f"{hub_token}", # use one if using gated models like meta-llama/Llama-2-7b-hf
+    )
+    tokenizer = get_chat_template(
+        tokenizer,
+        chat_template="llama-3.1",
     )
     return f"Model {initial_model_name} loaded, using {max_sequence_length} as max sequence length.", gr.update(visible=True, interactive=True), gr.update(interactive=True),gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
 
@@ -129,6 +155,7 @@ def load_data(dataset_name, data_template_style, data_template):
     global dataset
     dataset = load_dataset(dataset_name, split = "train")
     dataset = dataset.map(lambda examples: formatting_prompts_func(examples, data_template), batched=True)
+
    return f"Data loaded {len(dataset)} records loaded.", gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True)
 
 def inference(prompt, input_text):
@@ -150,6 +177,7 @@ def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, ggu
     global model, tokenizer
 
     quants = []
+    current_quant = 0
 
     if gguf_custom:
         gguf_custom_value = gguf_custom_value
@@ -165,15 +193,35 @@ def save_model(model_name, hub_model_name, hub_token, gguf_16bit, gguf_8bit, ggu
         quants.append("q4_k_m")
 
     if merge_16bit:
-        merge = "16bit"
+        progress(current_quant/len(quants), desc=f"Pushing model merged 16bit {model_name} to HuggingFace Hub")
+        model.save_pretrained_merged(
+            "model",
+            tokenizer,
+            save_method="merged_16bit",
+        )
+        if push_to_hub:
+            model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_16bit", token=hub_token)
+
     elif merge_4bit:
-        merge = "4bit"
+        progress(current_quant/len(quants), desc=f"Pushing model merged 4bit {model_name} to HuggingFace Hub")
+        model.save_pretrained_merged(
+            "model",
+            tokenizer,
+            save_method="merged_4bit",
+        )
+        if push_to_hub:
+            model.push_to_hub_merged(hub_model_name, tokenizer, save_method="merged_4bit", token=hub_token)
+
     elif just_lora:
-        merge = "lora"
-    else:
-        merge = None
+        progress(current_quant/len(quants), desc=f"Pushing model merged lora {model_name} to HuggingFace Hub")
+        model.save_pretrained_merged(
+            "model",
+            tokenizer,
+            save_method="lora",
+        )
+        if push_to_hub:
+            model.push_to_hub_merged(hub_model_name, tokenizer, save_method="lora", token=hub_token)
 
-    #model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")
     if push_to_hub:
         current_quant = 0
         for q in quants:
@@ -298,8 +346,9 @@ with gr.Blocks(title="Unsloth fine-tuning") as demo:
            model = model,
            tokenizer = tokenizer,
            train_dataset = dataset,
-            dataset_text_field = "text",
-            max_seq_length = max_seq_length,
+            dataset_text_field="text",
+            max_seq_length=max_seq_length,
+            data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
            dataset_num_proc = 2,
            packing = False, # Can make training 5x faster for short sequences.
            callbacks = [PrinterCallback(progress)],
@@ -320,6 +369,11 @@ with gr.Blocks(title="Unsloth fine-tuning") as demo:
                output_dir = output_dir
            ),
        )
+        trainer = train_on_responses_only(
+            trainer,
+            instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
+            response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
+        )
        trainer.train()
        progress(1, desc="Training completed")
        time.sleep(1)
@@ -381,6 +435,6 @@ Continue the fibonnaci sequence.
 
     inference_button = gr.Button("Inference", visible=True, interactive=True)
     inference_button.click(inference, inputs=[data_template, input_text], outputs=[output_text, inference_button])
-    load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
+    load_btn.click(load_model, inputs=[initial_model_name, load_in_4bit, max_sequence_length, hub_token], outputs=[output, load_btn, train_btn, initial_model_name, load_in_4bit, max_sequence_length])
 
 demo.launch()