wuhp committed · verified
Commit b5aeb95 · 1 Parent(s): 24eb33c

Update app.py

Files changed (1): app.py (+94, -30)
app.py CHANGED
@@ -21,41 +21,39 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 # ZeroGPU + QLoRA Example
 ##############################################################################
 
-TEXT_PIPELINE = None
-COMPARISON_PIPELINE = None  # pipeline for the comparison model, if desired
-NUM_EXAMPLES = 50  # We'll train on 50 lines (or rows) for demonstration
+TEXT_PIPELINE = None        # Pipeline for wuhp/myr1 (fine-tuned or base)
+COMPARISON_PIPELINE = None  # Pipeline for the DeepSeek model
 
-@spaces.GPU(duration=300)  # up to 10 min
+NUM_EXAMPLES = 50  # We'll train on 50 rows for demonstration
+
+@spaces.GPU(duration=300)  # up to 5 min
 def finetune_small_subset():
     """
     1) Loads 'wuhp/myr1' in 4-bit quantization (QLoRA style),
     2) Adds LoRA adapters (trainable),
-    3) Trains on a small subset of Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B,
+    3) Trains on a small subset of the Magpie dataset,
     4) Saves LoRA adapter to 'finetuned_myr1',
     5) Reloads LoRA adapters for inference in a pipeline.
     """
 
-    # --- 1) Load Magpie dataset ---
-    # You can load 'train' or 'validation' split depending on your preference
+    # --- 1) Load a small subset of the Magpie dataset ---
     ds = load_dataset(
         "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
         split="train"
     )
 
-    # EXAMPLE: Filter for a single conversation_id
-    # (Alternatively, just do ds.select(range(...)) for a small random subset.)
-    # We'll demonstrate filtering for the first conversation_id:
+    # For demonstration, pick a single conversation_id
     unique_ids = list(set(ds["conversation_id"]))
     single_id = unique_ids[0]
     ds = ds.filter(lambda x: x["conversation_id"] == single_id)
 
-    # After filtering, still pick just up to NUM_EXAMPLES
+    # Then select only NUM_EXAMPLES from that subset
    ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
 
     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16 if you prefer
+        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
     )
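Note: the updated comment on bnb_4bit_compute_dtype points out torch.float16 as an alternative. A minimal sketch (not part of this commit) of picking the compute dtype from hardware support, assuming a CUDA device is available:

```python
import torch
from transformers import BitsAndBytesConfig

# Hypothetical helper, not in app.py: prefer bfloat16 when the GPU supports it,
# otherwise fall back to float16 (e.g. on pre-Ampere cards).
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
```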
@@ -75,12 +73,12 @@ def finetune_small_subset():
         "wuhp/myr1",
         subfolder="myr1",
         config=config,
-        quantization_config=bnb_config,  # <--- QLoRA 4-bit
+        quantization_config=bnb_config,  # <--- QLoRA 4-bit
         device_map="auto",
         trust_remote_code=True
     )
 
-    # Prepare the model for k-bit training (QLoRA)
+    # Prepare the model for k-bit training
     base_model = prepare_model_for_kbit_training(base_model)
 
     # --- 3) Create LoRA config & wrap the base model in LoRA ---
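Note: prepare_model_for_kbit_training freezes the quantized weights and casts a few layers (norms, embeddings) to higher precision so the LoRA adapters train stably. A hedged sketch of the common variant that also enables gradient checkpointing (the commit keeps the default call; base_model is the model loaded above):

```python
from peft import prepare_model_for_kbit_training

# Sketch only: gradient checkpointing trades compute for memory, which helps
# when training adapters on a small GPU slice such as ZeroGPU.
base_model = prepare_model_for_kbit_training(
    base_model,
    use_gradient_checkpointing=True,  # optional peft flag
)
base_model.config.use_cache = False   # KV caching conflicts with checkpointing
```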
@@ -97,10 +95,9 @@ def finetune_small_subset():
     # --- 4) Tokenize dataset ---
     def tokenize_fn(ex):
         """
-        Example: combine instruction + response
-        into a single text. Adjust to your liking.
+        Combine instruction + response into a single text.
+        You can adjust this to include more fields or different formatting.
         """
-        # For demonstration, let's do a short prompt style:
         text = (
             f"Instruction: {ex['instruction']}\n\n"
             f"Response: {ex['response']}"
@@ -119,9 +116,9 @@ def finetune_small_subset():
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
         logging_steps=5,
-        save_steps=999999,
+        save_steps=999999,  # effectively don't save mid-epoch
         save_total_limit=1,
-        fp16=False,  # rely on bfloat16 from quantization
+        fp16=False,  # rely on bfloat16 from quantization
     )
 
     # Trainer
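Note: fp16=False means the Trainer does not drive mixed precision itself; the bfloat16 compute dtype comes from the 4-bit quantization config. If the Trainer should run in bf16 as well, a hedged variant of these arguments could look like the following (output_dir and epoch count are illustrative, not taken from the diff):

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="finetuned_myr1",   # assumption: matches the adapter save path
    num_train_epochs=1,            # illustrative
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    logging_steps=5,
    save_steps=999999,             # effectively don't save mid-epoch
    save_total_limit=1,
    fp16=False,
    bf16=True,                     # let the Trainer use bfloat16 as well
)
```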
@@ -158,7 +155,8 @@ def finetune_small_subset():
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)
 
-    return "Finetuning complete (QLoRA + LoRA on Magpie dataset). Model loaded for inference."
+    return "Finetuning complete. Model loaded for inference."
+
 
 def ensure_pipeline():
     """
@@ -186,10 +184,34 @@ def ensure_pipeline():
         TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE
 
+
+def ensure_comparison_pipeline():
+    """
+    Load the DeepSeek model pipeline if not already loaded.
+    """
+    global COMPARISON_PIPELINE
+    if COMPARISON_PIPELINE is None:
+        # If you prefer 4-bit, you can define BitsAndBytesConfig here,
+        # but let's keep it simpler for demonstration (fp16 or bf16).
+        config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        model = AutoModelForCausalLM.from_pretrained(
+            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            config=config,
+            device_map="auto"
+        )
+        COMPARISON_PIPELINE = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer
+        )
+    return COMPARISON_PIPELINE
+
+
 @spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
-    Generates text from the finetuned (LoRA) model if present, else the base model.
+    Generates text from the fine-tuned (LoRA) model if present, else the base model.
     """
     pipe = ensure_pipeline()
     out = pipe(
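Note: the new helper loads DeepSeek-R1-Distill-Llama-8B in full or half precision; its inline comment mentions a 4-bit option. A hedged sketch of that variant, mirroring the quantization settings used for wuhp/myr1 (not part of this commit):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

comp_bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    quantization_config=comp_bnb,  # 4-bit instead of fp16/bf16
    device_map="auto",
)
COMPARISON_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
```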
@@ -202,19 +224,49 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]
 
-# (Optional) If you want to compare with another model, define it here:
-# def ensure_comparison_pipeline():
-#     ...
 
+@spaces.GPU(duration=120)  # up to 2 min for text generation
+def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
+    """
+    Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
+    AND from the DeepSeek model. Returns two strings.
+    """
+    local_pipe = ensure_pipeline()
+    comp_pipe = ensure_comparison_pipeline()
+
+    local_out = local_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    local_text = local_out[0]["generated_text"]
+
+    comp_out = comp_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    comp_text = comp_out[0]["generated_text"]
+
+    return local_text, comp_text
+
+
+# Build Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1 (Magpie dataset subset)")
-    gr.Markdown("Finetune or skip to use the base model. Then generate text below.")
+    gr.Markdown("# QLoRA Fine-tuning & Comparison Demo")
+    gr.Markdown("**Fine-tune wuhp/myr1** on a small subset of the Magpie dataset, then generate or compare output with the DeepSeek model.")
 
-    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on small subset of Magpie dataset (up to 10 min)")
+    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on Magpie subset (up to 5 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
 
-    gr.Markdown("### Generate with myr1 (fine-tuned if done above, else base)")
+    gr.Markdown("### Generate with myr1 (fine-tuned if done, else base)")
 
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
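Note: compare_models passes the same sampling arguments to both pipelines. A small refactor (not in the commit) that collects them once keeps the two calls in sync:

```python
gen_kwargs = dict(
    temperature=float(temperature),
    top_p=float(top_p),
    min_new_tokens=int(min_new_tokens),
    max_new_tokens=int(max_new_tokens),
    do_sample=True,
)
local_text = local_pipe(prompt, **gen_kwargs)[0]["generated_text"]
comp_text = comp_pipe(prompt, **gen_kwargs)[0]["generated_text"]
```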
@@ -222,8 +274,8 @@ with gr.Blocks() as demo:
     min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
     max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")
 
-    output_box = gr.Textbox(label="Generated Text", lines=12)
-    gen_btn = gr.Button("Generate")
+    output_box = gr.Textbox(label="myr1 Output", lines=8)
+    gen_btn = gr.Button("Generate with myr1")
 
     gen_btn.click(
         fn=predict,
@@ -231,4 +283,16 @@ with gr.Blocks() as demo:
         outputs=output_box
     )
 
+    gr.Markdown("### Compare myr1 vs DeepSeek side-by-side")
+
+    compare_btn = gr.Button("Compare")
+    out_local = gr.Textbox(label="myr1 Output", lines=8)
+    out_deepseek = gr.Textbox(label="DeepSeek Output", lines=8)
+
+    compare_btn.click(
+        fn=compare_models,
+        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
+        outputs=[out_local, out_deepseek]
+    )
+
 demo.launch()
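Note: for a quick check of the new comparison path without launching the UI, the functions can be called directly; a sketch (prompt and sampling values are illustrative):

```python
if __name__ == "__main__":
    local_text, deepseek_text = compare_models(
        prompt="Explain LoRA in two sentences.",
        temperature=0.7,
        top_p=0.9,
        min_new_tokens=50,
        max_new_tokens=200,
    )
    print("myr1:", local_text)
    print("DeepSeek:", deepseek_text)
```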
 