Spaces: wuhp · Running on Zero

wuhp committed · Commit 4fa9540 · verified · 1 parent: d93eea9

Update app.py

Files changed (1): app.py +41 -93
app.py CHANGED
@@ -11,7 +11,7 @@ from transformers import (
     Trainer,
     TrainingArguments,
     pipeline,
-    BitsAndBytesConfig, # for 4-bit config
+    BitsAndBytesConfig,
 )
 
 # PEFT (LoRA / QLoRA)
@@ -22,21 +22,34 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_tr
 ##############################################################################
 
 TEXT_PIPELINE = None
-COMPARISON_PIPELINE = None # We'll keep a separate pipeline for the DeepSeek model
-NUM_EXAMPLES = 50 # We'll train on 50 lines of WikiText-2 for demonstration
+COMPARISON_PIPELINE = None # pipeline for the comparison model, if desired
+NUM_EXAMPLES = 50 # We'll train on 50 lines (or rows) for demonstration
 
 @spaces.GPU(duration=600) # up to 10 min
 def finetune_small_subset():
     """
     1) Loads 'wuhp/myr1' in 4-bit quantization (QLoRA style),
     2) Adds LoRA adapters (trainable),
-    3) Trains on 50 lines of WikiText-2,
+    3) Trains on a small subset of Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B,
     4) Saves LoRA adapter to 'finetuned_myr1',
     5) Reloads LoRA adapters for inference in a pipeline.
     """
 
-    # --- 1) Load WikiText-2 subset ---
-    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
+    # --- 1) Load Magpie dataset ---
+    # You can load 'train' or 'validation' split depending on your preference
+    ds = load_dataset(
+        "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
+        split="train"
+    )
+
+    # EXAMPLE: Filter for a single conversation_id
+    # (Alternatively, just do ds.select(range(...)) for a small random subset.)
+    # We'll demonstrate filtering for the first conversation_id:
+    unique_ids = list(set(ds["conversation_id"]))
+    single_id = unique_ids[0]
+    ds = ds.filter(lambda x: x["conversation_id"] == single_id)
+
+    # After filtering, still pick just up to NUM_EXAMPLES
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
 
     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
@@ -83,9 +96,18 @@ def finetune_small_subset():
 
     # --- 4) Tokenize dataset ---
     def tokenize_fn(ex):
-        return tokenizer(ex["text"], truncation=True, max_length=512)
+        """
+        Example: combine instruction + response
+        into a single text. Adjust to your liking.
+        """
+        # For demonstration, let's do a short prompt style:
+        text = (
+            f"Instruction: {ex['instruction']}\n\n"
+            f"Response: {ex['response']}"
+        )
+        return tokenizer(text, truncation=True, max_length=512)
 
-    ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
+    ds = ds.map(tokenize_fn, batched=False, remove_columns=ds.column_names)
     ds.set_format("torch")
 
     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
@@ -128,7 +150,6 @@ def finetune_small_subset():
     )
     base_model_2 = prepare_model_for_kbit_training(base_model_2)
 
-    # Instead of load_adapter(...), we use PeftModel.from_pretrained
     lora_model_2 = PeftModel.from_pretrained(
         base_model_2,
         "finetuned_myr1",
@@ -137,7 +158,7 @@
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)
 
-    return "Finetuning complete (QLoRA + LoRA). Model loaded for inference."
+    return "Finetuning complete (QLoRA + LoRA on Magpie dataset). Model loaded for inference."
 
 def ensure_pipeline():
     """
@@ -165,37 +186,6 @@ def ensure_pipeline():
         TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE
 
-def ensure_comparison_pipeline():
-    """
-    Load the DeepSeek model pipeline if not already loaded.
-    Adjust config if you'd like to load in 4-bit, or just do standard fp16/bfloat16.
-    """
-    global COMPARISON_PIPELINE
-    if COMPARISON_PIPELINE is None:
-        # Example: standard load (no QLoRA).
-        # If you want 4-bit, you can set up BitsAndBytesConfig here similarly.
-        config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
-        tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
-
-        # If you want to use device_map="auto" for GPU usage:
-        # In many cases you might want to do:
-        # device_map="auto" or device_map=0 for single-GPU.
-        # For demonstration, let's keep it simple.
-        # If your environment supports accelerate, you can do device_map="auto".
-        model = AutoModelForCausalLM.from_pretrained(
-            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-            config=config,
-            device_map="auto"
-        )
-
-        COMPARISON_PIPELINE = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer
-        )
-
-    return COMPARISON_PIPELINE
-
 @spaces.GPU(duration=120) # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
@@ -212,45 +202,15 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]
 
-@spaces.GPU(duration=120)
-def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
-    """
-    Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
-    AND from the DeepSeek model.
-    Returns two strings.
-    """
-    # Ensure both pipelines are loaded:
-    local_pipe = ensure_pipeline()
-    comp_pipe = ensure_comparison_pipeline()
+# (Optional) If you want to compare with another model, define it here:
+# def ensure_comparison_pipeline():
+#     ...
 
-    local_out = local_pipe(
-        prompt,
-        temperature=float(temperature),
-        top_p=float(top_p),
-        min_new_tokens=int(min_new_tokens),
-        max_new_tokens=int(max_new_tokens),
-        do_sample=True
-    )
-    local_text = local_out[0]["generated_text"]
-
-    comp_out = comp_pipe(
-        prompt,
-        temperature=float(temperature),
-        top_p=float(top_p),
-        min_new_tokens=int(min_new_tokens),
-        max_new_tokens=int(max_new_tokens),
-        do_sample=True
-    )
-    comp_text = comp_out[0]["generated_text"]
-
-    return local_text, comp_text
-
-# Build Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1")
-    gr.Markdown("Finetune or skip to use the base model. Then compare results with the DeepSeek model.")
+    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1 (Magpie dataset subset)")
+    gr.Markdown("Finetune or skip to use the base model. Then generate text below.")
 
-    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on 50 lines of WikiText-2 (up to 10 min)")
+    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on small subset of Magpie dataset (up to 10 min)")
     status_box = gr.Textbox(label="Finetune Status")
    finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
 
@@ -259,11 +219,11 @@ with gr.Blocks() as demo:
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
     top_p = gr.Slider(0.0, 1.0, step=0.05, value=0.9, label="Top-p")
-    min_tokens = gr.Slider(260, 5000, value=260, step=10, label="Min New Tokens")
-    max_tokens = gr.Slider(260, 5000, value=500, step=50, label="Max New Tokens")
+    min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
+    max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")
 
-    output_box = gr.Textbox(label="myr1 Model Output", lines=12)
-    gen_btn = gr.Button("Generate with myr1")
+    output_box = gr.Textbox(label="Generated Text", lines=12)
+    gen_btn = gr.Button("Generate")
 
     gen_btn.click(
         fn=predict,
@@ -271,16 +231,4 @@ with gr.Blocks() as demo:
        outputs=output_box
    )
 
-    gr.Markdown("### Compare myr1 vs DeepSeek-R1-Distill-Llama-8B side-by-side")
-
-    compare_btn = gr.Button("Compare (Side-by-side)")
-    out_local = gr.Textbox(label="myr1 Output", lines=10)
-    out_deepseek = gr.Textbox(label="DeepSeek Output", lines=10)
-
-    compare_btn.click(
-        fn=compare_models,
-        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
-        outputs=[out_local, out_deepseek]
-    )
-
 demo.launch()
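The main change in this commit is swapping WikiText-2 for the Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B dataset and formatting each row as an Instruction/Response pair. Below is a minimal standalone sketch for sanity-checking that data path outside the Space; it assumes the dataset exposes `conversation_id`, `instruction`, and `response` columns (as the new `tokenize_fn` and filter expect) and uses the GPT-2 tokenizer purely as a stand-in for a length check.

```python
# Quick local sanity check of the new data path (not part of the Space itself).
# Assumes 'conversation_id', 'instruction', and 'response' columns exist,
# as the new tokenize_fn and filter in the diff expect.
from datasets import load_dataset
from transformers import AutoTokenizer

ds = load_dataset(
    "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
    split="train",
)
single_id = list(set(ds["conversation_id"]))[0]          # mirrors the diff's filter
subset = ds.filter(lambda x: x["conversation_id"] == single_id)
subset = subset.select(range(min(50, len(subset))))        # NUM_EXAMPLES = 50

tokenizer = AutoTokenizer.from_pretrained("gpt2")           # stand-in tokenizer for a length check
for ex in subset.select(range(min(3, len(subset)))):
    text = f"Instruction: {ex['instruction']}\n\nResponse: {ex['response']}"
    n_tokens = len(tokenizer(text, truncation=True, max_length=512)["input_ids"])
    print(f"{n_tokens} tokens (truncated at 512): {text[:80]}...")
```

Note that if the combined instruction plus CoT response regularly exceeds 512 tokens, the `max_length=512` truncation in `tokenize_fn` will clip the response.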
 
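The changed hunks reference but do not show the quantization and training setup (steps 2, 3, and 5 of the docstring, plus the `BitsAndBytesConfig`, `Trainer`, and `TrainingArguments` imports). The following is a minimal sketch of what that unchanged section typically looks like; the LoRA rank/alpha, target modules, training arguments, and `trust_remote_code` flag are illustrative assumptions, not values taken from the Space's app.py.

```python
# Hedged sketch of the QLoRA section the hunks reference
# ("--- 2) Setup 4-bit quantization with BitsAndBytes ---" onward).
# r / lora_alpha / target_modules / TrainingArguments values are assumptions.
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # the collator needs a pad token

base_model = AutoModelForCausalLM.from_pretrained(
    "wuhp/myr1",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,  # assumed; only needed if the repo ships custom code
)
base_model = prepare_model_for_kbit_training(base_model)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                                  # assumed rank
    lora_alpha=32,                         # assumed scaling
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],   # assumed; depends on the model's layer names
)
model = get_peft_model(base_model, lora_config)

# Tiny stand-in for the tokenized Magpie subset built in steps 1 and 4 of the diff.
texts = ["Instruction: say hi\n\nResponse: hi"]
train_ds = Dataset.from_dict(dict(tokenizer(texts, truncation=True, max_length=512)))

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="finetuned_myr1",
        num_train_epochs=1,
        per_device_train_batch_size=1,
        logging_steps=5,
    ),
    train_dataset=train_ds,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
trainer.train()
model.save_pretrained("finetuned_myr1")  # the adapter path the diff reloads with PeftModel.from_pretrained
```

The actual hyperparameters live in the untouched part of app.py; this commit only changes the dataset-facing code, the status message, and the Gradio labels and slider ranges.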