Spaces: wuhp / Running on Zero

wuhp committed · Commit d93eea9 · verified · 1 Parent(s): 1ce8e5a

Update app.py

Files changed (1):
  1. app.py +81 -5
app.py CHANGED
@@ -22,6 +22,7 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_tr
##############################################################################

TEXT_PIPELINE = None
+ COMPARISON_PIPELINE = None  # We'll keep a separate pipeline for the DeepSeek model
NUM_EXAMPLES = 50  # We'll train on 50 lines of WikiText-2 for demonstration

@spaces.GPU(duration=600)  # up to 10 min
@@ -70,7 +71,6 @@ def finetune_small_subset():
    base_model = prepare_model_for_kbit_training(base_model)

    # --- 3) Create LoRA config & wrap the base model in LoRA ---
-     # Adjust target_modules if your model uses different param names than "q_proj"/"v_proj".
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
@@ -139,7 +139,6 @@ def finetune_small_subset():

    return "Finetuning complete (QLoRA + LoRA). Model loaded for inference."

-
def ensure_pipeline():
    """
    If we haven't finetuned yet (TEXT_PIPELINE is None),
@@ -166,6 +165,37 @@ def ensure_pipeline():
        TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
    return TEXT_PIPELINE

+ def ensure_comparison_pipeline():
+     """
+     Load the DeepSeek model pipeline if not already loaded.
+     Adjust config if you'd like to load in 4-bit, or just do standard fp16/bfloat16.
+     """
+     global COMPARISON_PIPELINE
+     if COMPARISON_PIPELINE is None:
+         # Example: standard load (no QLoRA).
+         # If you want 4-bit, you can set up BitsAndBytesConfig here similarly.
+         config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+         tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+
+         # If you want to use device_map="auto" for GPU usage:
+         # In many cases you might want to do:
+         # device_map="auto" or device_map=0 for single-GPU.
+         # For demonstration, let's keep it simple.
+         # If your environment supports accelerate, you can do device_map="auto".
+         model = AutoModelForCausalLM.from_pretrained(
+             "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+             config=config,
+             device_map="auto"
+         )
+
+         COMPARISON_PIPELINE = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer
+         )
+
+     return COMPARISON_PIPELINE
+
@spaces.GPU(duration=120)  # up to 2 min for text generation
def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    """
@@ -182,15 +212,49 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    )
    return out[0]["generated_text"]

+ @spaces.GPU(duration=120)
+ def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
+     """
+     Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
+     AND from the DeepSeek model.
+     Returns two strings.
+     """
+     # Ensure both pipelines are loaded:
+     local_pipe = ensure_pipeline()
+     comp_pipe = ensure_comparison_pipeline()
+
+     local_out = local_pipe(
+         prompt,
+         temperature=float(temperature),
+         top_p=float(top_p),
+         min_new_tokens=int(min_new_tokens),
+         max_new_tokens=int(max_new_tokens),
+         do_sample=True
+     )
+     local_text = local_out[0]["generated_text"]
+
+     comp_out = comp_pipe(
+         prompt,
+         temperature=float(temperature),
+         top_p=float(top_p),
+         min_new_tokens=int(min_new_tokens),
+         max_new_tokens=int(max_new_tokens),
+         do_sample=True
+     )
+     comp_text = comp_out[0]["generated_text"]
+
+     return local_text, comp_text
+
# Build Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1")
+     gr.Markdown("Finetune or skip to use the base model. Then compare results with the DeepSeek model.")

    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on 50 lines of WikiText-2 (up to 10 min)")
    status_box = gr.Textbox(label="Finetune Status")
    finetune_btn.click(fn=finetune_small_subset, outputs=status_box)

-     gr.Markdown("Then generate text below (or skip finetuning to see base model).")
+     gr.Markdown("### Generate with myr1 (fine-tuned if done above, else base)")

    prompt_in = gr.Textbox(lines=3, label="Prompt")
    temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
@@ -198,8 +262,8 @@ with gr.Blocks() as demo:
    min_tokens = gr.Slider(260, 5000, value=260, step=10, label="Min New Tokens")
    max_tokens = gr.Slider(260, 5000, value=500, step=50, label="Max New Tokens")

-     output_box = gr.Textbox(label="Generated Text", lines=12)
-     gen_btn = gr.Button("Generate")
+     output_box = gr.Textbox(label="myr1 Model Output", lines=12)
+     gen_btn = gr.Button("Generate with myr1")

    gen_btn.click(
        fn=predict,
@@ -207,4 +271,16 @@ with gr.Blocks() as demo:
        outputs=output_box
    )

+     gr.Markdown("### Compare myr1 vs DeepSeek-R1-Distill-Llama-8B side-by-side")
+
+     compare_btn = gr.Button("Compare (Side-by-side)")
+     out_local = gr.Textbox(label="myr1 Output", lines=10)
+     out_deepseek = gr.Textbox(label="DeepSeek Output", lines=10)
+
+     compare_btn.click(
+         fn=compare_models,
+         inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
+         outputs=[out_local, out_deepseek]
+     )
+
demo.launch()
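
Note on the comment removed in the second hunk: it advised adjusting target_modules when a model's attention projections are not named "q_proj"/"v_proj". A minimal sketch of how that argument is passed to LoraConfig; r=16 and lora_alpha=32 come from this commit, while target_modules, lora_dropout, bias, and task_type are illustrative assumptions rather than values taken from app.py:

from peft import LoraConfig, TaskType

# Sketch only: target_modules must match the attention projection names of the
# model being finetuned ("q_proj"/"v_proj" here is an assumption, not from the file).
lora_config = LoraConfig(
    r=16,                                 # rank used in this commit
    lora_alpha=32,                        # scaling used in this commit
    target_modules=["q_proj", "v_proj"],  # adjust to the model's parameter names
    lora_dropout=0.05,                    # illustrative value
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

Architectures with fused or differently named projections (for example, query_key_value in some models) need the list updated accordingly.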
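
The new ensure_comparison_pipeline() loads DeepSeek-R1-Distill-Llama-8B in full precision and only hints that a 4-bit load is possible. A minimal sketch of that alternative using transformers' BitsAndBytesConfig; the NF4 quantization type, double quantization, and bfloat16 compute dtype are common QLoRA-style defaults chosen for illustration, not settings taken from the commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Sketch only: quantize the comparison model to 4-bit at load time.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # assumed; NF4 is the usual QLoRA choice
    bnb_4bit_use_double_quant=True,         # assumed
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumed compute dtype
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",                      # requires accelerate
)
comparison_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Loading the 8B comparison model in 4-bit roughly quarters its weight memory versus fp16, which can matter when both pipelines share a single ZeroGPU allocation.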