Update app.py
app.py
CHANGED
@@ -22,6 +22,7 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_tr
 ##############################################################################
 
 TEXT_PIPELINE = None
+COMPARISON_PIPELINE = None  # We'll keep a separate pipeline for the DeepSeek model
 NUM_EXAMPLES = 50  # We'll train on 50 lines of WikiText-2 for demonstration
 
 @spaces.GPU(duration=600)  # up to 10 min
@@ -70,7 +71,6 @@ def finetune_small_subset():
     base_model = prepare_model_for_kbit_training(base_model)
 
     # --- 3) Create LoRA config & wrap the base model in LoRA ---
-    # Adjust target_modules if your model uses different param names than "q_proj"/"v_proj".
     lora_config = LoraConfig(
         r=16,
         lora_alpha=32,
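The removed comment above pointed at target_modules; for context, here is a minimal sketch of how the full LoraConfig in this function might look, assuming the model uses the standard "q_proj"/"v_proj" projection names. Only r=16 and lora_alpha=32 come from the diff; target_modules, lora_dropout, bias, and task_type are illustrative assumptions, not values from the repo.

from peft import LoraConfig, TaskType, get_peft_model

# Sketch only: r and lora_alpha match the diff; the remaining fields are
# assumed defaults and must match the model's actual attention module names.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # assumption: adjust to the model's parameter names
    lora_dropout=0.05,                    # assumption
    bias="none",                          # assumption
    task_type=TaskType.CAUSAL_LM,
)
# base_model = get_peft_model(base_model, lora_config)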
@@ -139,7 +139,6 @@ def finetune_small_subset():
 
     return "Finetuning complete (QLoRA + LoRA). Model loaded for inference."
 
-
 def ensure_pipeline():
     """
     If we haven't finetuned yet (TEXT_PIPELINE is None),
@@ -166,6 +165,37 @@ def ensure_pipeline():
         TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE
 
+def ensure_comparison_pipeline():
+    """
+    Load the DeepSeek model pipeline if not already loaded.
+    Adjust config if you'd like to load in 4-bit, or just do standard fp16/bfloat16.
+    """
+    global COMPARISON_PIPELINE
+    if COMPARISON_PIPELINE is None:
+        # Example: standard load (no QLoRA).
+        # If you want 4-bit, you can set up BitsAndBytesConfig here similarly.
+        config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+
+        # If you want to use device_map="auto" for GPU usage:
+        # In many cases you might want to do:
+        # device_map="auto" or device_map=0 for single-GPU.
+        # For demonstration, let's keep it simple.
+        # If your environment supports accelerate, you can do device_map="auto".
+        model = AutoModelForCausalLM.from_pretrained(
+            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            config=config,
+            device_map="auto"
+        )
+
+        COMPARISON_PIPELINE = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer
+        )
+
+    return COMPARISON_PIPELINE
+
 @spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
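The new ensure_comparison_pipeline() loads DeepSeek-R1-Distill-Llama-8B in full precision; the in-code comment notes that a 4-bit load is also possible. A minimal sketch of that variant, assuming a bitsandbytes-capable GPU; the NF4/bfloat16 quantization settings are common QLoRA-style defaults, not values taken from this commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Assumed quantization settings; only the model id comes from the diff.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    quantization_config=bnb_config,
    device_map="auto",
)
comparison_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)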
@@ -182,15 +212,49 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]
 
+@spaces.GPU(duration=120)
+def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
+    """
+    Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
+    AND from the DeepSeek model.
+    Returns two strings.
+    """
+    # Ensure both pipelines are loaded:
+    local_pipe = ensure_pipeline()
+    comp_pipe = ensure_comparison_pipeline()
+
+    local_out = local_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    local_text = local_out[0]["generated_text"]
+
+    comp_out = comp_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    comp_text = comp_out[0]["generated_text"]
+
+    return local_text, comp_text
+
 # Build Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1")
+    gr.Markdown("Finetune or skip to use the base model. Then compare results with the DeepSeek model.")
 
     finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on 50 lines of WikiText-2 (up to 10 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
 
-    gr.Markdown("
+    gr.Markdown("### Generate with myr1 (fine-tuned if done above, else base)")
 
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
@@ -198,8 +262,8 @@ with gr.Blocks() as demo:
     min_tokens = gr.Slider(260, 5000, value=260, step=10, label="Min New Tokens")
     max_tokens = gr.Slider(260, 5000, value=500, step=50, label="Max New Tokens")
 
-    output_box = gr.Textbox(label="
-    gen_btn = gr.Button("Generate")
+    output_box = gr.Textbox(label="myr1 Model Output", lines=12)
+    gen_btn = gr.Button("Generate with myr1")
 
     gen_btn.click(
         fn=predict,
@@ -207,4 +271,16 @@ with gr.Blocks() as demo:
         outputs=output_box
     )
 
+    gr.Markdown("### Compare myr1 vs DeepSeek-R1-Distill-Llama-8B side-by-side")
+
+    compare_btn = gr.Button("Compare (Side-by-side)")
+    out_local = gr.Textbox(label="myr1 Output", lines=10)
+    out_deepseek = gr.Textbox(label="DeepSeek Output", lines=10)
+
+    compare_btn.click(
+        fn=compare_models,
+        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
+        outputs=[out_local, out_deepseek]
+    )
+
 demo.launch()
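For a quick sanity check outside the Gradio UI, the new comparison path can be called directly. A hypothetical invocation using the slider defaults visible in the diff; the prompt and the top_p value are made up, since the top_p slider default is not shown in these hunks.

local_text, deepseek_text = compare_models(
    "Explain what QLoRA adds on top of standard LoRA finetuning.",
    temperature=0.7,     # slider default from the diff
    top_p=0.9,           # assumption: not shown in the diff
    min_new_tokens=260,  # slider default from the diff
    max_new_tokens=500,  # slider default from the diff
)
print(local_text)
print(deepseek_text)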