Update app.py
app.py
CHANGED
@@ -11,7 +11,7 @@ from transformers import (
     Trainer,
     TrainingArguments,
     pipeline,
-    BitsAndBytesConfig,
+    BitsAndBytesConfig,
 )

 # PEFT (LoRA / QLoRA)
@@ -22,21 +22,34 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_tr
 ##############################################################################

 TEXT_PIPELINE = None
-COMPARISON_PIPELINE = None #
-NUM_EXAMPLES = 50  # We'll train on 50 lines
+COMPARISON_PIPELINE = None  # pipeline for the comparison model, if desired
+NUM_EXAMPLES = 50  # We'll train on 50 lines (or rows) for demonstration

 @spaces.GPU(duration=600)  # up to 10 min
 def finetune_small_subset():
     """
     1) Loads 'wuhp/myr1' in 4-bit quantization (QLoRA style),
     2) Adds LoRA adapters (trainable),
-    3) Trains on
+    3) Trains on a small subset of Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B,
     4) Saves LoRA adapter to 'finetuned_myr1',
     5) Reloads LoRA adapters for inference in a pipeline.
     """

-    # --- 1) Load
-
+    # --- 1) Load Magpie dataset ---
+    # You can load 'train' or 'validation' split depending on your preference
+    ds = load_dataset(
+        "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
+        split="train"
+    )
+
+    # EXAMPLE: Filter for a single conversation_id
+    # (Alternatively, just do ds.select(range(...)) for a small random subset.)
+    # We'll demonstrate filtering for the first conversation_id:
+    unique_ids = list(set(ds["conversation_id"]))
+    single_id = unique_ids[0]
+    ds = ds.filter(lambda x: x["conversation_id"] == single_id)
+
+    # After filtering, still pick just up to NUM_EXAMPLES
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
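Side note on the dataset hunk above: a minimal, standalone sketch of the same subset selection, handy for sanity-checking the data before a training run. It assumes the Hugging Face datasets package and takes the column names conversation_id, instruction and response from the code above; the print statements are illustrative only.

from datasets import load_dataset

ds = load_dataset(
    "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
    split="train",
)
# Restrict to one conversation, then cap at 50 rows, mirroring finetune_small_subset().
unique_ids = list(set(ds["conversation_id"]))
ds = ds.filter(lambda x: x["conversation_id"] == unique_ids[0])
ds = ds.select(range(min(50, len(ds))))

print(len(ds), ds.column_names)      # how many rows actually survive the filter
print(ds[0]["instruction"][:80])     # peek at the first training example

Note that ds["conversation_id"] materializes the whole column, so building unique_ids scans the full ~250K-row split; fine for a one-off demo, but worth caching if the filter is re-run.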
@@ -83,9 +96,18 @@ def finetune_small_subset():

     # --- 4) Tokenize dataset ---
     def tokenize_fn(ex):
-
+        """
+        Example: combine instruction + response
+        into a single text. Adjust to your liking.
+        """
+        # For demonstration, let's do a short prompt style:
+        text = (
+            f"Instruction: {ex['instruction']}\n\n"
+            f"Response: {ex['response']}"
+        )
+        return tokenizer(text, truncation=True, max_length=512)

-    ds = ds.map(tokenize_fn, batched=
+    ds = ds.map(tokenize_fn, batched=False, remove_columns=ds.column_names)
     ds.set_format("torch")

     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
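A quick sketch of why tokenize_fn above does not create labels itself: with mlm=False, DataCollatorForLanguageModeling copies input_ids into labels at batch time, masking padded positions with -100 so they are ignored by the causal-LM loss. The gpt2 tokenizer here is only a stand-in for the app's wuhp/myr1 tokenizer.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # gpt2 ships without a pad token

collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
features = [
    tok("Instruction: add 2 and 2\n\nResponse: 4"),
    tok("Instruction: say hi\n\nResponse: hi"),
]
batch = collator(features)

# labels mirror input_ids, with padded positions set to -100.
print(batch["input_ids"].shape, batch["labels"].shape)
print(batch["labels"][1])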
@@ -128,7 +150,6 @@ def finetune_small_subset():
     )
     base_model_2 = prepare_model_for_kbit_training(base_model_2)

-    # Instead of load_adapter(...), we use PeftModel.from_pretrained
     lora_model_2 = PeftModel.from_pretrained(
         base_model_2,
         "finetuned_myr1",
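For context on the PeftModel.from_pretrained call above: a minimal sketch of reloading the adapter saved in 'finetuned_myr1' for standalone inference. The 4-bit settings here are assumptions chosen to mirror the QLoRA setup elsewhere in this file, not necessarily the exact values used there.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
base = AutoModelForCausalLM.from_pretrained(
    "wuhp/myr1",
    quantization_config=bnb,
    device_map="auto",
)
tok = AutoTokenizer.from_pretrained("wuhp/myr1")

model = PeftModel.from_pretrained(base, "finetuned_myr1")  # attach the saved LoRA adapter
gen = pipeline("text-generation", model=model, tokenizer=tok)
print(gen("Instruction: say hi\n\nResponse:", max_new_tokens=30)[0]["generated_text"])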
@@ -137,7 +158,7 @@ def finetune_small_subset():
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)

-    return "Finetuning complete (QLoRA + LoRA). Model loaded for inference."
+    return "Finetuning complete (QLoRA + LoRA on Magpie dataset). Model loaded for inference."

 def ensure_pipeline():
     """
@@ -165,37 +186,6 @@ def ensure_pipeline():
     TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE

-def ensure_comparison_pipeline():
-    """
-    Load the DeepSeek model pipeline if not already loaded.
-    Adjust config if you'd like to load in 4-bit, or just do standard fp16/bfloat16.
-    """
-    global COMPARISON_PIPELINE
-    if COMPARISON_PIPELINE is None:
-        # Example: standard load (no QLoRA).
-        # If you want 4-bit, you can set up BitsAndBytesConfig here similarly.
-        config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
-        tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
-
-        # If you want to use device_map="auto" for GPU usage:
-        # In many cases you might want to do:
-        # device_map="auto" or device_map=0 for single-GPU.
-        # For demonstration, let's keep it simple.
-        # If your environment supports accelerate, you can do device_map="auto".
-        model = AutoModelForCausalLM.from_pretrained(
-            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-            config=config,
-            device_map="auto"
-        )
-
-        COMPARISON_PIPELINE = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer
-        )
-
-    return COMPARISON_PIPELINE
-
 @spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
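The ensure_pipeline() referenced above (and the removed ensure_comparison_pipeline()) both follow the same lazy-singleton pattern: keep one module-level pipeline and build it only on first use, so repeated Gradio callbacks don't reload the model. A minimal sketch of that pattern, with gpt2 as a stand-in model:

from transformers import pipeline

TEXT_PIPELINE = None

def ensure_pipeline():
    """Build the text-generation pipeline once, then reuse it."""
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
        TEXT_PIPELINE = pipeline("text-generation", model="gpt2")
    return TEXT_PIPELINE

pipe = ensure_pipeline()   # first call loads the model
pipe = ensure_pipeline()   # second call reuses the cached pipeline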
@@ -212,45 +202,15 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]

-
-def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
-    """
-    Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
-    AND from the DeepSeek model.
-    Returns two strings.
-    """
-    # Ensure both pipelines are loaded:
-    local_pipe = ensure_pipeline()
-    comp_pipe = ensure_comparison_pipeline()
-
-    local_out = local_pipe(
-        prompt,
-        temperature=float(temperature),
-        top_p=float(top_p),
-        min_new_tokens=int(min_new_tokens),
-        max_new_tokens=int(max_new_tokens),
-        do_sample=True
-    )
-    local_text = local_out[0]["generated_text"]
-
-    comp_out = comp_pipe(
-        prompt,
-        temperature=float(temperature),
-        top_p=float(top_p),
-        min_new_tokens=int(min_new_tokens),
-        max_new_tokens=int(max_new_tokens),
-        do_sample=True
-    )
-    comp_text = comp_out[0]["generated_text"]
-
-    return local_text, comp_text
-
-# Build Gradio UI
+# (Optional) If you want to compare with another model, define it here:
+# def ensure_comparison_pipeline():
+#     ...
+
 with gr.Blocks() as demo:
-    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1")
-    gr.Markdown("Finetune or skip to use the base model. Then
+    gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1 (Magpie dataset subset)")
+    gr.Markdown("Finetune or skip to use the base model. Then generate text below.")

-    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on
+    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on small subset of Magpie dataset (up to 10 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)

@@ -259,11 +219,11 @@ with gr.Blocks() as demo:
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
     top_p = gr.Slider(0.0, 1.0, step=0.05, value=0.9, label="Top-p")
-    min_tokens = gr.Slider(
-    max_tokens = gr.Slider(
+    min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
+    max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")

-    output_box = gr.Textbox(label="
-    gen_btn = gr.Button("Generate
+    output_box = gr.Textbox(label="Generated Text", lines=12)
+    gen_btn = gr.Button("Generate")

     gen_btn.click(
         fn=predict,
@@ -271,16 +231,4 @@ with gr.Blocks() as demo:
         outputs=output_box
     )

-    gr.Markdown("### Compare myr1 vs DeepSeek-R1-Distill-Llama-8B side-by-side")
-
-    compare_btn = gr.Button("Compare (Side-by-side)")
-    out_local = gr.Textbox(label="myr1 Output", lines=10)
-    out_deepseek = gr.Textbox(label="DeepSeek Output", lines=10)
-
-    compare_btn.click(
-        fn=compare_models,
-        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
-        outputs=[out_local, out_deepseek]
-    )
-
 demo.launch()
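One point worth spelling out about the Blocks wiring above: gr.Button.click passes the current values of the components listed in inputs to the callback positionally, so the order of [prompt_in, temperature, top_p, min_tokens, max_tokens] has to match predict's signature. A minimal sketch of that contract with a stand-in callback:

import gradio as gr

def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    # Stand-in for the real predict(); just echoes what it received.
    return f"{prompt!r} | T={temperature} top_p={top_p} tokens={min_new_tokens}-{max_new_tokens}"

with gr.Blocks() as demo:
    prompt_in = gr.Textbox(lines=3, label="Prompt")
    temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
    top_p = gr.Slider(0.0, 1.0, step=0.05, value=0.9, label="Top-p")
    min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
    max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")
    output_box = gr.Textbox(label="Generated Text", lines=12)
    gr.Button("Generate").click(
        fn=predict,
        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
        outputs=output_box,
    )

demo.launch()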