Update app.py
app.py
CHANGED
@@ -15,12 +15,12 @@ from transformers import (
 )

 # PEFT (LoRA / QLoRA)
-from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
-
+from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training, PeftModel

 ##############################################################################
 # ZeroGPU + QLoRA Example
 ##############################################################################
+
 TEXT_PIPELINE = None
 NUM_EXAMPLES = 50 # We'll train on 50 lines of WikiText-2 for demonstration

@@ -38,16 +38,12 @@ def finetune_small_subset():
     ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

-    # We'll define tokenize_fn after we have the tokenizer
-
     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
-    # This is QLoRA approach: we load the base model in 4-bit
-    # and attach LoRA adapters for training.
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16, # or torch.float16 if
+        bnb_4bit_compute_dtype=torch.bfloat16, # or torch.float16 if you prefer
         bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4",
+        bnb_4bit_quant_type="nf4",
     )

     config = AutoConfig.from_pretrained(
@@ -61,7 +57,6 @@ def finetune_small_subset():
         trust_remote_code=True
     )

-    # Load model in 4-bit
     base_model = AutoModelForCausalLM.from_pretrained(
         "wuhp/myr1",
         subfolder="myr1",
@@ -72,18 +67,16 @@ def finetune_small_subset():
     )

     # Prepare the model for k-bit training (QLoRA)
-    # This step disables dropout on some layers, sets up gradients for LN, etc.
     base_model = prepare_model_for_kbit_training(base_model)

-    # --- 3) Create LoRA config & wrap the base model in LoRA
-    #
-    # adjust target_modules accordingly (maybe "c_attn", "W_pack", "query_key_value", etc.)
+    # --- 3) Create LoRA config & wrap the base model in LoRA ---
+    # Adjust target_modules if your model uses different param names than "q_proj"/"v_proj".
     lora_config = LoraConfig(
         r=16,
         lora_alpha=32,
         lora_dropout=0.05,
         bias="none",
-        target_modules=["q_proj", "v_proj"],
+        target_modules=["q_proj", "v_proj"],
         task_type=TaskType.CAUSAL_LM,
     )
     lora_model = get_peft_model(base_model, lora_config)
@@ -95,7 +88,6 @@ def finetune_small_subset():
     ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
     ds.set_format("torch")

-    # Data collator
     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

     # Training args
@@ -107,7 +99,7 @@ def finetune_small_subset():
         logging_steps=5,
         save_steps=999999,
         save_total_limit=1,
-        fp16=False, #
+        fp16=False, # rely on bfloat16 from quantization
     )

     # Trainer
@@ -121,13 +113,11 @@ def finetune_small_subset():
     # --- 5) Train ---
     trainer.train()

-    # Save LoRA adapter + tokenizer
-    # The 'save_model' would save only the LoRA adapter if using PEFT
+    # --- 6) Save LoRA adapter + tokenizer ---
     trainer.model.save_pretrained("finetuned_myr1")
     tokenizer.save_pretrained("finetuned_myr1")

-    # ---
-    # We'll do the same approach, then load adapter from 'finetuned_myr1'
+    # --- 7) Reload the base model + LoRA adapter for inference
     base_model_2 = AutoModelForCausalLM.from_pretrained(
         "wuhp/myr1",
         subfolder="myr1",
@@ -138,17 +128,12 @@ def finetune_small_subset():
     )
     base_model_2 = prepare_model_for_kbit_training(base_model_2)

-    #
-
-
-
-
-
-    # But we can reuse 'get_peft_model' + load the LoRA weights
-    lora_model_2 = get_peft_model(base_model_2, lora_config)
-    lora_model_2.load_adapter("finetuned_myr1")
+    # Instead of load_adapter(...), we use PeftModel.from_pretrained
+    lora_model_2 = PeftModel.from_pretrained(
+        base_model_2,
+        "finetuned_myr1",
+    )

-    # Create pipeline
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)

@@ -162,7 +147,6 @@ def ensure_pipeline():
     """
     global TEXT_PIPELINE
     if TEXT_PIPELINE is None:
-        # Just load base model in 4-bit
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_compute_dtype=torch.bfloat16,
@@ -182,7 +166,6 @@ def ensure_pipeline():
         TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE

-
 @spaces.GPU(duration=120) # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
@@ -199,7 +182,6 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]

-
 # Build Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## ZeroGPU QLoRA Example for wuhp/myr1")
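The updated LoRA comment says to adjust target_modules if the model does not use "q_proj"/"v_proj". One rough way to find candidate names is to inspect the Linear layers of the loaded base model. The helper below is a hypothetical sketch, not part of app.py; it assumes bitsandbytes' 4-bit linear layers still subclass torch.nn.Linear, which recent versions do.

import torch.nn as nn

def candidate_lora_targets(model):
    # Collect the leaf names of every Linear layer, e.g. "q_proj", "k_proj",
    # "v_proj", or "query_key_value", so a suitable target_modules list can be chosen.
    names = set()
    for full_name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            names.add(full_name.split(".")[-1])
    return sorted(names)

# e.g. print(candidate_lora_targets(base_model)) right after loading the 4-bit model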
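The dataset lines map a tokenize_fn that is defined elsewhere in app.py and does not appear in this diff. For orientation only, here is a minimal sketch of what such a function typically looks like for causal-LM training; it assumes the tokenizer variable from the surrounding function, and the max_length, padding choice, and pad-token handling are assumptions rather than the Space's actual settings.

def tokenize_fn(examples):
    # Some causal-LM tokenizers have no pad token; reuse EOS so batching works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Tokenize each WikiText line; DataCollatorForLanguageModeling(mlm=False)
    # later builds the shifted labels for causal-LM training.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )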
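The training-arguments hunk keeps fp16=False and, per the new comment, relies on the bfloat16 compute dtype set in BitsAndBytesConfig. On GPUs with bfloat16 support, an explicit bf16=True is a common alternative; the snippet below only sketches that option and is not what app.py does (values other than those visible in the hunk are placeholders).

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="finetune_results",     # placeholder output path
    num_train_epochs=1,                # placeholder
    per_device_train_batch_size=1,     # placeholder
    logging_steps=5,
    save_steps=999999,
    save_total_limit=1,
    fp16=False,
    bf16=True,  # explicit bfloat16 autocast instead of relying only on the compute dtype
)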
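The reload hunk replaces get_peft_model(...) plus load_adapter(...) with PeftModel.from_pretrained, which is PEFT's supported way to rebuild a trained adapter on top of a freshly loaded base model. A condensed sketch of that pattern follows; the 4-bit quantization arguments are omitted for brevity, and merge_and_unload is an optional extra that app.py does not perform.

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
tok = AutoTokenizer.from_pretrained("finetuned_myr1")  # tokenizer saved alongside the adapter

# Re-attach the LoRA weights written by trainer.model.save_pretrained("finetuned_myr1").
model = PeftModel.from_pretrained(base, "finetuned_myr1")

# Optional: fold the adapter into the base weights for plain-transformers inference
# (only sensible when the base is loaded in full precision, not 4-bit).
merged = model.merge_and_unload()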