myr1-2

Running on Zero

App Files Files Community

wuhp committed 3 days ago

Commit

4cf237b

verified ·

1 Parent(s): 4e66e3d

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -56

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 import spaces
-import torch
 from datasets import load_dataset
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -13,28 +13,28 @@ from transformers import (
 )
 ##############################################################################
-# ZeroGPU constraints:
-#  1) No GPU calls in top-level code
-#  2) Decorate GPU-using functions with @spaces.GPU(...)
 ##############################################################################
-TEXT_PIPELINE = None  # We'll store an inference pipeline after training (if any).
-# We'll train on a subset of WikiText-2 to keep it short for ZeroGPU demonstration.
-NUM_EXAMPLES = 1000
-@spaces.GPU(duration=300)  # 5 minutes to do a quick demo train
 def finetune_small_subset():
     """
-    Demonstration:
-      - Loads 'wuhp/myr1' (DeepSeek)
-      - Finetunes on a small subset of WikiText-2
-      - Disables fp16 to avoid "Attempting to unscale FP16 gradients" error
-      - Saves model to 'finetuned_myr1'
-      - Reloads as pipeline for inference
     """
     # 1) Load dataset
     ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
     # 2) Load config, tokenizer, model
@@ -48,27 +48,17 @@ def finetune_small_subset():
         subfolder="myr1",
         trust_remote_code=True
     )
-    # If your GPU supports BF16 (e.g. A100), you can try:
-    #   bf16 = True, and fp16 = False
-    # Otherwise, just keep fp16=False
-    # We'll do bf16=False so we definitely skip half-precision
-    # (which avoids the "Attempting to unscale FP16 gradients" error).
-    bf16 = False
-    fp16 = False
     model = AutoModelForCausalLM.from_pretrained(
         "wuhp/myr1",
         subfolder="myr1",
         config=config,
-        # Only auto-detect if we do normal float32 or bfloat16.
-        # (We do not want normal fp16 in training.)
-        torch_dtype=torch.bfloat16 if bf16 else torch.float32,
-        device_map="auto",
         trust_remote_code=True
     )
-    # 3) Tokenize data
     def tokenize_fn(ex):
         return tokenizer(ex["text"], truncation=True, max_length=512)
@@ -77,72 +67,73 @@ def finetune_small_subset():
     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-    # 4) TrainingArguments
     training_args = TrainingArguments(
         output_dir="finetuned_myr1",
         num_train_epochs=1,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
         logging_steps=10,
-        save_steps=999999,       # won't save mid-training
         save_total_limit=1,
-        # Turn off half precision explicitly
-        fp16=fp16,
-        bf16=bf16,
-        # If the above doesn't fix it, remove advanced features that auto uses
-        # gradient scaling, or do more manual approach.
     )
-    # 5) Build Trainer
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=ds,
-        data_collator=collator
     )
     # 6) Train
     trainer.train()
-    # 7) Save final
     trainer.save_model("finetuned_myr1")
     tokenizer.save_pretrained("finetuned_myr1")
-    # 8) Reload the newly trained model as a pipeline
     finetuned_model = AutoModelForCausalLM.from_pretrained(
         "finetuned_myr1",
-        torch_dtype=torch.bfloat16 if bf16 else torch.float32,
         device_map="auto",
         trust_remote_code=True
     )
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)
     return "Finetuning complete! Model reloaded for inference."
 def ensure_pipeline():
     """
-    If we haven't finetuned yet, or if TEXT_PIPELINE is None,
-    load the original model from 'wuhp/myr1' for inference.
     """
     global TEXT_PIPELINE
     if TEXT_PIPELINE is None:
-        tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
-        # We'll do float32 for inference if no BF16 or fp16.
-        model = AutoModelForCausalLM.from_pretrained(
             "wuhp/myr1",
             subfolder="myr1",
-            torch_dtype=torch.float32,
-            device_map="auto",
             trust_remote_code=True
         )
         TEXT_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
     return TEXT_PIPELINE
-@spaces.GPU(duration=120)  # up to 2 minutes for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
-    Generates text from the (finetuned) pipeline or the original model.
-    Allows user to adjust temperature, top_p, and token range [260..5000].
     """
     pipe = ensure_pipeline()
     out = pipe(
@@ -155,20 +146,20 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]
 # Build Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("## ZeroGPU Mini-Finetuning (No FP16) + Long Text Generation")
-    # 1) Button to run finetune_small_subset()
-    finetune_btn = gr.Button("Finetune WikiText-2 (Subset)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
-    gr.Markdown("Use 'Generate' to produce text from either the newly finetuned or original model.")
     prompt_in = gr.Textbox(lines=3, label="Prompt")
-    temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
-    top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
     min_tokens = gr.Slider(260, 5000, value=260, step=10, label="Min New Tokens")
     max_tokens = gr.Slider(260, 5000, value=500, step=50, label="Max New Tokens")

 import gradio as gr
 import spaces
 from datasets import load_dataset
+import torch
 from transformers import (
     AutoConfig,
     AutoTokenizer,
 )
 ##############################################################################
+# GLOBALS / ZERO-GPU APPROACH
 ##############################################################################
+# We store a global pipeline after finetuning (if any).
+TEXT_PIPELINE = None
+# We'll train on only 50 examples from WikiText-2 to keep it short.
+NUM_EXAMPLES = 50
+@spaces.GPU(duration=600)  # up to 600 seconds (10 minutes) for mini-finetuning
 def finetune_small_subset():
     """
+    1) Loads 'wuhp/myr1' in 8-bit,
+    2) Takes 50 examples from WikiText-2,
+    3) Finetunes for 1 epoch,
+    4) Saves to 'finetuned_myr1/',
+    5) Reloads the new model into a pipeline for inference.
     """
     # 1) Load dataset
     ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
+    # Keep only 50 to fit ephemeral GPU time
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
     # 2) Load config, tokenizer, model
         subfolder="myr1",
         trust_remote_code=True
     )
+    # 8-bit loading via bitsandbytes
     model = AutoModelForCausalLM.from_pretrained(
         "wuhp/myr1",
         subfolder="myr1",
         config=config,
+        load_in_8bit=True,         # <--- 8-bit
+        device_map="auto",         # let HF manage device placement
         trust_remote_code=True
     )
+    # 3) Tokenize
     def tokenize_fn(ex):
         return tokenizer(ex["text"], truncation=True, max_length=512)
     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    # 4) TrainingArguments: no fp16 to avoid half-precision gradient issues
     training_args = TrainingArguments(
         output_dir="finetuned_myr1",
         num_train_epochs=1,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
         logging_steps=10,
+        save_steps=999999,       # skip mid-training saves
         save_total_limit=1,
+        fp16=False,              # <--- disable FP16
     )
+    # 5) Trainer
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=ds,
+        data_collator=collator,
     )
     # 6) Train
     trainer.train()
+    # 7) Save final model
     trainer.save_model("finetuned_myr1")
     tokenizer.save_pretrained("finetuned_myr1")
+    # 8) Reload the newly finetuned model as a pipeline (for inference)
     finetuned_model = AutoModelForCausalLM.from_pretrained(
         "finetuned_myr1",
         device_map="auto",
         trust_remote_code=True
     )
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)
     return "Finetuning complete! Model reloaded for inference."
 def ensure_pipeline():
     """
+    If no pipeline yet, load the original model from wuhp/myr1 for inference.
+    (The model is loaded in 8-bit with device_map="auto", not in normal float.)
     """
     global TEXT_PIPELINE
     if TEXT_PIPELINE is None:
+        tokenizer = AutoTokenizer.from_pretrained(
             "wuhp/myr1",
             subfolder="myr1",
             trust_remote_code=True
         )
+        model = AutoModelForCausalLM.from_pretrained(
+            "wuhp/myr1",
+            subfolder="myr1",
+            trust_remote_code=True,
+            load_in_8bit=True,   # load in 8-bit also for inference
+            device_map="auto"
+        )
         TEXT_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
     return TEXT_PIPELINE
+@spaces.GPU(duration=120)  # up to 120s for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
+    Generates text from either the finetuned pipeline (if it exists) or the base model.
+    Allows user to adjust temperature, top_p, min/max tokens.
     """
     pipe = ensure_pipeline()
     out = pipe(
     )
     return out[0]["generated_text"]
 # Build Gradio UI
 with gr.Blocks() as demo:
+    gr.Markdown("## ZeroGPU: Mini-Finetune with 8-bit + Extended Generation")
+    finetune_btn = gr.Button("Finetune on 50 lines of WikiText-2 (up to 10 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
+    gr.Markdown("After finetuning, or even without it, generate text below:")
     prompt_in = gr.Textbox(lines=3, label="Prompt")
+    temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
+    top_p = gr.Slider(0.0, 1.0, step=0.05, value=0.9, label="Top-p")
     min_tokens = gr.Slider(260, 5000, value=260, step=10, label="Min New Tokens")
     max_tokens = gr.Slider(260, 5000, value=500, step=50, label="Max New Tokens")