
wuhp committed · Commit b26485f · verified · 1 Parent(s): b446d41

Update app.py

Files changed (1):
  1. app.py +48 -57
app.py CHANGED
@@ -1,73 +1,64 @@
  import gradio as gr
  import spaces
- import torch
- from transformers import Trainer, TrainingArguments
- from datasets import load_dataset
- from transformers import (
-     AutoConfig,
-     AutoTokenizer,
-     AutoModelForCausalLM,
-     DataCollatorForLanguageModeling,
- )

- @spaces.GPU(duration=600)  # 10 minutes
- def run_finetuning():
-     # Load dataset
-     ds = load_dataset("Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B")
-     # maybe select a small subset (like 1000 rows) or you'll likely time out
-     ds_small = ds["train"].select(range(1000))

-     # Format example:
-     def format_row(ex):
-         return {"text": f"User: {ex['instruction']}\nAssistant: {ex['response']}"}
-     ds_small = ds_small.map(format_row)
-
-     # Load config/tokenizer/model with trust_remote_code
-     config = AutoConfig.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
-     tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
      model = AutoModelForCausalLM.from_pretrained(
          "wuhp/myr1",
          subfolder="myr1",
          config=config,
-         torch_dtype=torch.float16,
-         device_map="auto",
          trust_remote_code=True
      )

-     # Tokenize
-     def tokenize(ex):
-         return tokenizer(ex["text"], truncation=True, max_length=512)
-     ds_small = ds_small.map(tokenize, batched=True)

-     ds_small.set_format("torch")
-     collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

-     # Trainer
-     args = TrainingArguments(
-         output_dir="finetuned_model",
-         num_train_epochs=1,
-         per_device_train_batch_size=1,
-         logging_steps=5,
-         fp16=True,
-         save_strategy="no",
-     )
-     trainer = Trainer(
-         model=model,
-         args=args,
-         train_dataset=ds_small,
-         data_collator=collator,
-     )
-     trainer.train()
-
-     # Save
-     trainer.save_model("finetuned_model")
-     tokenizer.save_pretrained("finetuned_model")
-     return "Finetuning done!"
-
- # Then define a Gradio UI that calls run_finetuning
  with gr.Blocks() as demo:
-     btn = gr.Button("Run Finetuning (10 min max!)")
-     status = gr.Textbox(label="Status")
-     btn.click(fn=run_finetuning, inputs=None, outputs=status)

  demo.launch()
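The only artifact the removed version produced was the checkpoint written to finetuned_model/ by trainer.save_model and tokenizer.save_pretrained. As an illustration only (not part of this commit), a minimal sketch of reloading that directory for generation; it assumes the directory exists and that the custom wuhp/myr1 architecture is still resolvable with trust_remote_code=True:

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hypothetical reload of the directory written by the removed run_finetuning()
tokenizer = AutoTokenizer.from_pretrained("finetuned_model", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("finetuned_model", trust_remote_code=True)

# Prompt format mirrors the format_row() template used during fine-tuning
generate = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generate("User: Hello\nAssistant:", max_new_tokens=64)[0]["generated_text"])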
 
  import gradio as gr
  import spaces
+ from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

+ text_pipeline = None  # global var to hold our pipeline once loaded

+ @spaces.GPU(duration=120)  # request up to 120s GPU time to load the model
+ def load_model():
+     """
+     This function will run in a *child* process that has GPU allocated.
+     We can safely do device_map="auto" or .to("cuda") here.
+     """
+     config = AutoConfig.from_pretrained(
+         "wuhp/myr1",
+         subfolder="myr1",
+         trust_remote_code=True
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         "wuhp/myr1",
+         subfolder="myr1",
+         trust_remote_code=True
+     )
      model = AutoModelForCausalLM.from_pretrained(
          "wuhp/myr1",
          subfolder="myr1",
          config=config,
+         torch_dtype="auto",   # triggers GPU usage
+         device_map="auto",    # triggers GPU usage
          trust_remote_code=True
      )
+     text_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+     return text_pipe

+ def ensure_pipeline():
+     """
+     If we've never loaded the pipeline, call load_model() now.
+     If ZeroGPU has deallocated it, we might need to reload again.
+     """
+     global text_pipeline
+     if text_pipeline is None:
+         text_pipeline = load_model()  # <-- calls the GPU-wrapped function
+     return text_pipeline

+ @spaces.GPU(duration=60)  # up to 60s for each generate call
+ def predict(prompt, max_new_tokens=64):
+     """
+     Called when the user clicks 'Generate'; ensures the model is loaded,
+     then runs inference on GPU.
+     """
+     pipe = ensure_pipeline()
+     outputs = pipe(prompt, max_new_tokens=int(max_new_tokens))
+     return outputs[0]["generated_text"]

+ # Build the Gradio UI
  with gr.Blocks() as demo:
+     gr.Markdown("# ZeroGPU Inference Demo")
+     prompt = gr.Textbox(label="Prompt")
+     max_tok = gr.Slider(1, 256, value=64, step=1, label="Max New Tokens")
+     output = gr.Textbox(label="Generated Text")
+
+     generate_btn = gr.Button("Generate")
+     generate_btn.click(fn=predict, inputs=[prompt, max_tok], outputs=output)

  demo.launch()
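Once this new app.py is deployed, the Generate button is also reachable over the Gradio API, so the Space can be queried programmatically. A minimal sketch using gradio_client; the Space ID below is a placeholder (the actual Space name is not shown in this page), and the endpoint name is assumed to default to the wrapped function's name, /predict:

from gradio_client import Client

# Hypothetical Space ID; replace with the real "owner/space-name"
client = Client("wuhp/your-space-name")

# Arguments mirror the Gradio inputs: prompt text and max new tokens
result = client.predict("Hello, world", 64, api_name="/predict")
print(result)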