import gradio as gr
import spaces
import torch
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

##############################################################################
# ZeroGPU constraints:
#   1) No GPU calls in top-level code
#   2) Decorate GPU-using functions with @spaces.GPU(...)
##############################################################################

TEXT_PIPELINE = None  # Holds the inference pipeline after finetuning (if any).

# Train on a small subset of WikiText-2 to keep the ZeroGPU demo short.
NUM_EXAMPLES = 1000


@spaces.GPU(duration=300)  # up to 5 minutes for the quick demo training run
def finetune_small_subset():
    """
    Demonstration:
      - Loads 'wuhp/myr1' (DeepSeek)
      - Finetunes on a small subset of WikiText-2
      - Disables fp16 to avoid the "Attempting to unscale FP16 gradients" error
      - Saves the model to 'finetuned_myr1'
      - Reloads it as a pipeline for inference
    """
    # 1) Load dataset
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

    # 2) Load config, tokenizer, model
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )

    # If your GPU supports BF16 (e.g. an A100), you can set bf16=True and keep
    # fp16=False. Here both stay False so training runs in full float32, which
    # avoids the "Attempting to unscale FP16 gradients" error.
    bf16 = False
    fp16 = False

    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        # Load in float32 (or bfloat16 if enabled above); we avoid fp16 for training.
        torch_dtype=torch.bfloat16 if bf16 else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )

    # 3) Tokenize data
    def tokenize_fn(ex):
        return tokenizer(ex["text"], truncation=True, max_length=512)

    ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    ds.set_format("torch")

    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # 4) TrainingArguments
    training_args = TrainingArguments(
        output_dir="finetuned_myr1",
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        logging_steps=10,
        save_steps=999999,  # effectively disables mid-training checkpoints
        save_total_limit=1,
        # Disable half precision explicitly.
        fp16=fp16,
        bf16=bf16,
        # If half-precision errors still appear, disable features that rely on
        # automatic gradient scaling, or manage mixed precision manually.
    )

    # 5) Build Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds,
        data_collator=collator
    )

    # 6) Train
    trainer.train()

    # 7) Save the final model and tokenizer
    trainer.save_model("finetuned_myr1")
    tokenizer.save_pretrained("finetuned_myr1")

    # 8) Reload the newly trained model as a pipeline
    finetuned_model = AutoModelForCausalLM.from_pretrained(
        "finetuned_myr1",
        torch_dtype=torch.bfloat16 if bf16 else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )
    global TEXT_PIPELINE
    TEXT_PIPELINE = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)

    return "Finetuning complete! Model reloaded for inference."


def ensure_pipeline():
    """
    If we haven't finetuned yet (TEXT_PIPELINE is None), load the original
    model from 'wuhp/myr1' for inference.
    """
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
        tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
        # Use float32 for inference since neither bf16 nor fp16 is requested here.
        model = AutoModelForCausalLM.from_pretrained(
            "wuhp/myr1",
            subfolder="myr1",
            torch_dtype=torch.float32,
            device_map="auto",
            trust_remote_code=True
        )
        TEXT_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return TEXT_PIPELINE


@spaces.GPU(duration=120)  # up to 2 minutes for text generation
def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    """
    Generates text from the finetuned pipeline if available, otherwise from the
    original model. Lets the user adjust temperature, top_p, and the new-token
    range [260..5000].
    """
    pipe = ensure_pipeline()
    out = pipe(
        prompt,
        temperature=float(temperature),
        top_p=float(top_p),
        min_new_tokens=int(min_new_tokens),
        max_new_tokens=int(max_new_tokens),
        do_sample=True
    )
    return out[0]["generated_text"]


# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## ZeroGPU Mini-Finetuning (No FP16) + Long Text Generation")

    # 1) Button to run finetune_small_subset()
    finetune_btn = gr.Button("Finetune WikiText-2 (Subset)")
    status_box = gr.Textbox(label="Finetune Status")
    finetune_btn.click(fn=finetune_small_subset, outputs=status_box)

    gr.Markdown("Use 'Generate' to produce text from either the newly finetuned or the original model.")

    prompt_in = gr.Textbox(lines=3, label="Prompt")
    temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
    top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
    min_tokens = gr.Slider(260, 5000, value=260, step=10, label="Min New Tokens")
    max_tokens = gr.Slider(260, 5000, value=500, step=50, label="Max New Tokens")

    output_box = gr.Textbox(label="Generated Text", lines=12)
    gen_btn = gr.Button("Generate")

    gen_btn.click(
        fn=predict,
        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
        outputs=output_box
    )

demo.launch()