import gradio as gr
import spaces
import torch
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

#############################################################
# ZeroGPU REQUIREMENT:
#   - No CUDA references at global scope.
#   - All GPU usage happens inside @spaces.GPU(...) functions.
#############################################################

# We finetune on a small subset of WikiText-2 for demonstration only.
# Finetuning on the full dataset would likely exceed the typical ZeroGPU time limit.
NUM_EXAMPLES = 1000  # lower this to keep the run even quicker

# The inference pipeline is stored here after training.
TEXT_PIPELINE = None


@spaces.GPU(duration=300)  # up to 5 minutes for a mini finetuning run
def finetune_small_subset():
    """
    1) Loads the model & tokenizer from 'wuhp/myr1'.
    2) Loads a small subset of WikiText-2 for language modeling.
    3) Runs a quick 1-epoch finetune.
    4) Saves model + tokenizer to 'finetuned_myr1'.
    5) Loads the newly trained model back into a text-generation pipeline.

    Returns a success message.
    """
    # -------------------------------
    # A) Load a small dataset
    # -------------------------------
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    # Keep only a subset so we don't exceed the time limit.
    ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

    # -------------------------------
    # B) Load config, tokenizer, model from the Hub
    #    (trust_remote_code=True for the custom modeling_deepseek code)
    # -------------------------------
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True,
    )

    # -------------------------------
    # C) Tokenize the dataset
    # -------------------------------
    def format_and_tokenize(ex):
        # For standard causal LM training, each line is treated as plain text.
        return tokenizer(ex["text"], truncation=True, max_length=512)

    ds = ds.map(format_and_tokenize, batched=True, remove_columns=["text"])
    ds.set_format("torch")

    # -------------------------------
    # D) Data collator
    # -------------------------------
    collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # -------------------------------
    # E) Training arguments + Trainer
    # -------------------------------
    training_args = TrainingArguments(
        output_dir="finetuned_myr1",
        num_train_epochs=1,              # 1 epoch for demonstration
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        logging_steps=10,
        save_steps=999999,               # effectively "don't save mid-training"
        save_total_limit=1,
        fp16=torch.cuda.is_available(),
        # ZeroGPU environments are ephemeral => no real advantage to push_to_hub here.
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds,
        data_collator=collator,
    )

    # -------------------------------
    # F) Train
    # -------------------------------
    trainer.train()

    # -------------------------------
    # G) Save a local checkpoint
    # -------------------------------
    trainer.save_model("finetuned_myr1")
    tokenizer.save_pretrained("finetuned_myr1")

    # -------------------------------
    # H) Reload the newly finetuned model as a pipeline
    # -------------------------------
    # (We do this so we can run inference in the same GPU session.)
    # However, if the pipeline is used
    # *after* this function returns, we might need to re-load it in a
    # separate function call.
    finetuned_model = AutoModelForCausalLM.from_pretrained(
        "finetuned_myr1",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True,
    )
    global TEXT_PIPELINE
    TEXT_PIPELINE = pipeline(
        "text-generation",
        model=finetuned_model,
        tokenizer=tokenizer,
    )
    return "Finetuning complete. Model reloaded for inference!"


def ensure_pipeline():
    """
    If TEXT_PIPELINE is None (e.g. we haven't finetuned yet), load the
    *original* model from 'wuhp/myr1' so that 'predict' can still run.
    """
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
        # pipeline() cannot resolve a subfolder from a "repo/subfolder" string,
        # so load the tokenizer and model explicitly and pass the objects in.
        base_tokenizer = AutoTokenizer.from_pretrained(
            "wuhp/myr1",
            subfolder="myr1",
            trust_remote_code=True,
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            "wuhp/myr1",
            subfolder="myr1",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
            trust_remote_code=True,
        )
        TEXT_PIPELINE = pipeline(
            "text-generation",
            model=base_model,
            tokenizer=base_tokenizer,
        )
    return TEXT_PIPELINE


@spaces.GPU(duration=120)  # up to 2 minutes to generate text
def predict(prompt, min_new_tokens=260, max_new_tokens=2600):
    """
    Generate text from the (possibly finetuned) model.

    max_new_tokens defaults to 2,600 but the UI slider allows up to 5,000;
    min_new_tokens enforces a floor of 260 generated tokens.
    """
    pipe = ensure_pipeline()  # loads the base model if we haven't finetuned

    # Sampling parameters: do_sample=True is required for temperature/top_p
    # to take effect. Large max_new_tokens values risk hitting the GPU timeout.
    outputs = pipe(
        prompt,
        min_new_tokens=int(min_new_tokens),
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    return outputs[0]["generated_text"]


#############################################################
# Build the Gradio UI
#############################################################
with gr.Blocks() as demo:
    gr.Markdown("## ZeroGPU Finetuning & Long-Text Generation Demo")

    finetune_btn = gr.Button("Finetune on a small WikiText-2 subset (5 min limit)")
    finetune_status = gr.Textbox(label="Status")
    # Clicking the button runs 'finetune_small_subset'.
    finetune_btn.click(fn=finetune_small_subset, outputs=finetune_status)

    gr.Markdown(
        "Once finetuning completes, or if you skip it, you can still run inference "
        "with either the finetuned or the original model."
    )

    prompt_in = gr.Textbox(label="Prompt", lines=3)
    min_tok_slider = gr.Slider(
        minimum=260, maximum=5000, value=260, step=10,
        label="Minimum New Tokens",
    )
    max_tok_slider = gr.Slider(
        minimum=260, maximum=5000, value=2600, step=50,
        label="Maximum New Tokens",
    )
    gen_btn = gr.Button("Generate")
    output_box = gr.Textbox(label="Generated Text", lines=12)

    gen_btn.click(
        fn=predict,
        inputs=[prompt_in, min_tok_slider, max_tok_slider],
        outputs=output_box,
    )

demo.launch()