import gradio as gr
import spaces
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

text_pipeline = None  # global var to hold our pipeline once loaded


@spaces.GPU(duration=120)  # request up to 120s of GPU time to load the model
def load_model():
    """
    This function will run in a *child* process that has the GPU allocated.
    We can safely do device_map="auto" or .to("cuda") here.
    """
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype="auto",   # use the dtype stored in the checkpoint
        device_map="auto",    # place the weights on the allocated GPU
        trust_remote_code=True,
    )
    text_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return text_pipe


def ensure_pipeline():
    """
    If we've never loaded the pipeline, call load_model() now.
    If ZeroGPU has deallocated it, we might need to reload again.
    """
    global text_pipeline
    if text_pipeline is None:
        text_pipeline = load_model()  # <-- calls the GPU-wrapped function
    return text_pipeline


@spaces.GPU(duration=60)  # up to 60s for each generate call
def predict(prompt, max_new_tokens=64):
    """
    Called when the user clicks 'Generate'; ensures the model is loaded,
    then runs inference on the GPU.
    """
    pipe = ensure_pipeline()
    outputs = pipe(prompt, max_new_tokens=int(max_new_tokens))
    return outputs[0]["generated_text"]


# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# ZeroGPU Inference Demo")
    prompt = gr.Textbox(label="Prompt")
    max_tok = gr.Slider(1, 256, value=64, step=1, label="Max New Tokens")
    output = gr.Textbox(label="Generated Text")
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=predict, inputs=[prompt, max_tok], outputs=output)

demo.launch()
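
# ---------------------------------------------------------------------------
# Notes (assumptions, not part of the app logic above):
# - On a ZeroGPU Space this script is typically saved as app.py.
# - The imports imply roughly this requirements.txt (pin versions as needed):
#       gradio
#       transformers
#       accelerate   # required by transformers when using device_map="auto"
#       torch
# - The `spaces` package is normally available on ZeroGPU hardware; add it to
#   requirements.txt if your environment does not already provide it.
# ---------------------------------------------------------------------------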