myr1-2

Sleeping

File size: 2,109 Bytes

5755412
 
b26485f
eccd8f6
b26485f
b446d41
b26485f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eccd8f6
5755412
 
eccd8f6
b26485f
 
eccd8f6
 
b26485f
 
eccd8f6
b26485f
 
 
 
 
 
 
 
 
b446d41
b26485f
 
 
 
 
 
 
 
 
eccd8f6
b26485f
b446d41
b26485f
 
 
 
 
 
 
eccd8f6
5755412

import gradio as gr
import spaces
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

# Module-level cache for the text-generation pipeline. It starts as None and
# is filled in by ensure_pipeline() with the object returned by load_model().
text_pipeline = None

@spaces.GPU(duration=120)  # ask for up to 120s of GPU time for the load
def load_model():
    """Build and return the text-generation pipeline for ``wuhp/myr1``.

    Loads the config, tokenizer and causal-LM weights from the ``myr1``
    subfolder of the ``wuhp/myr1`` repository, then wraps the model and
    tokenizer in a ``transformers`` ``"text-generation"`` pipeline.

    Returns:
        The pipeline object created by ``transformers.pipeline``.

    NOTE(review): the ``spaces.GPU`` decorator is presumably what makes a
    GPU available while this function runs, which is why
    ``device_map="auto"`` is used here -- confirm against the ZeroGPU docs.
    """
    repo_id = "wuhp/myr1"
    # All three from_pretrained calls read from the same subfolder.
    # trust_remote_code=True allows custom code shipped in the repository
    # to be executed while loading.
    shared = {"subfolder": "myr1", "trust_remote_code": True}

    model_config = AutoConfig.from_pretrained(repo_id, **shared)
    tok = AutoTokenizer.from_pretrained(repo_id, **shared)
    lm = AutoModelForCausalLM.from_pretrained(
        repo_id,
        config=model_config,
        torch_dtype="auto",  # let transformers choose the weight dtype
        device_map="auto",  # let transformers decide where weights are placed
        **shared,
    )
    return pipeline("text-generation", model=lm, tokenizer=tok)

def ensure_pipeline():
    """Return the shared pipeline, loading it on first use.

    The pipeline is cached in the module-level ``text_pipeline``. While that
    cache is still ``None``, ``load_model()`` is called and its result is
    stored; otherwise the cached object is returned unchanged.

    Returns:
        The object currently held in ``text_pipeline``.
    """
    global text_pipeline
    if text_pipeline is not None:
        # Already loaded earlier in this process: reuse it.
        return text_pipeline
    text_pipeline = load_model()  # GPU-decorated loader
    return text_pipeline

@spaces.GPU(duration=60)  # allow each generation request up to 60s of GPU time
def predict(prompt, max_new_tokens=64):
    """Generate text for ``prompt`` and return it.

    Wired to the 'Generate' button of the UI. Obtains the pipeline through
    ``ensure_pipeline()`` and runs it on the prompt.

    Args:
        prompt: Text passed to the pipeline as its input.
        max_new_tokens: Generation length limit; converted with ``int()``
            before being forwarded to the pipeline.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    generator = ensure_pipeline()
    token_budget = int(max_new_tokens)
    results = generator(prompt, max_new_tokens=token_budget)
    first_result = results[0]
    return first_result["generated_text"]

# Assemble the Gradio interface: a prompt box and a token-count slider feed
# predict(), whose return value is shown in the output box.
with gr.Blocks() as demo:
    gr.Markdown(value="# ZeroGPU Inference Demo")
    prompt = gr.Textbox(label="Prompt")
    max_tok = gr.Slider(
        minimum=1,
        maximum=256,
        value=64,
        step=1,
        label="Max New Tokens",
    )
    output = gr.Textbox(label="Generated Text")

    generate_btn = gr.Button(value="Generate")
    # A click calls predict(prompt, max_tok) and writes the result to `output`.
    generate_btn.click(predict, [prompt, max_tok], output)

# Start serving the app when this module is executed.
demo.launch()