import gradio as gr
import spaces
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

text_pipeline = None  # global var to hold our pipeline once loaded


@spaces.GPU(duration=120)  # request up to 120s of GPU time to load the model
def load_model():
    """
    This function will run in a *child* process that has the GPU allocated.
    We can safely do device_map="auto" or .to("cuda") here.
    """
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype="auto",   # use the dtype stored in the checkpoint
        device_map="auto",    # place the weights on the allocated GPU
        trust_remote_code=True,
    )
    text_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return text_pipe


def ensure_pipeline():
    """
    If we've never loaded the pipeline, call load_model() now.
    If ZeroGPU has deallocated it, we might need to reload again.
    """
    global text_pipeline
    if text_pipeline is None:
        text_pipeline = load_model()  # <-- calls the GPU-wrapped function
    return text_pipeline


@spaces.GPU(duration=60)  # up to 60s for each generate call
def predict(prompt, max_new_tokens=64):
    """
    Called when the user clicks 'Generate'; ensures the model is loaded,
    then runs inference on the GPU.
    """
    pipe = ensure_pipeline()
    outputs = pipe(prompt, max_new_tokens=int(max_new_tokens))
    return outputs[0]["generated_text"]


# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# ZeroGPU Inference Demo")
    prompt = gr.Textbox(label="Prompt")
    max_tok = gr.Slider(1, 256, value=64, step=1, label="Max New Tokens")
    output = gr.Textbox(label="Generated Text")
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=predict, inputs=[prompt, max_tok], outputs=output)

demo.launch()
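
# ---------------------------------------------------------------------------
# Notes (assumptions, not part of the app logic above):
# - On a ZeroGPU Space this script is typically saved as app.py.
# - The imports imply roughly this requirements.txt (pin versions as needed):
#       gradio
#       transformers
#       accelerate   # required by transformers when using device_map="auto"
#       torch
# - The `spaces` package is normally available on ZeroGPU hardware; add it to
#   requirements.txt if your environment does not already provide it.
# ---------------------------------------------------------------------------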