# Spaces:
# wuhp
# /
# Running on Zero
#
# myr1-2 / app.py
# wuhp's picture
# Update app.py
# b26485f verified
# raw
# history blame
# 2.11 kB
import gradio as gr
import spaces
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline
# Module-level cache for the text-generation pipeline. It stays None until
# ensure_pipeline() stores the object returned by load_model().
text_pipeline = None
@spaces.GPU(duration=120)  # GPU-wrapped loader; duration=120 is the requested time budget
def load_model():
    """
    Build the text-generation pipeline for the "wuhp/myr1" checkpoint.

    Loads the config, the tokenizer, and the causal-LM weights from the
    "myr1" subfolder of the "wuhp/myr1" repo (each call passes
    trust_remote_code=True), then wraps model and tokenizer in a
    transformers "text-generation" pipeline.

    NOTE(review): the original author states that a spaces.GPU-decorated
    function runs in a child process with a GPU allocated, which is why
    device_map="auto" is used here -- confirm against the ZeroGPU docs.

    Returns:
        The pipeline object created by transformers.pipeline().
    """
    # All three from_pretrained calls point at the same repo location.
    repo_id = "wuhp/myr1"
    shared_kwargs = {"subfolder": "myr1", "trust_remote_code": True}

    model_config = AutoConfig.from_pretrained(repo_id, **shared_kwargs)
    tok = AutoTokenizer.from_pretrained(repo_id, **shared_kwargs)
    lm = AutoModelForCausalLM.from_pretrained(
        repo_id,
        config=model_config,
        torch_dtype="auto",  # dtype selection is delegated to transformers
        device_map="auto",  # device placement is delegated to transformers
        **shared_kwargs,
    )

    return pipeline("text-generation", model=lm, tokenizer=tok)
def ensure_pipeline():
    """
    Return the shared pipeline, creating it on first use.

    The module-level ``text_pipeline`` serves as a cache: while it is None,
    load_model() (the GPU-wrapped loader) is called and its result is stored
    there. Once a value is cached it is returned as-is; this function never
    triggers a reload on its own.
    """
    global text_pipeline

    # Fast path: something has already been loaded and cached.
    if text_pipeline is not None:
        return text_pipeline

    text_pipeline = load_model()
    return text_pipeline
@spaces.GPU(duration=60)  # GPU-wrapped handler; duration=60 is the per-call time budget
def predict(prompt, max_new_tokens=64):
    """
    Generate a completion for ``prompt``; wired to the 'Generate' button.

    Args:
        prompt: Text passed unchanged to the text-generation pipeline.
        max_new_tokens: Generation length limit; coerced with int() before
            it is handed to the pipeline.

    Returns:
        The "generated_text" field of the first result the pipeline returns.
    """
    # Fetch (or lazily create) the pipeline before touching the arguments,
    # matching the order in which failures would surface.
    generator = ensure_pipeline()
    token_budget = int(max_new_tokens)

    results = generator(prompt, max_new_tokens=token_budget)
    first_result = results[0]
    return first_result["generated_text"]
# Gradio front end: a prompt box and a token-limit slider feed predict(),
# whose return value is shown in the output box. Components are created in
# the same order as before, so the page layout is unchanged.
with gr.Blocks() as demo:
    gr.Markdown(value="# ZeroGPU Inference Demo")

    prompt = gr.Textbox(label="Prompt")
    max_tok = gr.Slider(
        minimum=1,
        maximum=256,
        value=64,
        step=1,
        label="Max New Tokens",
    )
    output = gr.Textbox(label="Generated Text")

    generate_btn = gr.Button(value="Generate")
    generate_btn.click(
        fn=predict,
        inputs=[prompt, max_tok],
        outputs=output,
    )

# Start the app as soon as the module is executed.
demo.launch()