File size: 2,109 Bytes
5755412 b26485f eccd8f6 b26485f b446d41 b26485f eccd8f6 5755412 eccd8f6 b26485f eccd8f6 b26485f eccd8f6 b26485f b446d41 b26485f eccd8f6 b26485f b446d41 b26485f eccd8f6 5755412 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import gradio as gr
import spaces
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline
# Module-level cache for the text-generation pipeline. It stays None until
# ensure_pipeline() stores the object returned by load_model().
text_pipeline = None
@spaces.GPU(duration=120)  # ask for up to 120 seconds of GPU time for loading
def load_model():
    """Build and return the text-generation pipeline for ``wuhp/myr1``.

    The config, tokenizer and causal-LM weights are all read from the
    ``myr1`` subfolder of the repository with ``trust_remote_code=True``,
    which allows code shipped in that repository to be executed.

    NOTE(review): the ``spaces.GPU`` decorator is presumably what makes a GPU
    available while this function runs; confirm the exact process/lifetime
    semantics against the ZeroGPU documentation.

    Returns:
        A ``transformers`` ``text-generation`` pipeline wrapping the loaded
        model and tokenizer.
    """
    repo_id = "wuhp/myr1"
    # Keyword arguments shared by every ``from_pretrained`` call below.
    shared_kwargs = {"subfolder": "myr1", "trust_remote_code": True}

    model_config = AutoConfig.from_pretrained(repo_id, **shared_kwargs)
    tok = AutoTokenizer.from_pretrained(repo_id, **shared_kwargs)
    causal_lm = AutoModelForCausalLM.from_pretrained(
        repo_id,
        config=model_config,
        torch_dtype="auto",  # let transformers choose the weight dtype
        device_map="auto",  # let the loader decide device placement
        **shared_kwargs,
    )

    return pipeline("text-generation", model=causal_lm, tokenizer=tok)
def ensure_pipeline():
    """Return the cached pipeline, loading it on first use.

    Reads the module-level ``text_pipeline``. When it is still ``None``,
    ``load_model()`` is called and its result is stored there before being
    returned; otherwise the stored object is returned as-is.

    Returns:
        The object currently held in ``text_pipeline``.
    """
    global text_pipeline

    # Fast path: something has already been cached.
    if text_pipeline is not None:
        return text_pipeline

    # First use: go through the GPU-decorated loader and remember the result.
    text_pipeline = load_model()
    return text_pipeline
@spaces.GPU(duration=60)  # ask for up to 60 seconds of GPU time per call
def predict(prompt, max_new_tokens=64):
    """Generate text for *prompt* and return it.

    This is the handler wired to the "Generate" button in the UI.

    Args:
        prompt: Text passed to the pipeline as the generation prompt.
        max_new_tokens: Upper bound on newly generated tokens; converted
            with ``int()`` before being handed to the pipeline.

    Returns:
        The ``"generated_text"`` field of the first result returned by the
        pipeline.
    """
    # Obtain the pipeline first so the load (if any) precedes everything else.
    generator = ensure_pipeline()

    token_budget = int(max_new_tokens)
    results = generator(prompt, max_new_tokens=token_budget)

    first_result = results[0]
    return first_result["generated_text"]
# Assemble the Gradio interface: a prompt box and a token-budget slider as
# inputs, a text box for the result, and a button that calls ``predict``.
with gr.Blocks() as demo:
    gr.Markdown(value="# ZeroGPU Inference Demo")

    # Inputs
    prompt = gr.Textbox(label="Prompt")
    max_tok = gr.Slider(
        minimum=1,
        maximum=256,
        value=64,
        step=1,
        label="Max New Tokens",
    )

    # Output
    output = gr.Textbox(label="Generated Text")

    # Clicking the button sends (prompt, max_tok) to predict and shows the
    # returned string in the output box.
    generate_btn = gr.Button(value="Generate")
    generate_btn.click(fn=predict, inputs=[prompt, max_tok], outputs=output)

# Start serving the app.
demo.launch()
|