import gradio as gr
import spaces
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

# Module-level cache so the pipeline is only loaded once per process.
text_pipeline = None


@spaces.GPU(duration=120)
def load_model():
    """
    This function runs in a *child* process that has the GPU allocated,
    so device_map="auto" or .to("cuda") is safe to use here.
    """
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True
    )
    text_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return text_pipe


def ensure_pipeline():
    """
    If the pipeline has never been loaded, call load_model() now.
    If ZeroGPU has deallocated it, it may need to be reloaded.
    """
    global text_pipeline
    if text_pipeline is None:
        text_pipeline = load_model()
    return text_pipeline


@spaces.GPU(duration=60)
def predict(prompt, max_new_tokens=64):
    """
    Called when the user clicks 'Generate'; ensures the model is loaded,
    then runs inference on GPU.
    """
    pipe = ensure_pipeline()
    outputs = pipe(prompt, max_new_tokens=int(max_new_tokens))
    return outputs[0]["generated_text"]
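
# Optional: sampling parameters understood by the text-generation pipeline
# (e.g. do_sample=True, temperature=0.7, top_p=0.9) can also be forwarded in the
# pipe(...) call inside predict(); the values here are illustrative, and the
# actual default behaviour depends on the model's generation config.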


with gr.Blocks() as demo:
    gr.Markdown("# ZeroGPU Inference Demo")
    prompt = gr.Textbox(label="Prompt")
    max_tok = gr.Slider(1, 256, value=64, step=1, label="Max New Tokens")
    output = gr.Textbox(label="Generated Text")

    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=predict, inputs=[prompt, max_tok], outputs=output)
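
# Optional: for long generations on ZeroGPU, enabling the Gradio request queue
# (demo.queue() before demo.launch()) is a common way to avoid request timeouts;
# whether it is needed depends on the Gradio version in use.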

demo.launch()