myr1-2

Running on Zero

App Files Files Community

myr1-2 / app.py

wuhp

Update app.py

b26485f verified 3 days ago

raw

history blame

2.11 kB

	import gradio as gr
	import spaces
	from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

	# Module-level cache for the text-generation pipeline. It stays None until
	# ensure_pipeline() stores the object returned by load_model() here.
	text_pipeline = None

	@spaces.GPU(duration=120)  # ask for up to 120 seconds of GPU time for loading
	def load_model():
	    """
	    Build and return a text-generation pipeline for the "wuhp/myr1" repo.

	    The config, tokenizer and causal-LM weights are each loaded from the
	    "myr1" subfolder with trust_remote_code=True. The function is wrapped
	    in @spaces.GPU, so it is intended to run with a GPU allocated; that is
	    why the model is loaded with device_map="auto".

	    Returns:
	        The object produced by pipeline("text-generation", ...) wrapping
	        the loaded model and tokenizer.
	    """
	    repo_id = "wuhp/myr1"
	    # Keyword arguments common to every from_pretrained() call below.
	    shared = {"subfolder": "myr1", "trust_remote_code": True}

	    model_config = AutoConfig.from_pretrained(repo_id, **shared)
	    tok = AutoTokenizer.from_pretrained(repo_id, **shared)
	    lm = AutoModelForCausalLM.from_pretrained(
	        repo_id,
	        config=model_config,
	        torch_dtype="auto",  # dtype is picked automatically, not hard-coded
	        device_map="auto",  # device placement is left to the loader
	        **shared,
	    )
	    return pipeline("text-generation", model=lm, tokenizer=tok)

	def ensure_pipeline():
	    """
	    Return the shared pipeline, creating it with load_model() on first use.

	    The result is cached in the module-level `text_pipeline`; once that
	    global is set, later calls return it without loading again.

	    NOTE(review): the only reload trigger is `text_pipeline is None`.
	    Nothing here detects a pipeline whose GPU was released by ZeroGPU, and
	    the cache lives in this process's globals only - confirm that this is
	    sufficient for how ZeroGPU executes the callers.
	    """
	    global text_pipeline
	    if text_pipeline is not None:
	        return text_pipeline
	    text_pipeline = load_model()  # GPU-decorated loader
	    return text_pipeline

	@spaces.GPU(duration=60)  # each generation call may use the GPU for up to 60 seconds
	def predict(prompt, max_new_tokens=64):
	    """
	    Generate text for `prompt`; this is the handler for the 'Generate' button.

	    Args:
	        prompt: Text passed to the pipeline as the generation prompt.
	        max_new_tokens: Upper bound on newly generated tokens; converted
	            with int() before being handed to the pipeline.

	    Returns:
	        The "generated_text" entry of the first item the pipeline returns.
	    """
	    generator = ensure_pipeline()  # loads the model on first use
	    token_budget = int(max_new_tokens)
	    first_result = generator(prompt, max_new_tokens=token_budget)[0]
	    return first_result["generated_text"]

	# Assemble the Gradio interface: a prompt box and token slider feed predict(),
	# whose return value is shown in the output box.
	with gr.Blocks() as demo:
	    gr.Markdown("# ZeroGPU Inference Demo")
	    prompt = gr.Textbox(label="Prompt")
	    max_tok = gr.Slider(
	        minimum=1,
	        maximum=256,
	        value=64,
	        step=1,
	        label="Max New Tokens",
	    )
	    output = gr.Textbox(label="Generated Text")

	    generate_btn = gr.Button("Generate")
	    # Clicking the button calls predict(prompt, max_tok) and displays the result.
	    generate_btn.click(predict, [prompt, max_tok], output)

	# Start serving the app.
	demo.launch()