Spaces:

yusufs
/

vllm-inference

Paused

vllm-inference / main.py

feat(first-commit): follow examples and tutorials

ae7cfbb 3 months ago

1.67 kB

	from typing import Optional
	from fastapi import FastAPI
	from pydantic import BaseModel
	from vllm import LLM, SamplingParams, RequestOutput


	# Don't forget to set HF_TOKEN in the env during running

	app = FastAPI()

	# Initialize the LLM engine
	# Replace 'your-model-path' with the actual path or name of your model

	engine = LLM(
	model='meta-llama/Llama-3.2-3B-Instruct',
	revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
	max_num_batched_tokens=512, # Reduced for T4
	max_num_seqs=16, # Reduced for T4
	gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
	max_model_len=131072, # Llama-3.2-3B-Instruct context length
	enforce_eager=True, # Disable CUDA graph
	dtype='half', # Use half precision
	)


	@app.get("/")
	def greet_json():
	return {"Hello": "World!"}


	class GenerationRequest(BaseModel):
	prompt: str
	max_tokens: int = 100
	temperature: float = 0.7
	logit_bias: Optional[dict[int, float]] = None


	class GenerationResponse(BaseModel):
	text: Optional[str]
	error: Optional[str]


	@app.post("/generate-llama3-2")
	def generate_text(request: GenerationRequest) -> list[RequestOutput] \| dict[str, str]:
	try:
	sampling_params: SamplingParams = SamplingParams(
	temperature=request.temperature,
	max_tokens=request.max_tokens,
	logit_bias=request.logit_bias,
	)

	# Generate text
	return engine.generate(
	prompts=request.prompt,
	sampling_params=sampling_params
	)

	except Exception as e:
	return {
	"error": str(e)
	}