yusufs committed
Commit 4998ce7 · 1 Parent(s): 6d19ece

feat(t4-gpu): add t4 gpu capability

Files changed (4)
  1. Dockerfile +1 -1
  2. main.py +26 -4
  3. poetry.lock +0 -0
  4. pyproject.toml +1 -0
Dockerfile CHANGED
@@ -7,7 +7,7 @@ ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
-RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113
 
 COPY --chown=user . /app
 
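Since the image now pulls wheels from the cu113 extra index, it is worth confirming inside the container that the resolved torch package is actually a CUDA build and that the T4 is visible. A minimal sketch, not part of this commit:

import torch

# A "+cuXXX" suffix in the version string indicates a CUDA wheel rather than a CPU-only build.
print(torch.__version__)
# CUDA toolkit version the wheel was compiled against (None for CPU-only builds).
print(torch.version.cuda)
# True only when a GPU such as the T4 is visible to the process.
print(torch.cuda.is_available())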
main.py CHANGED
@@ -1,3 +1,5 @@
+import torch
+from typing import Any
 from typing import Optional
 from fastapi import FastAPI
 from pydantic import BaseModel
@@ -11,7 +13,7 @@ app = FastAPI()
 # Initialize the LLM engine
 # Replace 'your-model-path' with the actual path or name of your model
 
-engine = LLM(
+engine_llama_3_2: LLM = LLM(
     model='meta-llama/Llama-3.2-3B-Instruct',
     revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
     max_num_batched_tokens=512,  # Reduced for T4
@@ -19,13 +21,33 @@ engine = LLM(
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
     max_model_len=131072,  # Llama-3.2-3B-Instruct context length
     enforce_eager=True,  # Disable CUDA graph
-    dtype='half',  # Use half precision
+    dtype='half',  # Use 'half' if you want half precision
 )
 
 
 @app.get("/")
 def greet_json():
-    return {"Hello": "World!"}
+    cuda_info: dict[str, Any] = {}
+    if torch.cuda.is_available():
+        cuda_current_device: int = torch.cuda.current_device()
+        cuda_info = {
+            "device_count": torch.cuda.device_count(),
+            "cuda_device": torch.cuda.get_device_name(cuda_current_device),
+            "cuda_capability": torch.cuda.get_device_capability(cuda_current_device),
+            "allocated": f"{round(torch.cuda.memory_allocated(cuda_current_device) / 1024 ** 3, 1)} GB",
+            "cached": f"{round(torch.cuda.memory_reserved(cuda_current_device) / 1024 ** 3, 1)} GB",
+        }
+
+    return {
+        "message": f"CUDA availability is {torch.cuda.is_available()}",
+        "cuda_info": cuda_info,
+        "model": [
+            {
+                "name": "meta-llama/Llama-3.2-3B-Instruct",
+                "revision": "0cb88a4f764b7a12671c53f0838cd831a0843b95",
+            }
+        ]
+    }
 
 
 class GenerationRequest(BaseModel):
@@ -50,7 +72,7 @@ def generate_text(request: GenerationRequest) -> list[RequestOutput] | dict[str,
     )
 
     # Generate text
-    return engine.generate(
+    return engine_llama_3_2.generate(
         prompts=request.prompt,
         sampling_params=sampling_params
     )
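With the new root handler in place, a quick smoke test looks roughly like the following. This is a sketch assuming the app is served locally as `uvicorn main:app` on port 8000; the host and port are assumptions, not part of this commit.

import json
import urllib.request

# Base URL is an assumption for local testing; point it at the deployed host instead if needed.
with urllib.request.urlopen("http://localhost:8000/") as resp:
    payload = json.load(resp)

print(payload["message"])           # e.g. "CUDA availability is True"
print(payload["cuda_info"])         # empty dict on CPU-only hosts, populated on the T4
print(payload["model"][0]["name"])  # "meta-llama/Llama-3.2-3B-Instruct"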
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -12,6 +12,7 @@ vllm = "^0.6.4.post1"
 fastapi = "^0.115.5"
 pydantic = "^2.10.2"
 uvicorn = "^0.32.1"
+torch = "^2.5.1"
 
 
 [build-system]
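torch was previously pulled in only as a transitive dependency (vllm depends on it); declaring it explicitly is why poetry.lock changed above. A purely illustrative check that the installed version falls inside the new constraint (Poetry's ^2.5.1 means >=2.5.1,<3.0.0):

import importlib.metadata

version = importlib.metadata.version("torch")
# Strip any local build tag such as "+cu121" before comparing the numeric parts.
major, minor, patch = (int(p) for p in version.split("+")[0].split(".")[:3])
assert (major, minor, patch) >= (2, 5, 1) and major < 3, f"torch {version} is outside ^2.5.1"
print(f"torch {version} satisfies ^2.5.1")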