feat(t4-gpu): add t4 gpu capability
Files changed:
- Dockerfile +1 -1
- main.py +26 -4
- poetry.lock +0 -0
- pyproject.toml +1 -0
Dockerfile

@@ -7,7 +7,7 @@ ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app

 COPY --chown=user ./requirements.txt requirements.txt
-RUN pip install --no-cache-dir
+RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113

 COPY --chown=user . /app

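The extra index URL makes pip resolve CUDA 11.3 (cu113) wheels so the installed torch/vLLM stack can use the Space's T4. A minimal sanity check one might run inside the built image (not part of this commit) to confirm a CUDA build was installed:

import torch

print(torch.__version__)          # a CUDA wheel reports a "+cuXXX" suffix
print(torch.cuda.is_available())  # should be True once the T4 is visible to the container
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # e.g. "Tesla T4"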
main.py

@@ -1,3 +1,5 @@
+import torch
+from typing import Any
 from typing import Optional
 from fastapi import FastAPI
 from pydantic import BaseModel
@@ -11,7 +13,7 @@ app = FastAPI()
 # Initialize the LLM engine
 # Replace 'your-model-path' with the actual path or name of your model

-engine = LLM(
+engine_llama_3_2: LLM = LLM(
     model='meta-llama/Llama-3.2-3B-Instruct',
     revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
     max_num_batched_tokens=512,  # Reduced for T4
@@ -19,13 +21,33 @@ engine = LLM(
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
     max_model_len=131072,  # Llama-3.2-3B-Instruct context length
     enforce_eager=True,  # Disable CUDA graph
-    dtype='half',  # Use half precision
+    dtype='half',  # Use 'half' if you want half precision
 )


 @app.get("/")
 def greet_json():
-
+    cuda_info: dict[str, Any] = {}
+    if torch.cuda.is_available():
+        cuda_current_device: int = torch.cuda.current_device()
+        cuda_info = {
+            "device_count": torch.cuda.device_count(),
+            "cuda_device": torch.cuda.get_device_name(cuda_current_device),
+            "cuda_capability": torch.cuda.get_device_capability(cuda_current_device),
+            "allocated": f"{round(torch.cuda.memory_allocated(cuda_current_device) / 1024 ** 3, 1)} GB",
+            "cached": f"{round(torch.cuda.memory_reserved(cuda_current_device) / 1024 ** 3, 1)} GB",
+        }
+
+    return {
+        "message": f"CUDA availability is {torch.cuda.is_available()}",
+        "cuda_info": cuda_info,
+        "model": [
+            {
+                "name": "meta-llama/Llama-3.2-3B-Instruct",
+                "revision": "0cb88a4f764b7a12671c53f0838cd831a0843b95",
+            }
+        ]
+    }


 class GenerationRequest(BaseModel):
@@ -50,7 +72,7 @@ def generate_text(request: GenerationRequest) -> list[RequestOutput] | dict[str,
     )

     # Generate text
-    return engine.generate(
+    return engine_llama_3_2.generate(
         prompts=request.prompt,
         sampling_params=sampling_params
     )

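With this change the root endpoint reports CUDA status and model metadata. A short client sketch (not part of the commit; the base URL is a placeholder, assuming the Space's default uvicorn port):

import requests

SPACE_URL = "http://localhost:7860"  # placeholder; substitute the deployed Space URL

info = requests.get(f"{SPACE_URL}/").json()
print(info["message"])    # e.g. "CUDA availability is True"
print(info["cuda_info"])  # device name, compute capability, allocated/cached memory
print(info["model"])      # [{"name": "meta-llama/Llama-3.2-3B-Instruct", "revision": "..."}]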
poetry.lock

The diff for this file is too large to render.

pyproject.toml

@@ -12,6 +12,7 @@ vllm = "^0.6.4.post1"
 fastapi = "^0.115.5"
 pydantic = "^2.10.2"
 uvicorn = "^0.32.1"
+torch = "^2.5.1"


 [build-system]