yusufs committed
Commit 4998ce7 · 1 Parent(s): 6d19ece

feat(t4-gpu): add t4 gpu capability

Files changed (4)
  1. Dockerfile +1 -1
  2. main.py +26 -4
  3. poetry.lock +0 -0
  4. pyproject.toml +1 -0
Dockerfile CHANGED
@@ -7,7 +7,7 @@ ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
-RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113
 
 COPY --chown=user . /app
 
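Since the image now pulls wheels from the cu113 extra index, it is worth confirming inside the container that the resolved torch package is actually a CUDA build and that the T4 is visible. A minimal sketch, not part of this commit:

import torch

# A "+cuXXX" suffix in the version string indicates a CUDA wheel rather than a CPU-only build.
print(torch.__version__)
# CUDA toolkit version the wheel was compiled against (None for CPU-only builds).
print(torch.version.cuda)
# True only when a GPU such as the T4 is visible to the process.
print(torch.cuda.is_available())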
main.py CHANGED
@@ -1,3 +1,5 @@
+import torch
+from typing import Any
 from typing import Optional
 from fastapi import FastAPI
 from pydantic import BaseModel
@@ -11,7 +13,7 @@ app = FastAPI()
 # Initialize the LLM engine
 # Replace 'your-model-path' with the actual path or name of your model
 
-engine = LLM(
+engine_llama_3_2: LLM = LLM(
     model='meta-llama/Llama-3.2-3B-Instruct',
     revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
     max_num_batched_tokens=512,  # Reduced for T4
@@ -19,13 +21,33 @@ engine = LLM(
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
     max_model_len=131072,  # Llama-3.2-3B-Instruct context length
     enforce_eager=True,  # Disable CUDA graph
-    dtype='half',  # Use half precision
+    dtype='half',  # Use 'half' if you want half precision
 )
 
 
 @app.get("/")
 def greet_json():
-    return {"Hello": "World!"}
+    cuda_info: dict[str, Any] = {}
+    if torch.cuda.is_available():
+        cuda_current_device: int = torch.cuda.current_device()
+        cuda_info = {
+            "device_count": torch.cuda.device_count(),
+            "cuda_device": torch.cuda.get_device_name(cuda_current_device),
+            "cuda_capability": torch.cuda.get_device_capability(cuda_current_device),
+            "allocated": f"{round(torch.cuda.memory_allocated(cuda_current_device) / 1024 ** 3, 1)} GB",
+            "cached": f"{round(torch.cuda.memory_reserved(cuda_current_device) / 1024 ** 3, 1)} GB",
+        }
+
+    return {
+        "message": f"CUDA availability is {torch.cuda.is_available()}",
+        "cuda_info": cuda_info,
+        "model": [
+            {
+                "name": "meta-llama/Llama-3.2-3B-Instruct",
+                "revision": "0cb88a4f764b7a12671c53f0838cd831a0843b95",
+            }
+        ]
+    }
 
 
 class GenerationRequest(BaseModel):
@@ -50,7 +72,7 @@ def generate_text(request: GenerationRequest) -> list[RequestOutput] | dict[str,
     )
 
     # Generate text
-    return engine.generate(
+    return engine_llama_3_2.generate(
         prompts=request.prompt,
         sampling_params=sampling_params
     )
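With the new root handler in place, a quick smoke test looks roughly like the following. This is a sketch assuming the app is served locally as `uvicorn main:app` on port 8000; the host and port are assumptions, not part of this commit.

import json
import urllib.request

# Base URL is an assumption for local testing; point it at the deployed host instead if needed.
with urllib.request.urlopen("http://localhost:8000/") as resp:
    payload = json.load(resp)

print(payload["message"])           # e.g. "CUDA availability is True"
print(payload["cuda_info"])         # empty dict on CPU-only hosts, populated on the T4
print(payload["model"][0]["name"])  # "meta-llama/Llama-3.2-3B-Instruct"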
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -12,6 +12,7 @@ vllm = "^0.6.4.post1"
 fastapi = "^0.115.5"
 pydantic = "^2.10.2"
 uvicorn = "^0.32.1"
+torch = "^2.5.1"
 
 
 [build-system]
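torch was previously pulled in only as a transitive dependency (vllm depends on it); declaring it explicitly is why poetry.lock changed above. A purely illustrative check that the installed version falls inside the new constraint (Poetry's ^2.5.1 means >=2.5.1,<3.0.0):

import importlib.metadata

version = importlib.metadata.version("torch")
# Strip any local build tag such as "+cu121" before comparing the numeric parts.
major, minor, patch = (int(p) for p in version.split("+")[0].split(".")[:3])
assert (major, minor, patch) >= (2, 5, 1) and major < 3, f"torch {version} is outside ^2.5.1"
print(f"torch {version} satisfies ^2.5.1")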