yusufs committed
Commit 38d356a · 1 Parent(s): 4a9e328

feat(llama3.2): run llama3.2 using bfloat16 with cache dtype fp8 with same model len

Files changed (3)
  1. Dockerfile +1 -1
  2. run-llama.sh +2 -2
  3. run-sailor.sh +7 -1
Dockerfile CHANGED
@@ -32,4 +32,4 @@ EXPOSE 7860
  RUN chmod +x /app/run-llama.sh
  RUN chmod +x /app/run-sailor.sh
 
- CMD ["/app/run-sailor.sh"]
+ CMD ["/app/run-llama.sh"]
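With this change the container starts the Llama script by default instead of the Sailor one. A minimal sketch of building and running the image locally, assuming the NVIDIA container runtime is available and using a placeholder image tag:

  docker build -t llama32-vllm .
  docker run --gpus all -p 7860:7860 llama32-vllm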
run-llama.sh CHANGED
@@ -25,7 +25,7 @@ python -u /app/openai_compatible_api_server.py \
   --port 7860 \
   --max-num-batched-tokens 32768 \
   --max-model-len 32768 \
- --dtype half \
+ --dtype bfloat16 \
+ --kv-cache-dtype fp8 \
   --enforce-eager \
   --gpu-memory-utilization 0.85
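Switching --dtype from half (fp16) to bfloat16 keeps 16-bit weights, while --kv-cache-dtype fp8 stores the KV cache in 8 bits, roughly halving cache memory per token so the same --max-model-len fits more comfortably. A quick smoke test against the OpenAI-compatible server, assuming it is reachable on localhost:7860 (the served model name below is a placeholder):

  curl http://localhost:7860/v1/models
  curl http://localhost:7860/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "<served-model-name>", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 16}'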
 
run-sailor.sh CHANGED
@@ -11,8 +11,13 @@ printf "Running sail/Sailor-4B-Chat using vLLM OpenAI compatible API Server at p
  # INFO 11-27 15:32:10 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 0.23x
  # ERROR 11-27 15:32:10 engine.py:366] The model's max seq len (32768) is larger than the maximum number of tokens that can be stored in KV cache (7536). Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
 
+ # After increasing gpu utilization to 0.9, the maximum token for this model is: 9456
+
  # 7536tokens÷1.2=6280words.
  # 6280words÷500words/page=12.56pages. (For single-spaced)
+ #
+ # 9456tokens÷1.2=7880words.
+ # 7880words÷500words/page=15.76pages. (For single-spaced)
  python -u /app/openai_compatible_api_server.py \
  --model sail/Sailor-4B-Chat \
  --revision 89a866a7041e6ec023dd462adeca8e28dd53c83e \
@@ -20,6 +25,7 @@ python -u /app/openai_compatible_api_server.py \
  --port 7860 \
  --max-num-batched-tokens 32768 \
  --max-model-len 32768 \
- --dtype half \
+ --dtype bfloat16 \
+ --kv-cache-dtype fp8 \
  --enforce-eager \
  --gpu-memory-utilization 0.9
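The capacity comments in the script can be re-derived with the same assumptions (~1.2 tokens per word, ~500 words per single-spaced page); a small awk sketch:

  awk 'BEGIN {
    split("7536 9456", toks, " ")
    for (i = 1; i <= 2; i++) {
      words = toks[i] / 1.2      # ~1.2 tokens per word
      pages = words / 500        # ~500 words per single-spaced page
      printf "%d tokens -> %.0f words -> %.2f pages\n", toks[i], words, pages
    }
  }'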