feat(llama3.2): run llama3.2 using bfloat16 with cache dtype fp8 with same model len
- Dockerfile +1 -1
- run-llama.sh +2 -2
- run-sailor.sh +7 -1
Dockerfile
CHANGED
@@ -32,4 +32,4 @@ EXPOSE 7860
 RUN chmod +x /app/run-llama.sh
 RUN chmod +x /app/run-sailor.sh
 
-CMD ["/app/run-
+CMD ["/app/run-llama.sh"]
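For reference, a quick local smoke test of the updated image might look like this; the image tag is a placeholder and --gpus all assumes the NVIDIA container toolkit is available:

docker build -t llama32-space .                  # hypothetical tag
docker run --rm --gpus all -p 7860:7860 llama32-space

curl http://localhost:7860/v1/models             # vLLM's OpenAI-compatible model listing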
run-llama.sh
CHANGED
@@ -25,7 +25,7 @@ python -u /app/openai_compatible_api_server.py \
     --port 7860 \
     --max-num-batched-tokens 32768 \
     --max-model-len 32768 \
-    --dtype
+    --dtype bfloat16 \
+    --kv-cache-dtype fp8 \
     --enforce-eager \
     --gpu-memory-utilization 0.85
-
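This dtype change is what lets --max-model-len stay at 32768 (the "same model len" in the commit title): a bfloat16 KV-cache element takes 2 bytes while an fp8 element takes 1, so the same cache budget holds roughly twice as many tokens. A minimal sketch of that arithmetic in shell, with assumed (illustrative) model dimensions:

# All shape numbers below are assumptions for illustration, not values
# taken from this commit.
layers=28 kv_heads=8 head_dim=128   # hypothetical Llama-3.2-class shape
cache_gib=4                         # hypothetical memory left over for the KV cache

awk -v L="$layers" -v H="$kv_heads" -v D="$head_dim" -v G="$cache_gib" 'BEGIN {
    # Per token: K and V (x2) across every layer, KV head, and head dim.
    bf16 = 2 * L * H * D * 2    # 2 bytes per element
    fp8  = 2 * L * H * D * 1    # 1 byte per element -> half the footprint
    printf "bf16 KV cache: ~%d tokens\n", G * 2^30 / bf16
    printf "fp8  KV cache: ~%d tokens\n", G * 2^30 / fp8
}'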
run-sailor.sh
CHANGED
@@ -11,8 +11,13 @@ printf "Running sail/Sailor-4B-Chat using vLLM OpenAI compatible API Server at p
 # INFO 11-27 15:32:10 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 0.23x
 # ERROR 11-27 15:32:10 engine.py:366] The model's max seq len (32768) is larger than the maximum number of tokens that can be stored in KV cache (7536). Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
 
+# After increasing gpu utilization to 0.9, the maximum token for this model is: 9456
+
 # 7536tokens÷1.2=6280words.
 # 6280words÷500words/page=12.56pages. (For single-spaced)
+#
+# 9456tokens÷1.2=7880words.
+# 7880words÷500words/page=15.76pages. (For single-spaced)
 python -u /app/openai_compatible_api_server.py \
     --model sail/Sailor-4B-Chat \
     --revision 89a866a7041e6ec023dd462adeca8e28dd53c83e \
@@ -20,6 +25,7 @@ python -u /app/openai_compatible_api_server.py \
     --port 7860 \
     --max-num-batched-tokens 32768 \
     --max-model-len 32768 \
-    --dtype
+    --dtype bfloat16 \
+    --kv-cache-dtype fp8 \
     --enforce-eager \
     --gpu-memory-utilization 0.9
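The comments above turn the reported KV-cache capacity into a rough page count using two rules of thumb (about 1.2 tokens per word, 500 words per single-spaced page). A tiny helper that reproduces those numbers, as a sketch:

pages() {
    # $1 = token count; apply the script's own rules of thumb.
    awk -v t="$1" 'BEGIN {
        w = t / 1.2; p = w / 500
        printf "%d tokens = %d words = %.2f pages\n", t, w, p
    }'
}

pages 7536   # capacity reported in the ERROR log -> 6280 words, 12.56 pages
pages 9456   # capacity at gpu-memory-utilization 0.9 -> 7880 words, 15.76 pages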