feat(reduce-max-length): reduce maximum length
main.py CHANGED

@@ -19,9 +19,13 @@ engine_llama_3_2: LLM = LLM(
     max_num_batched_tokens=512, # Reduced for T4
     max_num_seqs=16, # Reduced for T4
     gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-
+    # Llama-3.2-3B-Instruct's max context length is 131072 tokens, but we reduce it to 32k.
+    # 32k tokens is roughly 24k words (about 3/4 word per token); an average page is
+    # 500 (0.5k) words, so that's roughly 24k / 0.5k = 48 pages.
+    # At the full context length, inference is slower and a T4 does not have enough memory.
+    max_model_len=32768,
     enforce_eager=True, # Disable CUDA graph
-    dtype='
+    dtype='auto', # Use 'half' if you want half precision
 )
 
 
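
For reference, a minimal sketch of how the engine construction in main.py plausibly reads after this commit. The `from vllm import LLM` import and the model name are assumptions (the diff only shows the engine's keyword arguments); every other value mirrors the diff above.

from vllm import LLM

# Sketch only: the model name below is an assumption inferred from the
# variable name engine_llama_3_2; it is not shown in the diff.
engine_llama_3_2: LLM = LLM(
    model="meta-llama/Llama-3.2-3B-Instruct",  # assumed, not in the diff
    max_num_batched_tokens=512,    # Reduced for T4
    max_num_seqs=16,               # Reduced for T4
    gpu_memory_utilization=0.85,   # Slightly increased, adjust if needed
    max_model_len=32768,           # 32k tokens ~ 24k words ~ 48 pages
    enforce_eager=True,            # Disable CUDA graph capture
    dtype="auto",                  # derives the dtype from the model config
)

Since the T4 (compute capability 7.5) lacks bfloat16 support, 'half' (float16) is the usual explicit choice on that card, which is why the comment in the diff offers it as an alternative to 'auto'.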