yusufs committed
Commit 2425953 · 1 parent: 4998ce7

feat(reduce-max-length): reduce maximum length

Files changed (1): main.py (+6 −2)
main.py CHANGED
@@ -19,9 +19,13 @@ engine_llama_3_2: LLM = LLM(
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
-    max_model_len=131072,  # Llama-3.2-3B-Instruct context length
+    # Llama-3.2-3B-Instruct's max context length is 131072, but we reduce it to 32k (32768).
+    # At roughly 3/4 of a word per token, 32k tokens is about 24k words; at ~500 (0.5k) words
+    # per page, that is 24k / 0.5k = 48 pages.
+    # At the full context length, generation is slower and the T4 does not have enough memory.
+    max_model_len=32768,
     enforce_eager=True,  # Disable CUDA graph
-    dtype='half',  # Use 'half' if you want half precision
+    dtype='auto',  # Use 'half' if you want half precision
 )
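For reference, a minimal sketch of the full constructor after this change, using the vLLM LLM API. The model identifier and the import are assumptions for illustration, since only the changed arguments appear in this diff:

from vllm import LLM

engine_llama_3_2: LLM = LLM(
    model="meta-llama/Llama-3.2-3B-Instruct",  # assumed model id; not shown in this diff
    max_num_batched_tokens=512,   # Reduced for T4
    max_num_seqs=16,              # Reduced for T4
    gpu_memory_utilization=0.85,  # Fraction of GPU memory vLLM may claim
    max_model_len=32768,          # 32k context instead of the full 131072
    enforce_eager=True,           # Disable CUDA graph capture
    dtype='auto',                 # Let vLLM pick precision; use 'half' to force fp16
)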