import multiprocessing
import os
from typing import Optional, Dict, List, Union

import dotenv
from loguru import logger
from pydantic import BaseModel, Field

from api.utils.compat import model_json, disable_warnings

dotenv.load_dotenv()

disable_warnings(BaseModel)


def get_bool_env(key, default="false"):
    return os.environ.get(key, default).lower() == "true"


def get_env(key, default):
    val = os.environ.get(key, "")
    return val or default
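
# Illustrative usage sketch (assumed environment values, not part of the original module):
# with PORT=8080 set and HOST unset, the helpers behave as follows:
#
#     get_env("HOST", "0.0.0.0")     -> "0.0.0.0"   (unset, so the default is returned)
#     get_env("PORT", 8000)          -> "8080"      (environment values come back as strings)
#     get_bool_env("LOAD_IN_8BIT")   -> False       (only the literal "true", any case, enables a flag)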


class Settings(BaseModel):
    """ Application settings, populated from environment variables at import time. """
    host: Optional[str] = Field(
        default=get_env("HOST", "0.0.0.0"),
        description="Listen address.",
    )
    port: Optional[int] = Field(
        default=int(get_env("PORT", 8000)),
        description="Listen port.",
    )
    api_prefix: Optional[str] = Field(
        default=get_env("API_PREFIX", "/v1"),
        description="API prefix.",
    )
    engine: Optional[str] = Field(
        default=get_env("ENGINE", "default"),
        description="Choices are ['default', 'vllm', 'llama.cpp', 'tgi'].",
    )

    # model related
    model_name: Optional[str] = Field(
        default=get_env("MODEL_NAME", None),
        description="The name of the model to use for generating completions."
    )
    model_path: Optional[str] = Field(
        default=get_env("MODEL_PATH", None),
        description="The path to the model to use for generating completions."
    )
    adapter_model_path: Optional[str] = Field(
        default=get_env("ADAPTER_MODEL_PATH", None),
        description="Path to a LoRA file to apply to the model."
    )
    resize_embeddings: Optional[bool] = Field(
        default=get_bool_env("RESIZE_EMBEDDINGS"),
        description="Whether to resize embeddings."
    )
    dtype: Optional[str] = Field(
        default=get_env("DTYPE", "half"),
        description="Precision dtype."
    )

    # device related
    device: Optional[str] = Field(
        default=get_env("DEVICE", "cuda"),
        description="Device to load the model."
    )
    device_map: Optional[Union[str, Dict]] = Field(
        default=get_env("DEVICE_MAP", None),
        description="Device map to load the model."
    )
    gpus: Optional[str] = Field(
        default=get_env("GPUS", None),
        description="Comma-separated list of GPU ids to load the model on."
    )
    num_gpus: Optional[int] = Field(
        default=int(get_env("NUM_GPUS", 1)),
        ge=0,
        description="How many GPUs to use to load the model."
    )
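    # Illustrative combination (hypothetical values): GPUS="0,1" with NUM_GPUS=2 passes the
    # consistency check at the bottom of this module and exports CUDA_VISIBLE_DEVICES="0,1".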

    # embedding related
    only_embedding: Optional[bool] = Field(
        default=get_bool_env("ONLY_EMBEDDING"),
        description="Whether to launch embedding server only."
    )
    embedding_name: Optional[str] = Field(
        default=get_env("EMBEDDING_NAME", None),
        description="The name or path of the model to use for generating embeddings."
    )
    embedding_size: Optional[int] = Field(
        default=int(get_env("EMBEDDING_SIZE", -1)),
        description="The embedding size to use for generating embeddings."
    )
    embedding_device: Optional[str] = Field(
        default=get_env("EMBEDDING_DEVICE", "cuda"),
        description="Device to load the embedding model."
    )

    # quantize related
    quantize: Optional[int] = Field(
        default=int(get_env("QUANTIZE", 16)),
        description="Quantize level for model."
    )
    load_in_8bit: Optional[bool] = Field(
        default=get_bool_env("LOAD_IN_8BIT"),
        description="Whether to load the model in 8 bit."
    )
    load_in_4bit: Optional[bool] = Field(
        default=get_bool_env("LOAD_IN_4BIT"),
        description="Whether to load the model in 4 bit."
    )
    using_ptuning_v2: Optional[bool] = Field(
        default=get_bool_env("USING_PTUNING_V2"),
        description="Whether to load the model using ptuning_v2."
    )
    pre_seq_len: Optional[int] = Field(
        default=int(get_env("PRE_SEQ_LEN", 128)),
        ge=0,
        description="PRE_SEQ_LEN for ptuning_v2."
    )

    # context related
    context_length: Optional[int] = Field(
        default=int(get_env("CONTEXT_LEN", -1)),
        ge=-1,
        description="Context length for generating completions."
    )
    chat_template: Optional[str] = Field(
        default=get_env("PROMPT_NAME", None),
        description="Chat template for generating completions."
    )
    patch_type: Optional[str] = Field(
        default=get_env("PATCH_TYPE", None),
        description="Patch type for generating completions."
    )
    alpha: Optional[Union[str, float]] = Field(
        default=get_env("ALPHA", "auto"),
        description="Alpha for generating completions."
    )

    # vllm related
    trust_remote_code: Optional[bool] = Field(
        default=get_bool_env("TRUST_REMOTE_CODE"),
        description="Whether to trust remote code when loading the model."
    )
    tokenize_mode: Optional[str] = Field(
        default=get_env("TOKENIZE_MODE", "auto"),
        description="Tokenize mode for vllm server."
    )
    tensor_parallel_size: Optional[int] = Field(
        default=int(get_env("TENSOR_PARALLEL_SIZE", 1)),
        ge=1,
        description="Tensor parallel size for vllm server."
    )
    gpu_memory_utilization: Optional[float] = Field(
        default=float(get_env("GPU_MEMORY_UTILIZATION", 0.9)),
        description="GPU memory utilization for vllm server."
    )
    max_num_batched_tokens: Optional[int] = Field(
        default=int(get_env("MAX_NUM_BATCHED_TOKENS", -1)),
        ge=-1,
        description="Max num batched tokens for vllm server."
    )
    max_num_seqs: Optional[int] = Field(
        default=int(get_env("MAX_NUM_SEQS", 256)),
        ge=1,
        description="Max num seqs for vllm server."
    )
    quantization_method: Optional[str] = Field(
        default=get_env("QUANTIZATION_METHOD", None),
        description="Quantization method for vllm server."
    )

    # support for transformers.TextIteratorStreamer
    use_streamer_v2: Optional[bool] = Field(
        default=get_bool_env("USE_STREAMER_V2"),
        description="Support for transformers.TextIteratorStreamer."
    )

    # support for api key check
    api_keys: Optional[List[str]] = Field(
        default=get_env("API_KEYS", "").split(",") if get_env("API_KEYS", "") else None,
        description="Support for api key check."
    )
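    # For example, a hypothetical API_KEYS="sk-key1,sk-key2" is split into
    # api_keys == ["sk-key1", "sk-key2"]; when the variable is unset, api_keys stays None.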
    activate_inference: Optional[bool] = Field(
        default=get_bool_env("ACTIVATE_INFERENCE", "true"),
        description="Whether to activate inference."
    )
    interrupt_requests: Optional[bool] = Field(
        default=get_bool_env("INTERRUPT_REQUESTS", "true"),
        description="Whether to interrupt requests when a new request is received.",
    )

    # support for llama.cpp
    n_gpu_layers: Optional[int] = Field(
        default=int(get_env("N_GPU_LAYERS", 0)),
        ge=-1,
        description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
    )
    main_gpu: Optional[int] = Field(
        default=int(get_env("MAIN_GPU", 0)),
        ge=0,
        description="Main GPU to use.",
    )
    tensor_split: Optional[List[float]] = Field(
        default=[float(x) for x in get_env("TENSOR_SPLIT", "").split(",")] if get_env("TENSOR_SPLIT", None) else None,
        description="Split layers across multiple GPUs in proportion.",
    )
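    # Illustrative value (hypothetical): TENSOR_SPLIT="0.6,0.4" is parsed into [0.6, 0.4],
    # splitting layers across two GPUs in a 60/40 proportion.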
    n_batch: Optional[int] = Field(
        default=int(get_env("N_BATCH", 512)),
        ge=1,
        description="The batch size to use per eval."
    )
    n_threads: Optional[int] = Field(
        default=int(get_env("N_THREADS", max(multiprocessing.cpu_count() // 2, 1))),
        ge=1,
        description="The number of threads to use.",
    )
    n_threads_batch: Optional[int] = Field(
        default=int(get_env("N_THREADS_BATCH", max(multiprocessing.cpu_count() // 2, 1))),
        ge=0,
        description="The number of threads to use when batch processing.",
    )
    rope_scaling_type: Optional[int] = Field(
        default=int(get_env("ROPE_SCALING_TYPE", -1)),
        description="RoPE scaling type.",
    )
    rope_freq_base: Optional[float] = Field(
        default=float(get_env("ROPE_FREQ_BASE", 0.0)),
        description="RoPE base frequency."
    )
    rope_freq_scale: Optional[float] = Field(
        default=float(get_env("ROPE_FREQ_SCALE", 0.0)),
        description="RoPE frequency scaling factor.",
    )

    # support for tgi: https://github.com/huggingface/text-generation-inference
    tgi_endpoint: Optional[str] = Field(
        default=get_env("TGI_ENDPOINT", None),
        description="Text Generation Inference Endpoint.",
    )

    # support for tei: https://github.com/huggingface/text-embeddings-inference
    tei_endpoint: Optional[str] = Field(
        default=get_env("TEI_ENDPOINT", None),
        description="Text Embeddings Inference Endpoint.",
    )
    max_concurrent_requests: Optional[int] = Field(
        default=int(get_env("MAX_CONCURRENT_REQUESTS", 256)),
        description="The maximum number of concurrent requests for this particular deployment."
    )
    max_client_batch_size: Optional[int] = Field(
        default=int(get_env("MAX_CLIENT_BATCH_SIZE", 32)),
        description="The maximum number of inputs that a client can send in a single request."
    )


SETTINGS = Settings()
logger.debug(f"SETTINGS: {model_json(SETTINGS, indent=4)}")

if SETTINGS.gpus:
    if len(SETTINGS.gpus.split(",")) < SETTINGS.num_gpus:
        raise ValueError(
            f"Larger --num_gpus ({SETTINGS.num_gpus}) than --gpus {SETTINGS.gpus}!"
        )
    os.environ["CUDA_VISIBLE_DEVICES"] = SETTINGS.gpus