slices:
  - sources:
      - model: Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf
        layer_range: [1, 1]
        parameters:
          weight: 0.3
          density: 0.2
          gamma: 0.005
          normalize: true
          int8_mask: true
          random_seed: 42
          temperature: 0.5
          top_p: 0.65
          inference: true
          max_tokens: 300
          stream: true
        quantization:
          - method: int8
            value: 60
          - method: int4
            value: 40
merge_method: passthrough
base_model: huihui-ai/Llama-3.2-1B-Instruct-abliterated
dtype: float16
compression:
  pruning:
    enabled: true
    sparsity: 0.95
  distillation:
    enabled: true
    temperature: 0.7
    model_type: "distilled"
  quantization:
    enabled: true
    methods:
      - int8
      - int4
inference_optimizations:
  caching:
    enabled: true
    cache_size: 1000
  batching:
    enabled: true
    batch_size: 8
  parallelism:
    enabled: true
    workers: 4
  asynchronous:
    enabled: true
    max_concurrent_tasks: 5
  tensor_cores:
    enabled: true
  gpu:
    enabled: true
    device: cuda
  model_sharding:
    enabled: true
    shards: 2
  memory_optimization:
    enabled: true
    strategy: "offload"
  tensor_compression:
    enabled: true
    method: "tensor_factorization"
mixture_of_experts:
  enabled: true
  num_experts: 4
  gating_strategy: top_k
  top_k: 2
  load_balancing:
    enabled: true
    balance_factor: 0.5
  expert_capacity:
    max_tokens_per_expert: 512
  dynamic_routing:
    enabled: true
    routing_threshold: 0.1
  routing_optimizations:
    enabled: true
    cache_routing: true
model_sparsity:
  enabled: true
  sparsity_pattern: "block"
  mask_method: "random"
  pruning_factor: 0.98
auto_tuning:
  enabled: true
  batch_size_adaptation:
    enabled: true
    factor: 0.8
    max_batch_size: 32
  temperature_scheduling:
    enabled: true
    start_temp: 1.0
    end_temp: 0.5
    schedule: "linear"