{
  "parameters": {
    "max_total_tokens": 4096,
    "max_input_length": 2048,
    "max_batch_total_tokens": 16384,
    "max_concurrent_requests": 2,
    "max_batch_size": 2,
    "waiting_served_ratio": 0.8
  },
  "hardware": {
    "task_type": "text-generation",
    "accelerator": "gpu",
    "num_gpus": 1,
    "gpu_memory_gb": 24,
    "distributed_setup": false
  },
  "framework_type": "pytorch",
  "torch_compile": true,
  "trust_remote_code": true,
  "disable_custom_kernels": false
}