{ "producer": { "name": "modelopt", "version": "0.11.2" }, "architecture": "MedusaForCausalLM", "dtype": "float16", "num_hidden_layers": 32, "num_attention_heads": 32, "num_key_value_heads": 8, "hidden_size": 4096, "norm_epsilon": 1e-05, "vocab_size": 32000, "max_position_embeddings": 32768, "hidden_act": "silu", "use_parallel_embedding": true, "embedding_sharding_dim": 0, "quantization": { "quant_algo": "FP8", "kv_cache_quant_algo": "FP8", "exclude_modules": [ "lm_head", "*router", "*vocab_embedding", "*position_embedding", "*block_embedding", "*medusa_heads*" ] }, "mapping": { "world_size": 1, "tp_size": 1, "pp_size": 1 }, "head_size": 128, "intermediate_size": 14336, "position_embedding_type": "rope_gpt_neox", "share_embedding_table": false, "residual_mlp": false, "bias": false, "rotary_pct": 1.0, "rank": 0, "decoder": "llama", "rmsnorm": true, "lm_head_bias": false, "rotary_base": 1000000.0, "max_draft_len": 63, "num_medusa_heads": 3, "num_medusa_layers": 1 }