---
# Mixture-of-experts merge configuration: one base model, six routed
# experts, and one shared expert.
# NOTE(review): the key layout matches the mergekit-moe config schema;
# confirm against the tool version that consumes this file.

base_model: d:/DeepSeek-R1-Distill-Qwen-1.5B-uncensored
# NOTE(review): presumably selects random gate initialisation, which would
# explain why no experts carry prompts -- confirm.
gate_mode: random
architecture: qwen
dtype: float32
# 4 of the 6 routed experts listed below.
experts_per_token: 4

# Routed experts. One of them reuses the same checkpoint as base_model
# (the path differs only in drive-letter case).
experts:
  - source_model: D:/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1
  - source_model: D:/Qwen2.5-DeepScaleR-1.5B-Preview
  - source_model: D:/QwQ-R1-Distill-1.5B-CoT
  - source_model: D:/DeepSeek-R1-ReDistill-Qwen-1.5B-v1.0
  - source_model: D:/DeepSeek-R1-Distill-Qwen-1.5B-uncensored
  - source_model: D:/Qwen2.5-1.5B-Instruct

# Shared expert; same checkpoint as base_model.
shared_experts:
  - source_model: D:/DeepSeek-R1-Distill-Qwen-1.5B-uncensored
    # Downweight output from shared expert to prevent overcooking the model.
    residual_scale: 0.01