JingzeShi committed
Commit ac26339 (verified)
Parent: 133d917

Upload DogeForCausalLM

Files changed (3)
  1. config.json +3 -0
  2. configuration_doge.py +13 -13
  3. modeling_doge.py +12 -12
config.json CHANGED
@@ -25,6 +25,9 @@
   "num_cdmmoe_experts": 2048,
   "num_cdmmoe_experts_per_head": 8,
   "num_cdmmoe_heads": 4,
+  "num_cdmoe_experts": 16348,
+  "num_cdmoe_experts_per_head": 8,
+  "num_cdmoe_heads": 4,
   "num_channels": 3,
   "num_hidden_layers": 8,
   "num_key_value_heads": 1,
configuration_doge.py CHANGED
@@ -40,7 +40,7 @@ class DogeConfig(PretrainedConfig):
         hidden_size (`int`, *optional*, defaults to 1024):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 2048):
-            Dimension of the CDMoE representations.
+            Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
         hidden_bias (`bool`, *optional*, defaults to `False`):
@@ -115,13 +115,13 @@ class DogeConfig(PretrainedConfig):
             The ratio to control the proportion of the dynamic mask filled with the minimum value.
         is_moe (`bool`, *optional*, defaults to `False`):
             Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
-        num_cdmmoe_experts (`int`, *optional*, defaults to 2048):
-            Number of Private Experts for the Cross Domain Mixture of Experts.
-        num_cdmmoe_heads (`int`, *optional*, defaults to 4):
+        num_cdmoe_experts (`int`, *optional*, defaults to 16348):
+            Number of Private Experts for the Cross Domain Mixture of Experts. calculation formula: :math:`\text{num_cdmoe_experts} = (32 \times \text{num_cdmoe_heads})^2`
+        num_cdmoe_heads (`int`, *optional*, defaults to 4):
             Number of heads of Private Experts for the Cross Domain Mixture of Experts.
-        num_cdmmoe_experts_per_head (`int`, *optional*, defaults to 8):
+        num_cdmoe_experts_per_head (`int`, *optional*, defaults to 8):
             Number of Private Experts per head for the Cross Domain Mixture of Experts.
-        expert_retrieval_size (`int`, *optional*, defaults to 256):
+        expert_retrieval_size (`int`, *optional*, defaults to 64):
             Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
     """
 
@@ -158,10 +158,10 @@
         attention_dropout=0.0,
         dynamic_mask_ratio=0.0,
         is_moe=False,
-        num_cdmmoe_experts=2048,
-        num_cdmmoe_heads=4,
-        num_cdmmoe_experts_per_head=8,
-        expert_retrieval_size=256,
+        num_cdmoe_experts=16348,
+        num_cdmoe_heads=4,
+        num_cdmoe_experts_per_head=8,
+        expert_retrieval_size=64,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -188,9 +188,9 @@
         self.attention_dropout = attention_dropout
        self.dynamic_mask_ratio = dynamic_mask_ratio
         self.is_moe = is_moe
-        self.num_cdmmoe_experts = num_cdmmoe_experts
-        self.num_cdmmoe_heads = num_cdmmoe_heads
-        self.num_cdmmoe_experts_per_head = num_cdmmoe_experts_per_head
+        self.num_cdmoe_experts = num_cdmoe_experts
+        self.num_cdmoe_heads = num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = num_cdmoe_experts_per_head
         self.expert_retrieval_size = expert_retrieval_size
 
         # Validate the correctness of rotary position embeddings parameters
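
The new docstring ties the expert count to the head count via num_cdmoe_experts = (32 * num_cdmoe_heads)^2, and the modeling code below derives the per-axis key count as the integer square root of the expert count. A small sketch of that sizing rule, using only those two relationships (for 4 heads the formula evaluates to 16384):

# Illustrative sketch of the sizing rule described in the docstring; the helper
# name is hypothetical and not part of the repo.
import math

def cdmoe_sizes(num_cdmoe_heads: int = 4) -> tuple[int, int]:
    # Docstring rule: num_cdmoe_experts = (32 * num_cdmoe_heads) ** 2
    num_cdmoe_experts = (32 * num_cdmoe_heads) ** 2
    # DogeCDMoE derives the number of product keys per axis as sqrt(num_experts)
    num_keys = int(math.sqrt(num_cdmoe_experts))
    return num_cdmoe_experts, num_keys

print(cdmoe_sizes(4))   # (16384, 128)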
modeling_doge.py CHANGED
@@ -274,7 +274,7 @@ class DogeDynamicMaskAttention(nn.Module):
         attn_mask = self.prepare_dynamic_mask(
             hidden_states=hidden_states,
             dynamic_mask=dynamic_mask,
-            dynamic_mask_ratio=0.1,
+            dynamic_mask_ratio=self.dynamic_mask_ratio,
             attention_mask=attention_mask,
         )
         attn_weights = attn_weights + attn_mask
@@ -480,18 +480,18 @@ class DogeCDMoE(DogeMLP):
         self.act_fn = ACT2FN[config.hidden_act]
 
         self.expert_retrieval_dim = config.expert_retrieval_size
-        self.num_cdmmoe_experts = config.num_cdmmoe_experts
-        self.num_cdmmoe_heads = config.num_cdmmoe_heads
-        self.num_cdmmoe_experts_per_head = config.num_cdmmoe_experts_per_head
-        self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
+        self.num_cdmoe_experts = config.num_cdmoe_experts
+        self.num_cdmoe_heads = config.num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = config.num_cdmoe_experts_per_head
+        self.num_keys = int(math.sqrt(self.num_cdmoe_experts))
 
         # queries and keys for retrieval experts
-        self.queries = nn.Linear(self.hidden_dim, self.num_cdmmoe_heads * self.expert_retrieval_dim, bias=False)
-        self.keys = nn.Parameter(torch.zeros(self.num_cdmmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
+        self.queries = nn.Linear(self.hidden_dim, self.num_cdmoe_heads * self.expert_retrieval_dim, bias=False)
+        self.keys = nn.Parameter(torch.zeros(self.num_cdmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
 
         # experts
-        self.down_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
-        self.up_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
+        self.down_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
+        self.up_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
 
     def forward(
         self,
@@ -502,11 +502,11 @@ class DogeCDMoE(DogeMLP):
 
         # get similarity with queries and keys
         queries = self.queries(hidden_states)
-        queries = queries.view(bsz, seq_len, 2, self.num_cdmmoe_heads, -1).permute(2, 0, 1, 3, 4)
+        queries = queries.view(bsz, seq_len, 2, self.num_cdmoe_heads, -1).permute(2, 0, 1, 3, 4)
         sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
 
         # get experts with the highest similarity
-        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmmoe_experts_per_head, dim=-1)
+        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmoe_experts_per_head, dim=-1)
         if einx_add is not None:
             all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
             all_indices = einx_add("... i, ... j -> ... (i j)", indices_x * self.num_keys, indices_y)
@@ -515,7 +515,7 @@ class DogeCDMoE(DogeMLP):
             all_scores = all_scores.view(*scores_x.shape[:-1], -1)
             all_indices = (indices_x.unsqueeze(-1) * self.num_keys) + indices_y.unsqueeze(-2)
             all_indices = all_indices.view(*indices_x.shape[:-1], -1)
-        scores, pk_indices = all_scores.topk(self.num_cdmmoe_experts_per_head, dim=-1)
+        scores, pk_indices = all_scores.topk(self.num_cdmoe_experts_per_head, dim=-1)
         indices = all_indices.gather(-1, pk_indices)
         down_embed = self.down_embed(indices)
         up_embed = self.up_embed(indices)
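
For context, a self-contained sketch (not the module itself) of the product-key retrieval that DogeCDMoE.forward performs with the renamed attributes, following the pure-PyTorch fallback path shown above (the branch taken when einx_add is unavailable). Tensor sizes are illustrative, and the score-combining step not visible in the hunk is assumed to be a broadcast add of the two score halves.

# Hypothetical standalone sketch of the product-key top-k routing used by DogeCDMoE.
import torch

def product_key_topk(queries: torch.Tensor, keys: torch.Tensor, experts_per_head: int):
    # queries: (2, bsz, seq_len, heads, dim // 2) -- the two halves of each query
    # keys:    (heads, num_keys, 2, dim // 2)     -- the matching key halves
    num_keys = keys.shape[1]
    # similarity of each query half with its key half: (2, bsz, seq_len, heads, num_keys)
    sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, keys)
    (scores_x, scores_y), (indices_x, indices_y) = sim.topk(experts_per_head, dim=-1)
    # Cartesian combination of the two halves: scores add, indices encode (x, y) pairs
    all_scores = (scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)).flatten(-2)
    all_indices = (indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2)).flatten(-2)
    # keep the best `experts_per_head` combined experts per head
    scores, pk_indices = all_scores.topk(experts_per_head, dim=-1)
    indices = all_indices.gather(-1, pk_indices)
    return scores, indices  # both (bsz, seq_len, heads, experts_per_head)

bsz, seq_len, heads, half_dim, num_keys = 1, 3, 4, 32, 128
queries = torch.randn(2, bsz, seq_len, heads, half_dim)
keys = torch.randn(heads, num_keys, 2, half_dim)
scores, indices = product_key_topk(queries, keys, experts_per_head=8)
print(scores.shape)                              # torch.Size([1, 3, 4, 8])
print(int(indices.max()) < num_keys * num_keys)  # True: indices address num_keys**2 experts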