JingzeShi committed
Commit ac26339 (verified)
Parent: 133d917

Upload DogeForCausalLM

Files changed (3)
  1. config.json +3 -0
  2. configuration_doge.py +13 -13
  3. modeling_doge.py +12 -12
config.json CHANGED
@@ -25,6 +25,9 @@
   "num_cdmmoe_experts": 2048,
   "num_cdmmoe_experts_per_head": 8,
   "num_cdmmoe_heads": 4,
+  "num_cdmoe_experts": 16348,
+  "num_cdmoe_experts_per_head": 8,
+  "num_cdmoe_heads": 4,
   "num_channels": 3,
   "num_hidden_layers": 8,
   "num_key_value_heads": 1,
configuration_doge.py CHANGED
@@ -40,7 +40,7 @@ class DogeConfig(PretrainedConfig):
         hidden_size (`int`, *optional*, defaults to 1024):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 2048):
-            Dimension of the CDMoE representations.
+            Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
         hidden_bias (`bool`, *optional*, defaults to `False`):
@@ -115,13 +115,13 @@ class DogeConfig(PretrainedConfig):
             The ratio to control the proportion of the dynamic mask filled with the minimum value.
         is_moe (`bool`, *optional*, defaults to `False`):
             Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
-        num_cdmmoe_experts (`int`, *optional*, defaults to 2048):
-            Number of Private Experts for the Cross Domain Mixture of Experts.
-        num_cdmmoe_heads (`int`, *optional*, defaults to 4):
+        num_cdmoe_experts (`int`, *optional*, defaults to 16348):
+            Number of Private Experts for the Cross Domain Mixture of Experts. calculation formula: :math:`\text{num_cdmoe_experts} = (32 \times \text{num_cdmoe_heads})^2`
+        num_cdmoe_heads (`int`, *optional*, defaults to 4):
             Number of heads of Private Experts for the Cross Domain Mixture of Experts.
-        num_cdmmoe_experts_per_head (`int`, *optional*, defaults to 8):
+        num_cdmoe_experts_per_head (`int`, *optional*, defaults to 8):
             Number of Private Experts per head for the Cross Domain Mixture of Experts.
-        expert_retrieval_size (`int`, *optional*, defaults to 256):
+        expert_retrieval_size (`int`, *optional*, defaults to 64):
             Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
     """
 
@@ -158,10 +158,10 @@
         attention_dropout=0.0,
         dynamic_mask_ratio=0.0,
         is_moe=False,
-        num_cdmmoe_experts=2048,
-        num_cdmmoe_heads=4,
-        num_cdmmoe_experts_per_head=8,
-        expert_retrieval_size=256,
+        num_cdmoe_experts=16348,
+        num_cdmoe_heads=4,
+        num_cdmoe_experts_per_head=8,
+        expert_retrieval_size=64,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -188,9 +188,9 @@
         self.attention_dropout = attention_dropout
        self.dynamic_mask_ratio = dynamic_mask_ratio
         self.is_moe = is_moe
-        self.num_cdmmoe_experts = num_cdmmoe_experts
-        self.num_cdmmoe_heads = num_cdmmoe_heads
-        self.num_cdmmoe_experts_per_head = num_cdmmoe_experts_per_head
+        self.num_cdmoe_experts = num_cdmoe_experts
+        self.num_cdmoe_heads = num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = num_cdmoe_experts_per_head
         self.expert_retrieval_size = expert_retrieval_size
 
         # Validate the correctness of rotary position embeddings parameters
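
The new docstring ties the expert count to the head count via num_cdmoe_experts = (32 * num_cdmoe_heads)^2, and the modeling code below derives the per-axis key count as the integer square root of the expert count. A small sketch of that sizing rule, using only those two relationships (for 4 heads the formula evaluates to 16384):

# Illustrative sketch of the sizing rule described in the docstring; the helper
# name is hypothetical and not part of the repo.
import math

def cdmoe_sizes(num_cdmoe_heads: int = 4) -> tuple[int, int]:
    # Docstring rule: num_cdmoe_experts = (32 * num_cdmoe_heads) ** 2
    num_cdmoe_experts = (32 * num_cdmoe_heads) ** 2
    # DogeCDMoE derives the number of product keys per axis as sqrt(num_experts)
    num_keys = int(math.sqrt(num_cdmoe_experts))
    return num_cdmoe_experts, num_keys

print(cdmoe_sizes(4))   # (16384, 128)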
modeling_doge.py CHANGED
@@ -274,7 +274,7 @@ class DogeDynamicMaskAttention(nn.Module):
         attn_mask = self.prepare_dynamic_mask(
             hidden_states=hidden_states,
             dynamic_mask=dynamic_mask,
-            dynamic_mask_ratio=0.1,
+            dynamic_mask_ratio=self.dynamic_mask_ratio,
             attention_mask=attention_mask,
         )
         attn_weights = attn_weights + attn_mask
@@ -480,18 +480,18 @@ class DogeCDMoE(DogeMLP):
         self.act_fn = ACT2FN[config.hidden_act]
 
         self.expert_retrieval_dim = config.expert_retrieval_size
-        self.num_cdmmoe_experts = config.num_cdmmoe_experts
-        self.num_cdmmoe_heads = config.num_cdmmoe_heads
-        self.num_cdmmoe_experts_per_head = config.num_cdmmoe_experts_per_head
-        self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
+        self.num_cdmoe_experts = config.num_cdmoe_experts
+        self.num_cdmoe_heads = config.num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = config.num_cdmoe_experts_per_head
+        self.num_keys = int(math.sqrt(self.num_cdmoe_experts))
 
         # queries and keys for retrieval experts
-        self.queries = nn.Linear(self.hidden_dim, self.num_cdmmoe_heads * self.expert_retrieval_dim, bias=False)
-        self.keys = nn.Parameter(torch.zeros(self.num_cdmmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
+        self.queries = nn.Linear(self.hidden_dim, self.num_cdmoe_heads * self.expert_retrieval_dim, bias=False)
+        self.keys = nn.Parameter(torch.zeros(self.num_cdmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
 
         # experts
-        self.down_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
-        self.up_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
+        self.down_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
+        self.up_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
 
     def forward(
         self,
@@ -502,11 +502,11 @@ class DogeCDMoE(DogeMLP):
 
         # get similarity with queries and keys
         queries = self.queries(hidden_states)
-        queries = queries.view(bsz, seq_len, 2, self.num_cdmmoe_heads, -1).permute(2, 0, 1, 3, 4)
+        queries = queries.view(bsz, seq_len, 2, self.num_cdmoe_heads, -1).permute(2, 0, 1, 3, 4)
         sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
 
         # get experts with the highest similarity
-        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmmoe_experts_per_head, dim=-1)
+        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmoe_experts_per_head, dim=-1)
         if einx_add is not None:
             all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
             all_indices = einx_add("... i, ... j -> ... (i j)", indices_x * self.num_keys, indices_y)
@@ -515,7 +515,7 @@ class DogeCDMoE(DogeMLP):
             all_scores = all_scores.view(*scores_x.shape[:-1], -1)
             all_indices = (indices_x.unsqueeze(-1) * self.num_keys) + indices_y.unsqueeze(-2)
             all_indices = all_indices.view(*indices_x.shape[:-1], -1)
-        scores, pk_indices = all_scores.topk(self.num_cdmmoe_experts_per_head, dim=-1)
+        scores, pk_indices = all_scores.topk(self.num_cdmoe_experts_per_head, dim=-1)
         indices = all_indices.gather(-1, pk_indices)
         down_embed = self.down_embed(indices)
         up_embed = self.up_embed(indices)
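
For context, a self-contained sketch (not the module itself) of the product-key retrieval that DogeCDMoE.forward performs with the renamed attributes, following the pure-PyTorch fallback path shown above (the branch taken when einx_add is unavailable). Tensor sizes are illustrative, and the score-combining step not visible in the hunk is assumed to be a broadcast add of the two score halves.

# Hypothetical standalone sketch of the product-key top-k routing used by DogeCDMoE.
import torch

def product_key_topk(queries: torch.Tensor, keys: torch.Tensor, experts_per_head: int):
    # queries: (2, bsz, seq_len, heads, dim // 2) -- the two halves of each query
    # keys:    (heads, num_keys, 2, dim // 2)     -- the matching key halves
    num_keys = keys.shape[1]
    # similarity of each query half with its key half: (2, bsz, seq_len, heads, num_keys)
    sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, keys)
    (scores_x, scores_y), (indices_x, indices_y) = sim.topk(experts_per_head, dim=-1)
    # Cartesian combination of the two halves: scores add, indices encode (x, y) pairs
    all_scores = (scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)).flatten(-2)
    all_indices = (indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2)).flatten(-2)
    # keep the best `experts_per_head` combined experts per head
    scores, pk_indices = all_scores.topk(experts_per_head, dim=-1)
    indices = all_indices.gather(-1, pk_indices)
    return scores, indices  # both (bsz, seq_len, heads, experts_per_head)

bsz, seq_len, heads, half_dim, num_keys = 1, 3, 4, 32, 128
queries = torch.randn(2, bsz, seq_len, heads, half_dim)
keys = torch.randn(heads, num_keys, 2, half_dim)
scores, indices = product_key_topk(queries, keys, experts_per_head=8)
print(scores.shape)                              # torch.Size([1, 3, 4, 8])
print(int(indices.max()) < num_keys * num_keys)  # True: indices address num_keys**2 experts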