Crystalcareai committed
Update modeling_gemmoe.py

modeling_gemmoe.py CHANGED (+12 -26)
@@ -670,44 +670,30 @@ class GemmoeSparseMoeBlock(nn.Module):
         self.experts = nn.ModuleList([GemmoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
 
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        hidden_states = hidden_states.to(self.gate.weight.device)
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
 
         # router_logits: (batch * sequence_length, n_experts)
         router_logits = self.gate(hidden_states)
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        top_routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
+        topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
 
         # we cast back to the input dtype
-        top_routing_weights = top_routing_weights.to(hidden_states.dtype)
-        final_hidden_states = torch.zeros(
-            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
-        )
-
-        # Loop over all available experts in the model and perform the computation on each expert
-        for expert_idx in range(self.num_experts):
-            expert_layer = self.experts[expert_idx]
-            token_indices = (selected_experts == expert_idx).any(dim=-1).nonzero(as_tuple=True)[0]
-
-            if token_indices.numel() == 0:
-                continue
-
-            current_state = hidden_states[token_indices]
-            current_hidden_states = expert_layer(current_state)
-
-            expert_indices = (selected_experts[token_indices] == expert_idx).nonzero(as_tuple=True)[1]
-            current_hidden_states *= top_routing_weights[token_indices, expert_indices, None]
-
-            final_hidden_states[token_indices] += current_hidden_states
-
-        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        topk_weight = topk_weight.to(hidden_states.dtype)
+
+        hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
+
+        y = torch.empty_like(hidden_states)
+
+        flat_topk_idx = topk_idx.view(-1)
+        for i in range(self.num_experts):
+            expert = self.experts[i]
+            y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
+
+        y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+
+        final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
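For context, here is a minimal, self-contained sketch of the routing-and-dispatch scheme the new forward implements. The toy sizes, the seed, and the plain nn.Linear stand-ins for the gate and for the GemmoeBlockSparseTop2MLP experts are all made up for illustration; only the routing logic mirrors the committed code.

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
num_experts, top_k, hidden_dim = 4, 2, 8
batch_size, sequence_length = 2, 3

# Stand-ins: the real block uses GemmoeBlockSparseTop2MLP experts and a learned gate.
experts = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_experts)])
gate = nn.Linear(hidden_dim, num_experts, bias=False)

x = torch.randn(batch_size, sequence_length, hidden_dim)
hidden_states = x.view(-1, hidden_dim)                # (tokens, hidden_dim), tokens = 6

# Router: softmax in float32 for stability, keep the top_k experts per token,
# renormalize their weights, then cast back to the input dtype.
router_logits = gate(hidden_states)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
topk_weight, topk_idx = torch.topk(routing_weights, top_k, dim=-1, sorted=False)
topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
topk_weight = topk_weight.to(hidden_states.dtype)

# Dispatch: row i of the original becomes rows i*top_k .. i*top_k + top_k - 1,
# so the flattened top-k indices line up with the repeated rows.
hidden_states = hidden_states.repeat_interleave(top_k, dim=0)
y = torch.empty_like(hidden_states)
flat_topk_idx = topk_idx.view(-1)
for i in range(num_experts):
    y[flat_topk_idx == i] = experts[i](hidden_states[flat_topk_idx == i])

# Combine: weight each expert output by its routing weight and sum over top_k.
y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
print(final_hidden_states.shape)  # torch.Size([2, 3, 8])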
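Design-wise, the new version trades the Mixtral-style gather loop (a zeros buffer plus per-expert index arithmetic) for duplicating every token top_k times with repeat_interleave, routing rows to experts by boolean mask, and folding the top-k outputs back down with a weighted sum. Continuing the sketch above, a hypothetical cross-check (not part of the commit) against a dense evaluation of every expert confirms the masked dispatch computes the same mixture:

# Dense reference: run every expert on every token, then keep only the top-k outputs.
tokens = x.view(-1, hidden_dim)
dense = torch.stack([expert(tokens) for expert in experts], dim=1)           # (tokens, num_experts, hidden_dim)
picked = dense.gather(1, topk_idx.unsqueeze(-1).expand(-1, -1, hidden_dim))  # (tokens, top_k, hidden_dim)
ref = (picked * topk_weight.unsqueeze(-1)).sum(dim=1)
ref = ref.reshape(batch_size, sequence_length, hidden_dim)
print(torch.allclose(final_hidden_states, ref, atol=1e-6))                   # True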