Crystalcareai committed
Update modeling_gemmoe.py

modeling_gemmoe.py  +19 -17  CHANGED
@@ -616,11 +616,11 @@ class GemmoeSdpaAttention(GemmoeAttention):
         causal_mask = attention_mask
         if attention_mask is not None and cache_position is not None:
             causal_mask = causal_mask[:, :, cache_position, : key_states.shape[-2]]
-
-
-
-
-
+
+        # Cast query, key, and value states to the same dtype (bf16)
+        query_states = query_states.to(dtype=torch.bfloat16)
+        key_states = key_states.to(dtype=torch.bfloat16)
+        value_states = value_states.to(dtype=torch.bfloat16)

         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
@@ -629,9 +629,6 @@ class GemmoeSdpaAttention(GemmoeAttention):
             key_states = key_states.contiguous()
             value_states = value_states.contiguous()

-        # Cast causal_mask to the same dtype as query_states
-        if causal_mask is not None:
-            causal_mask = causal_mask.to(dtype=query_states.dtype)

         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
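Note on the two attention hunks above: the new code casts query_states, key_states, and value_states to a single dtype (bfloat16) before calling torch.nn.functional.scaled_dot_product_attention and drops the earlier cast of causal_mask alone; SDPA expects its tensor inputs to share a dtype. A minimal sketch of that constraint, separate from the model code (shapes and values here are illustrative, not taken from modeling_gemmoe.py):

import torch
import torch.nn.functional as F

# Illustrative shapes: (batch, heads, seq_len, head_dim)
q = torch.randn(1, 4, 8, 16, dtype=torch.bfloat16)
k = torch.randn(1, 4, 8, 16, dtype=torch.float32)  # mismatched dtype
v = torch.randn(1, 4, 8, 16, dtype=torch.float32)

try:
    F.scaled_dot_product_attention(q, k, v)
except RuntimeError as err:
    print("mixed dtypes rejected:", err)

# Casting everything to one dtype, as the commit does with bf16, succeeds.
k = k.to(dtype=torch.bfloat16)
v = v.to(dtype=torch.bfloat16)
out = F.scaled_dot_product_attention(q, k, v)
print(out.dtype)  # torch.bfloat16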
@@ -1215,11 +1212,15 @@ class GemmoeForCausalLM(GemmoePreTrainedModel):
         )

         hidden_states = outputs[0]
-
-        # Ensure hidden_states and lm_head have compatible dtypes
-        hidden_states = hidden_states.to(dtype=self.lm_head.weight.dtype)
-
         logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        # Handle unused parameters
+        if self.training:
+            for expert in self.model.layers[-1].block_sparse_moe.experts:
+                for param in expert.parameters():
+                    if param.requires_grad and param.grad is None:
+                        param.grad = torch.zeros_like(param)

         loss = None
         if labels is not None:
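Note on the hunk above: logits are now upcast with logits.float() after the lm_head projection, and during training any expert parameter in the last layer's block_sparse_moe that never received a gradient is given an explicit zero gradient, so every trainable parameter ends up with a .grad tensor (useful when an optimizer step or gradient reducer expects one per parameter). A self-contained sketch of the zero-grad idea, with a plain nn.Linear standing in for an unselected expert (the stand-in is hypothetical, not part of the model):

import torch
import torch.nn as nn

# Hypothetical stand-in for a MoE expert the router routed no tokens to.
unused_expert = nn.Linear(8, 8)

# Backward ran elsewhere and never touched this expert, so its parameters
# have requires_grad=True but .grad is still None.
for param in unused_expert.parameters():
    if param.requires_grad and param.grad is None:
        param.grad = torch.zeros_like(param)  # materialize a zero gradient

print(all(p.grad is not None for p in unused_expert.parameters()))  # True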
@@ -1298,8 +1299,8 @@ class GemmoeForCausalLM(GemmoePreTrainedModel):
             past_length = 0
         else:
             past_length = cache_position[-1] + 1
-        input_ids = input_ids[:,
-        position_ids = position_ids[:,
+        input_ids = input_ids[:, past_length:]
+        position_ids = position_ids[:, past_length:]

         cache_position = torch.arange(past_length, past_length + position_ids.shape[-1], device=position_ids.device)

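Note on the hunk above: in prepare_inputs_for_generation, input_ids and position_ids are now sliced from past_length onward, so only the tokens not already covered by the KV cache are fed to the model, and cache_position continues counting from past_length. A toy illustration of that bookkeeping (tensor values are made up):

import torch

input_ids = torch.tensor([[5, 7, 9, 11, 13]])                  # full sequence so far
position_ids = torch.arange(input_ids.shape[-1]).unsqueeze(0)  # [[0, 1, 2, 3, 4]]
past_length = 3                                                # tokens already cached

input_ids = input_ids[:, past_length:]        # [[11, 13]]
position_ids = position_ids[:, past_length:]  # [[3, 4]]
cache_position = torch.arange(past_length, past_length + position_ids.shape[-1])
print(input_ids.tolist(), position_ids.tolist(), cache_position.tolist())
# [[11, 13]] [[3, 4]] [3, 4]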
@@ -1328,7 +1329,6 @@ class GemmoeForCausalLM(GemmoePreTrainedModel):
                 tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
             )
         return reordered_past
-
 @add_start_docstrings(
     """
     The Gemmoe Model transformer with a sequence classification head on top (linear layer).
@@ -1418,8 +1418,10 @@ class GemmoeForSequenceClassification(GemmoePreTrainedModel):
             sequence_lengths = -1
         else:
             if input_ids is not None:
-
-                sequence_lengths =
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
             else:
                 sequence_lengths = -1

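Note on the hunk above: the sequence-classification head now locates the last non-pad token by taking argmax over the pad-token mask and subtracting one, then applies a modulo over the sequence length so that rows with no pad token wrap to the last position instead of relying on negative indexing, which keeps the computation ONNX-exportable. A small worked example (pad_token_id and the tensor values are illustrative):

import torch

pad_token_id = 0
input_ids = torch.tensor([
    [11, 12, 13,  0,  0],   # padded row: last real token at index 2
    [21, 22, 23, 24, 25],   # unpadded row: last real token at index 4
])

sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
# argmax finds the first pad position (0 when there is none) -> tensor([ 2, -1])
sequence_lengths = sequence_lengths % input_ids.shape[-1]
# the -1 of the unpadded row wraps to the last index         -> tensor([2, 4])
print(sequence_lengths)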