Crystalcareai committed
Commit 5450314 · verified · 1 Parent(s): f84e893

Update modeling_gemmoe.py

Files changed (1):
  1. modeling_gemmoe.py  +105 -96
modeling_gemmoe.py CHANGED
@@ -18,12 +18,13 @@
import math
import warnings
from typing import List, Optional, Tuple, Union
-
+ import contextlib
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+ import flash_attn_cuda_utils

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, StaticCache
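
The new import flash_attn_cuda_utils is not part of PyTorch or transformers, so this modeling file will only import on machines where that extension is installed. If the dependency is meant to be optional, a guarded import is the usual pattern; the sketch below is an assumption about packaging, not something the commit does (the _HAS_FLASH_UTILS flag is illustrative):

try:
    import flash_attn_cuda_utils  # optional fused-attention extension used by GemmoeFlashAttention2
    _HAS_FLASH_UTILS = True
except ImportError:  # fall back to stock PyTorch kernels when the extension is absent
    flash_attn_cuda_utils = None
    _HAS_FLASH_UTILS = False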
@@ -306,6 +307,7 @@ class GemmoeAttention(nn.Module):
            - The attention weights (if `output_attentions=True`).
            - The past key-value cache (if `use_cache=True`).
        """
+
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
@@ -331,12 +333,14 @@ class GemmoeAttention(nn.Module):

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

-        if attention_mask is not None:  # no matter the length, we just slice it
-            if cache_position is not None:
-                causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]]
-            else:
-                causal_mask = attention_mask
-            attn_weights = attn_weights + causal_mask
+        with torch.no_grad() if not self.training else contextlib.nullcontext():
+            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+            if attention_mask is not None:
+                if cache_position is not None:
+                    causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]]
+                else:
+                    causal_mask = attention_mask
+                attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
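
The replacement wraps the attention-score computation in torch.no_grad() at inference time and in contextlib.nullcontext() (a do-nothing context manager) during training; note that in the new version the matmul appears both before and inside the with block. Isolated, the pattern is a minimal sketch like this (the helper name is illustrative):

import contextlib
import torch

def score_context(training: bool):
    # Skip autograd bookkeeping for the scores at inference time;
    # nullcontext() is a no-op so the same `with` statement works in training.
    return contextlib.nullcontext() if training else torch.no_grad()

x = torch.randn(4, 4, requires_grad=True)
with score_context(training=False):
    scores = x @ x.t()  # scores.requires_grad is False in this branch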
@@ -371,84 +375,72 @@ class GemmoeFlashAttention2(GemmoeAttention):
        # TODO: Remove this attribute once Flash Attention for RoCm is bumped to 2.1.
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        output_attentions = False
-
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-        # Flash attention requires the input to have the shape
-        # batch_size x seq_length x head_dim x hidden_dim
-        # therefore we just need to keep the original shape
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None)
-
-        past_key_value = getattr(self, "past_key_value", past_key_value)
-        if past_key_value is not None:
-            # sin and cos are specific to RoPE models; position_ids needed for the static cache
-            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
-        # to be able to avoid many of these transpose/reshape/view.
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-
-        dropout_rate = self.attention_dropout if self.training else 0.0
-
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in the correct dtype just to be sure everything works as expected.
-        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (GemmoeRMSNorm handles it correctly)
-        input_dtype = query_states.dtype
-        if input_dtype == torch.float32:
-            if torch.is_autocast_enabled():
-                target_dtype = torch.get_autocast_gpu_dtype()
-            # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.q_proj.weight.dtype
-
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
-
-        attn_output = self._flash_attention_forward(
-            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
-        )
-
-        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None)
+
+        past_key_value = getattr(self, "past_key_value", past_key_value)
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = flash_attn_cuda_utils.pyt_flash_scaled_dot_attention(
+            query_states, key_states, value_states, attn_mask=attention_mask, dropout_prob=dropout_rate
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
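
flash_attn_cuda_utils.pyt_flash_scaled_dot_attention is not an API I can verify, so its exact semantics (expected tensor layout, mask convention, return shape) are unknown here. As a point of reference only, fused scaled-dot-product attention in stock PyTorch looks like the sketch below, which assumes the usual [batch, heads, seq, head_dim] layout rather than whatever the replaced call expects:

import torch
import torch.nn.functional as F

bsz, n_heads, q_len, head_dim = 2, 8, 16, 64
q = torch.randn(bsz, n_heads, q_len, head_dim)
k = torch.randn(bsz, n_heads, q_len, head_dim)
v = torch.randn(bsz, n_heads, q_len, head_dim)

# Fused attention: softmax(q @ k^T / sqrt(head_dim)) @ v, with optional mask and dropout.
attn_output = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, n_heads * head_dim)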
@@ -684,6 +676,7 @@ class GemmoeSparseMoeBlock(nn.Module):

        self.experts = nn.ModuleList([GemmoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])

+    @torch.jit.script
    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
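
torch.jit.script is normally applied to free functions or to whole nn.Module instances rather than used as a decorator on a plain method of an eager module, so the decorator added here (and the identical one on GemmoeDecoderLayer.forward in the next hunk) may not compile as intended. For reference, a typical module-level use looks like this minimal sketch (the toy module is illustrative):

import torch

mlp = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.GELU(), torch.nn.Linear(32, 16))
scripted = torch.jit.script(mlp)    # compile the whole module to TorchScript
out = scripted(torch.randn(4, 16))  # behaves like the eager module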
@@ -724,6 +717,7 @@ class GemmoeDecoderLayer(nn.Module):
        self.input_layernorm = GemmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = GemmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

+    @torch.jit.script
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -973,7 +967,7 @@ class GemmoeModel(GemmoePreTrainedModel):
        hidden_states = inputs_embeds

        # Normalize
-        scale_factor = torch.tensor(math_sqrt(self.config.hidden_size), dtype=hidden_states.dtype)
+        scale_factor = torch.tensor(math.sqrt(self.config.hidden_size), dtype=hidden_states.dtype)
        hidden_states = hidden_states * scale_factor
        # Decoder layers
        all_hidden_states = () if output_hidden_states else None
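
This hunk fixes a plain NameError: math_sqrt is undefined, while math.sqrt is the intended call. The scaling itself multiplies the token embeddings by sqrt(hidden_size); as a worked example with an illustrative hidden_size of 2048 (not necessarily Gemmoe's actual value), the factor is sqrt(2048) ≈ 45.25:

import math
import torch

hidden_size = 2048                  # illustrative stand-in for config.hidden_size
x = torch.randn(1, 4, hidden_size)  # [batch, seq, hidden]
scale = torch.tensor(math.sqrt(hidden_size), dtype=x.dtype)
x = x * scale                       # embeddings scaled by ~45.25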
@@ -986,8 +980,8 @@ class GemmoeModel(GemmoePreTrainedModel):
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    decoder_layer.__call__,
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    decoder_layer,
                    hidden_states,
                    causal_mask,
                    position_ids,
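
The change swaps transformers' internal _gradient_checkpointing_func for a direct call to torch.utils.checkpoint.checkpoint, which re-runs the wrapped callable during the backward pass to trade compute for activation memory. A minimal, self-contained sketch of the non-reentrant form (the Linear layer is a stand-in for a decoder layer):

import torch
import torch.utils.checkpoint as cp

layer = torch.nn.Linear(16, 16)                   # stand-in for a decoder layer
x = torch.randn(2, 16, requires_grad=True)
y = cp.checkpoint(layer, x, use_reentrant=False)  # activations recomputed during backward
y.sum().backward()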
@@ -1200,19 +1194,34 @@ class GemmoeForCausalLM(GemmoePreTrainedModel):
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            output_router_logits=output_router_logits,
-            return_dict=return_dict,
-            cache_position=cache_position,
-        )
+        if self.training:
+            outputs = torch.utils.checkpoint.checkpoint(
+                self.model,
+                input_ids,
+                attention_mask,
+                position_ids,
+                past_key_values,
+                inputs_embeds,
+                use_cache,
+                output_attentions,
+                output_hidden_states,
+                return_dict,
+                cache_position,
+            )
+        else:
+            outputs = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                output_router_logits=output_router_logits,
+                return_dict=return_dict,
+                cache_position=cache_position,
+            )

        hidden_states = outputs[0]
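
In the new training branch the whole GemmoeModel forward is checkpointed in one piece and every argument is passed positionally; the reentrant checkpoint implementation does not forward keyword arguments, and note that output_router_logits is not among the positional arguments here, unlike the else branch. When keyword options do need to reach a checkpointed callable, binding them first is a common workaround; a minimal sketch under that assumption (the toy model is a stand-in for self.model):

from functools import partial

import torch
import torch.utils.checkpoint as cp

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())  # stand-in for self.model
x = torch.randn(2, 8, requires_grad=True)

# Bind keyword-style options up front, then let checkpoint() pass the tensors positionally.
forward_fn = partial(model)  # for a HF model this might be partial(self.model, use_cache=False, return_dict=True)
out = cp.checkpoint(forward_fn, x, use_reentrant=False)
out.sum().backward()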