wuhp committed · verified
Commit 625ec9b · Parent(s): 1520350

Update myr1/modeling_deepseek.py

Files changed (1): myr1/modeling_deepseek.py (+76 -19)
myr1/modeling_deepseek.py CHANGED
@@ -2,9 +2,19 @@
 modeling_deepseek.py
 
 An improved version of the DeepSeekV3 model code with added docstrings, in-line commentary,
-some mild refactoring, and suggestions for potential future enhancements. This version is
-intended for demonstration and testing. Actual performance gains may vary based on your
-environment and training data.
+some mild refactoring, and suggestions for potential future enhancements for **reasoning** and **efficiency**.
+This version incorporates architectural considerations for enhanced reasoning,
+efficiency improvements like GQA (configurable), and placeholders for more advanced features.
+Actual performance gains may vary based on your environment and training data.
+
+**Important Notes:**
+
+* **Configuration is Key:** Many reasoning and efficiency improvements are driven by configuration changes in `configuration_deepseek.py`. You will need to update your config file to fully utilize these features. See comments marked with `[CONFIG]` for configuration-related suggestions.
+* **Placeholders:** This code includes placeholders (comments and `TODO`s) for features like Sparse Attention, more advanced MoE gating, and Chain-of-Thought (CoT) prompting. Implementing these fully requires more code modifications and potentially changes to your training/inference pipelines.
+* **Data is Crucial for Reasoning:** Reasoning improvements heavily depend on training data. Consider fine-tuning or pre-training on reasoning-focused datasets and using techniques like Chain-of-Thought data augmentation.
+* **Grouped-Query Attention (GQA):** This version includes comments and configuration hints for GQA. Full GQA implementation would require modifying the attention logic in `DeepseekV3Attention.forward()` to handle grouped K/V heads. Currently, it's MQA-style.
+* **Sparse Attention:** Placeholder for integrating Sparse Attention (e.g., Longformer, BigBird). You would need to implement a `SparseDeepseekV3Attention` class and integrate it.
+
 """
 
 import math
@@ -45,7 +55,7 @@ from transformers.utils import (
 from transformers.utils.import_utils import is_torch_fx_available
 
 # Import your configuration
-from .configuration_deepseek import DeepseekV3Config
+from .configuration_deepseek import DeepseekV3Config  # [CONFIG] Make sure DeepseekV3Config has new parameters
 
 import torch.distributed as dist
 import numpy as np
@@ -330,7 +340,7 @@ class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):
         self.register_buffer("sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False)
 
 
-# ==============================================================================
+# ==============================================================================
 # General Rotary helper functions
 # ==============================================================================
 
@@ -438,6 +448,8 @@ class MoEGate(nn.Module):
         logits = F.linear(hidden_states.float(), self.weight.float(), None)
         if self.scoring_func == "sigmoid":
             scores = logits.sigmoid()
+        elif self.scoring_func == "softmax":  # [CONFIG] Option for softmax gating
+            scores = logits.softmax(dim=-1)
         else:
             raise NotImplementedError(
                 f"Unsupported gating scoring function: {self.scoring_func}"
@@ -462,6 +474,9 @@ class MoEGate(nn.Module):
             tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
             _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
             topk_weight = scores_for_choice.gather(1, topk_idx)
+        elif self.topk_method == "topk_gating":  # [CONFIG] Option for simpler top-k gating
+            _, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+            topk_weight = torch.gather(scores, dim=-1, index=topk_idx)
         else:
             raise NotImplementedError(
                 f"Unsupported topk_method: {self.topk_method}"
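The `softmax` scoring option and the `topk_gating` branch added above reduce to a softmax over the expert logits followed by plain top-k selection. A minimal standalone sketch with hypothetical shapes (not the module's exact code):

```python
import torch

# Hypothetical sizes: 4 tokens routed over 8 experts, top_k = 2.
torch.manual_seed(0)
logits = torch.randn(4, 8)           # gate logits, shape [n_tokens, n_experts]
scores = logits.softmax(dim=-1)      # the "softmax" scoring_func

top_k = 2
topk_weight, topk_idx = torch.topk(scores, k=top_k, dim=-1, sorted=False)

# Equivalent to the gather used in the diff: picking the scores at the top-k indices.
assert torch.allclose(topk_weight, torch.gather(scores, dim=-1, index=topk_idx))
print(topk_idx.shape, topk_weight.shape)  # torch.Size([4, 2]) torch.Size([4, 2])
```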
@@ -656,6 +671,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class DeepseekV3Attention(nn.Module):
     """
     Standard multi-headed attention for Deepseek.
+
+    **Reasoning & Efficiency Improvements Considered:**
+    * **Grouped-Query Attention (GQA):** Configurable via `config.num_key_value_heads` and `config.num_attention_heads`. If `num_key_value_heads < num_attention_heads`, GQA is implicitly enabled. See comments in `forward()` for GQA implementation hints. [CONFIG]
+    * **Sparse Attention:** Placeholder for integration. To use sparse attention, you would need to create a `SparseDeepseekV3Attention` class (e.g., based on LongformerAttention or BigBirdAttention) and replace `DeepseekV3Attention` in `ATTENTION_CLASSES` based on a config flag. [CONFIG]
     """
     def __init__(self, config: DeepseekV3Config, layer_idx: Optional[int] = None):
         super().__init__()
@@ -665,6 +684,13 @@ class DeepseekV3Attention(nn.Module):
         self.attention_dropout = config.attention_dropout
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_key_value_heads  # [CONFIG] For GQA
+
+        if self.num_heads % self.num_key_value_heads != 0:  # GQA check
+            raise ValueError(
+                "num_attention_heads must be divisible by num_key_value_heads (for GQA)"
+            )
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # For GQA
 
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
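The divisibility check and `num_key_value_groups` computation added above are the standard GQA bookkeeping. A small sketch with hypothetical config values showing how query heads map onto shared K/V heads:

```python
# Hypothetical values; the real ones would come from DeepseekV3Config.
num_attention_heads = 32
num_key_value_heads = 8   # GQA: fewer K/V heads than query heads

if num_attention_heads % num_key_value_heads != 0:
    raise ValueError("num_attention_heads must be divisible by num_key_value_heads (for GQA)")

num_key_value_groups = num_attention_heads // num_key_value_heads
print(num_key_value_groups)  # 4 -> every K/V head is shared by 4 query heads
```

With `num_key_value_heads = 1` this degenerates to MQA, and with `num_key_value_heads = num_attention_heads` it is ordinary multi-head attention.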
@@ -691,16 +717,16 @@ class DeepseekV3Attention(nn.Module):
                 config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
             )
 
-        # K,V-proj (MQA style)
+        # K,V-proj (MQA/GQA style)
         self.kv_a_proj_with_mqa = nn.Linear(
             self.hidden_size,
-            config.kv_lora_rank + config.qk_rope_head_dim,
+            config.kv_lora_rank + self.qk_rope_head_dim,
             bias=config.attention_bias,
         )
         self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank)
         self.kv_b_proj = nn.Linear(
             config.kv_lora_rank,
-            self.num_heads * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
+            self.num_key_value_heads * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),  # [GQA] num_key_value_heads here
             bias=False,
         )
 
@@ -731,8 +757,8 @@ class DeepseekV3Attention(nn.Module):
                 base=self.rope_theta,
             )
         else:
-            scaling_type = self.config.rope_scaling["type"]
-            scaling_factor = self.config.rope_scaling["factor"]
+            scaling_type = self.config.rope_scaling["type"]  # [CONFIG] Rope scaling type
+            scaling_factor = self.config.rope_scaling["factor"]  # [CONFIG] Rope scaling factor
 
             if scaling_type == "linear":
                 self.rotary_emb = DeepseekV3LinearScalingRotaryEmbedding(
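The lookup above expects `config.rope_scaling` to be a dict with at least `type` and `factor` keys. A hypothetical example of such an entry and the dispatch it drives (illustrative only, not the file's exact wiring):

```python
# Hypothetical config fragment; key names follow the diff's usage
# (config.rope_scaling["type"] / config.rope_scaling["factor"]).
rope_scaling = {"type": "yarn", "factor": 4.0}

scaling_type = rope_scaling["type"]
scaling_factor = rope_scaling["factor"]

if scaling_type == "linear":
    chosen = "DeepseekV3LinearScalingRotaryEmbedding"
elif scaling_type == "yarn":
    chosen = "DeepseekV3YarnRotaryEmbedding"
else:
    raise NotImplementedError(f"Unsupported rope scaling type: {scaling_type}")
print(chosen, scaling_factor)  # DeepseekV3YarnRotaryEmbedding 4.0
```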
@@ -782,6 +808,12 @@ class DeepseekV3Attention(nn.Module):
     ):
         """
         Standard forward pass for multi-headed self-attention.
+
+        **Grouped-Query Attention (GQA) Implementation Notes:**
+        If `num_key_value_heads < num_attention_heads` (GQA is configured):
+        1. `kv` projection will produce `num_key_value_heads` * (head_dim * 2) channels.
+        2. We need to *repeat* the `key_states` and `value_states` `num_key_value_groups` times along the head dimension to match the `num_attention_heads` for query.
+        3. `repeat_kv` utility function is used for this repetition.
         """
         if "padding_mask" in kwargs:
             warnings.warn(
@@ -798,7 +830,7 @@ class DeepseekV3Attention(nn.Module):
         q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
         q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
 
-        # MQA: K,V from single projection
+        # MQA/GQA: K,V from single projection
         compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
         compressed_kv, k_pe = torch.split(
             compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
@@ -806,7 +838,7 @@ class DeepseekV3Attention(nn.Module):
         k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
         kv = (
             self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
-            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+            .view(bsz, q_len, self.num_key_value_heads, self.qk_nope_head_dim + self.v_head_dim)  # [GQA] num_key_value_heads here
             .transpose(1, 2)
         )
         k_nope, value_states = torch.split(
@@ -829,10 +861,17 @@ class DeepseekV3Attention(nn.Module):
         query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
         query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
 
-        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        key_states = k_pe.new_empty(bsz, self.num_key_value_heads, q_len, self.q_head_dim)  # [GQA] num_key_value_heads here
         key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
         key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
 
+        value_states = value_states  # [GQA] num_key_value_heads is already in value_states
+
+        # GQA: Repeat K/V states if num_key_value_heads < num_attention_heads
+        if self.num_key_value_groups != 1:
+            key_states = repeat_kv(key_states, self.num_key_value_groups)
+            value_states = repeat_kv(value_states, self.num_key_value_groups)
+
         if past_key_value is not None:
             cache_kwargs = {"sin": sin, "cos": cos}  # for RoPE
             key_states, value_states = past_key_value.update(
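The `repeat_kv` calls added above rely on the helper named in the hunk header (`repeat_kv(hidden_states, n_rep)`). Its body is not shown in this diff; the sketch below is the usual expand-and-reshape implementation of that utility, assuming the `[batch, num_kv_heads, seq, head_dim]` layout used here:

```python
import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each K/V head n_rep times so GQA K/V heads match the number of query heads."""
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)

# 2 K/V heads repeated 4 times -> 8 heads, matching 8 query heads.
kv = torch.randn(1, 2, 5, 16)
print(repeat_kv(kv, 4).shape)  # torch.Size([1, 8, 5, 16])
```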
@@ -866,7 +905,7 @@ class DeepseekV3Attention(nn.Module):
 class DeepseekV3FlashAttention2(DeepseekV3Attention):
     """
     DeepseekV3 flash attention module. Inherits the same Q/K/V projections from DeepseekV3Attention.
-    Only the forward pass changes to use flash_attn APIs.
+    Only the forward pass changes to use flash_attn APIs. Supports GQA implicitly through `DeepseekV3Attention`.
     """
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -906,7 +945,7 @@ class DeepseekV3FlashAttention2(DeepseekV3Attention):
         k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
         kv = (
             self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
-            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+            .view(bsz, q_len, self.num_key_value_heads, self.qk_nope_head_dim + self.v_head_dim)  # [GQA] num_key_value_heads here
             .transpose(1, 2)
         )
         k_nope, value_states = torch.split(
@@ -923,10 +962,17 @@ class DeepseekV3FlashAttention2(DeepseekV3Attention):
         query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
         query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
 
-        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        key_states = k_pe.new_empty(bsz, self.num_key_value_heads, q_len, self.q_head_dim)  # [GQA] num_key_value_heads here
         key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
         key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
 
+        value_states = value_states  # [GQA] value_states already has num_key_value_heads
+
+        # GQA: Repeat K/V states if num_key_value_heads < num_attention_heads
+        if self.num_key_value_groups != 1:
+            key_states = repeat_kv(key_states, self.num_key_value_groups)
+            value_states = repeat_kv(value_states, self.num_key_value_groups)
+
         if self.q_head_dim != self.v_head_dim:
             # Pad if needed
             value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])
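The `F.pad(value_states, [0, ...])` context line above pads only the last dimension on the right, so V matches the Q/K head dim that flash-attn expects. A tiny check of that behaviour with hypothetical dims:

```python
import torch
import torch.nn.functional as F

q_head_dim, v_head_dim = 192, 128            # hypothetical dims where V is narrower than Q/K
value_states = torch.randn(1, 8, 5, v_head_dim)

padded = F.pad(value_states, [0, q_head_dim - v_head_dim])  # pad last dim on the right only
print(padded.shape)                          # torch.Size([1, 8, 5, 192])
assert torch.equal(padded[..., :v_head_dim], value_states)  # original values preserved
```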
@@ -1091,6 +1137,7 @@ class DeepseekV3FlashAttention2(DeepseekV3Attention):
 ATTENTION_CLASSES = {
     "eager": DeepseekV3Attention,
     "flash_attention_2": DeepseekV3FlashAttention2,
+    # "sparse_attention": SparseDeepseekV3Attention,  # [TODO] Placeholder for Sparse Attention class - implement SparseDeepseekV3Attention
 }
 
 
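The `ATTENTION_CLASSES` mapping above is keyed by `config._attn_implementation` (see the decoder-layer hunk below). A minimal sketch of that dispatch pattern with stand-in classes, including where a hypothetical `SparseDeepseekV3Attention` would slot in:

```python
class EagerAttention:      # stand-in for DeepseekV3Attention
    pass

class FlashAttention2:     # stand-in for DeepseekV3FlashAttention2
    pass

ATTENTION_CLASSES = {
    "eager": EagerAttention,
    "flash_attention_2": FlashAttention2,
    # "sparse_attention": SparseDeepseekV3Attention,  # hypothetical, not implemented in this commit
}

attn_implementation = "flash_attention_2"   # would come from config._attn_implementation
attn_cls = ATTENTION_CLASSES[attn_implementation]
print(attn_cls.__name__)  # FlashAttention2
```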
@@ -1106,7 +1153,7 @@ class DeepseekV3DecoderLayer(nn.Module):
         super().__init__()
         self.hidden_size = config.hidden_size
 
-        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](
+        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](  # [CONFIG] _attn_implementation to select attention type
            config=config, layer_idx=layer_idx
         )
 
@@ -1138,7 +1185,7 @@ class DeepseekV3DecoderLayer(nn.Module):
         **kwargs
     ):
         """
-        Forward pass for one Deepseek decoder layer.
+        Forward pass for one Deepseek decoder layer.
         """
         residual = hidden_states
 
@@ -1443,6 +1490,10 @@ class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
         Args:
             labels (torch.LongTensor of shape (batch_size, sequence_length), optional):
                 For computing the language modeling loss. Indices in [0, config.vocab_size] or -100.
+
+        **Reasoning Enhancement Considerations:**
+        * **Chain-of-Thought (CoT) Data:** To effectively improve reasoning, fine-tune this model on datasets that include Chain-of-Thought examples. The model architecture is capable of leveraging CoT if trained appropriately.
+        * **Prompt Engineering for CoT Inference:** During inference, use prompts that encourage the model to generate reasoning steps (e.g., "Let's think step by step...") to elicit Chain-of-Thought behavior.
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (output_hidden_states if output_hidden_states is not None
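The CoT-data note above hinges on training examples that spell out intermediate steps before the answer. A hypothetical record illustrating that shape (the field names are illustrative, not a required schema):

```python
# Hypothetical Chain-of-Thought fine-tuning record; the exact schema depends on your pipeline.
cot_example = {
    "prompt": "Q: A train travels 60 km in 1.5 hours. What is its average speed?\nA: Let's think step by step.",
    "target": (
        "The train covers 60 km in 1.5 hours. "
        "Average speed = distance / time = 60 / 1.5 = 40. "
        "So the average speed is 40 km/h."
    ),
}
print(cot_example["prompt"])
```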
@@ -1500,6 +1551,12 @@ class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
     ):
         """
         Prepare inputs during generation loops.
+
+        **Chain-of-Thought (CoT) Inference Hint:**
+        When using Chain-of-Thought prompting during generation, ensure your prompts are correctly formatted to encourage reasoning.
+        Consider using techniques like:
+        * "Let's think step by step:" prefix in your prompt.
+        * Sampling strategies that encourage diverse outputs for self-consistency decoding.
         """
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
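The inference hint above (step-by-step prefix plus sampling for self-consistency) can be exercised through the standard transformers generation API. A hedged sketch, assuming the checkpoint loads with `trust_remote_code=True`; the model id below is a placeholder:

```python
# Hypothetical usage; "your-org/your-deepseek-v3-checkpoint" is a placeholder model id.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "your-org/your-deepseek-v3-checkpoint"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

prompt = "Q: If a book costs 12 dollars and you buy 3, how much do you pay?\nA: Let's think step by step."
inputs = tokenizer(prompt, return_tensors="pt")

# Sampling encourages diverse reasoning paths; generate several and pick the majority
# final answer (self-consistency decoding).
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    num_return_sequences=4,
)
for seq in outputs:
    print(tokenizer.decode(seq, skip_special_tokens=True))
```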
@@ -1672,4 +1729,4 @@ class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )