Implement MLA inference optimizations in DeepseekV2Attention
This patched DeepseekV2Model contains the following modifications to DeepseekV2Attention to reduce VRAM consumption and improve efficiency:
1. Instead of caching the decompressed key/value states, we cache only the low-rank key-value joint compression together with the decoupled RoPE part of the keys. To reuse the cache utilities of the transformers library, we treat k_pe as key_states and compressed_kv as value_states.
2. We implement the absorption technique described in the DeepSeek-V2 paper by changing the order of the matrix multiplications used to compute the query and output vectors. This not only reduces the memory consumed by intermediate tensors but also lowers the number of floating-point operations.
3. We compute the RoPE part and the non-RoPE part of the attention score separately and then sum them. The original implementation concatenates the two parts of the query/key vectors, which proves quite inefficient when caching compressed key/value states because of unnecessary data broadcasting and memory round-trips. A standalone sketch of the resulting computation follows this list.
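For concreteness, here is a minimal, self-contained sketch of the resulting decode-step computation on random tensors. The batch size, head count, and head dimensions are illustrative assumptions only (they are not read from any checkpoint or config), and the YaRN mscale correction to the softmax scale is ignored; the cache layout and operator ordering mirror the patched forward pass shown in the diff further down.

```python
import torch

# Illustrative sizes only (roughly DeepSeek-V2-Lite shaped); not taken from any checkpoint.
bsz, q_len, kv_len = 2, 1, 16            # decoding: 1 new token attending to 16 cached positions
num_heads = 16
kv_lora_rank = 512                        # width of the compressed KV latent
qk_nope_head_dim, qk_rope_head_dim = 128, 64
v_head_dim = 128
softmax_scale = (qk_nope_head_dim + qk_rope_head_dim) ** -0.5  # ignores any YaRN mscale factor

# Point 1: the cache holds only the compressed latent and the decoupled RoPE keys.
compressed_kv = torch.randn(bsz, kv_len, kv_lora_rank)   # stored in the "value_states" slot
k_pe = torch.randn(bsz, 1, kv_len, qk_rope_head_dim)     # stored in the "key_states" slot, shared across heads

# Query halves for the current step (q_pe is assumed to already carry RoPE).
q_nope = torch.randn(bsz, num_heads, q_len, qk_nope_head_dim)
q_pe = torch.randn(bsz, num_heads, q_len, qk_rope_head_dim)

# Point 2: split kv_b_proj.weight per head and absorb W^UK into the query.
kv_b_proj = torch.randn(num_heads, qk_nope_head_dim + v_head_dim, kv_lora_rank)
q_absorb = kv_b_proj[:, :qk_nope_head_dim, :]             # per-head W^UK
out_absorb = kv_b_proj[:, qk_nope_head_dim:, :]           # per-head W^UV
q_nope = torch.matmul(q_nope, q_absorb)                   # [bsz, num_heads, q_len, kv_lora_rank]

# Point 3: RoPE and non-RoPE score parts are computed separately and summed.
attn_weights = (
    torch.matmul(q_pe, k_pe.mT)
    + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)
) * softmax_scale                                          # [bsz, num_heads, q_len, kv_len]
attn_weights = attn_weights.softmax(dim=-1)

# Attend over the compressed latent, then apply W^UV last ("output absorption").
attn_output = torch.einsum("bhql,blc->bhqc", attn_weights, compressed_kv)
attn_output = torch.matmul(attn_output, out_absorb.mT)     # [bsz, num_heads, q_len, v_head_dim]
print(attn_output.shape)                                   # torch.Size([2, 16, 1, 128])
```

The per-head q_absorb/out_absorb slices play the role of W^UK and W^UV from the paper; absorbing them means no per-head key/value tensors of length kv_len are ever materialized.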
With these changes, the MLA module achieves up to a 20.4x speedup for a single request and 3.63x for a batch of 32 requests on an NVIDIA A100-PCIE-40GB GPU during the decoding phase, as well as 26.2x and 3.52x speedups on an NVIDIA GeForce RTX 4080 for single and batched requests, respectively.
A more detailed description of the modifications (in Chinese) can be found at https://zhuanlan.zhihu.com/p/700214123?utm_psn=1779287628619632640 and https://github.com/madsys-dev/deepseekv2-profile/blob/924174cb5dc11fad24bdaad3fd820ebf87506368/workspace/blog/optimizing-mla.md.
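Since only modeling_deepseek.py changes, the optimized attention path is picked up transparently when the checkpoint is loaded with remote code enabled. A minimal usage sketch; the repository id below is a placeholder, not an actual repo name:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "<namespace>/DeepSeek-V2-with-mla-cache"  # placeholder: point at the repo that ships this patch

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,   # required so this patched DeepseekV2Attention is used
    torch_dtype="auto",
    device_map="auto",
)

inputs = tokenizer("Multi-head latent attention", return_tensors="pt").to(model.device)
# During decoding, the cache now stores k_pe in the key slot and compressed_kv in the value slot.
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```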
modeling_deepseek.py (+18 -28)

@@ -822,17 +822,10 @@ class DeepseekV2Attention(nn.Module):
         compressed_kv, k_pe = torch.split(
             compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
         )
+        compressed_kv = self.kv_a_layernorm(compressed_kv)
         k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
-        kv = (
-            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
-            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-            .transpose(1, 2)
-        )

-        k_nope, value_states = torch.split(
-            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
-        )
-        kv_seq_len = value_states.shape[-2]
+        kv_seq_len = k_pe.shape[-2]
         if past_key_value is not None:
             if self.layer_idx is None:
                 raise ValueError(
@@ -841,27 +834,22 @@ class DeepseekV2Attention(nn.Module):
                     "with a layer index."
                 )
             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

+        cos, sin = self.rotary_emb(q_pe, seq_len=kv_seq_len)
         q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

-        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
-        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
-
-        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
-        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
         if past_key_value is not None:
             cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(
-                key_states, value_states, self.layer_idx, cache_kwargs
-            )
-
-        attn_weights = (
-            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
-        )
-
+            compressed_kv = compressed_kv.unsqueeze(1)
+            k_pe, compressed_kv = past_key_value.update(k_pe, compressed_kv, self.layer_idx, cache_kwargs)
+            compressed_kv = compressed_kv.squeeze(1)
+
+        kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
+        q_absorb = kv_b_proj[:, :self.qk_nope_head_dim,:]
+        out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :]
+
+        q_nope = torch.matmul(q_nope, q_absorb)
+        attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)) * self.softmax_scale
         if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
             raise ValueError(
                 f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
@@ -878,11 +866,13 @@ class DeepseekV2Attention(nn.Module):
         # upcast attention to fp32
         attn_weights = nn.functional.softmax(
             attn_weights, dim=-1, dtype=torch.float32
-        ).to(query_states.dtype)
+        ).to(q_pe.dtype)
         attn_weights = nn.functional.dropout(
             attn_weights, p=self.attention_dropout, training=self.training
         )
-        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv)
+
+        attn_output = torch.matmul(attn_output, out_absorb.mT)

         if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
             raise ValueError(
@@ -1902,4 +1892,4 @@ class DeepseekV2ForSequenceClassification(DeepseekV2PreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )