replace -1e4 masks
- README.md +1 -0
- modeling_lsg_distilbert.py +11 -7
README.md CHANGED
```diff
@@ -45,6 +45,7 @@ You can change various parameters like :
 * local block size (block_size=128)
 * sparse block size (sparse_block_size=128)
 * sparsity factor (sparsity_factor=2)
+* mask_first_token (mask first token since it is redundant with the first global token)
 * see config.json file
 
 Default parameters work well in practice. If you are short on memory, reduce block sizes, increase sparsity factor and remove dropout in the attention score matrix.
```
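These parameters are plain config attributes, so they can be overridden at load time. A rough usage sketch (the checkpoint name is illustrative, and LSG models ship custom modeling code, hence `trust_remote_code=True`):

```python
from transformers import AutoModel, AutoTokenizer

# Checkpoint name is illustrative; substitute the LSG DistilBERT checkpoint you actually use.
model = AutoModel.from_pretrained(
    "ccdv/lsg-distilbert-base-uncased-4096",
    trust_remote_code=True,   # LSG attention lives in custom code on the Hub
    block_size=128,           # local block size
    sparse_block_size=128,    # sparse block size
    sparsity_factor=2,        # sparsity factor
    mask_first_token=True,    # first token is redundant with the first global token
)
tokenizer = AutoTokenizer.from_pretrained("ccdv/lsg-distilbert-base-uncased-4096")
```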
modeling_lsg_distilbert.py CHANGED
```diff
@@ -227,7 +227,11 @@ class CausalAttentionProduct(nn.Module):
 
             # Add causal mask
             causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
-            causal_mask = torch.tril(
+            causal_mask = torch.tril(
+                torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
+                diagonal=-1
+                )
+            causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
             attention_scores[..., -causal_shape[0]:, -causal_shape[1]:] = causal_mask
 
             del attention_mask
```
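The rewritten block builds the causal mask additively: disallowed (future) positions get the most negative value representable in the score dtype instead of a hard-coded -1e4, and softmax then drives their weights to zero. A standalone sketch of the same pattern (shapes and names are illustrative, not the LSG internals):

```python
import torch

def causal_additive_mask(size: int, dtype: torch.dtype, device=None) -> torch.Tensor:
    # Strictly-upper-triangular (future) positions get the dtype minimum, the rest 0,
    # so the result can simply be added to raw attention scores.
    full = torch.full((size, size), torch.finfo(dtype).min, dtype=dtype, device=device)
    return torch.triu(full, diagonal=1)

scores = torch.randn(2, 4, 5, 5, dtype=torch.float16)      # (batch, heads, queries, keys)
probs = (scores + causal_additive_mask(5, scores.dtype)).softmax(dim=-1)
# each query now puts ~0 weight on keys to its right
```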
```diff
@@ -345,7 +349,7 @@ class LSGAttentionProduct(nn.Module):
 
         # Pad before block reshaping
         if is_attn_mask:
-            pad_value =
+            pad_value = torch.finfo(hidden_states.dtype).min
             hidden_states = hidden_states.transpose(-1, -2)
         else:
             pad_value = 0
```
```diff
@@ -378,7 +382,7 @@ class LSGAttentionProduct(nn.Module):
 
         # Pad before block reshaping
         if is_attn_mask:
-            pad_value =
+            pad_value = torch.finfo(hidden_states.dtype).min
             hidden_states = hidden_states.transpose(-1, -2)
         else:
             pad_value = 0
```
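In both of these spots the additive attention mask is padded up to a multiple of the block size before being reshaped into blocks; the padded key positions must themselves look masked, so the fill value becomes the dtype minimum rather than a fixed constant. A minimal illustration of that padding step, with made-up shapes:

```python
import torch
import torch.nn.functional as F

block_size = 128
additive_mask = torch.zeros(2, 1, 1, 250, dtype=torch.float16)    # 0 = keep, dtype-min = masked
pad = (-additive_mask.size(-1)) % block_size                      # 6 extra positions to reach 256

# Newly created positions correspond to no real token, so pad with the dtype minimum.
padded = F.pad(additive_mask, (0, pad), value=torch.finfo(additive_mask.dtype).min)
assert padded.size(-1) % block_size == 0
```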
```diff
@@ -511,7 +515,7 @@ class LSGSelfAttention(BaseSelfAttention):
         keys = keys.sum(dim=-2) / (mask + 1e-6)
         values = values.sum(dim=-2) / (mask + 1e-6)
 
-        mask =
+        mask = (1. - mask.clamp(0, 1)) * torch.finfo(mask.dtype).min
         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
 
     def get_sparse_tokens_with_stride(self, keys, values, mask):
```
```diff
@@ -576,7 +580,7 @@ class LSGSelfAttention(BaseSelfAttention):
         keys /= mask + 1e-8
         values /= mask + 1e-8
 
-        mask =
+        mask = (1. - mask.clamp(0, 1)) * torch.finfo(mask.dtype).min
 
         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
 
```
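In these two hunks `mask` appears to hold per-block token sums (it is used as a denominator just above), so it is first clamped to {0, 1} and only then converted into additive form. The same conversion in isolation, on a made-up tensor:

```python
import torch

# Assume 'counts' plays the role of the mask above: how many real tokens landed in each
# pooled block (0 means the block is entirely padding) -- an illustrative stand-in.
counts = torch.tensor([[0., 1., 3., 2., 0.]], dtype=torch.float16)

# clamp(0, 1) turns counts into a keep/drop indicator; the usual additive conversion follows:
# kept blocks -> 0, empty blocks -> dtype minimum.
additive = (1. - counts.clamp(0, 1)) * torch.finfo(counts.dtype).min
```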
```diff
@@ -871,7 +875,7 @@ class LSGTransformerBlock(nn.Module):
         # Self-Attention
         sa_output = self.attention(
             hidden_states=x,
-            attention_mask
+            attention_mask=torch.finfo(x.dtype).min*(1 - attn_mask).unsqueeze(1).unsqueeze(1),
             head_mask=head_mask,
             output_attentions=output_attentions,
         )
```
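This hunk hands the attention module a ready-made additive mask: the 0/1 padding mask is inverted, scaled by the dtype minimum, and reshaped to broadcast over heads and query positions. A small sketch of that conversion with illustrative shapes:

```python
import torch

x = torch.randn(2, 6, 8, dtype=torch.float16)                     # (batch, seq, hidden), illustrative
attn_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                          [1, 1, 1, 1, 1, 1]], dtype=x.dtype)      # 1 = real token, 0 = padding

# (batch, seq) -> (batch, 1, 1, seq): broadcasts over heads and query positions.
additive_mask = torch.finfo(x.dtype).min * (1 - attn_mask).unsqueeze(1).unsqueeze(1)

scores = torch.randn(2, 4, 6, 6, dtype=x.dtype)                    # (batch, heads, queries, keys)
probs = (scores + additive_mask).softmax(dim=-1)                   # padded keys get ~0 attention
```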
```diff
@@ -948,7 +952,7 @@ class LSGDistilBertModel(LSGDistilBertPreTrainedModel, DistilBertModel):
         n, t = inputs_.size()[:2]
 
         if attention_mask is None:
-            attention_mask = torch.ones(n, t, device=inputs_.device)
+            attention_mask = torch.ones(n, t, device=inputs_.device, dtype=inputs_.dtype)
         if self.mask_first_token:
             attention_mask[:,0] = 0
 
```
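The thread running through these hunks is that the hard-coded -1e4 fill values become `torch.finfo(dtype).min`, presumably so the mask is always the most negative value the score dtype can represent, whatever precision the model runs in. For reference:

```python
import torch

for dtype in (torch.float32, torch.float16, torch.bfloat16):
    print(dtype, torch.finfo(dtype).min)
# torch.float32  -> about -3.4e38
# torch.float16  -> -65504.0
# torch.bfloat16 -> about -3.39e38
```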