bos_token + readme

- README.md +12 -7
- modeling_lsg_bart.py +38 -11

README.md
CHANGED
@@ -71,26 +71,31 @@ model = AutoModel.from_pretrained("ccdv/lsg-bart-large-4096",
 
 ## Sparse selection type
 
-There are 5 different sparse selection patterns. The best type is task dependent. \
+There are 6 different sparse selection patterns. The best type is task dependent. \
+If `sparse_block_size=0` or `sparsity_type="none"`, only local attention is considered. \
 Note that for sequences with length < 2*block_size, the type has no effect.
-
-* sparsity_type="norm"
+* `sparsity_type="bos_pooling"` (new)
+    * weighted average pooling using the BOS token
+    * Works best in general, especially with a rather large sparsity_factor (8, 16, 32)
+    * Additional parameters:
+        * None
+* `sparsity_type="norm"`, select highest norm tokens
     * Works best for a small sparsity_factor (2 to 4)
     * Additional parameters:
         * None
-* sparsity_type="pooling"
+* `sparsity_type="pooling"`, use average pooling to merge tokens
     * Works best for a small sparsity_factor (2 to 4)
     * Additional parameters:
         * None
-* sparsity_type="lsh"
+* `sparsity_type="lsh"`, use the LSH algorithm to cluster similar tokens
     * Works best for a large sparsity_factor (4+)
     * LSH relies on random projections, thus inference may differ slightly with different seeds
     * Additional parameters:
         * lsg_num_pre_rounds=1, pre merge tokens n times before computing centroids
-* sparsity_type="stride"
+* `sparsity_type="stride"`, use a striding mechanism per head
     * Each head will use different tokens strided by sparsity_factor
     * Not recommended if sparsity_factor > num_heads
-* sparsity_type="block_stride"
+* `sparsity_type="block_stride"`, use a striding mechanism per head
     * Each head will use blocks of tokens strided by sparsity_factor
     * Not recommended if sparsity_factor > num_heads
 
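To try the new pattern, the selection type and factor can be overridden at load time, continuing the `from_pretrained` call shown in the hunk header. A minimal sketch (assuming the custom LSG modeling code on the Hub, hence `trust_remote_code=True`):

```python
from transformers import AutoModel

# Sketch: enable the new selection pattern when loading the model.
# bos_pooling is reported above to work best with a rather large sparsity_factor.
model = AutoModel.from_pretrained(
    "ccdv/lsg-bart-large-4096",
    trust_remote_code=True,       # LSG ships custom modeling code
    sparsity_type="bos_pooling",
    sparsity_factor=8,
)
```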
modeling_lsg_bart.py
CHANGED

@@ -55,9 +55,9 @@ class LSGBartConfig(BartConfig):
         self.sparsity_factor = sparsity_factor
         self.sparsity_type = sparsity_type
 
-        if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
+        if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride", "bos_pooling"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride', 'bos_pooling'], \
                 setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None
 
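The widened whitelist means `bos_pooling` no longer trips the fallback; anything unrecognized still degrades to `None` instead of raising. A standalone sketch of this validation pattern (hypothetical helper, not part of the repo):

```python
import logging

logger = logging.getLogger(__name__)

ALLOWED_SPARSITY_TYPES = [None, "none", "norm", "lsh", "pooling", "stride", "block_stride", "bos_pooling"]

def check_sparsity_type(sparsity_type):
    # Mirrors the guard above: invalid values fall back to None so the model
    # silently skips sparse attention rather than crashing at load time.
    if sparsity_type not in ALLOWED_SPARSITY_TYPES:
        logger.warning("sparsity_type %r not recognized, setting sparsity_type=None", sparsity_type)
        return None
    return sparsity_type

assert check_sparsity_type("bos_pooling") == "bos_pooling"  # accepted after this commit
assert check_sparsity_type("bos-pooling") is None           # typos degrade gracefully
```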
@@ -345,7 +345,7 @@ class LSGAttentionProduct(nn.Module):
         return x.reshape(*x.size()[:-2], n_blocks, -1, d)
 
 
-class LSGBartEncoderAttention(BaseSelfAttention):
+class LSGBartEncoderSelfAttention(BaseSelfAttention):
     '''
     Compute local attention with overlapping blocks
     Use global attention for tokens with highest norm
@@ -380,15 +380,16 @@ class LSGBartEncoderAttention(BaseSelfAttention):
             "lsh": self.get_sparse_tokens_with_lsh,
             "stride": self.get_sparse_tokens_with_stride,
             "block_stride": self.get_sparse_tokens_with_block_stride,
+            "bos_pooling": self.get_sparse_tokens_with_bos_pooling
         }
 
         self.sparsity_type = config.sparsity_type
-        self.get_sparse_elements = sparse_functions.get(self.sparsity_type, lambda x, y, z: (None, None, None))
+        self.get_sparse_elements = sparse_functions.get(self.sparsity_type, lambda w, x, y, z: (None, None, None))
 
         if config.sparsity_type == "lsh":
             self.lsh_num_pre_rounds = config.lsh_num_pre_rounds
 
-    def get_sparse_tokens_with_norm(self, keys, values, mask):
+    def get_sparse_tokens_with_norm(self, queries, keys, values, mask):
 
         if self.sparsity_factor == 1:
             return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
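The selectors live in a dict keyed by `sparsity_type`, with a no-op default for unknown types. Since every selector now also receives the queries, the fallback lambda grows from three to four parameters so the call site never branches. A toy illustration of the pattern (hypothetical stand-in function):

```python
def select_norm(queries, keys, values, mask):
    # stand-in for get_sparse_tokens_with_norm; all selectors share this signature
    return keys, values, mask

sparse_functions = {"norm": select_norm}

# An unrecognized type resolves to a 4-argument no-op matching the new call site:
# get_sparse_elements(query_layer, key_layer, value_layer, attention_mask)
get_sparse_elements = sparse_functions.get("unknown", lambda w, x, y, z: (None, None, None))
assert get_sparse_elements(1, 2, 3, 4) == (None, None, None)
```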
@@ -416,7 +417,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
 
         return keys, values, mask
 
-    def get_sparse_tokens_with_pooling(self, keys, values, mask):
+    def get_sparse_tokens_with_pooling(self, queries, keys, values, mask):
 
         if self.sparsity_factor == 1:
             return keys, values, mask.expand(-1, keys.size()[1], -1, -1)

@@ -439,7 +440,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
             mask *= torch.finfo(mask.dtype).min
         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
 
-    def get_sparse_tokens_with_stride(self, keys, values, mask):
+    def get_sparse_tokens_with_stride(self, queries, keys, values, mask):
 
         if self.sparsity_factor == 1:
             return keys, values, mask.expand(-1, keys.size()[1], -1, -1)

@@ -455,7 +456,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
 
         return keys, values, mask
 
-    def get_sparse_tokens_with_block_stride(self, keys, values, mask):
+    def get_sparse_tokens_with_block_stride(self, queries, keys, values, mask):
 
         if self.sparsity_factor == 1:
             return keys, values, mask.expand(-1, keys.size()[1], -1, -1)

@@ -475,11 +476,14 @@ class LSGBartEncoderAttention(BaseSelfAttention):
 
         return keys, values, mask
 
-    def get_sparse_tokens_with_lsh(self, keys, values, mask):
+    def get_sparse_tokens_with_lsh(self, queries, keys, values, mask):
 
         if self.sparsity_factor == 1:
             return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
 
+        if self.sparsity_factor == self.sparse_block_size:
+            return self.get_sparse_tokens_with_bos_pooling(queries, keys, values, mask)
+
         block_size = min(self.block_size, self.sparse_block_size)
         keys = self.chunk(keys, block_size)
         values = self.chunk(values, block_size)
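The new early exit in the LSH path appears to cover the degenerate case: every selector reduces the sequence by `sparsity_factor`, and when `sparsity_factor == sparse_block_size` each sparse block must collapse to exactly one token, which is what per-chunk BOS pooling already computes, so the clustering rounds can presumably be skipped. Back-of-the-envelope arithmetic with hypothetical sizes:

```python
# Hypothetical sizes, only to show the reduction arithmetic behind the early exit.
seq_len, sparse_block_size, sparsity_factor = 4096, 8, 8

tokens_kept = seq_len // sparsity_factor                  # 512 sparse tokens either way
tokens_per_block = sparse_block_size // sparsity_factor   # 1 token left per sparse block
assert tokens_per_block == 1  # nothing left for LSH to cluster inside a block
```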
@@ -526,6 +530,29 @@ class LSGBartEncoderAttention(BaseSelfAttention):
 
         return keys[..., :output_size, :], values[..., :output_size, :], mask[..., :output_size, :]
 
+    def get_sparse_tokens_with_bos_pooling(self, queries, keys, values, mask):
+
+        if self.sparsity_factor == 1:
+            return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
+
+        queries = queries.unsqueeze(-3)
+        mask = self.chunk(mask.transpose(-1, -2), self.sparsity_factor).transpose(-1, -2)
+        keys = self.chunk(keys, self.sparsity_factor)
+        values = self.chunk(values, self.sparsity_factor)
+
+        n, h, b, t, d = keys.size()
+        scores = (queries[..., :1, :] @ keys.transpose(-1, -2)) / math.sqrt(d)
+        if mask is not None:
+            scores = scores + mask
+
+        scores = torch.softmax(scores, dim=-1)
+        keys = scores @ keys
+        values = scores @ values
+        mask = mask.mean(dim=-1)
+        mask[mask != torch.finfo(mask.dtype).min] = 0
+
+        return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
+
     def forward(
         self,
         hidden_states,
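`get_sparse_tokens_with_bos_pooling` merges each chunk of `sparsity_factor` keys/values into a single token, weighted by how strongly the BOS query attends to that chunk. A self-contained re-implementation sketch on dummy tensors (sizes are assumptions; masking omitted for brevity):

```python
import math
import torch

# Dummy sizes: batch, heads, sequence length, head dim (assumed for illustration)
n, h, t, d = 2, 12, 16, 64
sparsity_factor = 4

queries = torch.randn(n, h, t, d)
keys = torch.randn(n, h, t, d)
values = torch.randn(n, h, t, d)

bos_query = queries[..., :1, :]                      # (n, h, 1, d): first (BOS) token
keys = keys.reshape(n, h, -1, sparsity_factor, d)    # (n, h, b, sf, d) chunks
values = values.reshape(n, h, -1, sparsity_factor, d)

# Attention of the BOS query over each chunk of keys
scores = bos_query.unsqueeze(-3) @ keys.transpose(-1, -2) / math.sqrt(d)  # (n, h, b, 1, sf)
weights = torch.softmax(scores, dim=-1)

pooled_keys = (weights @ keys).squeeze(-2)           # (n, h, b, d): one token per chunk
pooled_values = (weights @ values).squeeze(-2)
assert pooled_keys.shape == (n, h, t // sparsity_factor, d)
```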
@@ -595,7 +622,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
         sparse_key, sparse_value, sparse_mask = (None, None, None)
 
         if self.sparse_block_size and self.sparsity_factor > 0:
-            sparse_key, sparse_value, sparse_mask = self.get_sparse_elements(key_layer, value_layer, attention_mask)
+            sparse_key, sparse_value, sparse_mask = self.get_sparse_elements(query_layer, key_layer, value_layer, attention_mask)
 
         # Expand masks on heads
         attention_mask = attention_mask.expand(-1, h, -1, -1)

@@ -632,7 +659,7 @@ class LSGBartEncoderLayer(BartEncoderLayer):
     def __init__(self, config):
 
         super().__init__(config)
-        self.self_attn = LSGBartEncoderAttention(
+        self.self_attn = LSGBartEncoderSelfAttention(
            config=config,
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,