fix for 4.23

modeling_lsg_mbart.py  (+20 -94)
@@ -15,7 +15,7 @@ AUTO_MAP = {
 
 class LSGMBartConfig(MBartConfig):
     """
-    This class overrides :class:`~transformers.…
+    This class overrides :class:`~transformers.MBartConfig`. Please check the superclass for the appropriate
     documentation alongside usage examples.
     """
 
@@ -57,7 +57,8 @@ class LSGMBartConfig(MBartConfig):
 
         if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'],…
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None
 
         if self.sparsity_type in ["stride", "block_stride"]:
@@ -73,7 +74,7 @@ class LSGMBartConfig(MBartConfig):
             self.num_global_tokens = 1
         elif self.num_global_tokens > 512:
             logger.warning(
-                "[WARNING CONFIG]: num_global_tokens > 512 is not…
+                "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
             )
             self.num_global_tokens = 512
 
@@ -81,6 +82,16 @@ class LSGMBartConfig(MBartConfig):
         assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
         assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
 
+        if self.mask_first_token and not pool_with_global:
+            logger.warning(
+                "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
+            self.pool_with_global = True
+
+        if hasattr(self, "position_embedding_type"):
+            if self.position_embedding_type != "absolute":
+                logger.warning(
+                    "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")
+
 
 class BaseSelfAttention(nn.Module):
 
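The guards above coerce incompatible settings instead of raising. A minimal sketch of the resulting behaviour, assuming modeling_lsg_mbart.py is importable locally and that the constructor accepts these options as keyword arguments (as the guard code implies):

from modeling_lsg_mbart import LSGMBartConfig  # assumes the file sits on the Python path

config = LSGMBartConfig(
    sparsity_type="foo",      # not in the allowed list -> warning, reset to None
    num_global_tokens=1024,   # > 512 -> warning, clamped to 512
    mask_first_token=True,
    pool_with_global=False,   # incompatible with mask_first_token -> forced to True
)
print(config.sparsity_type, config.num_global_tokens, config.pool_with_global)
# Expected, per the guards above: None 512 True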
@@ -557,8 +568,6 @@ class LSGMBartEncoderAttention(BaseSelfAttention):
                 attention_mask=attention_mask
                 )
 
-            if head_mask is not None:
-                context_layer = context_layer * head_mask[:, :, :1, :1]
             return self.reshape_output(context_layer)
 
         # Split input into global tokens and other tokens
@@ -606,8 +615,6 @@ class LSGMBartEncoderAttention(BaseSelfAttention):
 
         # Merge global and local-sparse tokens
         context_layer = torch.cat([bos, context_layer], dim=-2)
-        if head_mask is not None:
-            context_layer = context_layer * head_mask[:, :, :1, :1]
         context_layer = self.reshape_output(context_layer)
 
         return context_layer
@@ -631,27 +638,6 @@ class LSGMBartEncoderLayer(MBartEncoderLayer):
         )
 
 
-class LSGMBartDecoderLayer(MBartDecoderLayer):
-
-    def __init__(self, config):
-
-        super().__init__(config)
-
-
-class LSGMBartClassificationHead(MBartClassificationHead):
-    """Head for sentence-level classification tasks."""
-
-    def __init__(
-        self,
-        input_dim,
-        inner_dim,
-        num_classes,
-        pooler_dropout,
-    ):
-
-        super().__init__(input_dim, inner_dim, num_classes, pooler_dropout)
-
-
 class LSGMBartPretrainedModel(MBartPreTrainedModel):
 
     config_class = LSGMBartConfig
@@ -659,7 +645,7 @@ class LSGMBartPretrainedModel(MBartPreTrainedModel):
     supports_gradient_checkpointing = True
 
     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (MBartDecoder, MBartEncoder,…
+        if isinstance(module, (MBartDecoder, MBartEncoder, LSGMBartEncoder)):
             module.gradient_checkpointing = value
 
 
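After the fix, the isinstance tuple lists MBartDecoder, MBartEncoder and LSGMBartEncoder. Enabling checkpointing on a loaded model toggles the flag on every matching module; a rough usage sketch (the checkpoint name is hypothetical, and remote code must be trusted):

from transformers import AutoModelForSeq2SeqLM

# Hypothetical LSG-MBart checkpoint; any repo shipping this modeling file behaves the same.
model = AutoModelForSeq2SeqLM.from_pretrained("ccdv/lsg-mbart-large-4096", trust_remote_code=True)
model.gradient_checkpointing_enable()               # routed through _set_gradient_checkpointing
print(model.model.encoder.gradient_checkpointing)   # True for the LSG encoder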
@@ -674,7 +660,7 @@ class LSGMBartEncoder(LSGMBartPretrainedModel, MBartEncoder):
 
     def __init__(self, config, embed_tokens=None):
 
-
+        LSGMBartPretrainedModel.__init__(self, config)
         self.dropout = config.dropout
         self.layerdrop = config.encoder_layerdrop
 
@@ -811,7 +797,7 @@ class LSGMBartEncoder(LSGMBartPretrainedModel, MBartEncoder):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
 
-        embed_pos = self.embed_positions(…
+        embed_pos = self.embed_positions(inputs_embeds)
         hidden_states = inputs_embeds + embed_pos
 
         # Add global tokens
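The positional-embedding call now receives inputs_embeds directly. This matches a 4.23-style MBartLearnedPositionalEmbedding.forward that reads only the first two dimensions of its argument, so an embedding tensor yields the same (batch, seq_len) a token-id tensor would; that signature detail is an assumption about the upstream class, not something visible in this diff:

import torch

inputs_embeds = torch.zeros(2, 4096, 1024)   # (batch, seq_len, d_model)
bsz, seq_len = inputs_embeds.shape[:2]       # all a shape-based positional embedding needs
print(bsz, seq_len)                          # 2 4096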
@@ -884,44 +870,6 @@ class LSGMBartEncoder(LSGMBartPretrainedModel, MBartEncoder):
         )
 
 
-class LSGMBartDecoder(LSGMBartPretrainedModel, MBartDecoder):
-    """
-    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`LSGBartDecoderLayer`
-    Args:
-        config: BartConfig
-        embed_tokens (nn.Embedding): output embedding
-    """
-
-    def __init__(self, config, embed_tokens=None):
-
-        LSGMBartPretrainedModel.__init__(self, config)
-
-        self.dropout = config.dropout
-        self.layerdrop = config.decoder_layerdrop
-        self.padding_idx = config.pad_token_id
-        self.max_target_positions = config.max_position_embeddings
-        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
-        self.adaptive = config.adaptive
-
-        if embed_tokens is not None:
-            self.embed_tokens = embed_tokens
-        else:
-            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
-
-        self.embed_positions = MBartLearnedPositionalEmbedding(
-            config.max_position_embeddings,
-            config.d_model,
-        )
-        self.layers = nn.ModuleList([LSGMBartDecoderLayer(config) for _ in range(config.decoder_layers)])
-        self.layernorm_embedding = nn.LayerNorm(config.d_model)
-        self.layer_norm = nn.LayerNorm(config.d_model)
-
-        self.gradient_checkpointing = False
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-
 class LSGMBartModel(LSGMBartPretrainedModel, MBartModel):
 
     def __init__(self, config):
@@ -935,7 +883,7 @@ class LSGMBartModel(LSGMBartPretrainedModel, MBartModel):
         self.num_global_tokens = config.num_global_tokens
 
         self.encoder = LSGMBartEncoder(config, self.shared)
-        self.decoder =…
+        self.decoder = MBartDecoder(config, self.shared)
 
         # Initialize weights and apply final processing
         self.post_init()
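With the custom decoder classes gone, the seq2seq model now pairs the LSG encoder with the stock MBartDecoder. A quick sanity sketch, under the same local-import assumption as above:

from modeling_lsg_mbart import LSGMBartConfig, LSGMBartModel

model = LSGMBartModel(LSGMBartConfig())   # default config
print(type(model.encoder).__name__)       # LSGMBartEncoder (long-input attention)
print(type(model.decoder).__name__)       # MBartDecoder (unchanged transformers decoder)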
@@ -1051,7 +999,7 @@ class LSGMBartForSequenceClassification(LSGMBartPretrainedModel, MBartForSequenceClassification):
 
         LSGMBartPretrainedModel.__init__(self, config, **kwargs)
         self.model = LSGMBartModel(config)
-        self.classification_head =…
+        self.classification_head = MBartClassificationHead(
             config.d_model,
             config.d_model,
             config.num_labels,
@@ -1075,35 +1023,13 @@ class LSGMBartForQuestionAnswering(LSGMBartPretrainedModel, MBartForQuestionAnswering):
 
         self.model._init_weights(self.qa_outputs)
 
-
-class LSGMBartDecoderWrapper(LSGMBartPretrainedModel):
-    """
-    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
-    used in combination with the :class:`~transformers.EncoderDecoderModel` framework.
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.decoder = LSGMBartDecoder(config)
-
-    def forward(self, *args, **kwargs):
-        return self.decoder(*args, **kwargs)
-
 
 class LSGMBartForCausalLM(LSGMBartPretrainedModel, MBartForCausalLM):
 
     def __init__(self, config):
 
-        config = copy.deepcopy(config)
-        config.is_decoder = True
-        config.is_encoder_decoder = False
         LSGMBartPretrainedModel.__init__(self, config)
-        self…
-
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
+        MBartForCausalLM.__init__(self, config)
 
 
 def str_to_class(classname):
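The causal-LM constructor now defers to MBartForCausalLM.__init__, which already performs the setup deleted here (deep-copying the config, flagging it as decoder-only, building the decoder wrapper and the lm_head), so the duplicated code could go. A small check, again assuming the module is importable locally:

from modeling_lsg_mbart import LSGMBartConfig, LSGMBartForCausalLM

lm = LSGMBartForCausalLM(LSGMBartConfig())
print(lm.config.is_decoder)                             # True, set inside MBartForCausalLM.__init__
print(lm.lm_head.out_features == lm.config.vocab_size)  # True, lm_head built by the parent class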