Qingsong Lv committed
Commit e06f497 · 1 Parent(s): cde457b

fix mask and position bug for batch generation

Files changed (1)
  1. modeling_chatglm.py +27 -6
modeling_chatglm.py CHANGED
@@ -662,6 +662,12 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         """Initialize the weights."""
         return
 
+    def get_pad_length(self, seq):
+        l = 0
+        while l < len(seq) and seq[l] == self.config.pad_token_id:
+            l += 1
+        return l
+
     def get_masks(self, input_ids, device):
         batch_size, seq_length = input_ids.shape
         context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
@@ -669,6 +675,10 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         attention_mask.tril_()
         for i, context_length in enumerate(context_lengths):
             attention_mask[i, :, :context_length] = 1
+        pad_lengths = [self.get_pad_length(seq.tolist()) for seq in input_ids]
+        for i, pad_length in enumerate(pad_lengths):
+            attention_mask[i, :, :pad_length] = 0
+            attention_mask[i, :pad_length, :] = 0
         attention_mask.unsqueeze_(1)
         attention_mask = (attention_mask < 0.5).bool()
 
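The padding fix above can be exercised on its own. Below is a minimal standalone sketch of the new masking logic, assuming left-padded batches; PAD and BOS are illustrative stand-ins for config.pad_token_id and config.bos_token_id, and the torch.ones initialisation mirrors the unchanged get_masks context that the diff does not show. Pad rows and pad columns are both zeroed, so padded positions neither attend nor get attended to.

# Minimal standalone sketch of the padding-aware mask, assuming left-padded
# batches. PAD and BOS are illustrative stand-ins for config.pad_token_id and
# config.bos_token_id.
import torch

PAD, BOS = 0, 9

def get_pad_length(seq, pad_token_id=PAD):
    # Number of leading pad tokens in a left-padded sequence.
    l = 0
    while l < len(seq) and seq[l] == pad_token_id:
        l += 1
    return l

def get_masks(input_ids, device="cpu"):
    batch_size, seq_length = input_ids.shape
    context_lengths = [seq.tolist().index(BOS) for seq in input_ids]
    attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
    attention_mask.tril_()
    for i, context_length in enumerate(context_lengths):
        attention_mask[i, :, :context_length] = 1   # bidirectional attention over the prompt
    # The fix: padding rows/columns must not attend or be attended to.
    pad_lengths = [get_pad_length(seq.tolist()) for seq in input_ids]
    for i, pad_length in enumerate(pad_lengths):
        attention_mask[i, :, :pad_length] = 0
        attention_mask[i, :pad_length, :] = 0
    attention_mask.unsqueeze_(1)
    return (attention_mask < 0.5).bool()            # True = masked out

batch = torch.tensor([[PAD, PAD, 5, 6, BOS, 7],     # shorter prompt, left-padded
                      [4, 5, 6, 7, BOS, 8]])        # full-length prompt
mask = get_masks(batch)
print(mask.shape)                      # torch.Size([2, 1, 6, 6])
print(mask[0, 0, :, :2].all().item())  # True: pad columns of the padded row are fully masked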
 
@@ -676,16 +686,22 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
 
     def get_position_ids(self, input_ids, mask_positions, device, gmask=False):
         batch_size, seq_length = input_ids.shape
+        pad_lengths = [self.get_pad_length(seq.tolist()) for seq in input_ids]
         context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
         if self.position_encoding_2d:
-            position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
-            for i, context_length in enumerate(context_lengths):
-                position_ids[i, context_length:] = mask_positions[i]
+            position_ids = [torch.arange(seq_length-pad_length, dtype=torch.long, device=device) for pad_length in pad_lengths]
+            for i, (context_length, pad_length) in enumerate(zip(context_lengths, pad_lengths)):
+                position_ids[i][context_length-pad_length:] = mask_positions[i] - pad_length
             block_position_ids = [torch.cat((
                 torch.zeros(context_length, dtype=torch.long, device=device),
                 torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
             )) for context_length in context_lengths]
             block_position_ids = torch.stack(block_position_ids, dim=0)
+            position_ids = [torch.cat((
+                torch.zeros(pad_length, dtype=torch.long, device=device),
+                range_pos
+            )) for pad_length, range_pos in zip(pad_lengths, position_ids)]
+            position_ids = torch.stack(position_ids, dim=0)
             position_ids = torch.stack((position_ids, block_position_ids), dim=1)
         else:
             position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
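The same toy batch can be pushed through the corrected 2D position ids. The sketch below reuses the illustrative PAD/BOS ids and an assumed mask_positions list (in the model this is the per-sequence index of the [MASK]/[gMASK] token); expected output is shown in the trailing comment. The positions assigned to the real tokens no longer depend on how much left padding a sequence carries, which is the point of the fix.

# Sketch of the corrected 2D position ids for a left-padded batch. PAD/BOS and
# mask_positions are illustrative; get_pad_length is the same helper as above.
import torch

PAD, BOS = 0, 9

def get_pad_length(seq, pad_token_id=PAD):
    l = 0
    while l < len(seq) and seq[l] == pad_token_id:
        l += 1
    return l

def get_position_ids_2d(input_ids, mask_positions, device="cpu"):
    batch_size, seq_length = input_ids.shape
    pad_lengths = [get_pad_length(seq.tolist()) for seq in input_ids]
    context_lengths = [seq.tolist().index(BOS) for seq in input_ids]
    # First channel: positions counted from the first real token, clamped to the
    # (pad-corrected) mask position once the prompt ends.
    position_ids = [torch.arange(seq_length - pad_length, dtype=torch.long, device=device)
                    for pad_length in pad_lengths]
    for i, (context_length, pad_length) in enumerate(zip(context_lengths, pad_lengths)):
        position_ids[i][context_length - pad_length:] = mask_positions[i] - pad_length
    # Second channel: 0 over the prompt, then 1, 2, ... for generated tokens.
    block_position_ids = [torch.cat((
        torch.zeros(context_length, dtype=torch.long, device=device),
        torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
    )) for context_length in context_lengths]
    block_position_ids = torch.stack(block_position_ids, dim=0)
    # Pad the first channel back to seq_length with zeros in the pad slots.
    position_ids = [torch.cat((torch.zeros(pad_length, dtype=torch.long, device=device), p))
                    for pad_length, p in zip(pad_lengths, position_ids)]
    position_ids = torch.stack(position_ids, dim=0)
    return torch.stack((position_ids, block_position_ids), dim=1)   # (batch, 2, seq_length)

batch = torch.tensor([[PAD, PAD, 5, 6, BOS, 7],
                      [4, 5, 6, 7, BOS, 8]])
mask_positions = [3, 3]   # assumed [gMASK] index in each row
print(get_position_ids_2d(batch, mask_positions).tolist())
# [[[0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 2]],
#  [[0, 1, 2, 3, 3, 3], [0, 0, 0, 0, 1, 2]]]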
 
@@ -1094,15 +1110,20 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             if attention_mask is not None and attention_mask.dtype == torch.bool:
                 attention_mask = attention_mask[:, :, -1:]
             else:
-                attention_mask = None
+                attention_mask = self.get_masks(
+                    input_ids,
+                    device=input_ids.device
+                )
+                attention_mask[:, :, -1:]
             if position_ids is not None:
                 position_ids = position_ids[..., -1:]
             else:
+                pad_lengths = [self.get_pad_length(seq.tolist()) for seq in input_ids]
                 context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs]
                 if self.position_encoding_2d:
                     position_ids = torch.tensor(
-                        [[mask_position, seq_length - context_length] for mask_position, context_length in
-                         zip(mask_positions, context_lengths)], dtype=torch.long, device=input_ids.device).unsqueeze(-1)
+                        [[mask_position - pad_length, seq_length - context_length] for pad_length, mask_position, context_length in
+                         zip(pad_lengths, mask_positions, context_lengths)], dtype=torch.long, device=input_ids.device).unsqueeze(-1)
                 else:
                     position_ids = torch.tensor([mask_position for mask_position in mask_positions], dtype=torch.long,
                                                 device=input_ids.device).unsqueeze(-1)
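During incremental decoding, prepare_inputs_for_generation only needs position ids for the newest token, and the else branch above now subtracts the pad length from the mask position. A short sketch of that per-step computation, again with made-up sequences and [gMASK] indices:

# Per-step position ids for incremental decoding (2D case), assuming the same
# illustrative PAD/BOS ids; seqs and mask_positions are made-up examples.
import torch

PAD, BOS = 0, 9
seqs = [[PAD, PAD, 5, 6, 3, BOS, 7, 11],   # left-padded prompt + 2 generated tokens
        [4, 5, 6, 7, 2, 3, BOS, 8]]        # unpadded prompt + 1 generated token
seq_length = len(seqs[0])
mask_positions = [4, 5]                    # assumed [gMASK] index per sequence

def get_pad_length(seq):
    l = 0
    while l < len(seq) and seq[l] == PAD:
        l += 1
    return l

pad_lengths = [get_pad_length(seq) for seq in seqs]
context_lengths = [seq.index(BOS) for seq in seqs]

# Column 0: pad-corrected mask position; column 1: number of tokens at or after
# <bos>, i.e. the block position of the newest token.
position_ids = torch.tensor(
    [[mask_position - pad_length, seq_length - context_length]
     for pad_length, mask_position, context_length in zip(pad_lengths, mask_positions, context_lengths)],
    dtype=torch.long).unsqueeze(-1)
print(position_ids.squeeze(-1).tolist())   # [[2, 3], [5, 2]]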