Commit 695bee3 · Update modeling_phi.py
Parent(s): 6ac85d6

modeling_phi.py · CHANGED · +6 -1
@@ -355,8 +355,10 @@ class SelfAttention(nn.Module):
         key_padding_mask: Optional[torch.BoolTensor] = None,
         **kwargs,
     ) -> torch.FloatTensor:
+        print(qkv.shape)
         batch_size, seqlen = qkv.shape[0], qkv.shape[1]
         q, k, v = qkv.unbind(dim=2)
+        print(q.shape, k.shape, v.shape)
 
         q = q.to(torch.float32)
         k = k.to(torch.float32)
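The additions in this commit are all shape-debugging prints. The two in this hunk report the packed-QKV layout; a minimal sketch of what they would emit, assuming the (batch_size, seqlen, 3, n_heads, head_dim) packing implied by `unbind(dim=2)` (sizes are hypothetical, not from the model):

import torch

# Hypothetical sizes, chosen only to illustrate the shapes the new prints report.
batch_size, seqlen, n_heads, head_dim = 2, 16, 4, 32
qkv = torch.randn(batch_size, seqlen, 3, n_heads, head_dim)

print(qkv.shape)                  # torch.Size([2, 16, 3, 4, 32])
q, k, v = qkv.unbind(dim=2)       # split the packed q/k/v dimension
print(q.shape, k.shape, v.shape)  # each torch.Size([2, 16, 4, 32])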
@@ -367,6 +369,7 @@ class SelfAttention(nn.Module):
         # Autocast is manually disabled to avoid `torch.einsum` performing the operation
         # using float16, which might lead to overflow
         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
+        print(scores.shape)
 
         if key_padding_mask is not None:
             padding_mask = torch.full((batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device)
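The comment retained in this hunk explains the float32 upcast of q and k: float16 saturates at 65504, so attention logits computed in half precision can overflow. A standalone illustration (values arbitrary, not from the commit):

import torch

# float16 cannot represent values above 65504, so this dot product
# overflows to inf; the same computation in float32 stays finite.
x = torch.full((4096,), 4.0, dtype=torch.float16)
print((x * x).sum())                  # tensor(inf, dtype=torch.float16)
print((x.float() * x.float()).sum())  # tensor(65536.)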
@@ -376,13 +379,15 @@ class SelfAttention(nn.Module):
 
         if causal:
             causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
+            print(causal_mask.shape)
             scores = scores + causal_mask.to(dtype=scores.dtype)
 
         attention = torch.softmax(scores, dim=-1).to(v.dtype)
         attention = self.drop(attention)
 
         output = torch.einsum("bhts,bshd->bthd", attention, v)
-
+        print(output.shape)
+
         return output
 
 
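For reference, the mask built by the `torch.triu(..., 1)` line above is zero on and below the diagonal and -10000.0 strictly above it, so the subsequent softmax effectively zeroes out attention to future positions. For a toy seqlen of 4:

import torch

causal_mask = torch.triu(torch.full((4, 4), -10000.0), 1)
print(causal_mask.shape)  # torch.Size([4, 4]) -- the shape the new print reports
print(causal_mask)
# tensor([[     0., -10000., -10000., -10000.],
#         [     0.,      0., -10000., -10000.],
#         [     0.,      0.,      0., -10000.],
#         [     0.,      0.,      0.,      0.]])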
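Taken together, under the hypothetical sizes used above (batch_size=2, seqlen=16, n_heads=4, head_dim=32), the five new prints would report:

# qkv:         torch.Size([2, 16, 3, 4, 32])
# q, k, v:     torch.Size([2, 16, 4, 32]) each
# scores:      torch.Size([2, 4, 16, 16])   -- "bthd,bshd->bhts": (batch, heads, query_pos, key_pos)
# causal_mask: torch.Size([16, 16])
# output:      torch.Size([2, 16, 4, 32])   -- "bhts,bshd->bthd": back to (batch, seq, heads, head_dim)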