3D-GRAND / llava /model /bbox_head.py
jedyang97's picture
initial demo
947767a
import torch
import torch.nn as nn
from llava.model.multimodal_encoder.three_detr_model.models.transformer import (
TransformerEncoder,
TransformerEncoderLayer,
TransformerDecoder,
TransformerDecoderLayer,
)
from torch.nn.utils.rnn import pad_sequence
from llava.model.multimodal_encoder.mask3d_model.position_embedding import (
PositionEmbeddingCoordsSine,
)
from torch.nn.init import xavier_uniform_
class SimpleBBoxHead(nn.Module):
def __init__(
self,
lm_feat_dim_in: int,
vision_feat_dim_in: int,
num_vision_feat: int,
dim_feedforward: int = 1024,
):
super().__init__()
self.activation = nn.ReLU()
# # round up to the nearest multiple of 4
# new_vision_feat_dim_in = (vision_feat_dim_in + 3) // 4 * 4
# self.vision_projection_mlp = nn.Sequential(
# nn.Linear(vision_feat_dim_in, new_vision_feat_dim_in),
# self.activation,
# nn.Linear(new_vision_feat_dim_in, new_vision_feat_dim_in),
# self.activation,
# nn.Linear(new_vision_feat_dim_in, new_vision_feat_dim_in),
# )
# encoder_layer = TransformerEncoderLayer(
# d_model=new_vision_feat_dim_in,
# nhead=4,
# dim_feedforward=dim_feedforward,
# dropout=0.0,
# activation="relu",
# normalize_before=False,
# )
# self.encoder = TransformerEncoder(encoder_layer=encoder_layer, num_layers=1)
self.box_mlp = nn.Sequential(
nn.Linear(vision_feat_dim_in * num_vision_feat + lm_feat_dim_in, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, 6),
)
def forward(
self,
grd_token_hidden_states_list: list[torch.Tensor],
vision_features_before_mm_projection: torch.Tensor,
):
"""_summary_
Args:
grd_token_hidden_states_list (list[torch.Tensor]): each element in this list
contains the hidden states of the ground tokens in one sample, list[[varying N, D]]
vision_features_before_mm_projection (torch.Tensor): [B, num_latents, d_latents]
Returns:
_type_: _description_
"""
# pre_encoder_vision_feat = self.vision_projection_mlp(
# vision_features_before_mm_projection
# ) # (B, num_latents, new_vision_feat_dim_in)
# # get padding mask by checking where zero vectors are
# src_key_padding_mask = vision_features_before_mm_projection.eq(0).all(
# dim=-1
# ) # (B, num_latents)
# # nn.MultiHeadAttention in encoder expects npoints x batch x channel features
# # note that vision_features_before_mm_projection already contains positional embeddings
# _, encoder_output, _ = self.encoder(
# src=pre_encoder_vision_feat.permute(1, 0, 2),
# src_key_padding_mask=src_key_padding_mask,
# ) # [num_latents, B, d_latents]
# encoder_output = encoder_output.permute(1, 0, 2) # [B, num_latents, d_latents]
bbox_preds = []
for batch_idx, grd_token_hidden_states in enumerate(grd_token_hidden_states_list):
# vision_feat = encoder_output[batch_idx].flatten() # (1024 * 96,)
vision_feat = vision_features_before_mm_projection[batch_idx].flatten() # (1024 * 96,)
for i in range(len(grd_token_hidden_states)):
langauge_feat = grd_token_hidden_states[i] # (D,)
concat_feat = torch.cat((vision_feat, langauge_feat), dim=-1)
bbox_pred = self.box_mlp(concat_feat)
bbox_preds.append(bbox_pred)
bbox_preds = torch.stack(bbox_preds, dim=0) # (N, 6)
return bbox_preds
class BBoxHead(nn.Module):
"""A simple MLP head for bounding box regression"""
def __init__(self, lm_feat_dim_in: int, vision_feat_dim_in: int, dim_feedforward: int = 128):
super().__init__()
encoder_layer = TransformerEncoderLayer(
d_model=vision_feat_dim_in,
nhead=4,
dim_feedforward=dim_feedforward,
dropout=0.0,
activation="relu",
# normalize_before=False,
)
self.encoder = TransformerEncoder(encoder_layer=encoder_layer, num_layers=4)
decoder_layer = TransformerDecoderLayer(
d_model=vision_feat_dim_in,
nhead=4,
dim_feedforward=dim_feedforward,
dropout=0.0,
normalize_before=False,
)
self.decoder = TransformerDecoder(
decoder_layer=decoder_layer, num_layers=4, return_intermediate=False
)
self.language_projection = nn.Sequential(
nn.Linear(lm_feat_dim_in, vision_feat_dim_in),
# nn.ReLU(),
# nn.Linear(256, 256),
# nn.ReLU(),
# nn.Linear(256, vision_feat_dim_in),
)
self.activation = nn.GELU()
self.box_mlp = nn.Sequential(
nn.Linear(vision_feat_dim_in, 256),
self.activation,
nn.Linear(256, 256),
self.activation,
nn.Linear(256, 6),
)
def forward(
self,
grd_token_hidden_states_list: list[torch.Tensor],
vision_features_before_mm_projection: torch.Tensor,
):
"""_summary_
Args:
grd_token_hidden_states_list (list[torch.Tensor]): each element in this list
contains the hidden states of the ground tokens in one sample, list[[varying N, D]]
vision_features_before_mm_projection (torch.Tensor): [B, num_latents, d_latents]
Returns:
_type_: _description_
"""
# nn.MultiHeadAttention in encoder expects npoints x batch x channel features
# note that vision_features_before_mm_projection already contains positional embeddings
_, encoder_output, _ = self.encoder(
src=vision_features_before_mm_projection.permute(1, 0, 2)
) # [num_latents, B, d_latents]
# we need to mask out the attention between different ground tokens
# because each ground token is independent of each other
# Pad the list of hidden states to the longest sample
grd_token_hidden_states_padded = pad_sequence(
grd_token_hidden_states_list, batch_first=True, padding_value=0
) # (B, N', D), where N' is the number of ground tokens in the sample with the most ground tokens in the batch
# Create a mask for the padding tokens, True means there will be no attention
tgt_key_padding_mask = grd_token_hidden_states_padded.eq(0).all(dim=-1) # (B, N')
tgt_mask = self.create_diag_mask(grd_token_hidden_states_padded.shape[1]).to(
grd_token_hidden_states_padded.device
) # (N', N')
# decoder expects: npoints x batch x channel
language_projected = self.language_projection(
grd_token_hidden_states_padded
) # (B, N', d_latents)
decoder_output, decoder_attns = self.decoder(
tgt=language_projected.permute(1, 0, 2), # [N', B, d_latents]
memory=encoder_output,
tgt_mask=tgt_mask,
tgt_key_padding_mask=tgt_key_padding_mask,
) # output, attns, output shape: [N', B, d_latents]
# predict the bounding boxes
bbox_preds = self.box_mlp(decoder_output) # (N', B, 6)
# flatten the first two dimensions, remove padded locations
bbox_preds = bbox_preds.permute(1, 0, 2) # (B, N', 6)
# discard the padded locations
bbox_preds = bbox_preds[~tgt_key_padding_mask] # (num_boxes_in_batch, 6)
return bbox_preds
@staticmethod
def create_diag_mask(size):
# for transformer, a binary ``True`` value indicates that the corresponding position is NOT
# allowed to attend, while a ``False`` value indicates that the position is allowed to attend.
mask = torch.ones(size, size, dtype=torch.bool)
mask.fill_diagonal_(0)
return mask
class BBoxHeadForGroundTruthBboxRegressionV2(nn.Module):
"""A simple MLP head for bounding box regression"""
def __init__(
self,
lm_feat_dim_in: int,
vision_feat_dim_in: int,
num_vision_feat: int,
dim_feedforward: int = 1024,
):
super().__init__()
# round up to the nearest multiple of 4
new_vision_feat_dim_in = (vision_feat_dim_in + 3) // 4 * 4
self.vision_projection_mlp = nn.Sequential(
nn.Linear(vision_feat_dim_in, new_vision_feat_dim_in),
)
self.activation = nn.ReLU()
self.language_projection_mlp = nn.Sequential(
nn.Linear(lm_feat_dim_in, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, new_vision_feat_dim_in),
)
encoder_layer = TransformerEncoderLayer(
d_model=new_vision_feat_dim_in,
nhead=4,
dim_feedforward=dim_feedforward,
dropout=0.0,
activation="relu",
normalize_before=True,
)
self.encoder = TransformerEncoder(encoder_layer=encoder_layer, num_layers=4)
self.activation = nn.ReLU()
self.box_mlp = nn.Sequential(
nn.Linear(new_vision_feat_dim_in, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, 6),
)
def forward(
self,
grd_token_hidden_states_list: list[torch.Tensor],
vision_features_before_mm_projection: torch.Tensor,
):
"""_summary_
Args:
grd_token_hidden_states_list (list[torch.Tensor]): each element in this list
contains the hidden states of the ground tokens in one sample, list[[varying N, D]]
vision_features_before_mm_projection (torch.Tensor): [B, num_latents, d_latents]
Returns:
_type_: _description_
"""
bbox_preds = []
for batch_idx, grd_token_hidden_states in enumerate(grd_token_hidden_states_list):
# vision_feat = encoder_output[batch_idx].flatten() # (1024 * 96,)
vision_feat = vision_features_before_mm_projection[batch_idx].unsqueeze(
0
) # (1, num_vision_feat, vision_feat_dim_in)
vision_feat = self.vision_projection_mlp(
vision_feat
) # (1, num_vision_feat, new_vision_feat_dim_in)
for i in range(len(grd_token_hidden_states)):
language_feat = grd_token_hidden_states[i] # (D,)
language_feat = self.language_projection_mlp(
language_feat
) # (new_vision_feat_dim_in,)
language_feat = language_feat[None, None, :] # (1, 1, new_vision_feat_dim_in)
language_concat_vision_feat = torch.cat(
(language_feat, vision_feat), dim=1
) # (1, 1 + new_vision_feat_dim_in, new_vision_feat_dim_in)
# # nn.MultiHeadAttention in encoder expects seqlen x batch x channel features
_, encoder_output, _ = self.encoder(
src=language_concat_vision_feat.permute(1, 0, 2)
) # [1 + new_vision_feat_dim_in, 1, new_vision_feat_dim_in]
fused_feat = encoder_output[0][0] # (new_vision_feat_dim_in,)
bbox_pred = self.box_mlp(fused_feat) # (6,)
bbox_preds.append(bbox_pred)
bbox_preds = torch.stack(bbox_preds, dim=0) # (N, 6)
return bbox_preds
class BBoxHeadForGroundTruthBboxRegressionV1(nn.Module):
"""A simple MLP head for bounding box regression"""
def __init__(
self,
lm_feat_dim_in: int,
vision_feat_dim_in: int,
num_vision_feat: int,
dim_feedforward: int = 1024,
):
super().__init__()
self.bbox_pos_embedding = PositionEmbeddingCoordsSine(
d_pos=10,
pos_type="fourier",
)
self.obj_class_embedding = nn.Embedding(
265, 64
) # 265 classes in ScanNet, learnable embedding size 64
self.activation = nn.ReLU()
encoder_layer = TransformerEncoderLayer(
d_model=10 * 2 + 64,
nhead=4,
dim_feedforward=dim_feedforward,
dropout=0.0,
activation="relu",
normalize_before=False,
)
self.encoder = TransformerEncoder(encoder_layer=encoder_layer, num_layers=2)
self.box_mlp = nn.Sequential(
nn.Linear((10 * 2 + 64) * num_vision_feat + lm_feat_dim_in, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, 6),
)
def forward(
self,
grd_token_hidden_states_list: list[torch.Tensor],
vision_features_before_mm_projection: torch.Tensor,
):
"""_summary_
Args:
grd_token_hidden_states_list (list[torch.Tensor]): each element in this list
contains the hidden states of the ground tokens in one sample, list[[varying N, D]]
vision_features_before_mm_projection (torch.Tensor): [B, num_latents, 6 + 1]
Returns:
_type_: _description_
"""
# get bbox position embeddings
# xyz is batch x npoints x 3
min_xyz_pos_embeddings = self.bbox_pos_embedding(
xyz=vision_features_before_mm_projection[:, :, 0:3]
) # (B, 96, num_latents)
min_xyz_pos_embeddings = min_xyz_pos_embeddings.permute(0, 2, 1) # (B, num_latents, 96)
max_xyz_pos_embeddings = self.bbox_pos_embedding(
xyz=vision_features_before_mm_projection[:, :, 3:6]
) # (B, 96, num_latents)
max_xyz_pos_embeddings = max_xyz_pos_embeddings.permute(0, 2, 1) # (B, num_latents, 96)
# get the object class embeddings
obj_classes = vision_features_before_mm_projection[:, :, -1].long()
obj_class_embeddings = self.obj_class_embedding(obj_classes) # (B, num_latents, 64)
vision_feat = torch.concat(
(min_xyz_pos_embeddings, max_xyz_pos_embeddings, obj_class_embeddings), dim=-1
) # (B, num_vision_feat, 96*2+64)
# get padding mask by checking where zero vectors are
src_key_padding_mask = vision_features_before_mm_projection.eq(0).all(
dim=-1
) # (B, num_latents)
# nn.MultiHeadAttention in encoder expects npoints x batch x channel features
# note that vision_features_before_mm_projection already contains positional embeddings
_, encoder_output, _ = self.encoder(
src=vision_feat.permute(1, 0, 2),
src_key_padding_mask=src_key_padding_mask,
) # [num_latents, B, d_latents]
encoder_output = encoder_output.permute(1, 0, 2) # [B, num_latents, d_latents]
bbox_preds = []
for batch_idx, grd_token_hidden_states in enumerate(grd_token_hidden_states_list):
vision_feat = encoder_output[batch_idx].flatten() # (1024 * 96,)
# vision_feat = vision_features_before_mm_projection[batch_idx].flatten() # (1024 * 96,)
for i in range(len(grd_token_hidden_states)):
langauge_feat = grd_token_hidden_states[i] # (D,)
concat_feat = torch.cat((vision_feat, langauge_feat), dim=-1)
bbox_pred = self.box_mlp(concat_feat)
bbox_preds.append(bbox_pred)
bbox_preds = torch.stack(bbox_preds, dim=0) # (N, 6)
return bbox_preds
class BBoxHeadForGroundTruthBboxSelectionTransformerLateFusion(nn.Module):
"""A simple MLP head for bounding box selection, for training on CE loss"""
def __init__(
self,
lm_feat_dim_in: int,
vision_feat_dim_in: int,
num_vision_feat: int,
dim_feedforward: int = 2048,
):
super().__init__()
class_emb_dim = 256
pos_emb_dim = 16
self.bbox_pos_embedding = PositionEmbeddingCoordsSine(
d_pos=pos_emb_dim,
pos_type="fourier",
)
self.obj_class_embedding = nn.Embedding(
265, class_emb_dim
) # 265 classes in ScanNet, learnable embedding size 64
self.activation = nn.GELU()
self.language_vision_fusion_mlp = nn.Sequential(
nn.Linear(class_emb_dim + pos_emb_dim + lm_feat_dim_in, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
)
# encoder_layer = TransformerEncoderLayer(
# d_model=dim_feedforward,
# nhead=8,
# dim_feedforward=dim_feedforward,
# dropout=0.0,
# activation="relu",
# normalize_before=True,
# )
# self.encoder = TransformerEncoder(encoder_layer=encoder_layer, num_layers=2)
encoder_layer = nn.TransformerEncoderLayer(
d_model=class_emb_dim + pos_emb_dim,
nhead=8,
dim_feedforward=dim_feedforward,
norm_first=True,
)
self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=2)
self.scoring_mlp = nn.Sequential(
nn.Linear(dim_feedforward, 1),
)
self._reset_parameters()
def _reset_parameters(self):
r"""Initiate parameters in the transformer model."""
for p in self.parameters():
if p.dim() > 1:
xavier_uniform_(p)
def forward(
self,
grd_token_hidden_states_list: list[torch.Tensor],
vision_features_before_mm_projection: torch.Tensor,
):
"""_summary_
Args:
grd_token_hidden_states_list (list[torch.Tensor]): each element in this list
contains the hidden states of the ground tokens in one sample, list[[varying N, D]]
vision_features_before_mm_projection (torch.Tensor): [B, num_latents, 6 + 1]
Returns:
_type_: _description_
"""
# get bbox position embeddings
# xyz is batch x npoints x 3
# get the center of the bbox
bbox_center = (
vision_features_before_mm_projection[:, :, 0:3]
+ vision_features_before_mm_projection[:, :, 3:6]
) / 2.0
bbox_pos_embeddings = self.bbox_pos_embedding(
xyz=bbox_center
) # (B, pos_emb_dim, num_latents)
bbox_pos_embeddings = bbox_pos_embeddings.permute(0, 2, 1) # (B, num_latents, pos_emb_dim)
# get the object class embeddings
obj_classes = vision_features_before_mm_projection[:, :, -1].long()
obj_class_embeddings = self.obj_class_embedding(
obj_classes
) # (B, num_latents, class_emb_dim)
vision_feat = torch.concat(
(obj_class_embeddings, bbox_pos_embeddings), dim=-1
) # (B, class_emb_dim + pos_emb_dim, class_emb_dim)
# get padding mask by checking where zero vectors are
src_key_padding_mask = vision_features_before_mm_projection.eq(0).all(
dim=-1
) # (B, num_latents)
bbox_scores = []
for batch_idx, grd_token_hidden_states in enumerate(grd_token_hidden_states_list):
# vision_feat = vision_features_before_mm_projection[
# batch_idx
# ] # (num_latents, d_latents)
cur_vision_feat = vision_feat[batch_idx] # (num_latents, class_emb_dim)
cur_vision_feat = cur_vision_feat.unsqueeze(0) # (1, num_latents, class_emb_dim)
cur_vision_feat = cur_vision_feat.permute(1, 0, 2) # (num_latents, 1, class_emb_dim)
# nn.MultiHeadAttention in encoder expects seqlen x batch x channel features
cur_encoder_output = self.encoder(
cur_vision_feat,
src_key_padding_mask=src_key_padding_mask[batch_idx].unsqueeze(0),
) # [num_latents, 1, class_emb_dim]
cur_encoder_output = cur_encoder_output.squeeze(1) # (num_latents, class_emb_dim)
for i in range(len(grd_token_hidden_states)):
langauge_feat = grd_token_hidden_states[i] # (lm_feat_dim_in,)
# concat the language feat with each vision feat
langauge_feat_repeat = langauge_feat.repeat(
cur_encoder_output.shape[0], 1
) # (num_latents, lm_feat_dim_in)
concat_feat = torch.cat(
(cur_encoder_output, langauge_feat_repeat), dim=-1
) # (num_latents, class_emb_dim + lm_feat_dim_in)
fused_feat = self.language_vision_fusion_mlp(
concat_feat
) # (num_latents, dim_feedforward)
bbox_score = self.scoring_mlp(fused_feat).squeeze(-1) # (num_latents,)
bbox_scores.append(bbox_score) # (num_latents)
bbox_scores = torch.stack(bbox_scores, dim=0) # (N, num_latents)
return bbox_scores
class BBoxHeadForGroundTruthBboxSelectionTransformerEarlyFusion(nn.Module):
"""A simple MLP head for bounding box selection, for training on CE loss"""
def __init__(
self,
lm_feat_dim_in: int,
vision_feat_dim_in: int,
num_vision_feat: int,
dim_feedforward: int = 2048,
):
super().__init__()
class_emb_dim = 256
pos_emb_dim = 16
self.bbox_pos_embedding = PositionEmbeddingCoordsSine(
d_pos=pos_emb_dim,
pos_type="fourier",
)
self.obj_class_embedding = nn.Embedding(
265, class_emb_dim
) # 265 classes in ScanNet, learnable embedding size class_emb_dim
self.activation = nn.GELU()
self.language_projection_mlp = nn.Sequential(
nn.Linear(lm_feat_dim_in, class_emb_dim),
)
# encoder_layer = TransformerEncoderLayer(
# d_model=dim_feedforward,
# nhead=8,
# dim_feedforward=dim_feedforward,
# dropout=0.0,
# activation="relu",
# normalize_before=True,
# )
# self.encoder = TransformerEncoder(encoder_layer=encoder_layer, num_layers=2)
encoder_layer = nn.TransformerEncoderLayer(
d_model=class_emb_dim + pos_emb_dim + class_emb_dim,
nhead=8,
dim_feedforward=dim_feedforward,
norm_first=False,
)
self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=2)
self.scoring_mlp = nn.Sequential(
nn.Linear(class_emb_dim + pos_emb_dim + class_emb_dim, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, 1),
)
self._reset_parameters()
def _reset_parameters(self):
r"""Initiate parameters in the transformer model."""
for p in self.parameters():
if p.dim() > 1:
xavier_uniform_(p)
def forward(
self,
grd_token_hidden_states_list: list[torch.Tensor],
vision_features_before_mm_projection: torch.Tensor,
):
"""_summary_
Args:
grd_token_hidden_states_list (list[torch.Tensor]): each element in this list
contains the hidden states of the ground tokens in one sample, list[[varying N, D]]
vision_features_before_mm_projection (torch.Tensor): [B, num_latents, 6 + 1]
Returns:
_type_: _description_
"""
# get bbox position embeddings
# xyz is batch x npoints x 3
# get the center of the bbox
bbox_center = (
vision_features_before_mm_projection[:, :, 0:3]
+ vision_features_before_mm_projection[:, :, 3:6]
) / 2.0
bbox_pos_embeddings = self.bbox_pos_embedding(
xyz=bbox_center
) # (B, pos_emb_dim, num_latents)
bbox_pos_embeddings = bbox_pos_embeddings.permute(0, 2, 1) # (B, num_latents, pos_emb_dim)
# get the object class embeddings
obj_classes = vision_features_before_mm_projection[:, :, -1].long()
obj_class_embeddings = self.obj_class_embedding(
obj_classes
) # (B, num_latents, class_emb_dim)
vision_feat = torch.concat(
(obj_class_embeddings, bbox_pos_embeddings), dim=-1
) # (B, num_latents, class_emb_dim)
# get padding mask by checking where zero vectors are
src_key_padding_mask = vision_features_before_mm_projection.eq(0).all(
dim=-1
) # (B, num_latents)
bbox_scores = []
for batch_idx, grd_token_hidden_states in enumerate(grd_token_hidden_states_list):
# vision_feat = vision_features_before_mm_projection[
# batch_idx
# ] # (num_latents, d_latents)
cur_vision_feat = vision_feat[batch_idx] # (num_latents, class_emb_dim + pos_emb_dim)
for i in range(len(grd_token_hidden_states)):
langauge_feat = grd_token_hidden_states[i] # (lm_feat_dim_in,)
langauge_feat = self.language_projection_mlp(langauge_feat) # (lm_feat_dim_in,)
langauge_feat_repeat = langauge_feat.repeat(
cur_vision_feat.shape[0], 1
) # (num_latents, lm_feat_dim_in)
concat_feat = torch.cat(
(cur_vision_feat, langauge_feat_repeat), dim=-1
) # (num_latents, class_emb_dim + pos_emb_dim + lm_feat_dim_in)
concat_feat = concat_feat.unsqueeze(
0
) # (1, num_latents, class_emb_dim + pos_emb_dim + lm_feat_dim_in)
concat_feat = concat_feat.permute(
1, 0, 2
) # (num_latents, 1, class_emb_dim + pos_emb_dim + lm_feat_dim_in)
# nn.MultiHeadAttention in encoder expects seqlen x batch x channel features
cur_encoder_output = self.encoder(
concat_feat,
src_key_padding_mask=src_key_padding_mask[batch_idx].unsqueeze(0),
) # [num_latents, 1, class_emb_dim + pos_emb_dim + lm_feat_dim_in]
cur_encoder_output = cur_encoder_output.squeeze(1) # (num_latents, class_emb_dim)
bbox_score = self.scoring_mlp(cur_encoder_output).squeeze(-1) # (num_latents,)
bbox_scores.append(bbox_score) # (num_latents)
bbox_scores = torch.stack(bbox_scores, dim=0) # (N, num_latents)
return bbox_scores
class BBoxHeadForGroundTruthBboxSelectionMLPPosEmbAndFusionOneHot(nn.Module):
"""A simple MLP head for bounding box selection, for training on CE loss"""
def __init__(
self,
lm_feat_dim_in: int,
vision_feat_dim_in: int,
num_vision_feat: int,
dim_feedforward: int = 4096,
):
super().__init__()
self.class_emb_dim = class_emb_dim = 265 # 265 classes in ScanRefer
pos_emb_dim = 16
self.bbox_pos_embedding = PositionEmbeddingCoordsSine(
d_pos=pos_emb_dim,
pos_type="fourier",
)
self.activation = nn.ReLU()
self.language_vision_fusion_mlp = nn.Sequential(
nn.Linear(class_emb_dim + pos_emb_dim + lm_feat_dim_in, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
)
self.scoring_mlp = nn.Sequential(
nn.Linear(dim_feedforward, 1),
)
def forward(
self,
grd_token_hidden_states_list: list[torch.Tensor],
vision_features_before_mm_projection: torch.Tensor,
):
"""_summary_
Args:
grd_token_hidden_states_list (list[torch.Tensor]): each element in this list
contains the hidden states of the ground tokens in one sample, list[[varying N, D]]
vision_features_before_mm_projection (torch.Tensor): [B, num_latents, 6 + 1]
Returns:
_type_: _description_
"""
# get bbox position embeddings
# xyz is batch x npoints x 3
# get the center of the bbox
bbox_center = (
vision_features_before_mm_projection[:, :, 0:3]
+ vision_features_before_mm_projection[:, :, 3:6]
) / 2.0
bbox_pos_embeddings = self.bbox_pos_embedding(
xyz=bbox_center
) # (B, pos_emb_dim, num_latents)
bbox_pos_embeddings = bbox_pos_embeddings.permute(0, 2, 1) # (B, num_latents, pos_emb_dim)
# get the object class embeddings, one-hot encoding of self.class_emb_dim classes
obj_classes = vision_features_before_mm_projection[:, :, -1].long()
obj_class_embeddings = torch.eye(
self.class_emb_dim,
device=vision_features_before_mm_projection.device,
dtype=vision_features_before_mm_projection.dtype,
)[
obj_classes
] # (B, num_latents, class_emb_dim)
vision_feat = torch.concat(
(obj_class_embeddings, bbox_pos_embeddings), dim=-1
) # (B, num_latents, class_emb_dim + pos_emb_dim)
# get padding mask by checking where zero vectors are
src_key_padding_mask = vision_features_before_mm_projection.eq(0).all(
dim=-1
) # (B, num_latents)
# for the padded locations, we set the vision_feat to be zero
vision_feat[src_key_padding_mask] = 0
bbox_scores = []
for batch_idx, grd_token_hidden_states in enumerate(grd_token_hidden_states_list):
cur_vision_feat = vision_feat[batch_idx] # (num_latents, d_latents)
for i in range(len(grd_token_hidden_states)):
langauge_feat = grd_token_hidden_states[i] # (lm_feat_dim_in),)
# concat the language feat with each vision feat
langauge_feat_repeat = langauge_feat.repeat(
cur_vision_feat.shape[0], 1
) # (num_latents, lm_feat_dim_in)
concat_feat = torch.cat(
(cur_vision_feat, langauge_feat_repeat), dim=-1
) # (num_latents, d_latents + lm_feat_dim_in)
fused_feat = self.language_vision_fusion_mlp(concat_feat)
bbox_score = self.scoring_mlp(fused_feat).squeeze(-1) # (num_latents,)
bbox_scores.append(bbox_score) # (num_latents)
bbox_scores = torch.stack(bbox_scores, dim=0) # (N, num_latents)
return bbox_scores
class BBoxHeadForGroundTruthBboxSelectionMLPFusionBoxCoordsAndClassID(nn.Module):
"""A simple MLP head for bounding box selection, for training on CE loss"""
def __init__(
self,
lm_feat_dim_in: int,
vision_feat_dim_in: int,
num_vision_feat: int,
dim_feedforward: int = 1024,
):
super().__init__()
self.activation = nn.ReLU()
self.language_vision_fusion_mlp = nn.Sequential(
nn.Linear(vision_feat_dim_in + lm_feat_dim_in, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
self.activation,
nn.Linear(dim_feedforward, dim_feedforward),
)
self.scoring_mlp = nn.Sequential(
nn.Linear(dim_feedforward, 1),
)
def forward(
self,
grd_token_hidden_states_list: list[torch.Tensor],
vision_features_before_mm_projection: torch.Tensor,
):
"""_summary_
Args:
grd_token_hidden_states_list (list[torch.Tensor]): each element in this list
contains the hidden states of the ground tokens in one sample, list[[varying N, D]]
vision_features_before_mm_projection (torch.Tensor): [B, num_latents, 6 + 1]
Returns:
_type_: _description_
"""
bbox_scores = []
for batch_idx, grd_token_hidden_states in enumerate(grd_token_hidden_states_list):
vision_feat = vision_features_before_mm_projection[
batch_idx
] # (num_latents, d_latents)
for i in range(len(grd_token_hidden_states)):
langauge_feat = grd_token_hidden_states[i] # (lm_feat_dim_in),)
# concat the language feat with each vision feat
langauge_feat_repeat = langauge_feat.repeat(
vision_feat.shape[0], 1
) # (num_latents, lm_feat_dim_in)
concat_feat = torch.cat(
(vision_feat, langauge_feat_repeat), dim=-1
) # (num_latents, d_latents + lm_feat_dim_in)
fused_feat = self.language_vision_fusion_mlp(concat_feat)
bbox_score = self.scoring_mlp(fused_feat).squeeze(-1) # (num_latents,)
bbox_scores.append(bbox_score) # (num_latents)
bbox_scores = torch.stack(bbox_scores, dim=0) # (N, num_latents)
return bbox_scores