import torch
import torch.nn as nn


class Adapter(nn.Module):
    """Project image embeddings into the text-embedding dimension.

    The module is a three-layer MLP (``img_dim -> embed_dim -> embed_dim ->
    txt_dim`` with ReLU between layers) followed by a multi-head
    self-attention layer operating in ``txt_dim``.

    Args:
        img_dim: Size of the last dimension of the incoming embeddings.
        txt_dim: Size of the last dimension of the produced embeddings; also
            the ``embed_dim`` of the attention layer, so it has to be
            divisible by ``num_heads`` (enforced by ``nn.MultiheadAttention``).
        embed_dim: Width of the hidden MLP layers.
        num_heads: Number of attention heads.
    """

    def __init__(self, img_dim: int, txt_dim: int, embed_dim: int = 1024,
                 num_heads: int = 8) -> None:
        super().__init__()
        # Attribute names are part of the state-dict keys; do not rename them
        # or previously saved checkpoints will stop loading.
        self.adapter = nn.Sequential(
            nn.Linear(img_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, txt_dim),
        )
        self.self_attention = nn.MultiheadAttention(
            embed_dim=txt_dim, num_heads=num_heads
        )

    def forward(self, img_emb: torch.Tensor) -> torch.Tensor:
        """Run the MLP and the self-attention layer on ``img_emb``.

        Args:
            img_emb: Tensor whose last dimension is ``img_dim``
                (presumably ``(batch, img_dim)`` -- confirm against callers).

        Returns:
            The attention output with the leading axis that was added for the
            attention call removed again, i.e. last dimension ``txt_dim``.
        """
        # A leading axis is inserted because the attention layer is built with
        # the default ``batch_first=False`` and reads axis 0 as the sequence.
        # NOTE(review): for a 2-D input this makes the sequence length 1, so
        # every item only attends to itself -- confirm this is intended.
        img_emb = self.adapter(img_emb).unsqueeze(0)
        attn_output, _ = self.self_attention(img_emb, img_emb, img_emb)
        return attn_output.squeeze(0)


def get_adapter_model(in_shape: int, out_shape: int) -> Adapter:
    """Build an :class:`Adapter` with default hidden width and head count.

    Args:
        in_shape: Input (image) embedding dimension.
        out_shape: Output (text) embedding dimension.

    Returns:
        A freshly initialised ``Adapter(in_shape, out_shape)``.
    """
    model = Adapter(in_shape, out_shape)
    return model


def load_adapter_model(
    weights_path: str = "./weights/adapter_model_with_attention.pt",
    in_shape: int = 512,
    out_shape: int = 384,
) -> Adapter:
    """Build an adapter and load its weights from disk onto the CPU.

    Called without arguments this behaves exactly as before: a 512 -> 384
    adapter loaded from ``./weights/adapter_model_with_attention.pt``.

    Args:
        weights_path: Path of the saved ``state_dict``.
        in_shape: Input embedding dimension the checkpoint was trained with.
        out_shape: Output embedding dimension the checkpoint was trained with.

    Returns:
        The adapter with the loaded parameters. The train/eval mode is left
        untouched; call ``.eval()`` on the result if needed.

    Raises:
        FileNotFoundError: If ``weights_path`` does not exist.
        RuntimeError: If the checkpoint keys/shapes do not match the model.
    """
    model = get_adapter_model(in_shape, out_shape)
    # NOTE(security): ``torch.load`` unpickles the file. Only load checkpoints
    # from trusted sources, or pass ``weights_only=True`` on PyTorch versions
    # that support it.
    state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    return model