Dataset stats: \
lat_mean = 39.951564548022596 \
lat_std = 0.0006361722351128644 \
lon_mean = -75.19150880602636 \
lon_std = 0.000611411894337979

The model can be loaded using the snippet below. `MultiModalModel` must be
defined first (see the implementation in the next section).

```python
from huggingface_hub import hf_hub_download
import torch

# Specify the repository and the filename of the model you want to load
repo_id = "FinalProj5190/vit_base_72"  # Replace with your repo name
filename = "resnet_gps_regressor_complete.pth"

model_path = hf_hub_download(repo_id=repo_id, filename=filename)

model_test = MultiModalModel()
# map_location="cpu" lets the checkpoint load on machines without a GPU even
# if it was saved from a CUDA device; move the model to a GPU afterwards with
# model_test.to("cuda") if one is available.
model_test.load_state_dict(torch.load(model_path, map_location="cpu"))
model_test.eval()
```

The model implementation is here:

```python
import torch.nn as nn
from transformers import AutoModel


class MultiModalModel(nn.Module):
    """Vision Transformer backbone followed by a small MLP regression head.

    Args:
        image_model_name: Hugging Face model id of the image backbone.
        output_dim: Number of regression outputs (2 by default; presumably
            latitude and longitude, matching the dataset stats above).
    """

    def __init__(self, image_model_name='google/vit-base-patch16-224-in21k', output_dim=2):
        super().__init__()

        # Load Vision Transformer for feature extraction
        self.image_model = AutoModel.from_pretrained(image_model_name, output_hidden_states=True)

        # Regression head applied to the image features only
        self.regressor = nn.Sequential(
            nn.Linear(self.image_model.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, output_dim),
        )

    def forward(self, image):
        """Return regression outputs of shape (batch, output_dim).

        Args:
            image: Batch of preprocessed images in the format expected by the
                backbone (for the default ViT, presumably a float tensor of
                shape (batch, 3, 224, 224)).
        """
        # Extract image features from the last hidden state
        image_outputs = self.image_model(image)
        image_features = image_outputs.last_hidden_state[:, 0, :]  # CLS token features

        # Final regression
        return self.regressor(image_features)
```