Image2GPS Model Overview

Datasets

Training Dataset: image2gpsLLH/image_data
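Evaluation Metrics:
- Accuracy, measured as Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE) of the predicted latitude/longitude (see Example Execution)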

Model Statistics

Latitude Mean: 39.95150678400655
Latitude Standard Deviation: 0.0007344790486223371
Longitude Mean: -75.19146715269915
Longitude Standard Deviation: 0.0007342464795497821

Model Description

Model Type: Vision Transformer (ViT)

Training Data

Dataset Size: 1325 Images
Location: Penn Engineering walkways
Data Collection Method:
- Images captured from different directions at various points:
  - North, Northeast, East, Southeast, South, Southwest, West, Northwest

Testing Data

Dataset Size: 441 Images
Location: Penn Engineering walkways

Factors Affecting Model Performance

Environmental Conditions: Lighting, weather, time of day
Image Variability: Different camera angles and perspectives

Training Result

Caption: Example of an image used during training/testing.

Example Execution

https://colab.research.google.com/drive/12mQAu1m65EV5kJlVkigkEOxH8NaLULTS?usp=sharing

!pip install datasets

# Imports
from huggingface_hub import login
from huggingface_hub import hf_hub_download
from torchvision import models
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
import numpy as np
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error
import timm
from torch import nn

class ViTGeoLocator(nn.Module):
    """Regress a GPS coordinate pair from an image using a ViT backbone.

    A ``vit_base_patch16_224`` model created through ``timm`` has its
    classification head replaced by an identity layer; the resulting
    features are passed through a three-layer MLP whose final layer emits
    two values per input: ``[latitude, longitude]``.
    """

    def __init__(self, freeze_backbone=True):
        """Create the backbone and the regression head.

        Args:
            freeze_backbone: When truthy, ``requires_grad`` is switched off
                for every backbone parameter; the regression head's
                parameters are left untouched.
        """
        super().__init__()

        # Pretrained ViT; its feature width sizes the first linear layer.
        vit = timm.create_model('vit_base_patch16_224', pretrained=True)
        feature_width = vit.num_features

        # Drop the stock classifier so the backbone returns raw features.
        vit.head = nn.Identity()

        if freeze_backbone:
            for weight in vit.parameters():
                weight.requires_grad = False

        self.backbone = vit

        # Regression MLP: feature_width -> 512 -> 128 -> 2.
        head_layers = [
            nn.Linear(feature_width, 512),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 2),  # Output: [latitude, longitude]
        ]
        self.regressor = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the regression head's output for the backbone features of ``x``."""
        features = self.backbone(x)
        prediction = self.regressor(features)
        return prediction

# Log in to Hugging Face
login("replace with huggingface token")

# Specify the repository and model file
repo_id = "image2gpsLLH/vit"
filename = "vit.pth"

# Download the model from Hugging Face
model_path = hf_hub_download(repo_id=repo_id, filename=filename)

# Choose the device before loading so the checkpoint can be mapped onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model
model_test = ViTGeoLocator(freeze_backbone=True)

# Load the checkpoint. map_location lets a checkpoint that was saved from a
# GPU be restored on a CPU-only machine instead of failing during load.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# repository you trust.
checkpoint = torch.load(model_path, map_location=device)

# Load state dict
model_test.load_state_dict(checkpoint['model_state_dict'])

# Set the model to evaluation mode and move it to the chosen device
model_test.eval()
model_test = model_test.to(device)

# Inference is intentionally not run here: it needs `sample_dataloader`,
# which is only built further down, after GPSImageDataset is defined.
# Iterating it at this point raised NameError; the loop that consumes the
# dataloader is in the "Run model" section below.


class GPSImageDataset(Dataset):
    """Dataset of ``(image, normalized_gps)`` pairs backed by an indexable source.

    Each item of ``hf_dataset`` is read through the keys ``'image'``,
    ``'Latitude'`` and ``'Longitude'``. Coordinates are standardized as
    ``(value - mean) / std`` with the statistics held on the instance.
    """

    def __init__(self, hf_dataset, transform=None, lat_mean=None, lat_std=None, lon_mean=None, lon_std=None):
        """Store the source, the optional transform and the GPS statistics.

        Args:
            hf_dataset: Source indexed by integer for examples and by column
                name (``'Latitude'`` / ``'Longitude'``) for whole columns.
            transform: Optional callable applied to each image.
            lat_mean, lat_std, lon_mean, lon_std: Normalization statistics;
                any left as ``None`` is computed from the matching column.
        """
        self.hf_dataset = hf_dataset
        self.transform = transform

        # Fall back to column statistics only for the values not supplied.
        if lat_mean is None:
            lat_mean = np.mean(np.array(self.hf_dataset['Latitude']))
        if lat_std is None:
            lat_std = np.std(np.array(self.hf_dataset['Latitude']))
        if lon_mean is None:
            lon_mean = np.mean(np.array(self.hf_dataset['Longitude']))
        if lon_std is None:
            lon_std = np.std(np.array(self.hf_dataset['Longitude']))

        self.latitude_mean = lat_mean
        self.latitude_std = lat_std
        self.longitude_mean = lon_mean
        self.longitude_std = lon_std

    def __len__(self):
        """Return the number of entries reported by the underlying source."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Return ``(image, gps_coords)`` for the example at ``idx``.

        ``gps_coords`` is a float32 tensor ``[latitude, longitude]`` holding
        the standardized coordinates; the image is passed through
        ``transform`` when one is set.
        """
        record = self.hf_dataset[idx]
        picture = record['image']
        raw_lat = record['Latitude']
        raw_lon = record['Longitude']

        if self.transform:
            picture = self.transform(picture)

        # Standardize the coordinates with the stored statistics.
        lat_z = (raw_lat - self.latitude_mean) / self.latitude_std
        lon_z = (raw_lon - self.longitude_mean) / self.longitude_std
        target = torch.tensor([lat_z, lon_z], dtype=torch.float32)

        return picture, target

# Load sample data (replace with path to sample data)
data_sample = load_dataset("gydou/released_img", split="train")

# Specify mean and std for latitude and longitude (replace with the stated mean and std above).
# These have to be assignments: `name: value` is only an annotation and binds
# no variable, so the dataset construction below would raise NameError.
lat_mean = 39.95150678400655
lat_std = 0.0007344790486223371
lon_mean = -75.19146715269915
lon_std = 0.0007342464795497821

# Specify transform
inference_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Create dataloader
sample_dataset = GPSImageDataset(
    hf_dataset=data_sample,
    transform=inference_transform,
    lat_mean=lat_mean,
    lat_std=lat_std,
    lon_mean=lon_mean,
    lon_std=lon_std
)
sample_dataloader = DataLoader(sample_dataset, batch_size=32, shuffle=False)

# Run model
all_preds = []
all_actuals = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Scale and offset used to undo the target normalization. They are built once,
# outside the loop, and in float64: float32 resolves only about 4e-6 to 8e-6
# degrees at these magnitudes, which is not negligible against a coordinate
# std of roughly 7e-4 degrees.
gps_std = torch.tensor([lat_std, lon_std], dtype=torch.float64)
gps_mean = torch.tensor([lat_mean, lon_mean], dtype=torch.float64)

with torch.no_grad():
    for images, gps_coords in sample_dataloader:
        # Only the images are needed on the device; targets stay on the CPU.
        images = images.to(device)

        outputs = model_test(images)

        # Denormalize predictions and actual values
        preds = outputs.cpu().double() * gps_std + gps_mean
        actuals = gps_coords.cpu().double() * gps_std + gps_mean

        all_preds.append(preds)
        all_actuals.append(actuals)

# Concatenate all batches
all_preds = torch.cat(all_preds).numpy()
all_actuals = torch.cat(all_actuals).numpy()

# Compute error metrics (both are in degrees, averaged over latitude and longitude)
mae = mean_absolute_error(all_actuals, all_preds)
# The `squared=False` argument was deprecated in scikit-learn 1.4 and later
# removed. Taking the root per output column and then averaging reproduces
# the value it used to return.
rmse = np.sqrt(
    mean_squared_error(all_actuals, all_preds, multioutput='raw_values')
).mean()

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

image2gpsLLH
/

vit

You need to agree to share your contact information to access this model

Image2GPS Model Overview

Datasets

Model Statistics

Model Description

Training Data

Testing Data

Factors Affecting Model Performance

Training Result

Example Execution