
# **Image2GPS Model Overview**



## **Datasets**

- **Training Dataset:** `image2gpsLLH/image_data` (a loading sketch follows this list)

- **Evaluation Metrics:**

  - Mean Absolute Error (MAE)

  - Root Mean Squared Error (RMSE)
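
For reference, a minimal sketch of loading the training images from the Hub. This assumes the dataset exposes a default `train` split with `image`, `Latitude`, and `Longitude` columns, matching the columns used in the example code at the end of this card:

```python
from datasets import load_dataset

# Load the training images with their GPS labels
# (split and column names assumed; adjust if the repo differs).
train_data = load_dataset("image2gpsLLH/image_data", split="train")
print(train_data)                 # column names and number of rows
print(train_data[0]["Latitude"])  # one raw latitude label, in degrees
```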


---



## **Model Statistics**

Coordinate statistics used to normalize the GPS targets and to denormalize model predictions back to degrees (see the sketch below):

- **Latitude Mean:** 39.95150678400655

- **Latitude Standard Deviation:** 0.0007344790486223371

- **Longitude Mean:** -75.19146715269915

- **Longitude Standard Deviation:** 0.0007342464795497821
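
Because the model is trained on normalized targets, its raw outputs must be denormalized with these statistics. A minimal sketch of the round trip (the variable names mirror the example code below):

```python
lat_mean, lat_std = 39.95150678400655, 0.0007344790486223371
lon_mean, lon_std = -75.19146715269915, 0.0007342464795497821

# Normalize a raw coordinate pair to the scale the model is trained on ...
lat_n = (39.9512 - lat_mean) / lat_std
lon_n = (-75.1913 - lon_mean) / lon_std

# ... and denormalize a model output back to degrees
lat = lat_n * lat_std + lat_mean   # -> 39.9512
lon = lon_n * lon_std + lon_mean   # -> -75.1913
```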


---



## **Model Description**

- **Model Type:** Vision Transformer (ViT) backbone with a small regression head that outputs a `[latitude, longitude]` pair (see `ViTGeoLocator` in the example below)



---



## **Training Data**

- **Dataset Size:** 1,325 images

- **Location:** Penn Engineering walkways

- **Data Collection Method:**

  - Images captured from eight compass directions at various points:

    - North, Northeast, East, Southeast, South, Southwest, West, Northwest



---



## **Testing Data**

- **Dataset Size:** 441 images

- **Location:** Penn Engineering walkways



---



## **Factors Affecting Model Performance**

- **Environmental Conditions:** Lighting, weather, time of day

- **Image Variability:** Different camera angles and perspectives (a sketch of augmentations that could address these factors follows this list)
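
The training pipeline is not shown on this card, so purely as an illustration, here is one way such variability could be simulated with standard torchvision augmentations at training time. The specific transforms and parameters are assumptions, not the authors' recipe:

```python
import torchvision.transforms as transforms

# Hypothetical training-time augmentations (not the authors' actual recipe):
# color jitter simulates lighting/time-of-day shifts; the crop and affine
# transforms simulate small changes in camera angle and perspective.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2),
    transforms.RandomAffine(degrees=5, translate=(0.05, 0.05)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
```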


---



## **Training Result**

![Image Example](https://cdn-uploads.huggingface.co/production/uploads/675310562ebb9d1a13935961/X5vAl8CJzsvJRl_IKY20u.png)

*Caption: Example of an image used during training/testing.*



---



## **Example Execution**

A full runnable example is available as a [Colab notebook](https://colab.research.google.com/drive/12mQAu1m65EV5kJlVkigkEOxH8NaLULTS?usp=sharing).

```python
!pip install datasets timm  # timm added here in case it is not preinstalled

# Imports
from huggingface_hub import login, hf_hub_download
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from datasets import load_dataset
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import timm


class ViTGeoLocator(nn.Module):
    def __init__(self, freeze_backbone=True):
        super(ViTGeoLocator, self).__init__()
        # Load a pretrained ViT backbone
        self.backbone = timm.create_model('vit_base_patch16_224', pretrained=True)

        # Optionally freeze the backbone so only the regression head trains
        if freeze_backbone:
            for param in self.backbone.parameters():
                param.requires_grad = False

        # Dimension of the ViT's output embedding
        embed_dim = self.backbone.num_features

        # Remove the original classification head
        self.backbone.head = nn.Identity()

        # New regression head mapping the embedding to GPS coordinates
        self.regressor = nn.Sequential(
            nn.Linear(embed_dim, 512),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 2)  # Output: [latitude, longitude]
        )

    def forward(self, x):
        x = self.backbone(x)
        return self.regressor(x)
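
# Quick sanity check (an illustrative addition, not part of the original
# notebook): a dummy batch of two images should map to two
# [latitude, longitude] pairs, i.e. shape (2, 2).
_check = ViTGeoLocator(freeze_backbone=True)
_check.eval()
with torch.no_grad():
    print(_check(torch.randn(2, 3, 224, 224)).shape)  # torch.Size([2, 2])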


# Log in to Hugging Face (paste a valid access token)
login("replace with huggingface token")

# Specify the repository and model file
repo_id = "image2gpsLLH/vit"
filename = "vit.pth"

# Download the checkpoint from Hugging Face
model_path = hf_hub_download(repo_id=repo_id, filename=filename)

# Initialize the model
model_test = ViTGeoLocator(freeze_backbone=True)

# Load the checkpoint onto the CPU first, so this also works without a GPU
checkpoint = torch.load(model_path, map_location="cpu")

# Load the trained weights
model_test.load_state_dict(checkpoint['model_state_dict'])

# Set the model to evaluation mode and move it to the target device
model_test.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_test = model_test.to(device)

# The inference loop runs after the dataset and DataLoader are built below.


class GPSImageDataset(Dataset):
    def __init__(self, hf_dataset, transform=None, lat_mean=None, lat_std=None, lon_mean=None, lon_std=None):
        self.hf_dataset = hf_dataset
        self.transform = transform

        # Compute mean and std from the dataset if not provided
        self.latitude_mean = lat_mean if lat_mean is not None else np.mean(np.array(self.hf_dataset['Latitude']))
        self.latitude_std = lat_std if lat_std is not None else np.std(np.array(self.hf_dataset['Latitude']))
        self.longitude_mean = lon_mean if lon_mean is not None else np.mean(np.array(self.hf_dataset['Longitude']))
        self.longitude_std = lon_std if lon_std is not None else np.std(np.array(self.hf_dataset['Longitude']))

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        # Extract one example
        example = self.hf_dataset[idx]

        # Load and process the image
        image = example['image']
        latitude = example['Latitude']
        longitude = example['Longitude']
        if self.transform:
            image = self.transform(image)

        # Normalize GPS coordinates
        latitude = (latitude - self.latitude_mean) / self.latitude_std
        longitude = (longitude - self.longitude_mean) / self.longitude_std
        gps_coords = torch.tensor([latitude, longitude], dtype=torch.float32)

        return image, gps_coords


# Load sample data (replace with the path to your own sample data)
data_sample = load_dataset("gydou/released_img", split="train")

# Mean and std for latitude and longitude (the statistics stated above)
lat_mean = 39.95150678400655
lat_std = 0.0007344790486223371
lon_mean = -75.19146715269915
lon_std = 0.0007342464795497821

# Inference transform: resize to the ViT input size and apply ImageNet
# normalization (matching the pretrained backbone's expectations)
inference_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Create the dataset and dataloader
sample_dataset = GPSImageDataset(
    hf_dataset=data_sample,
    transform=inference_transform,
    lat_mean=lat_mean,
    lat_std=lat_std,
    lon_mean=lon_mean,
    lon_std=lon_std
)
sample_dataloader = DataLoader(sample_dataset, batch_size=32, shuffle=False)
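
# Optional sanity check (an illustrative addition): pull one batch and confirm
# the tensor shapes before running the full evaluation loop.
images, gps_coords = next(iter(sample_dataloader))
print(images.shape)      # torch.Size([32, 3, 224, 224]) with batch_size=32
print(gps_coords.shape)  # torch.Size([32, 2])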

# Run inference over the sample set, collecting predictions and labels
all_preds = []
all_actuals = []
with torch.no_grad():
    for images, gps_coords in sample_dataloader:
        images, gps_coords = images.to(device), gps_coords.to(device)

        outputs = model_test(images)

        # Denormalize predictions and actual values back to degrees
        preds = outputs.cpu() * torch.tensor([lat_std, lon_std]) + torch.tensor([lat_mean, lon_mean])
        actuals = gps_coords.cpu() * torch.tensor([lat_std, lon_std]) + torch.tensor([lat_mean, lon_mean])

        all_preds.append(preds)
        all_actuals.append(actuals)

# Concatenate all batches
all_preds = torch.cat(all_preds).numpy()
all_actuals = torch.cat(all_actuals).numpy()

# Compute error metrics (in degrees of latitude/longitude)
mae = mean_absolute_error(all_actuals, all_preds)
rmse = np.sqrt(mean_squared_error(all_actuals, all_preds))

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
```
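
The MAE and RMSE above are in degrees of latitude/longitude, which is hard to interpret directly. As an illustration (this helper is an addition, not part of the original notebook), errors can be converted to approximate meters with an equirectangular approximation, which is accurate over an area as small as a campus:

```python
import numpy as np

def coord_error_meters(preds, actuals, ref_lat_deg=39.9515):
    """Approximate per-sample distance error in meters.

    Hypothetical helper: uses the small-area (equirectangular) approximation,
    where one degree of latitude is ~111,320 m and one degree of longitude
    shrinks by cos(latitude).
    """
    m_per_deg_lat = 111_320.0
    m_per_deg_lon = 111_320.0 * np.cos(np.radians(ref_lat_deg))
    d_lat_m = (preds[:, 0] - actuals[:, 0]) * m_per_deg_lat
    d_lon_m = (preds[:, 1] - actuals[:, 1]) * m_per_deg_lon
    return np.sqrt(d_lat_m**2 + d_lon_m**2)

# Usage after the evaluation loop above:
# dist_m = coord_error_meters(all_preds, all_actuals)
# print(f'Mean distance error: {dist_m.mean():.1f} m')
```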
|