|
# Image to GPS Project - ConvNeXt, MobileNet and EfficientNet Ensemble
|
## Training Data Statistics

```python
|
lat_mean = 39.951537011424264 |
|
lat_std = 0.0006940325318781937 |
|
lon_mean = -75.19152009539549 |
|
lon_std = 0.0007607716964655242 |
|
``` |
|
|
|
## How to Load the Model and Perform Inference |
|
```bash
# install dependencies
pip install geopy datasets torch torchvision huggingface_hub
```

```python
# import packages
|
import numpy as np |
|
from geopy.distance import geodesic |
|
import torch |
|
from torch.utils.data import DataLoader, Dataset |
|
from torchvision import transforms |
|
import torch.nn as nn |
|
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights, convnext_tiny, ConvNeXt_Tiny_Weights, efficientnet_b0, EfficientNet_B0_Weights |
|
from datasets import load_dataset |
|
from huggingface_hub import hf_hub_download |
|
# Hub repository and file name of the published ensemble checkpoint.
repo_id = "cis519projectA/Ensemble_ConvNeXt_MobileNet_EfficientNet"
filename = "ensemble_triple.pth"

# Fetch the checkpoint and keep the local path returned by hf_hub_download.
model_path = hf_hub_download(
    repo_id=repo_id,
    filename=filename,
)
|
# define models |
|
class CustomEfficientNetModel(nn.Module):
    """EfficientNet-B0 backbone with a replacement two-layer head.

    The stock classifier is swapped for ``Linear -> ReLU -> Dropout -> Linear``
    producing ``num_classes`` outputs, and the parameters of the first three
    entries of ``features`` are frozen.

    Args:
        weights: torchvision weights passed to ``efficientnet_b0``.
        num_classes: Number of values emitted by the final linear layer.
    """

    def __init__(self, weights=EfficientNet_B0_Weights.DEFAULT, num_classes=2):
        super().__init__()
        backbone = efficientnet_b0(weights=weights)

        # Reuse the input width of the linear layer in the original head.
        head_width = backbone.classifier[1].in_features
        backbone.classifier = nn.Sequential(
            nn.Linear(head_width, 512),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(512, num_classes),
        )

        # Freeze features[0], features[1] and features[2].
        backbone.features[:3].requires_grad_(False)

        self.efficientnet = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped EfficientNet and return its output."""
        return self.efficientnet(x)
|
|
|
class CustomConvNeXtModel(nn.Module):
    """ConvNeXt-Tiny backbone with a replacement head.

    The stock classifier is swapped for
    ``AdaptiveAvgPool2d -> Flatten -> Linear -> BatchNorm1d -> ReLU -> Dropout
    -> Linear`` producing ``num_classes`` outputs, and the parameters of the
    first four entries of ``features`` are frozen.

    Args:
        weights: torchvision weights passed to ``convnext_tiny``.
        num_classes: Number of values emitted by the final linear layer.
    """

    def __init__(self, weights=ConvNeXt_Tiny_Weights.DEFAULT, num_classes=2):
        super().__init__()
        backbone = convnext_tiny(weights=weights)

        # Reuse the input width of the linear layer in the original head.
        head_width = backbone.classifier[2].in_features

        # NOTE: the position of each layer inside this Sequential determines
        # the state_dict key names, so the layout must not be reordered.
        backbone.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(head_width, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(512, num_classes),
        )

        # Freeze features[0] through features[3].
        backbone.features[:4].requires_grad_(False)

        self.convnext = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped ConvNeXt and return its output."""
        return self.convnext(x)
|
|
|
class CustomMobileNetModel(nn.Module):
    """MobileNetV2 backbone with a replacement three-layer head.

    The stock classifier is swapped for two ``Linear -> ReLU -> Dropout``
    stages (1024 then 512 units) followed by a final ``Linear`` producing
    ``num_classes`` outputs. The parameters of the first five entries of
    ``features`` are frozen.

    Args:
        weights: torchvision weights passed to ``mobilenet_v2``.
        num_classes: Number of values emitted by the final linear layer.
    """

    def __init__(self, weights=MobileNet_V2_Weights.DEFAULT, num_classes=2):
        super().__init__()
        backbone = mobilenet_v2(weights=weights)

        # Reuse the input width of the linear layer in the original head.
        head_width = backbone.classifier[1].in_features
        backbone.classifier = nn.Sequential(
            nn.Linear(head_width, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(512, num_classes),
        )

        # Freeze features[0] through features[4].
        backbone.features[:5].requires_grad_(False)

        self.mobilenet = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped MobileNetV2 and return its output."""
        return self.mobilenet(x)
|
|
|
class EnsembleModel(nn.Module):
    """Combine three sub-models by feeding their concatenated outputs to an MLP.

    Each sub-model is applied to the same input; the three outputs are
    concatenated along dim 1 and passed through ``fc``
    (``Linear -> ReLU -> Dropout -> Linear``) to produce ``num_classes`` values.

    Args:
        convnext_model: First sub-model, stored as ``self.convnext``.
        mobilenet_model: Second sub-model, stored as ``self.mobilenet``.
        efficientnet_model: Third sub-model, stored as ``self.efficientnet``.
        num_classes: Output width of each sub-model and of the ensemble.
    """

    def __init__(self, convnext_model, mobilenet_model, efficientnet_model, num_classes=2):
        super().__init__()
        self.convnext = convnext_model
        self.mobilenet = mobilenet_model
        self.efficientnet = efficientnet_model

        # These scalar parameters never influence the returned output. They
        # stay registered so the state_dict keeps the same keys and existing
        # checkpoints continue to load.
        self.weight_convnext = nn.Parameter(torch.tensor(1.0))
        self.weight_mobilenet = nn.Parameter(torch.tensor(1.0))
        self.weight_efficientnet = nn.Parameter(torch.tensor(1.0))

        self.fc = nn.Sequential(
            nn.Linear(num_classes * 3, 512),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return ``fc`` applied to the concatenated sub-model outputs.

        The previous implementation also built a softmax-weighted sum of the
        sub-model outputs and then discarded it; that dead computation has
        been dropped, leaving the returned value unchanged.
        """
        member_outputs = (
            self.convnext(x),
            self.mobilenet(x),
            self.efficientnet(x),
        )
        return self.fc(torch.cat(member_outputs, dim=1))
|
|
|
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the architectures only. weights=None skips downloading the pretrained
# torchvision weights, which would be overwritten by the checkpoint anyway:
# load_state_dict is strict by default, so it must supply every parameter
# and buffer of the ensemble.
convnext_model = CustomConvNeXtModel(weights=None, num_classes=2)
mobilenet_model = CustomMobileNetModel(weights=None, num_classes=2)
efficientnet_model = CustomEfficientNetModel(weights=None, num_classes=2)
ensemble_model = EnsembleModel(convnext_model, mobilenet_model, efficientnet_model, num_classes=2)

# load the model weights
# SECURITY: torch.load unpickles the downloaded file. weights_only=True
# restricts unpickling to tensors and plain containers (requires torch>=1.13).
state_dict = torch.load(model_path, map_location=device, weights_only=True)
ensemble_model.load_state_dict(state_dict)
ensemble_model.to(device)
ensemble_model.eval()
|
# load the dataset
dataset_test = load_dataset("gydou/released_img", split="train")

# define transforms
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]
inference_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
)

# Parameters for denormalization (the training data statistics)
lat_mean, lat_std = 39.951537011424264, 0.0006940325318781937
lon_mean, lon_std = -75.19152009539549, 0.0007607716964655242
|
class GPSImageDataset(Dataset):
    """Dataset yielding ``(image, normalized_gps)`` pairs from indexable records.

    Each record must provide the keys ``'image'``, ``'Latitude'`` and
    ``'Longitude'``. Coordinates are standardized with the supplied mean and
    standard deviation and returned as a float32 tensor ``[lat, lon]``.

    Args:
        hf_dataset: Indexable collection of records supporting ``len()``.
        transform: Optional callable applied to the image when truthy.
        lat_mean: Mean subtracted from the latitude.
        lat_std: Value the centred latitude is divided by.
        lon_mean: Mean subtracted from the longitude.
        lon_std: Value the centred longitude is divided by.
    """

    def __init__(self, hf_dataset, transform=None, lat_mean=None, lat_std=None, lon_mean=None, lon_std=None):
        self.hf_dataset = hf_dataset
        self.transform = transform
        self.latitude_mean, self.latitude_std = lat_mean, lat_std
        self.longitude_mean, self.longitude_std = lon_mean, lon_std

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image and its normalized GPS."""
        record = self.hf_dataset[idx]

        picture = record['image']
        if self.transform:
            picture = self.transform(picture)

        # Standardize each coordinate with the configured statistics.
        norm_lat = (record['Latitude'] - self.latitude_mean) / self.latitude_std
        norm_lon = (record['Longitude'] - self.longitude_mean) / self.longitude_std

        return picture, torch.tensor([norm_lat, norm_lon], dtype=torch.float32)
|
# Wrap the raw records so images are transformed and coordinates standardized.
normalization_stats = {
    "lat_mean": lat_mean,
    "lat_std": lat_std,
    "lon_mean": lon_mean,
    "lon_std": lon_std,
}
test_dataset = GPSImageDataset(
    dataset_test,
    transform=inference_transform,
    **normalization_stats,
)

# Batches of 32 in dataset order, loaded by four worker processes.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
)
|
# evaluate |
|
def evaluate_model_single_batch(model, dataloader, lat_mean, lat_std, lon_mean, lon_std):
    """Evaluate ``model`` on the first batch of ``dataloader`` only.

    Predictions and targets are mapped back to degrees with the supplied
    statistics, and the geodesic distance in meters is computed per sample.

    Args:
        model: Module mapping an image batch to normalized ``[lat, lon]`` rows.
        dataloader: Iterable yielding ``(images, gps_coords)`` batches, with
            ``gps_coords`` normalized using the same statistics.
        lat_mean: Latitude mean used for denormalization.
        lat_std: Latitude standard deviation used for denormalization.
        lon_mean: Longitude mean used for denormalization.
        lon_std: Longitude standard deviation used for denormalization.

    Returns:
        Tuple ``(mean_error, rmse_error)`` in meters over the first batch;
        ``(nan, nan)`` when the dataloader yields no batch.
    """
    # Loop-invariant denormalization constants.
    scale = np.array([lat_std, lon_std])
    offset = np.array([lat_mean, lon_mean])

    # Run on whatever device the model lives on instead of relying on a
    # module-level ``device`` global; a parameter-less model leaves the
    # inputs where they are.
    first_param = next(model.parameters(), None)

    all_distances = []
    model.eval()
    with torch.no_grad():
        for images, gps_coords in dataloader:
            if first_param is not None:
                images = images.to(first_param.device)
            outputs = model(images)

            preds_denorm = outputs.cpu().numpy() * scale + offset
            # Targets are only consumed by NumPy, so no device transfer needed.
            actuals_denorm = gps_coords.cpu().numpy() * scale + offset

            all_distances.extend(
                geodesic((actual[0], actual[1]), (pred[0], pred[1])).meters
                for pred, actual in zip(preds_denorm, actuals_denorm)
            )
            break  # deliberately stop after the first batch

    if not all_distances:
        # Same values NumPy would yield for an empty mean, minus the warning.
        return float("nan"), float("nan")

    mean_error = np.mean(all_distances)
    rmse_error = np.sqrt(np.mean(np.square(all_distances)))
    return mean_error, rmse_error
|
# Score the ensemble on the first test batch and report both error metrics.
mean_error, rmse_error = evaluate_model_single_batch(
    model=ensemble_model,
    dataloader=test_dataloader,
    lat_mean=lat_mean,
    lat_std=lat_std,
    lon_mean=lon_mean,
    lon_std=lon_std,
)
print(f"Mean Error (meters): {mean_error:.2f}, RMSE (meters): {rmse_error:.2f}")
|
``` |