|
### Relevant imports & set up |
|
```python |
|
!pip install geopy > delete.txt |
|
!pip install datasets > delete.txt |
|
!pip install torch torchvision datasets > delete.txt |
|
!pip install huggingface_hub > delete.txt |
|
!rm delete.txt |
|
``` |
|
|
|
```python |
|
!pip install transformers |
|
import transformers |
|
``` |
|
|
|
```python |
|
!huggingface-cli login --token [your_token] |
|
``` |
|
|
|
```python |
|
# Normalization statistics for the GPS targets, in degrees. Further down they
# are passed to GPSImageDataset as the training mean and std, and reused to
# turn the ensemble's normalized outputs back into coordinates.
lat_mean = 39.95156937654321  # latitude mean
lat_std = 0.0005992518588323268  # latitude standard deviation
lon_mean = -75.19136795987654  # longitude mean
lon_std = 0.0007030395253318959  # longitude standard deviation
|
``` |
|
|
|
### Instructions |
|
Our current best performing model is an ensemble of multiple models. To run it on hidden test data, first run the model definitions. |
|
|
|
#### Load and define models |
|
|
|
```python |
|
from transformers import AutoModelForImageClassification, PretrainedConfig, PreTrainedModel |
|
import torch |
|
import torch.nn as nn |
|
import os |
|
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download |
|
|
|
class CustomConvNeXtConfig(PretrainedConfig):
    """Configuration for the custom ConvNeXt wrapper.

    Args:
        num_labels: Number of output dimensions of the model head.
        **kwargs: Passed through unchanged to ``PretrainedConfig``.
    """

    model_type = "custom-convnext"

    def __init__(self, num_labels=2, **kwargs):
        super().__init__(**kwargs)
        # Set after the base initializer has run so this value is the one kept.
        self.num_labels = num_labels
|
|
|
class CustomConvNeXtModel(PreTrainedModel):
    """ConvNeXt image model whose classifier is replaced by a new linear head.

    The wrapped network is created with
    ``AutoModelForImageClassification.from_pretrained(model_name)`` and its
    ``classifier`` attribute is swapped for ``nn.Linear(in_features,
    num_classes)``.
    """

    config_class = CustomConvNeXtConfig

    def __init__(self, config, model_name="facebook/convnext-tiny-224",
                 num_classes=2, train_final_layer_only=False):
        """Build the wrapped network and attach the new head.

        Args:
            config: Configuration object handed to ``PreTrainedModel``.
            model_name: Identifier of the base ConvNeXt checkpoint to load.
            num_classes: Output width of the replacement classifier.
            train_final_layer_only: When true, every parameter whose name does
                not contain ``"classifier"`` is frozen.
        """
        super().__init__(config)

        backbone = AutoModelForImageClassification.from_pretrained(model_name)
        # Size the new head from the classifier that ships with the checkpoint.
        head_inputs = backbone.classifier.in_features
        backbone.classifier = nn.Linear(head_inputs, num_classes)
        self.convnext = backbone

        if train_final_layer_only:
            for name, param in self.convnext.named_parameters():
                if "classifier" in name:
                    print(f"Unfrozen layer: {name}")
                else:
                    param.requires_grad = False

    def forward(self, x):
        """Return whatever the wrapped ConvNeXt model returns for ``x``."""
        return self.convnext(x)

    @classmethod
    def from_pretrained(cls, repo_id, model_name="facebook/convnext-tiny-224", **kwargs):
        """Rebuild the model from ``model.safetensors`` and ``config.json``.

        Args:
            repo_id: Hugging Face Hub repository holding both files.
            model_name: Base checkpoint used to construct the architecture.
            **kwargs: Accepted but not used.

        Returns:
            The model with the downloaded state dict loaded into it.
        """
        weights_file = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
        config_file = hf_hub_download(repo_id=repo_id, filename="config.json")

        config = CustomConvNeXtConfig.from_pretrained(config_file)
        model = cls(config=config, model_name=model_name, num_classes=config.num_labels)

        # safetensors is imported only at the point where it is needed.
        from safetensors.torch import load_file

        model.load_state_dict(load_file(weights_file))
        return model
|
|
|
|
|
class CustomResNetConfig(PretrainedConfig):
    """Configuration for the custom ResNet wrapper.

    Args:
        num_labels: Number of output dimensions of the model head.
        **kwargs: Passed through unchanged to ``PretrainedConfig``.
    """

    model_type = "custom-resnet"

    def __init__(self, num_labels=2, **kwargs):
        super().__init__(**kwargs)
        # Set after the base initializer has run so this value is the one kept.
        self.num_labels = num_labels
|
|
|
class CustomResNetModel(nn.Module, PyTorchModelHubMixin):
    """ResNet image model whose classifier is replaced by a new linear head.

    The wrapped network is created with
    ``AutoModelForImageClassification.from_pretrained(model_name)``; its
    ``classifier`` becomes ``Sequential(Flatten, Linear(in_features,
    num_classes))``.
    """

    config_class = CustomResNetConfig

    def __init__(self, model_name="microsoft/resnet-18",
                 num_classes=2,
                 train_final_layer_only=False):
        """Build the wrapped network and attach the new head.

        Args:
            model_name: Identifier of the base ResNet checkpoint to load.
            num_classes: Output width of the replacement classifier.
            train_final_layer_only: When true, every parameter whose name does
                not contain ``"classifier"`` is frozen.
        """
        super().__init__()

        self.resnet = AutoModelForImageClassification.from_pretrained(model_name)

        # The stock classifier is a Sequential whose element 1 is the Linear
        # layer; read its input width before replacing the whole head.
        in_features = self.resnet.classifier[1].in_features

        # Keep the (Flatten, Linear) layout: the parameter names in the state
        # dict (classifier.1.*) follow from it.
        self.resnet.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features, num_classes),
        )

        self.config = CustomResNetConfig(num_labels=num_classes)

        if train_final_layer_only:
            for name, param in self.resnet.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False
                else:
                    print(f"Unfrozen layer: {name}")

    def forward(self, x):
        """Return whatever the wrapped ResNet model returns for ``x``."""
        return self.resnet(x)

    def save_pretrained(self, save_directory, **kwargs):
        """Write ``pytorch_model.bin`` and the configuration to a directory.

        Args:
            save_directory: Target directory; created if it does not exist.
            **kwargs: Accepted but not used.
        """
        os.makedirs(save_directory, exist_ok=True)

        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))
        self.config.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, repo_id, model_name="microsoft/resnet-18", **kwargs):
        """Load weights and configuration from the Hub or a local directory.

        Args:
            repo_id: Either a Hugging Face Hub repository id, or the path of an
                existing local directory such as one written by
                ``save_pretrained``. An existing directory takes precedence.
            model_name: Base checkpoint used to construct the architecture.
            **kwargs: Accepted but not used.

        Returns:
            The model with the stored state dict loaded (tensors mapped to CPU).
        """
        if os.path.isdir(repo_id):
            # Local round trip: read the files save_pretrained() wrote.
            model_path = os.path.join(repo_id, "pytorch_model.bin")
            config_path = os.path.join(repo_id, "config.json")
        else:
            model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
            config_path = hf_hub_download(repo_id=repo_id, filename="config.json")

        config = CustomResNetConfig.from_pretrained(config_path)
        model = cls(model_name=model_name, num_classes=config.num_labels)

        # The file is a pickle that may come from the network. weights_only=True
        # restricts unpickling to tensors and plain containers, which is all a
        # state dict holds.
        state_dict = torch.load(
            model_path,
            map_location=torch.device("cpu"),
            weights_only=True,
        )
        model.load_state_dict(state_dict)

        return model
|
|
|
|
|
class CustomEfficientNetConfig(PretrainedConfig):
    """Configuration for the custom EfficientNet wrapper.

    Args:
        num_labels: Number of output dimensions of the model head.
        **kwargs: Passed through unchanged to ``PretrainedConfig``.
    """

    model_type = "custom-efficientnet"

    def __init__(self, num_labels=2, **kwargs):
        super().__init__(**kwargs)
        # Set after the base initializer has run so this value is the one kept.
        self.num_labels = num_labels
|
|
|
class CustomEfficientNetModel(PreTrainedModel):
    """EfficientNet image model whose classifier is replaced by a new head.

    The wrapped network is created with
    ``AutoModelForImageClassification.from_pretrained(model_name)``; its
    ``classifier`` becomes ``Sequential(Linear(in_features, num_classes))``.
    """

    config_class = CustomEfficientNetConfig

    def __init__(self, config, model_name="google/efficientnet-b0",
                 num_classes=2, train_final_layer_only=False):
        """Build the wrapped network and attach the new head.

        Args:
            config: Configuration object handed to ``PreTrainedModel``.
            model_name: Identifier of the base EfficientNet checkpoint to load.
            num_classes: Output width of the replacement classifier.
            train_final_layer_only: When true, every parameter whose name does
                not contain ``"classifier"`` is frozen.
        """
        super().__init__(config)

        self.efficientnet = AutoModelForImageClassification.from_pretrained(model_name)

        in_features = self.efficientnet.classifier.in_features

        # Keep the one-element Sequential wrapper: the parameter names in the
        # state dict (classifier.0.*) follow from it.
        self.efficientnet.classifier = nn.Sequential(
            nn.Linear(in_features, num_classes),
        )

        if train_final_layer_only:
            for name, param in self.efficientnet.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False
                else:
                    print(f"Unfrozen layer: {name}")

    def forward(self, x):
        """Return whatever the wrapped EfficientNet model returns for ``x``."""
        return self.efficientnet(x)

    @classmethod
    def from_pretrained(cls, repo_id, model_name="google/efficientnet-b0", **kwargs):
        """Rebuild the model from ``model.safetensors`` and ``config.json``.

        Args:
            repo_id: Hugging Face Hub repository holding both files.
            model_name: Base checkpoint used to construct the architecture.
            **kwargs: Accepted but not used.

        Returns:
            The model with the downloaded state dict loaded into it.

        Raises:
            ValueError: If ``model.safetensors`` cannot be downloaded or read.
        """
        # load_file was previously never imported in this scope, so the call
        # below raised NameError, which the except clause then reported as a
        # download failure. Import it here, outside the try block, so that a
        # missing safetensors package surfaces as an ImportError instead.
        from safetensors.torch import load_file

        try:
            model_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
            state_dict = load_file(model_path)
        except Exception as e:
            raise ValueError(
                f"Failed to download or load 'model.safetensors' from {repo_id}. Ensure the file exists."
            ) from e

        config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
        config = CustomEfficientNetConfig.from_pretrained(config_path)

        model = cls(config=config, model_name=model_name, num_classes=config.num_labels)
        model.load_state_dict(state_dict)

        return model
|
|
|
|
|
class CustomViTConfig(PretrainedConfig):
    """Configuration for the custom ViT wrapper.

    Args:
        num_labels: Number of output dimensions of the model head.
        **kwargs: Passed through unchanged to ``PretrainedConfig``.
    """

    model_type = "custom-vit"

    def __init__(self, num_labels=2, **kwargs):
        super().__init__(**kwargs)
        # Set after the base initializer has run so this value is the one kept.
        self.num_labels = num_labels
|
|
|
class CustomViTModel(PreTrainedModel):
    """ViT image model whose classifier is replaced by a new linear head.

    The wrapped network is created with
    ``AutoModelForImageClassification.from_pretrained(model_name)`` and its
    ``classifier`` attribute is swapped for ``nn.Linear(in_features,
    num_classes)``.
    """

    config_class = CustomViTConfig

    def __init__(self, config, model_name="google/vit-base-patch16-224",
                 num_classes=2, train_final_layer_only=False):
        """Build the wrapped network and attach the new head.

        Args:
            config: Configuration object handed to ``PreTrainedModel``.
            model_name: Identifier of the base ViT checkpoint to load.
            num_classes: Output width of the replacement classifier.
            train_final_layer_only: When true, every parameter whose name does
                not contain ``"classifier"`` is frozen.
        """
        super().__init__(config)

        self.vit = AutoModelForImageClassification.from_pretrained(model_name)

        in_features = self.vit.classifier.in_features
        self.vit.classifier = nn.Linear(in_features, num_classes)

        if train_final_layer_only:
            for name, param in self.vit.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False
                else:
                    print(f"Unfrozen layer: {name}")

    def forward(self, x):
        """Return whatever the wrapped ViT model returns for ``x``."""
        return self.vit(x)

    @classmethod
    def from_pretrained(cls, repo_id, model_name="google/vit-base-patch16-224", **kwargs):
        """Rebuild the model from ``model.safetensors`` and ``config.json``.

        Args:
            repo_id: Hugging Face Hub repository holding both files.
            model_name: Base checkpoint used to construct the architecture.
            **kwargs: Accepted but not used.

        Returns:
            The model with the downloaded state dict loaded into it.

        Raises:
            ValueError: If ``model.safetensors`` cannot be downloaded or read.
        """
        # load_file was previously never imported in this scope, so the call
        # below raised NameError, which the except clause then reported as a
        # download failure. Import it here, outside the try block, so that a
        # missing safetensors package surfaces as an ImportError instead.
        from safetensors.torch import load_file

        try:
            model_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
            state_dict = load_file(model_path)
        except Exception as e:
            raise ValueError(
                f"Failed to download or load 'model.safetensors' from {repo_id}. Ensure the file exists."
            ) from e

        config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
        config = CustomViTConfig.from_pretrained(config_path)

        model = cls(config=config, model_name=model_name, num_classes=config.num_labels)
        model.load_state_dict(state_dict)

        return model
|
|
|
|
|
class WeightedEnsembleModel(nn.Module):
    """Combine several models by a weighted sum of their logits."""

    def __init__(self, models, weights):
        """Store the member models and their weights.

        Args:
            models: Iterable of modules; registered through ``nn.ModuleList``
                so that ``.to()``, ``.eval()`` and friends reach every member.
            weights: One scalar weight per model, in the same order. The
                weights are used as given and are not normalized.

        Raises:
            ValueError: If the number of weights differs from the number of
                models (``zip`` would otherwise drop the surplus silently).
        """
        super().__init__()
        members = list(models)
        if len(members) != len(weights):
            raise ValueError(
                f"Expected one weight per model, got {len(members)} models "
                f"and {len(weights)} weights."
            )
        self.models = nn.ModuleList(members)
        self.weights = weights

    def forward(self, images):
        """Return the weighted sum of the members' logits for ``images``.

        Each member's output is used directly, or through its ``logits``
        attribute when it has one. The accumulator takes its shape, dtype and
        device from the first member's logits, so the ensemble is not tied to
        two output dimensions.

        Args:
            images: Batch passed unchanged to every member model.

        Returns:
            Tensor holding ``sum(weight_i * logits_i)`` over all members. With
            no members, a ``(batch, 2)`` tensor of zeros on ``images.device``.
        """
        ensemble_logits = None
        for model, weight in zip(self.models, self.weights):
            outputs = model(images)
            logits = outputs.logits if hasattr(outputs, "logits") else outputs
            if ensemble_logits is None:
                ensemble_logits = torch.zeros_like(logits)
            ensemble_logits = ensemble_logits + weight * logits

        if ensemble_logits is None:
            # Degenerate empty ensemble: keep the historical (batch, 2) zeros.
            return torch.zeros((images.size(0), 2), device=images.device)
        return ensemble_logits
|
|
|
|
|
|
|
``` |
|
|
|
|
|
Now, load the model weights from Hugging Face.
|
```python |
|
from transformers import AutoModelForImageClassification |
|
import torch |
|
from sklearn.metrics import mean_absolute_error, mean_squared_error |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
``` |
|
|
|
```python |
|
# Restore every ensemble member from its checkpoint repository. The first
# argument is the repository to download from; ``model_name`` is the base
# architecture used to construct the network before the weights are loaded.

# ResNet
resnet = CustomResNetModel.from_pretrained(
    "final-project-5190/model-resnet-50-base",
    model_name="microsoft/resnet-50",
)

# ConvNeXt
convnext = CustomConvNeXtModel.from_pretrained(
    "final-project-5190/model-convnext-tiny-reducePlateau",
    model_name="facebook/convnext-tiny-224",
)

# ViT
vit = CustomViTModel.from_pretrained(
    "final-project-5190/model-ViT-base",
    model_name="google/vit-base-patch16-224",
)

# EfficientNet
efficientnet = CustomEfficientNetModel.from_pretrained(
    "final-project-5190/model-efficientnet-b0-base",
    model_name="google/efficientnet-b0",
)

# weights[i] is the factor applied to models[i] in the ensemble.
# NOTE: these weights sum to 1.01, not 1.0.
models = [convnext, resnet, vit, efficientnet]
weights = [0.28, 0.26, 0.20, 0.27]
|
``` |
|
|
|
|
|
|
|
#### For data loading |
|
```python |
|
# Download |
|
from datasets import load_dataset, Image |
|
``` |
|
|
|
```python |
|
import torch |
|
import torch.nn as nn |
|
import torchvision.models as models |
|
import torchvision.transforms as transforms |
|
from torch.utils.data import DataLoader, Dataset |
|
from transformers import AutoImageProcessor, AutoModelForImageClassification, AutoConfig |
|
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download |
|
from PIL import Image |
|
import os |
|
import numpy as np |
|
|
|
class GPSImageDataset(Dataset):
    """Dataset yielding ``(image, normalized_gps)`` pairs.

    Every record of ``hf_dataset`` must provide the keys ``'image'``,
    ``'Latitude'`` and ``'Longitude'``. Coordinates are standardized with the
    supplied statistics; any statistic left as ``None`` is computed from the
    corresponding column of ``hf_dataset``.
    """

    def __init__(self, hf_dataset, transform=None, lat_mean=None, lat_std=None, lon_mean=None, lon_std=None):
        """Store the data source, the optional transform and the statistics.

        Args:
            hf_dataset: Indexable by integer (one record) and, when statistics
                have to be computed, by column name.
            transform: Optional callable applied to each image.
            lat_mean: Latitude mean, or ``None`` to compute it.
            lat_std: Latitude standard deviation, or ``None`` to compute it.
            lon_mean: Longitude mean, or ``None`` to compute it.
            lon_std: Longitude standard deviation, or ``None`` to compute it.
        """
        self.hf_dataset = hf_dataset
        self.transform = transform

        # Only fall back to the data when the caller gave no value.
        if lat_mean is None:
            lat_mean = np.mean(np.array(self.hf_dataset['Latitude']))
        if lat_std is None:
            lat_std = np.std(np.array(self.hf_dataset['Latitude']))
        if lon_mean is None:
            lon_mean = np.mean(np.array(self.hf_dataset['Longitude']))
        if lon_std is None:
            lon_std = np.std(np.array(self.hf_dataset['Longitude']))

        self.latitude_mean = lat_mean
        self.latitude_std = lat_std
        self.longitude_mean = lon_mean
        self.longitude_std = lon_std

    def __len__(self):
        """Return the number of records in the underlying dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image and its normalized GPS.

        Returns:
            ``(image, gps)`` where ``gps`` is a float32 tensor
            ``[normalized_latitude, normalized_longitude]``.
        """
        record = self.hf_dataset[idx]
        image = record['image']
        raw_lat = record['Latitude']
        raw_lon = record['Longitude']

        # Disabled: image = image.rotate(-90, expand=True)
        if self.transform:
            image = self.transform(image)

        normalized = [
            (raw_lat - self.latitude_mean) / self.latitude_std,
            (raw_lon - self.longitude_mean) / self.longitude_std,
        ]
        return image, torch.tensor(normalized, dtype=torch.float32)
|
``` |
|
|
|
```python |
|
# Image preprocessing pipelines.

# Training pipeline: random augmentation, then tensor conversion and
# per-channel normalization. The mean/std values match the widely used
# ImageNet channel statistics.
transform = transforms.Compose([
    transforms.RandomResizedCrop(224),  # random crop, resized to 224x224
    transforms.RandomHorizontalFlip(),
    # Disabled: transforms.RandomRotation(degrees=15)
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    # Disabled: transforms.GaussianBlur(kernel_size=(3, 5), sigma=(0.1, 2.0))
    # Disabled: transforms.RandomPerspective(distortion_scale=0.5, p=0.5)
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Inference pipeline: no random augmentation, only a fixed resize followed by
# the same tensor conversion and normalization.
inference_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])
|
``` |
|
|
|
Here's an example of us testing the ensemble on the released test set. To evaluate a different dataset, change the line that loads `release_data` below and run the rest of the code to obtain the RMSE.
|
|
|
```python |
|
# Load test data
# To evaluate another dataset, change only this line. GPSImageDataset reads
# the 'image', 'Latitude' and 'Longitude' fields of every record, so the
# replacement must provide them.
release_data = load_dataset("gydou/released_img", split="train")
|
``` |
|
|
|
```python |
|
# Wrap the released data for evaluation: deterministic preprocessing plus the
# training mean and std, so targets are normalized the same way as in training.
rel_dataset = GPSImageDataset(
    release_data,
    transform=inference_transform,
    lat_mean=lat_mean,
    lat_std=lat_std,
    lon_mean=lon_mean,
    lon_std=lon_std,
)

# Fixed order, batches of 32.
rel_dataloader = DataLoader(rel_dataset, shuffle=False, batch_size=32)
|
``` |
|
|
|
|
|
```python |
|
# Evaluate the weighted ensemble on ``rel_dataloader`` and report the error in
# degrees and in meters.

# The member list is spelled out here (same order as ``weights``) because the
# name ``models`` is rebound to the ``torchvision.models`` module by the
# data-loading imports above, so passing ``models`` would fail.
ensemble_model = WeightedEnsembleModel(
    models=[convnext, resnet, vit, efficientnet],
    weights=weights,
).to(device)

# De-normalize in float64. In float32 a coordinate near 40 or -75 degrees is
# only resolved to a few millionths of a degree (roughly half a meter), which
# adds noise to the error metrics.
target_std = torch.tensor([lat_std, lon_std], dtype=torch.float64)
target_mean = torch.tensor([lat_mean, lon_mean], dtype=torch.float64)

all_preds = []
all_actuals = []

ensemble_model.eval()
with torch.no_grad():
    for images, gps_coords in rel_dataloader:
        ensemble_logits = ensemble_model(images.to(device))

        # Map normalized outputs and targets back to degrees on the CPU.
        all_preds.append(ensemble_logits.cpu().double() * target_std + target_mean)
        all_actuals.append(gps_coords.cpu().double() * target_std + target_mean)

# Concatenate all batches
all_preds = torch.cat(all_preds).numpy()
all_actuals = torch.cat(all_actuals).numpy()

# Error metrics in degrees. Both metrics are averaged over the latitude and
# longitude columns; neither is a Euclidean distance.
# The RMSE is taken per column and then averaged, which is what
# ``squared=False`` did; that keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6, so it is no longer used.
mae = mean_absolute_error(all_actuals, all_preds)
rmse = np.sqrt(
    mean_squared_error(all_actuals, all_preds, multioutput="raw_values")
).mean()

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

# Convert predictions and actuals to meters. A degree of longitude shrinks
# with the cosine of the latitude, evaluated here at the mean latitude.
latitude_mean_radians = np.radians(lat_mean)
meters_per_degree_latitude = 111000
meters_per_degree_longitude = 111000 * np.cos(latitude_mean_radians)
meters_per_degree = np.array([meters_per_degree_latitude, meters_per_degree_longitude])

all_preds_meters = all_preds * meters_per_degree
all_actuals_meters = all_actuals * meters_per_degree

# Error metrics in meters, computed the same way as above.
mae_meters = mean_absolute_error(all_actuals_meters, all_preds_meters)
rmse_meters = np.sqrt(
    mean_squared_error(all_actuals_meters, all_preds_meters, multioutput="raw_values")
).mean()

print(f"Mean Absolute Error (meters): {mae_meters:.2f}")
print(f"Root Mean Squared Error (meters): {rmse_meters:.2f}")
|
|
|
``` |
|
|
|
After running inference on the release test set, our results are the following. |
|
- Release Dataset Mean Absolute Error: 0.0004267849560326909 |
|
- Release Dataset Root Mean Squared Error: 0.0005247778631268114 |
|
- Mean Absolute Error (meters): 41.90 |
|
- Root Mean Squared Error (meters): 51.29 |