Ubuntu
commited on
Commit
·
41b8141
1
Parent(s):
d695662
Added checkpoint and early stopping
Browse files- checkpoint.py +21 -0
- resnet_execute.py +24 -2
checkpoint.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
def save_checkpoint(model, optimizer, epoch, loss, checkpoint_path="checkpoint.pth"):
|
4 |
+
checkpoint = {
|
5 |
+
'epoch': epoch,
|
6 |
+
'model_state_dict': model.state_dict(),
|
7 |
+
'optimizer_state_dict': optimizer.state_dict(),
|
8 |
+
'loss': loss
|
9 |
+
}
|
10 |
+
torch.save(checkpoint, checkpoint_path)
|
11 |
+
print(f"Checkpoint saved at epoch {epoch}")
|
12 |
+
|
13 |
+
def load_checkpoint(model, optimizer, checkpoint_path="checkpoint.pth"):
|
14 |
+
checkpoint = torch.load(checkpoint_path, weights_only=True)
|
15 |
+
model.load_state_dict(checkpoint['model_state_dict'])
|
16 |
+
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
|
17 |
+
epoch = checkpoint['epoch']
|
18 |
+
loss = checkpoint['loss']
|
19 |
+
print(f"Checkpoint loaded, resuming from epoch {epoch}")
|
20 |
+
return model, optimizer, loss
|
21 |
+
|
resnet_execute.py
CHANGED
@@ -7,6 +7,7 @@ import torch.optim as optim
|
|
7 |
from resnet_model import ResNet50
|
8 |
from tqdm import tqdm
|
9 |
from torchvision import datasets
|
|
|
10 |
|
11 |
# Define transformations
|
12 |
transform = transforms.Compose([
|
@@ -89,11 +90,32 @@ def test(model, device, test_loader, criterion):
|
|
89 |
|
90 |
test_accuracy = 100.*correct/total
|
91 |
print(f'Test Loss: {test_loss/len(test_loader):.4f}, Accuracy: {test_accuracy:.2f}%')
|
92 |
-
return test_accuracy
|
93 |
|
94 |
# Main execution
|
95 |
if __name__ == '__main__':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
for epoch in range(1, 6): # 20 epochs
|
97 |
train_accuracy = train(model, device, trainloader, optimizer, criterion, epoch)
|
98 |
-
test_accuracy = test(model, device, testloader, criterion)
|
99 |
print(f'Epoch {epoch} | Train Accuracy: {train_accuracy:.2f}% | Test Accuracy: {test_accuracy:.2f}%')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
from resnet_model import ResNet50
|
8 |
from tqdm import tqdm
|
9 |
from torchvision import datasets
|
10 |
+
from checkpoint import save_checkpoint, load_checkpoint
|
11 |
|
12 |
# Define transformations
|
13 |
transform = transforms.Compose([
|
|
|
90 |
|
91 |
test_accuracy = 100.*correct/total
|
92 |
print(f'Test Loss: {test_loss/len(test_loader):.4f}, Accuracy: {test_accuracy:.2f}%')
|
93 |
+
return test_accuracy, test_loss/len(test_loader)
|
94 |
|
95 |
# Main execution
|
96 |
if __name__ == '__main__':
|
97 |
+
# Early stopping parameters and checkpoint path
|
98 |
+
checkpoint_path = "checkpoint.pth"
|
99 |
+
best_loss = float('inf')
|
100 |
+
patience = 5
|
101 |
+
patience_counter = 0
|
102 |
+
# Load checkpoint if it exists to resume training
|
103 |
+
try:
|
104 |
+
model, optimizer, best_test_accuracy = load_checkpoint(model, optimizer, checkpoint_path)
|
105 |
+
except FileNotFoundError:
|
106 |
+
print("No checkpoint found, starting from scratch.")
|
107 |
+
|
108 |
for epoch in range(1, 6): # 20 epochs
|
109 |
train_accuracy = train(model, device, trainloader, optimizer, criterion, epoch)
|
110 |
+
test_accuracy, test_loss = test(model, device, testloader, criterion)
|
111 |
print(f'Epoch {epoch} | Train Accuracy: {train_accuracy:.2f}% | Test Accuracy: {test_accuracy:.2f}%')
|
112 |
+
if test_loss < best_loss:
|
113 |
+
best_loss = test_loss
|
114 |
+
patience_counter = 0
|
115 |
+
save_checkpoint(model, optimizer, epoch, test_loss, checkpoint_path)
|
116 |
+
else:
|
117 |
+
patience_counter += 1
|
118 |
+
|
119 |
+
if patience_counter >= patience:
|
120 |
+
print("Early stopping triggered. Training terminated.")
|
121 |
+
break
|