atiwari751
/

ResNet50_replicate

Model card Files Files and versions Community

Ubuntu committed on Jan 3

Commit

41b8141

1 Parent(s): d695662

Added checkpoint and early stopping

Browse files

Files changed (2) hide show

checkpoint.py +21 -0
resnet_execute.py +24 -2

checkpoint.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import torch
+def save_checkpoint(model, optimizer, epoch, loss, checkpoint_path="checkpoint.pth"):  # Write a training snapshot (epoch, model state, optimizer state, loss) to checkpoint_path; returns None.
+    checkpoint = {  # everything that ends up in the checkpoint file
+        'epoch': epoch,  # epoch value exactly as passed by the caller
+        'model_state_dict': model.state_dict(),  # whatever model.state_dict() returns
+        'optimizer_state_dict': optimizer.state_dict(),  # whatever optimizer.state_dict() returns
+        'loss': loss  # loss value exactly as passed by the caller
+    }
+    torch.save(checkpoint, checkpoint_path)  # any existing file at checkpoint_path is replaced
+    print(f"Checkpoint saved at epoch {epoch}")  # progress message on stdout
+def load_checkpoint(model, optimizer, checkpoint_path="checkpoint.pth"):  # Restore model and optimizer state from checkpoint_path; returns (model, optimizer, loss).
+    checkpoint = torch.load(checkpoint_path,  weights_only=True)  # errors from torch.load (e.g. a missing file) are not caught here and propagate to the caller
+    model.load_state_dict(checkpoint['model_state_dict'])  # updates the model object that was passed in
+    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])  # updates the optimizer object that was passed in
+    epoch = checkpoint['epoch']  # used only for the message below
+    loss = checkpoint['loss']  # the loss value stored alongside the states
+    print(f"Checkpoint loaded, resuming from epoch {epoch}")
+    return model, optimizer, loss  # NOTE(review): the stored epoch is not returned, so callers cannot continue epoch numbering

resnet_execute.py CHANGED Viewed

@@ -7,6 +7,7 @@ import torch.optim as optim
 from resnet_model import ResNet50
 from tqdm import tqdm
 from torchvision import datasets
 # Define transformations
 transform = transforms.Compose([
@@ -89,11 +90,32 @@ def test(model, device, test_loader, criterion):
     test_accuracy = 100.*correct/total
     print(f'Test Loss: {test_loss/len(test_loader):.4f}, Accuracy: {test_accuracy:.2f}%')
-    return test_accuracy
 # Main execution
 if __name__ == '__main__':
     for epoch in range(1, 6):  # 20 epochs
         train_accuracy = train(model, device, trainloader, optimizer, criterion, epoch)
-        test_accuracy = test(model, device, testloader, criterion)
         print(f'Epoch {epoch} | Train Accuracy: {train_accuracy:.2f}% | Test Accuracy: {test_accuracy:.2f}%')

 from resnet_model import ResNet50
 from tqdm import tqdm
 from torchvision import datasets
+from checkpoint import save_checkpoint, load_checkpoint
 # Define transformations
 transform = transforms.Compose([
     test_accuracy = 100.*correct/total
     print(f'Test Loss: {test_loss/len(test_loader):.4f}, Accuracy: {test_accuracy:.2f}%')
+    return test_accuracy, test_loss/len(test_loader)
 # Main execution
 if __name__ == '__main__':
+    # Early stopping parameters and checkpoint path
+    checkpoint_path = "checkpoint.pth"
+    best_loss = float('inf')
+    patience = 5
+    patience_counter = 0
+    # Load checkpoint if it exists to resume training
+    try:
+        model, optimizer, best_test_accuracy = load_checkpoint(model, optimizer, checkpoint_path)
+    except FileNotFoundError:
+        print("No checkpoint found, starting from scratch.")
     for epoch in range(1, 6):  # 20 epochs
         train_accuracy = train(model, device, trainloader, optimizer, criterion, epoch)
+        test_accuracy, test_loss = test(model, device, testloader, criterion)
         print(f'Epoch {epoch} | Train Accuracy: {train_accuracy:.2f}% | Test Accuracy: {test_accuracy:.2f}%')
+        if test_loss < best_loss:
+            best_loss = test_loss
+            patience_counter = 0
+            save_checkpoint(model, optimizer, epoch, test_loss, checkpoint_path)
+        else:
+            patience_counter += 1
+        if patience_counter >= patience:
+            print("Early stopping triggered. Training terminated.")
+            break