Lecture 7: Convolutional Neural Networks

Handling image data

Joaquin Vanschoren, Eindhoven University of Technology


  • Image convolution

  • Convolutional neural networks

  • Data augmentation

  • Model interpretation

  • Using pre-trained networks (transfer learning)

  • Operation that transforms an image by sliding a smaller image (called a filter or kernel) over the image, multiplying the overlapping pixel values element-wise and summing the products

from __future__ import print_function
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, Dropdown
from skimage import color

# Visualize convolution. See https://tonysyu.github.io/
def iter_pixels(image):
    """Generate ``((row, column), intensity)`` pairs in row-major order.

    Only the first two axes of ``image`` are walked, so for an array with
    trailing axes the yielded "intensity" is the sub-array at that position.
    """
    n_rows, n_cols = image.shape[:2]
    yield from (
        ((row, col), image[row, col])
        for row in range(n_rows)
        for col in range(n_cols)
    )
# Visualize result
def imshow_pair(image_pair, titles=('', ''), figsize=(8, 4), **kwargs):
    """Draw two images side by side in a freshly created figure.

    ``image_pair`` and ``titles`` are matched positionally with the two axes;
    extra keyword arguments are forwarded to ``Axes.imshow``.
    """
    _, axes = plt.subplots(ncols=2, figsize=figsize)
    panels = zip(axes.ravel(), image_pair, titles)
    for axis, picture, caption in panels:
        axis.imshow(picture, **kwargs)
        # Title size follows the notebook-wide ``fig_scale`` factor.
        axis.set_title(caption, fontdict={'fontsize':32*fig_scale})
# Visualize result
def imshow_triple(axes, image_pair, titles=('', '', ''), figsize=(8, 4), **kwargs):
    """Draw images onto already existing axes, one image per axis.

    ``axes``, ``image_pair`` and ``titles`` are zipped, so drawing stops at
    the shortest of the three. ``figsize`` is accepted but not used here (the
    caller owns the figure). Remaining keyword arguments go to ``Axes.imshow``.
    """
    panels = zip(axes, image_pair, titles)
    for axis, picture, caption in panels:
        axis.imshow(picture, **kwargs)
        axis.set_title(caption, fontdict={'fontsize':10*fig_scale})
# Zero-padding
def padding_for_kernel(kernel):
    """ Return the amount of padding needed for each side of an image.

    For example, if the returned result is [1, 2], then this means an
    image should be padded with 1 extra row on top and bottom, and 2
    extra columns on the left and right.

    Raises AssertionError if either of the first two kernel dimensions
    is even.
    """
    # Slice to ignore RGB channels if they exist.
    image_shape = kernel.shape[:2]
    # We only handle kernels with odd dimensions so make sure that's true.
    # (The "center" pixel of an even number of pixels is arbitrary.)
    assert all((size % 2) == 1 for size in image_shape)
    return [(size - 1) // 2 for size in image_shape]
def add_padding(image, kernel):
    """Zero-pad the two axes of ``image`` so ``kernel`` fits at every pixel.

    The pad widths come from ``padding_for_kernel``; the same amount is added
    before and after each axis.
    """
    rows, cols = padding_for_kernel(kernel)
    pad_widths = ((rows, rows), (cols, cols))
    return np.pad(image, pad_widths, mode='constant', constant_values=0)
def remove_padding(image, kernel):
    """Strip the border that ``add_padding`` added for ``kernel``.

    Returns the inner region of ``image``: ``pad`` rows/columns are dropped
    from both ends of each of the first two axes, where ``pad`` comes from
    ``padding_for_kernel``. An axis with zero padding is kept whole.
    """
    # NumPy no longer accepts a *list* of slices as a multi-axis index,
    # so the per-axis slices are collected into a tuple.
    inner_region = tuple(
        np.s_[:] if pad == 0 else np.s_[pad:-pad]
        for pad in padding_for_kernel(kernel)
    )
    return image[inner_region]

# Slice windows
def window_slice(center, kernel):
    """Return the 2D slice covering the kernel footprint around ``center``.

    ``center`` is a (row, column) position; the window extends by the
    kernel's padding amount in each direction.
    """
    row, col = center
    half_h, half_w = padding_for_kernel(kernel)
    # Slice stops are exclusive, hence the +1 to include the far edge.
    row_span = slice(row - half_h, row + half_h + 1)
    col_span = slice(col - half_w, col + half_w + 1)
    return (row_span, col_span)

# Apply convolution kernel to image patch
def apply_kernel(center, kernel, original_image):
    """Compute the filter response of ``kernel`` centered at ``center``.

    The image patch under the kernel is multiplied element-wise with the
    kernel and the products are summed into a single value.
    """
    patch = original_image[window_slice(center, kernel)]
    weighted = kernel * patch
    return np.sum(weighted)

# Move kernel over the image
def iter_kernel_labels(image, kernel):
    """Yield ``(center, mask)`` for every pixel of ``image``, in scan order.

    ``center`` is the pixel position in the zero-padded image. ``mask`` is an
    integer array with the padded image's shape that is zero everywhere
    except in the kernel window, where it holds the kernel values (cast to
    int by the mask's dtype).
    """
    padded_shape = add_padding(image, kernel).shape
    row_offset, col_offset = padding_for_kernel(kernel)

    for (row, col), _ in iter_pixels(image):
        # Positions are shifted so they index into the padded image.
        center = (row + row_offset, col + col_offset)
        mask = np.zeros(padded_shape, dtype=int)
        mask[window_slice(center, kernel)] = kernel
        yield center, mask

# Visualize kernel as it moves over the image
def visualize_kernel(kernel_labels, image):
    """Overlay the kernel mask on the image by element-wise addition."""
    overlay = kernel_labels + image
    return overlay

def convolution_demo(image, kernels, **kwargs):
    """Interactive step-by-step convolution viewer (ipywidgets).

    Shows a dropdown to pick one of ``kernels`` (a name -> array mapping) and
    a slider over the pixel index. For each step the kernel overlay and the
    partially filtered image are drawn with ``imshow_pair``; ``kwargs`` are
    forwarded to it. Returns nothing.
    """
    # Dropdown for selecting kernels
    kernel_names = list(kernels.keys())
    kernel_selector = Dropdown(options=kernel_names, description='Kernel:')

    def update_convolution(kernel_name):
        kernel = kernels[kernel_name]  # Get the selected kernel
        # The generator is consumed once; earlier steps are served from cache.
        gen_kernel_labels = iter_kernel_labels(image, kernel)
        image_cache = []
        image_padded = add_padding(image, kernel)

        def convolution_step(i_step=0):
            # Fill the cache up to the requested step.
            while i_step >= len(image_cache):
                # Start from the padded input while nothing is cached yet.
                # Testing the cache (not ``i_step == 0``) keeps this safe
                # when the first requested step is not step 0.
                filtered_prev = image_cache[-1][1] if image_cache else image_padded
                filtered = filtered_prev.copy()
                center, kernel_labels = next(gen_kernel_labels)
                filtered[center] = apply_kernel(center, kernel, image_padded)
                kernel_overlay = visualize_kernel(kernel_labels, image_padded)
                image_cache.append((kernel_overlay, filtered))
            image_pair = [remove_padding(each, kernel) for each in image_cache[i_step]]
            imshow_pair(image_pair, **kwargs)

        interact(convolution_step, i_step=(0, image.size - 1, 1))

    interact(update_convolution, kernel_name=kernel_selector)

# Full process
def convolution_full(ax, image, kernel, **kwargs):
    """Convolve ``image`` with ``kernel`` and plot the final state on ``ax``.

    ``ax`` is a sequence of three axes. They receive, in order: the padded
    input with the kernel overlaid at its last position, the kernel itself,
    and the fully filtered image. ``kwargs`` are forwarded to
    ``imshow_triple`` (e.g. ``titles``, ``vmin``, ``vmax``, ``cmap``).
    """
    image_padded = add_padding(image, kernel)
    # Only the final state is shown, so one output buffer is updated in
    # place instead of keeping a copy for every step.
    filtered = image_padded.copy()
    # Fallback overlay for an empty image, where the loop never runs.
    kernel_labels = np.zeros(image_padded.shape, dtype=int)

    # Visit every pixel, including the last one.
    for center, kernel_labels in iter_kernel_labels(image, kernel):
        # Always read from the untouched padded input, write to the copy.
        filtered[center] = apply_kernel(center, kernel, image_padded)

    # Take the original image and overlay the kernel at its last position.
    kernel_overlay = visualize_kernel(kernel_labels, image_padded)

    # Remove padding we added to deal with boundary conditions.
    image_triple = [remove_padding(each, kernel)
                    for each in (kernel_overlay, filtered)]
    # The middle panel shows the kernel, matching the three axes/titles
    # that callers pass in.
    image_triple.insert(1, kernel)
    imshow_triple(ax, image_triple, **kwargs)
  • Different kernels can detect different types of patterns in the image

# 3x3 example kernels. The horizontal and generic edge kernels sum to zero,
# so flat image regions give a zero response.
horizontal_edge_kernel = np.array([[ 1,  2,  1],
                                   [ 0,  0,  0],
                                   [-1, -2, -1]])
diagonal_edge_kernel = np.array([[1, 0, 0],
                                 [0, 1, 0],
                                 [0, 0, 1]])
edge_detect_kernel = np.array([[-1, -1, -1],
                               [-1,  8, -1],
                               [-1, -1, -1]])
# Name -> kernel mapping used to populate the kernel dropdown.
all_kernels = {"horizontal": horizontal_edge_kernel,
               "diagonal": diagonal_edge_kernel,
               "edge_detect": edge_detect_kernel}
mnist_data = oml.datasets.get_dataset(554) # Download MNIST data
# Get the predictors X and the labels y
# (the last two returned values, `c` and `a`, are not used in the visible code)
X_mnist, y_mnist, c, a = mnist_data.get_data(dataset_format='array', target=mnist_data.default_target_attribute); 
# Take the sample at index 1 and view it as a 28x28 image
image = X_mnist[1].reshape((28, 28))
image = (image - np.min(image))/np.ptp(image) # Normalize

# Interactive, step-by-step view
titles = ('Image and kernel', 'Filtered image')
convolution_demo(image, all_kernels, vmin=-4, vmax=4, titles=titles, cmap='gray_r');
# Static view: one row of three panels per kernel.
# `interactive` and `fig_scale` are defined elsewhere in the notebook.
if not interactive:
    fig, axs = plt.subplots(3,  3, figsize=(5*fig_scale, 5*fig_scale))
    titles = ('Image and kernel', 'Hor. edge filter', 'Filtered image')
    convolution_full(axs[0,:], image, horizontal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    titles = ('Image and kernel', 'Edge detect filter', 'Filtered image')
    convolution_full(axs[1,:], image, edge_detect_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    titles = ('Image and kernel', 'Diag. edge filter', 'Filtered image')
    convolution_full(axs[2,:], image, diagonal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')

Demonstration on Fashion-MNIST

fmnist_data = oml.datasets.get_dataset(40996) # Download FMNIST data
# Get the predictors X and the labels y
X_fm, y_fm, _, _ = fmnist_data.get_data(dataset_format='array', target=fmnist_data.default_target_attribute)
# Human-readable names for the ten integer class labels
fm_classes = {0:"T-shirt/top", 1: "Trouser", 2: "Pullover", 3: "Dress", 4: "Coat", 5: "Sandal", 
              6: "Shirt", 7: "Sneaker", 8: "Bag", 9: "Ankle boot"}
def normalize_image(X):
    """Reshape a flat 784-value sample to 28x28 and min-max scale it to [0, 1].

    A constant image has no value range to scale by; it is returned as all
    zeros instead of dividing by zero (which would produce NaNs).
    """
    image = X.reshape((28, 28))
    shifted = image - np.min(image)
    value_range = np.ptp(image)
    if value_range == 0:
        # All pixels are equal, so ``shifted`` is already all zeros.
        return shifted.astype(float)
    return shifted / value_range

# Use the Fashion-MNIST sample at index 3 as the demo image
image = normalize_image(X_fm[3])
# Interactive view. convolution_demo has no return statement, so `demo2` is None.
demo2 = convolution_demo(image, all_kernels,
                 vmin=-4, vmax=4, cmap='gray_r');
# Static view: one row of three panels per kernel
if not interactive:
    fig, axs = plt.subplots(3, 3, figsize=(5*fig_scale, 5*fig_scale))
    titles = ('Image and kernel', 'Hor. edge filter', 'Filtered image')
    convolution_full(axs[0,:], image, horizontal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    titles = ('Image and kernel', 'Diag. edge filter', 'Filtered image')
    convolution_full(axs[1,:], image, diagonal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    titles = ('Image and kernel', 'Edge detect filter', 'Filtered image')
    convolution_full(axs[2,:], image, edge_detect_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')

Image convolution in practice

  • How do we know which filters are best for a given image?

  • Families of kernels (or filter banks ) can be run on every image

    • Gabor, Sobel, Haar Wavelets,…

  • Gabor filters: Wave patterns generated by changing:

    • Frequency: narrow or wide undulations

    • Theta: angle (direction) of the wave

    • Sigma: resolution (size of the filter)


from scipy import ndimage as ndi
from skimage import data
from skimage.util import img_as_float
from skimage.filters import gabor_kernel

# Gabor Filters.
def demoGabor(frequency=(0.01,1,0.05), theta=(0,3.14,0.1), sigma=(0,5,0.1)):
    """Plot the real part of a Gabor kernel on the current axes.

    ``sigma`` is used for both ``sigma_x`` and ``sigma_y``. The title reports
    the three parameters rounded to two decimals.

    NOTE(review): the tuple defaults look like ipywidgets (min, max, step)
    slider specs, presumably for use with ``interact``; a direct call needs
    scalar arguments - confirm.
    """
    kernel = gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)
    plt.imshow(np.real(kernel), interpolation='nearest', extent=[-1, 1, -1, 1])
    caption = f'freq: {round(frequency,2)}, theta: {round(theta,2)}, sigma: {round(sigma,2)}'
    plt.title(caption, fontdict={'fontsize':14*fig_scale})
# Static fallback: three example Gabor kernels side by side
if not interactive:
    plt.subplot(1, 3, 1)
    demoGabor(frequency=0.16, theta=1.2, sigma=4.0)
    plt.subplot(1, 3, 2)
    demoGabor(frequency=0.31, theta=0, sigma=3.6)
    plt.subplot(1, 3, 3)
    demoGabor(frequency=0.36, theta=1.6, sigma=1.3)

Demonstration on the Fashion-MNIST data

# Magnitude of the Gabor filter response for a given kernel and input image
def magnitude(image, kernel):
    """Return the per-pixel magnitude of the complex filter response.

    The image is standardized (zero mean, unit standard deviation), then
    convolved separately with the real and imaginary parts of ``kernel``
    using wrap-around borders; the two responses are combined as
    ``sqrt(real**2 + imag**2)``.
    """
    standardized = (image - image.mean()) / image.std()
    real_response = ndi.convolve(standardized, np.real(kernel), mode='wrap')
    imag_response = ndi.convolve(standardized, np.imag(kernel), mode='wrap')
    return np.sqrt(real_response**2 + imag_response**2)
def demoGabor2(frequency=(0.01,1,0.05), theta=(0,3.14,0.1), sigma=(0,5,0.1)):
    """Show the input image, a Gabor kernel and the response magnitude.

    Draws three panels in a 1x3 grid on the current figure. The input is the
    module-level ``image``; ``sigma`` is used for both ``sigma_x`` and
    ``sigma_y``. Each title gets its own subplot so none overwrites another.
    """
    # Build the kernel once; it feeds both the kernel and response panels.
    kernel = gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)

    plt.subplot(1, 3, 1)
    plt.title('Original', fontdict={'fontsize':24*fig_scale})
    plt.imshow(image)

    plt.subplot(1, 3, 2)
    plt.title('Gabor kernel', fontdict={'fontsize':24*fig_scale})
    plt.imshow(np.real(kernel), interpolation='nearest')

    plt.subplot(1, 3, 3)
    plt.title('Response magnitude', fontdict={'fontsize':24*fig_scale})
    plt.imshow(np.real(magnitude(image, kernel)), interpolation='nearest')
if not interactive:
    demoGabor2(frequency=0.16, theta=1.4, sigma=1.2)

Filter banks#

  • Different filters detect different edges, shapes,…

  • Not all seem useful

Convolutional neural nets#

  • Finding relationships between individual pixels and the correct class is hard

  • Simplify the problem by decomposing it into smaller problems

  • First, discover ‘local’ patterns (edges, lines, endpoints)

  • Representing such local patterns as features makes it easier to learn from them

    • Deeper layers will do that for us

  • We could use convolutions, but how to choose the filters?


Convolutional Neural Networks (ConvNets)#

  • Instead of manually designing the filters, we can also learn them based on data

    • Choose filter sizes (manually), initialize with small random weights

  • Forward pass: Convolutional layer slides the filter over the input, generates the output

  • Backward pass: Update the filter weights according to the loss gradients

  • Illustration for 1 filter:


Convolutional layers: Feature maps#

  • One filter is not sufficient to detect all relevant patterns in an image

  • A convolutional layer applies and learns \(d\) filters in parallel

  • Slide \(d\) filters across the input image (in parallel) -> a (1x1xd) output per patch

  • Reassemble into a feature map with \(d\) ‘channels’, a (width x height x d) tensor.


Border effects (zero padding)#

  • Consider a 5x5 image and a 3x3 filter: there are only 9 possible locations, hence the output is a 3x3 feature map

  • If we want to maintain the image size, we use zero-padding, adding 0’s all around the input tensor.

ml ml

Undersampling (striding)#

  • Sometimes, we want to downsample a high-resolution image

    • Faster processing, less noisy (hence less overfitting)

    • Forces the model to summarize information in (smaller) feature maps

  • One approach is to skip values during the convolution

    • Distance between 2 windows: stride length

  • Example with stride length 2 (without padding):



  • Another approach to shrink the input tensors is max-pooling :

    • Run a filter with a fixed stride length over the image

      • Usually 2x2 filters and stride length 2

    • The filter simply returns the max (or avg ) of all values

  • Aggressively reduces the number of weights (less overfitting)


Receptive field#

  • Receptive field: how much each output neuron ‘sees’ of the input image

  • Translation invariance: shifting the input does not affect the output

    • Large receptive field -> neurons can ‘see’ patterns anywhere in the input

  • \(n \times n\) convolutions only increase the receptive field by \(n-1\) each layer

  • Maxpooling doubles the receptive field without deepening the network

Dilated convolutions#

Convolutional nets in practice#

Example with PyTorch#

    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=0),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=0),
  • Observe how the input image of 1x28x28 is transformed to a 64x3x3 feature map

    • In pytorch, shapes are (batch_size, channels, height, width)

  • Conv2d parameters = (kernel size^2 × input channels + 1) × output channels

  • No zero-padding: every output is 2 pixels less in every dimension

  • After every MaxPooling, resolution halved in every dimension

from torchinfo import summary
summary(model, input_size=(1, 1, 28, 28))
Layer (type:depth-idx)                   Output Shape              Param #
Sequential                               [1, 64, 3, 3]             --
├─Conv2d: 1-1                            [1, 32, 26, 26]           320
├─ReLU: 1-2                              [1, 32, 26, 26]           --
├─MaxPool2d: 1-3                         [1, 32, 13, 13]           --
├─Conv2d: 1-4                            [1, 64, 11, 11]           18,496
├─ReLU: 1-5                              [1, 64, 11, 11]           --
├─MaxPool2d: 1-6                         [1, 64, 5, 5]             --
├─Conv2d: 1-7                            [1, 64, 3, 3]             36,928
├─ReLU: 1-8                              [1, 64, 3, 3]             --
Total params: 55,744
Trainable params: 55,744
Non-trainable params: 0
Total mult-adds (M): 2.79
Input size (MB): 0.00
Forward/backward pass size (MB): 0.24
Params size (MB): 0.22
Estimated Total Size (MB): 0.47
  • To classify the images, we still need a linear and output layer.

  • We flatten the 3x3x64 feature map to a vector of size 576

# Complete model: convolutional base followed by a dense classification head.
# The layer order matches the printed torchinfo summary (Conv -> ReLU -> pool).
model = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=0),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=0),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=0),
    nn.ReLU(),
    # Classification head: flatten (batch, 64, 3, 3) -> (batch, 576)
    nn.Flatten(),
    nn.Linear(64 * 3 * 3, 64),
    nn.ReLU(),
    nn.Linear(64, 10),  # One output per class
)

Complete model. Flattening adds a lot of weights!

Global Average Pooling (GAP)#

  • Instead of flattening, we do GAP: returns average of each activation map

  • We can drop the hidden dense layer: number of outputs > number of classes

model = nn.Sequential(...
    nn.AdaptiveAvgPool2d(1), # Global Average Pooling
    nn.Flatten(),            # Convert (batch, 64, 1, 1) -> (batch, 64)
    nn.Linear(64, 10))       # Output layer for 10 classes
  • With GlobalAveragePooling: much fewer weights to learn

  • Use with caution: this destroys the location information learned by the CNN

  • Not ideal for tasks such as object localization

# Same convolutional base, with Global Average Pooling instead of a
# flattened dense head.
model = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=0),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=0),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=0),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),  # Global Average Pooling (GAP)
    nn.Flatten(),  # Convert (batch, 64, 1, 1) -> (batch, 64)
    nn.Linear(64, 10),  # Output layer for 10 classes
)
Run the model on MNIST dataset

  • Train and test as usual: 99% accuracy

    • Compared to 97.8% accuracy with the dense architecture

    • Flatten and GlobalAveragePooling yield similar performance

Cats vs Dogs#

  • A more realistic dataset: Cats vs Dogs

    • Colored JPEG images, different sizes

    • Not nicely centered, translation invariance is important

  • Preprocessing

    • Decode JPEG images to floating-point tensors

    • Rescale pixel values to [0,1]

    • Resize images to 150x150 pixels

seed_everything(42)  # Set global seed

class CatDataModule(pl.LightningDataModule):
    """Data module for an image-folder dataset with train/validation splits.

    Expects ``data_dir`` to contain ``train`` and ``validation``
    sub-directories readable by ``datasets.ImageFolder``. ``setup()`` must
    run before the dataloader methods, since it creates the datasets.
    """

    def __init__(self, data_dir, batch_size=20, img_size=(150, 150)):
        super().__init__()  # Required initialisation of the Lightning base class
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.img_size = img_size

        # Define image transformations
        self.transform = transforms.Compose([
            transforms.Resize(self.img_size),  # Resize to 150x150
            transforms.ToTensor(),  # Convert to tensor (also scales 0-1)
        ])

    def setup(self, stage=None):
        """Load datasets"""
        train_dir = os.path.join(self.data_dir, "train")
        val_dir = os.path.join(self.data_dir, "validation")

        self.train_dataset = datasets.ImageFolder(root=train_dir, transform=self.transform)
        self.val_dataset = datasets.ImageFolder(root=val_dir, transform=self.transform)

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)

# ----------------------------
# Load dataset and visualize a batch
# ----------------------------
data_module = CatDataModule(data_dir=data_dir)
# The datasets are created in setup(); outside a Trainer it has to be called
# by hand before asking for a dataloader.
data_module.setup()
train_loader = data_module.train_dataloader()
Seed set to 42
# Get a batch of data
data_batch, labels_batch = next(iter(train_loader))

# Visualize images: the first 7 images of the batch in one row
plt.figure(figsize=(10, 5))
for i in range(7):
    plt.subplot(1, 7, i + 1)
    plt.imshow(data_batch[i].permute(1, 2, 0))  # Convert from (C, H, W) to (H, W, C)
    # Label 0 is shown as "Cat", any other label as "Dog"
    plt.title("Cat" if labels_batch[i] == 0 else "Dog", fontsize=16)

Data loader#

  • We create a Pytorch Lightning DataModule to do preprocessing and data loading

class ImageDataModule(pl.LightningDataModule):
  """Data module that resizes images and serves train/validation loaders.

  Expects ``data_dir`` to contain ``train`` and ``validation`` folders
  readable by ``datasets.ImageFolder``.
  """
  def __init__(self, data_dir, batch_size=20, img_size=(150, 150)):
    super().__init__()
    # Store the settings that the transform, setup() and the loaders read.
    self.data_dir = data_dir
    self.batch_size = batch_size
    self.img_size = img_size
    self.transform = transforms.Compose([
      transforms.Resize(self.img_size),  # Resize to 150x150
      transforms.ToTensor()])  # Convert to tensor (also scales 0-1)
  def setup(self, stage=None):
    train_dir = os.path.join(self.data_dir, "train")
    val_dir = os.path.join(self.data_dir, "validation")
    self.train_dataset = datasets.ImageFolder(root=train_dir, transform=self.transform)
    self.val_dataset = datasets.ImageFolder(root=val_dir, transform=self.transform)
  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
  def val_dataloader(self):
    return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)
from torchmetrics.classification import Accuracy

# Model in PyTorch Lightning
class CatImageClassifier(pl.LightningModule):
    """Binary image classifier: four conv blocks, GAP, and a dense head.

    The network outputs one logit per image. Training and validation use
    ``BCEWithLogitsLoss`` and log loss and binary accuracy.
    """

    def __init__(self, learning_rate=0.001):
        super().__init__()
        # Stores ``learning_rate`` in ``self.hparams``, which
        # ``configure_optimizers`` reads.
        self.save_hyperparameters()

        # Define convolutional layers (each conv is followed by a ReLU)
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(32, 64, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(128, 128, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.AdaptiveAvgPool2d(1),  # GAP replaces Flatten()
        )

        # Fully connected layers
        self.fc_layers = nn.Sequential(
            nn.Linear(128, 512),  # GAP outputs (batch, 128, 1, 1) → Flatten to (batch, 128)
            nn.ReLU(),  # Without it the two Linear layers collapse into one
            nn.Linear(512, 1),  # Binary classification (1 output neuron)
        )

        self.loss_fn = nn.BCEWithLogitsLoss()
        self.accuracy = Accuracy(task="binary")

    def forward(self, x):
        """Return one logit per image, shape (batch, 1)."""
        x = self.conv_layers(x)  # Convolutions + GAP
        x = x.view(x.size(0), -1)  # Flatten from (batch, 128, 1, 1) → (batch, 128)
        x = self.fc_layers(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and accuracy; return the loss."""
        x, y = batch
        logits = self(x).squeeze(1)  # Remove extra dimension
        loss = self.loss_fn(logits, y.float())  # BCE loss requires float labels

        preds = torch.sigmoid(logits)  # Convert logits to probabilities
        acc = self.accuracy(preds, y)

        self.log("train_loss", loss, prog_bar=True, on_epoch=True)
        self.log("train_acc", acc, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss and accuracy."""
        x, y = batch
        logits = self(x).squeeze(1)
        loss = self.loss_fn(logits, y.float())

        preds = torch.sigmoid(logits)
        acc = self.accuracy(preds, y)

        self.log("val_loss", loss, prog_bar=True, on_epoch=True)
        self.log("val_acc", acc, prog_bar=True, on_epoch=True)

    def configure_optimizers(self):
        """Adam over all parameters, using the stored learning rate."""
        return optim.Adam(self.parameters(), lr=self.hparams.learning_rate)


Since the images are more complex, we add another convolutional layer and increase the number of filters to 128.

  • We use a Trainer module (from PyTorch Lightning) to simplify training

trainer = pl.Trainer(
    max_epochs=20,        # Train for 20 epochs
    accelerator="gpu",    # Move data and model to GPU
    devices="auto",       # Number of GPUs
    deterministic=True,   # Set random seeds, for reproducibility
    callbacks=[metric_tracker,       # Callback for logging loss and acc
               checkpoint_callback], # Callback for logging weights
)
trainer.fit(model, datamodule=data_module)
  • Tip: to store the best model weights, you can add a ModelCheckpoint callback

checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",   # Save model with lowest val. loss
    mode="min",           # "min" for loss, "max" for accuracy
    save_top_k=1,         # Keep only the best model
    dirpath="weights/",   # Directory to save checkpoints
    filename="cat_model", # File name pattern
)

The model learns well for the first 20 epochs, but then starts overfitting a lot!

Solving overfitting in CNNs#

  • There are various ways to further improve the model:

    • Generating more training data (data augmentation)

    • Regularization (e.g. Dropout, L1/L2, Batch Normalization,…)

    • Use pretrained rather than randomly initialized filters

      • These are trained on a lot more data

Data augmentation#

  • Generate new images via image transformations (only on training data!)

    • Images will be randomly transformed every epoch

  • Update the transform in the data module

self.train_transform = transforms.Compose([
    transforms.Resize(self.img_size), # Resize to 150x150
    transforms.RandomRotation(40),    # Rotations up to 40 degrees
    transforms.RandomResizedCrop(self.img_size,
                                 scale=(0.8, 1.2)), # Scale + crop, up to 20%
    transforms.RandomHorizontalFlip(),              # Horizontal flip
    transforms.RandomAffine(degrees=0, shear=20),   # Shear, up to 20%
    transforms.ColorJitter(brightness=0.2, contrast=0.2, 
                           saturation=0.2),         # Color jitter
    transforms.ToTensor(),   # Normalize works on tensors, so convert first
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])
Augmentation example

We also add Dropout before the Dense layer, and L2 regularization ('weight decay') in Adam

class CatImageClassifier(pl.LightningModule):
    """Binary image classifier with Dropout and weight decay.

    Four conv blocks and GAP feed a dense head with Dropout. The network
    outputs one logit per image; Adam is configured with L2 regularization
    (``weight_decay``).
    """

    def __init__(self, learning_rate=0.001):
        super().__init__()
        # Stores ``learning_rate`` in ``self.hparams``, which
        # ``configure_optimizers`` reads.
        self.save_hyperparameters()

        # Define convolutional layers (CNN); each conv is followed by a ReLU
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(32, 64, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(128, 128, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),

            nn.AdaptiveAvgPool2d(1),  # GAP instead of Flatten
        )

        # Fully connected layers (FC) with Dropout
        self.fc_layers = nn.Sequential(
            nn.Linear(128, 512),  # GAP outputs (batch, 128, 1, 1) → Flatten to (batch, 128)
            nn.ReLU(),  # Without it the two Linear layers collapse into one
            nn.Dropout(0.5),  # Dropout (same as Keras Dropout(0.5))
            nn.Linear(512, 1),  # Binary classification (1 output neuron)
        )

        self.loss_fn = nn.BCEWithLogitsLoss()
        self.accuracy = Accuracy(task="binary")

    def forward(self, x):
        """Return one logit per image, shape (batch, 1)."""
        x = self.conv_layers(x)  # Convolutions + GAP
        x = x.view(x.size(0), -1)  # Flatten from (batch, 128, 1, 1) → (batch, 128)
        x = self.fc_layers(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and accuracy; return the loss."""
        x, y = batch
        logits = self(x).squeeze(1)  # Remove extra dimension
        loss = self.loss_fn(logits, y.float())  # BCE loss requires float labels

        preds = torch.sigmoid(logits)  # Convert logits to probabilities
        acc = self.accuracy(preds, y)

        self.log("train_loss", loss, prog_bar=True, on_epoch=True)
        self.log("train_acc", acc, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss and accuracy."""
        x, y = batch
        logits = self(x).squeeze(1)
        loss = self.loss_fn(logits, y.float())

        preds = torch.sigmoid(logits)
        acc = self.accuracy(preds, y)

        self.log("val_loss", loss, prog_bar=True, on_epoch=True)
        self.log("val_acc", acc, prog_bar=True, on_epoch=True)

    def configure_optimizers(self):
        """Adam with L2 regularization, using the stored learning rate."""
        return optim.Adam(self.parameters(), lr=self.hparams.learning_rate, weight_decay=1e-4)
model = CatImageClassifier()
summary(model, input_size=(1, 3, 150, 150))
Layer (type:depth-idx)                   Output Shape              Param #
CatImageClassifier                       [1, 1]                    --
├─Sequential: 1-1                        [1, 128, 1, 1]            --
│    └─Conv2d: 2-1                       [1, 32, 148, 148]         896
│    └─ReLU: 2-2                         [1, 32, 148, 148]         --
│    └─MaxPool2d: 2-3                    [1, 32, 74, 74]           --
│    └─Conv2d: 2-4                       [1, 64, 72, 72]           18,496
│    └─ReLU: 2-5                         [1, 64, 72, 72]           --
│    └─MaxPool2d: 2-6                    [1, 64, 36, 36]           --
│    └─Conv2d: 2-7                       [1, 128, 34, 34]          73,856
│    └─ReLU: 2-8                         [1, 128, 34, 34]          --
│    └─MaxPool2d: 2-9                    [1, 128, 17, 17]          --
│    └─Conv2d: 2-10                      [1, 128, 15, 15]          147,584
│    └─ReLU: 2-11                        [1, 128, 15, 15]          --
│    └─MaxPool2d: 2-12                   [1, 128, 7, 7]            --
│    └─AdaptiveAvgPool2d: 2-13           [1, 128, 1, 1]            --
├─Sequential: 1-2                        [1, 1]                    --
│    └─Linear: 2-14                      [1, 512]                  66,048
│    └─ReLU: 2-15                        [1, 512]                  --
│    └─Dropout: 2-16                     [1, 512]                  --
│    └─Linear: 2-17                      [1, 1]                    513
Total params: 307,393
Trainable params: 307,393
Non-trainable params: 0
Total mult-adds (M): 234.16
Input size (MB): 0.27
Forward/backward pass size (MB): 9.68
Params size (MB): 1.23
Estimated Total Size (MB): 11.18

No more overfitting!

Real-world CNNs


  • Deeper architecture (16 layers): allows it to learn more complex high-level features

    • Textures, patterns, shapes,…

  • Small filters (3x3) work better: capture spatial information while reducing number of parameters

  • Max-pooling (2x2): reduces spatial dimension, improves translation invariance

    • Lower resolution forces model to learn robust features (less sensitive to small input changes)

    • Only after every 2 layers, otherwise dimensions reduce too fast

  • Downside: too many parameters, expensive to train



  • Inception modules: parallel branches learn features of different sizes and scales (3x3, 5x5, 7x7,…)

    • Add reduction blocks that reduce dimensionality via convolutions with stride 2

  • Factorized convolutions: a 3x3 conv. can be replaced by combining 1x3 and 3x1, and is 33% cheaper

    • A 5x5 can be replaced by combining 3x3 and 3x3, which can in turn be factorized as above

  • 1x1 convolutions, or Network-In-Network (NIN) layers help reduce the number of channels: cheaper

  • An auxiliary classifier adds an additional gradient signal deeper in the network


Factorized convolutions#

  • A 3x3 conv. can be replaced by combining 1x3 and 3x1, and is 33% cheaper



  • Residual (skip) connections: add earlier feature map to a later one (dimensions must match)

    • Information can bypass layers, reduces vanishing gradients, allows much deeper nets

  • Residual blocks: skip a small number of layers and repeat many times

    • Match dimensions through padding and 1x1 convolutions

    • When resolution drops, add 1x1 convolutions with stride 2

  • Can be combined with Inception blocks


Interpreting the model#

  • Let’s see what the convnet is learning exactly by observing the intermediate feature maps

  • We can do this easily by attaching a ‘hook’ to a layer so we can read its output (activation)

# Create a hook to extract intermediate output (activation)
def hook_fn(module, input, output): 
    nonlocal activation
    activation = output.detach()
# Add a hook to a specific layer
hook = model.features[layer_id].register_forward_hook(hook_fn)
# Do a forward pass without gradient computation
with torch.no_grad(): 
return activation

Result for a specific filter (Layer 0, Filter 0)

Let’s plot all the activations to see what happens in the network

  • 3rd convolutional layer: increasingly abstract: ears, nose, eyes

  • Last convolutional layer: more abstract patterns

  • Same layer, with dog image input

Spatial hierarchies#

Visualizing the learned filters#

  • Learned filters of second convolutional layer

The next layer starts showing some structures / textures

Last convolutional layer

We need to go deeper. Bigger networks, more data!#

VGG Layer 5

filters=[12, 16, 86, 110]
FV = FilterVisualizer(size=56, upscaling_steps=12, upscaling_factor=1.2)
FV.visualize_filters(5, filters=filters, blur=5)
VGG Layer 10

VGG Layer 17

Visualizing class activation#

  • We can also visualize which part of the input image had the greatest influence on the final classification. Helps to interpret what the model is paying attention to.

  • Class activation maps : produces a heatmap over the input image

    • Choose a convolution layer, do Global Average Pooling (GAP) to get one output per filter

    • Get the weights between those outputs and the class of interest

    • Compute the weighted sum of all filter activations: combines what each filter is responding to and how much this affects the class prediction


Implementing gradCAM#

    target_layer = model.layer4[-1] # Last conv layer
    # Hooks to capture activations and gradients
    def forward_hook(module, input, output):
        activations = output

    def backward_hook(module, grad_input, grad_output):
        gradients = grad_output[0]
    # Register hooks

    # Forward pass
    output = model(img_tensor)
    pred_class = output.argmax(dim=1).item()
    # Backward pass to compute gradients
    output[:, pred_class].backward()

    # Compute Grad-CAM heatmap
    weights = torch.mean(gradients, dim=[2, 3], keepdim=True)  # GAP layer
    heatmap = torch.sum(weights * activations, dim=1).squeeze()

Example on ResNet50 with a specific input image (class Elephant)

Transfer learning#

  • We can re-use pretrained networks instead of training from scratch

  • Learned features can be a generic model of the visual world

  • Use the convolutional base to construct features, then train any classifier on new data

  • Also called transfer learning , which is a kind of meta-learning

  • Let’s instantiate the VGG16 model (without the dense layers)

  • The final feature map has shape (7, 7, 512)

vgg16_model = vgg16(pretrained=True)
conv_base = vgg16_model.features
  • Add your custom network on top of an already trained base network.

  • Freeze the base network, but unfreeze the last block of conv layers.

class FineTunedClassifier(nn.Module):
    """Pretrained convolutional base with a small trainable binary head.

    All parameters of ``conv_base`` are frozen first; the child modules from
    index ``unfreeze_from`` onwards are then made trainable again. The head
    flattens the feature map and produces one logit per image.
    """

    def __init__(self, conv_base, unfreeze_from=10):
        super().__init__()
        self.conv_base = conv_base  # Pretrained feature extractor

        # Freeze all layers first
        for param in self.conv_base.parameters():
            param.requires_grad = False
        # Unfreeze deeper layers
        for layer in list(self.conv_base.children())[unfreeze_from:]:
            for param in layer.parameters():
                param.requires_grad = True

        self.flatten = nn.Flatten()  # Flatten feature maps
        self.fc = nn.Sequential(
            nn.Linear(512 * 4 * 4, 256),  # Adjust input size based on feature maps
            nn.ReLU(),  # Without it the two Linear layers collapse into one
            nn.Linear(256, 1))

    def forward(self, x):
        """Return one logit per image, shape (batch, 1)."""
        x = self.conv_base(x)
        x = self.flatten(x)
        return self.fc(x)


ml ml
  • Convnets are ideal for addressing image-related problems.

  • They learn a hierarchy of modular patterns and concepts to represent the visual world.

  • Representations are easy to inspect

  • Data augmentation helps fight overfitting

  • You can use a pretrained convnet to build better models via transfer learning