Lecture 9: Convolutional Neural Networks#

Handling image data

Joaquin Vanschoren, Eindhoven University of Technology

Overview#

  • Image convolution

  • Convolutional neural networks

  • Data augmentation

  • Model interpretation

  • Using pre-trained networks (transfer learning)

Hide code cell source
# Auto-setup when running on Google Colab
import os
import tensorflow as tf
if 'google.colab' in str(get_ipython()) and not os.path.exists('/content/master'):
    !git clone -q https://github.com/ML-course/master.git /content/master
    !pip --quiet install -r /content/master/requirements_colab.txt
    %cd master/notebooks

# Global imports and settings
%matplotlib inline
from preamble import *
interactive = True # Set to True for interactive plots 
if interactive:
    fig_scale = 0.5
    plt.rcParams.update(print_config)
else: # For printing
    fig_scale = 0.4
    plt.rcParams.update(print_config)
Hide code cell source
data_dir = '../data/cats-vs-dogs_small'
model_dir = '../data/models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
Hide code cell source
# This notebook includes models that are too slow to rebuild often, 
# so we are downloading pretrained versions

# !pip install gdown # uncomment to install the downloader
import gdown
import zipfile

if not os.path.exists(os.path.join(model_dir,"cats_and_dogs_small_1.h5")):
    url = 'https://drive.google.com/uc?id=1p10QM5jvTSJsw3060JkKSBrxDy9HwBko'
    output = os.path.join(model_dir,"lecture9_models.zip")
    gdown.download(url, output, quiet=False)
    with zipfile.ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall(model_dir)
    os.remove(output)

download_cats = False    
if download_cats:
    url = 'https://drive.google.com/uc?id=1xojwqGMrwiWLBBcVNXurBZ0OyFqsS_V0'
    output = os.path.join(data_dir,"lecture9_data.zip")
    gdown.download(url, output, quiet=False)
    with zipfile.ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
    os.remove(output)

Convolutions#

  • Operation that transforms an image by sliding a smaller image (called a filter or kernel) over the image and multiplying the pixel values

    • Slide an \(n\) x \(n\) filter over \(n\) x \(n\) patches of the original image

    • Every pixel is replaced by the sum of the element-wise products of the values of the image patch around that pixel and the kernel

# kernel and image_patch are n x n matrices
pixel_out = np.sum(kernel * image_patch)
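For concreteness, here is a minimal NumPy sketch of the whole operation without padding (a 'valid' convolution); the toy image and kernel are made up for illustration and are not part of the lecture code.

import numpy as np

def convolve2d_valid(image, kernel):
    # Slide the kernel over every position where it fully fits inside the image
    kh, kw = kernel.shape
    out_h, out_w = image.shape[0] - kh + 1, image.shape[1] - kw + 1
    out = np.zeros((out_h, out_w))
    for i in range(out_h):
        for j in range(out_w):
            image_patch = image[i:i+kh, j:j+kw]
            out[i, j] = np.sum(kernel * image_patch)  # element-wise product, then sum
    return out

toy_image = np.arange(25, dtype=float).reshape(5, 5)   # made-up 5x5 'image'
toy_kernel = np.array([[1., 0., -1.]] * 3)             # simple vertical edge filter
print(convolve2d_valid(toy_image, toy_kernel).shape)   # (3, 3)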
Hide code cell source
from __future__ import print_function
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from skimage import color


# Visualize convolution. See https://tonysyu.github.io/
def iter_pixels(image):
    """ Yield pixel position (row, column) and pixel intensity. """
    height, width = image.shape[:2]
    for i in range(height):
        for j in range(width):
            yield (i, j), image[i, j]
            
# Visualize result
def imshow_pair(image_pair, titles=('', ''), figsize=(10, 5), **kwargs):
    fig, axes = plt.subplots(ncols=2, figsize=figsize)
    for ax, img, label in zip(axes.ravel(), image_pair, titles):
        ax.imshow(img, **kwargs)
        ax.set_title(label, fontdict={'fontsize':32*fig_scale})
        ax.set_xticks([])
        ax.set_yticks([])
        
# Visualize result
def imshow_triple(axes, image_pair, titles=('', '', ''), figsize=(10, 5), **kwargs):
    for ax, img, label in zip(axes, image_pair, titles):
        ax.imshow(img, **kwargs)
        ax.set_title(label, fontdict={'fontsize':10*fig_scale})
        ax.set_xticks([])
        ax.set_yticks([])
        
# Zero-padding
def padding_for_kernel(kernel):
    """ Return the amount of padding needed for each side of an image.

    For example, if the returned result is [1, 2], then this means an
    image should be padded with 1 extra row on top and bottom, and 2
    extra columns on the left and right.
    """
    # Slice to ignore RGB channels if they exist.
    image_shape = kernel.shape[:2]
    # We only handle kernels with odd dimensions so make sure that's true.
    # (The "center" pixel of an even number of pixels is arbitrary.)
    assert all((size % 2) == 1 for size in image_shape)
    return [(size - 1) // 2 for size in image_shape]
def add_padding(image, kernel):
    h_pad, w_pad = padding_for_kernel(kernel)
    return np.pad(image, ((h_pad, h_pad), (w_pad, w_pad)),
                  mode='constant', constant_values=0)
def remove_padding(image, kernel):
    inner_region = []  # A 2D slice for grabbing the inner image region
    for pad in padding_for_kernel(kernel):
        slice_i = np.s_[:] if pad == 0 else np.s_[pad: -pad]
        inner_region.append(slice_i)
    return image # [inner_region] # Broken in numpy 1.24, doesn't seem necessary

# Slice windows
def window_slice(center, kernel):
    r, c = center
    r_pad, c_pad = padding_for_kernel(kernel)
    # Slicing is (inclusive, exclusive) so add 1 to the stop value
    return np.s_[r-r_pad:r+r_pad+1, c-c_pad:c+c_pad+1]
        

# Apply convolution kernel to image patch
def apply_kernel(center, kernel, original_image):
    image_patch = original_image[window_slice(center, kernel)]
    # An element-wise multiplication followed by the sum
    return np.sum(kernel * image_patch)

# Move kernel over the image
def iter_kernel_labels(image, kernel):
    original_image = image
    image = add_padding(original_image, kernel)
    i_pad, j_pad = padding_for_kernel(kernel)

    for (i, j), pixel in iter_pixels(original_image):
        # Shift the center of the kernel to ignore padded border.
        i += i_pad
        j += j_pad
        mask = np.zeros(image.shape, dtype=int)  # Background = 0
        mask[window_slice((i, j), kernel)] = kernel   # Overlay the kernel values
        #mask[i, j] = 2                           # Kernel-center = 2
        yield (i, j), mask

# Visualize kernel as it moves over the image
def visualize_kernel(kernel_labels, image):
    return kernel_labels + image #color.label2rgb(kernel_labels, image, bg_label=0)

# Do a single step
def convolution_demo(image, kernel, **kwargs):
    # Initialize generator since we're only ever going to iterate over
    # a pixel once. The cached result is used, if we step back.
    gen_kernel_labels = iter_kernel_labels(image, kernel)

    image_cache = []
    image_padded = add_padding(image, kernel)
    # Plot original image and kernel-overlay next to filtered image.
    @interact(i_step=(0, image.size-1,1))
    def convolution_step(i_step=0):
        # Create all images up to the current step, unless they're already
        # cached:
        while i_step >= len(image_cache):

            # For the first step (`i_step == 0`), the original image is the
            # filtered image; after that we look in the cache, which stores
            # (`kernel_overlay`, `filtered`).
            filtered_prev = image_padded if i_step == 0 else image_cache[-1][1]
            # We don't want to overwrite the previously filtered image:
            filtered = filtered_prev.copy()

            # Get the labels used to visualize the kernel
            center, kernel_labels = next(gen_kernel_labels)
            # Modify the pixel value at the kernel center
            filtered[center] = apply_kernel(center, kernel, image_padded)
            # Take the original image and overlay our kernel visualization
            kernel_overlay = visualize_kernel(kernel_labels, image_padded)
            # Save images for reuse.
            image_cache.append((kernel_overlay, filtered))

        # Remove padding we added to deal with boundary conditions
        # (Loop since each step has 2 images)
        image_pair = [remove_padding(each, kernel)
                      for each in image_cache[i_step]]
        imshow_pair(image_pair, **kwargs)
        plt.show()

    return convolution_step

# Full process
def convolution_full(ax, image, kernel, **kwargs):
    # Initialize generator since we're only ever going to iterate over
    # a pixel once. The cached result is used, if we step back.
    gen_kernel_labels = iter_kernel_labels(image, kernel)
    print(gen_kernel_labels)

    image_cache = []
    image_padded = add_padding(image, kernel)
    # Plot original image and kernel-overlay next to filtered image.

    for i_step in range(image.size-1):

        # For the first step (`i_step == 0`), the original image is the
        # filtered image; after that we look in the cache, which stores
        # (`kernel_overlay`, `filtered`).
        filtered_prev = image_padded if i_step == 0 else image_cache[-1][1]
        # We don't want to overwrite the previously filtered image:
        filtered = filtered_prev.copy()

        # Get the labels used to visualize the kernel
        center, kernel_labels = next(gen_kernel_labels)
        # Modify the pixel value at the kernel center
        filtered[center] = apply_kernel(center, kernel, image_padded)
        # Take the original image and overlay our kernel visualization
        kernel_overlay = visualize_kernel(kernel_labels, image_padded)
        # Save images for reuse.
        image_cache.append((kernel_overlay, filtered))

    # Remove padding we added to deal with boundary conditions
    # (Loop since each step has 2 images)
    image_triple = [remove_padding(each, kernel)
                  for each in image_cache[i_step]]
    image_triple.insert(1,kernel)
    imshow_triple(ax, image_triple, **kwargs)
  • Different kernels can detect different types of patterns in the image

Hide code cell source
horizontal_edge_kernel = np.array([[ 1,  2,  1],
                                   [ 0,  0,  0],
                                   [-1, -2, -1]])
diagonal_edge_kernel = np.array([[1, 0, 0],
                                 [0, 1, 0],
                                 [0, 0, 1]])
edge_detect_kernel = np.array([[-1, -1, -1],
                               [-1,  8, -1],
                               [-1, -1, -1]])
Hide code cell source
mnist_data = oml.datasets.get_dataset(554) # Download MNIST data
# Get the predictors X and the labels y
X_mnist, y_mnist, c, a = mnist_data.get_data(dataset_format='array', target=mnist_data.default_target_attribute); 
image = X_mnist[1].reshape((28, 28))
image = (image - np.min(image))/np.ptp(image) # Normalize

titles = ('Image and kernel', 'Filtered image')
convolution_demo(image, horizontal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r');
Hide code cell source
if not interactive:
    fig, axs = plt.subplots(3,  3, figsize=(5*fig_scale, 5*fig_scale))
    titles = ('Image and kernel', 'Hor. edge filter', 'Filtered image')
    convolution_full(axs[0,:], image, horizontal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    titles = ('Image and kernel', 'Edge detect filter', 'Filtered image')
    convolution_full(axs[1,:], image, edge_detect_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    titles = ('Image and kernel', 'Diag. edge filter', 'Filtered image')
    convolution_full(axs[2,:], image, diagonal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    plt.tight_layout()

Demonstration on Google Street View data#

House numbers photographed from Google Street View imagery, cropped and centered around digits, but with neighboring numbers or other edge artifacts.

Hide code cell source
SVHN = oml.datasets.get_dataset(41081)
X, y, cats, attrs = SVHN.get_data(dataset_format='array',
    target=SVHN.default_target_attribute)
Hide code cell source
def plot_images(X, y, grayscale=False):
    fig, axes = plt.subplots(1, len(X),  figsize=(10, 5))
    for n in range(len(X)):
        if grayscale:
            axes[n].imshow(X[n].reshape(32, 32)/255, cmap='gray')
        else:
            axes[n].imshow(X[n].reshape(32, 32, 3)/255)
        axes[n].set_xlabel((y[n]+1), fontsize=32*fig_scale) # Label is index+1
        axes[n].set_xticks(()), axes[n].set_yticks(())
    plt.show();

images = range(5)
X_sub_color = [X[i] for i in images]
y_sub = [y[i] for i in images]
plot_images(X_sub_color, y_sub)

For recognizing digits, color is not important, so we grayscale the images

Hide code cell source
def rgb2gray(X, dim=32):
    return np.expand_dims(np.dot(X.reshape(len(X), dim*dim, 3), [0.2990, 0.5870, 0.1140]), axis=2)
Xsm = rgb2gray(X[:100])

X_sub = [Xsm[i] for i in images]
plot_images(X_sub, y_sub, grayscale=True)

Demonstration

Hide code cell source
def normalize_image(X):
    image = X.reshape((32, 32))
    return (image - np.min(image))/np.ptp(image) # Normalize

image = normalize_image(X_sub[3])
demo2 = convolution_demo(image, horizontal_edge_kernel,
                 vmin=-4, vmax=4, cmap='gray_r');
Hide code cell source
if not interactive:
    fig, axs = plt.subplots(3, 3, figsize=(5*fig_scale, 5*fig_scale))
    titles = ('Image and kernel', 'Hor. edge filter', 'Filtered image')
    convolution_full(axs[0,:], image, horizontal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    titles = ('Image and kernel', 'Diag. edge filter', 'Filtered image')
    convolution_full(axs[1,:], image, diagonal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    titles = ('Image and kernel', 'Edge detect filter', 'Filtered image')
    convolution_full(axs[2,:], image, edge_detect_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
    plt.tight_layout()

Image convolution in practice#

  • How do we know which filters are best for a given image?

  • Families of kernels (or filter banks ) can be run on every image

    • Gabor, Sobel, Haar Wavelets,…

  • Gabor filters: Wave patterns generated by changing:

    • Frequency: narrow or wide undulations

    • Theta: angle (direction) of the wave

    • Sigma: resolution (size of the filter)

Demonstration

Hide code cell source
from scipy import ndimage as ndi
from skimage import data
from skimage.util import img_as_float
from skimage.filters import gabor_kernel

# Gabor Filters.
@interact
def demoGabor(frequency=(0.01,1,0.05), theta=(0,3.14,0.1), sigma=(0,5,0.1)):
    plt.gray()
    plt.imshow(np.real(gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)), interpolation='nearest')
    plt.title(f'freq: {frequency}, theta: {theta}, sigma: {sigma}', fontdict={'fontsize':16*fig_scale})
    plt.xticks([])
    plt.yticks([])
Hide code cell source
if not interactive:
    plt.subplot(1, 3, 1)
    demoGabor(frequency=0.16, theta=1.2, sigma=4.0)
    plt.subplot(1, 3, 2)
    demoGabor(frequency=0.31, theta=0, sigma=3.6)
    plt.subplot(1, 3, 3)
    demoGabor(frequency=0.36, theta=1.6, sigma=1.3)
    plt.tight_layout()

Demonstration on the Street View data

Hide code cell source
# Calculate the magnitude of the Gabor filter response given a kernel and an input image
def magnitude(image, kernel):
    image = (image - image.mean()) / image.std() # Normalize images
    return np.sqrt(ndi.convolve(image, np.real(kernel), mode='wrap')**2 +
                   ndi.convolve(image, np.imag(kernel), mode='wrap')**2)
Hide code cell source
@interact
def demoGabor2(frequency=(0.01,1,0.05), theta=(0,3.14,0.1), sigma=(0,5,0.1)):
    plt.subplot(131)
    plt.title('Original', fontdict={'fontsize':24*fig_scale})
    plt.imshow(image)
    plt.xticks([])
    plt.yticks([])
    plt.subplot(132)
    plt.title('Gabor kernel', fontdict={'fontsize':24*fig_scale})
    plt.imshow(np.real(gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)), interpolation='nearest')
    plt.xticks([])
    plt.yticks([])
    plt.subplot(133)
    plt.title('Response magnitude', fontdict={'fontsize':24*fig_scale})
    plt.imshow(np.real(magnitude(image, gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma))), interpolation='nearest')
    plt.tight_layout()
    plt.xticks([])
    plt.yticks([])
Hide code cell source
if not interactive:
    demoGabor2(frequency=0.16, theta=1.4, sigma=1.2)

Filter banks#

  • Different filters detect different edges, shapes,…

  • Not all seem useful

Hide code cell source
# More images
image3 = normalize_image(Xsm[3])
image5 = normalize_image(Xsm[5])
image13 = normalize_image(Xsm[13])

image_names = ('3', '5', '8') # labels
images = (image3, image5, image13)

def plot_filter_bank(images):
    # Create a set of kernels, apply them to each image, store the results
    results = []
    kernel_params = []
    for theta in (0, 1):
        theta = theta / 4. * np.pi
        for frequency in (0.1, 0.2):
            for sigma in (1, 3):
                kernel = gabor_kernel(frequency, theta=theta,sigma_x=sigma,sigma_y=sigma)
                params = 'theta=%.2f,\nfrequency=%.2f\nsigma=%.2f' % (theta, frequency, sigma)
                kernel_params.append(params)
                results.append((kernel, [magnitude(img, kernel) for img in images]))

    # Plotting
    fig, axes = plt.subplots(nrows=4, ncols=9, figsize=(14*fig_scale, 8*fig_scale))
    plt.gray()
    #fig.suptitle('Image responses for Gabor filter kernels', fontsize=12)
    axes[0][0].axis('off')

    for label, img, ax in zip(image_names, images, axes[1:]):
        axs = ax[0]
        axs.imshow(img)
        axs.set_ylabel(label, fontsize=12*fig_scale)
        axs.set_xticks([]) # Remove axis ticks 
        axs.set_yticks([])
        
    # Plot Gabor kernel
    col = 1
    for label, (kernel, magnitudes), ax_col in zip(kernel_params, results, axes[0][1:]):
        ax_col.imshow(np.real(kernel), interpolation='nearest') # Plot kernel
        ax_col.set_title(label, fontsize=10*fig_scale)
        ax_col.axis('off')
        
        # Plot Gabor responses with the contrast normalized for each filter
        vmin = np.min(magnitudes)
        vmax = np.max(magnitudes)
        for patch, ax in zip(magnitudes, axes.T[col][1:]):
            ax.imshow(patch, vmin=vmin, vmax=vmax) # Plot convolutions
            ax.axis('off')
        col += 1
    
    plt.show()

plot_filter_bank(images)

Another example: Fashion MNIST

Hide code cell source
fmnist_data = oml.datasets.get_dataset(40996) # Download FMNIST data
# Get the predictors X and the labels y
X_fm, y_fm, c, a = fmnist_data.get_data(dataset_format='array', target=fmnist_data.default_target_attribute); 
Hide code cell source
# build a list of figures for plotting
def buildFigureList(fig, subfiglist, titles, length):
    for i in range(0,length):
        pixels = np.array(subfiglist[i], dtype='float')
        pixels = pixels.reshape((28, 28))
        a=fig.add_subplot(1,length,i+1)
        imgplot =plt.imshow(pixels, cmap='gray_r')
        a.set_title(titles[i], fontsize=6)
        a.axes.get_xaxis().set_visible(False)
        a.axes.get_yaxis().set_visible(False)
    return

subfiglist = []
titles=[]

for i in range(0,10):
    subfiglist.append(X_fm[i])
    titles.append(i)

buildFigureList(plt.figure(1),subfiglist, titles, 10)
plt.show()

Demonstration

Hide code cell source
boot = X_fm[0].reshape((28, 28))
image2=boot
@interact
def demoGabor3(frequency=(0.01,1,0.05), theta=(0,3.14,0.1), sigma=(0,5,0.1)):
    plt.subplot(131)
    plt.title('Original', fontdict={'fontsize':24*fig_scale})
    plt.imshow(image2)
    plt.xticks([]) # Remove axis ticks 
    plt.yticks([])
    plt.subplot(132)
    plt.title('Gabor kernel', fontdict={'fontsize':24*fig_scale})
    plt.imshow(np.real(gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)), interpolation='nearest')
    plt.xticks([]) # Remove axis ticks 
    plt.yticks([])
    plt.subplot(133)
    plt.xticks([]) # Remove axis ticks 
    plt.yticks([])
    plt.title('Response magnitude', fontdict={'fontsize':24*fig_scale})
    plt.imshow(np.real(magnitude(image2, gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma))), interpolation='nearest')
Hide code cell source
if not interactive:
    demoGabor3(frequency=0.81, theta=2.7, sigma=0.9)

Fashion MNIST with multiple filters (filter bank)

Hide code cell source
# Fetch some Fashion-MNIST images
boot = X_fm[0].reshape(28, 28)
shirt = X_fm[1].reshape(28, 28)
dress = X_fm[2].reshape(28, 28)
image_names = ('boot', 'shirt', 'dress')
images = (boot, shirt, dress)

plot_filter_bank(images)

Convolutional neural nets#

  • Finding relationships between individual pixels and the correct class is hard

  • We want to discover ‘local’ patterns (edges, lines, endpoints)

  • Representing such local patterns as features makes it easier to learn from them

  • We could use convolutions, but how to choose the filters?


Convolutional Neural Networks (ConvNets)#

  • Instead of manually designing the filters, we can also learn them based on data

    • Choose filter sizes (manually), initialize with small random weights

  • Forward pass: Convolutional layer slides the filter over the input, generates the output

  • Backward pass: Update the filter weights according to the loss gradient

  • Illustration for 1 filter:

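A minimal sketch of this idea with Keras, using made-up random data rather than the lecture's datasets: a model with a single 3x3 filter is trained for one epoch, and the kernel weights change through the backward pass.

import numpy as np
from tensorflow.keras import layers, models

# Toy model with one learnable 3x3 filter (names and data are hypothetical)
toy = models.Sequential([
    layers.Conv2D(1, (3, 3), input_shape=(28, 28, 1)),
    layers.GlobalAveragePooling2D(),
    layers.Dense(1, activation='sigmoid')
])
toy.compile(optimizer='sgd', loss='binary_crossentropy')

X_toy = np.random.rand(32, 28, 28, 1).astype('float32')   # random 'images'
y_toy = np.random.randint(0, 2, size=(32, 1))              # random binary labels

kernel_before = toy.layers[0].get_weights()[0].copy()      # kernel of shape (3, 3, 1, 1)
toy.fit(X_toy, y_toy, epochs=1, batch_size=8, verbose=0)   # forward + backward passes
kernel_after = toy.layers[0].get_weights()[0]
print(np.abs(kernel_after - kernel_before).max() > 0)      # True: the filter was updated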

Convolutional layers: Feature maps#

  • One filter is not sufficient to detect all relevant patterns in an image

  • A convolutional layer applies and learns \(d\) filters in parallel

  • Slide \(d\) filters across the input image (in parallel) -> a (1x1xd) output per patch

  • Reassemble into a feature map with \(d\) ‘channels’, a (width x height x d) tensor.

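A quick shape check (a sketch with random data, not the lecture's images): applying \(d\) filters to a single-channel image produces a feature map with \(d\) channels.

import numpy as np
from tensorflow.keras import layers

x = np.random.rand(1, 28, 28, 1).astype('float32')             # batch of one grayscale image
feature_map = layers.Conv2D(filters=8, kernel_size=(3, 3))(x)   # d = 8 filters
print(feature_map.shape)                                        # (1, 26, 26, 8): one channel per filter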

Border effects (zero padding)#

  • Consider a 5x5 image and a 3x3 filter: there are only 9 possible locations, hence the output is a 3x3 feature map

  • If we want to maintain the image size, we use zero-padding, adding 0’s all around the input tensor.

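The effect of padding is easy to verify in Keras (a small sketch with random input, matching the 5x5 example above):

import numpy as np
from tensorflow.keras import layers

x = np.random.rand(1, 5, 5, 1).astype('float32')            # one 5x5 single-channel image
print(layers.Conv2D(1, (3, 3), padding='valid')(x).shape)    # (1, 3, 3, 1): no padding
print(layers.Conv2D(1, (3, 3), padding='same')(x).shape)     # (1, 5, 5, 1): zero-padded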

Undersampling (striding)#

  • Sometimes, we want to downsample a high-resolution image

    • Faster processing, less noisy (hence less overfitting)

  • One approach is to skip values during the convolution

    • Distance between 2 windows: stride length

  • Example with stride length 2 (without padding):

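In general, with input size \(n\), filter size \(k\), and stride \(s\) (no padding), the output size is \(\lfloor (n-k)/s \rfloor + 1\). A quick sketch, reusing the 5x5 input from the padding example:

import numpy as np
from tensorflow.keras import layers

x = np.random.rand(1, 5, 5, 1).astype('float32')                        # 5x5 input
print(layers.Conv2D(1, (3, 3), strides=2, padding='valid')(x).shape)     # (1, 2, 2, 1)
# floor((5 - 3) / 2) + 1 = 2 outputs per dimension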

Max-pooling#

  • Another approach to shrink the input tensors is max-pooling :

    • Run a filter with a fixed stride length over the image

      • Usually 2x2 filters and stride length 2

    • The filter simply returns the max (or avg ) of all values

  • Aggressively reduces the number of weights (less overfitting)

  • Information from every input node spreads more quickly to output nodes

    • In pure convnets, one input value spreads to 3x3 nodes of the first layer, 5x5 nodes of the second, etc.

    • Without maxpooling, you need much deeper networks, harder to train

  • Increases translation invariance : patterns can affect the predictions no matter where they occur in the image
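A minimal NumPy sketch of 2x2 max-pooling with stride 2 (assuming the input size is divisible by 2; the toy feature map is made up):

import numpy as np

x = np.arange(16, dtype=float).reshape(4, 4)       # toy 4x4 feature map
pooled = x.reshape(2, 2, 2, 2).max(axis=(1, 3))     # take the max over each 2x2 block
print(pooled)                                       # [[ 5.  7.] [13. 15.]]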

Convolutional nets in practice#

  • Use multiple convolutional layers to learn patterns at different levels of abstraction

    • Find local patterns first (e.g. edges), then patterns across those patterns

  • Use MaxPooling layers to reduce resolution, increase translation invariance

  • Use sufficient filters in the first layer (otherwise information gets lost)

  • In deeper layers, use increasingly more filters

    • Preserve information about the input as resolution decreases

    • Avoid decreasing the number of activations (resolution x nr of filters)

  • For very deep nets, add skip connections to preserve information (and gradients)

    • Sums up outputs of earlier layers to those of later layers (with same dimensions)

Example with Keras#

  • Conv2D for 2D convolutional layers

    • 32 filters, randomly initialized (Glorot uniform by default)

    • Deeper layers use 64 filters

    • Filter size is 3x3

    • ReLU activation to simplify training of deeper networks

  • MaxPooling2D for max-pooling

    • 2x2 pooling reduces the number of inputs by a factor 4

model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', 
                        input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
Hide code cell source
from tensorflow.keras import layers
from tensorflow.keras import models
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"

model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))

Observe how the 28x28x1 input image is transformed into a 3x3x64 feature map

  • Convolutional layer:

    • No zero-padding: every output is 2 pixels smaller in each dimension

    • 320 weights: (3x3 filter weights + 1 bias) * 32 filters

  • After every MaxPooling layer, the resolution is halved in each dimension

Hide code cell source
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 13, 13, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 5, 5, 64)          0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 3, 3, 64)          36928     
                                                                 
=================================================================
Total params: 55744 (217.75 KB)
Trainable params: 55744 (217.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
  • To classify the images, we still need a Dense and Softmax layer.

  • We can flatten the 3x3x64 feature map to a vector of size 576

model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
Hide code cell source
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
Hide code cell source
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 13, 13, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 5, 5, 64)          0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 3, 3, 64)          36928     
                                                                 
 flatten (Flatten)           (None, 576)               0         
                                                                 
 dense (Dense)               (None, 64)                36928     
                                                                 
 dense_1 (Dense)             (None, 10)                650       
                                                                 
=================================================================
Total params: 93322 (364.54 KB)
Trainable params: 93322 (364.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
  • Flattening followed by a Dense layer adds a lot of weights

  • Instead, we can use GlobalAveragePooling: it returns the average of each activation map

  • This sometimes works even without adding a dense layer (if the number of feature maps is close to the number of classes)

model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(10, activation='softmax'))
  • With GlobalAveragePooling: much fewer weights to learn

  • Use with caution: this destroys the location information learned by the CNN

  • Not ideal for tasks such as object localization

Hide code cell source
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(10, activation='softmax'))
model.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv2d_3 (Conv2D)           (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 13, 13, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_4 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 5, 5, 64)          0         
 g2D)                                                            
                                                                 
 conv2d_5 (Conv2D)           (None, 3, 3, 64)          36928     
                                                                 
 global_average_pooling2d (  (None, 64)                0         
 GlobalAveragePooling2D)                                         
                                                                 
 dense_2 (Dense)             (None, 10)                650       
                                                                 
=================================================================
Total params: 56394 (220.29 KB)
Trainable params: 56394 (220.29 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Hide code cell source
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

(train_images, train_labels), (validation_images, validation_labels) = mnist.load_data()

train_images = train_images.reshape((60000, 28, 28, 1))
train_images = train_images.astype('float32') / 255

validation_images = validation_images.reshape((10000, 28, 28, 1))
validation_images = validation_images.astype('float32') / 255

train_labels = to_categorical(train_labels)
validation_labels = to_categorical(validation_labels)

Run the model on the MNIST dataset

  • Train and test as usual: 99% accuracy

    • Compared to 97.8% accuracy with the dense architecture

    • Flatten and GlobalAveragePooling yield similar performance

Model was trained beforehand and saved. Uncomment if you want to run from scratch

import pickle
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(train_images, train_labels, epochs=5, batch_size=64, verbose=0,
                    validation_data=(validation_images, validation_labels))

model.save(os.path.join(model_dir, 'mnist.h5'))
with open(os.path.join(model_dir, 'mnist_history.p'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)
Hide code cell source
import pickle
from tensorflow.keras.models import load_model

model = load_model(os.path.join(model_dir, 'mnist.h5'))
validation_loss, validation_acc = model.evaluate(validation_images, validation_labels, verbose=0)
Hide code cell source
plt.figure()
plt.rcParams.update({'figure.figsize': (12*fig_scale,5*fig_scale), 
                     'font.size': 16*fig_scale,
                     'xtick.labelsize': 14*fig_scale,
                     'ytick.labelsize': 14*fig_scale});
Hide code cell source
print("Accuracy: ", validation_acc)

history = pickle.load(open("../data/models/mnist_history.p", "rb"))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Accuracy:  0.9871000051498413

Tip:

  • Training ConvNets can take a lot of time

  • Save the trained model (and history) to disk so that you can reload it later

model.save(os.path.join(model_dir, 'mnist.h5'))
with open(os.path.join(model_dir, 'mnist_history.p'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)
Hide code cell source
# For validation of the GAP layer. Gives the same accuracy

#model = models.Sequential()
#model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
#model.add(layers.MaxPooling2D((2, 2)))
#model.add(layers.Conv2D(64, (3, 3), activation='relu'))
#model.add(layers.MaxPooling2D((2, 2)))
#model.add(layers.Conv2D(64, (3, 3), activation='relu'))
#model.add(layers.GlobalAveragePooling2D())
#model.add(layers.Dense(10, activation='softmax'))
#model.compile(optimizer='rmsprop',
#              loss='categorical_crossentropy',
#              metrics=['accuracy'])
#history = model.fit(train_images, train_labels, epochs=5, batch_size=64, verbose=0,
#                    validation_data=(validation_images, validation_labels))
#validation_loss, validation_acc = model.evaluate(validation_images, validation_labels, verbose=0)
#validation_acc 

Cats vs Dogs

  • A more realistic dataset: Cats vs Dogs

    • Colored JPEG images, different sizes

    • Not nicely centered, translation invariance is important

  • Preprocessing

    • Create balanced subsample of 4000 colored images

      • 3000 for training, 1000 validation

    • Decode JPEG images to floating-point tensors

    • Rescale pixel values to [0,1]

    • Resize images to 150x150 pixels

Uncomment to run from scratch

import os, shutil 
# Download data from https://www.kaggle.com/c/dogs-vs-cats/data
# Uncompress `train.zip` into the `original_dataset_dir`
original_dataset_dir = '../data/cats-vs-dogs_original'

# The directory where we will
# store our smaller dataset
train_dir = os.path.join(data_dir, 'train')
validation_dir = os.path.join(data_dir, 'validation')

if not os.path.exists(data_dir):
    os.mkdir(data_dir)
    os.mkdir(train_dir)
    os.mkdir(validation_dir)
    
train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')
validation_cats_dir = os.path.join(validation_dir, 'cats')
validation_dogs_dir = os.path.join(validation_dir, 'dogs')

if not os.path.exists(train_cats_dir):
    os.mkdir(train_cats_dir)
    os.mkdir(train_dogs_dir)
    os.mkdir(validation_cats_dir)
    os.mkdir(validation_dogs_dir)

# Copy first 2000 cat images to train_cats_dir
fnames = ['cat.{}.jpg'.format(i) for i in range(2000)]
for fname in fnames:
    src = os.path.join(original_dataset_dir, fname)
    dst = os.path.join(train_cats_dir, fname)
    shutil.copyfile(src, dst)
    
# Copy next 1000 cat images to validation_cats_dir
fnames = ['cat.{}.jpg'.format(i) for i in range(2000, 3000)]
for fname in fnames:
    src = os.path.join(original_dataset_dir, fname)
    dst = os.path.join(validation_cats_dir, fname)
    shutil.copyfile(src, dst)
    
# Copy first 2000 dog images to train_dogs_dir
fnames = ['dog.{}.jpg'.format(i) for i in range(2000)]
for fname in fnames:
    src = os.path.join(original_dataset_dir, fname)
    dst = os.path.join(train_dogs_dir, fname)
    shutil.copyfile(src, dst)
    
# Copy next 1000 dog images to validation_dogs_dir
fnames = ['dog.{}.jpg'.format(i) for i in range(2000, 3000)]
for fname in fnames:
    src = os.path.join(original_dataset_dir, fname)
    dst = os.path.join(validation_dogs_dir, fname)
    shutil.copyfile(src, dst)

Data generators#

  • We want to do this preprocessing on the fly: storing processed data on disk is less efficient, and we can’t keep all the data in (GPU) memory

  • ImageDataGenerator: decodes, resizes, and rescales JPEG images

  • Returns a Python generator we can endlessly query for batches of images

  • Separate generators for the training and validation sets

train_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
        train_dir,              # Directory with images
        target_size=(150, 150), # Resize images 
        batch_size=20,          # Return 20 images at a time
        class_mode='binary')    # Binary labels
Hide code cell source
from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_dir = os.path.join(data_dir, 'train')
validation_dir = os.path.join(data_dir, 'validation')

# All images will be rescaled by 1./255
train_datagen = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        # This is the target directory
        train_dir,
        # All images will be resized to 150x150
        target_size=(150, 150),
        batch_size=20,
        # Since we use binary_crossentropy loss, we need binary labels
        class_mode='binary')

validation_generator = validation_datagen.flow_from_directory(
        validation_dir,
        target_size=(150, 150),
        batch_size=20,
        class_mode='binary')
Found 20 images belonging to 2 classes.
Found 20 images belonging to 2 classes.
Hide code cell source
for data_batch, labels_batch in train_generator:
    plt.figure(figsize=(10,5))
    for i in range(7):
        plt.subplot(171+i)
        plt.xticks([])
        plt.yticks([])
        imgplot = plt.imshow(data_batch[i])
        plt.title('cat' if labels_batch[i] == 0 else 'dog', fontdict={'fontsize':24*fig_scale})
        plt.tight_layout()
    break
Hide code cell source
from tensorflow.keras import layers
from tensorflow.keras import models

model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
                        input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

Since the images are larger and more complex, we add another convolutional layer and increase the number of filters to 128.

model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
                        input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))



Hide code cell source
model.summary()
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv2d_6 (Conv2D)           (None, 148, 148, 32)      896       
                                                                 
 max_pooling2d_4 (MaxPoolin  (None, 74, 74, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_7 (Conv2D)           (None, 72, 72, 64)        18496     
                                                                 
 max_pooling2d_5 (MaxPoolin  (None, 36, 36, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_8 (Conv2D)           (None, 34, 34, 128)       73856     
                                                                 
 max_pooling2d_6 (MaxPoolin  (None, 17, 17, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_9 (Conv2D)           (None, 15, 15, 128)       147584    
                                                                 
 max_pooling2d_7 (MaxPoolin  (None, 7, 7, 128)         0         
 g2D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 6272)              0         
                                                                 
 dense_3 (Dense)             (None, 512)               3211776   
                                                                 
 dense_4 (Dense)             (None, 1)                 513       
                                                                 
=================================================================
Total params: 3453121 (13.17 MB)
Trainable params: 3453121 (13.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

Training#

  • The fit function also supports generators

    • 100 steps per epoch (batch size: 20 images per step), for 30 epochs

    • Provide a separate generator for the validation data

model.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])
history = model.fit(
      train_generator, steps_per_epoch=100,
      epochs=30, verbose=0,
      validation_data=validation_generator,
      validation_steps=50)
Hide code cell source
from tensorflow.keras import optimizers

model.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])
WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.RMSprop` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.RMSprop`.
WARNING:absl:`lr` is deprecated in Keras optimizer, please use `learning_rate` or use the legacy optimizer, e.g.,tf.keras.optimizers.legacy.RMSprop.

Uncomment to run. Otherwise, we will load the pretrained model from disk.

import pickle
history = model.fit(
      train_generator,
      steps_per_epoch=100,
      epochs=30, verbose=1,
      validation_data=validation_generator,
      validation_steps=50)
model.save(os.path.join(model_dir, 'cats_and_dogs_small_1.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_1_history.p'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)
  • The network is overfitting: validation accuracy stays around 75% while training accuracy reaches 100%

  • There are several things to explore:

    • Generating more training data (data augmentation)

    • Regularization (e.g. Dropout, L1/L2, Batch Normalization,…)

    • Use pretrained rather than randomly initialized filters

Hide code cell source
import pickle
history = pickle.load(open("../data/models/cats_and_dogs_small_1_history.p", "rb"))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');

Data augmentation#

  • Generate new images via image transformations

    • Images will be randomly transformed every epoch

  • We can again use a data generator to do this

datagen = ImageDataGenerator(
      rotation_range=40,     # Rotate image up to 40 degrees
      width_shift_range=0.2, # Shift image left-right up to 20% of image width
      height_shift_range=0.2,# Shift image up-down up to 20% of image height
      shear_range=0.2,       # Shear (slant) the image up to 0.2 degrees
      zoom_range=0.2,        # Zoom in up to 20%
      horizontal_flip=True,  # Horizontally flip the image
      fill_mode='nearest')   # How to fill in pixels that didn't exist before
Hide code cell source
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
      rotation_range=40,
      width_shift_range=0.2,
      height_shift_range=0.2,
      shear_range=0.2,
      zoom_range=0.2,
      horizontal_flip=True,
      fill_mode='nearest')

Example

Hide code cell source
# This is module with image preprocessing utilities
from tensorflow.keras.preprocessing import image
train_cats_dir = os.path.join(data_dir, 'train', 'cats')
fnames = [os.path.join(train_cats_dir, fname) for fname in os.listdir(train_cats_dir)]

# We pick one image to "augment"
img_path = fnames[8]

# Read the image and resize it
img = image.load_img(img_path, target_size=(150, 150))

# Convert it to a Numpy array with shape (150, 150, 3)
x = image.img_to_array(img)

# Reshape it to (1, 150, 150, 3)
x = x.reshape((1,) + x.shape)

# The .flow() command below generates batches of randomly transformed images.
# It will loop indefinitely, so we need to `break` the loop at some point!
for a in range(2):
    i = 0
    for batch in datagen.flow(x, batch_size=1):
        plt.subplot(141+i) 
        plt.xticks([]) 
        plt.yticks([])
        imgplot = plt.imshow(image.array_to_img(batch[0]))
        i += 1
        if i % 4 == 0:
            break
        
    plt.tight_layout()
    plt.show()

We also add Dropout before the Dense layer

model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
                        input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
Hide code cell source
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
                        input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])
WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.RMSprop` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.RMSprop`.
WARNING:absl:`lr` is deprecated in Keras optimizer, please use `learning_rate` or use the legacy optimizer, e.g.,tf.keras.optimizers.legacy.RMSprop.
Hide code cell source
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,)

# Note that the validation data should not be augmented!
validation_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        # This is the target directory
        train_dir,
        # All images will be resized to 150x150
        target_size=(150, 150),
        batch_size=32,
        # Since we use binary_crossentropy loss, we need binary labels
        class_mode='binary')

validation_generator = validation_datagen.flow_from_directory(
        validation_dir,
        target_size=(150, 150),
        batch_size=32,
        class_mode='binary')
Found 20 images belonging to 2 classes.
Found 20 images belonging to 2 classes.

Uncomment to run from scratch

history = model.fit(
      train_generator,
      steps_per_epoch=60, # 2000 images / batch size 32
      epochs=100,  verbose=0,
      validation_data=validation_generator,
      validation_steps=30) # About 1000/32
model.save(os.path.join(model_dir, 'cats_and_dogs_small_2.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_2_history.p'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

(Almost) no more overfitting!

Hide code cell source
history = pickle.load(open("../data/models/cats_and_dogs_small_2_history.p", "rb"))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');

Real-world CNNs#

VGG16#

  • Deeper architecture (16 layers): allows it to learn more complex high-level features

    • Textures, patterns, shapes,…

  • Small filters (3x3) work better: they capture spatial information while reducing the number of parameters

  • Max-pooling (2x2): reduces spatial dimension, improves translation invariance

    • Lower resolution forces model to learn robust features (less sensitive to small input changes)

    • Only after every 2 layers, otherwise dimensions reduce too fast

  • Downside: too many parameters, expensive to train

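VGG16 ships with Keras and can be loaded with pretrained ImageNet weights. A sketch of loading only the convolutional base, which is the usual starting point for transfer learning (downloading the weights requires an internet connection):

from tensorflow.keras.applications import VGG16

# include_top=False drops the dense classifier, keeping only the convolutional base
conv_base = VGG16(weights='imagenet', include_top=False, input_shape=(150, 150, 3))
conv_base.summary()   # ~14.7M parameters, all in 3x3 convolutions and 2x2 max-pooling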

Inceptionv3#

  • Inception modules: parallel branches learn features of different sizes and scales (3x3, 5x5, 7x7,…)

    • Add reduction blocks that reduce dimensionality via convolutions with stride 2

  • Factorized convolutions: a 3x3 conv. can be replaced by combining 1x3 and 3x1, and is 33% cheaper

    • A 5x5 can be replaced by combining 3x3 and 3x3, which can in turn be factorized as above

  • 1x1 convolutions, or Network-In-Network (NIN) layers help reduce the number of channels: cheaper

  • An auxiliary classifier adds an additional gradient signal deeper in the network

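A rough sketch of an Inception-style module with the Keras functional API (simplified; the branch widths are made up, and the real InceptionV3 blocks also use factorized and strided convolutions):

from tensorflow.keras import layers, Input, Model

inp = Input(shape=(32, 32, 192))                                         # example feature map
b1 = layers.Conv2D(64, (1, 1), padding='same', activation='relu')(inp)   # 1x1 branch
b2 = layers.Conv2D(48, (1, 1), padding='same', activation='relu')(inp)   # 1x1 reduces channels...
b2 = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(b2)    # ...before the 3x3 conv
b3 = layers.MaxPooling2D((3, 3), strides=1, padding='same')(inp)         # pooling branch
b3 = layers.Conv2D(32, (1, 1), padding='same', activation='relu')(b3)
out = layers.Concatenate()([b1, b2, b3])                                  # stack branches channel-wise
Model(inp, out).summary()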

ResNet50#

  • Residual (skip) connections: add earlier feature map to a later one (dimensions must match)

    • Information can bypass layers, reduces vanishing gradients, allows much deeper nets

  • Residual blocks: skip a small number of layers and repeat many times

    • Match dimensions through padding and 1x1 convolutions

    • When resolution drops, add 1x1 convolutions with stride 2

  • Can be combined with Inception blocks

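A minimal residual block in the Keras functional API (a sketch; real ResNet blocks also add batch normalization and, when dimensions change, a 1x1 convolution on the shortcut):

from tensorflow.keras import layers, Input, Model

inp = Input(shape=(32, 32, 64))
x = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(inp)
x = layers.Conv2D(64, (3, 3), padding='same')(x)   # no activation yet
x = layers.Add()([x, inp])                          # skip connection: add the input back in
out = layers.Activation('relu')(x)
Model(inp, out).summary()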

Interpreting the model#

  • Let’s see what the convnet is learning exactly by observing the intermediate feature maps

    • A layer’s output is also called its activation

  • We can choose a specific test image, and then retrieve and visualize the activation for every filter for every layer

  • Layer 0 has activations of resolution 148x148 for each of its 32 filters

  • Layer 2 has activations of resolution 72x72 for each of its 64 filters

  • Layer 4 has activations of resolution 34x34 for each of its 128 filters

  • Layer 6 has activations of resolution 15x15 for each of its 128 filters

Hide code cell source
from tensorflow.keras.models import load_model

model = load_model(os.path.join(model_dir, 'cats_and_dogs_small_2.h5'))
Hide code cell source
model.summary()  # As a reminder
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv2d_10 (Conv2D)          (None, 148, 148, 32)      896       
                                                                 
 max_pooling2d_8 (MaxPoolin  (None, 74, 74, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 72, 72, 64)        18496     
                                                                 
 max_pooling2d_9 (MaxPoolin  (None, 36, 36, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_12 (Conv2D)          (None, 34, 34, 128)       73856     
                                                                 
 max_pooling2d_10 (MaxPooli  (None, 17, 17, 128)       0         
 ng2D)                                                           
                                                                 
 conv2d_13 (Conv2D)          (None, 15, 15, 128)       147584    
                                                                 
 max_pooling2d_11 (MaxPooli  (None, 7, 7, 128)         0         
 ng2D)                                                           
                                                                 
 flatten_2 (Flatten)         (None, 6272)              0         
                                                                 
 dropout (Dropout)           (None, 6272)              0         
                                                                 
 dense_4 (Dense)             (None, 512)               3211776   
                                                                 
 dense_5 (Dense)             (None, 1)                 513       
                                                                 
=================================================================
Total params: 3453121 (13.17 MB)
Trainable params: 3453121 (13.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Hide code cell source
img_path = os.path.join(data_dir, 'train/cats/cat.1700.jpg')

# We preprocess the image into a 4D tensor
from tensorflow.keras.preprocessing import image
import numpy as np

img = image.load_img(img_path, target_size=(150, 150))
img_tensor = image.img_to_array(img)
img_tensor = np.expand_dims(img_tensor, axis=0) 
# Remember that the model was trained on inputs
# that were preprocessed in the following way:
img_tensor /= 255.
Hide code cell source
from tensorflow.keras import models

# Extracts the outputs of the top 8 layers:
layer_outputs = [layer.output for layer in model.layers[:8]]
# Creates a model that will return these outputs, given the model input:
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)

# This will return a list of 5 Numpy arrays:
# one array per layer activation
activations = activation_model.predict(img_tensor)
1/1 [==============================] - 1s 529ms/step
  • To extract the activations, we create a new model that outputs the trained layers

    • 8 output layers in total (only the convolutional part)

  • We input a test image for prediction and then read the relevant outputs

layer_outputs = [layer.output for layer in model.layers[:8]]
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(img_tensor)

Output of the first Conv2D layer, channel (filter) 6:

  • Similar to a diagonal edge detector

  • Your own channels may look different

Hide code cell source
first_layer_activation = activations[0]

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.imshow(img_tensor[0])
ax2.matshow(first_layer_activation[0, :, :,6], cmap='viridis')
ax1.set_xticks([])
ax1.set_yticks([])
ax2.set_xticks([])
ax2.set_yticks([])
ax1.set_xlabel('Input image')
ax2.set_xlabel('Activation of filter 6');

Output of filter 25:

  • Cat eye detector?

Hide code cell source
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.imshow(img_tensor[0])
ax2.matshow(first_layer_activation[0, :, :,25], cmap='viridis')
ax1.set_xticks([])
ax1.set_yticks([])
ax2.set_xticks([])
ax2.set_yticks([])
ax1.set_xlabel('Input image')
ax2.set_xlabel('Activation of filter 25');
../_images/ce5074687a15130a49246c25bbe264a395367576a96b89eae502f2434d1af025.png

The same filter responds quite differently for other inputs (green detector?).

Hide code cell source
img_path = os.path.join(data_dir, 'train/dogs/dog.1528.jpg')

# We preprocess the image into a 4D tensor
img = image.load_img(img_path, target_size=(150, 150))
img_tensor2 = image.img_to_array(img)
img_tensor2 = np.expand_dims(img_tensor2, axis=0) 
# Remember that the model was trained on inputs
# that were preprocessed in the following way:
img_tensor2 /= 255.

activations2 = activation_model.predict(img_tensor2)

first_layer_activation2 = activations2[0]

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.imshow(img_tensor2[0])
ax2.matshow(first_layer_activation2[0, :, :, 25], cmap='viridis')
ax1.set_xticks([])
ax1.set_yticks([])
ax2.set_xticks([])
ax2.set_yticks([])
ax1.set_xlabel('Input image')
ax2.set_xlabel('Activation of filter 25');
1/1 [==============================] - 0s 10ms/step
../_images/3899a9a6da8ac56c215501025cc6721c5e8a2ab0d5d0a2f2c5b941ad95654f19.png
Hide code cell source
images_per_row = 16

layer_names = []
for layer in model.layers[:8]:
    layer_names.append(layer.name)

def plot_activations(layer_index, activations):
    start = layer_index
    end = layer_index+1
    # Now let's display our feature maps
    for layer_name, layer_activation in zip(layer_names[start:end], activations[start:end]):
        # This is the number of features in the feature map
        n_features = layer_activation.shape[-1]

        # The feature map has shape (1, size, size, n_features)
        size = layer_activation.shape[1]

        # We will tile the activation channels in this matrix
        n_cols = n_features // images_per_row
        display_grid = np.zeros((size * n_cols, images_per_row * size))

        # We'll tile each filter into this big horizontal grid
        for col in range(n_cols):
            for row in range(images_per_row):
                channel_image = layer_activation[0,
                                                 :, :,
                                                 col * images_per_row + row]
                # Post-process the feature to make it visually palatable
                channel_image -= channel_image.mean()
                channel_image /= channel_image.std()
                channel_image *= 64
                channel_image += 128
                channel_image = np.clip(channel_image, 0, 255).astype('uint8')
                display_grid[col * size : (col + 1) * size,
                             row * size : (row + 1) * size] = channel_image

        # Display the grid
        scale = 1. / size
        plt.figure(figsize=(scale * display_grid.shape[1],
                            scale * display_grid.shape[0]))
        plt.title("Activation of layer {} ({})".format(layer_index+1,layer_name), fontdict={'fontsize':30*fig_scale})
        plt.grid(False)
        plt.axis("off")
        plt.imshow(display_grid, aspect='auto', cmap='viridis')

    plt.show()
  • First 2 convolutional layers: various edge detectors

Hide code cell source
plot_activations(0, activations)
plot_activations(2, activations)
../_images/4a68204bf0a4c35d7fededc2838c780449108005b32c477dd4915ca440f583d9.png ../_images/e013f2be2f7fddb8a7d8f92f30cc324b74eb30aea369de46d7c49b72490a64b9.png
  • 3rd convolutional layer: increasingly abstract patterns (ears, eyes)

Hide code cell source
plot_activations(4, activations)
../_images/f64545938f6bd79e3137d4f2279e6db2a6aa7d289dec71d293b46e924aab2d0e.png
  • Last convolutional layer: more abstract patterns

  • Empty filter activations: the input image does not contain the pattern that the filter responds to (maybe it is dog-specific)

Hide code cell source
plot_activations(6, activations)
../_images/6ec7c6f4034ab7ea6fdd303e213e382525ce995d7db7ac29e23fd2cbb010f3ef.png
  • Same layer, with dog image input

    • Very different activations

Hide code cell source
plot_activations(6, activations2)
../_images/bdc95b816f00ad7081ed15376ac61aa797ea23266faa242bc92b26586e90edc4.png

Spatial hierarchies#

  • Deep convnets can learn spatial hierarchies of patterns

    • First layer can learn very local patterns (e.g. edges)

    • Second layer can learn specific combinations of patterns

    • Every layer can learn increasingly complex abstractions

ml

Visualizing the learned filters#

  • Visualize filters by finding the input image that they are maximally responsive to

  • gradient ascent in input space : start from a random image \(X_{(0)}\), keep the weights fixed, and use the filter's mean activation as the loss \(L\) to repeatedly update the input towards values that the filter responds to more strongly

    • \(X_{(i+1)} = X_{(i)} + \eta \cdot \frac{\partial L(X_{(i)})}{\partial X}\)

    from tensorflow.keras import backend as K
    layer_output = model.get_layer(layer_name).output
    loss = K.mean(layer_output[:, :, :, filter_index])  # Mean activation of the chosen filter
    grads = K.gradients(loss, model.input)[0]           # Gradient of the loss w.r.t. the input image
    iterate = K.function([model.input], [loss, grads])  # Returns loss and gradient for a given input
    input_img_data = np.random.random((1, size, size, 3)) * 20 + 128.  # Gray image with some noise
    step = 1.
    for i in range(40): # Run gradient ascent for 40 steps
        loss_value, grads_value = iterate([input_img_data])
        input_img_data += grads_value * step
Hide code cell source
#tf.compat.v1.disable_eager_execution()
from tensorflow.keras import backend as K
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

# Convert tensor to image
def deprocess_image(x):
    # normalize tensor: center on 0., ensure std is 0.1
    x -= x.mean()
    x /= (x.std() + 1e-5)
    x *= 0.1

    # clip to [0, 1]
    x += 0.5
    x = np.clip(x, 0, 1)

    # convert to RGB array
    x *= 255
    x = np.clip(x, 0, 255).astype('uint8')
    return x

def generate_pattern(layer_name, filter_index, size=150):
    # Build a loss function that maximizes the activation
    # of the nth filter of the layer considered.
    layer_output = model.get_layer(layer_name).output
    loss = K.mean(layer_output[:, :, :, filter_index])

    # Compute the gradient of the input picture wrt this loss
    grads = K.gradients(loss, model.input)[0]

    # Normalization trick: we normalize the gradient
    grads /= (K.sqrt(K.mean(K.square(grads))) + 1e-5)

    # This function returns the loss and grads given the input picture
    iterate = K.function([model.input], [loss, grads])
    
    # We start from a gray image with some noise
    input_img_data = np.random.random((1, size, size, 3)) * 20 + 128.

    # Run gradient ascent for 40 steps
    step = 1.
    for i in range(40):
        loss_value, grads_value = iterate([input_img_data])
        input_img_data += grads_value * step
        
    img = input_img_data[0]
    return deprocess_image(img)

def visualize_filter(layer_name):
    size = 64
    margin = 5

    # This is an empty (black) image where we will store our results.
    results = np.zeros((8 * size + 7 * margin, 8 * size + 7 * margin, 3))

    for i in range(8):  # iterate over the rows of our results grid
        for j in range(8):  # iterate over the columns of our results grid
            # Generate the pattern for filter `i + (j * 8)` in `layer_name`
            filter_img = generate_pattern(layer_name, i + (j * 8), size=size)

            # Put the result in the square `(i, j)` of the results grid
            horizontal_start = i * size + i * margin
            horizontal_end = horizontal_start + size
            vertical_start = j * size + j * margin
            vertical_end = vertical_start + size
            results[horizontal_start: horizontal_end, vertical_start: vertical_end, :] = filter_img

    return results
  • Learned filters of second convolutional layer

  • Mostly general, some respond to specific shapes/colors

Hide code cell source
# Throws an error. Skipping until fixed
# filters = visualize_filter('conv2d_10')
Hide code cell source
# Display the results grid
#plt.figure(figsize=(10, 10))
#plt.imshow((filters * 255).astype(np.uint8));
  • Learned filters of last convolutional layer

  • More focused on center, some vague cat/dog head shapes

ml

Let’s do this again for the VGG16 network pretrained on ImageNet (much larger)

model = VGG16(weights='imagenet', include_top=False)
Hide code cell source
from tensorflow.keras.applications import VGG16

model = VGG16(weights='imagenet',
              include_top=False)

layer_name = 'block3_conv1'
filter_index = 0

layer_output = model.get_layer(layer_name).output
loss = K.mean(layer_output[:, :, :, filter_index])
Hide code cell source
# VGG16 model
model.summary()
Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 block1_conv1 (Conv2D)       (None, None, None, 64)    1792      
                                                                 
 block1_conv2 (Conv2D)       (None, None, None, 64)    36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, None, None, 64)    0         
                                                                 
 block2_conv1 (Conv2D)       (None, None, None, 128)   73856     
                                                                 
 block2_conv2 (Conv2D)       (None, None, None, 128)   147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, None, None, 128)   0         
                                                                 
 block3_conv1 (Conv2D)       (None, None, None, 256)   295168    
                                                                 
 block3_conv2 (Conv2D)       (None, None, None, 256)   590080    
                                                                 
 block3_conv3 (Conv2D)       (None, None, None, 256)   590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, None, None, 256)   0         
                                                                 
 block4_conv1 (Conv2D)       (None, None, None, 512)   1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, None, None, 512)   2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, None, None, 512)   2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, None, None, 512)   0         
                                                                 
 block5_conv1 (Conv2D)       (None, None, None, 512)   2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, None, None, 512)   2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, None, None, 512)   2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, None, None, 512)   0         
                                                                 
=================================================================
Total params: 14714688 (56.13 MB)
Trainable params: 14714688 (56.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
  • Visualize convolution filters 0-2 from block3_conv1 (the 5th convolutional layer) of the VGG16 network trained on ImageNet

  • Some respond to dots or waves in the image

Hide code cell source
patterns = []
for i in range(3):
    patterns.append(generate_pattern('block3_conv1', i))
Hide code cell source
for i in range(3):
    plt.subplot(131+i) 
    plt.xticks([]) 
    plt.yticks([])
    plt.imshow(patterns[i])
plt.tight_layout()
plt.show();
../_images/31149223fd3ac01adef3e9cae85ece2493ace66a7ca984ecb13b880671e7b486.png

First 64 filters for 1st convolutional layer in block 1: simple edges and colors

Hide code cell source
patterns1 = visualize_filter('block1_conv1')
patterns2 = visualize_filter('block2_conv1')
patterns3 = visualize_filter('block3_conv1')
patterns4 = visualize_filter('block4_conv1')
Hide code cell source
# Display the results grid
plt.figure(figsize=(6*fig_scale, 6*fig_scale))
plt.axis("off")
plt.imshow((patterns1 * 255).astype(np.uint8));
../_images/db2faf0d8b507c2151a7bc63785b4f6822735156832c8cee68a9a0946808ed55.png

Filters in 2nd block of convolution layers: simple textures (combined edges and colors)

Hide code cell source
# Display the results grid
plt.figure(figsize=(6*fig_scale, 6*fig_scale))
plt.axis("off")
plt.imshow((patterns2 * 255).astype(np.uint8));
../_images/8491122185c8f8f3c07637fd65f2d0bc88accc55e7fb007334c2544ddcb02cfb.png

Filters in 3rd block of convolution layers: more natural textures

Hide code cell source
# Display the results grid
plt.figure(figsize=(6*fig_scale, 6*fig_scale))
plt.axis("off")
plt.imshow((patterns3 * 255).astype(np.uint8));
../_images/bae5db54e59b83a17f702285c2cddbf7c07dea0ebd77b6dfd8cb3da1b9e757f6.png

Filters in 4th block of convolution layers: feathers, eyes, leaves,…

Hide code cell source
# Display the results grid
plt.figure(figsize=(6*fig_scale, 6*fig_scale))
plt.axis("off")
plt.imshow((patterns4 * 255).astype(np.uint8));
../_images/126576e59ab858aea2bc7d0b15dd0feb8b0db8b21f3a8d8925a735813216229d.png

Visualizing class activation#

  • We can also visualize which parts of the input image had the greatest influence on the final classification. This helps to interpret what the model is paying attention to.

  • Class activation maps (CAM): produce a heatmap over the input image (see the sketch below)

    • Choose a convolutional layer and apply Global Average Pooling (GAP) to get one output per filter

    • Take the weights between those GAP outputs and the class of interest

    • Compute the weighted sum of all filter activations: this combines what each filter responds to with how much that response affects the class prediction

ml
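A minimal NumPy sketch of this weighted sum (illustrative only; conv_output and class_weights below are placeholder arrays, not values taken from a trained model):

import numpy as np

# Placeholder activations of the chosen conv layer for one image: (height, width, n_filters)
conv_output = np.random.rand(14, 14, 512)
# Placeholder weights from the GAP outputs to the class of interest: one weight per filter
class_weights = np.random.rand(512)

gap = conv_output.mean(axis=(0, 1))        # Global Average Pooling: one value per filter
class_score = class_weights @ gap          # how strongly this layer pushes the class prediction

# Class activation map: weighted sum of all filter activations
cam = np.tensordot(conv_output, class_weights, axes=([2], [0]))  # shape (14, 14)
cam = np.maximum(cam, 0)                   # keep only positive evidence
cam /= cam.max() + 1e-8                    # normalize to [0, 1] before overlaying on the image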
Hide code cell source
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras import backend as K

K.clear_session()
# Note that we are including the densely-connected classifier on top;
# all previous times, we were discarding it.
model = VGG16(weights='imagenet')
Hide code cell source
## From Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization

from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input, decode_predictions
import numpy as np

# The local path to our target image
img_path = '../notebooks/images/10_elephants.jpg'

# `img` is a PIL image of size 224x224
img = image.load_img(img_path, target_size=(224, 224))

# `x` is a float32 Numpy array of shape (224, 224, 3)
x = image.img_to_array(img)

# We add a dimension to transform our array into a "batch"
# of size (1, 224, 224, 3)
x = np.expand_dims(x, axis=0)

# Finally we preprocess the batch
# (this does channel-wise color normalization)
x = preprocess_input(x)

# This is the "african elephant" entry in the prediction vector
african_elephant_output = model.output[:, 386]

# This is the output feature map of the `block5_conv3` layer,
# the last convolutional layer in VGG16
last_conv_layer = model.get_layer('block5_conv3')

# This is the gradient of the "african elephant" class with regard to
# the output feature map of `block5_conv3`
grads = K.gradients(african_elephant_output, last_conv_layer.output)[0]

# This is a vector of shape (512,), where each entry
# is the mean intensity of the gradient over a specific feature map channel
pooled_grads = K.mean(grads, axis=(0, 1, 2))

# This function allows us to access the values of the quantities we just defined:
# `pooled_grads` and the output feature map of `block5_conv3`,
# given a sample image
iterate = K.function([model.input], [pooled_grads, last_conv_layer.output[0]])

# These are the values of these two quantities, as Numpy arrays,
# given our sample image of two elephants
pooled_grads_value, conv_layer_output_value = iterate([x])

# We multiply each channel in the feature map array
# by "how important this channel is" with regard to the elephant class
for i in range(512):
    conv_layer_output_value[:, :, i] *= pooled_grads_value[i]

# The channel-wise mean of the resulting feature map
# is our heatmap of class activation
heatmap = np.mean(conv_layer_output_value, axis=-1)

Example on VGG with a specific input image

  • Take the last convolutional layer of VGG16 pretrained on ImageNet

    • Its output feature map has 512 channels of size 14x14

model = VGG16(weights='imagenet')
last_conv_layer = model.get_layer('block5_conv3')
Hide code cell source
print("Last conv layer shape:", last_conv_layer.output_shape)
Last conv layer shape: (None, 14, 14, 512)
  • Choose an input image and preprocess it so we can feed it to the model

img = image.load_img(img_path, target_size=(224, 224))
  • Find the output node for its class (‘african elephant’, class 386)

african_elephant_output = model.output[:, 386]
ml

Preprocessing

  • Load image

  • Resize to 224 x 224 (what VGG was trained on)

  • Do the same preprocessing (Keras VGG utility)

from tensorflow.keras.applications.vgg16 import preprocess_input
img_path = '../notebooks/images/10_elephants.jpg'
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0) # Transform to batch of size (1, 224, 224, 3)
x = preprocess_input(x)       # Channel-wise color normalization (VGG preprocessing)
  • Sanity test: do we get the right prediction?

preds = model.predict(x)
Hide code cell source
preds = model.predict(x)
Hide code cell source
print('Predicted:', decode_predictions(preds, top=3)[0])
Predicted: [('n02504458', 'African_elephant', 0.99998724), ('n01871265', 'tusker', 1.2755356e-05), ('n02403003', 'ox', 2.5128067e-08)]
  • VGG16 doesn’t use GAP, so we use Grad-CAM instead: compute the gradient of the class output with respect to the conv layer’s feature maps, averaged per channel

  • Multiply the activations of the conv layer channel-wise by these averaged gradients

grads = K.gradients(african_elephant_output, last_conv_layer.output)[0]
pooled_grads = K.mean(grads, axis=(0, 1, 2))  # average gradient per channel, shape (512,)
iterate = K.function([model.input], [pooled_grads, last_conv_layer.output[0]])
pooled_grads_value, conv_layer_output_value = iterate([x])
for i in range(512): # 512 filters
    conv_layer_output_value[:, :, i] *= pooled_grads_value[i]
heatmap = np.mean(conv_layer_output_value, axis=-1)
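Note that K.gradients only works with eager execution disabled. In a plain TF2 (eager) session, roughly the same Grad-CAM computation could be sketched with tf.GradientTape; this is an illustrative sketch, not the lecture’s code, and it assumes the same model, preprocessed batch x, and class index 386:

import tensorflow as tf

# Model that maps the input image to (last conv feature map, class predictions)
grad_model = tf.keras.models.Model(
    inputs=model.input,
    outputs=[model.get_layer('block5_conv3').output, model.output])

with tf.GradientTape() as tape:
    conv_output, preds = grad_model(x)   # x: preprocessed batch of shape (1, 224, 224, 3)
    class_score = preds[:, 386]          # 'African elephant' prediction

grads = tape.gradient(class_score, conv_output)        # gradient of the class w.r.t. the feature map
pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))   # average gradient per channel, shape (512,)
heatmap = tf.reduce_mean(conv_output[0] * pooled_grads, axis=-1).numpy()  # shape (14, 14)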
  • Visualize heatmap. It’s 14x14 since that’s the output dimension of the conv layer

Hide code cell source
heatmap = np.maximum(heatmap, 0)
heatmap /= np.max(heatmap)
plt.figure(figsize=(4*fig_scale, 4*fig_scale))
plt.matshow(heatmap, fignum=1)
plt.axis("off");
../_images/b178fd3944bb34e7a2d21a66bf4b7b117e1619f13143e0893a074b2a127e92c4.png
Hide code cell source
# pip install opencv-python
import cv2

# We use cv2 to load the original image
img = cv2.imread(img_path)

# We resize the heatmap to have the same size as the original image
heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))

# We convert the heatmap to RGB
heatmap = np.uint8(255 * heatmap)

# We apply the heatmap to the original image
heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)

# 0.4 here is a heatmap intensity factor
superimposed_img = heatmap * 0.4 + img

# Save the image to disk
cv2.imwrite('../notebooks/images/elephant_cam.jpg', superimposed_img)
True
  • Upscaled and superimposed on the original image

  • The model looked at the face of the baby elephant and the trunk of the large elephant

Hide code cell source
img = cv2.imread('../notebooks/images/elephant_cam.jpg')
RGB_im = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

plt.imshow(RGB_im)
plt.title('Class activation map')
plt.xticks([])
plt.yticks([])
plt.show()
../_images/1154f9235dbfab9e72ebafb5e1f1eb0079b4513c46b59fe8f21736ee55631617.png

Transfer learning#

  • We can re-use pretrained networks instead of training from scratch

  • The learned features can act as a generic model of the visual world

  • Use the convolutional base to construct features, then train any classifier on the new data (see the sketch below)

  • This is called transfer learning , which is a kind of meta-learning

ml
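A minimal sketch of this idea (not part of the lecture code): run the images through a frozen VGG16 base and fit any off-the-shelf classifier on the extracted features. The image array and labels below are placeholders.

import numpy as np
from tensorflow.keras.applications import VGG16
from sklearn.linear_model import LogisticRegression

conv_base = VGG16(weights='imagenet', include_top=False, input_shape=(150, 150, 3))

# Placeholder data: in practice these would be your (rescaled) images and labels
X_images = np.random.rand(8, 150, 150, 3)
y = np.random.randint(0, 2, size=8)

features = conv_base.predict(X_images)                   # shape (n, 4, 4, 512)
features = features.reshape(len(features), 4 * 4 * 512)  # flatten for a classic classifier

clf = LogisticRegression(max_iter=1000).fit(features, y)
print(clf.score(features, y))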
  • Let’s instantiate the VGG16 model (without the dense layers)

  • With 150x150 inputs, the final feature map has shape (4, 4, 512)

conv_base = VGG16(weights='imagenet', include_top=False, input_shape=(150, 150, 3))
Hide code cell source
conv_base = VGG16(weights='imagenet', 
                  include_top=False,
                  input_shape=(150, 150, 3))
Hide code cell source
conv_base.summary()
Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_2 (InputLayer)        [(None, 150, 150, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 150, 150, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 150, 150, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 75, 75, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 75, 75, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 75, 75, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 37, 37, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 37, 37, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 37, 37, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 37, 37, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 18, 18, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 18, 18, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 18, 18, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 18, 18, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 9, 9, 512)         0         
                                                                 
 block5_conv1 (Conv2D)       (None, 9, 9, 512)         2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 9, 9, 512)         2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 9, 9, 512)         2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 4, 4, 512)         0         
                                                                 
=================================================================
Total params: 14714688 (56.13 MB)
Trainable params: 14714688 (56.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

Using pre-trained networks: 3 ways#

  • Fast feature extraction (for similar task, little data)

    • Call predict from the convolutional base to build new features

    • Use outputs as input to a new neural net (or other algorithm)

  • End-to-end tuning (for similar task, lots of data + data augmentation)

    • Extend the convolutional base model with a new dense layer

    • Train it end to end on the new data (expensive!)

  • Fine-tuning (for somewhat different task)

    • Unfreeze a few of the top convolutional layers, and retrain

    • Update only the more abstract representations

ml

Fast feature extraction (without data augmentation)#

  • Run every batch through the pre-trained convolutional base

features = conv_base.predict(inputs)
  • Build Dense neural net (with Dropout)

  • Train and evaluate with the transformed examples

model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim=4 * 4 * 512))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))

model.fit(features, labels, epochs=30)
Hide code cell source
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

train_dir = os.path.join(data_dir, 'train')
validation_dir = os.path.join(data_dir, 'validation')

datagen = ImageDataGenerator(rescale=1./255)
batch_size = 20

def extract_features(directory, sample_count):
    features = np.zeros(shape=(sample_count, 4, 4, 512))
    labels = np.zeros(shape=(sample_count))
    generator = datagen.flow_from_directory(
        directory,
        target_size=(150, 150),
        batch_size=batch_size,
        class_mode='binary')
    i = 0
    for inputs_batch, labels_batch in generator:
        features_batch = conv_base.predict(inputs_batch)
        features[i * batch_size : (i + 1) * batch_size] = features_batch
        labels[i * batch_size : (i + 1) * batch_size] = labels_batch
        i += 1
        if i * batch_size >= sample_count:
            # Note that since generators yield data indefinitely in a loop,
            # we must `break` after every image has been seen once.
            break
    return features, labels

Uncomment to recompute from scratch.

train_features, train_labels = extract_features(train_dir, 2000)
validation_features, validation_labels = extract_features(validation_dir, 1000)

train_features = np.reshape(train_features, (2000, 4 * 4 * 512))
validation_features = np.reshape(validation_features, (1000, 4 * 4 * 512))

Uncomment to recompute from scratch.

from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers

model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim=4 * 4 * 512))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.RMSprop(lr=2e-5),
              loss='binary_crossentropy',
              metrics=['acc'])

history = model.fit(train_features, train_labels,
                    epochs=30, verbose=0,
                    batch_size=20,
                    validation_data=(validation_features, validation_labels))

model.save(os.path.join(model_dir, 'cats_and_dogs_small_3a.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_3a_history.p'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)
  • Validation accuracy around 90%, much better!

  • Still overfitting, despite the Dropout: not enough training data

Hide code cell source
history = pickle.load(open("../data/models/cats_and_dogs_small_3a_history.p", "rb"))
print("Max val_acc",np.max(history['val_acc']))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Max val_acc 0.90500003
../_images/b373a9d3564a4f7c4dd68a12836f62f1f772feb60a92eead79c8afa357aec3e3.png

Fast feature extraction (with data augmentation)#

  • Simply add the Dense layers to the convolutional base

  • Freeze the convolutional base (before you compile)

    • Without freezing, you train it end-to-end (expensive)

model = models.Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
conv_base.trainable = False # Freeze the pretrained weights
Hide code cell source
from tensorflow.keras import models
from tensorflow.keras import layers

model = models.Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
conv_base.trainable = False
Hide code cell source
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 vgg16 (Functional)          (None, 4, 4, 512)         14714688  
                                                                 
 flatten (Flatten)           (None, 8192)              0         
                                                                 
 dense (Dense)               (None, 256)               2097408   
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
=================================================================
Total params: 16812353 (64.13 MB)
Trainable params: 2097665 (8.00 MB)
Non-trainable params: 14714688 (56.13 MB)
_________________________________________________________________
Hide code cell source
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers


train_datagen = ImageDataGenerator(
      rescale=1./255,
      rotation_range=40,
      width_shift_range=0.2,
      height_shift_range=0.2,
      shear_range=0.2,
      zoom_range=0.2,
      horizontal_flip=True,
      fill_mode='nearest')

# Note that the validation data should not be augmented!
validation_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        # This is the target directory
        train_dir,
        # All images will be resized to 150x150
        target_size=(150, 150),
        batch_size=20,
        # Since we use binary_crossentropy loss, we need binary labels
        class_mode='binary')

validation_generator = validation_datagen.flow_from_directory(
        validation_dir,
        target_size=(150, 150),
        batch_size=20,
        class_mode='binary')

model.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=2e-5),
              metrics=['acc'])
Found 20 images belonging to 2 classes.
Found 20 images belonging to 2 classes.
WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.RMSprop` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.RMSprop`.
WARNING:absl:`lr` is deprecated in Keras optimizer, please use `learning_rate` or use the legacy optimizer, e.g.,tf.keras.optimizers.legacy.RMSprop.

Uncomment to train from scratch (slow!)

history = model.fit_generator(
      train_generator,
      steps_per_epoch=100,
      epochs=30,
      validation_data=validation_generator,
      validation_steps=50,
      verbose=0)
model.save(os.path.join(model_dir, 'cats_and_dogs_small_3b.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_3b_history.p'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

Data augmentation and training (takes a LONG time)

train_datagen = ImageDataGenerator(
      rescale=1./255, rotation_range=40, width_shift_range=0.2,
      height_shift_range=0.2, shear_range=0.2, zoom_range=0.2,
      horizontal_flip=True, fill_mode='nearest')
train_generator = train_datagen.flow_from_directory(train_dir,
      target_size=(150, 150), batch_size=20, class_mode='binary')
history = model.fit_generator(
      train_generator, steps_per_epoch=100, epochs=30,
      validation_data=validation_generator, validation_steps=50)

We now get about 90% accuracy again, and very little overfitting

Hide code cell source
history = pickle.load(open("../data/models/cats_and_dogs_small_3b_history.p", "rb"))
print("Max val_acc",np.max(history['val_acc']))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Max val_acc 0.906
../_images/02dd1f19f70dd307de10b8f58117a32ffb952d057d7fa346cf61e8cb9a54f540.png

Fine-tuning#

  • Add your custom network on top of an already trained base network.

  • Freeze the base network, but unfreeze the last block of conv layers.

conv_base.trainable = True
set_trainable = False
for layer in conv_base.layers:
    if layer.name == 'block5_conv1':
        set_trainable = True  # unfreeze from block5_conv1 onwards
    layer.trainable = set_trainable

Visualized

ml ml
Hide code cell source
conv_base.trainable = True

set_trainable = False
for layer in conv_base.layers:
    if layer.name == 'block5_conv1':
        set_trainable = True
    if set_trainable:
        layer.trainable = True
    else:
        layer.trainable = False
Hide code cell source
conv_base.summary()
Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_2 (InputLayer)        [(None, 150, 150, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 150, 150, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 150, 150, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 75, 75, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 75, 75, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 75, 75, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 37, 37, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 37, 37, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 37, 37, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 37, 37, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 18, 18, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 18, 18, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 18, 18, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 18, 18, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 9, 9, 512)         0         
                                                                 
 block5_conv1 (Conv2D)       (None, 9, 9, 512)         2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 9, 9, 512)         2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 9, 9, 512)         2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 4, 4, 512)         0         
                                                                 
=================================================================
Total params: 14714688 (56.13 MB)
Trainable params: 7079424 (27.01 MB)
Non-trainable params: 7635264 (29.13 MB)
_________________________________________________________________
  • Load the trained network and fine-tune it

    • Use a small learning rate and a large number of epochs

    • You don’t want to unlearn too much (catastrophic forgetting)

model = load_model(os.path.join(model_dir, 'cats_and_dogs_small_3b.h5'))
model.compile(loss='binary_crossentropy', 
              optimizer=optimizers.RMSprop(lr=1e-5),
              metrics=['acc'])
history = model.fit(
      train_generator, steps_per_epoch=100, epochs=100,
      validation_data=validation_generator,
      validation_steps=50)

Uncomment to run from scratch (slow!)

from keras.models import load_model
model = load_model(os.path.join(model_dir, 'cats_and_dogs_small_3b.h5'))

model.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-5),
              metrics=['acc'])

history = model.fit(
      train_generator,
      steps_per_epoch=100,
      epochs=100, # Repeat with epochs=100
      validation_data=validation_generator,
      validation_steps=50)
model.save(os.path.join(model_dir, 'cats_and_dogs_small_4.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_4_history.p'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

Validation accuracy now reaches about 91%, a bit better than before. The curves are quite noisy, though.

Hide code cell source
history = pickle.load(open("../data/models/cats_and_dogs_small_4_history.p", "rb"))
print("Max val_acc",np.max(history['val_acc']))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Max val_acc 0.90800005
../_images/577d858d72b339fba5a91dfa4e219d924bfa2b8994c7612591420d93cde24e1f.png
  • We can smooth the learning curves using an exponential running average

Hide code cell source
def smooth_curve(points, factor=0.8):
    smoothed_points = []
    for point in points:
        if smoothed_points:
            previous = smoothed_points[-1]
            smoothed_points.append(previous * factor + point * (1 - factor))
        else:
            smoothed_points.append(point)
    return smoothed_points

smooth_history = {}
smooth_history['loss'] = smooth_curve(history['loss'])
smooth_history['acc'] = smooth_curve(history['acc'])
smooth_history['val_loss'] = smooth_curve(history['val_loss'])
smooth_history['val_acc'] = smooth_curve(history['val_acc'])

print("Max val_acc",np.max(smooth_history['val_acc']))
pd.DataFrame(smooth_history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Max val_acc 0.9039536851123335
../_images/765aba5f89f01b861afb79157787cad1cfd90cdfdb9213a41fb8e441d219fc7c.png

Take-aways#

  • Convnets are ideal for addressing image-related problems.

  • They learn a hierarchy of modular patterns and concepts to represent the visual world.

  • Representations are easy to inspect

  • Data augmentation helps fight overfitting

  • You can use a pretrained convnet to build better models via transfer learning