Lecture 9: Convolutional Neural Networks#
Handling image data
Joaquin Vanschoren, Eindhoven University of Technology
Overview#
Image convolution
Convolutional neural networks
Data augmentation
Model interpretation
Using pre-trained networks (transfer learning)
Show code cell source
# Auto-setup when running on Google Colab
import os
import tensorflow as tf
if 'google.colab' in str(get_ipython()) and not os.path.exists('/content/master'):
!git clone -q https://github.com/ML-course/master.git /content/master
!pip --quiet install -r /content/master/requirements_colab.txt
%cd master/notebooks
# Global imports and settings
%matplotlib inline
from preamble import *
interactive = True # Set to True for interactive plots
if interactive:
fig_scale = 0.5
plt.rcParams.update(print_config)
else: # For printing
fig_scale = 0.4
plt.rcParams.update(print_config)
Show code cell source
data_dir = '../data/cats-vs-dogs_small'
model_dir = '../data/models'
if not os.path.exists(model_dir):
os.makedirs(model_dir)
Show code cell source
# This notebook includes models that are too slow to rebuild often,
# so we are downloading pretrained versions
# !pip install gdown # uncomment to install the downloader
import gdown
import zipfile
if not os.path.exists(os.path.join(model_dir,"cats_and_dogs_small_1.h5")):
url = 'https://drive.google.com/uc?id=1p10QM5jvTSJsw3060JkKSBrxDy9HwBko'
output = os.path.join(model_dir,"lecture9_models.zip")
gdown.download(url, output, quiet=False)
with zipfile.ZipFile(output, 'r') as zip_ref:
zip_ref.extractall(model_dir)
os.remove(output)
download_cats = False
if download_cats:
url = 'https://drive.google.com/uc?id=1xojwqGMrwiWLBBcVNXurBZ0OyFqsS_V0'
output = os.path.join(data_dir,"lecture9_data.zip")
gdown.download(url, output, quiet=False)
with zipfile.ZipFile(output, 'r') as zip_ref:
zip_ref.extractall(data_dir)
os.remove(output)
Convolutions#
Operation that transforms an image by sliding a smaller image (called a filter or kernel) over the image and multiplying the pixel values
Slide an \(n\) x \(n\) filter over \(n\) x \(n\) patches of the original image
Every pixel is replaced by the sum of the element-wise products of the values of the image patch around that pixel and the kernel
# kernel and image_patch are n x n matrices
pixel_out = np.sum(kernel * image_patch)
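For illustration, a minimal NumPy sketch of this operation (assuming a grayscale image, an odd-sized kernel, and zero padding so the output keeps the input size):
import numpy as np

def convolve2d(image, kernel):
    """ Replace every pixel by the sum of the element-wise product of
    the kernel and the image patch centered around that pixel. """
    k = kernel.shape[0] // 2                    # half-width of the (odd-sized) kernel
    padded = np.pad(image, k, mode='constant')  # zero-padding to keep the output size
    output = np.zeros_like(image, dtype=float)
    for i in range(image.shape[0]):
        for j in range(image.shape[1]):
            patch = padded[i:i + kernel.shape[0], j:j + kernel.shape[1]]
            output[i, j] = np.sum(kernel * patch)
    return output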
Show code cell source
from __future__ import print_function
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from skimage import color
# Visualize convolution. See https://tonysyu.github.io/
def iter_pixels(image):
""" Yield pixel position (row, column) and pixel intensity. """
height, width = image.shape[:2]
for i in range(height):
for j in range(width):
yield (i, j), image[i, j]
# Visualize result
def imshow_pair(image_pair, titles=('', ''), figsize=(10, 5), **kwargs):
fig, axes = plt.subplots(ncols=2, figsize=figsize)
for ax, img, label in zip(axes.ravel(), image_pair, titles):
ax.imshow(img, **kwargs)
ax.set_title(label, fontdict={'fontsize':32*fig_scale})
ax.set_xticks([])
ax.set_yticks([])
# Visualize result
def imshow_triple(axes, image_pair, titles=('', '', ''), figsize=(10, 5), **kwargs):
for ax, img, label in zip(axes, image_pair, titles):
ax.imshow(img, **kwargs)
ax.set_title(label, fontdict={'fontsize':10*fig_scale})
ax.set_xticks([])
ax.set_yticks([])
# Zero-padding
def padding_for_kernel(kernel):
""" Return the amount of padding needed for each side of an image.
For example, if the returned result is [1, 2], then this means an
image should be padded with 1 extra row on top and bottom, and 2
extra columns on the left and right.
"""
# Slice to ignore RGB channels if they exist.
image_shape = kernel.shape[:2]
# We only handle kernels with odd dimensions so make sure that's true.
# (The "center" pixel of an even number of pixels is arbitrary.)
assert all((size % 2) == 1 for size in image_shape)
return [(size - 1) // 2 for size in image_shape]
def add_padding(image, kernel):
h_pad, w_pad = padding_for_kernel(kernel)
return np.pad(image, ((h_pad, h_pad), (w_pad, w_pad)),
mode='constant', constant_values=0)
def remove_padding(image, kernel):
inner_region = [] # A 2D slice for grabbing the inner image region
for pad in padding_for_kernel(kernel):
slice_i = np.s_[:] if pad == 0 else np.s_[pad: -pad]
inner_region.append(slice_i)
return image # [inner_region] # Broken in numpy 1.24, doesn't seem necessary
# Slice windows
def window_slice(center, kernel):
r, c = center
r_pad, c_pad = padding_for_kernel(kernel)
# Slicing is (inclusive, exclusive) so add 1 to the stop value
return np.s_[r-r_pad:r+r_pad+1, c-c_pad:c+c_pad+1]
# Apply convolution kernel to image patch
def apply_kernel(center, kernel, original_image):
image_patch = original_image[window_slice(center, kernel)]
# An element-wise multiplication followed by the sum
return np.sum(kernel * image_patch)
# Move kernel over the image
def iter_kernel_labels(image, kernel):
original_image = image
image = add_padding(original_image, kernel)
i_pad, j_pad = padding_for_kernel(kernel)
for (i, j), pixel in iter_pixels(original_image):
# Shift the center of the kernel to ignore padded border.
i += i_pad
j += j_pad
mask = np.zeros(image.shape, dtype=int) # Background = 0
mask[window_slice((i, j), kernel)] = kernel # Kernel = 1
#mask[i, j] = 2 # Kernel-center = 2
yield (i, j), mask
# Visualize kernel as it moves over the image
def visualize_kernel(kernel_labels, image):
return kernel_labels + image #color.label2rgb(kernel_labels, image, bg_label=0)
# Do a single step
def convolution_demo(image, kernel, **kwargs):
# Initialize generator since we're only ever going to iterate over
# a pixel once. The cached result is used, if we step back.
gen_kernel_labels = iter_kernel_labels(image, kernel)
image_cache = []
image_padded = add_padding(image, kernel)
# Plot original image and kernel-overlay next to filtered image.
@interact(i_step=(0, image.size-1,1))
def convolution_step(i_step=0):
# Create all images up to the current step, unless they're already
# cached:
while i_step >= len(image_cache):
# For the first step (`i_step == 0`), the original image is the
# filtered image; after that we look in the cache, which stores
# (`kernel_overlay`, `filtered`).
filtered_prev = image_padded if i_step == 0 else image_cache[-1][1]
# We don't want to overwrite the previously filtered image:
filtered = filtered_prev.copy()
# Get the labels used to visualize the kernel
center, kernel_labels = next(gen_kernel_labels)
# Modify the pixel value at the kernel center
filtered[center] = apply_kernel(center, kernel, image_padded)
# Take the original image and overlay our kernel visualization
kernel_overlay = visualize_kernel(kernel_labels, image_padded)
# Save images for reuse.
image_cache.append((kernel_overlay, filtered))
# Remove padding we added to deal with boundary conditions
# (Loop since each step has 2 images)
image_pair = [remove_padding(each, kernel)
for each in image_cache[i_step]]
imshow_pair(image_pair, **kwargs)
plt.show()
return convolution_step
# Full process
def convolution_full(ax, image, kernel, **kwargs):
# Initialize generator since we're only ever going to iterate over
# a pixel once. The cached result is used, if we step back.
gen_kernel_labels = iter_kernel_labels(image, kernel)
print(gen_kernel_labels)
image_cache = []
image_padded = add_padding(image, kernel)
# Plot original image and kernel-overlay next to filtered image.
for i_step in range(image.size-1):
# For the first step (`i_step == 0`), the original image is the
# filtered image; after that we look in the cache, which stores
# (`kernel_overlay`, `filtered`).
filtered_prev = image_padded if i_step == 0 else image_cache[-1][1]
# We don't want to overwrite the previously filtered image:
filtered = filtered_prev.copy()
# Get the labels used to visualize the kernel
center, kernel_labels = next(gen_kernel_labels)
# Modify the pixel value at the kernel center
filtered[center] = apply_kernel(center, kernel, image_padded)
# Take the original image and overlay our kernel visualization
kernel_overlay = visualize_kernel(kernel_labels, image_padded)
# Save images for reuse.
image_cache.append((kernel_overlay, filtered))
# Remove padding we added to deal with boundary conditions
# (Loop since each step has 2 images)
image_triple = [remove_padding(each, kernel)
for each in image_cache[i_step]]
image_triple.insert(1,kernel)
imshow_triple(ax, image_triple, **kwargs)
Different kernels can detect different types of patterns in the image
Show code cell source
horizontal_edge_kernel = np.array([[ 1, 2, 1],
[ 0, 0, 0],
[-1, -2, -1]])
diagonal_edge_kernel = np.array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1]])
edge_detect_kernel = np.array([[-1, -1, -1],
[-1, 8, -1],
[-1, -1, -1]])
Show code cell source
mnist_data = oml.datasets.get_dataset(554) # Download MNIST data
# Get the predictors X and the labels y
X_mnist, y_mnist, c, a = mnist_data.get_data(dataset_format='array', target=mnist_data.default_target_attribute);
image = X_mnist[1].reshape((28, 28))
image = (image - np.min(image))/np.ptp(image) # Normalize
titles = ('Image and kernel', 'Filtered image')
convolution_demo(image, horizontal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r');
Show code cell source
if not interactive:
fig, axs = plt.subplots(3, 3, figsize=(5*fig_scale, 5*fig_scale))
titles = ('Image and kernel', 'Hor. edge filter', 'Filtered image')
convolution_full(axs[0,:], image, horizontal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
titles = ('Image and kernel', 'Edge detect filter', 'Filtered image')
convolution_full(axs[1,:], image, edge_detect_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
titles = ('Image and kernel', 'Diag. edge filter', 'Filtered image')
convolution_full(axs[2,:], image, diagonal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
plt.tight_layout()
Demonstration on Google streetview data#
House numbers photographed from Google streetview imagery, cropped and centered around digits, but with neighboring numbers or other edge artifacts.
Show code cell source
SVHN = oml.datasets.get_dataset(41081)
X, y, cats, attrs = SVHN.get_data(dataset_format='array',
target=SVHN.default_target_attribute)
Show code cell source
def plot_images(X, y, grayscale=False):
fig, axes = plt.subplots(1, len(X), figsize=(10, 5))
for n in range(len(X)):
if grayscale:
axes[n].imshow(X[n].reshape(32, 32)/255, cmap='gray')
else:
axes[n].imshow(X[n].reshape(32, 32, 3)/255)
axes[n].set_xlabel((y[n]+1), fontsize=32*fig_scale) # Label is index+1
axes[n].set_xticks(()), axes[n].set_yticks(())
plt.show();
images = range(5)
X_sub_color = [X[i] for i in images]
y_sub = [y[i] for i in images]
plot_images(X_sub_color, y_sub)
For recognizing digits, color is not important, so we grayscale the images
Show code cell source
def rgb2gray(X, dim=32):
return np.expand_dims(np.dot(X.reshape(len(X), dim*dim, 3), [0.2990, 0.5870, 0.1140]), axis=2)
Xsm = rgb2gray(X[:100])
X_sub = [Xsm[i] for i in images]
plot_images(X_sub, y_sub, grayscale=True)
Demonstration
Show code cell source
def normalize_image(X):
image = X.reshape((32, 32))
return (image - np.min(image))/np.ptp(image) # Normalize
image = normalize_image(X_sub[3])
demo2 = convolution_demo(image, horizontal_edge_kernel,
vmin=-4, vmax=4, cmap='gray_r');
Show code cell source
if not interactive:
fig, axs = plt.subplots(3, 3, figsize=(5*fig_scale, 5*fig_scale))
titles = ('Image and kernel', 'Hor. edge filter', 'Filtered image')
convolution_full(axs[0,:], image, horizontal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
titles = ('Image and kernel', 'Diag. edge filter', 'Filtered image')
convolution_full(axs[1,:], image, diagonal_edge_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
titles = ('Image and kernel', 'Edge detect filter', 'Filtered image')
convolution_full(axs[2,:], image, edge_detect_kernel, vmin=-4, vmax=4, titles=titles, cmap='gray_r')
plt.tight_layout()
Image convolution in practice#
How do we know which filters are best for a given image?
Families of kernels (or filter banks ) can be run on every image
Gabor, Sobel, Haar Wavelets,…
Gabor filters: Wave patterns generated by changing:
Frequency: narrow or wide undulations
Theta: angle (direction) of the wave
Sigma: resolution (size of the filter)
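A minimal sketch (using scikit-image's gabor_kernel and SciPy, as in the demos below) of generating one Gabor kernel and computing its response, assuming image is a normalized grayscale array as defined earlier:
import numpy as np
from scipy import ndimage as ndi
from skimage.filters import gabor_kernel

# One Gabor kernel: wave frequency, orientation (theta), and size (sigma)
kernel = gabor_kernel(frequency=0.2, theta=np.pi/4, sigma_x=3, sigma_y=3)
# Convolve the (normalized, grayscale) image with the real part of the kernel
response = ndi.convolve(image, np.real(kernel), mode='wrap')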
Demonstration
Show code cell source
from scipy import ndimage as ndi
from skimage import data
from skimage.util import img_as_float
from skimage.filters import gabor_kernel
# Gabor Filters.
@interact
def demoGabor(frequency=(0.01,1,0.05), theta=(0,3.14,0.1), sigma=(0,5,0.1)):
plt.gray()
plt.imshow(np.real(gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)), interpolation='nearest')
plt.title(f'freq: {frequency}, theta: {theta}, sigma: {sigma}', fontdict={'fontsize':16*fig_scale})
plt.xticks([])
plt.yticks([])
Show code cell source
if not interactive:
plt.subplot(1, 3, 1)
demoGabor(frequency=0.16, theta=1.2, sigma=4.0)
plt.subplot(1, 3, 2)
demoGabor(frequency=0.31, theta=0, sigma=3.6)
plt.subplot(1, 3, 3)
demoGabor(frequency=0.36, theta=1.6, sigma=1.3)
plt.tight_layout()
Demonstration on the streetview data
Show code cell source
# Calculate the magnitude of the Gabor filter response given a kernel and an input image
def magnitude(image, kernel):
image = (image - image.mean()) / image.std() # Normalize images
return np.sqrt(ndi.convolve(image, np.real(kernel), mode='wrap')**2 +
ndi.convolve(image, np.imag(kernel), mode='wrap')**2)
Show code cell source
@interact
def demoGabor2(frequency=(0.01,1,0.05), theta=(0,3.14,0.1), sigma=(0,5,0.1)):
plt.subplot(131)
plt.title('Original', fontdict={'fontsize':24*fig_scale})
plt.imshow(image)
plt.xticks([])
plt.yticks([])
plt.subplot(132)
plt.title('Gabor kernel', fontdict={'fontsize':24*fig_scale})
plt.imshow(np.real(gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)), interpolation='nearest')
plt.xticks([])
plt.yticks([])
plt.subplot(133)
plt.title('Response magnitude', fontdict={'fontsize':24*fig_scale})
plt.imshow(np.real(magnitude(image, gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma))), interpolation='nearest')
plt.tight_layout()
plt.xticks([])
plt.yticks([])
Show code cell source
if not interactive:
demoGabor2(frequency=0.16, theta=1.4, sigma=1.2)
Filter banks#
Different filters detect different edges, shapes,…
Not all seem useful
Show code cell source
# More images
image3 = normalize_image(Xsm[3])
image5 = normalize_image(Xsm[5])
image13 = normalize_image(Xsm[13])
image_names = ('3', '5', '8') # labels
images = (image3, image5, image13)
def plot_filter_bank(images):
# Create a set of kernels, apply them to each image, store the results
results = []
kernel_params = []
for theta in (0, 1):
theta = theta / 4. * np.pi
for frequency in (0.1, 0.2):
for sigma in (1, 3):
kernel = gabor_kernel(frequency, theta=theta,sigma_x=sigma,sigma_y=sigma)
params = 'theta=%.2f,\nfrequency=%.2f\nsigma=%.2f' % (theta, frequency, sigma)
kernel_params.append(params)
results.append((kernel, [magnitude(img, kernel) for img in images]))
# Plotting
fig, axes = plt.subplots(nrows=4, ncols=9, figsize=(14*fig_scale, 8*fig_scale))
plt.gray()
#fig.suptitle('Image responses for Gabor filter kernels', fontsize=12)
axes[0][0].axis('off')
for label, img, ax in zip(image_names, images, axes[1:]):
axs = ax[0]
axs.imshow(img)
axs.set_ylabel(label, fontsize=12*fig_scale)
axs.set_xticks([]) # Remove axis ticks
axs.set_yticks([])
# Plot Gabor kernel
col = 1
for label, (kernel, magnitudes), ax_col in zip(kernel_params, results, axes[0][1:]):
ax_col.imshow(np.real(kernel), interpolation='nearest') # Plot kernel
ax_col.set_title(label, fontsize=10*fig_scale)
ax_col.axis('off')
# Plot Gabor responses with the contrast normalized for each filter
vmin = np.min(magnitudes)
vmax = np.max(magnitudes)
for patch, ax in zip(magnitudes, axes.T[col][1:]):
ax.imshow(patch, vmin=vmin, vmax=vmax) # Plot convolutions
ax.axis('off')
col += 1
plt.show()
plot_filter_bank(images)
Another example: Fashion MNIST
Show code cell source
fmnist_data = oml.datasets.get_dataset(40996) # Download FMNIST data
# Get the predictors X and the labels y
X_fm, y_fm, c, a = fmnist_data.get_data(dataset_format='array', target=fmnist_data.default_target_attribute);
Show code cell source
# build a list of figures for plotting
def buildFigureList(fig, subfiglist, titles, length):
for i in range(0,length):
pixels = np.array(subfiglist[i], dtype='float')
pixels = pixels.reshape((28, 28))
a=fig.add_subplot(1,length,i+1)
imgplot =plt.imshow(pixels, cmap='gray_r')
a.set_title(titles[i], fontsize=6)
a.axes.get_xaxis().set_visible(False)
a.axes.get_yaxis().set_visible(False)
return
subfiglist = []
titles=[]
for i in range(0,10):
subfiglist.append(X_fm[i])
titles.append(i)
buildFigureList(plt.figure(1),subfiglist, titles, 10)
plt.show()
Demonstration
Show code cell source
boot = X_fm[0].reshape((28, 28))
image2=boot
@interact
def demoGabor3(frequency=(0.01,1,0.05), theta=(0,3.14,0.1), sigma=(0,5,0.1)):
plt.subplot(131)
plt.title('Original', fontdict={'fontsize':24*fig_scale})
plt.imshow(image2)
plt.xticks([]) # Remove axis ticks
plt.yticks([])
plt.subplot(132)
plt.title('Gabor kernel', fontdict={'fontsize':24*fig_scale})
plt.imshow(np.real(gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma)), interpolation='nearest')
plt.xticks([]) # Remove axis ticks
plt.yticks([])
plt.subplot(133)
plt.xticks([]) # Remove axis ticks
plt.yticks([])
plt.title('Response magnitude', fontdict={'fontsize':24*fig_scale})
plt.imshow(np.real(magnitude(image2, gabor_kernel(frequency=frequency, theta=theta, sigma_x=sigma, sigma_y=sigma))), interpolation='nearest')
Show code cell source
if not interactive:
demoGabor3(frequency=0.81, theta=2.7, sigma=0.9)
Fashion MNIST with multiple filters (filter bank)
Show code cell source
# Fetch some Fashion-MNIST images
boot = X_fm[0].reshape(28, 28)
shirt = X_fm[1].reshape(28, 28)
dress = X_fm[2].reshape(28, 28)
image_names = ('boot', 'shirt', 'dress')
images = (boot, shirt, dress)
plot_filter_bank(images)
Convolutional neural nets#
Finding relationships between individual pixels and the correct class is hard
We want to discover ‘local’ patterns (edges, lines, endpoints)
Representing such local patterns as features makes it easier to learn from them
We could use convolutions, but how to choose the filters?
Convolutional Neural Networks (ConvNets)#
Instead of manually designing the filters, we can also learn them based on data
Choose filter sizes (manually), initialize with small random weights
Forward pass: Convolutional layer slides the filter over the input, generates the output
Backward pass: Update the filter weights according to the loss gradient
Illustration for 1 filter:
Convolutional layers: Feature maps#
One filter is not sufficient to detect all relevant patterns in an image
A convolutional layer applies and learns \(d\) filters in parallel
Slide \(d\) filters across the input image (in parallel) -> a (1x1xd) output per patch
Reassemble into a feature map with \(d\) ‘channels’, a (width x height x d) tensor.
Border effects (zero padding)#
Consider a 5x5 image and a 3x3 filter: there are only 9 possible locations, hence the output is a 3x3 feature map
If we want to maintain the image size, we use zero-padding, adding 0’s all around the input tensor.
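A small shape-only sketch in Keras: padding='valid' (no padding) shrinks the output, while padding='same' (zero padding) preserves the input size.
import numpy as np
from tensorflow.keras import layers

x = np.zeros((1, 5, 5, 1), dtype='float32')                 # one 5x5 single-channel image
print(layers.Conv2D(1, (3, 3), padding='valid')(x).shape)   # (1, 3, 3, 1): output shrinks
print(layers.Conv2D(1, (3, 3), padding='same')(x).shape)    # (1, 5, 5, 1): size preserved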
Undersampling (striding)#
Sometimes, we want to downsample a high-resolution image
Faster processing, less noisy (hence less overfitting)
One approach is to skip values during the convolution
Distance between 2 windows: stride length
Example with stride length 2 (without padding):
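The same shape-only sketch with stride 2 and no padding, assuming a 5x5 input:
import numpy as np
from tensorflow.keras import layers

x = np.zeros((1, 5, 5, 1), dtype='float32')
y = layers.Conv2D(1, (3, 3), strides=2, padding='valid')(x)
print(y.shape)  # (1, 2, 2, 1): the filter is only applied every 2 pixels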
Max-pooling#
Another approach to shrink the input tensors is max-pooling :
Run a filter with a fixed stride length over the image
Usually 2x2 filters and stride length 2
The filter simply returns the max (or average) of all values
Aggressively reduces the number of weights (less overfitting)
Information from every input node spreads more quickly to output nodes
In 'pure' convnets, one input value spreads to 3x3 nodes of the first layer, 5x5 nodes of the second, etc.
Without max-pooling, you need much deeper networks, which are harder to train
Increases translation invariance: patterns can affect the predictions no matter where they occur in the image
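A minimal NumPy sketch of what 2x2 max-pooling (stride 2) computes on a single feature map:
import numpy as np

def max_pool_2x2(feature_map):
    """ Keep only the maximum of every non-overlapping 2x2 block. """
    h, w = feature_map.shape[0] // 2, feature_map.shape[1] // 2
    return feature_map[:2*h, :2*w].reshape(h, 2, w, 2).max(axis=(1, 3))

x = np.arange(16).reshape(4, 4)
print(max_pool_2x2(x))  # [[ 5  7]
                        #  [13 15]]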
Convolutional nets in practice#
Use multiple convolutional layers to learn patterns at different levels of abstraction
Find local patterns first (e.g. edges), then patterns across those patterns
Use MaxPooling layers to reduce resolution, increase translation invariance
Use sufficient filters in the first layer (otherwise information gets lost)
In deeper layers, use increasingly more filters
Preserve information about the input as the resolution decreases
Avoid decreasing the number of activations (resolution x number of filters)
For very deep nets, add skip connections to preserve information (and gradients)
Sums up outputs of earlier layers to those of later layers (with same dimensions)
Example with Keras#
Conv2D for 2D convolutional layers
32 filters (default), randomly initialized (from a uniform distribution)
Deeper layers use 64 filters
Filter size is 3x3
ReLU activation to simplify training of deeper networks
MaxPooling2D for max-pooling
2x2 pooling reduces the number of inputs by a factor of 4
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
Show code cell source
from tensorflow.keras import layers
from tensorflow.keras import models
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
Observe how the 28x28x1 input image is transformed into a 3x3x64 feature map
Convolutional layers:
No zero-padding: every output is 2 pixels smaller in each dimension
320 weights: (3x3 filter weights + 1 bias) * 32 filters
After every MaxPooling layer, the resolution is halved in each dimension
Show code cell source
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 26, 26, 32) 320
max_pooling2d (MaxPooling2 (None, 13, 13, 32) 0
D)
conv2d_1 (Conv2D) (None, 11, 11, 64) 18496
max_pooling2d_1 (MaxPoolin (None, 5, 5, 64) 0
g2D)
conv2d_2 (Conv2D) (None, 3, 3, 64) 36928
=================================================================
Total params: 55744 (217.75 KB)
Trainable params: 55744 (217.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
To classify the images, we still need a Dense and Softmax layer.
We can flatten the 3x3x64 feature map to a vector of size 576
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
Show code cell source
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
Show code cell source
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 26, 26, 32) 320
max_pooling2d (MaxPooling2 (None, 13, 13, 32) 0
D)
conv2d_1 (Conv2D) (None, 11, 11, 64) 18496
max_pooling2d_1 (MaxPoolin (None, 5, 5, 64) 0
g2D)
conv2d_2 (Conv2D) (None, 3, 3, 64) 36928
flatten (Flatten) (None, 576) 0
dense (Dense) (None, 64) 36928
dense_1 (Dense) (None, 10) 650
=================================================================
Total params: 93322 (364.54 KB)
Trainable params: 93322 (364.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Flatten adds a lot of weights
Instead, we can use GlobalAveragePooling: returns the average of each activation map
This sometimes works even without adding a dense layer (if the number of outputs is similar to the number of classes)
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(10, activation='softmax'))
With GlobalAveragePooling: much fewer weights to learn
Use with caution: this destroys the location information learned by the CNN
Not ideal for tasks such as object localization
Show code cell source
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(10, activation='softmax'))
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_3 (Conv2D) (None, 26, 26, 32) 320
max_pooling2d_2 (MaxPoolin (None, 13, 13, 32) 0
g2D)
conv2d_4 (Conv2D) (None, 11, 11, 64) 18496
max_pooling2d_3 (MaxPoolin (None, 5, 5, 64) 0
g2D)
conv2d_5 (Conv2D) (None, 3, 3, 64) 36928
global_average_pooling2d ( (None, 64) 0
GlobalAveragePooling2D)
dense_2 (Dense) (None, 10) 650
=================================================================
Total params: 56394 (220.29 KB)
Trainable params: 56394 (220.29 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Show code cell source
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
(train_images, train_labels), (validation_images, validation_labels) = mnist.load_data()
train_images = train_images.reshape((60000, 28, 28, 1))
train_images = train_images.astype('float32') / 255
validation_images = validation_images.reshape((10000, 28, 28, 1))
validation_images = validation_images.astype('float32') / 255
train_labels = to_categorical(train_labels)
validation_labels = to_categorical(validation_labels)
Run the model on the MNIST dataset
Train and test as usual: 99% accuracy
Compared to 97.8% accuracy with the dense architecture
Flatten and GlobalAveragePooling yield similar performance
The model was trained beforehand and saved. Uncomment if you want to run it from scratch
import pickle
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
history = model.fit(train_images, train_labels, epochs=5, batch_size=64, verbose=0,
validation_data=(validation_images, validation_labels))
model.save(os.path.join(model_dir, 'mnist.h5'))
with open(os.path.join(model_dir, 'mnist_history.p'), 'wb') as file_pi:
pickle.dump(history.history, file_pi)
Show code cell source
import pickle
from tensorflow.keras.models import load_model
model = load_model(os.path.join(model_dir, 'mnist.h5'))
validation_loss, validation_acc = model.evaluate(validation_images, validation_labels, verbose=0)
Show code cell source
plt.figure()
plt.rcParams.update({'figure.figsize': (12*fig_scale,5*fig_scale),
'font.size': 16*fig_scale,
'xtick.labelsize': 14*fig_scale,
'ytick.labelsize': 14*fig_scale});
Show code cell source
print("Accuracy: ", validation_acc)
history = pickle.load(open("../data/models/mnist_history.p", "rb"))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Accuracy: 0.9871000051498413
Tip:
Training ConvNets can take a lot of time
Save the trained model (and history) to disk so that you can reload it later
model.save(os.path.join(model_dir, 'mnist.h5'))
with open(os.path.join(model_dir, 'mnist_history.p'), 'wb') as file_pi:
pickle.dump(history.history, file_pi)
Show code cell source
# For validation of the GAP layer. Gives the same accuracy
#model = models.Sequential()
#model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
#model.add(layers.MaxPooling2D((2, 2)))
#model.add(layers.Conv2D(64, (3, 3), activation='relu'))
#model.add(layers.MaxPooling2D((2, 2)))
#model.add(layers.Conv2D(64, (3, 3), activation='relu'))
#model.add(layers.GlobalAveragePooling2D())
#model.add(layers.Dense(10, activation='softmax'))
#model.compile(optimizer='rmsprop',
# loss='categorical_crossentropy',
# metrics=['accuracy'])
#history = model.fit(train_images, train_labels, epochs=5, batch_size=64, verbose=0,
# validation_data=(validation_images, validation_labels))
#validation_loss, validation_acc = model.evaluate(validation_images, validation_labels, verbose=0)
#validation_acc
Cats vs Dogs
A more realistic dataset: Cats vs Dogs
Colored JPEG images, different sizes
Not nicely centered, translation invariance is important
Preprocessing
Create a balanced subsample of 4000 color images
3000 for training, 1000 for validation
Decode JPEG images to floating-point tensors
Rescale pixel values to [0,1]
Resize images to 150x150 pixels
Uncomment to run from scratch
import os, shutil
# Download data from https://www.kaggle.com/c/dogs-vs-cats/data
# Uncompress `train.zip` into the `original_dataset_dir`
original_dataset_dir = '../data/cats-vs-dogs_original'
# The directory where we will
# store our smaller dataset
train_dir = os.path.join(data_dir, 'train')
validation_dir = os.path.join(data_dir, 'validation')
if not os.path.exists(data_dir):
os.mkdir(data_dir)
os.mkdir(train_dir)
os.mkdir(validation_dir)
train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')
validation_cats_dir = os.path.join(validation_dir, 'cats')
validation_dogs_dir = os.path.join(validation_dir, 'dogs')
if not os.path.exists(train_cats_dir):
os.mkdir(train_cats_dir)
os.mkdir(train_dogs_dir)
os.mkdir(validation_cats_dir)
os.mkdir(validation_dogs_dir)
# Copy first 2000 cat images to train_cats_dir
fnames = ['cat.{}.jpg'.format(i) for i in range(2000)]
for fname in fnames:
src = os.path.join(original_dataset_dir, fname)
dst = os.path.join(train_cats_dir, fname)
shutil.copyfile(src, dst)
# Copy next 1000 cat images to validation_cats_dir
fnames = ['cat.{}.jpg'.format(i) for i in range(2000, 3000)]
for fname in fnames:
src = os.path.join(original_dataset_dir, fname)
dst = os.path.join(validation_cats_dir, fname)
shutil.copyfile(src, dst)
# Copy first 2000 dog images to train_dogs_dir
fnames = ['dog.{}.jpg'.format(i) for i in range(2000)]
for fname in fnames:
src = os.path.join(original_dataset_dir, fname)
dst = os.path.join(train_dogs_dir, fname)
shutil.copyfile(src, dst)
# Copy next 1000 dog images to validation_dogs_dir
fnames = ['dog.{}.jpg'.format(i) for i in range(2000, 3000)]
for fname in fnames:
src = os.path.join(original_dataset_dir, fname)
dst = os.path.join(validation_dogs_dir, fname)
shutil.copyfile(src, dst)
Data generators#
We want to do this preprocessing on the fly: storing the processed data on disk is less efficient, and we can't keep all of it in (GPU) memory
ImageDataGenerator: encodes, resizes, and rescales the JPEG images
Returns a Python generator we can endlessly query for batches of images
Create separate generators for the training and validation sets
train_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
train_dir, # Directory with images
target_size=(150, 150), # Resize images
batch_size=20, # Return 20 images at a time
class_mode='binary') # Binary labels
Show code cell source
from tensorflow.keras.preprocessing.image import ImageDataGenerator
train_dir = os.path.join(data_dir, 'train')
validation_dir = os.path.join(data_dir, 'validation')
# All images will be rescaled by 1./255
train_datagen = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
# This is the target directory
train_dir,
# All images will be resized to 150x150
target_size=(150, 150),
batch_size=20,
# Since we use binary_crossentropy loss, we need binary labels
class_mode='binary')
validation_generator = validation_datagen.flow_from_directory(
validation_dir,
target_size=(150, 150),
batch_size=20,
class_mode='binary')
Found 20 images belonging to 2 classes.
Found 20 images belonging to 2 classes.
Show code cell source
for data_batch, labels_batch in train_generator:
plt.figure(figsize=(10,5))
for i in range(7):
plt.subplot(171+i)
plt.xticks([])
plt.yticks([])
imgplot = plt.imshow(data_batch[i])
plt.title('cat' if labels_batch[i] == 0 else 'dog', fontdict={'fontsize':24*fig_scale})
plt.tight_layout()
break
Show code cell source
from tensorflow.keras import layers
from tensorflow.keras import models
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
Since the images are larger and more complex, we add another convolutional layer and increase the number of filters to 128.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
Show code cell source
model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_6 (Conv2D) (None, 148, 148, 32) 896
max_pooling2d_4 (MaxPoolin (None, 74, 74, 32) 0
g2D)
conv2d_7 (Conv2D) (None, 72, 72, 64) 18496
max_pooling2d_5 (MaxPoolin (None, 36, 36, 64) 0
g2D)
conv2d_8 (Conv2D) (None, 34, 34, 128) 73856
max_pooling2d_6 (MaxPoolin (None, 17, 17, 128) 0
g2D)
conv2d_9 (Conv2D) (None, 15, 15, 128) 147584
max_pooling2d_7 (MaxPoolin (None, 7, 7, 128) 0
g2D)
flatten_1 (Flatten) (None, 6272) 0
dense_3 (Dense) (None, 512) 3211776
dense_4 (Dense) (None, 1) 513
=================================================================
Total params: 3453121 (13.17 MB)
Trainable params: 3453121 (13.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Training#
The fit function also supports generators
100 steps per epoch (batch size: 20 images per step), for 30 epochs
Provide a separate generator for the validation data
model.compile(loss='binary_crossentropy',
optimizer=optimizers.RMSprop(lr=1e-4),
metrics=['acc'])
history = model.fit(
train_generator, steps_per_epoch=100,
epochs=30, verbose=0,
validation_data=validation_generator,
validation_steps=50)
Show code cell source
from tensorflow.keras import optimizers
model.compile(loss='binary_crossentropy',
optimizer=optimizers.RMSprop(lr=1e-4),
metrics=['acc'])
WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.RMSprop` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.RMSprop`.
WARNING:absl:`lr` is deprecated in Keras optimizer, please use `learning_rate` or use the legacy optimizer, e.g.,tf.keras.optimizers.legacy.RMSprop.
Uncomment to run. Otherwise, we will load the pretrained model from disk.
import pickle
history = model.fit(
train_generator,
steps_per_epoch=100,
epochs=30, verbose=1,
validation_data=validation_generator,
validation_steps=50)
model.save(os.path.join(model_dir, 'cats_and_dogs_small_1.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_1_history.p'), 'wb') as file_pi:
pickle.dump(history.history, file_pi)
The network is overfitting. Validation accuracy at 75% while training accuracy reaches 100%
There are several things to explore:
Generating more training data (data augmentation)
Regularization (e.g. Dropout, L1/L2, Batch Normalization,…)
Use pretrained rather than randomly initialized filters
Show code cell source
import pickle
history = pickle.load(open("../data/models/cats_and_dogs_small_1_history.p", "rb"))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Data augmentation#
Generate new images via image transformations
Images will be randomly transformed every epoch
We can again use a data generator to do this
datagen = ImageDataGenerator(
rotation_range=40, # Rotate image up to 40 degrees
width_shift_range=0.2, # Shift image left-right up to 20% of image width
height_shift_range=0.2,# Shift image up-down up to 20% of image height
shear_range=0.2, # Shear (slant) the image up to 0.2 degrees
zoom_range=0.2, # Zoom in up to 20%
horizontal_flip=True, # Horizontally flip the image
fill_mode='nearest') # How to fill in pixels that didn't exist before
Show code cell source
from tensorflow.keras.preprocessing.image import ImageDataGenerator
datagen = ImageDataGenerator(
rotation_range=40,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True,
fill_mode='nearest')
Example
Show code cell source
# This is module with image preprocessing utilities
from tensorflow.keras.preprocessing import image
train_cats_dir = os.path.join(data_dir, 'train', 'cats')
fnames = [os.path.join(train_cats_dir, fname) for fname in os.listdir(train_cats_dir)]
# We pick one image to "augment"
img_path = fnames[8]
# Read the image and resize it
img = image.load_img(img_path, target_size=(150, 150))
# Convert it to a Numpy array with shape (150, 150, 3)
x = image.img_to_array(img)
# Reshape it to (1, 150, 150, 3)
x = x.reshape((1,) + x.shape)
# The .flow() command below generates batches of randomly transformed images.
# It will loop indefinitely, so we need to `break` the loop at some point!
for a in range(2):
i = 0
for batch in datagen.flow(x, batch_size=1):
plt.subplot(141+i)
plt.xticks([])
plt.yticks([])
imgplot = plt.imshow(image.array_to_img(batch[0]))
i += 1
if i % 4 == 0:
break
plt.tight_layout()
plt.show()
We also add Dropout before the Dense layer
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
Show code cell source
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',
input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer=optimizers.RMSprop(lr=1e-4),
metrics=['acc'])
WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.RMSprop` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.RMSprop`.
WARNING:absl:`lr` is deprecated in Keras optimizer, please use `learning_rate` or use the legacy optimizer, e.g.,tf.keras.optimizers.legacy.RMSprop.
Show code cell source
train_datagen = ImageDataGenerator(
rescale=1./255,
rotation_range=40,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True,)
# Note that the validation data should not be augmented!
validation_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
# This is the target directory
train_dir,
# All images will be resized to 150x150
target_size=(150, 150),
batch_size=32,
# Since we use binary_crossentropy loss, we need binary labels
class_mode='binary')
validation_generator = validation_datagen.flow_from_directory(
validation_dir,
target_size=(150, 150),
batch_size=32,
class_mode='binary')
Found 20 images belonging to 2 classes.
Found 20 images belonging to 2 classes.
Uncomment to run from scratch
history = model.fit_generator(
train_generator,
steps_per_epoch=60, # 2000 images / batch size 32
epochs=100, verbose=0,
validation_data=validation_generator,
validation_steps=30) # About 1000/32
model.save(os.path.join(model_dir, 'cats_and_dogs_small_2.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_2_history.p'), 'wb') as file_pi:
pickle.dump(history.history, file_pi)
(Almost) no more overfitting!
Show code cell source
history = pickle.load(open("../data/models/cats_and_dogs_small_2_history.p", "rb"))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Real-world CNNs#
VGG16#
Deeper architecture (16 layers): allows it to learn more complex high-level features
Textures, patterns, shapes,…
Small filters (3x3) work better: capture spatial information while reducing number of parameters
Max-pooling (2x2): reduces spatial dimension, improves translation invariance
Lower resolution forces model to learn robust features (less sensitive to small input changes)
Only after every 2 layers, otherwise dimensions reduce too fast
Downside: too many parameters, expensive to train
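A sketch of the repeating VGG pattern in Keras (not the full VGG16): stacks of small 3x3 convolutions, each followed by 2x2 max-pooling, doubling the number of filters as the resolution drops.
from tensorflow.keras import layers, models

vgg_style = models.Sequential()
# Block 1: two 3x3 convolutions, then halve the resolution
vgg_style.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(224, 224, 3)))
vgg_style.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
vgg_style.add(layers.MaxPooling2D((2, 2)))
# Block 2: double the number of filters as the resolution drops
vgg_style.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
vgg_style.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
vgg_style.add(layers.MaxPooling2D((2, 2)))
# The real VGG16 repeats this pattern with 256 and 512 filters, followed by dense layers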
Inceptionv3#
Inception modules: parallel branches learn features of different sizes and scales (3x3, 5x5, 7x7,…)
Add reduction blocks that reduce dimensionality via convolutions with stride 2
Factorized convolutions: a 3x3 conv. can be replaced by combining 1x3 and 3x1, and is 33% cheaper
A 5x5 can be replaced by combining 3x3 and 3x3, which can in turn be factorized as above
1x1 convolutions, or Network-In-Network (NIN) layers help reduce the number of channels: cheaper
An auxiliary classifier adds an additional gradient signal deeper in the network
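A simplified sketch of a single Inception-style module (Keras functional API; the filter counts are illustrative): parallel branches with different filter sizes, each preceded by a 1x1 convolution to reduce the number of channels.
from tensorflow.keras import layers, Input, Model

inp = Input(shape=(32, 32, 192))
b1 = layers.Conv2D(64, (1, 1), activation='relu', padding='same')(inp)   # 1x1 branch
b2 = layers.Conv2D(96, (1, 1), activation='relu', padding='same')(inp)   # reduce channels first
b2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(b2)   # then 3x3
b3 = layers.Conv2D(16, (1, 1), activation='relu', padding='same')(inp)
b3 = layers.Conv2D(32, (5, 5), activation='relu', padding='same')(b3)    # 5x5 branch
b4 = layers.MaxPooling2D((3, 3), strides=1, padding='same')(inp)
b4 = layers.Conv2D(32, (1, 1), activation='relu', padding='same')(b4)    # pooling branch
out = layers.concatenate([b1, b2, b3, b4])                               # stack the feature maps channel-wise
module = Model(inp, out)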
ResNet50#
Residual (skip) connections: add earlier feature map to a later one (dimensions must match)
Information can bypass layers, reduces vanishing gradients, allows much deeper nets
Residual blocks: skip a small number of layers and repeat many times
Match dimensions through padding and 1x1 convolutions
When resolution drops, add 1x1 convolutions with stride 2
Can be combined with Inception blocks
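A sketch of one residual block (functional API; BatchNorm omitted for brevity), using a 1x1 convolution with stride 2 on the shortcut so that its dimensions match the main path when the resolution drops.
from tensorflow.keras import layers, Input, Model

inp = Input(shape=(56, 56, 64))
# Main path: two 3x3 convolutions, the first with stride 2 to halve the resolution
x = layers.Conv2D(128, (3, 3), strides=2, padding='same', activation='relu')(inp)
x = layers.Conv2D(128, (3, 3), padding='same')(x)
# Shortcut: 1x1 convolution with stride 2 so the dimensions match the main path
shortcut = layers.Conv2D(128, (1, 1), strides=2, padding='same')(inp)
out = layers.Activation('relu')(layers.add([x, shortcut]))  # add, then apply the nonlinearity
residual_block = Model(inp, out)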
Interpreting the model#
Let’s see what the convnet is learning exactly by observing the intermediate feature maps
A layer’s output is also called its activation
We can choose a specific test image, and then retrieve and visualize the activation for every filter for every layer
Layer 0: has activations of resolution 148x148 for each of its 32 filters
Layer 2: has activations of resolution 72x72 for each of its 64 filters
Layer 4: has activations of resolution 34x34 for each of its 128 filters
Layer 6: has activations of resolution 15x15 for each of its 128 filters
Show code cell source
from tensorflow.keras.models import load_model
model = load_model(os.path.join(model_dir, 'cats_and_dogs_small_2.h5'))
Show code cell source
model.summary() # As a reminder
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_10 (Conv2D) (None, 148, 148, 32) 896
max_pooling2d_8 (MaxPoolin (None, 74, 74, 32) 0
g2D)
conv2d_11 (Conv2D) (None, 72, 72, 64) 18496
max_pooling2d_9 (MaxPoolin (None, 36, 36, 64) 0
g2D)
conv2d_12 (Conv2D) (None, 34, 34, 128) 73856
max_pooling2d_10 (MaxPooli (None, 17, 17, 128) 0
ng2D)
conv2d_13 (Conv2D) (None, 15, 15, 128) 147584
max_pooling2d_11 (MaxPooli (None, 7, 7, 128) 0
ng2D)
flatten_2 (Flatten) (None, 6272) 0
dropout (Dropout) (None, 6272) 0
dense_4 (Dense) (None, 512) 3211776
dense_5 (Dense) (None, 1) 513
=================================================================
Total params: 3453121 (13.17 MB)
Trainable params: 3453121 (13.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Show code cell source
img_path = os.path.join(data_dir, 'train/cats/cat.1700.jpg')
# We preprocess the image into a 4D tensor
from tensorflow.keras.preprocessing import image
import numpy as np
img = image.load_img(img_path, target_size=(150, 150))
img_tensor = image.img_to_array(img)
img_tensor = np.expand_dims(img_tensor, axis=0)
# Remember that the model was trained on inputs
# that were preprocessed in the following way:
img_tensor /= 255.
Show code cell source
from tensorflow.keras import models
# Extracts the outputs of the top 8 layers:
layer_outputs = [layer.output for layer in model.layers[:8]]
# Creates a model that will return these outputs, given the model input:
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)
# This will return a list of 5 Numpy arrays:
# one array per layer activation
activations = activation_model.predict(img_tensor)
1/1 [==============================] - 1s 529ms/step
To extract the activations, we create a new model that outputs the trained layers
8 output layers in total (only the convolutional part)
We input a test image for prediction and then read the relevant outputs
layer_outputs = [layer.output for layer in model.layers[:8]]
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(img_tensor)
Output of the first Conv2D layer, 6th channel (filter 6):
Similar to a diagonal edge detector
Your own channels may look different
Show code cell source
first_layer_activation = activations[0]
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.imshow(img_tensor[0])
ax2.matshow(first_layer_activation[0, :, :,6], cmap='viridis')
ax1.set_xticks([])
ax1.set_yticks([])
ax2.set_xticks([])
ax2.set_yticks([])
ax1.set_xlabel('Input image')
ax2.set_xlabel('Activation of filter 6');
Output of filter 25:
Cat eye detector?
Show code cell source
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.imshow(img_tensor[0])
ax2.matshow(first_layer_activation[0, :, :,25], cmap='viridis')
ax1.set_xticks([])
ax1.set_yticks([])
ax2.set_xticks([])
ax2.set_yticks([])
ax1.set_xlabel('Input image')
ax2.set_xlabel('Activation of filter 25');
The same filter responds quite differently for other inputs (green detector?).
Show code cell source
img_path = os.path.join(data_dir, 'train/dogs/dog.1528.jpg')
# We preprocess the image into a 4D tensor
img = image.load_img(img_path, target_size=(150, 150))
img_tensor2 = image.img_to_array(img)
img_tensor2 = np.expand_dims(img_tensor2, axis=0)
# Remember that the model was trained on inputs
# that were preprocessed in the following way:
img_tensor2 /= 255.
activations2 = activation_model.predict(img_tensor2)
first_layer_activation2 = activations2[0]
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.imshow(img_tensor2[0])
ax2.matshow(first_layer_activation2[0, :, :, 25], cmap='viridis')
ax1.set_xticks([])
ax1.set_yticks([])
ax2.set_xticks([])
ax2.set_yticks([])
ax1.set_xlabel('Input image')
ax2.set_xlabel('Activation of filter 25');
1/1 [==============================] - 0s 10ms/step
Show code cell source
images_per_row = 16
layer_names = []
for layer in model.layers[:8]:
layer_names.append(layer.name)
def plot_activations(layer_index, activations):
start = layer_index
end = layer_index+1
# Now let's display our feature maps
for layer_name, layer_activation in zip(layer_names[start:end], activations[start:end]):
# This is the number of features in the feature map
n_features = layer_activation.shape[-1]
# The feature map has shape (1, size, size, n_features)
size = layer_activation.shape[1]
# We will tile the activation channels in this matrix
n_cols = n_features // images_per_row
display_grid = np.zeros((size * n_cols, images_per_row * size))
# We'll tile each filter into this big horizontal grid
for col in range(n_cols):
for row in range(images_per_row):
channel_image = layer_activation[0,
:, :,
col * images_per_row + row]
# Post-process the feature to make it visually palatable
channel_image -= channel_image.mean()
channel_image /= channel_image.std()
channel_image *= 64
channel_image += 128
channel_image = np.clip(channel_image, 0, 255).astype('uint8')
display_grid[col * size : (col + 1) * size,
row * size : (row + 1) * size] = channel_image
# Display the grid
scale = 1. / size
plt.figure(figsize=(scale * display_grid.shape[1],
scale * display_grid.shape[0]))
plt.title("Activation of layer {} ({})".format(layer_index+1,layer_name), fontdict={'fontsize':30*fig_scale})
plt.grid(False)
plt.axis("off")
plt.imshow(display_grid, aspect='auto', cmap='viridis')
plt.show()
First 2 convolutional layers: various edge detectors
Show code cell source
plot_activations(0, activations)
plot_activations(2, activations)
3rd convolutional layer: increasingly abstract: ears, eyes
Show code cell source
plot_activations(4, activations)
Last convolutional layer: more abstract patterns
Empty filter activations: input image does not have the information that the filter was interested in (maybe it was dog-specific)
Show code cell source
plot_activations(6, activations)
Same layer, with dog image input
Very different activations
Show code cell source
plot_activations(6, activations2)
Spatial hierarchies#
Deep convnets can learn spatial hierarchies of patterns
First layer can learn very local patterns (e.g. edges)
Second layer can learn specific combinations of patterns
Every layer can learn increasingly complex abstractions
Visualizing the learned filters#
Visualize filters by finding the input image that they are maximally responsive to
Gradient ascent in input space: start from a random image \(X_{(0)}\), and use the loss (the filter's mean activation) to update the input pixel values towards values that the filter responds to more strongly (keeping the weights fixed)
\(X_{(i+1)} = X_{(i)} + \eta \cdot \frac{\partial L(X_{(i)})}{\partial X_{(i)}}\)
from tensorflow.keras import backend as K
input_img_data = np.random.random((1, size, size, 3)) * 20 + 128.  # Start from a noisy gray image
loss = K.mean(layer_output[:, :, :, filter_index])                 # Mean activation of the filter
grads = K.gradients(loss, model.input)[0]                          # Gradient of the loss w.r.t. the input
iterate = K.function([model.input], [loss, grads])                 # Returns loss and gradients for an input
for i in range(40):                                                # Run gradient ascent for 40 steps
    loss_v, grads_v = iterate([input_img_data])
    input_img_data += grads_v * step
Show code cell source
#tf.compat.v1.disable_eager_execution()
from tensorflow.keras import backend as K
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
# Convert tensor to image
def deprocess_image(x):
# normalize tensor: center on 0., ensure std is 0.1
x -= x.mean()
x /= (x.std() + 1e-5)
x *= 0.1
# clip to [0, 1]
x += 0.5
x = np.clip(x, 0, 1)
# convert to RGB array
x *= 255
x = np.clip(x, 0, 255).astype('uint8')
return x
def generate_pattern(layer_name, filter_index, size=150):
# Build a loss function that maximizes the activation
# of the nth filter of the layer considered.
layer_output = model.get_layer(layer_name).output
loss = K.mean(layer_output[:, :, :, filter_index])
# Compute the gradient of the input picture wrt this loss
grads = K.gradients(loss, model.input)[0]
# Normalization trick: we normalize the gradient
grads /= (K.sqrt(K.mean(K.square(grads))) + 1e-5)
# This function returns the loss and grads given the input picture
iterate = K.function([model.input], [loss, grads])
# We start from a gray image with some noise
input_img_data = np.random.random((1, size, size, 3)) * 20 + 128.
# Run gradient ascent for 40 steps
step = 1.
for i in range(40):
loss_value, grads_value = iterate([input_img_data])
input_img_data += grads_value * step
img = input_img_data[0]
return deprocess_image(img)
def visualize_filter(layer_name):
size = 64
margin = 5
# This a empty (black) image where we will store our results.
results = np.zeros((8 * size + 7 * margin, 8 * size + 7 * margin, 3))
for i in range(8): # iterate over the rows of our results grid
for j in range(8): # iterate over the columns of our results grid
# Generate the pattern for filter `i + (j * 8)` in `layer_name`
filter_img = generate_pattern(layer_name, i + (j * 8), size=size)
# Put the result in the square `(i, j)` of the results grid
horizontal_start = i * size + i * margin
horizontal_end = horizontal_start + size
vertical_start = j * size + j * margin
vertical_end = vertical_start + size
results[horizontal_start: horizontal_end, vertical_start: vertical_end, :] = filter_img
return results
Learned filters of second convolutional layer
Mostly general, some respond to specific shapes/colors
Show code cell source
# Throws an error. Skipping until fixed
# filters = visualize_filter('conv2d_10')
Show code cell source
# Display the results grid
#plt.figure(figsize=(10, 10))
#plt.imshow((filters * 255).astype(np.uint8));
Learned filters of last convolutional layer
More focused on center, some vague cat/dog head shapes
Let’s do this again for the VGG16 network pretrained on ImageNet (a much larger network)
model = VGG16(weights='imagenet', include_top=False)
Show code cell source
from tensorflow.keras.applications import VGG16
model = VGG16(weights='imagenet',
include_top=False)
layer_name = 'block3_conv1'
filter_index = 0
layer_output = model.get_layer(layer_name).output
loss = K.mean(layer_output[:, :, :, filter_index])
Show code cell source
# VGG16 model
model.summary()
Model: "vgg16"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, None, None, 3)] 0
block1_conv1 (Conv2D) (None, None, None, 64) 1792
block1_conv2 (Conv2D) (None, None, None, 64) 36928
block1_pool (MaxPooling2D) (None, None, None, 64) 0
block2_conv1 (Conv2D) (None, None, None, 128) 73856
block2_conv2 (Conv2D) (None, None, None, 128) 147584
block2_pool (MaxPooling2D) (None, None, None, 128) 0
block3_conv1 (Conv2D) (None, None, None, 256) 295168
block3_conv2 (Conv2D) (None, None, None, 256) 590080
block3_conv3 (Conv2D) (None, None, None, 256) 590080
block3_pool (MaxPooling2D) (None, None, None, 256) 0
block4_conv1 (Conv2D) (None, None, None, 512) 1180160
block4_conv2 (Conv2D) (None, None, None, 512) 2359808
block4_conv3 (Conv2D) (None, None, None, 512) 2359808
block4_pool (MaxPooling2D) (None, None, None, 512) 0
block5_conv1 (Conv2D) (None, None, None, 512) 2359808
block5_conv2 (Conv2D) (None, None, None, 512) 2359808
block5_conv3 (Conv2D) (None, None, None, 512) 2359808
block5_pool (MaxPooling2D) (None, None, None, 512) 0
=================================================================
Total params: 14714688 (56.13 MB)
Trainable params: 14714688 (56.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Visualize convolution filters 0-2 from layer block3_conv1 (the 5th convolutional layer) of the VGG network trained on ImageNet
Some respond to dots or waves in the image
Show code cell source
patterns = []
for i in range(3):
patterns.append(generate_pattern('block3_conv1', i))
Show code cell source
for i in range(3):
plt.subplot(131+i)
plt.xticks([])
plt.yticks([])
plt.imshow(patterns[i])
plt.tight_layout()
plt.show();
First 64 filters for 1st convolutional layer in block 1: simple edges and colors
Show code cell source
patterns1 = visualize_filter('block1_conv1')
patterns2 = visualize_filter('block2_conv1')
patterns3 = visualize_filter('block3_conv1')
patterns4 = visualize_filter('block4_conv1')
Show code cell source
# Display the results grid
plt.figure(figsize=(6*fig_scale, 6*fig_scale))
plt.axis("off")
plt.imshow((patterns1 * 255).astype(np.uint8));
Filters in 2nd block of convolution layers: simple textures (combined edges and colors)
Show code cell source
# Display the results grid
plt.figure(figsize=(6*fig_scale, 6*fig_scale))
plt.axis("off")
plt.imshow((patterns2 * 255).astype(np.uint8));
Filters in 3rd block of convolution layers: more natural textures
Show code cell source
# Display the results grid
plt.figure(figsize=(6*fig_scale, 6*fig_scale))
plt.axis("off")
plt.imshow((patterns3 * 255).astype(np.uint8));
Filters in 4th block of convolution layers: feathers, eyes, leaves,…
Show code cell source
# Display the results grid
plt.figure(figsize=(6*fig_scale, 6*fig_scale))
plt.axis("off")
plt.imshow((patterns4 * 255).astype(np.uint8));
Visualizing class activation#
We can also visualize which part of the input image had the greatest influence on the final classification. Helps to interpret what the model is paying attention to.
Class activation maps: produce a heatmap over the input image
Choose a convolution layer, do Global Average Pooling (GAP) to get one output per filter
Get the weights between those outputs and the class of interest
Compute the weighted sum of all filter activations: combines what each filter is responding to and how much this affects the class prediction (see the sketch below)
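A minimal numpy sketch of that weighted sum on hypothetical arrays: acts and class_weights stand in for the conv-layer activations and the GAP-to-class weights that a real CAM would read from the trained model.
import numpy as np

acts = np.random.rand(14, 14, 512)           # activations of the chosen conv layer (one image)
class_weights = np.random.rand(512)          # weight of each GAP'ed filter for the target class
cam = np.einsum('hwc,c->hw', acts, class_weights)  # weighted sum over channels -> 14x14 heatmap
cam = np.maximum(cam, 0)                     # keep only positive evidence for the class
cam = cam / cam.max()                        # normalize to [0, 1] before upsampling over the image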
Show code cell source
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras import backend as K
K.clear_session()
# Note that we are including the densely-connected classifier on top;
# all previous times, we were discarding it.
model = VGG16(weights='imagenet')
Show code cell source
## From Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input, decode_predictions
import numpy as np
# The local path to our target image
img_path = '../notebooks/images/10_elephants.jpg'
# `img` is a PIL image of size 224x224
img = image.load_img(img_path, target_size=(224, 224))
# `x` is a float32 Numpy array of shape (224, 224, 3)
x = image.img_to_array(img)
# We add a dimension to transform our array into a "batch"
# of size (1, 224, 224, 3)
x = np.expand_dims(x, axis=0)
# Finally we preprocess the batch
# (this does channel-wise color normalization)
x = preprocess_input(x)
# This is the "african elephant" entry in the prediction vector
african_elephant_output = model.output[:, 386]
# This is the output feature map of the `block5_conv3` layer,
# the last convolutional layer in VGG16
last_conv_layer = model.get_layer('block5_conv3')
# This is the gradient of the "african elephant" class with regard to
# the output feature map of `block5_conv3`
grads = K.gradients(african_elephant_output, last_conv_layer.output)[0]
# This is a vector of shape (512,), where each entry
# is the mean intensity of the gradient over a specific feature map channel
pooled_grads = K.mean(grads, axis=(0, 1, 2))
# This function allows us to access the values of the quantities we just defined:
# `pooled_grads` and the output feature map of `block5_conv3`,
# given a sample image
iterate = K.function([model.input], [pooled_grads, last_conv_layer.output[0]])
# These are the values of these two quantities, as Numpy arrays,
# given our sample image of two elephants
pooled_grads_value, conv_layer_output_value = iterate([x])
# We multiply each channel in the feature map array
# by "how important this channel is" with regard to the elephant class
for i in range(512):
conv_layer_output_value[:, :, i] *= pooled_grads_value[i]
# The channel-wise mean of the resulting feature map
# is our heatmap of class activation
heatmap = np.mean(conv_layer_output_value, axis=-1)
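The cell above uses the TF1-style symbolic API (K.gradients, K.function), which does not run under eager execution (the TF2 default). A minimal sketch of the same Grad-CAM computation with tf.GradientTape, assuming the model, x, and class index 386 defined above:
import tensorflow as tf

# Model that returns both the last conv feature map and the class predictions
grad_model = tf.keras.Model(model.inputs,
                            [model.get_layer('block5_conv3').output, model.output])
with tf.GradientTape() as tape:
    conv_output, preds = grad_model(x)
    class_score = preds[:, 386]                        # score of the 'African elephant' class
grads = tape.gradient(class_score, conv_output)        # gradient of the score w.r.t. the feature map
pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))   # one importance weight per channel, shape (512,)
heatmap = tf.reduce_mean(conv_output[0] * pooled_grads, axis=-1).numpy()  # weighted channel mean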
Example on VGG with a specific input image
Take the last convolutional layer of VGG pretrained on ImageNet
It has 512 filters, each producing a 14x14 feature map
model = VGG16(weights='imagenet')
last_conv_layer = model.get_layer('block5_conv3')
Show code cell source
print("Last conv layer shape:", last_conv_layer.output_shape)
Last conv layer shape: (None, 14, 14, 512)
Choose an input image and preprocess it so we can feed it to the model
img = image.load_img(img_path, target_size=(224, 224))
Find the output node for its class (‘african elephant’, class 386)
african_elephant_output = model.output[:, 386]
Preprocessing
Load image
Resize to 224 x 224 (what VGG was trained on)
Do the same preprocessing (Keras VGG utility)
from tensorflow.keras.applications.vgg16 import preprocess_input
img_path = '../images/10_elephants.jpg'
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0) # Transform to batch of size (1, 224, 224, 3)
x = preprocess_input(x)
Sanity test: do we get the right prediction?
preds = model.predict(x)
Show code cell source
preds = model.predict(x)
Show code cell source
print('Predicted:', decode_predictions(preds, top=3)[0])
Predicted: [('n02504458', 'African_elephant', 0.99998724), ('n01871265', 'tusker', 1.2755356e-05), ('n02403003', 'ox', 2.5128067e-08)]
VGG doesn’t use GAP, so (following Grad-CAM) we compute the average gradient from the output node to the conv layer
Multiply (channel-wise) with the activations of the conv layer
grads = K.gradients(african_elephant_output, last_conv_layer.output)[0]
pooled_grads = K.mean(grads, axis=(0, 1, 2))
for i in range(512): # 512 filters
conv_layer_output_value[:, :, i] *= pooled_grads_value[i]
heatmap = np.mean(conv_layer_output_value, axis=-1)
Visualize heatmap. It’s 14x14 since that’s the output dimension of the conv layer
Show code cell source
heatmap = np.maximum(heatmap, 0)
heatmap /= np.max(heatmap)
plt.figure(figsize=(4*fig_scale, 4*fig_scale))
plt.matshow(heatmap, fignum=1)
plt.axis("off");
Show code cell source
# pip install opencv-python
import cv2
# We use cv2 to load the original image
img = cv2.imread(img_path)
# We resize the heatmap to have the same size as the original image
heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
# We convert the heatmap to an 8-bit image (values 0-255)
heatmap = np.uint8(255 * heatmap)
# We apply the JET colormap to the heatmap
heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
# 0.4 here is a heatmap intensity factor
superimposed_img = heatmap * 0.4 + img
# Save the image to disk
cv2.imwrite('../notebooks/images/elephant_cam.jpg', superimposed_img)
True
Upscaled and superimposed on the original image
The model looked at the face of the baby elephant and the trunk of the large elephant
Show code cell source
img = cv2.imread('../notebooks/images/elephant_cam.jpg')
RGB_im = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(RGB_im)
plt.title('Class activation map')
plt.xticks([])
plt.yticks([])
plt.show()
Transfer learning#
We can re-use pretrained networks instead of training from scratch
Learned features can be a generic model of the visual world
Use the convolutional base to construct features, then train any classifier on the new data
Also called transfer learning, which is a kind of meta-learning
Let’s instantiate the VGG16 model (without the dense layers)
With 150x150 inputs, the final feature map has shape (4, 4, 512)
conv_base = VGG16(weights='imagenet', include_top=False, input_shape=(150, 150, 3))
Show code cell source
conv_base = VGG16(weights='imagenet',
include_top=False,
input_shape=(150, 150, 3))
Show code cell source
conv_base.summary()
Model: "vgg16"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) [(None, 150, 150, 3)] 0
block1_conv1 (Conv2D) (None, 150, 150, 64) 1792
block1_conv2 (Conv2D) (None, 150, 150, 64) 36928
block1_pool (MaxPooling2D) (None, 75, 75, 64) 0
block2_conv1 (Conv2D) (None, 75, 75, 128) 73856
block2_conv2 (Conv2D) (None, 75, 75, 128) 147584
block2_pool (MaxPooling2D) (None, 37, 37, 128) 0
block3_conv1 (Conv2D) (None, 37, 37, 256) 295168
block3_conv2 (Conv2D) (None, 37, 37, 256) 590080
block3_conv3 (Conv2D) (None, 37, 37, 256) 590080
block3_pool (MaxPooling2D) (None, 18, 18, 256) 0
block4_conv1 (Conv2D) (None, 18, 18, 512) 1180160
block4_conv2 (Conv2D) (None, 18, 18, 512) 2359808
block4_conv3 (Conv2D) (None, 18, 18, 512) 2359808
block4_pool (MaxPooling2D) (None, 9, 9, 512) 0
block5_conv1 (Conv2D) (None, 9, 9, 512) 2359808
block5_conv2 (Conv2D) (None, 9, 9, 512) 2359808
block5_conv3 (Conv2D) (None, 9, 9, 512) 2359808
block5_pool (MaxPooling2D) (None, 4, 4, 512) 0
=================================================================
Total params: 14714688 (56.13 MB)
Trainable params: 14714688 (56.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Using pre-trained networks: 3 ways#
Fast feature extraction (for similar task, little data)
Call predict from the convolutional base to build new features
Use the outputs as input to a new neural net (or other algorithm)
End-to-end tuning (for similar task, lots of data + data augmentation)
Extend the convolutional base model with a new dense layer
Train it end to end on the new data (expensive!)
Fine-tuning (for somewhat different task)
Unfreeze a few of the top convolutional layers, and retrain
Update only the more abstract representations
Fast feature extraction (without data augmentation)#
Run every batch through the pre-trained convolutional base
features = conv_base.predict(inputs)
Build Dense neural net (with Dropout)
Train and evaluate with the transformed examples
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim=4 * 4 * 512))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
model.fit(features, labels, epochs=30)
Show code cell source
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
train_dir = os.path.join(data_dir, 'train')
validation_dir = os.path.join(data_dir, 'validation')
datagen = ImageDataGenerator(rescale=1./255)
batch_size = 20
def extract_features(directory, sample_count):
features = np.zeros(shape=(sample_count, 4, 4, 512))
labels = np.zeros(shape=(sample_count))
generator = datagen.flow_from_directory(
directory,
target_size=(150, 150),
batch_size=batch_size,
class_mode='binary')
i = 0
for inputs_batch, labels_batch in generator:
features_batch = conv_base.predict(inputs_batch)
features[i * batch_size : (i + 1) * batch_size] = features_batch
labels[i * batch_size : (i + 1) * batch_size] = labels_batch
i += 1
if i * batch_size >= sample_count:
# Note that since generators yield data indefinitely in a loop,
# we must `break` after every image has been seen once.
break
return features, labels
Uncomment to recompute from scratch.
train_features, train_labels = extract_features(train_dir, 2000)
validation_features, validation_labels = extract_features(validation_dir, 1000)
train_features = np.reshape(train_features, (2000, 4 * 4 * 512))
validation_features = np.reshape(validation_features, (1000, 4 * 4 * 512))
Uncomment to recompute from scratch.
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim=4 * 4 * 512))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer=optimizers.RMSprop(lr=2e-5),
loss='binary_crossentropy',
metrics=['acc'])
history = model.fit(train_features, train_labels,
epochs=30, verbose=0,
batch_size=20,
validation_data=(validation_features, validation_labels))
model.save(os.path.join(model_dir, 'cats_and_dogs_small_3a.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_3a_history.p'), 'wb') as file_pi:
pickle.dump(history.history, file_pi)
Validation accuracy around 90%, much better!
Still overfitting, despite the Dropout: not enough training data
Show code cell source
history = pickle.load(open("../data/models/cats_and_dogs_small_3a_history.p", "rb"))
print("Max val_acc",np.max(history['val_acc']))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Max val_acc 0.90500003
Fast feature extraction (with data augmentation)#
Simply add the Dense layers to the convolutional base
Freeze the convolutional base before you compile (setting trainable later only takes effect after recompiling)
Without freezing, you train it end-to-end (expensive)
model = models.Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
conv_base.trainable = False # Freeze the pretrained weights
Show code cell source
from tensorflow.keras import models
from tensorflow.keras import layers
model = models.Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
conv_base.trainable = False
Show code cell source
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
vgg16 (Functional) (None, 4, 4, 512) 14714688
flatten (Flatten) (None, 8192) 0
dense (Dense) (None, 256) 2097408
dense_1 (Dense) (None, 1) 257
=================================================================
Total params: 16812353 (64.13 MB)
Trainable params: 2097665 (8.00 MB)
Non-trainable params: 14714688 (56.13 MB)
_________________________________________________________________
Show code cell source
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers
train_datagen = ImageDataGenerator(
rescale=1./255,
rotation_range=40,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True,
fill_mode='nearest')
# Note that the validation data should not be augmented!
validation_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
# This is the target directory
train_dir,
# All images will be resized to 150x150
target_size=(150, 150),
batch_size=20,
# Since we use binary_crossentropy loss, we need binary labels
class_mode='binary')
validation_generator = validation_datagen.flow_from_directory(
validation_dir,
target_size=(150, 150),
batch_size=20,
class_mode='binary')
model.compile(loss='binary_crossentropy',
optimizer=optimizers.RMSprop(lr=2e-5),
metrics=['acc'])
Found 20 images belonging to 2 classes.
Found 20 images belonging to 2 classes.
WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.RMSprop` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.RMSprop`.
WARNING:absl:`lr` is deprecated in Keras optimizer, please use `learning_rate` or use the legacy optimizer, e.g.,tf.keras.optimizers.legacy.RMSprop.
Uncomment to train from scratch (slow!)
history = model.fit(
train_generator,
steps_per_epoch=100,
epochs=30,
validation_data=validation_generator,
validation_steps=50,
verbose=0)
model.save(os.path.join(model_dir, 'cats_and_dogs_small_3b.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_3b_history.p'), 'wb') as file_pi:
pickle.dump(history.history, file_pi)
Data augmentation and training (takes a LONG time)
train_datagen = ImageDataGenerator(
rescale=1./255, rotation_range=40, width_shift_range=0.2,
height_shift_range=0.2, shear_range=0.2, zoom_range=0.2,
horizontal_flip=True, fill_mode='nearest')
train_generator = train_datagen.flow_from_directory(train_dir,
target_size=(150, 150), batch_size=20, class_mode='binary')
history = model.fit(
train_generator, steps_per_epoch=100, epochs=30,
validation_data=validation_generator, validation_steps=50)
We now get about 90% accuracy again, and very little overfitting
Show code cell source
history = pickle.load(open("../data/models/cats_and_dogs_small_3b_history.p", "rb"))
print("Max val_acc",np.max(history['val_acc']))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Max val_acc 0.906
Fine-tuning#
Add your custom network on top of an already trained base network.
Freeze the base network, but unfreeze the last block of conv layers.
set_trainable = False
for layer in conv_base.layers:
    if layer.name == 'block5_conv1':
        set_trainable = True
    layer.trainable = set_trainable
Visualized
Show code cell source
conv_base.trainable = True
set_trainable = False
for layer in conv_base.layers:
if layer.name == 'block5_conv1':
set_trainable = True
if set_trainable:
layer.trainable = True
else:
layer.trainable = False
Show code cell source
conv_base.summary()
Model: "vgg16"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) [(None, 150, 150, 3)] 0
block1_conv1 (Conv2D) (None, 150, 150, 64) 1792
block1_conv2 (Conv2D) (None, 150, 150, 64) 36928
block1_pool (MaxPooling2D) (None, 75, 75, 64) 0
block2_conv1 (Conv2D) (None, 75, 75, 128) 73856
block2_conv2 (Conv2D) (None, 75, 75, 128) 147584
block2_pool (MaxPooling2D) (None, 37, 37, 128) 0
block3_conv1 (Conv2D) (None, 37, 37, 256) 295168
block3_conv2 (Conv2D) (None, 37, 37, 256) 590080
block3_conv3 (Conv2D) (None, 37, 37, 256) 590080
block3_pool (MaxPooling2D) (None, 18, 18, 256) 0
block4_conv1 (Conv2D) (None, 18, 18, 512) 1180160
block4_conv2 (Conv2D) (None, 18, 18, 512) 2359808
block4_conv3 (Conv2D) (None, 18, 18, 512) 2359808
block4_pool (MaxPooling2D) (None, 9, 9, 512) 0
block5_conv1 (Conv2D) (None, 9, 9, 512) 2359808
block5_conv2 (Conv2D) (None, 9, 9, 512) 2359808
block5_conv3 (Conv2D) (None, 9, 9, 512) 2359808
block5_pool (MaxPooling2D) (None, 4, 4, 512) 0
=================================================================
Total params: 14714688 (56.13 MB)
Trainable params: 7079424 (27.01 MB)
Non-trainable params: 7635264 (29.13 MB)
_________________________________________________________________
Load the trained network, fine-tune
Use a small learning rate, large number of epochs
You don’t want to unlearn too much: catastrophic forgetting
model = load_model(os.path.join(model_dir, 'cats_and_dogs_small_3b.h5'))
model.compile(loss='binary_crossentropy',
optimizer=optimizers.RMSprop(lr=1e-5),
metrics=['acc'])
history = model.fit(
train_generator, steps_per_epoch=100, epochs=100,
validation_data=validation_generator,
validation_steps=50)
Uncomment to run from scratch (slow!)
from tensorflow.keras.models import load_model
model = load_model(os.path.join(model_dir, 'cats_and_dogs_small_3b.h5'))
model.compile(loss='binary_crossentropy',
optimizer=optimizers.RMSprop(lr=1e-5),
metrics=['acc'])
history = model.fit(
train_generator,
steps_per_epoch=100,
epochs=100, # Repeat with epochs=100
validation_data=validation_generator,
validation_steps=50)
model.save(os.path.join(model_dir, 'cats_and_dogs_small_4.h5'))
with open(os.path.join(model_dir, 'cats_and_dogs_small_4_history.p'), 'wb') as file_pi:
pickle.dump(history.history, file_pi)
About 91% validation accuracy here. The curves are quite noisy, though.
Show code cell source
history = pickle.load(open("../data/models/cats_and_dogs_small_4_history.p", "rb"))
print("Max val_acc",np.max(history['val_acc']))
pd.DataFrame(history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Max val_acc 0.90800005
We can smooth the learning curves using a running average
Show code cell source
def smooth_curve(points, factor=0.8):
smoothed_points = []
for point in points:
if smoothed_points:
previous = smoothed_points[-1]
smoothed_points.append(previous * factor + point * (1 - factor))
else:
smoothed_points.append(point)
return smoothed_points
smooth_history = {}
smooth_history['loss'] = smooth_curve(history['loss'])
smooth_history['acc'] = smooth_curve(history['acc'])
smooth_history['val_loss'] = smooth_curve(history['val_loss'])
smooth_history['val_acc'] = smooth_curve(history['val_acc'])
print("Max val_acc",np.max(smooth_history['val_acc']))
pd.DataFrame(smooth_history).plot(lw=2,style=['b:','r:','b-','r-']);
plt.xlabel('epochs');
Max val_acc 0.9039536851123335
Take-aways#
Convnets are ideal for addressing image-related problems.
They learn a hierarchy of modular patterns and concepts to represent the visual world.
Representations are easy to inspect
Data augmentation helps fight overfitting
You can use a pretrained convnet to build better models via transfer learning