Lecture 6. Neural Networks¶

How to train your neurons

Joaquin Vanschoren

Overview¶

  • Neural architectures
  • Training neural nets
    • Forward pass: Tensor operations
    • Backward pass: Backpropagation
  • Neural network design:
    • Activation functions
    • Weight initialization
    • Optimizers
  • Neural networks in practice
  • Model selection
    • Early stopping
    • Memorization capacity and information bottleneck
    • L1/L2 regularization
    • Dropout
    • Batch normalization

Architecture¶

  • Logistic regression, drawn in a different, neuro-inspired, way
    • Linear model: inner product ($z$) of input vector $\mathbf{x}$ and weight vector $\mathbf{w}$, plus bias $w_0$
    • Logistic (or sigmoid) function maps the output to a probability in [0,1]
    • Uses log loss (cross-entropy) and gradient descent to learn the weights

$$\hat{y}(\mathbf{x}) = \text{sigmoid}(z) = \text{sigmoid}(w_0 + \mathbf{w}\mathbf{x}) = \text{sigmoid}(w_0 + w_1 * x_1 + w_2 * x_2 +... + w_p * x_p)$$
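A minimal NumPy sketch of this forward pass (the input and weight values below are made up for illustration):

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

x  = np.array([0.5, -1.2, 3.0])   # input vector (p = 3 features)
w  = np.array([0.8, -0.4, 0.1])   # weight vector w
w0 = 0.2                          # bias

z = w0 + np.dot(w, x)             # inner product plus bias
y_hat = sigmoid(z)                # probability in [0, 1], here ~0.80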


Basic Architecture¶

  • Add one (or more) hidden layers $h$ with $k$ nodes (or units, cells, neurons)
    • Every 'neuron' is a tiny function, the network is an arbitrarily complex function
    • Weights $w_{i,j}$ between node $i$ and node $j$ form a weight matrix $\mathbf{W}^{(l)}$ per layer $l$
  • Every neuron computes a weighted sum of the inputs $\mathbf{x}$ and passes it through a non-linear activation function
    • Activation functions ($f,g$) can be different per layer, output $\mathbf{a}$ is called activation $$\color{blue}{h(\mathbf{x})} = \color{blue}{\mathbf{a}} = f(\mathbf{z}) = f(\mathbf{W}^{(1)} \color{green}{\mathbf{x}}+\mathbf{w}^{(1)}_0) \quad \quad \color{red}{o(\mathbf{x})} = g(\mathbf{W}^{(2)} \color{blue}{\mathbf{a}}+\mathbf{w}^{(2)}_0)$$
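The same computation with one hidden layer, written as matrix operations (a sketch with made-up sizes; $f$ and $g$ are both sigmoid here for simplicity):

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

rng = np.random.default_rng(0)
x = rng.normal(size=3)         # input vector (3 features)

W1, b1 = rng.normal(size=(4, 3)), np.zeros(4)  # hidden layer: 4 nodes
W2, b2 = rng.normal(size=(2, 4)), np.zeros(2)  # output layer: 2 nodes

a = sigmoid(W1 @ x + b1)       # hidden activations h(x) = f(W1 x + b1)
o = sigmoid(W2 @ a + b2)       # network output o(x) = g(W2 a + b2)
print(a.shape, o.shape)        # (4,) (2,)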

More layers¶

  • Add more layers, and more nodes per layer, to make the model more complex
    • For simplicity, we don't draw the biases (but remember that they are there)
  • In dense (fully-connected) layers, every node in the previous layer is connected to every node in the current layer
  • The output layer can also have multiple nodes (e.g. 1 per class in multi-class classification)

Why layers?¶

  • Each layer acts as a filter and learns a new representation of the data
    • Subsequent layers can learn iterative refinements
    • Easier than learning a complex relationship in one go
  • Example: for image input, each layer yields new (filtered) images
    • Can learn multiple mappings at once: weight tensor $\mathit{W}$ yields activation tensor $\mathit{A}$
    • From low-level patterns (edges, end-points, ...) to combinations thereof
    • Each neuron 'lights up' if certain patterns occur in the input


Other architectures¶

  • There exist MANY types of networks for many different tasks
  • Convolutional nets for image data, Recurrent nets for sequential data,...
  • Also used to learn representations (embeddings), generate new images, text,...


Training Neural Nets¶

  • Design the architecture, choose activation functions (e.g. sigmoids)
  • Choose a way to initialize the weights (e.g. random initialization)
  • Choose a loss function (e.g. log loss) to measure how well the model fits training data
  • Choose an optimizer (typically an SGD variant) to update the weights


Mini-batch Stochastic Gradient Descent (recap)¶

  1. Draw a batch of batch_size training data $\mathbf{X}$ and $\mathbf{y}$
  2. Forward pass : pass $\mathbf{X}$ through the network to yield predictions $\mathbf{\hat{y}}$
  3. Compute the loss $\mathcal{L}$ (mismatch between $\mathbf{\hat{y}}$ and $\mathbf{y}$)
  4. Backward pass : Compute the gradient of the loss with regard to every weight
    • Backpropagate the gradients through all the layers
  5. Update $W$: $W_{(i+1)} = W_{(i)} - \eta \frac{\partial \mathcal{L}(\mathbf{x}, W_{(i)})}{\partial W}$

Repeat until n passes (epochs) are made through the entire training set


Forward pass¶

  • We can naturally represent the data as tensors
    • Numerical n-dimensional array (with n axes)
    • 2D tensor: matrix (samples, features)
    • 3D tensor: time series (samples, timesteps, features)
    • 4D tensor: color images (samples, height, width, channels)
    • 5D tensor: video (samples, frames, height, width, channels)

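For example, inspecting shapes with NumPy (the sizes below are made up):

import numpy as np

X_tabular = np.zeros((100, 20))           # (samples, features)
X_series  = np.zeros((100, 50, 20))       # (samples, timesteps, features)
X_images  = np.zeros((100, 28, 28, 3))    # (samples, height, width, channels)

print(X_images.ndim, X_images.shape)      # 4 axes: a 4D tensor of shape (100, 28, 28, 3)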

Tensor operations¶

  • The operations that the network performs on the data can be reduced to a series of tensor operations
    • These are also much easier to run on GPUs
  • A dense layer with sigmoid activation, input tensor $\mathbf{X}$, weight tensor $\mathbf{W}$, bias $\mathbf{b}$:
y = sigmoid(np.dot(X, W) + b)
  • Tensor dot product for 2D inputs ($a$ samples, $b$ features, $c$ hidden nodes)

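As a shape check (sizes are made up): a batch of $a$ samples with $b$ features times a weight matrix of shape $(b, c)$ yields $c$ activations per sample:

import numpy as np

a, b, c = 32, 784, 512        # samples, features, hidden nodes
X = np.random.rand(a, b)      # input batch
W = np.random.rand(b, c)      # layer weights
bias = np.random.rand(c)      # layer bias

Z = np.dot(X, W) + bias       # (a, b) . (b, c) -> (a, c); bias is broadcast
print(Z.shape)                # (32, 512)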

Element-wise operations¶

  • Activation functions and addition are element-wise operations:
def sigmoid(x):
  return 1/(1 + np.exp(-x)) 

def add(x, y):
  return x + y
  • Note: if y has a lower dimension than x, it will be broadcasted: axes are added to match the dimensionality, and y is repeated along the new axes
>>> np.array([[1,2],[3,4]]) + np.array([10,20])
array([[11, 22],
       [13, 24]])

Backward pass (backpropagation)¶

  • For the last layer, compute the gradient of the loss function $\mathcal{L}$ w.r.t. all the weights of layer $l$

$$\nabla \mathcal{L} = \frac{\partial \mathcal{L}}{\partial W^{(l)}} = \begin{bmatrix} \frac{\partial \mathcal{L}}{\partial w_{0,0}} & \ldots & \frac{\partial \mathcal{L}}{\partial w_{0,l}} \\ \vdots & \ddots & \vdots \\ \frac{\partial \mathcal{L}}{\partial w_{k,0}} & \ldots & \frac{\partial \mathcal{L}}{\partial w_{k,l}} \end{bmatrix}$$

  • Sum up the gradients for all $\mathbf{x}_j$ in minibatch: $\sum_{j} \frac{\partial \mathcal{L}(\mathbf{x}_j,y_j)}{\partial W^{(l)}}$
  • Update all weights in a layer at once (with learning rate $\eta$): $W_{(i+1)}^{(l)} = W_{(i)}^{(l)} - \eta \sum_{j} \frac{\partial \mathcal{L}(\mathbf{x}_j,y_j)}{\partial W_{(i)}^{(l)}}$
  • Repeat for next layer, iterating backwards (most efficient, avoids redundant calculations)


Example¶

  • Imagine feeding a single data point: the output is $\hat{y} = g(z_0) = g(w_0 + w_1 * a_1 + w_2 * a_2 +... + w_p * a_p)$
  • Decrease loss by updating weights:
    • Update the weights of last layer to maximize improvement: $w_{i,(new)} = w_{i} - \frac{\partial \mathcal{L}}{\partial w_i} * \eta$
    • To compute gradient $\frac{\partial \mathcal{L}}{\partial w_i}$ we need the chain rule: $f(g(x)) = f'(g(x)) * g'(x)$ $$\frac{\partial \mathcal{L}}{\partial w_i} = \color{red}{\frac{\partial \mathcal{L}}{\partial g}} \color{blue}{\frac{\partial \mathcal{g}}{\partial z_0}} \color{green}{\frac{\partial \mathcal{z_0}}{\partial w_i}}$$
  • E.g., with $\mathcal{L} = \frac{1}{2}(y-\hat{y})^2$ and sigmoid $\sigma$: $\frac{\partial \mathcal{L}}{\partial w_i} = \color{red}{(\hat{y} - y)} * \color{blue}{\sigma'(z_0)} * \color{green}{a_i}$

Backpropagation (2)¶

  • Another way to decrease the loss $\mathcal{L}$ is to update the activations $a_i$
    • To update $a_i = f(z_i)$, we need to update the weights of the previous layer
    • We want to nudge $a_i$ in the right direction by updating $w_{i,j}$: $$\frac{\partial \mathcal{L}}{\partial w_{i,j}} = \frac{\partial \mathcal{L}}{\partial a_i} \frac{\partial a_i}{\partial z_i} \frac{\partial \mathcal{z_i}}{\partial w_{i,j}} = \left( \frac{\partial \mathcal{L}}{\partial g} \frac{\partial \mathcal{g}}{\partial z_0} \frac{\partial \mathcal{z_0}}{\partial a_i} \right) \frac{\partial a_i}{\partial z_i} \frac{\partial \mathcal{z_i}}{\partial w_{i,j}}$$
    • We know $\frac{\partial \mathcal{L}}{\partial g}$ and $\frac{\partial \mathcal{g}}{\partial z_0}$ from the previous step, $\frac{\partial \mathcal{z_0}}{\partial a_i} = w_i$, $\frac{\partial a_i}{\partial z_i} = f'$ and $\frac{\partial \mathcal{z_i}}{\partial w_{i,j}} = x_j$

Backpropagation (3)¶

  • With multiple output nodes, $\mathcal{L}$ is the sum of all per-output (per-class) losses
    • $\frac{\partial \mathcal{L}}{\partial a_i}$ is sum of the gradients for every output
  • Per layer, sum up gradients for every point $\mathbf{x}$ in the batch: $\sum_{j} \frac{\partial \mathcal{L}(\mathbf{x}_j,y_j)}{\partial W}$
  • Update all weights of every layer $l$
    • $W_{(i+1)}^{(l)} = W_{(i)}^{(l)} - \eta \sum_{j} \frac{\partial \mathcal{L}(\mathbf{x}_j,y_j)}{\partial W_{(i)}^{(l)}}$
  • Repeat with a new batch of data until loss converges
  • Nice animation of the entire process

Summary¶

  • The network output $a_o$ is defined by the weights $W^{(o)}$ and biases $\mathbf{b}^{(o)}$ of the output layer, and
  • the activations of a hidden layer $h_1$ with activation function $a_{h_1}$, weights $W^{(1)}$ and biases $\mathbf{b}^{(1)}$:

$$\color{red}{a_o(\mathbf{x})} = \color{red}{a_o(\mathbf{z_0})} = \color{red}{a_o(W^{(o)}} \color{blue}{a_{h_1}(z_{h_1})} \color{red}{+ \mathbf{b}^{(o)})} = \color{red}{a_o(W^{(o)}} \color{blue}{a_{h_1}(W^{(1)} \color{green}{\mathbf{x}} + \mathbf{b}^{(1)})} \color{red}{+ \mathbf{b}^{(o)})} $$

  • Minimize the loss by SGD. For layer $l$, compute $\frac{\partial \mathcal{L}(a_o(x))}{\partial W_l}$ and $\frac{\partial \mathcal{L}(a_o(x))}{\partial b_{l,i}}$ using the chain rule
  • Decomposes into gradient of layer above, gradient of activation function, gradient of layer input:

$$\frac{\partial \mathcal{L}(a_o)}{\partial W^{(1)}} = \color{red}{\frac{\partial \mathcal{L}(a_o)}{\partial a_{h_1}}} \color{blue}{\frac{\partial a_{h_1}}{\partial z_{h_1}}} \color{green}{\frac{\partial z_{h_1}}{\partial W^{(1)}}} = \left( \color{red}{\frac{\partial \mathcal{L}(a_o)}{\partial a_o}} \color{blue}{\frac{\partial a_o}{\partial z_o}} \color{green}{\frac{\partial z_o}{\partial a_{h_1}}}\right) \color{blue}{\frac{\partial a_{h_1}}{\partial z_{h_1}}} \color{green}{\frac{\partial z_{h_1}}{\partial W^{(1)}}} $$

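As an illustration, a minimal NumPy version of these gradients for a network with one hidden layer, using sigmoid activations and the squared loss from the earlier example (all sizes and values are made up):

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

rng = np.random.default_rng(0)
x, y = rng.normal(size=3), 1.0                 # a single data point

W1, b1 = rng.normal(size=(4, 3)), np.zeros(4)  # hidden layer
W2, b2 = rng.normal(size=(1, 4)), np.zeros(1)  # output layer

# Forward pass
z1 = W1 @ x + b1;  a1 = sigmoid(z1)            # hidden activations
z2 = W2 @ a1 + b2; a2 = sigmoid(z2)            # output
loss = 0.5 * (y - a2) ** 2

# Backward pass (chain rule, layer by layer)
d_z2 = (a2 - y) * a2 * (1 - a2)        # dL/da_o * da_o/dz_o  (sigmoid' = a(1-a))
dW2  = np.outer(d_z2, a1)              # dz_o/dW2 = a1
d_z1 = (W2.T @ d_z2) * a1 * (1 - a1)   # dz_o/da1 = W2, da1/dz1 = sigmoid'
dW1  = np.outer(d_z1, x)               # dz1/dW1 = x

eta = 0.1                              # gradient descent update
W2 -= eta * dW2; W1 -= eta * dW1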

Activation functions for hidden layers¶

  • Sigmoid: $f(z) = \frac{1}{1+e^{-z}}$
  • Tanh: $f(z) = \frac{2}{1+e^{-2z}} - 1$
    • Activations around 0 are better for gradient descent convergence
  • Rectified Linear (ReLU): $f(z) = max(0,z)$
    • Less smooth, but much faster (note: not differentiable at 0)
  • Leaky ReLU: $f(z) = \begin{cases} 0.01z & z<0 \\ z & otherwise \end{cases}$
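These are one-liners in NumPy (a sketch; PyTorch provides them as torch.sigmoid, torch.tanh, torch.relu, and torch.nn.functional.leaky_relu):

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def tanh(z):
    return 2 / (1 + np.exp(-2 * z)) - 1   # identical to np.tanh(z)

def relu(z):
    return np.maximum(0, z)

def leaky_relu(z, alpha=0.01):
    return np.where(z < 0, alpha * z, z)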

Effect of activation functions on the gradient¶

  • During gradient descent, the gradient depends on the activation function $a_{h}$: $\frac{\partial \mathcal{L}(a_o)}{\partial W^{(l)}} = \color{red}{\frac{\partial \mathcal{L}(a_o)}{\partial a_{h_l}}} \color{blue}{\frac{\partial a_{h_l}}{\partial z_{h_l}}} \color{green}{\frac{\partial z_{h_l}}{\partial W^{(l)}}}$
  • If derivative of the activation function $\color{blue}{\frac{\partial a_{h_l}}{\partial z_{h_l}}}$ is 0, the weights $w_i$ are not updated
    • Moreover, the gradients of previous layers will be reduced (vanishing gradient)
  • sigmoid, tanh: gradient is very small for large positive or negative inputs: slow updates
  • With ReLU, $\color{blue}{\frac{\partial a_{h_l}}{\partial z_{h_l}}} = 1$ if $z>0$, hence better against vanishing gradients
    • Problem: for very negative inputs, the gradient is 0 and may never recover (dying ReLU)
    • Leaky ReLU has a small (0.01) gradient there to allow recovery

ReLU vs Tanh¶

  • What is the effect of using non-smooth activation functions?
    • ReLU produces piecewise-linear boundaries, but allows deeper networks
    • Tanh produces smoother decision boundaries, but is slower

Activation functions for output layer¶

  • sigmoid converts output to probability in [0,1]
    • For binary classification
  • softmax converts all outputs (aka 'logits') to probabilities that sum up to 1
    • For multi-class classification ($k$ classes)
    • Can cause over-confident models. If so, smooth the labels: $y_{smooth} = (1-\alpha)y + \frac{\alpha}{k}$ $$\text{softmax}(\mathbf{x},i) = \frac{e^{x_i}}{\sum_{j=1}^k e^{x_j}}$$
  • For regression, don't use any activation function, let the model learn the exact target
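A NumPy sketch of softmax and label smoothing (the logits are made up; the max is subtracted for numerical stability):

import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))      # subtract max for numerical stability
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1]) # raw outputs for k = 3 classes
probs = softmax(logits)            # sums to 1, here ~[0.66, 0.24, 0.10]

y = np.array([1.0, 0.0, 0.0])      # one-hot label
alpha, k = 0.1, 3
y_smooth = (1 - alpha) * y + alpha / k   # ~[0.93, 0.03, 0.03]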

Weight initialization¶

  • Initializing weights to 0 is bad: all gradients in layer will be identical (symmetry)
  • Too small random weights shrink activations to 0 along the layers (vanishing gradient)
  • Too large random weights multiply along layers (exploding gradient, zig-zagging)
  • Ideal: small random weights + variance of input and output gradients remains the same
    • Glorot/Xavier initialization (for tanh): randomly sample from $N(0,\sigma), \sigma = \sqrt{\frac{2}{\text{fan_in + fan_out}}}$
      • fan_in: number of input units, fan_out: number of output units
    • He initialization (for ReLU): randomly sample from $N(0,\sigma), \sigma = \sqrt{\frac{2}{\text{fan_in}}}$
    • Uniform sampling (instead of $N(0,\sigma)$) for deeper networks (w.r.t. vanishing gradients)
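In PyTorch, nn.Linear layers already get a sensible default initialization, but Glorot/Xavier and He initialization can be applied explicitly via torch.nn.init (a sketch):

import torch.nn as nn
import torch.nn.init as init

layer_tanh = nn.Linear(784, 512)
init.xavier_normal_(layer_tanh.weight)    # Glorot/Xavier, suited to tanh

layer_relu = nn.Linear(784, 512)
init.kaiming_normal_(layer_relu.weight, nonlinearity='relu')  # He, suited to ReLU

# Uniform variants: init.xavier_uniform_, init.kaiming_uniform_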

Weight initialization: transfer learning¶

  • Instead of starting from scratch, start from weights previously learned from similar tasks
    • This is, to a large extent, how humans learn so fast
  • Transfer learning: learn weights on task T, transfer them to new network
    • Weights can be frozen, or finetuned to the new data
  • Only works if the previous task is 'similar' enough
    • Generally, weights learned on very diverse data (e.g. ImageNet) transfer better
    • Meta-learning: learn a good initialization across many related tasks


Optimizers¶

SGD with learning rate schedules¶

  • Using a constant learning rate $\eta$ for weight updates $\mathbf{w}_{(s+1)} = \mathbf{w}_s-\eta\nabla \mathcal{L}(\mathbf{w}_s)$ is not ideal
    • You would need to 'magically' know the right value

SGD with learning rate schedules¶

  • Learning rate decay/annealing with decay rate $k$
    • E.g. exponential ($\eta_{s+1} = \eta_{0} e^{-ks}$), inverse-time ($\eta_{s+1} = \frac{\eta_{0}}{1+ks}$),...
  • Cyclical learning rates
    • Change from small to large: hopefully the model stays in a 'good' region long enough before diverging
    • Warm restarts: aggressive decay + reset to initial learning rate
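PyTorch provides these as learning rate schedulers; a sketch with a placeholder linear model (the hyperparameter values are arbitrary):

import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR, CosineAnnealingWarmRestarts

model = nn.Linear(784, 10)   # placeholder model
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Exponential decay: multiply the learning rate by gamma after every epoch
scheduler = ExponentialLR(optimizer, gamma=0.9)
# Alternative: cosine annealing with warm restarts every T_0 epochs
# scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10)

for epoch in range(20):
    ...               # training loop for one epoch goes here
    scheduler.step()  # update the learning rate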

Momentum¶

  • Imagine a ball rolling downhill: accumulates momentum, doesn't exactly follow steepest descent
    • Reduces oscillation, follows larger (consistent) gradient of the loss surface
  • Adds a velocity vector $\mathbf{v}$ with momentum $\gamma$ (e.g. 0.9, or increase from $\gamma=0.5$ to $\gamma=0.99$) $$\mathbf{w}_{(s+1)} = \mathbf{w}_{(s)} + \mathbf{v}_{(s)} \qquad \text{with} \qquad \color{blue}{\mathbf{v}_{(s)}} = \color{green}{\gamma \mathbf{v}_{(s-1)}} - \color{red}{\eta \nabla \mathcal{L}(\mathbf{w}_{(s)})}$$
  • Nesterov momentum: Look where momentum step would bring you, compute gradient there
    • Responds faster (and reduces momentum) when the gradient changes $$\color{blue}{\mathbf{v}_{(s)}} = \color{green}{\gamma \mathbf{v}_{(s-1)}} - \color{red}{\eta \nabla \mathcal{L}(\mathbf{w}_{(s)} + \gamma \mathbf{v}_{(s-1)})}$$
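In PyTorch, (Nesterov) momentum is an option of the SGD optimizer (a sketch with a placeholder linear model):

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(784, 10)   # placeholder model

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)                 # classical momentum
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)  # Nesterov momentum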

Momentum in practice¶


Adaptive gradients¶

  • 'Correct' the learning rate for each $w_i$ based on specific local conditions (layer depth, fan-in,...)
  • Adagrad: scale $\eta$ according to the sum of squared previous gradients $G_{i,(s)} = \sum_{t=1}^s \nabla \mathcal{L}(w_{i,(t)})^2$
    • Update rule for $w_i$. Usually $\epsilon=10^{-7}$ (avoids division by 0), $\eta=0.001$. $$w_{i,(s+1)} = w_{i,(s)} - \frac{\eta}{\sqrt{G_{i,(s)}+\epsilon}} \nabla \mathcal{L}(w_{i,(s)})$$
  • RMSProp: use moving average of squared gradients $m_{i,(s)} = \gamma m_{i,(s-1)} + (1-\gamma) \nabla \mathcal{L}(w_{i,(s)})^2$
    • Avoids the effective learning rate dwindling to 0 as $G_{i,(s)}$ grows (as in Adagrad). Usually $\gamma=0.9, \eta=0.001$ $$w_{i,(s+1)} = w_{i,(s)}- \frac{\eta}{\sqrt{m_{i,(s)}+\epsilon}} \nabla \mathcal{L}(w_{i,(s)})$$

Adam (Adaptive moment estimation)¶

  • Adam: RMSProp + momentum. Adds moving average for gradients as well ($\gamma_2$ = momentum):

    • Adds a bias correction to compensate for the moving averages starting at 0 (biased toward small values early on): $\hat{m}_{i,(s)} = \frac{m_{i,(s)}}{1-\gamma^s}$ and $\hat{g}_{i,(s)} = \frac{g_{i,(s)}}{1-\gamma_2^s}$ $$g_{i,(s)} = \gamma_2 g_{i,(s-1)} + (1-\gamma_2) \nabla \mathcal{L}(w_{i,(s)})$$ $$w_{i,(s+1)} = w_{i,(s)}- \frac{\eta}{\sqrt{\hat{m}_{i,(s)}+\epsilon}} \hat{g}_{i,(s)}$$
  • Adamax: Idem, but use max() instead of a moving average: $u_{i,(s)} = max(\gamma u_{i,(s-1)}, |\nabla \mathcal{L}(w_{i,(s)})|)$ $$w_{i,(s+1)} = w_{i,(s)}- \frac{\eta}{u_{i,(s)}} \hat{g}_{i,(s)}$$

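All of these optimizers are available in torch.optim (a sketch with a placeholder linear model; the values shown are common defaults):

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(784, 10)   # placeholder model

optimizer = optim.Adagrad(model.parameters(), lr=0.001)
optimizer = optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)       # alpha plays the role of gamma
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
optimizer = optim.Adamax(model.parameters(), lr=0.002)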

SGD Optimizer Zoo¶

  • RMSProp often works well, but do try alternatives. For even more optimizers, see here.

Neural networks in practice¶

  • There are many practical courses on training neural nets.
    • E.g.: https://pytorch.org/tutorials/, fast.ai course
  • We'll use PyTorch in these examples and the labs.
  • Focus on key design decisions, evaluation, and regularization
  • Running example: Fashion-MNIST
    • 28x28 pixel images of 10 classes of fashion items

Preparing the data¶

  • We'll use feed-forward networks first, so we flatten the input data
  • Create train-test splits to evaluate the model later
  • Convert the data (numpy arrays) to PyTorch tensors
# Flatten images, create train-test split
X_flat = X.reshape(70000, 28 * 28)
X_train, X_test, y_train, y_test = train_test_split(X_flat, y, stratify=y)

# Convert numpy arrays to PyTorch tensors with correct types
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
  • Create data loaders to return data in batches
import torch
from torch.utils.data import DataLoader, TensorDataset

# Create PyTorch datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Building the network¶

  • PyTorch has a Sequential and Functional API. We'll use the Sequential API first.
  • Input layer: a flat vector of 28*28 = 784 nodes
    • We'll see how to properly deal with images later
  • Two dense (Linear) hidden layers: 512 nodes each, ReLU activation
  • Output layer: 10 nodes (for 10 classes)
    • Softmax is not needed here, it will be applied in the loss function
import torch.nn as nn

model = nn.Sequential(
    nn.Linear(28 * 28, 512), # Layer 1: 28*28 inputs to 512 output nodes
    nn.ReLU(),
    nn.Linear(512, 512), # Layer 2: 512 inputs to 512 output nodes
    nn.ReLU(),
    nn.Linear(512, 10), # Layer 3: 512 inputs to 10 output nodes
)

In the Functional API, the same network looks like this

import torch.nn.functional as F

class NeuralNetwork(nn.Module): # Class that defines your model
    def __init__(self):
        super(NeuralNetwork, self).__init__() # Components defined in __init__
        self.fc1 = nn.Linear(28 * 28, 512)    # Fully connected layers
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 10)

    def forward(self, x):        # Forward pass and structure of the network
        x = F.relu(self.fc1(x))  # Layer 1: Input to FC1, then through ReLU
        x = F.relu(self.fc2(x))  # Layer 2: Then through FC2, then ReLU
        x = self.fc3(x)          # Layer 3: Then through FC3 (no softmax: done in the loss)
        return x                 # Return output

model = NeuralNetwork()

Choosing loss, optimizer, metrics¶

  • Loss function: Cross-entropy (log loss) for multi-class classification
  • Optimizer: Any of the optimizers we discussed before. RMSprop/Adam usually work well.
  • Metrics: To monitor performance during training and testing, e.g. accuracy
import torch.optim as optim
import torchmetrics

# Loss function with label smoothing. Also applies softmax internally
criterion = nn.CrossEntropyLoss(label_smoothing=0.01)

# Optimizer. Note that we pass the model parameters at creation time.
optimizer = optim.RMSprop(model.parameters(), lr=0.001, momentum=0.0)

# Accuracy metric
accuracy_metric = torchmetrics.Accuracy(task="multiclass", num_classes=10)

Training on GPU¶

  • The device is where the training is done. It's "cpu" by default.
  • The model, tensors, and metric must all be moved to the same device.
device = torch.device("cpu")            # Default: CPU
if torch.cuda.is_available():           # For CUDA based systems
    device = torch.device("cuda")
elif torch.backends.mps.is_available(): # For MPS (M1-M4 Mac) based systems
    device = torch.device("mps")
print(f"Used device: {device}")

# Move models and metrics to `device`
model.to(device)                      
accuracy_metric = accuracy_metric.to(device)

# Move batches one at a time (GPUs have limited memory)
for X_batch, y_batch in train_loader:
    X_batch, y_batch = X_batch.to(device), y_batch.to(device)  

Training loop¶

In pure PyTorch, you have to write the training loop yourself (as well as any code to print out progress)

for epoch in range(5):
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device) # to GPU

        # Forward pass + loss calculation
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass
        optimizer.zero_grad() # Reset gradients (otherwise they accumulate)
        loss.backward()       # Backprop. Computes all gradients
        optimizer.step()      # Uses gradients to update weights
Epoch [1/5], Loss: 2.3524, Accuracy: 0.7495
Epoch [2/5], Loss: 0.5531, Accuracy: 0.8259
Epoch [3/5], Loss: 0.5102, Accuracy: 0.8408
Epoch [4/5], Loss: 0.4897, Accuracy: 0.8493
Epoch [5/5], Loss: 0.4758, Accuracy: 0.8550

loss.backward()¶

  • Every time you perform a forward pass, PyTorch dynamically constructs a computational graph
    • This graph tracks tensors and operations involved in computing gradients (see next slide)
  • The loss returned is a tensor, and every tensor is part of the computational graph
  • When you call .backward() on loss, PyTorch traverses this graph in reverse to compute all gradients
    • This process is called automatic differentiation
    • Stores intermediate values so no gradient component is calculated twice
  • When backward() completes, the computational graph is discarded by default to free memory

Computational graph for our model (Loss in green, weights/biases in blue)

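A tiny stand-alone example of this automatic differentiation (not our model, just two scalar tensors):

import torch

w = torch.tensor(2.0, requires_grad=True)  # a 'weight' we want gradients for
x = torch.tensor(3.0)                      # input, no gradient needed

loss = (w * x - 1.0) ** 2  # forward pass builds the computational graph
loss.backward()            # traverse the graph in reverse to compute gradients

print(w.grad)              # dloss/dw = 2*(w*x - 1)*x = 30.0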

In PyTorch Lightning¶

  • A high-level framework built on PyTorch that simplifies deep learning model training
  • Same code, but extend pl.LightningModule instead of nn.Module
  • Has a number of predefined functions. For instance:
class NeuralNetwork(pl.LightningModule):
    def __init__(self):
        pass # Initialize model
    
    def forward(self, x): 
        pass # Forward pass, return output tensor
    
    def configure_optimizers(self):
        pass # Configure optimizer (e.g. Adam)
    
    def training_step(self, batch, batch_idx):
        pass # Return loss tensor
    
    def validation_step(self, batch, batch_idx):
        pass # Return loss tensor
        
    def test_step(self, batch, batch_idx):
        pass # Return loss tensor

Our entire example now becomes:

import pytorch_lightning as pl

class NeuralNetwork(pl.LightningModule):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 10)
        self.criterion = nn.CrossEntropyLoss(label_smoothing=0.01)
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def training_step(self, batch, batch_idx):
        X_batch, y_batch = batch
        outputs = self(X_batch)
        return self.criterion(outputs, y_batch)

    def configure_optimizers(self):
        return optim.RMSprop(self.parameters(), lr=0.001, momentum=0.0)

model = NeuralNetwork()    

We can also get a nice model summary

  • Lots of parameters (weights and biases) to learn!
    • hidden layer 1 : (28 * 28 + 1) * 512 = 401920
    • hidden layer 2 : (512 + 1) * 512 = 262656
    • output layer: (512 + 1) * 10 = 5130
ModelSummary(pl_model, max_depth=2)
  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | fc1       | Linear             | 401 K  | train
1 | fc2       | Linear             | 262 K  | train
2 | fc3       | Linear             | 5.1 K  | train
3 | criterion | CrossEntropyLoss   | 0      | train
4 | accuracy  | MulticlassAccuracy | 0      | train
---------------------------------------------------------
669 K     Trainable params
0         Non-trainable params
669 K     Total params
2.679     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode

Training¶

To log results while training, we can extend the training methods:

def training_step(self, batch, batch_idx):
    X_batch, y_batch = batch
    outputs = self(X_batch)                 # Logits (raw outputs)
    loss = self.criterion(outputs, y_batch) # Loss
    preds = torch.argmax(outputs, dim=1)    # Predictions
    acc = self.accuracy(preds, y_batch)     # Metric
    self.log("train_loss", loss)            # self.log is the default
    self.log("train_acc", acc)              # TensorBoard logger
    return loss

def on_train_epoch_end(self): # Runs at the end of every epoch
    avg_loss = self.trainer.callback_metrics["train_loss"].item()
    avg_acc = self.trainer.callback_metrics["train_acc"].item()
    print(f"Epoch {self.trainer.current_epoch}: Loss = {avg_loss:.4f}, Train accuracy = {avg_acc:.4f}")

We also need to implement the validation steps if we want validation scores

  • Identical to training_step except for the logging
def validation_step(self, batch, batch_idx):
    X_batch, y_batch = batch
    outputs = self(X_batch)
    loss = self.criterion(outputs, y_batch)
    preds = torch.argmax(outputs, dim=1)
    acc = self.accuracy(preds, y_batch)
    self.log("val_loss", loss, on_epoch=True)
    self.log("val_acc", acc, on_epoch=True)
    return loss

Lightning Trainer¶

For training, we can now create a trainer and fit it. This will also automatically move everything to GPU.

trainer = pl.Trainer(max_epochs=3, accelerator="gpu") # Or 'cpu'
trainer.fit(model, train_loader)
Epoch 1: Loss = 0.6928, Accuracy = 0.8000
Epoch 2: Loss = 0.3986, Accuracy = 0.9000
Epoch 3: Loss = 0.3572, Accuracy = 0.9000

Choosing training hyperparameters¶

  • Number of epochs: enough to allow convergence
    • Too much: model starts overfitting (or levels off and just wastes time)
  • Batch size: small batches (e.g. 32, 64,... samples) often preferred
    • 'Noisy' gradient updates from small batches make overfitting less likely
      • Large batches generalize less well ('generalization gap')
    • Requires less memory (especially in GPUs)
    • Large batches do speed up training, may converge in fewer epochs
  • Batch size interacts with learning rate
    • Instead of shrinking the learning rate, you can increase the batch size

Model selection¶

  • Train the neural net and track the loss on a validation set after every iteration
    • You can add a callback to the fit function to get info on every epoch
  • The best model is found after a few epochs, then it starts overfitting

Early stopping¶

  • Stop training when the validation loss (or validation accuracy) no longer improves
  • Loss can be bumpy: use a moving average or wait for $k$ steps without improvement
from pytorch_lightning.callbacks import EarlyStopping

# Define early stopping callback
early_stopping = EarlyStopping(
    monitor="val_loss", mode="min", # minimize validation loss
    patience=3)                     # Number of epochs with no improvement before stopping
          
# Update the Trainer to include early stopping as a callback
trainer = pl.Trainer(
    max_epochs=10, accelerator=accelerator,
    callbacks=[TrainingPlotCallback(), early_stopping]  # Attach the callbacks
)

Regularization and memorization capacity¶

  • The number of learnable parameters is called the model capacity
  • A model with more parameters has a higher memorization capacity
    • Too high capacity causes overfitting, too low causes underfitting
    • In the extreme, the training set can be 'memorized' in the weights
  • Smaller models are forced to learn a compressed representation that generalizes better
    • Find the sweet spot: e.g. start with few parameters, increase until overfitting starts.
  • Example: 256 nodes in first layer, 32 nodes in second layer, similar performance
    • Avoid bottlenecks: layers so small that information is lost
self.fc1 = nn.Linear(28 * 28, 256)
self.fc2 = nn.Linear(256, 32)
self.fc3 = nn.Linear(32, 10)

Weight regularization (weight decay)¶

  • We can also add weight regularization to our loss function (or invent our own)
  • L1 regularization: leads to sparse networks with many weights that are 0
  • L2 regularization: leads to many very small weights
def training_step(self, batch, batch_idx):
    X_batch, y_batch = batch
    outputs = self(X_batch)                 
    loss = self.criterion(outputs, y_batch)
    l1_lambda = 1e-5 # L1 Regularization
    l1_loss = sum(p.abs().sum() for p in self.parameters())
    l2_lambda = 1e-4 # L2 Regularization
    l2_loss = sum((p ** 2).sum() for p in self.parameters())
    return loss + l2_lambda * l2_loss  # Using L2 only

Alternative: set weight_decay in the optimizer (only for L2 loss)

def configure_optimizers(self):
    return optim.RMSprop(self.parameters(), lr=0.001, momentum=0.0, weight_decay=1e-4)

Dropout¶

  • Every iteration, randomly set a number of activations $a_i$ to 0
  • Dropout rate : fraction of the outputs that are zeroed-out (e.g. 0.1 - 0.5)
    • Use higher dropout rates for deeper networks
  • Use higher dropout in early layers, lower dropout later
    • Early layers are usually larger, deeper layers need stability
  • Idea: break up accidental non-significant learned patterns
  • At test time, nothing is dropped out, but the output values are scaled down (multiplied by the keep probability $1-p$)
    • Balances out that more units are active than during training (PyTorch's nn.Dropout instead scales activations up by $\frac{1}{1-p}$ during training, so no test-time scaling is needed)

Dropout layers¶

  • Dropout is usually implemented as a special layer
def __init__(self):
    super(NeuralNetwork, self).__init__()
    self.fc1 = nn.Linear(28 * 28, 512)
    self.dropout1 = nn.Dropout(p=0.2)  # 20% dropout
    self.fc2 = nn.Linear(512, 512)
    self.dropout2 = nn.Dropout(p=0.1)  # 10% dropout
    self.fc3 = nn.Linear(512, 10)

def forward(self, x):
    x = F.relu(self.fc1(x))
    x = self.dropout1(x)  # Apply dropout
    x = F.relu(self.fc2(x))
    x = self.dropout2(x)  # Apply dropout
    return self.fc3(x)

Batch Normalization¶

  • We've seen that scaling the input is important, but what if layer activations become very large?
    • Same problems, starting deeper in the network
  • Batch normalization: normalize the activations of the previous layer within each batch
    • Within a batch, set the mean activation close to 0 and the standard deviation close to 1
      • Across batches, use an exponential moving average of the batch-wise mean and variance
    • Allows deeper networks that are less prone to vanishing or exploding gradients

BatchNorm layers¶

  • Batch normalization is also usually implemented as a special layer
def __init__(self):
    super(NeuralNetwork, self).__init__()
    self.fc1 = nn.Linear(28 * 28, 512)
    self.bn1 = nn.BatchNorm1d(512)  # Batch normalization after first layer
    self.fc2 = nn.Linear(512, 265)
    self.bn2 = nn.BatchNorm1d(265)  # Batch normalization after second layer
    self.fc3 = nn.Linear(265, 10)

def forward(self, x):
    x = x.view(x.size(0), -1)  # Flatten the image
    x = F.relu(self.bn1(self.fc1(x)))  # Apply batch norm after linear layer
    x = F.relu(self.bn2(self.fc2(x)))  # Apply batch norm after second layer
    return self.fc3(x)

New model¶

class NeuralNetwork(pl.LightningModule):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 265)
        self.bn1 = nn.BatchNorm1d(265)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(265, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.dropout3 = nn.Dropout(0.5)
        self.fc4 = nn.Linear(32, 10)
        self.criterion = nn.CrossEntropyLoss(label_smoothing=0.01)
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        x = self.fc4(x)
        return x

New model (Sequential API)¶

model = nn.Sequential(
        nn.Linear(28 * 28, 265),
        nn.BatchNorm1d(265),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(265, 64),
        nn.BatchNorm1d(64),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(64, 32),
        nn.BatchNorm1d(32),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(32, 10))

Our model now performs better and is still improving.


Other logging tools¶

  • There is a lot more tooling to help you build good models
  • E.g. TensorBoard is easy to integrate and offers a convenient dashboard
logger = pl.loggers.TensorBoardLogger("logs/", name="my_experiment")
trainer = pl.Trainer(max_epochs=2, logger=logger)
trainer.fit(lit_model, trainloader)


Summary¶

  • Neural architectures
  • Training neural nets
    • Forward pass: Tensor operations
    • Backward pass: Backpropagation
  • Neural network design:
    • Activation functions
    • Weight initialization
    • Optimizers
  • Neural networks in practice
  • Model selection
    • Early stopping
    • Memorization capacity and information bottleneck
    • L1/L2 regularization
    • Dropout
    • Batch normalization