Multilayer Perceptrons in PyTorch

Andrew Fogarty


# load python
# load packages
import sys
sys.path.append("C:/Users/Andrew/Desktop/Projects/Deep Learning/utils")  # this is the folder with py files
from tools import AverageMeter, ProgressBar #scriptName without .py extension; import each class
from radam import RAdam
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time, datetime, random, re, os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, Subset
from sklearn.preprocessing import LabelEncoder
from torchvision import transforms

# set seed and gpu requirements
SEED = 15
# apply the seed to every RNG this script relies on (python, numpy, torch);
# defining SEED without using it leaves the runs non-reproducible
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
## <torch._C.Generator object at 0x000000001FA40370>
torch.backends.cudnn.deterministic = True

# set gpu/cpu
# NOTE(review): the statement that produced the output line below is missing
# from this copy of the post; autocast is entered explicitly inside train()
## <torch.cuda.amp.autocast_mode.autocast object at 0x00000000339BCB48>
# use the first GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

1 Introduction

Artificial Neural Networks (ANN) are powerful classification and regression algorithms that can solve simple and complex linear and non-linear modeling problems. In this post, we demonstrate the functionality of a basic deep learning multi-layer perceptron model on PyTorch using the famous MNIST data set.

1.1 Preparing a Custom Data Set

In the chunk below, we begin by loading our data using PyTorch’s custom Dataset class – ideal for lazy evaluation, advanced modeling, and big data.

# create Dataset
class CSVDataset(Dataset):
    """MNIST dataset read from a csv file.

    Every column except the last is treated as a feature and the last
    column is treated as the target (class label).
    """

    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        # initialize
        self.data_frame = pd.read_csv(csv_file)
        # all columns but the last
        self.features = self.data_frame[self.data_frame.columns[:-1]]
        # the last column
        self.target = self.data_frame[self.data_frame.columns[-1]]
        # initialize the transform if specified
        self.transform = transform

    # get length of df
    def __len__(self):
        """Return the number of rows (samples) in the csv."""
        return len(self.data_frame)

    # get sample target
    def __get_target__(self):
        """Return the whole target column as a pandas Series."""
        return self.target

    # get df filtered by indices
    def __get_values__(self, indices):
        """Return the rows of the underlying data frame at positions `indices`."""
        return self.data_frame.iloc[indices]

    def __getitem__(self, idx):
        """Return one sample as a dict with 'features', 'target' and 'idx'.

        The optional transform, when set, is applied to the dict before it
        is returned.
        """
        # samplers may hand over the index as a tensor; convert it to a plain int
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # pull a sample in a dict; 'target' is the row's label, 'idx' keeps the
        # row position so a prediction can be traced back to its observation
        sample = {'features': torch.tensor(self.features.iloc[idx].values),
                  'target': torch.tensor(self.target.iloc[idx]),
                  'idx': torch.tensor(idx)}

        if self.transform:
            sample = self.transform(sample)

        return sample

Next, a custom transform is created which applies a normalization transformation on-the-fly.

class Pixel_Normalize():
    """Transform that divides the pixel features of a sample by 255.

    The sample is a dict with the keys 'features', 'target' and 'idx';
    only 'features' is changed, the other two entries are passed through.
    """

    # retrieve sample and unpack it
    def __call__(self, sample):
        """Return a new sample dict whose 'features' are divided by 255."""
        features, target, idx = (sample['features'],
                                 sample['target'],
                                 sample['idx'])

        # normalize each pixel; true_divide always yields a floating point tensor
        normalized_pixels = torch.true_divide(features, 255)

        # yield another dict
        return {'features': normalized_pixels,
                'target': target,
                'idx': idx}

Lastly, we instantiate the custom data set and split it into train, validation, and test data sets as well as check its output.

# instantiate the lazy data set
# NOTE(review): the csv path is empty in this copy of the post - point it at
# the MNIST csv (pixel columns first, label in the last column) before running
csv_dataset = CSVDataset(csv_file='',
                         transform=Pixel_Normalize())

# set train, valid, and test size
train_size = int(0.8 * len(csv_dataset))
valid_size = int(0.1 * len(csv_dataset))
# the test split takes whatever is left so the three sizes always add up to
# len(csv_dataset), which random_split requires
test_size = len(csv_dataset) - train_size - valid_size

# use random split to create three data sets;
train_ds, valid_ds, test_ds = torch.utils.data.random_split(
    csv_dataset, [train_size, valid_size, test_size])

# check the output
for i, batch in enumerate(train_ds):
    if i == 0:
        # stop after the first sample; `batch` stays bound for inspection
        break

By preparing the __get_target__() method in the data set class, we can check the distribution of our dependent variable easily.

# check the distribution of dependent variable; some imbalance
# NOTE(review): the statement was missing from this copy of the post; it is
# reconstructed from the value-count output shown below
print(csv_dataset.__get_target__().value_counts())
## 1    7877
## 7    7293
## 3    7141
## 2    6990
## 9    6958
## 0    6903
## 6    6876
## 8    6825
## 4    6824
## 5    6313
## Name: class, dtype: int64

To deal with imbalance, we have two options: (1) prepare a stratified-split data set, or (2) use a weighted sampler. For small data sets, option #1 is ideal because we prepare (sample) data sets in such a way that they are ideally representative of the true population. If we split them haphazardly, we do two things: (1) we violate the i.i.d assumption of machine learning, and (2) we break our sample’s underlying distribution and thereby underestimate or overestimate the population’s true distribution.

Since our data set is large, we will use a weighted sampler:

# prepare weighted sampling for imbalanced classification
def create_sampler(train_ds, csv_dataset):
    """Build a WeightedRandomSampler that balances classes in the train split.

    Args:
        train_ds: training split; its `.indices` attribute lists the
            positions of the training observations in `csv_dataset`.
        csv_dataset: full data set; `__get_target__()` must return the
            labels as non-negative integers.

    Returns:
        A WeightedRandomSampler that draws len(train split) observations with
        replacement, each weighted by 1 / (count of its class in the split).
    """
    from torch.utils.data import WeightedRandomSampler

    # get indices from train split
    train_indices = train_ds.indices
    # labels of the training observations only (looked up once, reused below)
    train_targets = np.asarray(csv_dataset.__get_target__()[train_indices])
    # generate class distributions [y1, y2, etc...]
    bin_count = np.bincount(train_targets)
    # weight gen: inverse class frequency; a class with no observations in the
    # split keeps weight 0 instead of triggering a division by zero
    weight = np.zeros(len(bin_count), dtype=np.float32)
    seen = bin_count > 0
    weight[seen] = 1. / bin_count[seen].astype(np.float32)
    # produce weights for each observation in the data set
    samples_weight = torch.from_numpy(weight[train_targets])
    # prepare sampler
    sampler = WeightedRandomSampler(weights=samples_weight,
                                    num_samples=len(samples_weight),
                                    replacement=True)
    return sampler

# create sampler for the training ds
# (the class weights are computed from the observations in train_ds only)
train_sampler = create_sampler(train_ds, csv_dataset)    

1.2 Preparing a Feed Forward Neural Network

With some of the data preparation out of the way, the next task is to build a feed forward neural network. ReLU is probably the most prominent activation function, owing to its speed and reliability. However, ReLU neurons may "die" during training: if a neuron's input stays negative, its output and its gradient are both zero, so its weights stop updating (the resulting sparsity is sometimes described as a form of regularization). ReLU suffers less from the vanishing gradient problem than sigmoid or tanh but, because it is unbounded above, its activations and gradients can explode more easily.

We generally use multiple layers because:

  1. We can achieve the same expressiveness with more layers but fewer parameters. With fewer parameters we are less likely to overfit.

  2. More layers provide a form of regularization because later layers are constrained by the behavior of earlier layers.

  3. We can use different layers for different levels of feature abstraction.

  4. However, more layers lead to an increased risk of vanishing or exploding gradients.

# create NN
# subclassing nn.Module registers the layers' parameters, so autograd and the
# optimizer can find them
class FF_NN(torch.nn.Module):
    """Feed forward network with two ReLU hidden layers and a linear output.

    Args:
        num_features: size of each input vector.
        num_classes: number of output classes (size of the logits).
        num_hidden_1: width of the first hidden layer.
        num_hidden_2: width of the second hidden layer.
    """

    def __init__(self, num_features, num_classes, num_hidden_1=128, num_hidden_2=256):
        super().__init__()
        # initialize 3 layers
        # first hidden layer
        self.linear_1 = torch.nn.Linear(num_features, num_hidden_1)
        # second hidden layer
        self.linear_2 = torch.nn.Linear(num_hidden_1, num_hidden_2)
        # output layer
        self.linear_out = torch.nn.Linear(num_hidden_2, num_classes)

    # define how and what order model parameters should be used in forward prop.
    def forward(self, x):
        """Return `(logits, log_probs)` for a batch `x` of shape (batch, num_features).

        The second element is the log-softmax of the logits over dim 1, i.e.
        log-probabilities rather than probabilities.
        """
        # first hidden layer followed by relu
        out = F.relu(self.linear_1(x))
        # second hidden layer followed by relu
        out = F.relu(self.linear_2(out))
        # run inputs through final classification layer
        logits = self.linear_out(out)
        log_probs = F.log_softmax(logits, dim=1)
        return logits, log_probs


# load the NN model
num_features = 784
num_hidden_1 = 128
num_hidden_2 = 256
num_classes = 10
# the hidden sizes are passed explicitly so the class does not depend on
# module-level globals
model = FF_NN(num_features=num_features,
              num_classes=num_classes,
              num_hidden_1=num_hidden_1,
              num_hidden_2=num_hidden_2).to(DEVICE)

1.3 Training Helper Functions

There are a few things worth pointing out in terms of what is happening in the training function below.

  1. When invoking scaler.scale(loss).backward(), we are computing the gradients. Backward() is automatically constructed by torch’s autograd based on the forward() method and the loss function.

  2. When invoking scaler.step(optimizer), we use the gradients to update the weights according to the optimization method. For example, given SGD: weight = weight - learning_rate \(\times\) gradient.

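  3. F.cross_entropy is our loss function; it takes raw logits as inputs and performs log_softmax internally, so the model outputs should not be passed through softmax first. It is desirable to use this loss function over computing softmax, taking its log, and calling nll_loss by hand, because the fused log_softmax is more numerically stable.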

A PyTorch cheat sheet for loss functions is as follows:

# train function
def train(dataloader):
    """Run one training epoch over `dataloader` and return the mean batch loss.

    Uses the module-level `model`, `optimizer`, `scaler`, `scheduler` and
    `DEVICE`. Each batch is a dict with the keys 'features' and 'target'.

    Returns:
        dict with the single key 'loss' holding `train_loss.avg`.
    """
    #pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    # put layers such as dropout/batch-norm (if any) in training mode
    model.train()
    for batch_idx, batch in enumerate(dataloader):
        b_features, b_target = batch['features'].to(DEVICE), batch['target'].to(DEVICE)
        # clear gradients left over from the previous batch
        optimizer.zero_grad()
        # forward pass under mixed precision
        with autocast():
            logits, _ = model(b_features)
            loss = F.cross_entropy(logits, b_target)
        # scale the loss before backprop so fp16 gradients do not underflow
        scaler.scale(loss).backward()
        # unscale the gradients and apply the optimizer update
        scaler.step(optimizer)
        # adjust the scale factor for the next iteration
        scaler.update()
        # OneCycleLR is stepped once per batch, after the optimizer step
        scheduler.step()
        # pbar(step=batch_idx, info={'loss': loss.item()})  # removed for markdown
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}

# valid/test function
def test(dataloader):
    """Evaluate the module-level `model` on `dataloader` without gradients.

    Each batch is a dict with the keys 'features' and 'target'.

    Returns:
        dict with 'valid_loss' (`valid_loss.avg`), 'valid_acc' (correct
        predictions divided by the number of observations seen) and
        'valid_f1' (`valid_f1.avg` of the per-batch macro F1 scores).
    """
    #pbar = ProgressBar(n_total=len(dataloader), desc='Testing')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    valid_f1 = AverageMeter()
    count = 0
    # put layers such as dropout/batch-norm (if any) in inference mode
    model.eval()
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            b_features, b_target = batch['features'].to(DEVICE), batch['target'].to(DEVICE)
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target).item()
            pred = probs.argmax(dim=1)  # get the index of the max log-probability
            correct = pred.eq(b_target.view_as(pred)).sum().item()
            # sklearn works on cpu numpy arrays; macro F1 averages the per-class scores
            f1 = f1_score(b_target.to("cpu").numpy(), pred.to("cpu").numpy(), average='macro')
            batch_size = b_features.size(0)
            valid_f1.update(f1, n=batch_size)
            valid_loss.update(loss, n=batch_size)
            # accumulate the raw number of correct predictions; it is divided by
            # the total observation count below
            valid_acc.update(correct, n=1)
            count += batch_size
            # pbar(step=batch_idx)  # removed for markdown
    return {'valid_loss': valid_loss.avg,
            'valid_acc': valid_acc.sum / count,
            'valid_f1': valid_f1.avg}

1.4 Data Loaders and Misc

# set number of epochs
epochs = 5

# NOTE(review): the DataLoader arguments were lost in this copy of the post;
# the batch size below is an assumed value - confirm against the original
batch_size = 128

# create DataLoaders with samplers
# the weighted sampler decides which training observations are drawn, so
# shuffle must stay off for the training loader
train_dataloader = DataLoader(train_ds,
                              batch_size=batch_size,
                              sampler=train_sampler)

valid_dataloader = DataLoader(valid_ds,
                              batch_size=batch_size,
                              shuffle=False)

test_dataloader = DataLoader(test_ds,
                             batch_size=batch_size,
                             shuffle=False)

# create gradient scaler for mixed precision
scaler = GradScaler()

1.5 Optimizer and Scheduler

Next, Rectified Adam (RAdam) is implemented along with a OneCycleLR scheduler.

# create optimizer
optimizer = RAdam(model.parameters(), lr=0.1)

# set LR scheduler
# NOTE(review): the scheduler arguments were lost in this copy of the post;
# max_lr is an assumed value - confirm against the original. total_steps
# covers every batch of every epoch because the scheduler is stepped per batch.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                max_lr=0.01,
                                                total_steps=len(train_dataloader) * epochs)
# training
for epoch in range(1, epochs + 1):
    # one pass over the training data, then an evaluation on the validation split
    train_log = train(train_dataloader)
    valid_log = test(valid_dataloader)
    # merge both result dicts into one log line per epoch
    logs = dict(train_log, **valid_log)
    show_info = f'\nEpoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()])
    print(show_info)
## Epoch: 1 -  loss: 0.7573 - valid_loss: 0.1846 - valid_acc: 0.9439 - valid_f1: 0.9404 
## Epoch: 2 -  loss: 0.1576 - valid_loss: 0.1325 - valid_acc: 0.9599 - valid_f1: 0.9586 
## Epoch: 3 -  loss: 0.0956 - valid_loss: 0.1134 - valid_acc: 0.9660 - valid_f1: 0.9636 
## Epoch: 4 -  loss: 0.0485 - valid_loss: 0.0689 - valid_acc: 0.9809 - valid_f1: 0.9799 
## Epoch: 5 -  loss: 0.0245 - valid_loss: 0.0670 - valid_acc: 0.9806 - valid_f1: 0.9790 
## C:/Users/Andrew/Desktop/Projects/Deep Learning/utils\ UserWarning: This overload of add_ is deprecated:
##  add_(Number alpha, Tensor other)
## Consider using one of the following signatures instead:
##  add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:766.)
##   exp_avg.mul_(beta1).add_(1 - beta1, grad)
# testing
# evaluate once on the held-out test split; the keys keep the 'valid_' prefix
# because test() is shared with validation
test_log = test(test_dataloader)
print(test_log)
## {'valid_loss': 0.06291491132163043, 'valid_acc': 0.9818571428571429, 'valid_f1': 0.9804316277722123}

2 Sources