# import packages
import torch
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
import sys
sys.path.append("C:/Users/Andrew/Desktop/Projects/Deep Learning/utils")
from tools import AverageMeter, ProgressBar
SEED = 123
torch.manual_seed(SEED)
## <torch._C.Generator object at 0x000000002058C050>
## <torch.cuda.amp.autocast_mode.autocast object at 0x000000001F86B808>
Numerical optimizers provide the means to estimate our parameters by finding the values that maximize the likelihood of generating our data. This guide helps us understand how optimization algorithms find the best estimates for our coefficients in a step-by-step process.
This guide uses Gradient Descent and regularization techniques in a completely manual approach to finding parameters that most likely generated our data. Another way of thinking about gradient descent is that we are inevitably asking the algorithm the following: What parameter values will push our error to zero?
In this section, we will create a torch data set that reads a csv file and processes (normalizes) our data on the fly via the standardize
class.
# create Dataset
class CSVDataset(Dataset):
"""LM dataset."""
def __init__(self, csv_file, transform=None):
"""
Args:
csv_file (string): Path to the csv file.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
# initialize
self.data_frame = pd.read_csv(csv_file)
# all columns but the last
self.features = self.data_frame[self.data_frame.columns[:-1]]
# the last column
self.target = self.data_frame[self.data_frame.columns[-1]]
# initialize the transform if specified
self.transform = transform
# get length of df
def __len__(self):
return len(self.data_frame)
# get df mean and std
def __get_norm__(self):
self.mu, self.sigma = np.mean(self.features.values, axis=0), np.std(self.features.values, axis=0)
return self.mu, self.sigma
# get sample target
def __get_target__(self):
return self.target
# get df filtered by indices
def __get_values__(self, indices):
return self.data_frame.iloc[indices]
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
# pull a sample in a dict
sample = {'features': torch.tensor(self.features.iloc[idx].values),
'target': torch.tensor(self.target.iloc[idx]),
'idx': torch.tensor(idx)}
if self.transform:
sample = self.transform(sample)
return sample
class Standardize():
# retrieve sample and unpack it
def __call__(self, sample):
features, target, idx = (sample['features'],
sample['target'],
sample['idx'])
# normalize each value; zero mean, unit variance
normalized_features = (features - csv_dataset.__get_norm__()[0]) / csv_dataset.__get_norm__()[1]
# yield another dict
return {'features': torch.as_tensor(normalized_features,
dtype=torch.float32,
device=device),
'target': torch.as_tensor(target,
dtype=torch.float32,
device=device),
'idx': torch.as_tensor(idx,
dtype=torch.int,
device=device)}
We will use sklearn
’s make_classification
method to generate some data useful for logistic regression.
X, y = datasets.make_classification(n_samples=1000,
n_features=2,
n_informative=2,
n_redundant=0,
n_classes=2,
random_state=15)
# place data into df
df = pd.DataFrame({'x1': X[:, 0], 'x2': X[:, 1], 'y': y})
df.to_csv('classification_demo.csv', index=False)
# instantiate the lazy data set
csv_dataset = CSVDataset(csv_file='classification_demo.csv', transform=Standardize())
# check normalization unit variance values
csv_dataset.__get_norm__()[1]
## array([1.39204168, 1.26812651])
## array([-0.02077113, 0.01676885])
Next, we will split the data into training and test sets.
The class below finds the weights and bias terms for a logistic regression model through stochastic gradient descent. Through numerical optimization, we find our coefficients via backward propagation by:
Instantiating guesses for our weights and intercept at 0. Alternatively, we could initialize our guesses based on a distribution of our choice.
Matrix multiplying our features against our errors to find our new weight increment and summing our errors to find our new intercept increment.
Update our weights and intercept by multiplying the learning rate (\(\eta\)) against our respective increments and then finally subtracting these values from the instantiated guesses for our weights and bias.
To avoid calculating and figuring out backward propagation, PyTorch allows us to skip this process by simply wrapping class model with torch.nn.Module
and then relying on torch.optim
to handle backward propagation and to update our model’s weights.
# create manual logistic regression model
class LR():
''' Stochastic Gradient Descent Logistic Regression Model '''
def __init__(self, num_features, LAMBDA=0.0):
"""
Args:
num_features (int): Number of independent variables
LAMBDA (float): L2 Regularization (Ridge)
"""
self.num_features = num_features
self.weights = torch.zeros(1, num_features,
dtype=torch.float32, device=device).normal_(0.0, 0.1)
self.bias = torch.zeros(1, dtype=torch.float32, device=device)
self.LAMBDA = LAMBDA
def forward(self, x):
# linear combination
linear = torch.add(torch.mm(x, self.weights.t()), self.bias).view(-1)
# activation function
probas = torch.sigmoid(linear)
return probas
def backward(self, x, y, probas):
# compute error
grad_loss_out = y - probas.view(-1)
# compute gradient loss for each weight
grad_loss_w = -torch.mm(x.t(), grad_loss_out.view(-1, 1)).t()
# compute gradient loss for intercept
grad_loss_b = -torch.sum(grad_loss_out)
return grad_loss_w, grad_loss_b
def _logit_cost(self, y, probas):
tmp1 = torch.mm(-y.view(1, -1), torch.log(probas.view(-1, 1)))
tmp2 = torch.mm((1 - y).view(1, -1), torch.log(1 - probas.view(-1, 1)))
l2 = self.LAMBDA / 2.0 * torch.sum(self.weights**2)
return (tmp1 - tmp2) + l2
Next we prepare our data loaders and check our data again.
# set epochs
epochs = 20
# set lr
learning_rate = 0.01
# instantiate model
model = LR(num_features=2, LAMBDA=25.0)
# prepare training function
def train(dataloader):
#pbar = ProgressBar(n_total=len(dataloader), desc='Training')
train_loss = AverageMeter()
for batch_idx, batch in enumerate(dataloader):
# forward
probas = model.forward(batch['features'])
# backward
grad_w, grad_b = model.backward(batch['features'],
batch['target'],
probas)
# manual regularization -- account for mini-batches
l2_reg = model.LAMBDA * model.weights / len(dataloader)
# update weights
model.weights -= learning_rate * (grad_w + l2_reg)
model.bias -= learning_rate * grad_b
# record loss
loss = model._logit_cost(batch['target'], probas)
# update meter
train_loss.update(loss.item(), n=1)
# update progress bar
#pbar(step=batch_idx, info={'batch_loss': loss.item()})
return {'train_loss': train_loss.avg}
# training
for epoch in range(1, epochs + 1):
train_log = train(train_dataloader)
logs = dict(train_log)
train_logs = f'\nEpoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()])
print(train_logs)
##
## Epoch: 1 - train_loss: 61.5763
##
## Epoch: 2 - train_loss: 68.8976
##
## Epoch: 3 - train_loss: 75.1295
##
## Epoch: 4 - train_loss: 79.3965
##
## Epoch: 5 - train_loss: 80.2331
##
## Epoch: 6 - train_loss: 81.3295
##
## Epoch: 7 - train_loss: 81.1134
##
## Epoch: 8 - train_loss: 82.3650
##
## Epoch: 9 - train_loss: 81.3603
##
## Epoch: 10 - train_loss: 81.6685
##
## Epoch: 11 - train_loss: 80.6160
##
## Epoch: 12 - train_loss: 81.1265
##
## Epoch: 13 - train_loss: 82.3104
##
## Epoch: 14 - train_loss: 81.0290
##
## Epoch: 15 - train_loss: 82.2829
##
## Epoch: 16 - train_loss: 81.7126
##
## Epoch: 17 - train_loss: 81.8430
##
## Epoch: 18 - train_loss: 81.8037
##
## Epoch: 19 - train_loss: 81.6940
##
## Epoch: 20 - train_loss: 80.5625
# valid/test function
def test(dataloader):
#pbar = ProgressBar(n_total=len(dataloader), desc='Testing')
valid_loss = AverageMeter()
valid_acc = AverageMeter()
count = 0
for batch_idx, batch in enumerate(dataloader):
# forward -- skip backward prop
probas = model.forward(batch['features'])
# record loss
loss = model._logit_cost(batch['target'], probas)
# get predictions
prediction = torch.where(probas > 0.5, torch.tensor(1, device=device), torch.tensor(0, device=device)).view(-1)
# compare
correct = prediction.eq(batch['target']).sum().item()
valid_loss.update(loss, n=batch['features'].size(0))
valid_acc.update(correct, n=1)
count += batch['features'].size(0)
#pbar(step=batch_idx)
return {'valid_loss': valid_loss.avg,
'valid_acc': valid_acc.sum / count}
We can see that our work above maps closely an sklearn implementation that uses all of the data for training.
## sklearn verify
# C = inverse of lambda
LAMBDA = 25.0
C = 1 / LAMBDA
clf = LogisticRegression(solver='lbfgs', penalty='l2', C=C).fit(X, y)
print(clf.coef_)
## [[-0.03849518 2.00255362]]
In a fashion very similar to logistic regression, the class below finds the weights and bias terms for a linear regression model through stochastic gradient descent.
df2 = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat479-deep-learning-ss19/master/L05_grad-descent/code/datasets/linreg-data.csv', index_col=0)
X = df[['x1', 'x2']].values
y = df['y'].values
df2 = pd.DataFrame({'x1': X[:, 0], 'x2': X[:, 1], 'y': y})
# save df
df2.to_csv('regression_demo.csv', index=False)
class ToTensor():
# retrieve sample and unpack it
def __call__(self, sample):
features, target, idx = (sample['features'],
sample['target'],
sample['idx'])
# yield another dict
return {'features': torch.as_tensor(features,
dtype=torch.float32,
device=device),
'target': torch.as_tensor(target,
dtype=torch.float32,
device=device),
'idx': torch.as_tensor(idx,
dtype=torch.int,
device=device)}
# instantiate the lazy data set
csv_dataset2 = CSVDataset(csv_file='regression_demo.csv', transform=ToTensor())
The linear model is created below. Notice that forward
models a linear relationship of the weights with respect to the output. Backward
follows the general premise of backward propagation taking place in the following steps:
Take the results of forward propagation and compute \(\hat{Y}\)
Compute the loss \(Y\)-\(\hat{Y}\) to get the gradient of the loss with respect to each weight using the chain rule.
Outside of the backward
function, we multiply the results of step 2 by the learning rate and update each weight.
# create manual ols model
class LM():
''' Stochastic Gradient Descent Linear Model '''
def __init__(self, num_features, LAMBDA=0.0):
"""
Args:
num_features (int): Number of independent variables
LAMBDA (float): L2 Regularization (Ridge)
"""
# set num. dimensions
self.num_features = num_features
# initialize weights as zeros
self.weights = torch.zeros(1, num_features,
dtype=torch.float32, device=device).normal_(0.0, 0.1)
# initialize bias as zeros
self.bias = torch.zeros(1, dtype=torch.float32, device=device)
# initialize LAMBDA
self.LAMBDA = LAMBDA
def forward(self, x):
# linear combination
linear = torch.add(torch.mm(x, self.weights.t()), self.bias).view(-1)
# activation = identity(x) = x
pass # do nothing for activation
return linear
def backward(self, x, y, y_hat):
# find gradient loss
grad_loss_out = y - y_hat.view(-1)
# chain rule: find loss for weights
grad_loss_w = 2 * -torch.mm(x.t(), grad_loss_out.view(-1, 1)) / y.size(0)
# chain rule: find loss for bias
grad_loss_b = 2 * -torch.sum(grad_loss_out) / y.size(0)
return grad_loss_w, grad_loss_b
def loss(self, y_hat, y):
# mean squared error
return torch.mean((y_hat - y)**2)
# create DataLoader
train_dataloader = DataLoader(csv_dataset2,
batch_size=100,
sampler=None,
shuffle=True)
# set epochs
epochs = 40
# set lr
learning_rate = 0.05
# instantiate model
model = LM(num_features=2, LAMBDA=10.0)
# prepare training function
def train(dataloader):
#pbar = ProgressBar(n_total=len(dataloader), desc='Training')
train_loss = AverageMeter()
for batch_idx, batch in enumerate(dataloader):
# forward
y_hat = model.forward(batch['features'].float())
# backward
grad_w, grad_b = model.backward(batch['features'],
batch['target'],
y_hat)
# manual regularization
l2_reg = (model.LAMBDA * model.weights)
l2_reg = l2_reg.reshape(2, 1)
# update weights
model.weights -= learning_rate * (grad_w + l2_reg).view(-1)
model.bias -= (learning_rate * grad_b).view(-1)
# record loss
loss = model.loss(batch['target'], y_hat)
# update meter
train_loss.update(loss.item(), n=1)
# update progress bar
#pbar(step=batch_idx, info={'batch_loss': loss.item()})
return {'train_loss': train_loss.avg}
# training
for epoch in range(1, epochs + 1):
train_log = train(train_dataloader)
logs = dict(train_log)
train_logs = f'\nEpoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()])
print(train_logs)
##
## Epoch: 1 - train_loss: 0.2927
##
## Epoch: 2 - train_loss: 0.1986
##
## Epoch: 3 - train_loss: 0.1871
##
## Epoch: 4 - train_loss: 0.1845
##
## Epoch: 5 - train_loss: 0.1851
##
## Epoch: 6 - train_loss: 0.1849
##
## Epoch: 7 - train_loss: 0.1848
##
## Epoch: 8 - train_loss: 0.1849
##
## Epoch: 9 - train_loss: 0.1854
##
## Epoch: 10 - train_loss: 0.1844
##
## Epoch: 11 - train_loss: 0.1851
##
## Epoch: 12 - train_loss: 0.1844
##
## Epoch: 13 - train_loss: 0.1848
##
## Epoch: 14 - train_loss: 0.1845
##
## Epoch: 15 - train_loss: 0.1851
##
## Epoch: 16 - train_loss: 0.1850
##
## Epoch: 17 - train_loss: 0.1844
##
## Epoch: 18 - train_loss: 0.1854
##
## Epoch: 19 - train_loss: 0.1849
##
## Epoch: 20 - train_loss: 0.1849
##
## Epoch: 21 - train_loss: 0.1843
##
## Epoch: 22 - train_loss: 0.1845
##
## Epoch: 23 - train_loss: 0.1853
##
## Epoch: 24 - train_loss: 0.1853
##
## Epoch: 25 - train_loss: 0.1845
##
## Epoch: 26 - train_loss: 0.1847
##
## Epoch: 27 - train_loss: 0.1851
##
## Epoch: 28 - train_loss: 0.1854
##
## Epoch: 29 - train_loss: 0.1843
##
## Epoch: 30 - train_loss: 0.1846
##
## Epoch: 31 - train_loss: 0.1854
##
## Epoch: 32 - train_loss: 0.1839
##
## Epoch: 33 - train_loss: 0.1856
##
## Epoch: 34 - train_loss: 0.1843
##
## Epoch: 35 - train_loss: 0.1855
##
## Epoch: 36 - train_loss: 0.1848
##
## Epoch: 37 - train_loss: 0.1849
##
## Epoch: 38 - train_loss: 0.1848
##
## Epoch: 39 - train_loss: 0.1849
##
## Epoch: 40 - train_loss: 0.1848
## Weights tensor([[0.0011, 0.0749]], device='cuda:0')
## Bias tensor([0.4993], device='cuda:0')
# sklearn ridge
from sklearn.linear_model import Ridge
LAMBDA = 10.0
# C = inverse of lambda
C = 1/LAMBDA
# alpha = 1 / (2C)
alpha = 1 / (2*C)
x = np.ascontiguousarray(X)
# alpha * N obs for ridge
ridge = Ridge(alpha=alpha*1000, solver='sag').fit(x, y)
print(ridge.coef_)
## [0.00110112 0.07494251]
## 0.49776617212765734