# load packages
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time, datetime, re, random, string
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
from transformers import get_linear_schedule_with_warmup
from itertools import repeat
import optuna
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
import seaborn as sns
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup, AdamW
# reproducibility: seed every RNG the pipeline touches
SEED = 15
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # no-op when CUDA is absent
# force deterministic cuDNN kernels (may cost some speed)
torch.backends.cudnn.deterministic = True
# NOTE(review): the original called torch.cuda.amp.autocast(enabled=True) here,
# which only constructs a context manager and discards it -- a no-op.  Mixed
# precision is applied where it matters, inside the training loop's
# `with autocast():` block.
Text or sequence classification aims to label a sentence or document based on its content. In this post, we use Convolutional Neural Networks to classify a novel data set that I created based on insurgent propaganda messages. This guide stands in contrast to other walk-throughs on the model in that it: (1) offers a full treatment of data preparation and PyTorch, (2) uses GloVe embeddings correctly by taking into account unknown or padding tokens by generating unique vectors for them, and (3) specifically tells the embedding
layer which look-up index is the padding token.
In this section, we run a CNN with GloVe embeddings after addressing some of the high points about CNNs. Convolutional layers aim to find spatial patterns, predominantly in images, through the use of kernels. Kernels can be thought of as small windows that slide across the pixels of an image - calculating the respective weight by multiplying the pixel values with the kernel weight. These values are then summed to get a filtered pixel value which in turn are representative of local features.
In Natural Language Processing, text is one-dimensional but we often represent each word (or character) with an embedding vector, thereby giving our text a two-dimensional representation. The convolutional kernel slides over the embeddings (features) of multiple words rather than pixels. Instead of three input channels like Red Green Blue for images, text processing has just 1 (akin to gray scale) because a single sentence/document will be associated with a single list of embeddings. To slide a kernel over sequences of word embeddings, the sliding window needs to be allowed to look at multiple word embeddings in a sequence. Instead of a square, kernels take on shapes associated with the number of words (\(n\)) to look at in a sequence and the length of the embedding sequence (\(m\)). This \(n\times m\) kernel tells us how many word embeddings it will view at once, like n-grams, while also the length of the word embedding that it will take it account, say 200 or 300 for GLoVe or 768 for BERT.
CNNs tend to include multiple kernel heights, typically 3, each representing a different n-gram range. As a CNN trains, kernel weights are learned for words and surrounding words in a sequential window, yielding local features. CNNs then use max-pooling which aims to retain only the most important (highest value) local feature while discarding less important features.
GloVe is a common set of embeddings used by practitioners and academics in text analysis containing features for 400,000 words. Embeddings capture the similarities between words (e.g., they have high cosine similarities) and are the basis of NLP. Word embedding methods represent words as continuous vectors in a low dimensional space which capture lexical and semantic properties of words. Embeddings can be obtained from the internal representations from neural network models of text or by low rank approximation of co-occurrence statistics.
In this section, I will describe how to set up, correctly, a CNN in PyTorch that relies on GloVe. I begin by loading and preparing my data, sub-setting it to yield only two classes.
# prepare and load data
def prepare_df(pkl_location):
    """Read a pickled DataFrame, keep US/Kabul rows, and binarize the target.

    'US' -> 1, 'Kabul' -> 0; the index is reset after filtering.
    """
    frame = pd.read_pickle(pkl_location)
    # keep only the two classes of interest
    frame = frame.loc[frame['target'].isin(['US', 'Kabul'])]
    # recode the label column in place
    frame.loc[frame['target'] == 'US', 'target'] = 1
    frame.loc[frame['target'] == 'Kabul', 'target'] = 0
    return frame.reset_index(drop=True)
df = prepare_df('C:\\Users\\Andrew\\Desktop\\df.pkl')
Next, I do a small amount of additional data cleaning to my text, as described below.
# prepare data
def clean_df(df):
    """Normalize the raw text in df['body'] and add a word_count column.

    Steps (order matters): dashes -> spaces, lower-casing, removal of
    stray spaces before sentence punctuation, punctuation stripping,
    word counting, then whitespace collapsing.
    """
    # dashes become spaces so hyphenated tokens survive punctuation removal
    df['body'] = df['body'].str.replace('-', ' ')
    # translation table mapping every punctuation character to None
    punct_table = str.maketrans(dict.fromkeys(string.punctuation))
    df['body'] = df['body'].str.lower()
    # drop the space in patterns like "word !" / "word ."
    df['body'] = df['body'].str.replace(r'\s([?.!"](?:\s|$))', r'\1', regex=True)
    # strip punctuation entirely -- f1 improves by .05 by disabling this
    df['body'] = df['body'].str.translate(punct_table)
    # word count is taken before the final whitespace collapse
    df['word_count'] = df['body'].str.split().str.len()
    df['body'] = df['body'].str.split().str.join(' ')
    return df
df = clean_df(df)
Since my corpus includes transliterations of Afghan words, there are a sizable amount of words that are not in the GloVe embedding and are otherwise probably not very helpful to helping us understand important local features through the lens of a CNN even if we were to give them coefficients for an unknown word. As such, I remove rare words from my corpus by:
using `filter` to drop the rare words from the list.
# lets remove rare words
def remove_rare_words(df, min_count=3):
    """Drop words appearing fewer than ``min_count`` times in the corpus.

    Generalized from the hard-coded ``value > 2`` threshold; the default
    preserves the original behavior.  Membership testing now uses a set
    (O(1) lookups instead of scanning ``counts.keys()``) and the inner
    filter no longer shadows its outer argument name.
    """
    # corpus-wide word frequencies -- necessary for the vocab
    word_counts = Counter(" ".join(df['body'].values.tolist()).split(" "))
    # vocabulary of words frequent enough to keep
    keep = {word for word, count in word_counts.items() if count >= min_count}

    def _remove_rare(text):
        # rebuild the document from its surviving words only
        return ' '.join(word for word in text.split() if word in keep)

    df['body'] = df['body'].apply(_remove_rare)
    return df
df = remove_rare_words(df)
Next, I execute a few functions to clean up the data set further and to learn a bit more about my corpus in its entirety.
## 5472
## 0
# discard very short documents (20 words or fewer)
df = df.loc[df['word_count'] > 20]
# what is 95th percentile of word count?
percentile_95 = int(df['word_count'].quantile(0.95))
print(percentile_95)
# whats the length of the vocab?
## 974
# corpus-wide word frequencies, then the vocabulary sorted by frequency (desc)
counts = Counter(" ".join(df['body'].values.tolist()).split(" "))
vocab = sorted(counts, key=counts.get, reverse=True)
print(len(vocab))
## 16354
Now I am ready to load my GloVe embeddings.
# load GloVe embeddings
def load_GloVe(file_path):
    """Parse a GloVe text file into ``{word: np.ndarray}``.

    Each line holds a word followed by its embedding coefficients.
    Uses a context manager so the file handle is released even if
    parsing raises (the original left it open on error).
    """
    embeddings_dictionary = {}
    with open(file_path, encoding="utf8") as glove_file:
        for line in glove_file:
            records = line.split()
            if not records:  # tolerate blank lines
                continue
            # position 0 is the word; the remainder are its coefficients
            embeddings_dictionary[records[0]] = np.asarray(records[1:], dtype='float32')
    return embeddings_dictionary
# load GloVe
# 200-dimensional GloVe 6B vectors (hard-coded local path)
file_path = 'C:\\Users\\Andrew\\Desktop\\glove.6B.200d.txt'
embeddings_dictionary = load_GloVe(file_path)
# useful computing to check vector values and indices
# sanity check: the period token should map to a 200-dim float32 vector
embeddings_dictionary.get('.') # get key value
## array([ 1.2289e-01, 5.8037e-01, -6.9635e-02, -5.0288e-01, 1.0503e-01,
## 3.9945e-01, -3.8635e-01, -8.4279e-02, 1.2219e-01, 8.0312e-02,
## 3.2337e-01, 4.7579e-01, -3.8375e-02, -7.0900e-03, 4.1524e-01,
## 3.2121e-01, -2.1185e-01, 3.6144e-01, -5.5623e-02, -3.0512e-02,
## 4.2854e-01, 2.8547e+00, -1.4623e-01, -1.7557e-01, 3.1197e-01,
## -1.3118e-01, 3.3298e-02, 1.3093e-01, 8.9889e-02, -1.2417e-01,
## 2.3396e-03, -6.8954e-02, -1.0754e-01, -1.1551e-01, -3.1052e-01,
## -1.2097e-01, -4.6691e-01, -8.3600e-02, -3.7664e-02, -7.1779e-02,
## -1.1899e-01, -2.0381e-01, -1.2424e-01, 4.6339e-01, -1.9828e-01,
## -8.0365e-03, 5.3718e-01, 3.1739e-02, 3.4331e-01, 7.9704e-03,
## 4.8744e-03, 3.0592e-02, -1.7615e-01, 8.2342e-01, -1.3793e-01,
## -1.0075e-01, -1.2686e-01, 7.4735e-02, -8.8719e-02, -4.2719e-02,
## 7.6624e-02, 8.9263e-02, 6.4445e-02, -3.1958e-02, 1.5254e-01,
## -1.0384e-01, 7.6604e-02, 3.4099e-01, 2.4331e-01, -1.0452e-01,
## 4.0714e-01, -1.8260e-01, -4.0667e-02, 5.0878e-01, 8.0760e-02,
## 2.2759e-01, -4.2162e-02, -1.8171e-01, -9.5025e-02, 3.0334e-02,
## 8.8202e-02, -3.9843e-06, -3.9877e-03, 1.5724e-01, 3.3167e-01,
## 8.4710e-02, -2.5919e-01, -4.1384e-01, 2.9920e-01, -5.4255e-01,
## 3.2129e-02, 1.0030e-01, 4.4202e-01, 4.4682e-02, -9.0681e-02,
## -1.0481e-01, -1.1860e-01, -3.1972e-01, -2.0790e-01, -4.0203e-02,
## -2.2988e-02, 2.2824e-01, 5.5238e-03, 1.2568e-01, -1.4640e-01,
## -1.4904e-01, -1.1561e-01, 1.0517e+00, -1.9498e-01, 8.3958e-02,
## 4.4812e-02, -1.2965e-01, -9.3468e-02, 2.1237e-01, -8.8332e-02,
## -1.8680e-01, 2.6521e-01, 1.3097e-01, -4.8102e-02, -2.2467e-01,
## 2.8412e-01, 3.4907e-01, 3.4833e-01, 1.7877e-02, 3.0504e-01,
## -8.3453e-01, 4.8856e-02, -1.9330e-01, 2.0764e-01, -4.9701e-01,
## -1.8747e-01, -7.6801e-02, 1.5558e-01, -4.6844e-01, 4.0944e-01,
## 2.1386e-01, 8.2392e-02, -2.6491e-01, -2.1224e-01, -1.3293e-01,
## 1.4738e-01, -1.4192e-01, 1.8994e-01, -1.5587e-01, 1.0738e+00,
## 4.0789e-01, -2.7452e-01, -1.8431e-01, 6.8679e-04, -8.7115e-02,
## 1.9672e-01, 4.0918e-01, -3.5462e-01, -6.3260e-02, 4.4920e-01,
## -6.0568e-02, -4.1636e-02, 2.0531e-01, 1.7025e-02, -5.8448e-01,
## 7.5441e-02, 8.2116e-02, -4.6008e-01, 1.2393e-02, -2.5310e-02,
## 1.4177e-01, -9.2192e-02, 3.4505e-01, -5.2136e-01, 5.7304e-01,
## 1.1973e-02, 3.3196e-02, 2.9672e-01, -2.7899e-01, 1.9979e-01,
## 2.5666e-01, 8.2079e-02, -7.8436e-02, 9.3719e-02, 2.4202e-01,
## 1.3495e+00, -3.0434e-01, -3.0936e-01, 4.2047e-01, -7.9068e-02,
## -1.4819e-01, -8.9404e-02, 6.6800e-02, 2.2405e-01, 2.7226e-01,
## -3.5236e-02, 1.7688e-01, -5.3600e-02, 7.0031e-03, -3.3006e-02,
## -8.0021e-02, -2.4451e-01, -3.9174e-02, -1.6236e-01, -9.6652e-02],
## dtype=float32)
## 2
One issue I see with many guides that use GloVe is that they do not do anything to account for padding tokens nor for unknown words. The function below remedies that by appending two dedicated vectors — one for unknown words and one for padding — to the embedding matrix:
# create vectors for "unknown" and "padding" and add to GloVe
def modify_GloVe(embeddings_dictionary):
    """Extend GloVe with rows for the unknown and padding tokens.

    Every 200-d vector gains a 201st "flag" component set to 0; the
    unknown token becomes a random 201-d vector and the padding token is
    all zeros with the flag set to 1.  Returns a float tensor of shape
    (vocab + 2, 201): unknown at index -2, padding at index -1.
    """
    # random row for out-of-vocabulary words (-0.14..0.14 ~ var of GloVe 200d)
    unknown_row = np.random.uniform(-0.14, 0.14, 201)
    # all-zero row whose final flag component is 1 marks padding
    pad_row = np.append(np.repeat(0, 200), 1)
    # existing vectors, each widened with a trailing 0 flag
    widened = [np.append(vec, 0) for vec in embeddings_dictionary.values()]
    # stack everything into one matrix: GloVe rows, then unknown, then pad
    matrix = np.vstack(widened + [unknown_row, pad_row])
    return torch.Tensor(matrix)
# modify GloVe and turn into torch tensor
embeddings_tensor = modify_GloVe(embeddings_dictionary)
# check shape
# expect (400000 + 2, 201): unknown token at index 400000, padding at 400001
print(embeddings_tensor.shape)
## torch.Size([400002, 201])
With GloVe loaded, we need to tokenize our corpus into GloVe tokens. This is because when we feed our tokens into the model, it uses an embedding
layer that acts as a look-up table. We will eventually specify that this look-up table be the object we just created, embeddings_tensor
, so that when a token is fed into our model, the embedding
layer will “look-up” the 200 dimension feature for that word inside the embeddings_tensor
object and append the features to our batch undergoing forward propagation.
# convert strings to GloVe familiar tokens
def text_to_GloVe_tokens(df, embeddings_dictionary):
    """Map each document in df['body'] to a list of GloVe row indices.

    Words absent from the embedding vocabulary are mapped to the
    "unknown" row -- the first row appended after the vocabulary by
    modify_GloVe (index ``len(embeddings_dictionary)``, i.e. 400000 for
    the full GloVe 6B set; the original hard-coded that constant) --
    and are also collected in ``no_matches`` for inspection.

    Returns:
        (no_matches, glove_tokenized_data): the list of OOV words (with
        repeats) and the list of token-id lists, one per document.
    """
    no_matches = []
    glove_tokenized_data = []
    # word -> row index, in the embedding dictionary's insertion order
    word_map = {word: idx for idx, word in enumerate(embeddings_dictionary)}
    # generalized: the unknown row always sits directly after the vocabulary
    unknown_idx = len(word_map)
    for doc in df['body']:
        tokens = []
        for word in doc.split():
            # single lookup instead of the original `in` test + get()
            idx = word_map.get(word)
            if idx is None:
                # out-of-vocabulary: record it and fall back to the unknown row
                no_matches.append(word)
                idx = unknown_idx
            tokens.append(idx)
        glove_tokenized_data.append(tokens)
    return no_matches, glove_tokenized_data
With our corpus tokenized to match GloVe, it is in our interest to know just how many words in our corpus have no embedding features. The code below determines that for us.
# get a list of no matches and our GloVe tokens
no_matches, glove_tokenized_data = text_to_GloVe_tokens(df, embeddings_dictionary)
# after removing rare words, how many words are we not accounting for now?
print(len(set(no_matches)))
## 1495
Our next challenge is managing the lengths of our messages as they all need to be equal. The function below receives the GloVe tokenized data and a specified max length. It then proceeds to check the size of each item in the corpus and then either truncates or adds our new specialized padding token to the end of the message.
max_len
is a hyperparameter that we can experiment with, however, I have chosen the 95th percentile of my corpus’ word_count
as my desired max length.
# post pad GloVe
def pad_GloVe(tokenized_data, max_len, pad_idx=400001):
    """Truncate or right-pad every token list to exactly ``max_len``.

    ``pad_idx`` defaults to 400001, the padding row appended to GloVe by
    modify_GloVe (the original hard-coded it).  Unlike the original,
    input lists are no longer mutated in place (the old version extended
    the caller's lists when padding).

    Returns an int64 array of shape (len(tokenized_data), max_len).
    """
    padded_tokens = []
    for tokenized_sent in tokenized_data:
        if len(tokenized_sent) >= max_len:
            # too long (or exact): keep the first max_len tokens
            padded_tokens.append(tokenized_sent[:max_len])
        else:
            # too short: append padding tokens on a *copy* of the list
            shortfall = max_len - len(tokenized_sent)
            padded_tokens.append(tokenized_sent + [pad_idx] * shortfall)
    return np.array(padded_tokens, dtype=np.int64)
# get new padded tokens
padded_GloVe = pad_GloVe(glove_tokenized_data, percentile_95)
# check shape; 9994 documents, 974 length
# NOTE(review): the comment above says 9994 but the printed shape shows 10041 rows -- verify
print(padded_GloVe.shape)
# check to make sure padding done right
## (10041, 974)
## []
With the corpus work out of the way, we now proceed to prepare our data for analysis in PyTorch. The code below creates a TensorDataset
comprised of our features, padded GloVe tokens, and our labels. It then proceeds to split the data sets into train, validation, and test sets.
# prepare tensor data sets
def prepare_dataset(padded_tokens, target):
    """Wrap features/labels in a TensorDataset and split 80/10/10.

    Fixes two defects in the original: split sizes are now derived from
    the data passed in (not the module-level ``df``), and the
    validation/test split sums correctly for both even and odd
    validation sizes (the old ``[test_size, test_size + 1]`` raised a
    ValueError whenever the validation partition was even).

    Returns (train_dataset, val_dataset, test_dataset).
    """
    # labels as an int64 column vector for CrossEntropyLoss
    target = np.array(target.values, dtype=np.int64).reshape(-1, 1)
    tensor_df = TensorDataset(torch.from_numpy(padded_tokens), torch.from_numpy(target))
    # 80% train; the remainder is split roughly in half between val and test
    train_size = int(0.8 * len(tensor_df))
    val_size = len(tensor_df) - train_size
    test_size = val_size // 2
    # randomly partition into train vs. the held-out pool
    train_dataset, val_dataset = random_split(tensor_df, [train_size, val_size])
    # then partition the pool into validation and test
    val_dataset, test_dataset = random_split(val_dataset, [val_size - test_size, test_size])
    return train_dataset, val_dataset, test_dataset
# create tensor data sets from the padded tokens and their labels
train_dataset, val_dataset, test_dataset = prepare_dataset(padded_GloVe, df['target'])
Since my corpus is imbalanced, I produce weighted samplers to help balance the distribution of data as it is fed outside of my data loaders.
# helper function to count target distribution inside tensor data sets
def target_count(tensor_dataset):
    """Count the 0 and 1 labels in a (features, label) TensorDataset.

    Returns a 1-D tensor [count_of_0, count_of_1].
    """
    # each item is a (features, label) pair; the label is a one-element tensor
    labels = [item[1].item() for item in tensor_dataset]
    zeros = labels.count(0)
    ones = labels.count(1)
    return torch.tensor([zeros, ones])
# prepare weighted sampling for imbalanced classification
def create_sampler(target_tensor, tensor_dataset):
    """Build a WeightedRandomSampler that rebalances a binary dataset.

    ``target_tensor`` is the per-class count tensor from target_count();
    the original ignored this parameter and recomputed the counts,
    traversing the whole dataset twice.  Each observation is weighted by
    the inverse frequency of its class; sampling is with replacement.
    """
    # per-class counts come from the caller -- no second dataset pass
    class_sample_count = target_tensor
    # inverse class frequency
    weight = 1. / class_sample_count.float()
    # one scalar weight per observation, keyed by its label (.item() keeps
    # the result a clean 1-D float tensor rather than a tensor of tensors)
    samples_weight = torch.tensor([weight[t[1]].item() for t in tensor_dataset])
    return torch.utils.data.WeightedRandomSampler(weights=samples_weight,
                                                  num_samples=len(samples_weight),
                                                  replacement=True)
# create samplers for just training
# (validation/test loaders keep the natural class distribution)
train_sampler = create_sampler(target_count(train_dataset), train_dataset)
As you might have guessed, preparing data loaders for each of our train, dev, and test data sets is our next task.
# create DataLoaders with samplers
# NOTE: shuffle must stay False when a sampler is supplied (they are mutually exclusive)
train_dataloader = DataLoader(train_dataset,
batch_size=80,
sampler=train_sampler,
shuffle=False)
valid_dataloader = DataLoader(val_dataset,
batch_size=80,
shuffle=True)
test_dataloader = DataLoader(test_dataset,
batch_size=80,
shuffle=True)
We can check to see how our sampler is working by running the loop below. As we can see, the data loader is outputting relatively balanced data into each batch.
# lets check class balance for each batch to see how the sampler is working
for i, (x, y) in enumerate(train_dataloader):
    # inspect only the first 10 batches; `i < 10` replaces the original
    # unidiomatic `i in range(0, 10)` membership test, and we stop early
    # instead of silently draining the rest of the loader
    if i >= 10:
        break
    print("batch index {}, 0/1: {}/{}".format(
        i, (y == 0).sum(), (y == 1).sum()))
## batch index 0, 0/1: 45/35
## batch index 1, 0/1: 40/40
## batch index 2, 0/1: 36/44
## batch index 3, 0/1: 44/36
## batch index 4, 0/1: 43/37
## batch index 5, 0/1: 41/39
## batch index 6, 0/1: 46/34
## batch index 7, 0/1: 46/34
## batch index 8, 0/1: 37/43
## batch index 9, 0/1: 41/39
Next, we build a Kim Yoon (2014) CNN designed to use GloVe embeddings.
# Build Kim Yoon CNN
class KimCNN(nn.Module):
    """Kim (2014) CNN for sentence classification over word embeddings.

    Three parallel Conv2d branches with kernel heights 3/4/5 words slide
    over the (sent_len, embedding_dim) plane, are max-pooled over time,
    concatenated, and fed through dropout into one linear classifier.

    config must provide: output_channel, num_classes, vocab_size,
    embedding_dim, pre_embed, mode ('rand' | 'static' | 'non-static' |
    'multichannel'), dropout, padding_idx.
    """

    def __init__(self, config):
        super().__init__()
        output_channel = config.output_channel  # number of kernels per conv
        num_classes = config.num_classes        # number of targets to predict
        vocab_size = config.vocab_size          # vocab size of corpus
        embedding_dim = config.embedding_dim    # embedding width (201 here)
        pre_embed = config.pre_embed            # pretrained embedding matrix
        self.mode = config.mode                 # embedding strategy
        ks = 3                                  # three conv branches
        dropout = config.dropout
        padding = config.padding_idx            # index of the padding row
        # a single embedding channel (multichannel mode raises this to 2)
        input_channel = 1
        if config.mode == 'rand':
            # randomly initialized, trainable embeddings
            rand_embed_init = torch.Tensor(vocab_size, embedding_dim).uniform_(-0.25, 0.25)
            self.embed = nn.Embedding.from_pretrained(rand_embed_init, freeze=False)
        elif config.mode == 'static':
            self.static_embed = nn.Embedding.from_pretrained(pre_embed,
                                                             freeze=True,
                                                             padding_idx=padding)
        elif config.mode == 'non-static':
            self.non_static_embed = nn.Embedding.from_pretrained(pre_embed,
                                                                 freeze=False,
                                                                 padding_idx=padding)
        elif config.mode == 'multichannel':
            # frozen + trainable copies of the embeddings as two input channels
            self.static_embed = nn.Embedding.from_pretrained(pre_embed,
                                                             freeze=True,
                                                             padding_idx=padding)
            self.non_static_embed = nn.Embedding.from_pretrained(pre_embed,
                                                                 freeze=False,
                                                                 padding_idx=padding)
            input_channel = 2
        else:
            # was: print + bare `raise Exception`; a typed error carrying the
            # offending value is more debuggable and still caught by callers
            raise ValueError(f"Unsupported mode: {config.mode!r}")
        # each conv spans (n words) x (full embedding width); the height
        # padding lets kernels hang over the sequence edges
        self.conv1 = nn.Conv2d(input_channel, output_channel, (3, embedding_dim), padding=(2, 0))
        self.conv2 = nn.Conv2d(input_channel, output_channel, (4, embedding_dim), padding=(3, 0))
        self.conv3 = nn.Conv2d(input_channel, output_channel, (5, embedding_dim), padding=(4, 0))
        self.dropout = nn.Dropout(dropout)
        # 3 conv branches * output_channel pooled features -> class logits
        self.fc1 = nn.Linear(ks * output_channel, num_classes)

    def forward(self, x, **kwargs):
        """Map a batch of token-id sequences to class logits.

        x: LongTensor (batch, sent_len) of embedding row indices.
        Returns: FloatTensor (batch, num_classes).
        """
        if self.mode == 'rand':
            word_input = self.embed(x)       # (batch, sent_len, embed_dim)
            x = word_input.unsqueeze(1)      # (batch, 1, sent_len, embed_dim)
        elif self.mode == 'static':
            x = self.static_embed(x).unsqueeze(1)
        elif self.mode == 'non-static':
            x = self.non_static_embed(x).unsqueeze(1)
        elif self.mode == 'multichannel':
            non_static_input = self.non_static_embed(x)
            static_input = self.static_embed(x)
            # (batch, 2, sent_len, embed_dim)
            x = torch.stack([non_static_input, static_input], dim=1)
        else:
            raise ValueError(f"Unsupported mode: {self.mode!r}")
        # each conv collapses the embedding axis; squeeze -> (batch, out_ch, ~sent_len)
        x = [F.relu(self.conv1(x)).squeeze(3), F.relu(self.conv2(x)).squeeze(3), F.relu(self.conv3(x)).squeeze(3)]
        # max-over-time pooling -> (batch, out_ch) per branch
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        # concat branches -> (batch, out_ch * 3)
        x = torch.cat(x, 1)
        x = self.dropout(x)
        # class logits: (batch, num_classes)
        return self.fc1(x)
Below we instantiate a helper function for time keeping.
# time function
def format_time(elapsed):
    """Render a duration in seconds as an h:mm:ss string."""
    # timedelta's str() already prints h:mm:ss once we round to whole seconds
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))
Now, we prepare functions to train, validate, and test our data.
def train(model, dataloader, optimizer, criterion):
    """Run one training epoch with mixed precision; report loss and F1.

    Relies on module-level globals: ``epoch``/``epochs`` (progress
    display), ``scaler`` (AMP GradScaler), ``scheduler`` (LR schedule)
    and ``training_stats`` (results accumulator).  Batches are moved to
    the GPU, so CUDA is required.
    """
    total_t0 = time.time()
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')
    # reset running totals for this epoch
    train_total_loss = 0
    total_train_f1 = 0
    model.train()
    for step, batch in enumerate(dataloader):
        # progress update every 40 batches
        if step % 40 == 0 and not step == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(dataloader)))
        # batch holds two tensors: [0] input ids, [1] labels
        b_input_ids = batch[0].cuda()
        b_labels = batch[1].cuda().long()
        # clear previously calculated gradients
        optimizer.zero_grad()
        with autocast():
            # forward propagation under mixed precision
            logits = model(b_input_ids)
            # cross entropy over (batch, 2) logits vs flat labels
            loss = criterion(logits.view(-1, 2), b_labels.view(-1))
        # loss is a single-value tensor; accumulate for the epoch average
        train_total_loss += loss.item()
        # scaled backward so fp16 gradients do not underflow; step() first
        # unscales and skips optimizer.step() on inf/NaN gradients
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # advance the learning-rate schedule
        scheduler.step()
        _, predicted = torch.max(logits, 1)
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy()
        # FIX: sklearn metrics take (y_true, y_pred) -- the original passed
        # the predictions first, which distorts the weighted average
        total_train_f1 += f1_score(y_true, predicted,
                                   average='weighted',
                                   labels=np.unique(y_true))
    # epoch averages over all batches
    avg_train_loss = train_total_loss / len(dataloader)
    avg_train_f1 = total_train_f1 / len(dataloader)
    training_stats.append(
        {
            'Train Loss': avg_train_loss,
            'Train F1': avg_train_f1
        }
    )
    training_time = format_time(time.time() - total_t0)
    print("")
    print("summary results")
    print("epoch | trn loss | trn f1 | trn time ")
    print(f"{epoch+1:5d} | {avg_train_loss:.5f} | {avg_train_f1:.5f} | {training_time:}")
    torch.cuda.empty_cache()
    return None
def validating(model, dataloader, criterion):
    """Evaluate on the validation set and append metrics to valid_stats.

    Uses globals ``epoch`` (display) and ``valid_stats`` (accumulator),
    and publishes the global ``avg_val_f1`` for external consumers.
    CUDA is required.
    """
    total_t0 = time.time()
    print("")
    print("Running Validation...")
    model.eval()
    # running metric totals, averaged over batches at the end
    total_valid_accuracy = 0
    total_valid_loss = 0
    total_valid_f1 = 0
    total_valid_recall = 0
    total_valid_precision = 0
    for batch in dataloader:
        b_input_ids = batch[0].cuda()
        b_labels = batch[1].cuda().long()
        # no gradients needed for evaluation
        with torch.no_grad():
            logits = model(b_input_ids)
            # cross entropy loss over the batch
            loss = criterion(logits.view(-1, 2), b_labels.view(-1))
        _, predicted = torch.max(logits, 1)
        total_valid_loss += loss.item()
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy()
        # FIX: sklearn metrics take (y_true, y_pred); the original swapped
        # them, which silently exchanges precision and recall and skews
        # the support-weighted averages
        total_valid_f1 += f1_score(y_true, predicted,
                                   average='weighted',
                                   labels=np.unique(y_true))
        total_valid_accuracy += accuracy_score(y_true, predicted)
        total_valid_precision += precision_score(y_true, predicted,
                                                 average='weighted',
                                                 labels=np.unique(y_true))
        total_valid_recall += recall_score(y_true, predicted,
                                           average='weighted',
                                           labels=np.unique(y_true))
    avg_accuracy = total_valid_accuracy / len(dataloader)
    # published globally (e.g. for an optuna objective)
    global avg_val_f1
    avg_val_f1 = total_valid_f1 / len(dataloader)
    avg_precision = total_valid_precision / len(dataloader)
    avg_recall = total_valid_recall / len(dataloader)
    avg_val_loss = total_valid_loss / len(dataloader)
    valid_stats.append(
        {
            'Val Loss': avg_val_loss,
            'Val Accur.': avg_accuracy,
            'Val precision': avg_precision,
            'Val recall': avg_recall,
            'Val F1': avg_val_f1
        }
    )
    training_time = format_time(time.time() - total_t0)
    print("")
    print("summary results")
    print("epoch | val loss | val f1 | val time")
    print(f"{epoch+1:5d} | {avg_val_loss:.5f} | {avg_val_f1:.5f} | {training_time:}")
    return None
def testing(model, dataloader, criterion):
    """Evaluate the trained model on the held-out test set.

    Appends a metrics dict to the global ``test_stats``.  CUDA required.
    """
    print("")
    print("Running Testing...")
    model.eval()
    # running metric totals, averaged over batches at the end
    total_test_accuracy = 0
    total_test_loss = 0
    total_test_f1 = 0
    total_test_recall = 0
    total_test_precision = 0
    for step, batch in enumerate(dataloader):
        # progress update every 40 batches
        if step % 40 == 0 and not step == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(dataloader)))
        b_input_ids = batch[0].cuda()
        b_labels = batch[1].cuda().long()
        # no gradients needed for evaluation
        with torch.no_grad():
            logits = model(b_input_ids)
            loss = criterion(logits.view(-1, 2), b_labels.view(-1))
        _, predicted = torch.max(logits, 1)
        total_test_loss += loss.item()
        predicted = predicted.detach().cpu().numpy()
        y_true = b_labels.detach().cpu().numpy()
        # FIX: (y_true, y_pred) argument order, matching train()/validating()
        total_test_f1 += f1_score(y_true, predicted,
                                  average='weighted',
                                  labels=np.unique(y_true))
        total_test_accuracy += accuracy_score(y_true, predicted)
        total_test_precision += precision_score(y_true, predicted,
                                                average='weighted',
                                                labels=np.unique(y_true))
        total_test_recall += recall_score(y_true, predicted,
                                          average='weighted',
                                          labels=np.unique(y_true))
    avg_accuracy = total_test_accuracy / len(dataloader)
    avg_test_f1 = total_test_f1 / len(dataloader)
    avg_precision = total_test_precision / len(dataloader)
    avg_recall = total_test_recall / len(dataloader)
    avg_test_loss = total_test_loss / len(dataloader)
    test_stats.append(
        {
            'Test Loss': avg_test_loss,
            'Test Accur.': avg_accuracy,
            'Test precision': avg_precision,
            'Test recall': avg_recall,
            'Test F1': avg_test_f1
        }
    )
    return None
In order to use our CNN, we need to specify a config class that sets a number of hyperparameters that the class is expecting.
# instantiate model config -- set ex-post from optuna search
class config:
    """Hyperparameter bundle for KimCNN (values set ex-post from optuna).

    The original assigned onto the *class* (``config.x = ...``) inside
    __init__, so every instantiation silently mutated state shared by
    all instances; instance attributes are used instead.  Reads the
    module-level ``embeddings_tensor`` and ``vocab`` objects.
    """
    def __init__(self):
        self.pre_embed = embeddings_tensor  # GloVe vectors
        self.mode = 'static'                # keep the embeddings frozen
        self.num_classes = 2                # binary target
        self.output_channel = 300           # number of kernels
        self.embedding_dim = 201            # GloVe 200d + 1 flag dimension
        self.vocab_size = len(vocab) + 2    # corpus vocab plus unknown/padding
        self.dropout = 0.1                  # dropout value
        self.padding_idx = 400001           # padding row in embeddings_tensor
# create config
config1 = config()
# instantiate model - attach to GPU (requires a CUDA device)
model = KimCNN(config1).cuda()
Now we are almost ready to train. A few other preparatory objects are created like the loss criteria, epochs, the optimizer, and our optimizer scheduler.
# set loss
# cross entropy over the two class logits
criterion = nn.CrossEntropyLoss()
# set number of epochs
epochs = 5
# set optimizer
# NOTE(review): this AdamW comes from `transformers` (see imports), which has
# deprecated it in favor of torch.optim.AdamW -- confirm before upgrading
optimizer = AdamW(model.parameters(),
lr=0.0009978734977728082,
weight_decay=0.5
)
# set LR scheduler
# linear decay with no warmup across every training step
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps=0,
num_training_steps=total_steps)
# create gradient scaler for mixed precision
scaler = GradScaler()
Finally we are ready to train. Two containers are created to store the results of each training and validation epoch
# create training result storage
training_stats = []
valid_stats = []
best_valid_loss = float('inf')
# for each epoch
for epoch in range(epochs):
    # train
    train(model, train_dataloader, optimizer, criterion)
    # validate
    validating(model, valid_dataloader, criterion)
    # check validation loss; checkpoint whenever it improves
    if valid_stats[epoch]['Val Loss'] < best_valid_loss:
        best_valid_loss = valid_stats[epoch]['Val Loss']
        # save best model for use later
        torch.save(model.state_dict(), 'cnn-model1.pt')
##
## ======== Epoch 1 / 5 ========
## Training...
## Batch 40 of 101.
## Batch 80 of 101.
##
## summary results
## epoch | trn loss | trn f1 | trn time
## 1 | 0.36045 | 0.85398 | 0:00:07
##
## Running Validation...
##
## summary results
## epoch | val loss | val f1 | val time
## 1 | 0.34221 | 0.84473 | 0:00:00
##
## ======== Epoch 2 / 5 ========
## Training...
## Batch 40 of 101.
## Batch 80 of 101.
##
## summary results
## epoch | trn loss | trn f1 | trn time
## 2 | 0.23643 | 0.90830 | 0:00:05
##
## Running Validation...
##
## summary results
## epoch | val loss | val f1 | val time
## 2 | 0.30264 | 0.86857 | 0:00:00
##
## ======== Epoch 3 / 5 ========
## Training...
## Batch 40 of 101.
## Batch 80 of 101.
##
## summary results
## epoch | trn loss | trn f1 | trn time
## 3 | 0.17739 | 0.93776 | 0:00:06
##
## Running Validation...
##
## summary results
## epoch | val loss | val f1 | val time
## 3 | 0.25956 | 0.88933 | 0:00:00
##
## ======== Epoch 4 / 5 ========
## Training...
## Batch 40 of 101.
## Batch 80 of 101.
##
## summary results
## epoch | trn loss | trn f1 | trn time
## 4 | 0.13432 | 0.96171 | 0:00:05
##
## Running Validation...
##
## summary results
## epoch | val loss | val f1 | val time
## 4 | 0.24878 | 0.89241 | 0:00:00
##
## ======== Epoch 5 / 5 ========
## Training...
## Batch 40 of 101.
## Batch 80 of 101.
##
## summary results
## epoch | trn loss | trn f1 | trn time
## 5 | 0.10477 | 0.97135 | 0:00:05
##
## Running Validation...
##
## summary results
## epoch | val loss | val f1 | val time
## 5 | 0.25085 | 0.88790 | 0:00:00
##
## C:\Users\Andrew\Anaconda3\envs\my_ml\lib\site-packages\torch\optim\lr_scheduler.py:123: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
## "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
After training, we organize the results nicely in pandas
.
# organize results
# 'display.precision' is the fully-qualified option name; the bare
# 'precision' alias was removed in pandas 2.x
pd.set_option('display.precision', 3)
df_train_stats = pd.DataFrame(data=training_stats)
df_valid_stats = pd.DataFrame(data=valid_stats)
# epoch-wise train and validation metrics side by side, indexed from 1
df_stats = pd.concat([df_train_stats, df_valid_stats], axis=1)
df_stats.insert(0, 'Epoch', range(1, len(df_stats)+1))
df_stats = df_stats.set_index('Epoch')
df_stats
## Train Loss Train F1 Val Loss ... Val precision Val recall Val F1
## Epoch ...
## 1 0.360 0.854 0.342 ... 0.867 0.854 0.845
## 2 0.236 0.908 0.303 ... 0.881 0.875 0.869
## 3 0.177 0.938 0.260 ... 0.893 0.892 0.889
## 4 0.134 0.962 0.249 ... 0.895 0.895 0.892
## 5 0.105 0.971 0.251 ... 0.892 0.892 0.888
##
## [5 rows x 7 columns]
Then we plot our results like so:
def plot_results(df):
    """Plot training vs. validation loss per epoch.

    df : DataFrame indexed by epoch with 'Train Loss' and 'Val Loss'
         columns (as built in the "organize results" step above).
    """
    # styling from seaborn.
    sns.set(style='darkgrid')
    # increase the plot size and font size.
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12, 6)
    # plot the learning curves from the *parameter* -- the original read the
    # global df_stats and silently ignored the df argument.
    plt.plot(df['Train Loss'], 'b-o', label="Training")
    plt.plot(df['Val Loss'], 'g-o', label="Validation")
    # Label the plot.
    plt.title("Training & Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    # tick every epoch actually present instead of the hard-coded [1, 2, 3, 4]
    # (which dropped epoch 5 from the axis)
    plt.xticks(list(df.index))
    return plt.show()
plot_results(df_stats)
And lastly, we run our final test:
## <All keys matched successfully>
##
## Running Testing...
## Test Loss Test Accur. Test precision Test recall Test F1
## 0 0.266 0.872 0.874 0.872 0.868
Pretty good results for a novel data set made from scratch with an “outdated” model.
There are some other tasks that we might want to do, like inference, which uses what we know to predict what we do not. So let's say that we have two new messages but no labels (see below). We can tokenize these strings like all of our other data and generate on-the-fly predictions about their inferred labels.
# inference
def infer_class(model, string_df, min_len=percentile_95):
    """Predict the class label for a single cleaned message.

    model     : trained KimCNN (already on the GPU)
    string_df : DataFrame with one cleaned message in its 'body' column
    min_len   : pad sequences shorter than this many tokens
                (defaults to the corpus 95th-percentile length)

    Returns the predicted class index as a plain int.
    """
    # tokenize the string into GloVe look-up indices
    no_matches, glove_tokenized_data = text_to_GloVe_tokens(string_df, embeddings_dictionary)
    # pad short sequences up to min_len -- the original padded to the global
    # percentile_95, silently ignoring a caller-supplied min_len
    if len(glove_tokenized_data[0]) < min_len:
        glove_tokenized_data = pad_GloVe(glove_tokenized_data, min_len)
    tensor = torch.LongTensor(glove_tokenized_data[0]).cuda().unsqueeze(0)
    # evaluation mode + no autograd for inference (disables dropout, saves memory)
    model.eval()
    with torch.no_grad():
        logits = model(tensor)
    _, predicted = torch.max(logits, 1)
    # return the label rather than `return print(...)`, which returned None
    # (the cause of the stray "## None" lines in the transcript below)
    return predicted.item()
# Two unlabeled example messages to demonstrate on-the-fly inference:
#   str_US -- a message about US policy/withdrawal (expected label 1)
#   str_AF -- an insurgent battlefield claim (expected label 0)
str_US = ['You will no longer be sent to an eternal war, President Donald Trump told a gathering marking the graduation of the US military academy. You will no longer fight in countries whose names are not known to most Americans. You will no longer be involved in the wars of the ancient nations. Clearly, Trump is referring to the long-running war in Afghanistan. The United States has been fighting the for the past two decades; it suffered a high number of casualties along with enormous financial losses. Many times, senior US officials have acknowledged that they cannot win the war in Afghanistan. Trump has told the truth, that his troops will no longer be involved in the wars of the ancient nations, because the never-ending war in Afghanistan has severely damaged America is reputation on the international level and caused the country extreme economic hardships. Recent surveys show that the support in the United States for this lost war has plummeted, and the people have now realized that the American leaders made false promises about this war. Nearly 20 years ago, the founder of the Islamic Emirate, the late Amir al- Mu aminin Mullah Mohammad Omar, warned the Americans to give up the intention of occupying Afghanistan.']
str_AF = ['On 15 June, the soldiers of the puppet regime came to carry out operations in Sheikhano area , Tagab District, Kapisa Province. The mujahideen retaliated severely: 17 offensive soldiers of the puppet were killed in the operation; many of their corpses are lying on the battlefield; and many others were wounded.']
# wrap the raw string in a one-row DataFrame and clean it exactly like the
# training data before handing it to infer_class
temp_df_US = pd.DataFrame({'body': str_US})
temp_df_US = clean_df(temp_df_US)
print(infer_class(model, temp_df_US)) # 1 = US = Correct
## 1
## None
# repeat for the second message
temp_df_AF = pd.DataFrame({'body': str_AF})
temp_df_AF = clean_df(temp_df_AF)
print(infer_class(model, temp_df_AF)) # 0 = Kabul = Correct
## 0
## None
The code below shows how we can use state-of-the-art pruning and search algorithms to improve our model’s performance through hyperparameter selection.
# optuna -- tune hyperparameters
# create gradient scaler for mixed precision (one instance shared by all trials)
scaler = GradScaler()
# module-level containers and epoch count read by objective() below
training_stats = []
valid_stats = []
epochs = 5
def objective(trial):
    """Optuna objective for tuning KimCNN.

    Samples the output-channel count, dropout rate, and learning rate;
    trains a fresh model for `epochs` epochs with mixed precision; reports
    the per-epoch average validation loss so the pruner can stop
    unpromising trials early; and returns the final epoch's average
    validation loss (the study minimizes this value).
    """
    # sample hyperparameters for this trial
    kernel_num = trial.suggest_int('output_channel', low=600, high=1500, step=50)
    dropout_num = trial.suggest_float('dropout', low=0.1, high=0.5, step=0.05)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1e-3)
    config1 = config()
    config1.output_channel = kernel_num
    config1.dropout = dropout_num
    # data loaders (the sampler controls training order, so shuffle stays False)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=80,
                                  sampler=train_sampler,
                                  shuffle=False)
    valid_dataloader = DataLoader(val_dataset,
                                  batch_size=80,
                                  shuffle=True)
    # instantiate a fresh model per trial so trials do not share weights
    model = KimCNN(config1).cuda()
    # set optimizer and loss
    optimizer = AdamW(model.parameters(),
                      lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()
    # linear warmup/decay LR schedule spanning all training steps
    total_steps = len(train_dataloader) * epochs
    # NOTE(review): `scheduler` and `epoch` are deliberately kept global to
    # preserve the original script's module-level side effects
    global scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    global epoch
    for epoch in range(epochs):
        # per-epoch accumulators
        train_total_loss = 0
        total_train_f1 = 0
        # put model into training mode (enables dropout)
        model.train()
        # for each batch of training data...
        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].cuda()
            b_labels = batch[1].cuda().type(torch.cuda.FloatTensor)
            optimizer.zero_grad()
            # mixed-precision forward pass
            with autocast():
                logits = model(b_input_ids)
                loss = criterion(logits, b_labels)
            train_total_loss += loss.item()
            # scaled backward + optimizer step, then advance the LR schedule
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
        # validation
        model.eval()
        total_valid_loss = 0
        total_valid_f1 = 0
        # evaluate data for one epoch
        for batch in valid_dataloader:
            b_input_ids = batch[0].cuda()
            b_labels = batch[1].cuda().type(torch.cuda.FloatTensor)
            with torch.no_grad():
                logits = model(b_input_ids)
                loss = criterion(logits, b_labels)
            total_valid_loss += loss.item()
            # generate hard predictions from the logits
            rounded_preds = torch.round(torch.sigmoid(logits))
            # move predictions and labels to CPU
            rounded_preds = rounded_preds.detach().cpu().numpy()
            y_true = b_labels.detach().cpu().numpy()
            # calculate f1 -- FIXED argument order: sklearn's signature is
            # f1_score(y_true, y_pred); the original passed predictions
            # first, which skews the weighted average
            total_valid_f1 += f1_score(y_true, rounded_preds,
                                       average='weighted',
                                       labels=np.unique(rounded_preds))
        avg_val_f1 = total_valid_f1 / len(valid_dataloader)  # tracked for inspection only; not reported
        avg_val_loss = total_valid_loss / len(valid_dataloader)
        trial.report(avg_val_loss, epoch)
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return avg_val_loss
# Build the Hyperband pruner first, then the study that minimizes the
# validation loss returned by objective().
hyperband_pruner = optuna.pruners.HyperbandPruner(
    min_resource=1,
    max_resource=5,
    reduction_factor=3,
)
study = optuna.create_study(direction="minimize", pruner=hyperband_pruner)
# run 35 trials of the objective defined above
study.optimize(objective, n_trials=35)
# Summarize the study: how many trials finished, were pruned, or completed,
# then show the best trial's value and sampled hyperparameters.
state_pruned = optuna.trial.TrialState.PRUNED
state_complete = optuna.trial.TrialState.COMPLETE
all_trials = study.trials
pruned_trials = [t for t in all_trials if t.state == state_pruned]
complete_trials = [t for t in all_trials if t.state == state_complete]
print("Study statistics: ")
print("  Number of finished trials: ", len(all_trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
Pennington, Jeffrey, Richard Socher, and Christopher D. Manning. “Glove: Global vectors for word representation.” In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp. 1532-1543. 2014. GloVe https://nlp.stanford.edu/projects/glove/
Kim, Yoon. “Convolutional neural networks for sentence classification.” arXiv preprint arXiv:1408.5882 (2014).