import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from shared.step_by_step import StepByStep, RUNS_FOLDER_NAME
import platform
import datetime
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
plt.style.use('fivethirtyeight')
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc
Binary Classification
We’ll use Scikit-Learn’s make_moons to generate a toy dataset with 1000 data points and two features.
Data Generation
X, y = make_moons(n_samples=1000, noise=0.3, random_state=11)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=13)
We then use Scikit-Learn’s StandardScaler to standardize the features, fitting it on the training set only:
sc = StandardScaler()
# always fit only on X_train
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_val_scaled = sc.transform(X_val)  # DO NOT use fit or fit_transform on X_val, it causes data leakage

m = sc.mean_
v = sc.var_
print(m, v)
# sanity check: manual standardization (x - mean) / std matches StandardScaler's transform
assert np.allclose((X_train[0] - m) / np.sqrt(v), X_train_scaled[0])
[0.4866699 0.26184213] [0.80645937 0.32738853]
X_train = X_train_scaled
X_val = X_val_scaled
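As a quick sanity check (not part of the original code), the standardized training features should now have roughly zero mean and unit standard deviation:

# optional check: each standardized feature should be roughly N(0, 1)
print(X_train.mean(axis=0))  # expected to be close to [0, 0]
print(X_train.std(axis=0))   # expected to be close to [1, 1]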
from matplotlib.colors import ListedColormap
def figure1(X_train, y_train, X_val, y_val, cm_bright=None):
    if cm_bright is None:
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    ax[0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    ax[0].set_xlabel(r'$X_1$')
    ax[0].set_ylabel(r'$X_2$')
    ax[0].set_xlim([-2.3, 2.3])
    ax[0].set_ylim([-2.3, 2.3])
    ax[0].set_title('Generated Data - Train')

    ax[1].scatter(X_val[:, 0], X_val[:, 1], c=y_val, cmap=cm_bright)
    ax[1].set_xlabel(r'$X_1$')
    ax[1].set_ylabel(r'$X_2$')
    ax[1].set_xlim([-2.3, 2.3])
    ax[1].set_ylim([-2.3, 2.3])
    ax[1].set_title('Generated Data - Validation')

    fig.tight_layout()
    return fig

fig = figure1(X_train, y_train, X_val, y_val)
Data Preparation
Data preparation starts by converting the data points from NumPy arrays into PyTorch tensors and determining the available device to train on:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
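This post only needs CUDA or CPU, but on an Apple Silicon machine (with PyTorch 1.12+) a similar check can pick up the MPS backend; a minimal sketch, optional and not used below:

# optional variant: prefer CUDA, then Apple's MPS backend, then CPU
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'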
# Builds tensors from numpy arrays
x_train_tensor = torch.as_tensor(X_train).float()
y_train_tensor = torch.as_tensor(y_train.reshape(-1, 1)).float()  # reshape turns shape (800,) into (800, 1)

x_val_tensor = torch.as_tensor(X_val).float()
y_val_tensor = torch.as_tensor(y_val.reshape(-1, 1)).float()
train_data = TensorDataset(x_train_tensor, y_train_tensor)
val_data = TensorDataset(x_val_tensor, y_val_tensor)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)
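A quick sanity check (not in the original code): pull one mini-batch from the training loader and confirm the shapes are what the model expects, 64 points with two features and one label each:

# optional check: one mini-batch of features and labels
x_batch, y_batch = next(iter(train_loader))
print(x_batch.shape, y_batch.shape)  # torch.Size([64, 2]) torch.Size([64, 1])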
Linear model
torch.manual_seed(42)

lr = 0.01

model = nn.Sequential()
model.add_module('linear', nn.Linear(2, 1))
model.add_module('sigmoid', nn.Sigmoid())

optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.BCELoss()

sbs_lin = StepByStep(model, optimizer, loss_fn)
sbs_lin.set_loaders(train_loader, val_loader)
sbs_lin.train(100)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 42.69it/s]
_ = sbs_lin.plot_losses()
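A side note: nn.BCELoss expects probabilities, which is why the model ends with a Sigmoid. An equivalent and numerically more stable setup drops the Sigmoid from the model and uses nn.BCEWithLogitsLoss instead; a minimal sketch (the _logits names are just for illustration, this variant is not trained here):

# equivalent setup: the model outputs logits, the loss applies the sigmoid internally
model_logits = nn.Sequential()
model_logits.add_module('linear', nn.Linear(2, 1))
loss_fn_logits = nn.BCEWithLogitsLoss()
# to turn the model's output into probabilities you would apply torch.sigmoid() to it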
Let’s predict the values for X_train (y_train_predicted) and X_val (y_val_predicted) and plot them. Let’s also check how good a job the linear model did, using a confusion matrix:
def predict_plot_count(sbs):
    y_train_predicted = sbs.predict(X_train)
    y_val_predicted = sbs.predict(X_val)
    fig = figure1(X_train, y_train_predicted, X_val, y_val_predicted)
    print('Confusion matrix:')
    print(confusion_matrix(y_val, list(map(int, (y_val_predicted > 0.5).ravel()))))
    print('Correct categories:')
    print(sbs.loader_apply(sbs.val_loader, sbs.correct))

predict_plot_count(sbs_lin)
Confusion matrix:
[[82 14]
[14 90]]
Correct categories:
tensor([[ 82, 96],
[ 90, 104]])
and we see there are some false positives and false negatives (the off-diagonal elements of the confusion matrix). Each row of the "correct categories" tensor shows, for one class, the number of correctly classified validation points next to the class total (82 of 96 and 90 of 104).
Two-layer model
Let’s make a better model.
model_nonlin = nn.Sequential()
model_nonlin.add_module('linear1', nn.Linear(2, 10))
model_nonlin.add_module('relu', nn.ReLU())
model_nonlin.add_module('linear2', nn.Linear(10, 1))
model_nonlin.add_module('sigmoid', nn.Sigmoid())

optimizer = optim.Adam(model_nonlin.parameters(), lr=lr)

sbs_nonlin = StepByStep(model_nonlin, optimizer, loss_fn)
sbs_nonlin.set_loaders(train_loader, val_loader)
sbs_nonlin.train(100)
_ = sbs_nonlin.plot_losses()
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 50.74it/s]
predict_plot_count(sbs_nonlin)
Confusion matrix:
[[86 10]
[10 94]]
Correct categories:
tensor([[ 86, 96],
[ 94, 104]])
And this is better: fewer off-diagonal errors (we could also quantify it with precision and recall, as sketched below).
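A minimal sketch of that calculation for the two-layer model, assuming (as the confusion-matrix call above suggests) that predict returns the sigmoid probabilities as a NumPy array; the last line reuses the roc_curve and auc imports from the top of the post:

# precision and recall for the positive class (label 1), on the validation set
y_val_prob = sbs_nonlin.predict(X_val).ravel()
y_val_pred = (y_val_prob > 0.5).astype(int)
tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()
precision = tp / (tp + fp)  # of the points predicted positive, how many really are
recall = tp / (tp + fn)     # of the truly positive points, how many were caught
fpr, tpr, _ = roc_curve(y_val, y_val_prob)
print(f'precision={precision:.3f}, recall={recall:.3f}, ROC AUC={auc(fpr, tpr):.3f}')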
Three-layer model
model_nonlin2 = nn.Sequential(
    nn.Linear(2, 50),
    nn.ReLU(),
    nn.Linear(50, 20),
    nn.ReLU(),
    nn.Linear(20, 1),
    nn.Sigmoid()
)

optimizer = optim.Adam(model_nonlin2.parameters(), lr=lr)

sbs_nonlin2 = StepByStep(model_nonlin2, optimizer, loss_fn)
sbs_nonlin2.set_loaders(train_loader, val_loader)
sbs_nonlin2.train(200)
_ = sbs_nonlin2.plot_losses()
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:04<00:00, 45.80it/s]
predict_plot_count(sbs_nonlin2)
Confusion matrix:
[[86 10]
[11 93]]
Correct categories:
tensor([[ 86, 96],
[ 93, 104]])
Not much better than the two-layer model, and the flatness of val_loss suggests the model might be starting to overfit. But overall not bad, considering the noise. The parameter counts sketched below give a sense of how much extra capacity this model has.
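For context, a rough way to compare the capacities of the three models is to count trainable parameters; the helper below is just for illustration:

# hypothetical helper, only used to compare model sizes
def count_params(m):
    return sum(p.numel() for p in m.parameters() if p.requires_grad)

print(count_params(model))          # linear model: 2*1 + 1 = 3
print(count_params(model_nonlin))   # two-layer model: (2*10 + 10) + (10*1 + 1) = 41
print(count_params(model_nonlin2))  # three-layer model: (2*50 + 50) + (50*20 + 20) + (20*1 + 1) = 1191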
Trying SGD instead of Adam
model_nonlin2 = nn.Sequential(
    nn.Linear(2, 50),
    nn.ReLU(),
    nn.Linear(50, 20),
    nn.ReLU(),
    nn.Linear(20, 1),
    nn.Sigmoid()
)

optimizer = optim.SGD(model_nonlin2.parameters(), lr=lr)

sbs_nonlin3 = StepByStep(model_nonlin2, optimizer, loss_fn)
sbs_nonlin3.set_loaders(train_loader, val_loader)
sbs_nonlin3.train(200)
_ = sbs_nonlin3.plot_losses()
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:03<00:00, 51.42it/s]
predict_plot_count(sbs_nonlin3)
Confusion matrix:
[[83 13]
[13 91]]
Correct categories:
tensor([[ 83, 96],
[ 91, 104]])
So the Adam optimizer performs noticeably better than SGD here (180 vs. 174 correct validation points at the same learning rate and epoch budget). A variant worth trying, sketched below, is SGD with momentum.
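This follow-up is not run in this post, so no results are shown, but plain SGD often closes much of that gap once momentum is added or the learning rate is raised. A sketch of what that comparison could look like:

# hypothetical variant: SGD with momentum on a fresh copy of the three-layer model
model_nonlin4 = nn.Sequential(
    nn.Linear(2, 50), nn.ReLU(),
    nn.Linear(50, 20), nn.ReLU(),
    nn.Linear(20, 1), nn.Sigmoid()
)
optimizer = optim.SGD(model_nonlin4.parameters(), lr=0.1, momentum=0.9)

sbs_nonlin4 = StepByStep(model_nonlin4, optimizer, loss_fn)
sbs_nonlin4.set_loaders(train_loader, val_loader)
sbs_nonlin4.train(200)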