import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as la
from sklearn.decomposition import PCA
import urllib.request
import PIL
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data
from torch.utils.data import TensorDataset, DataLoader
from sklearn.decomposition import FastICA
from tqdm import tqdm
import medmnist
from medmnist import INFO, Evaluator
import scipy
import IPython
10 Unsupervised learning
10.1 PCA
Let $X$ be a random vector with mean $0$ and variance matrix $\Sigma$. Write the eigenvalue decomposition of the covariance as $\Sigma = V \Lambda V^t$, where $\Lambda = \mathrm{diag}(\lambda_1 \geq \lambda_2 \geq \ldots \geq \lambda_p)$ holds the eigenvalues and the columns $v_j$ of $V$ are the eigenvectors, with $V^t V = V V^t = I$.

Notice:

- $V^t \Sigma V = \Lambda$ (i.e. $V$ diagonalizes $\Sigma$)
- $\mathrm{tr}(\Sigma) = \mathrm{tr}(\Lambda) = \sum_j \lambda_j$ (i.e. the total variability is the sum of the eigenvalues)
- Since $V^t V = I$, $V$ is a rotation matrix. Thus, $V$ rotates $X$ in such a way as to maximize variability in the first dimension, then the second dimension, and so on.
- Another representation of $\Sigma$ is $\sum_j \lambda_j v_j v_j^t$, obtained by simply rewriting the matrix algebra of $V \Lambda V^t$.
- The variables $U = X V$ then: have uncorrelated elements ($\mathrm{Cov}(U_j, U_k) = 0$ for $j \neq k$, by the diagonalization property), have the same total variability as the elements of $X$ ($\sum_j \mathrm{Var}(U_j) = \sum_j \mathrm{Var}(X_j)$, since the trace is preserved), are a rotation of the $X$, and are ordered so that $U_1$ has the greatest amount of variability and so on.

Notation:

- The $\lambda_j$ are simply called the eigenvalues or principal component variances.
- $U$ is called the matrix of principal component scores.
- The $v_j$ are called the principal component loadings or weights, with $v_1$ being called the first principal component and so on.

Statistical properties, under the additional assumption that $X$ is multivariate normal: the $U_j$ are normally distributed with mean $0$ and variance $\lambda_j$, and $U_j$ is independent of $U_k$ if $j \neq k$.
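To make these properties concrete, here is a small numerical check (an illustration on a made-up covariance matrix): $V$ diagonalizes $\Sigma$, the trace equals the sum of the eigenvalues, and the scores $U = XV$ are empirically uncorrelated with variances close to the $\lambda_j$.

## A minimal numerical check of the properties above (illustration only)
rng = np.random.default_rng(0)
Sigma_check = np.array([[2.0, 0.8, 0.3],
                        [0.8, 1.5, 0.4],
                        [0.3, 0.4, 1.0]])
lam, V = la.eigh(Sigma_check)              ## eigenvalues ascending, eigenvectors as columns

## V diagonalizes Sigma: V^t Sigma V = Lambda
print(np.round(np.transpose(V) @ Sigma_check @ V, 6))

## Total variability is preserved: tr(Sigma) equals the sum of the eigenvalues
print(np.trace(Sigma_check), lam.sum())

## Scores U = X V have (approximately) uncorrelated columns with variances lambda_j
X_check = rng.multivariate_normal(mean = np.zeros(3), cov = Sigma_check, size = 100000)
U_check = X_check @ V
print(np.round(np.cov(U_check, rowvar = False), 2))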
10.1.1 Sample PCA
Of course, we're describing PCA as a conceptual, population-level process. We realize it with data by plugging in sample estimates: demean the columns of the $n \times p$ data matrix $X$, form the sample covariance $\hat\Sigma = \frac{1}{n-1} X^t X$, and take its eigenvalue decomposition $\hat\Sigma = \hat V \hat\Lambda \hat V^t$. The sample eigenvalues, loadings, and scores are then defined exactly as above with the estimates plugged in.

We can relate PCA to the SVD as follows. Let $X = \tilde U D \hat V^t$ be the singular value decomposition of the demeaned data matrix. Then $\hat\Sigma = \frac{1}{n-1} \hat V D^2 \hat V^t$, so the right singular vectors are exactly the sample loadings, the sample eigenvalues are $\hat\lambda_j = d_j^2 / (n-1)$, and the scores are $X \hat V = \tilde U D$.
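Here is a quick numerical check of that relationship on a small made-up matrix (an illustration only): the squared singular values divided by $n-1$ match the sample eigenvalues, and the right singular vectors match the loadings up to sign and ordering.

## Sketch: the SVD of the demeaned data reproduces sample PCA (illustration only)
rng = np.random.default_rng(1)
X_demo = rng.normal(size = (50, 4))
X_demo = X_demo - X_demo.mean(0)

## Eigendecomposition of the sample covariance
lam_demo, V_demo = la.eigh(np.transpose(X_demo) @ X_demo / (X_demo.shape[0] - 1))

## SVD of the demeaned data
U_svd, d_svd, Vt_svd = la.svd(X_demo, full_matrices = False)

## Squared singular values / (n - 1) equal the sample eigenvalues
print(np.round(np.sort(d_svd ** 2 / (X_demo.shape[0] - 1)), 6))
print(np.round(np.sort(lam_demo), 6))

## Up to sign and ordering, the right singular vectors equal the loadings,
## so this product is (approximately) a permutation matrix
print(np.round(np.abs(Vt_svd @ V_demo), 2))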
10.1.2 PCA with a large dimension
Consider the case where one of $n$ or $p$ is much larger than the other, say $p \gg n$ as with our vectorized images. Forming and decomposing the $p \times p$ matrix $\hat\Sigma$ directly is wasteful, since the demeaned data matrix has rank at most $n - 1$ and therefore at most $n - 1$ nonzero eigenvalues. In that setting one can instead decompose the much smaller $n \times n$ matrix $X X^t / (n-1)$, which has the same nonzero eigenvalues, and recover the loadings by multiplying its eigenvectors by $X^t$ (equivalently, just use a truncated SVD of $X$), as sketched below.
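Here is a minimal sketch of that idea on a made-up wide matrix (illustration only).

## Sketch: sample PCA via the smaller n x n matrix when p >> n (illustration only)
rng = np.random.default_rng(2)
n_small, p_big = 20, 500
X_wide = rng.normal(size = (n_small, p_big))
X_wide = X_wide - X_wide.mean(0)

## Eigendecomposition of the small n x n matrix X X^t / (n - 1)
lam_small, U_small = la.eigh(X_wide @ np.transpose(X_wide) / (n_small - 1))

## Its nonzero eigenvalues match those of the p x p sample covariance
lam_big = la.eigvalsh(np.transpose(X_wide) @ X_wide / (n_small - 1))
print(np.round(np.sort(lam_big)[-5:], 6))
print(np.round(np.sort(lam_small)[-5:], 6))

## Recover the top 5 loading vectors from the corresponding eigenvectors of X X^t
V_top = np.transpose(X_wide) @ U_small[:, -5:]
V_top = V_top / la.norm(V_top, axis = 0)
print(V_top.shape)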
10.1.3 Simple example
n = 1000
mu = (0, 0)
Sigma = np.matrix([[1, .5], [.5, 1]])
X = np.random.multivariate_normal( mean = mu, cov = Sigma, size = n)
plt.scatter(X[:,0], X[:,1])
X = X - X.mean(0)
print(X.mean(0))
Sigma_hat = np.matmul(np.transpose(X), X) / (n-1)
Sigma_hat
[-6.88338275e-18 -2.02060590e-17]
array([[1.07377785, 0.50162145],
[0.50162145, 1.01008348]])
evd = la.eig(Sigma_hat)
lambda_ = evd[0]
v_hat = evd[1]
u_hat = np.matmul(X, v_hat)
plt.scatter(u_hat[:,0], u_hat[:,1])
Fit using scikit-learn's PCA function:

pca = PCA(n_components = 2).fit(X)
print(pca.explained_variance_)
print(lambda_)
[1.54456206 0.53929927]
[1.54456206 0.53929927]
10.1.4 Example
Let's consider the melanoma dataset that we looked at before. First, we read in the data as we have done before, so we don't show that code.
Next, let’s get the data from the torch dataloader format back into an image array and a matrix with the image part (28, 28, 3) vectorized.
def loader_to_array(dataloader):
    ## Read one batch to get the data dimensions
    test_input, test_target = next(iter(dataloader))
    ## Total number of training images
    n = np.sum([inputs.shape[0] for inputs, targets in dataloader])
    ## The dimensions of the images
    imgdim = (test_input.shape[2], test_input.shape[3])
    images = np.empty( (n, imgdim[0], imgdim[1], 3))

    ## Read the data from the data loader into our numpy array
    idx = 0
    for inputs, targets in dataloader:
        inputs = inputs.detach().numpy()
        for j in range(inputs.shape[0]):
            img = inputs[j,:,:,:]
            ## get it out of pytorch (channels first) format
            img = np.transpose(img, (1, 2, 0))
            images[idx,:,:,:] = img
            idx += 1
    matrix = images.reshape(n, 3 * np.prod(imgdim))
    return images, matrix

train_images, train_matrix = loader_to_array(train_loader)
test_images, test_matrix = loader_to_array(test_loader)

## Demean the matrices
train_mean = train_matrix.mean(0)
train_matrix = train_matrix - train_mean
test_mean = test_matrix.mean(0)
test_matrix = test_matrix - test_mean
Now let's actually perform PCA using scikit-learn. We'll plot the eigenvalues divided by their sum, $\lambda_j / \sum_k \lambda_k$, which is the proportion of the total variance explained by each component.
from sklearn.decomposition import PCA
n_comp = 10
pca = PCA(n_components = n_comp).fit(train_matrix)
plt.plot(pca.explained_variance_ratio_)
Often this is done by plotting the cumulative sum, so that you can visualize how much variance is explained by including the top $k$ components.
plt.plot(np.cumsum(pca.explained_variance_ratio_))
Note that the weights from the eigenvectors, i.e. the principal component loadings, are returned by the fit as pca.components_. Since our records are vectorized mole images, we'll call them eigen-moles.

eigen_moles = pca.components_
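Since each loading vector lives in the same vectorized 28 x 28 x 3 space as the images, it can be reshaped and displayed. The sketch below is an illustration of one way to do that; the loadings are rescaled to [0, 1] only so that imshow will accept them.

## Sketch: view the principal component loadings as images (illustration only)
plt.figure(figsize = (10, 5))
for i in range(n_comp):
    comp = eigen_moles[i, :].reshape(28, 28, 3)
    ## rescale to [0, 1] since loadings can be negative
    comp = (comp - comp.min()) / (comp.max() - comp.min())
    plt.subplot(2, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(comp)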
Let's project our testing data onto the principal component basis created by our training data and see how it does. Let $V$ denote the $k \times p$ matrix whose rows are the loadings estimated on the training data. The fitted (reconstructed) test data is $\hat X_{test} = X_{test} V^t V$, applied to the demeaned test matrix. Notice that $V^t V$ is a projection matrix, since the rows of $V$ are orthonormal, so this is simply the projection of each test record onto the span of the training loadings; in scikit-learn it is computed (up to the mean handling) by transform followed by inverse_transform.
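To see that this is exactly what scikit-learn computes, here is a quick equivalence check using the objects already defined above (an illustration; pca.mean_ is essentially zero here since the training matrix was demeaned before fitting).

## Sketch: transform followed by inverse_transform is projection onto the loadings
V = pca.components_                                       ## k x p loading matrix
explicit = (test_matrix - pca.mean_) @ np.transpose(V) @ V + pca.mean_
sklearn_fit = pca.inverse_transform(pca.transform(test_matrix))
print(np.max(np.abs(explicit - sklearn_fit)))             ## should be numerically 0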
Let’s try this on our mole data.
test_matrix_fit = pca.inverse_transform(pca.transform(test_matrix))
np.mean(np.abs( test_matrix - test_matrix_fit))
0.03792391028744076
10.2 ICA
ICA, independent components analysis, tries to find linear transformations of the data that are statistically independent. Usually, independence is an assumption in ICA, not actually embedded in the loss function.
Let $X$ be our observed data, and posit the model $X = S M^t$, where the columns of $S$ are statistically independent (and non-Gaussian) sources and $M$ is an invertible mixing matrix. The goal is to estimate an unmixing matrix $W = M^{-1}$ so that $X W^t$ recovers the sources, up to order and scale.

One way to characterize the estimation problem is to parameterize the unmixing matrix, $W$, and optimize a criterion measuring how independent, or equivalently how non-Gaussian, the unmixed components are. The most popular approaches try to find the directions in which the data are as non-Gaussian as possible; by central limit theorem style reasoning, mixtures of independent sources tend to look more Gaussian than the sources themselves, so maximizing non-Gaussianity undoes the mixing.

One form of ICA maximizes the kurtosis. If a component has been standardized to mean $0$ and variance $1$, its excess kurtosis is $E[U^4] - 3$, which is $0$ for Gaussian variables, so directions with large absolute kurtosis are maximally non-Gaussian. FastICA, used below, typically optimizes a smooth approximation to negentropy rather than the raw kurtosis. Statistical versions of ICA don't require such moment-based contrasts; they instead assume working (non-Gaussian) densities for the sources and maximize the resulting likelihood.
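As a small numerical illustration of the non-Gaussianity idea (a sketch on simulated data, not part of the mole analysis): independent uniform sources have clearly negative excess kurtosis, while linear mixtures of them are closer to Gaussian, i.e. have excess kurtosis nearer zero.

## Sketch: mixing independent (non-Gaussian) sources pushes them toward Gaussianity
from scipy.stats import kurtosis

rng = np.random.default_rng(3)
s_demo = rng.uniform(-1, 1, size = (10000, 2))        ## independent uniform sources
A_demo = np.array([[.7, .3], [.4, .6]])               ## a made-up mixing matrix
x_demo = s_demo @ np.transpose(A_demo)                ## observed mixtures

print(kurtosis(s_demo, axis = 0))                     ## clearly non-Gaussian (about -1.2)
print(kurtosis(x_demo, axis = 0))                     ## mixtures: closer to 0 (Gaussian)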
10.2.1 Example
Consider an example that PCA would have somewhat of a hard time with. In this case, our data is from a mixture of normals with half from a normal with a strong positive correlation and half with a strong negative correlation. Because the angle between the two is not 90 degrees PCA has no chance. No rotation of the axes satisfies the obvious structure in this data.
n = 1000

Sigma = np.matrix([[4, 1.8], [1.8, 1]])
a = np.random.multivariate_normal( mean = [0, 0], cov = Sigma, size = int(n/2))
Sigma = np.matrix([[4, -1.8], [-1.8, 1]])
b = np.random.multivariate_normal( mean = [0, 0], cov = Sigma, size = int(n/2))
x = np.append( a, b, axis = 0)
plt.scatter(x[:,0], x[:,1])
plt.xlim([-6, 6])
plt.ylim([-6, 6])
Let's try fast ICA. Notice it comes much closer than PCA could to recovering the structure we'd like to discover: it pulls apart the two components to a fair degree. Also note that ICA has a random starting point, so I get fairly different fits over re-runs of the algorithm. I had to lower the tolerance to get a good fit.

Independent components are order invariant and sign invariant.
transformer = FastICA(tol = 1e-7)
icafit = transformer.fit(x)
s = icafit.transform(x)
plt.scatter(s[:,0], s[:,1])
plt.xlim( [s.min(), s.max()])
plt.ylim( [s.min(), s.max()])
/home/bcaffo/miniconda3/envs/ds4bio/lib/python3.10/site-packages/sklearn/decomposition/_fastica.py:488: FutureWarning:
From version 1.3 whiten='unit-variance' will be used by default.
/home/bcaffo/miniconda3/envs/ds4bio/lib/python3.10/site-packages/sklearn/decomposition/_fastica.py:120: ConvergenceWarning:
FastICA did not converge. Consider increasing tolerance or the maximum number of iterations.
10.2.2 Cocktail party example
The classic ICA problem is the so-called cocktail party problem. In this, you have several microphones placed around a room in which several people are speaking at once. Each microphone records a different linear mixture of the speakers, and the goal is to separate out (unmix) the individual conversations from the recordings. Here we mix three audio clips ourselves and then try to unmix them.
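The audio files used below may not be available to every reader, so here is a self-contained synthetic version of the same idea (made-up sine, sawtooth, and square wave sources) before turning to the real audio clips.

## Sketch: a synthetic cocktail party problem (illustration only)
from scipy import signal

t = np.linspace(0, 8, 4000)
src = np.c_[np.sin(2 * t),                    ## source 1: sinusoid
            signal.sawtooth(3 * t),           ## source 2: sawtooth
            np.sign(np.sin(5 * t))]           ## source 3: square wave
mix = np.array([[.7, .2, .1], [.1, .7, .2], [.2, .1, .7]])
obs = src @ np.transpose(mix)                 ## each column is one "microphone"

unmixed = FastICA(n_components = 3, whiten = 'unit-variance', tol = 1e-7).fit_transform(obs)
print(unmixed.shape)                          ## recovered sources, up to order and sign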
import audio2numpy as a2n
s1, i1 = a2n.audio_from_file("mp3s/4.mp3")
s2, i2 = a2n.audio_from_file("mp3s/2.mp3")
s3, i3 = a2n.audio_from_file("mp3s/6.mp3")

## Get everything to be the same shape and average the two audio channels
n = np.min((s1.shape[0], s2.shape[0], s3.shape[0]))
s1 = s1[0:n,:].mean(axis = 1)
s2 = s2[0:n,:].mean(axis = 1)
s3 = s3[0:n,:].mean(axis = 1)

s = np.matrix([s1, s2, s3])
Mix the signals.
w = np.matrix( [ [.7, .2, .1], [.1, .7, .2], [.2, .1, .7] ])
x = np.transpose(np.matmul(w, s))
Here’s an example mixed signal
IPython.display.Audio(data = x[:,1].reshape(n), rate = i1)
Now try to unmix using fastICA
transformer = FastICA(whiten=True, tol = 1e-7)
icafit = transformer.fit(x)
/home/bcaffo/miniconda3/envs/ds4bio/lib/python3.10/site-packages/sklearn/decomposition/_fastica.py:673: FutureWarning:
From version 1.3 whiten=True should be specified as whiten='arbitrary-variance' (its current behaviour). This behavior is deprecated in 1.1 and will raise ValueError in 1.3.
/home/bcaffo/miniconda3/envs/ds4bio/lib/python3.10/site-packages/sklearn/utils/validation.py:727: FutureWarning:
np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html
icafit.mixing_
array([[-12.70609981, -51.16254236, 16.57481899],
[-44.0705694 , -7.49682052, 31.47133465],
[ -5.51913194, -14.14746356, 108.62751866]])
Unmixing matrix
icafit.components_
array([[ 0.00166493, -0.02401024, 0.00670215],
[-0.02080964, 0.00581294, 0.0014911 ],
[-0.00262562, -0.00046284, 0.00974049]])
Here’s a scatterplot matrix where the real component is on the rows and the estimated component is on the columns.
hat_s = np.transpose(icafit.transform(x))

plt.figure(figsize=(10,10))
for i in range(3):
    for j in range(3):
        plt.subplot(3, 3, (3 * i + j) + 1)
        plt.scatter(hat_s[i,:].squeeze(), np.asarray(s)[j,:])
/home/bcaffo/miniconda3/envs/ds4bio/lib/python3.10/site-packages/sklearn/utils/validation.py:727: FutureWarning:
np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html
We can now play the estimated sources and see how they turned out.
from scipy.io.wavfile import write

i = 0
data = (hat_s[i,:].reshape(n) / np.max(np.abs(hat_s[i,:]))) * .5
IPython.display.Audio(data = data, rate = i1)

i = 1
data = (hat_s[i,:].reshape(n) / np.max(np.abs(hat_s[i,:]))) * .5
IPython.display.Audio(data = data, rate = i1)

i = 2
data = (hat_s[i,:].reshape(n) / np.max(np.abs(hat_s[i,:]))) * .5
IPython.display.Audio(data = data, rate = i1)
10.2.3 Imaging example using ICA
Let's see what we get for the images. Logically, one would consider voxels as mixed sources and images as the iid replications. But then the sources would not be images. Let's try the other dimension and see what we get when subject images are treated as mixtures of source images. This is analogous to finding a source basis of subject images.
This is often done in ICA where people transpose matrices to investigate different problems.
transformer = FastICA(n_components=10, random_state=0, whiten='unit-variance', tol = 1e-7)
icafit = transformer.fit_transform(np.transpose(train_matrix))
icafit.shape
/home/bcaffo/miniconda3/envs/ds4bio/lib/python3.10/site-packages/sklearn/decomposition/_fastica.py:120: ConvergenceWarning:
FastICA did not converge. Consider increasing tolerance or the maximum number of iterations.
(2352, 10)
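Each column of icafit is a source image living in the same vectorized 28 x 28 x 3 space as the training images, so one way to look at the fit (a sketch, analogous to plotting the eigen-moles above) is to reshape each source and rescale it for display.

## Sketch: display the ICA source images (illustration only)
plt.figure(figsize = (10, 5))
for i in range(10):
    src_img = icafit[:, i].reshape(28, 28, 3)
    ## rescale to [0, 1] since the sources are not constrained to the pixel range
    src_img = (src_img - src_img.min()) / (src_img.max() - src_img.min())
    plt.subplot(2, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(src_img)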
10.3 Autoencoders
Autoencoders are another unsupervised learning technique. Autoencoders take in a record and spit out a prediction of the same size. The goal is to represent the records with a NN. In an undercomplete autoencoder, the model is regularized by the embedding (middle) layer having a much lower dimension than the input. In this way, an autoencoder is a dimension reduction technique, reducing the input dimension down to a much lower size (the encoder) and then expanding it back out to the original size (the decoder). We can represent the autoencoder with a network diagram as below.
Let $X_i$ denote record $i$, let $e$ denote the encoding network and $d$ the decoding network. The model is fit by minimizing a reconstruction loss such as $\sum_{i=1}^n \| X_i - d(e(X_i)) \|^2$ over the weights. This sort of network is called an autoencoder. Notice that the same data is the input and output of the network. This kind of learning is called unsupervised, since we're not trying to use $X$ to predict a separate outcome $Y$.
Notice that overfitting concerns are somewhat different in this network construction. If this model fits well, then it's suggesting that 2 numbers can explain 8; that is, the network will have reduced the inputs to only two dimensions, which we could then visualize, for example. That is a form of parsimony that prevents overfitting. The middle layer is called the embedding, because an autoencoder is a form of non-linear embedding of our data into a lower dimensional space. A minimal version of such a network is sketched below.
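To make this concrete, here is a minimal fully connected sketch matching the 8-input, 2-dimensional-embedding description above (a toy illustration with made-up data, not the model we fit later).

## Sketch: a minimal 8 -> 2 -> 8 autoencoder (toy illustration)
class ToyAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(8, 2), nn.ReLU())
        self.decoder = nn.Linear(2, 8)
    def forward(self, x):
        return self.decoder(self.encoder(x))

toy = ToyAutoencoder()
x_toy = torch.randn(16, 8)                       ## a made-up batch of 16 records
loss_toy = F.mse_loss(toy(x_toy), x_toy)         ## the input is also the target
print(loss_toy)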
There’s nothing to prevent us from having convolutional layers if the inputs are images. That’s what we’ll work on here. For convolutional autoencoders, it’s typical to increase the number of channels and decrease the image sizes as one works through the network.
10.3.1 PCA and autoencoders
Without modification, autoencoders can be constructed that span the same space as PCA/SVD (Plaut 2018). Enforcing the orthogonality of the PCA solution would require something like adding Lagrange terms to the loss function. There's no reason why you would do this, since PCA is well developed and works just fine. However, it does suggest why NNs are such a large class of models.
Let $X$ be the demeaned $n \times p$ data matrix and let $V_k$ be the $p \times k$ matrix whose columns are the first $k$ principal component loadings, so that the rank-$k$ PCA fit is $\hat X = X V_k V_k^t$.

Therefore, consider a neural network that specifies that the first layer defines the embedding $H = X W_1$, where $W_1$ is $p \times k$. That is, it has a linear activation function with no bias term and weights $W_1$. The second layer defines the output $\hat X = H W_2$, where $W_2$ is $k \times p$. Again, this is a linear activation function with weights $W_2$ and no bias term. Taking $W_1 = V_k$ and $W_2 = V_k^t$ yields exactly the PCA fit.

Finally, we see that a two layer linear autoencoder, without the orthogonality constraints, contains the PCA fit as a special case and spans the same space as the PCA fit. Similarly, we see that such a two layer encoder is overspecified, as most NNs are, since $W_1$ and $W_2$ are only identified up to an invertible $k \times k$ transformation.
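Here is a small sketch of that equivalence on made-up data: a two layer linear autoencoder with no bias terms, trained with squared error loss, approaches the reconstruction error of the rank-$k$ PCA fit.

## Sketch: a two layer linear autoencoder spans the PCA fit (illustration only)
torch.manual_seed(0)
Xt = torch.randn(500, 8) @ torch.randn(8, 8)          ## made-up correlated data
Xt = Xt - Xt.mean(0)

k = 2
linear_ae = nn.Sequential(nn.Linear(8, k, bias = False),
                          nn.Linear(k, 8, bias = False))
opt = torch.optim.Adam(linear_ae.parameters(), lr = 0.01)
for _ in range(2000):
    opt.zero_grad()
    loss = F.mse_loss(linear_ae(Xt), Xt)
    loss.backward()
    opt.step()

## Compare with the rank-k PCA reconstruction error
pca_k = PCA(n_components = k).fit(Xt.numpy())
pca_recon = pca_k.inverse_transform(pca_k.transform(Xt.numpy()))
print(loss.item(), np.mean((Xt.numpy() - pca_recon) ** 2))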
10.3.2 Example on dermamnist
First, let’s set up our autoencoder by defining a python class then initializing it.
kernel_size = 5

class autoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size)
        self.conv2 = nn.Conv2d(6, 12, kernel_size)
        self.pool = nn.MaxPool2d(2, 2)
        self.iconv1 = nn.ConvTranspose2d(12, 6, kernel_size+1, stride = 2)
        self.iconv2 = nn.ConvTranspose2d(6, 3, kernel_size+1, stride = 2)
    def encode(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        return x
    def decode(self, x):
        x = F.relu(self.iconv1(x))
        ## Use the sigmoid as the final layer
        ## since we've normalized pixel values to be between 0 and 1
        x = torch.sigmoid(self.iconv2(x))
        return(x)
    def forward(self, x):
        return self.decode(self.encode(x))

autoencoder = autoencoder()
We can try out the (untrained) autoencoder by pushing a batch of data through the encoder, the decoder, and the full forward pass, checking the shape at each step.
## Here's some example data obtained by grabbing one batch
tryItOut, _ = next(iter(train_loader))
print(tryItOut.shape)

## Let's encode that data
encoded = autoencoder.encode(tryItOut)
print(encoded.shape)

## Now let's decode the encoded data
decoded = autoencoder.decode(encoded)
print(decoded.shape)

## Now let's run the whole thing through
fedForward = autoencoder.forward(tryItOut)
print(fedForward.shape)
torch.Size([128, 3, 28, 28])
torch.Size([128, 12, 4, 4])
torch.Size([128, 3, 28, 28])
torch.Size([128, 3, 28, 28])
test = fedForward.detach().numpy()

## Plot out the first 5 images, note this isn't very interesting, since
## all of the weights haven't been trained
plt.figure(figsize=(10,5))
for i in range(5):
    plt.subplot(1, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    img = np.transpose(test[i,:,:,:], (1, 2, 0))
    plt.imshow(img)
#Optimizer
optimizer = torch.optim.Adam(autoencoder.parameters(), lr = 0.001)

#Epochs
n_epochs = 20

autoencoder.train()
for epoch in range(n_epochs):
    for data, _ in train_loader:
        images = data
        optimizer.zero_grad()
        outputs = autoencoder.forward(images)
        loss = F.mse_loss(outputs, images)
        loss.backward()
        optimizer.step()
## the data from the last iteration is called images
trainSample = images.detach().numpy()

plt.figure(figsize=(10,5))
for i in range(5):
    plt.subplot(1, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    img = np.transpose(trainSample[i,:,:,:], (1, 2, 0))
    plt.imshow(img)

## the output from the last iteration (fed forward through the network) is called outputs
trainOutput = outputs.detach().numpy()

plt.figure(figsize=(10,5))
for i in range(5):
    plt.subplot(2, 5, i+6)
    plt.xticks([])
    plt.yticks([])
    img = np.transpose(trainOutput[i,:,:,:], (1, 2, 0))
    plt.imshow(img)
On a test batch
test_batch, _ = next(iter(test_loader))
x_test = test_batch.detach().numpy()
testSample = autoencoder.forward(test_batch).detach().numpy()

plt.figure(figsize=(10,4))

## Plot the original data
for i in range(5):
    plt.subplot(2, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    img = np.transpose(x_test[i,:,:,:], (1, 2, 0))
    plt.imshow(img)

## Plot the data having been run through the convolutional autoencoder
for i in range(5):
    plt.subplot(2, 5, i + 6)
    plt.xticks([])
    plt.yticks([])
    img = np.transpose(testSample[i,:,:,:], (1, 2, 0))
    plt.imshow(img)