
Understanding the mathematics behind PCA

In [1]:

import numpy as np
import pandas as pd

Step 1: Let's take a dataset to explain the process

In [2]:

A = np.array([[1, 2, 3, 4],
              [5, 5, 6, 7],
              [1, 4, 2, 3],
              [5, 3, 2, 1],
              [8, 1, 2, 2]])

In [3]:
df = pd.DataFrame(A,columns = ['f1','f2','f3','f4'])
df
Out[3]:

   f1  f2  f3  f4
0   1   2   3   4
1   5   5   6   7
2   1   4   2   3
3   5   3   2   1
4   8   1   2   2

Step 2: Standardize the dataset

In [4]:
df_std = (df - df.mean()) / df.std()   # note: pandas .std() defaults to ddof=1 (sample standard deviation)
df_std
Out[4]:

         f1        f2        f3        f4
0 -1.000000 -0.632456  0.000000  0.260623
1  0.333333  1.264911  1.732051  1.563740
2 -1.000000  0.632456 -0.577350 -0.173749
3  0.333333  0.000000 -0.577350 -1.042493
4  1.333333 -1.264911 -0.577350 -0.608121
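
As a quick sanity check (an added sketch, not part of the original notebook), each standardized column should now have mean 0 and, because pandas' .std() uses the sample formula (ddof=1), a sample standard deviation of 1:

In [ ]:
# verify the standardization: column means ~0, sample standard deviations = 1
print(df_std.mean().round(10))
print(df_std.std())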

Step 3: Find the covariance matrix for the given dataset


There are two formulas for this:

- Sample formula: cov(x, y) = sum((x_i - mean(x)) * (y_i - mean(y))) / (N - 1)
- Population formula: cov(x, y) = sum((x_i - mean(x)) * (y_i - mean(y))) / N

Note: either formula can be used and the resulting principal components will be the same, because the two covariance matrices differ only by the constant factor N/(N - 1), which leaves the eigenvectors unchanged and merely rescales the eigenvalues.


Covariance population formula (divide by N)

(Because the data was standardized with the sample standard deviation, the population-formula variances on the diagonal come out as (N - 1)/N = 4/5 = 0.8 rather than 1.)

In [5]:
df_cov = np.cov(df_std.T, bias=1)   # bias=1 -> divide by N (population formula)
df_cov
Out[5]:
array([[ 0.8 , -0.25298221, 0.03849002, -0.14479075],
[-0.25298221, 0.8 , 0.51120772, 0.49449803],
[ 0.03849002, 0.51120772, 0.8 , 0.75235479],
[-0.14479075, 0.49449803, 0.75235479, 0.8 ]])

Covariance sample formula (divide by N-1)

In [6]:
cov_mat = np.cov(df_std.T, bias=0)   # bias=0 -> divide by N-1 (sample formula)
cov_mat
Out[6]:
array([[ 1. , -0.31622777, 0.04811252, -0.18098843],
[-0.31622777, 1. , 0.63900965, 0.61812254],
[ 0.04811252, 0.63900965, 1. , 0.94044349],
[-0.18098843, 0.61812254, 0.94044349, 1. ]])

In [9]:
## verify variance(f1) is as expected; df_std.f1 has zero mean,
## so its variance is just the sum of squares divided by N (or N-1)
print('var(f1) (population formula): ',((df_std.f1)**2).sum()/5)
print('var(f1) (sample formula): ',((df_std.f1)**2).sum()/4)

var(f1) (population formula):  0.8
var(f1) (sample formula):  1.0

In [11]:
## verify covariance(f1,f2) is as expected
print('covar(f1,f2) (population formula): ',((df_std.f1)*(df_std.f2)).sum()/5)
print('covar(f1,f2) (sample formula): ',((df_std.f1)*(df_std.f2)).sum()/4)

covar(f1,f2) (population formula):  -0.25298221281347033
covar(f1,f2) (sample formula):  -0.3162277660168379
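
The element-wise checks above generalize to the whole matrix: since every column of df_std has zero mean, the sample covariance matrix is simply the matrix product of the standardized data with its transpose, divided by N - 1 (an added verification sketch, not in the original notebook):

In [ ]:
X = np.array(df_std)
N = X.shape[0]
manual_cov = (X.T @ X) / (N - 1)   # sample formula; use / N for the population formula
print(np.allclose(manual_cov, cov_mat))   # expect True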

Step 4: Calculate the eigenvalues and eigenvectors of the covariance matrix


In [12]:
eigen_val, eigen_vectors = np.linalg.eig(cov_mat)

In [13]:
print(eigen_val)

[2.51579324 1.0652885 0.39388704 0.02503121]

In [14]:
print(eigen_vectors)

[[ 0.16195986 -0.91705888 -0.30707099  0.19616173]
 [-0.52404813  0.20692161 -0.81731886  0.12061043]
 [-0.58589647 -0.3205394   0.1882497  -0.72009851]
 [-0.59654663 -0.11593512  0.44973251  0.65454704]]
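
Each column of eigen_vectors is one eigenvector. To confirm that these really are eigenpairs of the covariance matrix, we can check the defining equation C v = λ v for each pair (an added sketch, not in the original notebook):

In [ ]:
for i in range(len(eigen_val)):
    v = eigen_vectors[:, i]                              # i-th eigenvector (column)
    print(np.allclose(cov_mat @ v, eigen_val[i] * v))    # expect True for every i
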
Step 5: Sort the eigenvalues and their corresponding eigenvectors

Since the eigenvalues are already sorted in descending order in our case, this step is not needed here; a general-purpose sort is sketched below.
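
For a dataset where the eigenvalues do not come back ordered, the sort might look like this (a minimal sketch, assuming the eigen_val and eigen_vectors variables defined above):

In [ ]:
order = np.argsort(eigen_val)[::-1]       # indices of the eigenvalues, largest first
eigen_val = eigen_val[order]
eigen_vectors = eigen_vectors[:, order]   # reorder the matching eigenvector columns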

In [75]:
n_components=3

Step 6: Pick the top n_components eigenvectors

In [76]:
top_eigen_vectors = eigen_vectors[:, :n_components]   # keep the first n_components columns

In [77]:

top_eigen_vectors
Out[77]:
array([[ 0.16195986, -0.91705888, -0.30707099],
[-0.52404813, 0.20692161, -0.81731886],
[-0.58589647, -0.3205394 , 0.1882497 ],
[-0.59654663, -0.11593512, 0.44973251]])

In [78]:
top_eigen_vectors.shape

Out[78]:
(4, 3)

In [79]:
np.array(df_std).shape
Out[79]:
(5, 4)

Step 7: Transform the data

df_std.shape x top_eigen_vectors.shape = transformed_data.shape

(5, 4) x (4, 3) = (5, 3)

In [80]:
transformed_data = np.matmul(np.array(df_std), top_eigen_vectors)   # (5,4) @ (4,3) -> (5,3)

In [85]:
pd.DataFrame(data=transformed_data,
             columns=['principal component ' + str(i+1) for i in range(n_components)])
Out[85]:

   principal component 1  principal component 2  principal component 3
0               0.014003               0.755975               0.941200
1              -2.556534              -0.780432              -0.106870
2              -0.051480               1.253135              -0.396673
3               1.014150               0.000239              -0.679886
4               1.579861              -1.228917               0.242230

In [82]:
transformed_data.shape

Out[82]:
(5, 3)

Now let's verify the result using the scikit-learn library. Note that some components may come back with flipped signs relative to the manual calculation: an eigenvector is only determined up to sign, so this is expected.


In [83]:
from sklearn.decomposition import PCA
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(df_std)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component ' + str(i+1) for i in range(n_components)])

In [84]:
principalDf
Out[84]:

   principal component 1  principal component 2  principal component 3
0              -0.014003               0.755975               0.941200
1               2.556534              -0.780432              -0.106870
2               0.051480               1.253135              -0.396673
3              -1.014150               0.000239              -0.679886
4              -1.579861              -1.228917               0.242230
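
As expected, sklearn's first principal component is the manual one with its sign flipped, and the others match exactly. As a final check (an added sketch, not in the original notebook), the projections agree up to sign, and sklearn's per-component explained variance equals the top eigenvalues of the sample covariance matrix:

In [ ]:
# projections match up to sign
print(np.allclose(np.abs(transformed_data), np.abs(principalComponents)))   # expect True
# sklearn's explained variance per component should equal the top eigenvalues
print(pca.explained_variance_)
print(eigen_val[:n_components])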
