forked from williamtnguyen/housing-price-regression
-
Notifications
You must be signed in to change notification settings - Fork 0
/
principal_component_analysis.py
41 lines (31 loc) · 1.32 KB
/
principal_component_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import numpy as np
def get_eigenvalues(dataframe):
    """Print and return the explained-variance ratio of each principal component.

    Rows of ``dataframe`` are treated as observations and columns as
    features. The data is centered per column, its covariance matrix is
    computed, and the eigenvalues of that matrix (the variances along the
    principal axes) are normalised so that they sum to 1. Intermediate
    results and the final ratios are printed to stdout.

    Args:
        dataframe: Two-dimensional numeric table, e.g. a pandas DataFrame
            (a 2-D NumPy array works as well).

    Returns:
        A dict mapping the rank of each principal component (0 is the
        component with the largest variance) to its share of the total
        variance. Entries are inserted in descending order of that share.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix contains NaN or
            infinite values, e.g. because the input has missing values or
            fewer than two rows.
    """
    # calculate the mean of each column (feature)
    mean = np.mean(dataframe.T, axis=1)

    # center columns by subtracting column means
    centered_matrix = dataframe - mean
    print(f'Matrix after centering:\n {centered_matrix}')

    # calculate covariance matrix of centered matrix; np.cov squeezes its
    # result, so a single feature yields a 0-d array that must be promoted
    # back to a 1x1 matrix before the eigen-decomposition
    covariance_matrix = np.atleast_2d(np.cov(centered_matrix.T))
    print(f'Computed covariance matrix:\n {covariance_matrix}')

    # eigvalsh performs no finiteness check of its own; fail loudly instead
    # of silently propagating NaNs into the ratios
    if not np.all(np.isfinite(covariance_matrix)):
        raise np.linalg.LinAlgError('Array must not contain infs or NaNs')

    # A covariance matrix is symmetric, so use the symmetric solver: it
    # always returns real eigenvalues in ascending order. Reverse them so
    # that index 0 is the component with the largest variance.
    eigenvalues = np.linalg.eigvalsh(covariance_matrix)[::-1]
    print(f'COMPUTED EIGENVALUES:\n {eigenvalues}')

    # The eigenvalues of the covariance matrix already are the variances
    # along each principal axis. Tiny negative values can only come from
    # rounding (the matrix is positive semi-definite), so clamp them to 0.
    variances = np.clip(eigenvalues, 0.0, None)
    variance_ratios = variances / variances.sum()

    # keyed by rank, already in descending order of variance ratio
    sorted_pcs = {
        rank: float(ratio) for rank, ratio in enumerate(variance_ratios)
    }

    print('\n')
    # outputting the results
    for rank, ratio in sorted_pcs.items():
        print(f'Principal Component #{rank}: {ratio}')

    return sorted_pcs