-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add supporting code for visualizing graphs
File contains additional code provided by Udacity necessary to visualize the graphs in the project
- Loading branch information
Showing
1 changed file
with
163 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
########################################### | ||
# Suppress matplotlib user warnings | ||
# Necessary for newer version of matplotlib | ||
import warnings | ||
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib") | ||
# | ||
# Display inline matplotlib plots with IPython | ||
from IPython import get_ipython | ||
get_ipython().run_line_magic('matplotlib', 'inline') | ||
########################################### | ||
|
||
import matplotlib.pyplot as plt | ||
import matplotlib.cm as cm | ||
import pandas as pd | ||
import numpy as np | ||
|
||
def pca_results(good_data, pca): | ||
''' | ||
Create a DataFrame of the PCA results | ||
Includes dimension feature weights and explained variance | ||
Visualizes the PCA results | ||
''' | ||
|
||
# Dimension indexing | ||
dimensions = dimensions = ['Dimension {}'.format(i) for i in range(1,len(pca.components_)+1)] | ||
|
||
# PCA components | ||
components = pd.DataFrame(np.round(pca.components_, 4), columns = good_data.keys()) | ||
components.index = dimensions | ||
|
||
# PCA explained variance | ||
ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1) | ||
variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance']) | ||
variance_ratios.index = dimensions | ||
|
||
# Create a bar plot visualization | ||
fig, ax = plt.subplots(figsize = (14,8)) | ||
|
||
# Plot the feature weights as a function of the components | ||
components.plot(ax = ax, kind = 'bar'); | ||
ax.set_ylabel("Feature Weights") | ||
ax.set_xticklabels(dimensions, rotation=0) | ||
|
||
|
||
# Display the explained variance ratios | ||
for i, ev in enumerate(pca.explained_variance_ratio_): | ||
ax.text(i-0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f"%(ev)) | ||
|
||
# Return a concatenated DataFrame | ||
return pd.concat([variance_ratios, components], axis = 1) | ||
|
||
def cluster_results(reduced_data, preds, centers, pca_samples): | ||
''' | ||
Visualizes the PCA-reduced cluster data in two dimensions | ||
Adds cues for cluster centers and student-selected sample data | ||
''' | ||
|
||
predictions = pd.DataFrame(preds, columns = ['Cluster']) | ||
plot_data = pd.concat([predictions, reduced_data], axis = 1) | ||
|
||
# Generate the cluster plot | ||
fig, ax = plt.subplots(figsize = (14,8)) | ||
|
||
# Color map | ||
cmap = cm.get_cmap('gist_rainbow') | ||
|
||
# Color the points based on assigned cluster | ||
for i, cluster in plot_data.groupby('Cluster'): | ||
cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ | ||
color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30); | ||
|
||
# Plot centers with indicators | ||
for i, c in enumerate(centers): | ||
ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \ | ||
alpha = 1, linewidth = 2, marker = 'o', s=200); | ||
ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100); | ||
|
||
# Plot transformed sample points | ||
ax.scatter(x = pca_samples[:,0], y = pca_samples[:,1], \ | ||
s = 150, linewidth = 4, color = 'black', marker = 'x'); | ||
|
||
# Set plot title | ||
ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\nTransformed Sample Data Marked by Black Cross"); | ||
|
||
|
||
def biplot(good_data, reduced_data, pca): | ||
''' | ||
Produce a biplot that shows a scatterplot of the reduced | ||
data and the projections of the original features. | ||
good_data: original data, before transformation. | ||
Needs to be a pandas dataframe with valid column names | ||
reduced_data: the reduced data (the first two dimensions are plotted) | ||
pca: pca object that contains the components_ attribute | ||
return: a matplotlib AxesSubplot object (for any additional customization) | ||
This procedure is inspired by the script: | ||
https://github.com/teddyroland/python-biplot | ||
''' | ||
|
||
fig, ax = plt.subplots(figsize = (14,8)) | ||
# scatterplot of the reduced data | ||
ax.scatter(x=reduced_data.loc[:, 'Dimension 1'], y=reduced_data.loc[:, 'Dimension 2'], | ||
facecolors='b', edgecolors='b', s=70, alpha=0.5) | ||
|
||
feature_vectors = pca.components_.T | ||
|
||
# we use scaling factors to make the arrows easier to see | ||
arrow_size, text_pos = 7.0, 8.0, | ||
|
||
# projections of the original features | ||
for i, v in enumerate(feature_vectors): | ||
ax.arrow(0, 0, arrow_size*v[0], arrow_size*v[1], | ||
head_width=0.2, head_length=0.2, linewidth=2, color='red') | ||
ax.text(v[0]*text_pos, v[1]*text_pos, good_data.columns[i], color='black', | ||
ha='center', va='center', fontsize=18) | ||
|
||
ax.set_xlabel("Dimension 1", fontsize=14) | ||
ax.set_ylabel("Dimension 2", fontsize=14) | ||
ax.set_title("PC plane with original feature projections.", fontsize=16); | ||
return ax | ||
|
||
|
||
def channel_results(reduced_data, outliers, pca_samples): | ||
''' | ||
Visualizes the PCA-reduced cluster data in two dimensions using the full dataset | ||
Data is labeled by "Channel" and cues added for student-selected sample data | ||
''' | ||
|
||
# Check that the dataset is loadable | ||
try: | ||
full_data = pd.read_csv("customers.csv") | ||
except: | ||
print "Dataset could not be loaded. Is the file missing?" | ||
return False | ||
|
||
# Create the Channel DataFrame | ||
channel = pd.DataFrame(full_data['Channel'], columns = ['Channel']) | ||
channel = channel.drop(channel.index[outliers]).reset_index(drop = True) | ||
labeled = pd.concat([reduced_data, channel], axis = 1) | ||
|
||
# Generate the cluster plot | ||
fig, ax = plt.subplots(figsize = (14,8)) | ||
|
||
# Color map | ||
cmap = cm.get_cmap('gist_rainbow') | ||
|
||
# Color the points based on assigned Channel | ||
labels = ['Hotel/Restaurant/Cafe', 'Retailer'] | ||
grouped = labeled.groupby('Channel') | ||
for i, channel in grouped: | ||
channel.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \ | ||
color = cmap((i-1)*1.0/2), label = labels[i-1], s=30); | ||
|
||
# Plot transformed sample points | ||
for i, sample in enumerate(pca_samples): | ||
ax.scatter(x = sample[0], y = sample[1], \ | ||
s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none'); | ||
ax.scatter(x = sample[0]+0.25, y = sample[1]+0.3, marker='$%d$'%(i), alpha = 1, s=125); | ||
|
||
# Set plot title | ||
ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled"); |