From 82b057d909f7acb330261871e87d303b9a5ca266 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Tue, 16 Oct 2018 20:04:46 -0400 Subject: [PATCH] fix notebook bugs and add running results --- python/notebooks/dbscan_demo.ipynb | 115 +++++++++++++++++------ python/notebooks/pca_demo.ipynb | 133 +++++++++++++++++++------- python/notebooks/tsvd_demo.ipynb | 146 +++++++++++++++++++++-------- 3 files changed, 290 insertions(+), 104 deletions(-) diff --git a/python/notebooks/dbscan_demo.ipynb b/python/notebooks/dbscan_demo.ipynb index f7bb480b97f..57b13d4d9c5 100644 --- a/python/notebooks/dbscan_demo.ipynb +++ b/python/notebooks/dbscan_demo.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -52,44 +52,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "def load_data(nrows, ncols, cached = 'data/mortgage.npy'):\n", + "import gzip\n", + "def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):\n", " if os.path.exists(cached):\n", - " X = np.load(cached)\n", - " return X[np.random.randint(0,X.shape[0]-1,nrows),:ncols]\n", + " print('use mortgage data')\n", + " with gzip.open(cached) as f:\n", + " X = np.load(f)\n", + " X = X[np.random.randint(0,X.shape[0]-1,nrows),:ncols]\n", " else:\n", - " return np.random.rand(nrows,ncols)" + " print('use random data')\n", + " X = np.random.rand(nrows,ncols)\n", + " df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])})\n", + " return df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "def np2pygdf(df):\n", - " # convert numpy array to pygdf dataframe\n", + "def pd2pygdf(df):\n", + " # convert pandas dataframe to pygdf dataframe\n", + " if isinstance(df,np.ndarray):\n", + " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", " pdf = pygdf.DataFrame()\n", - " for c in range(df.shape[1]):\n", - " pdf[c] = df[:,c]\n", + " for c,column in enumerate(df):\n", + " pdf[c] = df[column]\n", " return pdf" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "def array_equal(a,b,threshold=1e-4,with_sign=True):\n", + "from sklearn.metrics import mean_squared_error\n", + "def array_equal(a,b,threshold=5e-3,with_sign=True):\n", " a = to_nparray(a)\n", " b = to_nparray(b)\n", " if with_sign == False:\n", " a,b = np.abs(a),np.abs(b)\n", - " res = np.max(np.abs(a-b))