From 361428749ef952b6c952672f3058a2a7080a8464 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 19 Sep 2018 17:09:49 +0200 Subject: [PATCH 01/32] Working on a new better tested RandomForestClassifier with correct min max range of values --- ...learn.ensemble.RandomForestClassifier.json | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json index df01686a..9cbb693a 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json @@ -46,9 +46,14 @@ } }, "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, "criterion": { "type": "str", - "default": "entropy", + "default": "gini", "values": ["entropy", "gini"] }, "max_features": { @@ -62,19 +67,24 @@ "range": [1, 30] }, "min_samples_split": { - "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "type": "int", + "default": 2, + "range": [2, 100] }, "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "default": 0.0, + "range": [0.0, 0.5] }, - "n_estimators": { + "max_leaf_nodes": { "type": "int", - "default": 30, - "values": [2, 500] + "default": null, + "range": [1, 100] }, "class_weight": { "type": "str", From feaf11d21a39952f7fb48843f848b35a69279806 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 20 Sep 2018 12:42:02 +0200 Subject: [PATCH 02/32] Modified and created primitives that refer to sklearn.ensemble that don't require base_estimator or estimator. 
--- ...sklearn.ensemble.ExtraTreesClassifier.json | 116 ++++++++++++++++ .../sklearn.ensemble.ExtraTreesRegressor.json | 112 ++++++++++++++++ ...n.ensemble.GradientBoostingClassifier.json | 120 +++++++++++++++++ ...rn.ensemble.GradientBoostingRegressor.json | 124 ++++++++++++++++++ .../sklearn.ensemble.IsolationForest.json | 77 +++++++++++ ...learn.ensemble.RandomForestClassifier.json | 38 ++++-- ...klearn.ensemble.RandomForestRegressor.json | 54 ++++++-- ...sklearn.ensemble.RandomTreesEmbedding.json | 98 ++++++++++++++ 8 files changed, 716 insertions(+), 23 deletions(-) create mode 100644 mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json create mode 100644 mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json create mode 100644 mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json create mode 100644 mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json create mode 100644 mlblocks_primitives/sklearn.ensemble.IsolationForest.json create mode 100644 mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json new file mode 100644 index 00000000..efa755a4 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json @@ -0,0 +1,116 @@ +{ + "name": "sklearn.ensemble.ExtraTreesClassifier", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html", + "description": "Scikit-learn ExtraTreesClassifier. 
Implements a meta estimator that fits a number of randomized decision trees.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.ExtraTreesClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": -1 + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "criterion": { + "type": "str", + "default": "gini", + "values": ["entropy", "gini"] + }, + "max_features": { + "type": "str", + "default": null, + "values": [null, "auto", "log2", "sqrt"] + }, + "max_depth": { + "type": "int", + "default": null, + "range": [1, 30] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 0.5] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": false + }, + "oob_score": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 1000] + }, + "warm_start": { + "type": "bool", + "default": false + }, + "class_weight": { + "type": "iterable", + "default": null + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json new file mode 100644 index 00000000..f8ef31cd --- /dev/null +++ 
b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json @@ -0,0 +1,112 @@ +{ + "name": "sklearn.ensemble.ExtraTreesRegressor", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html", + "description": "Scikit-learn ExtraTreesRegressor. Implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting.", + "classifiers": { + "type": "estimator", + "subtype": "regressor" + }, + "modalities": [], + "primitive": "sklearn.ensemble.ExtraTreesRegressor", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": -1 + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "criterion": { + "type": "str", + "default": "mse", + "values": ["mae", "mse"] + }, + "max_features": { + "type": "str", + "default": null, + "values": [null, "auto", "log2", "sqrt"] + }, + "max_depth": { + "type": "int", + "default": null, + "range": [1, 30] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 0.5] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": false + }, + "oob_score": { + "type": "bool", + "default": false 
+ }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 1000] + }, + "warm_start": { + "type": "bool", + "default": false + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json new file mode 100644 index 00000000..9eb9a98e --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json @@ -0,0 +1,120 @@ +{ + "name": "sklearn.ensemble.GradientBoostingClassifier", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html", + "description": "Scikit-learn GradientBoostingClassifier.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.GradientBoostingClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": {}, + "tunable": { + "loss": { + "type": "str", + "default": "deviance", + "values": ["deviance", "exponential"] + }, + "learning_rate": { + "type": "float", + "default": 0.1 + }, + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "max_depth": { + "type": "int", + "default": 3, + "range": [1, 30] + }, + "criterion": { + "type": "str", + "default": "friedman_mse", + "values": ["friedman_mse", "friedman_mae"] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 0.5] + }, + "subsample": { + "type": "float", + "default": 1.0 + }, + "max_features": { 
+ "type": "str", + "default": null, + "values": [null, "auto", "log2", "sqrt"] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "init": { + "type": "object", + "default": null + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 100] + }, + "warm_start": { + "type": "bool", + "default": false + }, + "presort": { + "type": "bool", + "default": "false" + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json new file mode 100644 index 00000000..8a73bdd0 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json @@ -0,0 +1,124 @@ +{ + "name": "sklearn.ensemble.GradientBoostingRegressor", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html", + "description": "Scikit-learn GradientBoostingRegressor.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.GradientBoostingRegressor", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": {}, + "tunable": { + "loss": { + "type": "str", + "default": "ls", + "values": ["ls", "lad", "huber", "quantile"] + }, + "learning_rate": { + "type": "float", + "default": 0.1 + }, + "n_estimators": { + "type": "int", + "default": 100, + "range": [1, 500] + }, + "max_depth": { + "type": "int", + "default": 3, + "range": [1, 30] + }, + "criterion": { + "type": "str", + "default": "friedman_mse", + "values": 
["friedman_mse", "friedman_mae"] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 0.5] + }, + "subsample": { + "type": "float", + "default": 1.0 + }, + "max_features": { + "type": "str", + "default": null, + "values": [null, "auto", "log2", "sqrt"] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "alpha": { + "type": "float", + "default": 0.9 + }, + "init": { + "type": "object", + "default": null + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 100] + }, + "warm_start": { + "type": "bool", + "default": false + }, + "presort": { + "type": "bool", + "default": "false" + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json new file mode 100644 index 00000000..b7a22fb9 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json @@ -0,0 +1,77 @@ +{ + "name": "sklearn.ensemble.IsolationForest", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html", + "description": "Scikit-learn IsolationForest. 
The IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.IsolationForest", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": -1 + }, + "contamination": { + "type": "float", + "default": 0.1, + "range": [0.0, 0.5] + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 100, + "range": [1, 500] + }, + "max_samples": { + "type": "str", + "default": "auto" + }, + "max_features": { + "type": "float", + "default": 1.0 + }, + "bootstrap": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 100] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json index 9cbb693a..e9822a15 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json @@ -9,7 +9,6 @@ }, "modalities": [], "primitive": "sklearn.ensemble.RandomForestClassifier", - "validation_dataset": "wine", "fit": { "method": "fit", "args": [ @@ -19,7 +18,7 @@ }, { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -34,7 +33,7 @@ "output": [ { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -59,11 +58,11 @@ "max_features": { "type": "str", "default": null, - "range": [null, "auto", "log2"] + "values": [null, "auto", "log2", "sqrt"] }, "max_depth": { "type": "int", - 
"default": 10, + "default": null, "range": [1, 30] }, "min_samples_split": { @@ -84,12 +83,33 @@ "max_leaf_nodes": { "type": "int", "default": null, - "range": [1, 100] + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "oob_score": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 1000] + }, + "warm_start": { + "type": "bool", + "default": false }, "class_weight": { - "type": "str", - "default": null, - "range": [null, "balanced"] + "type": "iterable", + "default": null } } } diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json index 7cc9e08f..d9728d65 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json @@ -9,7 +9,6 @@ }, "modalities": [], "primitive": "sklearn.ensemble.RandomForestRegressor", - "validation_dataset": "boston", "fit": { "method": "fit", "args": [ @@ -19,7 +18,7 @@ }, { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -34,7 +33,7 @@ "output": [ { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -46,6 +45,10 @@ } }, "tunable": { + "n_estimators": { + "type": "int", + "default": 10 + }, "criterion": { "type": "str", "default": "mse", @@ -53,28 +56,51 @@ }, "max_features": { "type": "str", - "default": null, - "range": [null, "auto", "log2"] + "default": "auto", + "range": [null, "auto", "log2", "sqrt"] }, "max_depth": { "type": "int", - "default": 10, + "default": null, "range": [1, 30] }, "min_samples_split": { - "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "type": "int", + "default": 2, + "range": [2, 1000] }, "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 1000] + }, + "min_weight_fraction_leaf": { "type": "float", - 
"default": 0.1, - "range": [0.0001, 0.5] + "default": 0.0 }, - "n_estimators": { + "max_leaf_nodes": { "type": "int", - "default": 30, - "values": [2, 500] + "default": null + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0 + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "oob_score": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false } } } diff --git a/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json new file mode 100644 index 00000000..49601790 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json @@ -0,0 +1,98 @@ +{ + "name": "sklearn.ensemble.RandomTreesEmbedding", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomTreesEmbedding.html", + "description": "Scikit-learn RandomTreesEmbedding. 
An unsupervised transformation of a dataset to a high-dimensional sparse representation.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.RandomTreesEmbedding", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "Sparse" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": -1 + }, + "sparse_output": { + "type": "bool", + "default": true + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "max_depth": { + "type": "int", + "default": 5, + "range": [1, 30] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 0.5] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 1000] + }, + "warm_start": { + "type": "bool", + "default": false + } + } + } +} From 725de7b61ece6784b9979b28a424f1e80e9ca9c5 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 20 Sep 2018 16:05:12 +0200 Subject: [PATCH 03/32] Decomposition PCA and DL done --- ...earn.decomposition.DictionaryLearning.json | 96 +++++++++++++++++++ .../sklearn.decomposition.PCA.json | 25 ++++- 2 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json diff --git a/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json 
b/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json new file mode 100644 index 00000000..112908ee --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json @@ -0,0 +1,96 @@ +{ + "name": "sklearn.decomposition.DictionaryLearning", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.DictionaryLearning.html", + "description": "Dictionary learning.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.DictionaryLearning", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "split_sign": { + "type": "bool", + "default": false + }, + "n_jobs": { + "type": "int", + "default": -1 + } + }, + "tunable": { + "n_components": { + "type": "int", + "default": null + }, + "alpha": { + "type": "float", + "default": 1.0 + }, + "max_iter": { + "type": "int", + "default": 1000 + }, + "tol": { + "type": "float", + "default": 1e-08 + }, + "fit_algorithm": { + "type": "str", + "default": "lars", + "values": ["lars", "cd"] + }, + "transform_algorithm": { + "type": "str", + "default": "omp", + "values": ["lasso_lars", "lasso_cd", "lars", "omp", "threshold"] + }, + "transform_n_nonzero_coefs": { + "type": "int", + "default": null + }, + "transform_alpha": { + "type": "float", + "default": 1.0 + }, + "code_init": { + "type": "iterable", + "default": null + }, + "dict_init": { + "type": "iterable", + "default": null + }, + "verbose": { + "type": "bool", + "default": false + } + } + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.PCA.json b/mlblocks_primitives/sklearn.decomposition.PCA.json index 57958b46..27355d9b 100644 --- 
a/mlblocks_primitives/sklearn.decomposition.PCA.json +++ b/mlblocks_primitives/sklearn.decomposition.PCA.json @@ -9,7 +9,6 @@ }, "modalities": [], "primitive": "sklearn.decomposition.PCA", - "validation_dataset": "wine", "fit": { "method": "fit", "args": [ @@ -35,10 +34,32 @@ ] }, "hyperparameters": { - "FIXME": "This needs to be reviewed", "fixed": { + "copy": { + "type": "bool", + "default": true + } }, "tunable": { + "tol": { + "type": "float", + "default": 0.0, + "range": [0.0, 100.0] + }, + "iterated_power":{ + "type": "int", + "default": "auto", + "range": [0, 1000] + }, + "whiten": { + "type": "bool", + "default": false + }, + "svd_solver": { + "type": "str", + "default": "auto", + "values": ["auto", "arpack", "full", "randomized"] + } } } } From 88cf33d68b139df4fdb6cb041c75a43262266d88 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 20 Sep 2018 16:35:15 +0200 Subject: [PATCH 04/32] Added AdaBoost and Bagging from sklearn --- .../sklearn.ensemble.AdaBoostClassifier.json | 64 ++++++++++++++ .../sklearn.ensemble.AdaBoostRegressor.json | 64 ++++++++++++++ .../sklearn.ensemble.BaggingClassifier.json | 87 +++++++++++++++++++ .../sklearn.ensemble.BaggingRegressor.json | 87 +++++++++++++++++++ 4 files changed, 302 insertions(+) create mode 100644 mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json create mode 100644 mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json create mode 100644 mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json create mode 100644 mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json diff --git a/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json b/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json new file mode 100644 index 00000000..f7962be2 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json @@ -0,0 +1,64 @@ +{ + "name": "sklearn.ensemble.AdaBoostClassifier", + "author": "Carles Sala ", + "documentation": 
"http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html", + "description": "Scikit-learn AdaBoostClassifier.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.AdaBoostClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "base_estimator": { + "type": "object", + "default": null + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 50, + "range": [1, 500] + }, + "learning_rate": { + "type": "float", + "default": 1.0 + }, + "algorithm": { + "type": "str", + "default": "SAMME.R", + "values": ["SAMME", "SAMME.R"] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json b/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json new file mode 100644 index 00000000..a7c6eb93 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json @@ -0,0 +1,64 @@ +{ + "name": "sklearn.ensemble.AdaBoostRegressor", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html", + "description": "Scikit-learn AdaBoostRegressor.", + "classifiers": { + "type": "estimator", + "subtype": "regressor" + }, + "modalities": [], + "primitive": "sklearn.ensemble.AdaBoostRegressor", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "base_estimator": { + "type": 
"object", + "default": null + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 50, + "range": [1, 500] + }, + "learning_rate": { + "type": "float", + "default": 1.0 + }, + "loss": { + "type": "str", + "default": "linear", + "values": ["linear", "square", "exponential"] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json new file mode 100644 index 00000000..6686ea4c --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json @@ -0,0 +1,87 @@ +{ + "name": "sklearn.ensemble.BaggingClassifier", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html", + "description": "Scikit-learn BaggingClassifier.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.BaggingClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "base_estimator": { + "type": "object", + "default": null + }, + "n_jobs": { + "type": "int", + "default": 1 + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 50, + "range": [1, 500] + }, + "max_samples": { + "type": "float", + "default": 1.0 + }, + "max_features": { + "type": "int", + "default": 1.0 + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "bootstrap_features": { + "type": "bool", + "default": false + }, + "oob_score": { + "type": "bool", + "default": false + }, + "warm_start": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0 + } + } + } +} diff --git 
a/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json new file mode 100644 index 00000000..904aaa9a --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json @@ -0,0 +1,87 @@ +{ + "name": "sklearn.ensemble.BaggingRegressor", + "author": "Carles Sala ", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html", + "description": "Scikit-learn BaggingRegressor.", + "classifiers": { + "type": "estimator", + "subtype": "regressor" + }, + "modalities": [], + "primitive": "sklearn.ensemble.BaggingRegressor", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "base_estimator": { + "type": "object", + "default": null + }, + "n_jobs": { + "type": "int", + "default": 1 + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "max_samples": { + "type": "float", + "default": 1.0 + }, + "max_features": { + "type": "int", + "default": 1.0 + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "bootstrap_features": { + "type": "bool", + "default": false + }, + "oob_score": { + "type": "bool", + "default": false + }, + "warm_start": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0 + } + } + } +} From e200bdf3acd0bc23f614ddab3860ae775565d785 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 21 Sep 2018 10:16:23 +0200 Subject: [PATCH 05/32] Modified min / max range for int / floats --- .../sklearn.ensemble.AdaBoostClassifier.json | 3 ++- .../sklearn.ensemble.AdaBoostRegressor.json | 3 ++- .../sklearn.ensemble.BaggingClassifier.json | 12 ++++++++---- 
.../sklearn.ensemble.BaggingRegressor.json | 14 +++++++++----- .../sklearn.ensemble.ExtraTreesClassifier.json | 3 ++- .../sklearn.ensemble.ExtraTreesRegressor.json | 3 ++- ...learn.ensemble.GradientBoostingClassifier.json | 8 +++++--- ...klearn.ensemble.GradientBoostingRegressor.json | 13 ++++++++----- .../sklearn.ensemble.IsolationForest.json | 11 +++++++---- .../sklearn.ensemble.RandomForestClassifier.json | 3 ++- ...arn.ensemble.RandomForestClassifier_proba.json | 9 +++++---- .../sklearn.ensemble.RandomForestRegressor.json | 15 ++++++++++----- .../sklearn.ensemble.RandomTreesEmbedding.json | 3 ++- 13 files changed, 64 insertions(+), 36 deletions(-) diff --git a/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json b/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json index f7962be2..d0de949e 100644 --- a/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json @@ -52,7 +52,8 @@ }, "learning_rate": { "type": "float", - "default": 1.0 + "default": 1.0, + "range": [1.0, 10.0] }, "algorithm": { "type": "str", diff --git a/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json b/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json index a7c6eb93..e9635b4f 100644 --- a/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json @@ -52,7 +52,8 @@ }, "learning_rate": { "type": "float", - "default": 1.0 + "default": 1.0, + "range": [1.0, 10.0] }, "loss": { "type": "str", diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json index 6686ea4c..1f13d56a 100644 --- a/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json @@ -45,7 +45,8 @@ }, "n_jobs": { "type": "int", - "default": 1 + "default": 1, + "range": [-1, 10] } }, "tunable": { @@ -56,11 +57,13 @@ }, 
"max_samples": { "type": "float", - "default": 1.0 + "default": 1.0, + "range": [1.0, 100.0] }, "max_features": { "type": "int", - "default": 1.0 + "default": 1.0, + "range": [1.0, 1000.0] }, "bootstrap": { "type": "bool", @@ -80,7 +83,8 @@ }, "verbose": { "type": "int", - "default": 0 + "default": 0, + "range": [0, 100] } } } diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json index 904aaa9a..d7357d49 100644 --- a/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json @@ -45,7 +45,8 @@ }, "n_jobs": { "type": "int", - "default": 1 + "default": 1, + "range": [-1, 10] } }, "tunable": { @@ -56,11 +57,13 @@ }, "max_samples": { "type": "float", - "default": 1.0 + "default": 1.0, + "range": [1.0, 100.0] }, "max_features": { - "type": "int", - "default": 1.0 + "type": "float", + "default": 1.0, + "range": [1.0, 1000.0] }, "bootstrap": { "type": "bool", @@ -80,7 +83,8 @@ }, "verbose": { "type": "int", - "default": 0 + "default": 0, + "range": [0, 100] } } } diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json index efa755a4..b38f6bc8 100644 --- a/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json @@ -41,7 +41,8 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": -1, + "range": [-1, 10] } }, "tunable": { diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json index f8ef31cd..ff964555 100644 --- a/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json @@ -41,7 +41,8 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": -1, + "range": [-1, 10] } }, "tunable": { diff --git 
a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json index 9eb9a98e..985517b5 100644 --- a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json @@ -47,7 +47,8 @@ }, "learning_rate": { "type": "float", - "default": 0.1 + "default": 0.1, + "range": [0.01, 10.0] }, "n_estimators": { "type": "int", @@ -77,11 +78,12 @@ "min_weight_fraction_leaf": { "type": "float", "default": 0.0, - "range": [0.0, 0.5] + "range": [0.0, 10.0] }, "subsample": { "type": "float", - "default": 1.0 + "default": 1.0, + "range": [0.001, 100.0] }, "max_features": { "type": "str", diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json index 8a73bdd0..72227af3 100644 --- a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json @@ -47,7 +47,8 @@ }, "learning_rate": { "type": "float", - "default": 0.1 + "default": 0.1, + "range": [0.01, 10.0] }, "n_estimators": { "type": "int", @@ -77,11 +78,12 @@ "min_weight_fraction_leaf": { "type": "float", "default": 0.0, - "range": [0.0, 0.5] + "range": [0.0, 10.5] }, "subsample": { "type": "float", - "default": 1.0 + "default": 1.0, + "range": [0.01, 100.0] }, "max_features": { "type": "str", @@ -96,11 +98,12 @@ "min_impurity_decrease": { "type": "float", "default": 0.0, - "range": [0.0, 1000.0] + "range": [0.0, 100.0] }, "alpha": { "type": "float", - "default": 0.9 + "default": 0.9, + "range": [0.01, 10] }, "init": { "type": "object", diff --git a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json index b7a22fb9..79952909 100644 --- a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json +++ 
b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json @@ -41,7 +41,8 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": -1, + "range": [-1, 10] }, "contamination": { "type": "float", @@ -56,12 +57,14 @@ "range": [1, 500] }, "max_samples": { - "type": "str", - "default": "auto" + "type": "int", + "default": "1", + "range": [0, 100] }, "max_features": { "type": "float", - "default": 1.0 + "default": 1.0, + "range": [1.0, 1000.0] }, "bootstrap": { "type": "bool", diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json index e9822a15..308a484b 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json @@ -41,7 +41,8 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": -1, + "range": [-1, 10] } }, "tunable": { diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json index 9fe13eac..56d31a9b 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json @@ -19,7 +19,7 @@ }, { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -34,7 +34,7 @@ "output": [ { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -42,13 +42,14 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": -1, + "range": [-1, 10] } }, "tunable": { "criterion": { "type": "str", - "default": "entropy", + "default": "gini", "values": ["entropy", "gini"] }, "max_features": { diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json index d9728d65..c8db2aeb 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json +++ 
b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json @@ -41,13 +41,15 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": -1, + "range": [-1, 10] } }, "tunable": { "n_estimators": { "type": "int", - "default": 10 + "default": 10, + "range": [1, 500] }, "criterion": { "type": "str", @@ -76,7 +78,8 @@ }, "min_weight_fraction_leaf": { "type": "float", - "default": 0.0 + "default": 0.0, + "range": [0.0, 100.0] }, "max_leaf_nodes": { "type": "int", @@ -84,7 +87,8 @@ }, "min_impurity_decrease": { "type": "float", - "default": 0.0 + "default": 0.0, + "range": [0.0, 10.0] }, "bootstrap": { "type": "bool", @@ -96,7 +100,8 @@ }, "verbose": { "type": "int", - "default": 0 + "default": 0, + "range": [0, 100] }, "warm_start": { "type": "bool", diff --git a/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json index 49601790..abebfdfe 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json @@ -41,7 +41,8 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": -1, + "range": [-1, 10] }, "sparse_output": { "type": "bool", From 43d28d97f86e63e858a94df405f9da734ae97237 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 21 Sep 2018 12:48:46 +0200 Subject: [PATCH 06/32] Added myself as contributor. 
--- ...earn.decomposition.DictionaryLearning.json | 20 ++-- .../sklearn.decomposition.FactorAnalysis.json | 75 ++++++++++++ .../sklearn.decomposition.FastICA.json | 78 +++++++++++++ .../sklearn.decomposition.KernelPCA.json | 110 ++++++++++++++++++ .../sklearn.decomposition.PCA.json | 2 +- .../sklearn.decomposition.TruncatedSVD.json | 61 ++++++++++ 6 files changed, 338 insertions(+), 8 deletions(-) create mode 100644 mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json create mode 100644 mlblocks_primitives/sklearn.decomposition.FastICA.json create mode 100644 mlblocks_primitives/sklearn.decomposition.KernelPCA.json create mode 100644 mlblocks_primitives/sklearn.decomposition.TruncatedSVD.json diff --git a/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json b/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json index 112908ee..88f77d10 100644 --- a/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json +++ b/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json @@ -1,6 +1,6 @@ { "name": "sklearn.decomposition.DictionaryLearning", - "author": "Carles Sala ", + "contributors": ["Carles Sala ", "Plamen Valentinov "], "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.DictionaryLearning.html", "description": "Dictionary learning.", "classifiers": { @@ -41,7 +41,8 @@ }, "n_jobs": { "type": "int", - "default": -1 + "default": -1, + "range": [-1, 10] } }, "tunable": { @@ -51,15 +52,18 @@ }, "alpha": { "type": "float", - "default": 1.0 + "default": 1.0, + "range": [0.0, 10.0] }, "max_iter": { "type": "int", - "default": 1000 + "default": 1000, + "range": [0, 10000] }, "tol": { "type": "float", - "default": 1e-08 + "default": 1e-08, + "range": [0.0, 1.0] }, "fit_algorithm": { "type": "str", @@ -73,11 +77,13 @@ }, "transform_n_nonzero_coefs": { "type": "int", - "default": null + "default": null, + "range": [0, 100] }, "transform_alpha": { "type": "float", - "default": 1.0 + 
"default": 1.0, + "range": [0.0, 10.0] }, "code_init": { "type": "iterable", diff --git a/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json b/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json new file mode 100644 index 00000000..6a9e7b72 --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json @@ -0,0 +1,75 @@ +{ + "name": "sklearn.decomposition.FactorAnalysis", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html", + "description": "Factor Analysis. A simple linear generative model with Gaussian latent variables.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.FactorAnalysis", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "copy": { + "type": "bool", + "default": true + } + }, + "tunable": { + "n_components": { + "type": "int", + "default": null, + "range": [0, 500] + }, + "tol": { + "type": "float", + "default": 0.01, + "range": [0.0, 0.5] + }, + "max_iter": { + "type": "int", + "default": 1000, + "range": [10, 10000] + }, + "noise_variance_init": { + "type": "iterable", + "default": null + }, + "svd_method": { + "type": "str", + "default": "randomized", + "values": ["lapack", "randomized"] + }, + "iterated_power": { + "type": "int", + "default": 3, + "range": [0, 10] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.FastICA.json b/mlblocks_primitives/sklearn.decomposition.FastICA.json new file mode 100644 index 00000000..a73b3c2a --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.FastICA.json @@ -0,0 +1,78 @@ +{ + "name": 
"sklearn.decomposition.FastICA", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html", + "description": "FastICA: a fast algorithm for Independent Component Analysis.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.FastICA", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": {}, + "tunable": { + "n_components": { + "type": "int", + "default": null, + "range": [1, 500] + }, + "algorithm": { + "type": "str", + "default": "parallel", + "values": ["parallel", "deflation"] + }, + "whiten": { + "type": "bool", + "default": true + }, + "fun": { + "type": "string", + "default": "logcosh", + "values": ["logcosh", "exp", "cube"] + }, + "fun_args": { + "type": "iterable", + "default": null + }, + "max_iter": { + "type": "int", + "default": 200, + "range": [1, 1000] + }, + "tol": { + "type": "float", + "default": 0.0001, + "range": [0.00001, 0.5] + }, + "w_init": { + "type": "iterable", + "default": null + } + } + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.KernelPCA.json b/mlblocks_primitives/sklearn.decomposition.KernelPCA.json new file mode 100644 index 00000000..184085ac --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.KernelPCA.json @@ -0,0 +1,110 @@ +{ + "name": "sklearn.decomposition.KernelPCA", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html", + "description": "Kernel Principal Component Analysis.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + 
"modalities": [], + "primitive": "sklearn.decomposition.KernelPCA", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": 1, + "range": [-1, 10] + } + }, + "tunable": { + "n_components": { + "type": "int", + "default": null, + "range": [1, 500] + }, + "kernel": { + "type": "str", + "default": "linear", + "values": [ + "linear", + "poly", + "rbf", + "sigmoid", + "cosine", + "precomputed" + ] + }, + "gamma": { + "type": "float", + "default": null, + "range": [0.0, 0.5] + }, + "coef0": { + "type": "float", + "default": 1.0, + "range": [0.0, 10.0] + }, + "kernel_params": { + "type": "str", + "default": null + }, + "alpha": { + "type": "int", + "default": 1, + "range": [0, 10] + }, + "fit_inverse_transform": { + "type": "bool", + "default": false + }, + "eigen_solver": { + "type": "str", + "default": "auto", + "values": ["auto", "arpack", "dense"] + }, + "tol": { + "type": "float", + "default": 0.0, + "range": [0.0, 10.0] + }, + "max_iter": { + "type": "int", + "default": null, + "range": [0, 100] + }, + "remove_zero_eig": { + "type": "bool", + "default": false + }, + "copy_X": { + "type": "bool", + "default": true + } + } + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.PCA.json b/mlblocks_primitives/sklearn.decomposition.PCA.json index 27355d9b..83287d69 100644 --- a/mlblocks_primitives/sklearn.decomposition.PCA.json +++ b/mlblocks_primitives/sklearn.decomposition.PCA.json @@ -1,6 +1,6 @@ { "name": "sklearn.decomposition.PCA", - "author": "Carles Sala ", + "contributors": ["Carles Sala ", "Plamen Valentinov "], "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html", "description": "Principal component analysis (PCA)", 
"classifiers": { diff --git a/mlblocks_primitives/sklearn.decomposition.TruncatedSVD.json b/mlblocks_primitives/sklearn.decomposition.TruncatedSVD.json new file mode 100644 index 00000000..6342162e --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.TruncatedSVD.json @@ -0,0 +1,61 @@ +{ + "name": "sklearn.decomposition.TruncatedSVD", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html", + "description": "Dimensionality reduction using truncated SVD.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.TruncatedSVD", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": {}, + "tunable": { + "n_components": { + "type": "int", + "default": 2, + "range": [1, 500] + }, + "algorithm": { + "type": "str", + "default": "randomized", + "values": ["arpack", "randomized"] + }, + "n_iter": { + "type": "int", + "default": 5, + "range": [1, 100] + }, + "tol": { + "type": "float", + "default": 0.0, + "range": [0.001, 0.5] + } + } + } +} From 334d4822054d86040ddd2b20ff8d459d72cf0659 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 3 Oct 2018 09:54:10 +0200 Subject: [PATCH 07/32] Changed some hyperparameters to Fixed instead of Tunable --- .../sklearn.ensemble.BaggingClassifier.json | 20 +++--- .../sklearn.ensemble.BaggingRegressor.json | 20 +++--- ...sklearn.ensemble.ExtraTreesClassifier.json | 29 ++++----- .../sklearn.ensemble.ExtraTreesRegressor.json | 20 +++--- ...n.ensemble.GradientBoostingClassifier.json | 36 +++++------ ...rn.ensemble.GradientBoostingRegressor.json | 36 +++++------ 
.../sklearn.ensemble.IsolationForest.json | 13 ++-- ...learn.ensemble.RandomForestClassifier.json | 28 ++++---- ...ensemble.RandomForestClassifier_proba.json | 64 +++++++++++++------ ...klearn.ensemble.RandomForestRegressor.json | 21 +++--- ...sklearn.ensemble.RandomTreesEmbedding.json | 18 +++--- 11 files changed, 161 insertions(+), 144 deletions(-) diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json index 90a27ee9..e3a4ff0c 100644 --- a/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json @@ -45,8 +45,15 @@ }, "n_jobs": { "type": "int", - "default": 1, - "range": [-1, 10] + "default": 1 + }, + "warm_start": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0 } }, "tunable": { @@ -76,15 +83,6 @@ "oob_score": { "type": "bool", "default": false - }, - "warm_start": { - "type": "bool", - "default": false - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 100] } } } diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json index 3dececf1..7d5d6a18 100644 --- a/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json @@ -45,8 +45,15 @@ }, "n_jobs": { "type": "int", - "default": 1, - "range": [-1, 10] + "default": 1 + }, + "warm_start": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0 } }, "tunable": { @@ -76,15 +83,6 @@ "oob_score": { "type": "bool", "default": false - }, - "warm_start": { - "type": "bool", - "default": false - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 100] } } } diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json index 9d53b333..4057fb5e 100644 --- 
a/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json @@ -41,8 +41,20 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1, - "range": [-1, 10] + "default": 1 + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 1000] + }, + "warm_start": { + "type": "bool", + "default": false + }, + "class_weight": { + "type": "iterable", + "default": null } }, "tunable": { @@ -98,19 +110,6 @@ "oob_score": { "type": "bool", "default": false - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 1000] - }, - "warm_start": { - "type": "bool", - "default": false - }, - "class_weight": { - "type": "iterable", - "default": null } } } diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json index 39811888..d21fef9f 100644 --- a/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json @@ -41,8 +41,15 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1, - "range": [-1, 10] + "default": 1 + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false } }, "tunable": { @@ -98,15 +105,6 @@ "oob_score": { "type": "bool", "default": false - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 1000] - }, - "warm_start": { - "type": "bool", - "default": false } } } diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json index dae8ed39..9ea2dc21 100644 --- a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json @@ -38,7 +38,24 @@ ] }, "hyperparameters": { - "fixed": {}, + "fixed": { + "warm_start": { + "type": "bool", + "default": false + }, + "init": { + "type": "object", + "default": 
null + }, + "verbose": { + "type": "int", + "default": 0 + }, + "presort": { + "type": "bool", + "default": "false" + } + }, "tunable": { "loss": { "type": "str", @@ -99,23 +116,6 @@ "type": "float", "default": 0.0, "range": [0.0, 1000.0] - }, - "init": { - "type": "object", - "default": null - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 100] - }, - "warm_start": { - "type": "bool", - "default": false - }, - "presort": { - "type": "bool", - "default": "false" } } } diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json index ae2d170b..44c86610 100644 --- a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json @@ -38,7 +38,24 @@ ] }, "hyperparameters": { - "fixed": {}, + "fixed": { + "init": { + "type": "object", + "default": null + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false + }, + "presort": { + "type": "bool", + "default": "false" + } + }, "tunable": { "loss": { "type": "str", @@ -104,23 +121,6 @@ "type": "float", "default": 0.9, "range": [0.01, 10] - }, - "init": { - "type": "object", - "default": null - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 100] - }, - "warm_start": { - "type": "bool", - "default": false - }, - "presort": { - "type": "bool", - "default": "false" } } } diff --git a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json index 0f6dada6..d4509fe9 100644 --- a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json +++ b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json @@ -41,13 +41,17 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1, - "range": [-1, 10] + "default": 1 }, "contamination": { "type": "float", "default": 0.1, "range": [0.0, 0.5] + }, + "verbose": { + "type": 
"int", + "default": 0, + "range": [0, 100] } }, "tunable": { @@ -69,11 +73,6 @@ "bootstrap": { "type": "bool", "default": false - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 100] } } } diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json index 28ee9bff..b8d23228 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json @@ -41,8 +41,19 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1, - "range": [-1, 10] + "default": 1 + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false + }, + "class_weight": { + "type": "iterable", + "default": null } }, "tunable": { @@ -98,19 +109,6 @@ "oob_score": { "type": "bool", "default": false - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 1000] - }, - "warm_start": { - "type": "bool", - "default": false - }, - "class_weight": { - "type": "iterable", - "default": null } } } diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json index f00490d9..ab4a79e3 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json @@ -8,8 +8,7 @@ "subtype": "classifier" }, "modalities": [], - "primitive": "sklearn.ensemble.RandomForestClassifier", - "validation_dataset": "wine", + "primitive": "sklearn.ensemble.RandomForestClassifier_proba", "fit": { "method": "fit", "args": [ @@ -42,11 +41,27 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1, - "range": [-1, 10] + "default": 1 + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false + }, + "class_weight": { + "type": "iterable", + "default": null } }, "tunable": 
{ + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, "criterion": { "type": "str", "default": "gini", @@ -55,32 +70,45 @@ "max_features": { "type": "str", "default": null, - "range": [null, "auto", "log2"] + "values": [null, "auto", "log2", "sqrt"] }, "max_depth": { "type": "int", - "default": 10, + "default": null, "range": [1, 30] }, "min_samples_split": { - "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "type": "int", + "default": 2, + "range": [2, 100] }, "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "default": 0.0, + "range": [0.0, 0.5] }, - "n_estimators": { + "max_leaf_nodes": { "type": "int", - "default": 30, - "values": [2, 500] - }, - "class_weight": { - "type": "str", "default": null, - "range": [null, "balanced"] + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "oob_score": { + "type": "bool", + "default": false } } } diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json index 947fec9d..5ed09b96 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json @@ -41,8 +41,16 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1, - "range": [-1, 10] + "default": 1 + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 100] + }, + "warm_start": { + "type": "bool", + "default": false } }, "tunable": { @@ -97,15 +105,6 @@ "oob_score": { "type": "bool", "default": false - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 100] - }, - "warm_start": { - "type": "bool", - "default": false } } } diff --git 
a/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json index 9a6c2009..551a551c 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json @@ -47,6 +47,15 @@ "sparse_output": { "type": "bool", "default": true + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 1000] + }, + "warm_start": { + "type": "bool", + "default": false } }, "tunable": { @@ -84,15 +93,6 @@ "type": "float", "default": 0.0, "range": [0.0, 1000.0] - }, - "verbose": { - "type": "int", - "default": 0, - "range": [0, 1000] - }, - "warm_start": { - "type": "bool", - "default": false } } } From d93dd795e32d211307050b2ee7f7d99527ce7c13 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 3 Oct 2018 10:34:09 +0200 Subject: [PATCH 08/32] Added new hyperparams from scikit-learn 0.20 --- ...learn.ensemble.GradientBoostingClassifier.json | 15 +++++++++++++++ ...klearn.ensemble.GradientBoostingRegressor.json | 15 +++++++++++++++ .../sklearn.ensemble.IsolationForest.json | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json index 9ea2dc21..9d79ec83 100644 --- a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json @@ -116,6 +116,21 @@ "type": "float", "default": 0.0, "range": [0.0, 1000.0] + }, + "validation_fraction": { + "type": "float", + "default": 0.1, + "range": [0.0, 1.0] + }, + "n_iter_no_change": { + "type": "int", + "default": null, + "range": [1, 1000] + }, + "tol": { + "type": "float", + "default": 1e-4, + "range": [0.0, 2.0] } } } diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json 
b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json index 44c86610..6766bf02 100644 --- a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json @@ -121,6 +121,21 @@ "type": "float", "default": 0.9, "range": [0.01, 10] + }, + "validation_fraction": { + "type": "float", + "default": 0.1, + "range": [0.0, 1.0] + }, + "n_iter_no_change": { + "type": "int", + "default": null, + "range": [1, 1000] + }, + "tol": { + "type": "float", + "default": 1e-4, + "range": [0.0, 2.0] } } } diff --git a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json index d4509fe9..e2b803dc 100644 --- a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json +++ b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json @@ -1,7 +1,7 @@ { "name": "sklearn.ensemble.IsolationForest", "contributors": ["Carles Sala ", "Plamen Valentinov "], - "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html", + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html", "description": "Scikit-learn IsolationForest. 
The IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.", "classifiers": { "type": "estimator", From eaf50aa098f640dac21aef7101b704199c8c3194 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 3 Oct 2018 12:09:03 +0200 Subject: [PATCH 09/32] Fixed n_jobs --- mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json | 2 +- mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json | 2 +- mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json | 2 +- mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json | 2 +- mlblocks_primitives/sklearn.ensemble.IsolationForest.json | 2 +- .../sklearn.ensemble.RandomForestClassifier.json | 2 +- .../sklearn.ensemble.RandomForestClassifier_proba.json | 2 +- .../sklearn.ensemble.RandomForestRegressor.json | 2 +- mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json | 3 +-- 9 files changed, 9 insertions(+), 10 deletions(-) diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json index e3a4ff0c..55757a7a 100644 --- a/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json @@ -45,7 +45,7 @@ }, "n_jobs": { "type": "int", - "default": 1 + "default": null }, "warm_start": { "type": "bool", diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json index 7d5d6a18..eb16c812 100644 --- a/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json @@ -45,7 +45,7 @@ }, "n_jobs": { "type": "int", - "default": 1 + "default": null }, "warm_start": { "type": "bool", diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json index 
4057fb5e..f061fc58 100644 --- a/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json @@ -41,7 +41,7 @@ "fixed": { "n_jobs": { "type": "int", - "default": 1 + "default": null }, "verbose": { "type": "int", diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json index d21fef9f..5cf7cbd3 100644 --- a/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json @@ -41,7 +41,7 @@ "fixed": { "n_jobs": { "type": "int", - "default": 1 + "default": null }, "verbose": { "type": "int", diff --git a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json index e2b803dc..3f1d6366 100644 --- a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json +++ b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json @@ -41,7 +41,7 @@ "fixed": { "n_jobs": { "type": "int", - "default": 1 + "default": null }, "contamination": { "type": "float", diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json index b8d23228..256278f8 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json @@ -41,7 +41,7 @@ "fixed": { "n_jobs": { "type": "int", - "default": 1 + "default": null }, "verbose": { "type": "int", diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json index ab4a79e3..f79fad20 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json @@ -41,7 +41,7 @@ "fixed": { "n_jobs": { "type": "int", - "default": 1 + "default": 
null }, "verbose": { "type": "int", diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json index 5ed09b96..4db89c45 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json @@ -41,7 +41,7 @@ "fixed": { "n_jobs": { "type": "int", - "default": 1 + "default": null }, "verbose": { "type": "int", diff --git a/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json index 551a551c..6a9168d9 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json @@ -41,8 +41,7 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1, - "range": [-1, 10] + "default": null }, "sparse_output": { "type": "bool", From e071532a510872f4c7cffd4f81d36fab67eb2e6a Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 3 Oct 2018 12:26:59 +0200 Subject: [PATCH 10/32] Fixed some hyperparams and added new ones from sk 2.0 --- ...earn.decomposition.DictionaryLearning.json | 35 +++++++++++-------- .../sklearn.decomposition.KernelPCA.json | 13 ++++--- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json b/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json index 88f77d10..9515a05a 100644 --- a/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json +++ b/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json @@ -35,14 +35,29 @@ }, "hyperparameters": { "fixed": { - "split_sign": { + "n_jobs": { + "type": "int", + "default": null + }, + "code_init": { + "type": "iterable", + "default": null + }, + "dict_init": { + "type": "iterable", + "default": null + }, + "verbose": { "type": "bool", "default": false }, - "n_jobs": { - "type": "int", - "default": -1, - 
"range": [-1, 10] + "positive_code": { + "type": "bool", + "default": false + }, + "positive_dict": { + "type": "bool", + "default": false } }, "tunable": { @@ -85,15 +100,7 @@ "default": 1.0, "range": [0.0, 10.0] }, - "code_init": { - "type": "iterable", - "default": null - }, - "dict_init": { - "type": "iterable", - "default": null - }, - "verbose": { + "split_sign": { "type": "bool", "default": false } diff --git a/mlblocks_primitives/sklearn.decomposition.KernelPCA.json b/mlblocks_primitives/sklearn.decomposition.KernelPCA.json index 184085ac..a8e5394b 100644 --- a/mlblocks_primitives/sklearn.decomposition.KernelPCA.json +++ b/mlblocks_primitives/sklearn.decomposition.KernelPCA.json @@ -37,9 +37,12 @@ "fixed": { "n_jobs": { "type": "int", - "default": 1, - "range": [-1, 10] - } + "default": null + }, + "copy_X": { + "type": "bool", + "default": true + } }, "tunable": { "n_components": { @@ -100,10 +103,6 @@ "remove_zero_eig": { "type": "bool", "default": false - }, - "copy_X": { - "type": "bool", - "default": true } } } From 289eabd5e8c2b8def75f42e5bd87a66770153684 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 3 Oct 2018 12:30:42 +0200 Subject: [PATCH 11/32] Moved contamination to tunable --- .../sklearn.ensemble.IsolationForest.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json index 3f1d6366..0522cb89 100644 --- a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json +++ b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json @@ -43,11 +43,6 @@ "type": "int", "default": null }, - "contamination": { - "type": "float", - "default": 0.1, - "range": [0.0, 0.5] - }, "verbose": { "type": "int", "default": 0, @@ -73,6 +68,11 @@ "bootstrap": { "type": "bool", "default": false + }, + "contamination": { + "type": "float", + "default": 0.1, + "range": [0.0, 0.5] } } } From 
3d9dbd2861cb139d7792d28698b332d73ae3ffc6 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 3 Oct 2018 13:03:03 +0200 Subject: [PATCH 12/32] Fixes on hyperparams --- .../sklearn.decomposition.FactorAnalysis.json | 8 ++++---- .../sklearn.decomposition.FastICA.json | 19 ++++++++++--------- .../sklearn.decomposition.KernelPCA.json | 8 ++++---- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json b/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json index 6a9e7b72..9ad0d89a 100644 --- a/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json +++ b/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json @@ -38,6 +38,10 @@ "copy": { "type": "bool", "default": true + }, + "noise_variance_init": { + "type": "iterable", + "default": null } }, "tunable": { @@ -56,10 +60,6 @@ "default": 1000, "range": [10, 10000] }, - "noise_variance_init": { - "type": "iterable", - "default": null - }, "svd_method": { "type": "str", "default": "randomized", diff --git a/mlblocks_primitives/sklearn.decomposition.FastICA.json b/mlblocks_primitives/sklearn.decomposition.FastICA.json index a73b3c2a..5b081c05 100644 --- a/mlblocks_primitives/sklearn.decomposition.FastICA.json +++ b/mlblocks_primitives/sklearn.decomposition.FastICA.json @@ -34,7 +34,16 @@ ] }, "hyperparameters": { - "fixed": {}, + "fixed": { + "w_init": { + "type": "iterable", + "default": null + }, + "fun_args": { + "type": "iterable", + "default": null + } + }, "tunable": { "n_components": { "type": "int", @@ -55,10 +64,6 @@ "default": "logcosh", "values": ["logcosh", "exp", "cube"] }, - "fun_args": { - "type": "iterable", - "default": null - }, "max_iter": { "type": "int", "default": 200, @@ -68,10 +73,6 @@ "type": "float", "default": 0.0001, "range": [0.00001, 0.5] - }, - "w_init": { - "type": "iterable", - "default": null } } } diff --git a/mlblocks_primitives/sklearn.decomposition.KernelPCA.json 
b/mlblocks_primitives/sklearn.decomposition.KernelPCA.json index a8e5394b..3fc2cbde 100644 --- a/mlblocks_primitives/sklearn.decomposition.KernelPCA.json +++ b/mlblocks_primitives/sklearn.decomposition.KernelPCA.json @@ -42,6 +42,10 @@ "copy_X": { "type": "bool", "default": true + }, + "kernel_params": { + "type": "str", + "default": null } }, "tunable": { @@ -72,10 +76,6 @@ "default": 1.0, "range": [0.0, 10.0] }, - "kernel_params": { - "type": "str", - "default": null - }, "alpha": { "type": "int", "default": 1, From c4a3a0e45918924f6725661e9ce22b674adc6744 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 17 Oct 2018 14:23:18 +0200 Subject: [PATCH 13/32] Updated scikit requirement --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8a1257d..b5a261d8 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ 'opencv-python>=3.4.0.12', 'python-louvain>=0.10', 'scikit-image>=0.13.1', - 'scikit-learn>=0.19.1', + 'scikit-learn>=0.20', 'scipy>=1.1.0', 'tensorflow==1.8.0', 'xgboost>=0.72.1', From 08bf926d665fed814beec338dc2d76f6700f0f8d Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Wed, 17 Oct 2018 14:27:28 +0200 Subject: [PATCH 14/32] Updated scikit requirement. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 797d6086..86b396b7 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ 'opencv-python>=3.4.0.12', 'python-louvain>=0.10', 'scikit-image>=0.13.1', - 'scikit-learn>=0.19.1', + 'scikit-learn>=0.20', 'scipy>=1.1.0', 'tensorflow==1.8.0', 'xgboost>=0.72.1', From 738eceaddb214be137b1efcea94fc94fb706ee3c Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 30 Nov 2018 15:03:24 +0100 Subject: [PATCH 15/32] Created piplenes with sklearn.ensemble classifiers --- .../sklearn.ensemble.AdaBoostClassifier.json | 24 +++++++++++++++++ .../sklearn.ensemble.BaggingClassifier.json | 26 ++++++++++++++++++ ...sklearn.ensemble.ExtraTreesClassifier.json | 24 +++++++++++++++++ ...n.ensemble.GradientBoostingClassifier.json | 27 +++++++++++++++++++ ...learn.ensemble.RandomForestClassifier.json | 26 ++++++++++++++++++ ...n.ensemble.GradientBoostingClassifier.json | 2 +- 6 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json create mode 100644 mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json create mode 100644 mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json create mode 100644 mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json create mode 100644 mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json diff --git a/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json b/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json new file mode 100644 index 00000000..54d79d18 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json @@ -0,0 +1,24 @@ +{ + "metadata": { + "name": "AdaBoostClassifier/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + 
"sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.AdaBoostClassifier", + "mlprimitives.preprocessing.ClassDecoder" + ], + "hyperparameters": { + "sklearn.ensemble.AdaBoostClassifier#1": { + "learning_rate": 0.1, + "n_estimators": 300 + } + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json b/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json new file mode 100644 index 00000000..c8a8d4a5 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json @@ -0,0 +1,26 @@ +{ + "metadata": { + "name": "BaggingClassifier/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.BaggingClassifier", + "mlprimitives.preprocessing.ClassDecoder" + ], + "hyperparameters": { + "sklearn.ensemble.BaggingClassifier#1": { + "n_jobs": -1, + "n_estimators": 300, + "max_samples": 1.0, + "bootstrap_features": true + } + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json new file mode 100644 index 00000000..50660710 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json @@ -0,0 +1,24 @@ +{ + "metadata": { + "name": "ExtraTreesClassifier/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.ExtraTreesClassifier", + "mlprimitives.preprocessing.ClassDecoder" + ], + "hyperparameters": { + "sklearn.ensemble.ExtraTreesClassifier#1": { + "n_jobs": -1, + "n_estimators": 300 + } + } +} diff 
--git a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json new file mode 100644 index 00000000..8cc73441 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json @@ -0,0 +1,27 @@ +{ + "metadata": { + "name": "GradientBoosting/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.GradientBoostingClassifier", + "mlprimitives.preprocessing.ClassDecoder" + ], + "hyperparameters": { + "sklearn.ensemble.GradientBoostingClassifier#1": { + "warm_start": true, + "presort": true, + "learning_rate": 0.1, + "n_estimators": 300, + "max_depth": 5 + } + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json new file mode 100644 index 00000000..1ea222eb --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json @@ -0,0 +1,26 @@ +{ + "metadata": { + "name": "RandomForestClassifier/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier", + "mlprimitives.preprocessing.ClassDecoder" + ], + "hyperparameters": { + "sklearn.ensemble.RandomForestClassifier#1": { + "n_jobs": -1, + "n_estimators": 300, + "criterion": "entropy", + "warm_start": true + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json index 9d79ec83..574bcaa0 
100644 --- a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json @@ -53,7 +53,7 @@ }, "presort": { "type": "bool", - "default": "false" + "default": false } }, "tunable": { From a293c7221639b53baa10beefa7d5d9e2b8fa3363 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 30 Nov 2018 17:19:26 +0100 Subject: [PATCH 16/32] Created piplenes with sklearn.ensemble regressors --- .../sklearn.ensemble.AdaBoostRegressor.json | 22 ++++++++++++++++ .../sklearn.ensemble.BaggingRegressor.json | 25 +++++++++++++++++++ .../sklearn.ensemble.ExtraTreesRegressor.json | 22 ++++++++++++++++ ...rn.ensemble.GradientBoostingRegressor.json | 23 +++++++++++++++++ ...klearn.ensemble.RandomForestRegressor.json | 23 +++++++++++++++++ ...rn.ensemble.GradientBoostingRegressor.json | 2 +- 6 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json create mode 100644 mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json create mode 100644 mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json create mode 100644 mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json create mode 100644 mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json diff --git a/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json b/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json new file mode 100644 index 00000000..87155b9a --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json @@ -0,0 +1,22 @@ +{ + "metadata": { + "name": "AdaBoostRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.AdaBoostRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.AdaBoostRegressor#1": { + 
"learning_rate": 0.1, + "n_estimators": 300 + } + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json b/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json new file mode 100644 index 00000000..3e3feb59 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json @@ -0,0 +1,25 @@ +{ + "metadata": { + "name": "BaggingRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.BaggingRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.BaggingRegressor#1": { + "n_jobs": -1, + "warm_start": true, + "n_estimators": 300 + + } + } +} + diff --git a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json new file mode 100644 index 00000000..c58e75d0 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json @@ -0,0 +1,22 @@ +{ + "metadata": { + "name": "ExtraTreesRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.ExtraTreesRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.ExtraTreesRegressor#1": { + "n_jobs": -1, + "n_estimators": 300 + } + } +} \ No newline at end of file diff --git a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json new file mode 100644 index 00000000..ec56d0d6 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json @@ -0,0 +1,23 @@ +{ + "metadata": { + "name": "GradientBoostingRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + 
"validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.GradientBoostingRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.GradientBoostingRegressor#1": { + "learning_rate": 0.1, + "n_estimators": 300, + "verbose": 1 + } + } +} \ No newline at end of file diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json b/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json new file mode 100644 index 00000000..be89b1d1 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json @@ -0,0 +1,23 @@ +{ + "metadata": { + "name": "RandomForestRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.RandomForestRegressor#1": { + "n_jobs": -1, + "warm_start": true, + "n_estimators": 300 + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json index 6766bf02..8659dd74 100644 --- a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json @@ -53,7 +53,7 @@ }, "presort": { "type": "bool", - "default": "false" + "default": false } }, "tunable": { From 8965a4df92aa29f4a2122c06d3b55da3affb53d7 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 30 Nov 2018 17:33:34 +0100 Subject: [PATCH 17/32] Pipeline for Isolation Forest --- .../sklearn.ensemble.IsolationForest.json | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 mlblocks_pipelines/sklearn.ensemble.IsolationForest.json diff --git 
a/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json b/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json new file mode 100644 index 00000000..49135795 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json @@ -0,0 +1,24 @@ +{ + "metadata": { + "name": "IsolationForest/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.IsolationForest" + ], + "hyperparameters": { + "sklearn.ensemble.IsolationForest#1": { + "n_jobs": -1, + "n_estimators": 300, + "contamination": 0.2 + } + } +} From 844f2e9cda1b41355e614c7ef778ebdd1e147ba3 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 30 Nov 2018 17:34:37 +0100 Subject: [PATCH 18/32] Fix isolation forest hyperparameter --- mlblocks_primitives/sklearn.ensemble.IsolationForest.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json index 0522cb89..44090512 100644 --- a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json +++ b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json @@ -56,9 +56,8 @@ "range": [1, 500] }, "max_samples": { - "type": "int", - "default": "1", - "range": [0, 100] + "type": "str", + "default": "auto" }, "max_features": { "type": "float", From 2dc8f90114c5b2e8434b134b2235627d62682378 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Mon, 3 Dec 2018 12:57:43 +0100 Subject: [PATCH 19/32] Fix classification pipelines --- mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json | 3 +-- mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json | 3 +-- mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json | 3 +-- 
.../sklearn.ensemble.GradientBoostingClassifier.json | 3 +-- .../sklearn.ensemble.RandomForestClassifier.json | 3 +-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json b/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json index 54d79d18..2e1e025e 100644 --- a/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json @@ -12,8 +12,7 @@ "mlprimitives.preprocessing.ClassEncoder", "sklearn.preprocessing.Imputer", "sklearn.preprocessing.StandardScaler", - "sklearn.ensemble.AdaBoostClassifier", - "mlprimitives.preprocessing.ClassDecoder" + "sklearn.ensemble.AdaBoostClassifier" ], "hyperparameters": { "sklearn.ensemble.AdaBoostClassifier#1": { diff --git a/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json b/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json index c8a8d4a5..74546278 100644 --- a/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json @@ -12,8 +12,7 @@ "mlprimitives.preprocessing.ClassEncoder", "sklearn.preprocessing.Imputer", "sklearn.preprocessing.StandardScaler", - "sklearn.ensemble.BaggingClassifier", - "mlprimitives.preprocessing.ClassDecoder" + "sklearn.ensemble.BaggingClassifier" ], "hyperparameters": { "sklearn.ensemble.BaggingClassifier#1": { diff --git a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json index 50660710..d2126353 100644 --- a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json @@ -12,8 +12,7 @@ "mlprimitives.preprocessing.ClassEncoder", "sklearn.preprocessing.Imputer", "sklearn.preprocessing.StandardScaler", - "sklearn.ensemble.ExtraTreesClassifier", - "mlprimitives.preprocessing.ClassDecoder" + "sklearn.ensemble.ExtraTreesClassifier" ], 
"hyperparameters": { "sklearn.ensemble.ExtraTreesClassifier#1": { diff --git a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json index 8cc73441..ad8090c3 100644 --- a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json @@ -12,8 +12,7 @@ "mlprimitives.preprocessing.ClassEncoder", "sklearn.preprocessing.Imputer", "sklearn.preprocessing.StandardScaler", - "sklearn.ensemble.GradientBoostingClassifier", - "mlprimitives.preprocessing.ClassDecoder" + "sklearn.ensemble.GradientBoostingClassifier" ], "hyperparameters": { "sklearn.ensemble.GradientBoostingClassifier#1": { diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json index 1ea222eb..655b52d7 100644 --- a/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json @@ -12,8 +12,7 @@ "mlprimitives.preprocessing.ClassEncoder", "sklearn.preprocessing.Imputer", "sklearn.preprocessing.StandardScaler", - "sklearn.ensemble.RandomForestClassifier", - "mlprimitives.preprocessing.ClassDecoder" + "sklearn.ensemble.RandomForestClassifier" ], "hyperparameters": { "sklearn.ensemble.RandomForestClassifier#1": { From 41ab90fe544834844f5b779a19eb28e8e55f0745 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Mon, 3 Dec 2018 14:55:54 +0100 Subject: [PATCH 20/32] Created pipeline tests for sklearn.decomposition --- ...earn.decomposition.DictionaryLearning.json | 20 +++++++++++++++++++ .../sklearn.decomposition.FactorAnalysis.json | 20 +++++++++++++++++++ .../sklearn.decomposition.FastICA.json | 19 ++++++++++++++++++ .../sklearn.decomposition.KernelPCA.json | 19 ++++++++++++++++++ .../sklearn.decomposition.PCA.json | 20 +++++++++++++++++++ .../sklearn.decomposition.TruncatedSVD.json | 20 
+++++++++++++++++++ 6 files changed, 118 insertions(+) create mode 100644 mlblocks_pipelines/sklearn.decomposition.DictionaryLearning.json create mode 100644 mlblocks_pipelines/sklearn.decomposition.FactorAnalysis.json create mode 100644 mlblocks_pipelines/sklearn.decomposition.FastICA.json create mode 100644 mlblocks_pipelines/sklearn.decomposition.KernelPCA.json create mode 100644 mlblocks_pipelines/sklearn.decomposition.PCA.json create mode 100644 mlblocks_pipelines/sklearn.decomposition.TruncatedSVD.json diff --git a/mlblocks_pipelines/sklearn.decomposition.DictionaryLearning.json b/mlblocks_pipelines/sklearn.decomposition.DictionaryLearning.json new file mode 100644 index 00000000..3658d1da --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.DictionaryLearning.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "FactorAnalysis/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.FactorAnalysis", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.FactorAnalysis#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.FactorAnalysis.json b/mlblocks_pipelines/sklearn.decomposition.FactorAnalysis.json new file mode 100644 index 00000000..3658d1da --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.FactorAnalysis.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "FactorAnalysis/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.FactorAnalysis", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + 
"sklearn.decomposition.FactorAnalysis#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.FastICA.json b/mlblocks_pipelines/sklearn.decomposition.FastICA.json new file mode 100644 index 00000000..488b1751 --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.FastICA.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "FastICA/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.FastICA", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.FastICA#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.KernelPCA.json b/mlblocks_pipelines/sklearn.decomposition.KernelPCA.json new file mode 100644 index 00000000..13e2c99f --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.KernelPCA.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "KernelPCA/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.KernelPCA", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.KernelPCA#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.PCA.json b/mlblocks_pipelines/sklearn.decomposition.PCA.json new file mode 100644 index 00000000..1cfb9084 --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.PCA.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "PCA/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.PCA", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" 
+ ], + "hyperparameters": { + "sklearn.decomposition.PCA#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.TruncatedSVD.json b/mlblocks_pipelines/sklearn.decomposition.TruncatedSVD.json new file mode 100644 index 00000000..ccecc11f --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.TruncatedSVD.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "TruncatedSVD/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.TruncatedSVD", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.TruncatedSVD#1": {} + } +} From d65828815e3a12247069e3e4422e99f77e2d71ae Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Mon, 3 Dec 2018 15:57:58 +0100 Subject: [PATCH 21/32] Changed sklearn.ensemble pipelines to have default values and added RandomTreesEmbedding --- .../sklearn.ensemble.AdaBoostClassifier.json | 5 +---- .../sklearn.ensemble.AdaBoostRegressor.json | 5 +---- .../sklearn.ensemble.BaggingClassifier.json | 7 +------ .../sklearn.ensemble.BaggingRegressor.json | 7 +------ ...sklearn.ensemble.ExtraTreesClassifier.json | 5 +---- .../sklearn.ensemble.ExtraTreesRegressor.json | 5 +---- ...n.ensemble.GradientBoostingClassifier.json | 8 +------ ...rn.ensemble.GradientBoostingRegressor.json | 6 +----- .../sklearn.ensemble.IsolationForest.json | 6 +----- ...learn.ensemble.RandomForestClassifier.json | 7 +------ ...klearn.ensemble.RandomForestRegressor.json | 6 +----- ...sklearn.ensemble.RandomTreesEmbedding.json | 21 +++++++++++++++++++ 12 files changed, 32 insertions(+), 56 deletions(-) create mode 100644 mlblocks_pipelines/sklearn.ensemble.RandomTreesEmbedding.json diff --git a/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json 
b/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json index 2e1e025e..414524fe 100644 --- a/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json @@ -15,9 +15,6 @@ "sklearn.ensemble.AdaBoostClassifier" ], "hyperparameters": { - "sklearn.ensemble.AdaBoostClassifier#1": { - "learning_rate": 0.1, - "n_estimators": 300 - } + "sklearn.ensemble.AdaBoostClassifier#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json b/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json index 87155b9a..74277d7f 100644 --- a/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json +++ b/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json @@ -14,9 +14,6 @@ "sklearn.ensemble.AdaBoostRegressor" ], "hyperparameters": { - "sklearn.ensemble.AdaBoostRegressor#1": { - "learning_rate": 0.1, - "n_estimators": 300 - } + "sklearn.ensemble.AdaBoostRegressor#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json b/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json index 74546278..c3cd98b4 100644 --- a/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json @@ -15,11 +15,6 @@ "sklearn.ensemble.BaggingClassifier" ], "hyperparameters": { - "sklearn.ensemble.BaggingClassifier#1": { - "n_jobs": -1, - "n_estimators": 300, - "max_samples": 1.0, - "bootstrap_features": true - } + "sklearn.ensemble.BaggingClassifier#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json b/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json index 3e3feb59..4f6dfdb4 100644 --- a/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json +++ b/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json @@ -14,12 +14,7 @@ "sklearn.ensemble.BaggingRegressor" ], "hyperparameters": { - "sklearn.ensemble.BaggingRegressor#1": { - "n_jobs": -1, - "warm_start": true, - "n_estimators": 
300 - - } + "sklearn.ensemble.BaggingRegressor#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json index d2126353..49790e86 100644 --- a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json @@ -15,9 +15,6 @@ "sklearn.ensemble.ExtraTreesClassifier" ], "hyperparameters": { - "sklearn.ensemble.ExtraTreesClassifier#1": { - "n_jobs": -1, - "n_estimators": 300 - } + "sklearn.ensemble.ExtraTreesClassifier#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json index c58e75d0..00b305c5 100644 --- a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json +++ b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json @@ -14,9 +14,6 @@ "sklearn.ensemble.ExtraTreesRegressor" ], "hyperparameters": { - "sklearn.ensemble.ExtraTreesRegressor#1": { - "n_jobs": -1, - "n_estimators": 300 - } + "sklearn.ensemble.ExtraTreesRegressor#1": {} } } \ No newline at end of file diff --git a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json index ad8090c3..7a92adfa 100644 --- a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json @@ -15,12 +15,6 @@ "sklearn.ensemble.GradientBoostingClassifier" ], "hyperparameters": { - "sklearn.ensemble.GradientBoostingClassifier#1": { - "warm_start": true, - "presort": true, - "learning_rate": 0.1, - "n_estimators": 300, - "max_depth": 5 - } + "sklearn.ensemble.GradientBoostingClassifier#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json index ec56d0d6..6d4ff0d7 100644 --- 
a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json +++ b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json @@ -14,10 +14,6 @@ "sklearn.ensemble.GradientBoostingRegressor" ], "hyperparameters": { - "sklearn.ensemble.GradientBoostingRegressor#1": { - "learning_rate": 0.1, - "n_estimators": 300, - "verbose": 1 - } + "sklearn.ensemble.GradientBoostingRegressor#1": {} } } \ No newline at end of file diff --git a/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json b/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json index 49135795..9b684039 100644 --- a/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json +++ b/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json @@ -15,10 +15,6 @@ "sklearn.ensemble.IsolationForest" ], "hyperparameters": { - "sklearn.ensemble.IsolationForest#1": { - "n_jobs": -1, - "n_estimators": 300, - "contamination": 0.2 - } + "sklearn.ensemble.IsolationForest#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json index 655b52d7..747f6118 100644 --- a/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json +++ b/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json @@ -15,11 +15,6 @@ "sklearn.ensemble.RandomForestClassifier" ], "hyperparameters": { - "sklearn.ensemble.RandomForestClassifier#1": { - "n_jobs": -1, - "n_estimators": 300, - "criterion": "entropy", - "warm_start": true - } + "sklearn.ensemble.RandomForestClassifier#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json b/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json index be89b1d1..0c9985e5 100644 --- a/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json +++ b/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json @@ -14,10 +14,6 @@ "sklearn.ensemble.RandomForestRegressor" ], "hyperparameters": { - "sklearn.ensemble.RandomForestRegressor#1": { - 
"n_jobs": -1, - "warm_start": true, - "n_estimators": 300 - } + "sklearn.ensemble.RandomForestRegressor#1": {} } } diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomTreesEmbedding.json b/mlblocks_pipelines/sklearn.ensemble.RandomTreesEmbedding.json new file mode 100644 index 00000000..d78157de --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.RandomTreesEmbedding.json @@ -0,0 +1,21 @@ +{ + "metadata": { + "name": "RandomTreesEmbedding/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomTreesEmbedding", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.ensemble.RandomTreesEmbedding#1": {} + } +} From 669b719bd022b985655bc5d223e7dd3672cfa090 Mon Sep 17 00:00:00 2001 From: Ihssan Date: Tue, 18 Dec 2018 15:54:27 +0200 Subject: [PATCH 22/32] Issue 47: added primitive for LSTM TimeSeries Regressor --- ...as.Sequential.LSTMTimeSeriesRegressor.json | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json diff --git a/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json b/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json new file mode 100644 index 00000000..82fc5948 --- /dev/null +++ b/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json @@ -0,0 +1,122 @@ +{ + "name": "keras.Sequential.LSTMTimeSeriesRegressor", + "author": "Ihssan Tinawi ", + "documentation": "", + "description": "This primitive consists of multiple Keras layers that can pass time-series data through an LSTM in order to predict the value at x_{t+1}", + "classifiers": { + "type": "estimator", + "subtype": "regressor" + }, + "modalities": [], + "primitive": 
"mlprimitives.adapters.keras.Sequential", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "array" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "array" + } + ] + }, + "hyperparameters": { + "fixed": { + "input_length": { + "type": "int", + "default": 1500 + }, + "dense_units": { + "type": "int", + "description": "Number of classes" + }, + "classification": { + "type": "bool", + "default": false + }, + "dense_activation": { + "type": "str", + "default": "tanh" + }, + "optimizer": { + "type": "str", + "default": "keras.optimizers.Adam" + }, + "loss": { + "type": "str", + "default": "keras.losses.mean_squared_error" + }, + "metrics": { + "type": "list", + "default": [ + "accuracy" + ] + }, + "layers": { + "type": "list", + "default": [ + { + "class": "keras.layers.Input", + "parameters": { + "shape": "input_shape" + } + }, + { + "class": "keras.layers.Dropout", + "parameters": { + "rate": "dropout_rate" + } + }, + { + "class": "keras.layers.LSTM", + "parameters": { + "units": "lstm_units" + } + }, + { + "class": "keras.layers.Dense", + "parameters": { + "units": "dense_units", + "activation": "dense_activation" + } + } + ] + } + }, + "tunable": { + "lstm_units": { + "type": "int", + "default": 50, + "range": [ + 1, + 500 + ] + }, + "dropout_rate": { + "type": "float", + "default": 0.1, + "range": [ + 0.01, + 0.75 + ] + } + } + } +} From 31631c97b95b9fd538e8c8459f5aaa5efd2868d4 Mon Sep 17 00:00:00 2001 From: Ihssan Date: Wed, 19 Dec 2018 18:15:27 +0200 Subject: [PATCH 23/32] Issue 53: added time series primitives and function --- ...ves.timeseries.aggregate_average_time.json | 42 +++++++++++++++ ...ves.timeseries.create_window_sequence.json | 41 +++++++++++++++ mlprimitives/timeseries.py | 52 +++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 
mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json create mode 100644 mlblocks_primitives/mlprimitives.timeseries.create_window_sequence.json create mode 100644 mlprimitives/timeseries.py diff --git a/mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json b/mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json new file mode 100644 index 00000000..63e1a3af --- /dev/null +++ b/mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json @@ -0,0 +1,42 @@ +{ + "name": "mlprimitives.timeseries.aggregate_average_time", + "author": "Ihssan Tinawi ", + "description": "mlprimitives.timeseries.aggregate_average_time", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_extractor" + }, + "modalities": ["timeseries"], + "primitive": "mlprimitives.timeseries.aggregate_average_time", + "produce": { + "args": [ + { + "name": "df_time_value", + "type": "Pandas.DataFrame" + }, + { + "name": "interval_time_delta", + "type": "int" + }, + { + "name": "start_time", + "type": "int" + }, + { + "name": "end_time", + "type": "int" + } + + ], + "output": [ + { + "name": "aggregated_df", + "type": "Pandas.DataFrame" + } + ] + }, + "hyperparameters": { + "fixed": {}, + "tunable": {} + } +} diff --git a/mlblocks_primitives/mlprimitives.timeseries.create_window_sequence.json b/mlblocks_primitives/mlprimitives.timeseries.create_window_sequence.json new file mode 100644 index 00000000..8d96f29f --- /dev/null +++ b/mlblocks_primitives/mlprimitives.timeseries.create_window_sequence.json @@ -0,0 +1,41 @@ +{ + "name": "mlprimitives.timeseries.create_window_sequence", + "author": "Ihssan Tinawi ", + "description": "mlprimitives.timeseries.create_window_sequence", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_extractor" + }, + "modalities": ["timeseries"], + "primitive": "mlprimitives.timeseries.create_window_sequence", + "produce": { + "args": [ + { + "name": "df_timeseries", + "type": 
"Pandas.DataFrame" + }, + { + "name": "window_size", + "type": "int" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "Y", + "type": "ndarray" + }, + { + "name": "time", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": {}, + "tunable": {} + } +} diff --git a/mlprimitives/timeseries.py b/mlprimitives/timeseries.py new file mode 100644 index 00000000..4681c296 --- /dev/null +++ b/mlprimitives/timeseries.py @@ -0,0 +1,52 @@ +import pandas as pd +import time + + +def create_window_sequences(df_timeseries, window_size): + """ + Function that takes in a Pandas.DataFrame and a window_size then creates output arrays that correspond to a timeseries sequence with window_size overlap. The output arrays can be fed into a timeseries forecasting model. + Inputs: + df_timeseries (Pandas.DataFrame): a Pandas dataframe which has 'timestamp' and 'value' columns, and is sorted based on timestamp. The timestamp column is in UNIX format (in seconds). + window_size (int): number of values that overlap to create the sequence. + Outputs: + x (numpy.ndarray): contains the time series sequenced data. + y (numpy.ndarray): acts as the label for the forecasting problem. + time (numpy.ndarray): the corresponding timestamps series. + """ + X = [] + Y = [] + time = [] + for i in range(len(df) - window_size): + X.append(df[i:i+window_size]['value'].values.copy().reshape([-1, 1])) + Y.append(df[i+1:i+window_size+1]['value'].values.copy().reshape([-1, 1])) + time.append(df.iloc[i+window_size]['timestamp']) + + return np.asarray(X), np.asarray(Y), np.asarray(time) + + +def aggregate_average_time(df_time_value, interval_time_delta, start_time, end_time): + """ + Function that aggregates data in a Pandas dataframe by averaging over a given interval. It starts averaging from specified start_time. + Inputs: + df_time_value (Pandas.DataFrame): a Pandas dataframe which has 'timestamp' and 'value' columns, and is sorted based on timestamp. 
The timestamp column is in UNIX format (in seconds). + interval_time_delta (int): an Integer denoting the number of seconds in the desired interval. + start_time (int): a UNIX time stamp indicating the time to start aggregating. Can be smaller than the smallest time stamp value in the dataframe. + end_time (int): a UNIX time stamp indicating the time to end aggregating. Can be larger than the largest time stamp value in the dataframe. + + Outputs: + aggregated_df (Pandas.DataFrame): a Pandas dataframe with two colums ('timestamp' and 'value'), where each `timestamp` is the starting time of an interval and the `value` is the result of aggregation. For intervals that don't have data in df_time_value but are still included in start_time and end_time then the value will be NaN. + + """ + start_ts = start_time + accepted_points = [] + while start_ts < end_time: + # average the values between start_ts, [start_ts + timedelta (e.g. 6hrs)] + upper_ts = start_ts + time_delta + mask = (df_time_value['timestamp'] > start_ts) & (df_time_value['timestamp'] <= upper_ts) + average_value = df.loc[mask]['value'].mean(skipna=True) + + accepted_points.append([start_ts, average_value]) + start_ts = upper_ts # update the timestamp + + new_df = pd.DataFrame(accepted_points, columns=['timestamp','value']) + return new_df From feb64b8b0c8c984f50feb8c9a229ebab248dab80 Mon Sep 17 00:00:00 2001 From: Ihssan Date: Wed, 19 Dec 2018 19:32:13 +0200 Subject: [PATCH 24/32] Issue 53: fixed lint and syntax errors --- ...s.timeseries.create_window_sequences.json} | 6 +-- mlprimitives/timeseries.py | 51 +++++++++++-------- 2 files changed, 34 insertions(+), 23 deletions(-) rename mlblocks_primitives/{mlprimitives.timeseries.create_window_sequence.json => mlprimitives.timeseries.create_window_sequences.json} (92%) diff --git a/mlblocks_primitives/mlprimitives.timeseries.create_window_sequence.json b/mlblocks_primitives/mlprimitives.timeseries.create_window_sequences.json similarity index 92% rename from 
mlblocks_primitives/mlprimitives.timeseries.create_window_sequence.json rename to mlblocks_primitives/mlprimitives.timeseries.create_window_sequences.json index 8d96f29f..51ccd336 100644 --- a/mlblocks_primitives/mlprimitives.timeseries.create_window_sequence.json +++ b/mlblocks_primitives/mlprimitives.timeseries.create_window_sequences.json @@ -1,13 +1,13 @@ { - "name": "mlprimitives.timeseries.create_window_sequence", + "name": "mlprimitives.timeseries.create_window_sequences", "author": "Ihssan Tinawi ", - "description": "mlprimitives.timeseries.create_window_sequence", + "description": "mlprimitives.timeseries.create_window_sequences", "classifiers": { "type": "preprocessor", "subtype": "feature_extractor" }, "modalities": ["timeseries"], - "primitive": "mlprimitives.timeseries.create_window_sequence", + "primitive": "mlprimitives.timeseries.create_window_sequences", "produce": { "args": [ { diff --git a/mlprimitives/timeseries.py b/mlprimitives/timeseries.py index 4681c296..317a422e 100644 --- a/mlprimitives/timeseries.py +++ b/mlprimitives/timeseries.py @@ -1,12 +1,16 @@ import pandas as pd -import time +import numpy as np def create_window_sequences(df_timeseries, window_size): """ - Function that takes in a Pandas.DataFrame and a window_size then creates output arrays that correspond to a timeseries sequence with window_size overlap. The output arrays can be fed into a timeseries forecasting model. + Function that takes in a Pandas.DataFrame and a window_size then creates + output arrays that correspond to a timeseries sequence with window_size overlap. + The output arrays can be fed into a timeseries forecasting model. Inputs: - df_timeseries (Pandas.DataFrame): a Pandas dataframe which has 'timestamp' and 'value' columns, and is sorted based on timestamp. The timestamp column is in UNIX format (in seconds). + df_timeseries (Pandas.DataFrame): a Pandas dataframe which has 'timestamp' + and 'value' columns, and is sorted based on timestamp. 
+ The timestamp column is in UNIX format (in seconds). window_size (int): number of values that overlap to create the sequence. Outputs: x (numpy.ndarray): contains the time series sequenced data. @@ -16,37 +20,44 @@ def create_window_sequences(df_timeseries, window_size): X = [] Y = [] time = [] - for i in range(len(df) - window_size): - X.append(df[i:i+window_size]['value'].values.copy().reshape([-1, 1])) - Y.append(df[i+1:i+window_size+1]['value'].values.copy().reshape([-1, 1])) - time.append(df.iloc[i+window_size]['timestamp']) - + for i in range(len(df_timeseries) - window_size): + X.append(df_timeseries[i: i + window_size]['value'].values.copy().reshape([-1, 1])) + Y.append(df_timeseries[i + 1: i + window_size + 1]['value'].values.copy().reshape([-1, 1])) + time.append(df_timeseries.iloc[i + window_size]['timestamp']) return np.asarray(X), np.asarray(Y), np.asarray(time) def aggregate_average_time(df_time_value, interval_time_delta, start_time, end_time): """ - Function that aggregates data in a Pandas dataframe by averaging over a given interval. It starts averaging from specified start_time. + Function that aggregates data in a Pandas dataframe by averaging over a given interval. + It starts averaging from specified start_time. Inputs: - df_time_value (Pandas.DataFrame): a Pandas dataframe which has 'timestamp' and 'value' columns, and is sorted based on timestamp. The timestamp column is in UNIX format (in seconds). - interval_time_delta (int): an Integer denoting the number of seconds in the desired interval. - start_time (int): a UNIX time stamp indicating the time to start aggregating. Can be smaller than the smallest time stamp value in the dataframe. - end_time (int): a UNIX time stamp indicating the time to end aggregating. Can be larger than the largest time stamp value in the dataframe. - + df_time_value (Pandas.DataFrame): a Pandas dataframe which has 'timestamp' + and 'value' columns, and is sorted based on timestamp. 
The timestamp + column is in UNIX format (in seconds). + interval_time_delta (int): an Integer denoting the number of seconds + in the desired interval. + start_time (int): a UNIX time stamp indicating the time to start + aggregating. Can be smaller than the smallest time stamp value in the dataframe. + end_time (int): a UNIX time stamp indicating the time to end aggregating. + Can be larger than the largest time stamp value in the dataframe. Outputs: - aggregated_df (Pandas.DataFrame): a Pandas dataframe with two colums ('timestamp' and 'value'), where each `timestamp` is the starting time of an interval and the `value` is the result of aggregation. For intervals that don't have data in df_time_value but are still included in start_time and end_time then the value will be NaN. - + aggregated_df (Pandas.DataFrame): a Pandas dataframe with two colums + ('timestamp' and 'value'), where each `timestamp` is the starting time of + an interval and the `value` is the result of aggregation. For intervals that + don't have data in df_time_value but are still included in start_time + and end_time then the value will be NaN. """ start_ts = start_time accepted_points = [] while start_ts < end_time: # average the values between start_ts, [start_ts + timedelta (e.g. 
6hrs)] - upper_ts = start_ts + time_delta + upper_ts = start_ts + interval_time_delta mask = (df_time_value['timestamp'] > start_ts) & (df_time_value['timestamp'] <= upper_ts) - average_value = df.loc[mask]['value'].mean(skipna=True) + average_value = df_time_value.loc[mask]['value'].mean(skipna=True) accepted_points.append([start_ts, average_value]) - start_ts = upper_ts # update the timestamp + start_ts = upper_ts # update the timestamp - new_df = pd.DataFrame(accepted_points, columns=['timestamp','value']) + new_df = pd.DataFrame(accepted_points, columns=['timestamp', 'value']) return new_df From 4e6f32102fddd5104e3b76d48c8489863889088c Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 19 Dec 2018 18:55:36 +0100 Subject: [PATCH 25/32] add google-compute-engine which travis complains about --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 4d028100..ece8de69 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ tests_require = [ 'mlblocks>=0.2.0', 'pytest>=3.4.2', + 'google-compute-engine==2.8.12', # required by travis ] From 46c1e98c22d6d6f68a38dfec3ba0ebf04d3dcd8e Mon Sep 17 00:00:00 2001 From: Ihssan Date: Thu, 20 Dec 2018 08:03:29 +0200 Subject: [PATCH 26/32] rewrote signatures of methods and adjusted json files accordingly --- ...ves.timeseries.aggregate_average_time.json | 33 ++++---- ....timeseries.rolling_window_sequences.json} | 22 +++--- mlprimitives/timeseries.py | 78 ++++++++++--------- 3 files changed, 70 insertions(+), 63 deletions(-) rename mlblocks_primitives/{mlprimitives.timeseries.create_window_sequences.json => mlprimitives.timeseries.rolling_window_sequences.json} (58%) diff --git a/mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json b/mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json index 63e1a3af..1703e604 100644 --- a/mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json +++ 
b/mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json @@ -1,42 +1,43 @@ { - "name": "mlprimitives.timeseries.aggregate_average_time", + "name": "mlprimitives.timeseries.time_segments_average", "author": "Ihssan Tinawi ", - "description": "mlprimitives.timeseries.aggregate_average_time", + "description": "mlprimitives.timeseries.time_segments_average", "classifiers": { "type": "preprocessor", "subtype": "feature_extractor" }, "modalities": ["timeseries"], - "primitive": "mlprimitives.timeseries.aggregate_average_time", + "primitive": "mlprimitives.timeseries.time_segments_average", "produce": { "args": [ { - "name": "df_time_value", - "type": "Pandas.DataFrame" + "name": "X", + "keyword": "time_value", + "type": "pandas.DataFrame" }, { - "name": "interval_time_delta", - "type": "int" + "name": "value_column", + "type": "str" }, { - "name": "start_time", - "type": "int" - }, - { - "name": "end_time", - "type": "int" + "name": "name_column", + "type": "str" } - ], "output": [ { "name": "aggregated_df", - "type": "Pandas.DataFrame" + "type": "pandas.DataFrame" } ] }, "hyperparameters": { - "fixed": {}, + "fixed": { + "interval": { + "type": "int", + "default": 3600 + } + }, "tunable": {} } } diff --git a/mlblocks_primitives/mlprimitives.timeseries.create_window_sequences.json b/mlblocks_primitives/mlprimitives.timeseries.rolling_window_sequences.json similarity index 58% rename from mlblocks_primitives/mlprimitives.timeseries.create_window_sequences.json rename to mlblocks_primitives/mlprimitives.timeseries.rolling_window_sequences.json index 51ccd336..92b4f034 100644 --- a/mlblocks_primitives/mlprimitives.timeseries.create_window_sequences.json +++ b/mlblocks_primitives/mlprimitives.timeseries.rolling_window_sequences.json @@ -1,22 +1,19 @@ { - "name": "mlprimitives.timeseries.create_window_sequences", + "name": "mlprimitives.timeseries.rolling_window_sequences", "author": "Ihssan Tinawi ", - "description": 
"mlprimitives.timeseries.create_window_sequences", + "description": "mlprimitives.timeseries.rolling_window_sequences", "classifiers": { "type": "preprocessor", "subtype": "feature_extractor" }, "modalities": ["timeseries"], - "primitive": "mlprimitives.timeseries.create_window_sequences", + "primitive": "mlprimitives.timeseries.rolling_window_sequences", "produce": { "args": [ { - "name": "df_timeseries", - "type": "Pandas.DataFrame" - }, - { - "name": "window_size", - "type": "int" + "name": "X", + "keyword": "time_value", + "type": "pandas.DataFrame" } ], "output": [ @@ -35,7 +32,12 @@ ] }, "hyperparameters": { - "fixed": {}, + "fixed": { + "window_size": { + "type": "int", + "default": 50 + } + }, "tunable": {} } } diff --git a/mlprimitives/timeseries.py b/mlprimitives/timeseries.py index 317a422e..c982b914 100644 --- a/mlprimitives/timeseries.py +++ b/mlprimitives/timeseries.py @@ -2,62 +2,66 @@ import numpy as np -def create_window_sequences(df_timeseries, window_size): +def rolling_window_sequences(X, window_size, value_column, time_column): """ - Function that takes in a Pandas.DataFrame and a window_size then creates + Function that takes in a pandas.DataFrame and a window_size then creates output arrays that correspond to a timeseries sequence with window_size overlap. The output arrays can be fed into a timeseries forecasting model. - Inputs: - df_timeseries (Pandas.DataFrame): a Pandas dataframe which has 'timestamp' + Assumes the input is timeseries sorted. + Args: + X (pandas.DataFrame): a pandas dataframe which has 'timestamp' and 'value' columns, and is sorted based on timestamp. The timestamp column is in UNIX format (in seconds). window_size (int): number of values that overlap to create the sequence. - Outputs: - x (numpy.ndarray): contains the time series sequenced data. - y (numpy.ndarray): acts as the label for the forecasting problem. - time (numpy.ndarray): the corresponding timestamps series. 
+ value_column (string): name of column that has the value field. + time_column (string): name of column that has the time field. + Returns: + (numpy.ndarray): contains the time series sequenced data with each + entry having window_size rows. + (numpy.ndarray): acts as the label for the forecasting problem with + each entry having window_size rows. + (numpy.ndarray): the corresponding timestamps series. """ - X = [] + output_X = [] Y = [] time = [] - for i in range(len(df_timeseries) - window_size): - X.append(df_timeseries[i: i + window_size]['value'].values.copy().reshape([-1, 1])) - Y.append(df_timeseries[i + 1: i + window_size + 1]['value'].values.copy().reshape([-1, 1])) - time.append(df_timeseries.iloc[i + window_size]['timestamp']) - return np.asarray(X), np.asarray(Y), np.asarray(time) + for i in range(len(X) - window_size): + # reshape into a vector to fit into a neural network model (vectorize it) + output_X.append(X[i: i + window_size][value_column].values.copy().reshape([-1, 1])) + Y.append(X[i + window_size + 1][value_column].values.copy().reshape([-1, 1])) + time.append(X.iloc[i + window_size][time_column]) + return np.asarray(output_X), np.asarray(Y), np.asarray(time) -def aggregate_average_time(df_time_value, interval_time_delta, start_time, end_time): + +def time_segments_average(X, interval, value_column, time_column): """ - Function that aggregates data in a Pandas dataframe by averaging over a given interval. - It starts averaging from specified start_time. - Inputs: - df_time_value (Pandas.DataFrame): a Pandas dataframe which has 'timestamp' - and 'value' columns, and is sorted based on timestamp. The timestamp - column is in UNIX format (in seconds). - interval_time_delta (int): an Integer denoting the number of seconds + function that aggregates data in a pandas dataframe by averaging over a given interval. + it starts averaging from the smallest timestamp in the dataframe and ends at the + largest timestamp. 
assumes the input is timeseries sorted. + args: + X (pandas.dataframe): a pandas dataframe which has 'timestamp' + and 'value' columns, and is sorted based on timestamp. the timestamp + column is in unix format (in seconds). + interval (int): an integer denoting the number of seconds in the desired interval. - start_time (int): a UNIX time stamp indicating the time to start - aggregating. Can be smaller than the smallest time stamp value in the dataframe. - end_time (int): a UNIX time stamp indicating the time to end aggregating. - Can be larger than the largest time stamp value in the dataframe. - Outputs: - aggregated_df (Pandas.DataFrame): a Pandas dataframe with two colums + value_column (string): name of column that has the value field. + time_column (string): name of column that has the time field. + returns: + pandas.dataframe: a pandas dataframe with two colums ('timestamp' and 'value'), where each `timestamp` is the starting time of - an interval and the `value` is the result of aggregation. For intervals that - don't have data in df_time_value but are still included in start_time - and end_time then the value will be NaN. + an interval and the `value` is the result of aggregation. """ - start_ts = start_time + start_ts = X[time_column].iloc[0] # min value + end_time = X[time_column].iloc[-1] # max value in dataframe accepted_points = [] while start_ts < end_time: # average the values between start_ts, [start_ts + timedelta (e.g. 
6hrs)] - upper_ts = start_ts + interval_time_delta - mask = (df_time_value['timestamp'] > start_ts) & (df_time_value['timestamp'] <= upper_ts) - average_value = df_time_value.loc[mask]['value'].mean(skipna=True) + upper_ts = start_ts + interval + mask = (X[time_column] > start_ts) & (X[time_column] <= upper_ts) + average_value = X.loc[mask][value_column].mean(skipna=True) accepted_points.append([start_ts, average_value]) start_ts = upper_ts # update the timestamp - new_df = pd.DataFrame(accepted_points, columns=['timestamp', 'value']) - return new_df + return pd.DataFrame(accepted_points, columns=[time_column, value_column]) From 3372d04f11165688c7b7d8f6c2ece134ae102745 Mon Sep 17 00:00:00 2001 From: Ihssan Date: Thu, 20 Dec 2018 11:43:59 +0200 Subject: [PATCH 27/32] Removed instances of copy() from dataframes --- mlprimitives/timeseries.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mlprimitives/timeseries.py b/mlprimitives/timeseries.py index c982b914..9b0750d8 100644 --- a/mlprimitives/timeseries.py +++ b/mlprimitives/timeseries.py @@ -1,5 +1,5 @@ -import pandas as pd import numpy as np +import pandas as pd def rolling_window_sequences(X, window_size, value_column, time_column): @@ -10,46 +10,46 @@ def rolling_window_sequences(X, window_size, value_column, time_column): Assumes the input is timeseries sorted. Args: X (pandas.DataFrame): a pandas dataframe which has 'timestamp' - and 'value' columns, and is sorted based on timestamp. + and 'value' columns, and is sorted based on timestamp. The timestamp column is in UNIX format (in seconds). window_size (int): number of values that overlap to create the sequence. value_column (string): name of column that has the value field. time_column (string): name of column that has the time field. Returns: - (numpy.ndarray): contains the time series sequenced data with each + (numpy.ndarray): contains the time series sequenced data with each entry having window_size rows. 
- (numpy.ndarray): acts as the label for the forecasting problem with + (numpy.ndarray): acts as the label for the forecasting problem with each entry having window_size rows. (numpy.ndarray): the corresponding timestamps series. """ output_X = [] - Y = [] + y = [] time = [] for i in range(len(X) - window_size): # reshape into a vector to fit into a neural network model (vectorize it) - output_X.append(X[i: i + window_size][value_column].values.copy().reshape([-1, 1])) - Y.append(X[i + window_size + 1][value_column].values.copy().reshape([-1, 1])) + output_X.append(X[i: i + window_size][value_column].values.reshape([-1, 1])) + y.append(X[i + window_size + 1][value_column].values.reshape([-1, 1])) time.append(X.iloc[i + window_size][time_column]) - return np.asarray(output_X), np.asarray(Y), np.asarray(time) + return np.asarray(output_X), np.asarray(y), np.asarray(time) def time_segments_average(X, interval, value_column, time_column): """ - function that aggregates data in a pandas dataframe by averaging over a given interval. + function that aggregates data in a pandas dataframe by averaging over a given interval. it starts averaging from the smallest timestamp in the dataframe and ends at the largest timestamp. assumes the input is timeseries sorted. args: - X (pandas.dataframe): a pandas dataframe which has 'timestamp' + X (pandas.dataframe): a pandas dataframe which has 'timestamp' and 'value' columns, and is sorted based on timestamp. the timestamp column is in unix format (in seconds). - interval (int): an integer denoting the number of seconds + interval (int): an integer denoting the number of seconds in the desired interval. value_column (string): name of column that has the value field. time_column (string): name of column that has the time field. 
returns: - pandas.dataframe: a pandas dataframe with two colums - ('timestamp' and 'value'), where each `timestamp` is the starting time of + pandas.dataframe: a pandas dataframe with two colums + ('timestamp' and 'value'), where each `timestamp` is the starting time of an interval and the `value` is the result of aggregation. """ start_ts = X[time_column].iloc[0] # min value From 7724bceb260388d3f4a1e98fb2068c132003461a Mon Sep 17 00:00:00 2001 From: Ihssan Date: Thu, 20 Dec 2018 15:54:38 +0200 Subject: [PATCH 28/32] Issue 53: fixed naming issue with json files --- ...me.json => mlprimitives.timeseries.time_segments_average.json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mlblocks_primitives/{mlprimitives.timeseries.aggregate_average_time.json => mlprimitives.timeseries.time_segments_average.json} (100%) diff --git a/mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json b/mlblocks_primitives/mlprimitives.timeseries.time_segments_average.json similarity index 100% rename from mlblocks_primitives/mlprimitives.timeseries.aggregate_average_time.json rename to mlblocks_primitives/mlprimitives.timeseries.time_segments_average.json From 33e3dc1e2cb01f9b3e02d23735735ee02d63d2fc Mon Sep 17 00:00:00 2001 From: Ihssan Date: Thu, 20 Dec 2018 19:04:58 +0200 Subject: [PATCH 29/32] Issue 47: changed description of dense_units in primitive json --- .../keras.Sequential.LSTMTimeSeriesRegressor.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json b/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json index 82fc5948..a97e27ce 100644 --- a/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json +++ b/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json @@ -2,7 +2,7 @@ "name": "keras.Sequential.LSTMTimeSeriesRegressor", "author": "Ihssan Tinawi ", "documentation": "", - "description": "This primitive consists of multiple Keras 
layers that can pass time-series data through an LSTM in order to predict the value at x_{t+1}", + "description": "This primitive consists of multiple Keras layers that can pass time-series data through an LSTM in order to predict the next n values.", "classifiers": { "type": "estimator", "subtype": "regressor" @@ -45,7 +45,8 @@ }, "dense_units": { "type": "int", - "description": "Number of classes" + "description": "Number of values ahead to predict", + "default": 1 }, "classification": { "type": "bool", From b2c5c109face49378ad7085b380959ac437c80f1 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 3 Jan 2019 16:20:27 +0100 Subject: [PATCH 30/32] Fix index usage in featuretools.dfs --- .../multi_table.classification.default.json | 34 +++++++++++++++++++ ... single_table.classification.default.json} | 4 +-- mlblocks_primitives/featuretools.dfs.json | 6 ++++ mlprimitives/adapters/featuretools.py | 23 ++++++++++--- setup.py | 2 +- 5 files changed, 61 insertions(+), 8 deletions(-) create mode 100644 mlblocks_pipelines/multi_table.classification.default.json rename mlblocks_pipelines/{tabular.classification.default.json => single_table.classification.default.json} (87%) diff --git a/mlblocks_pipelines/multi_table.classification.default.json b/mlblocks_pipelines/multi_table.classification.default.json new file mode 100644 index 00000000..d1f4aadd --- /dev/null +++ b/mlblocks_pipelines/multi_table.classification.default.json @@ -0,0 +1,34 @@ +{ + "metadata": { + "name": "multi_table/classification/default", + "data_type": "multi_table", + "task_type": "classification" + }, + "validation": { + "dataset": "wikiqa", + "context": { + "entities": "$entities", + "relationships": "$relationships", + "target_entity": "data" + } + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "featuretools.dfs", + "xgboost.XGBClassifier", + "mlprimitives.preprocessing.ClassDecoder" + ], + "hyperparameters": { + "featuretools.dfs#1": { + "encode": true + }, + 
"xgboost.XGBClassifier#1": { + "n_jobs": -1, + "learning_rate": 0.1, + "n_estimators": 300, + "max_depth": 3, + "gamma": 0, + "min_child_weight": 1 + } + } +} diff --git a/mlblocks_pipelines/tabular.classification.default.json b/mlblocks_pipelines/single_table.classification.default.json similarity index 87% rename from mlblocks_pipelines/tabular.classification.default.json rename to mlblocks_pipelines/single_table.classification.default.json index 6c4f00cf..b5b8830b 100644 --- a/mlblocks_pipelines/tabular.classification.default.json +++ b/mlblocks_pipelines/single_table.classification.default.json @@ -1,7 +1,7 @@ { "metadata": { - "name": "tabular/classification/default", - "data_type": "tabular", + "name": "single_table/classification/default", + "data_type": "single_table", "task_type": "classification" }, "validation": { diff --git a/mlblocks_primitives/featuretools.dfs.json b/mlblocks_primitives/featuretools.dfs.json index 4c63e729..d79bab43 100644 --- a/mlblocks_primitives/featuretools.dfs.json +++ b/mlblocks_primitives/featuretools.dfs.json @@ -75,6 +75,12 @@ ] }, "hyperparameters": { + "fixed": { + "copy": { + "type": "bool", + "default": false + } + }, "tunable": { "max_depth": { "type": "int", diff --git a/mlprimitives/adapters/featuretools.py b/mlprimitives/adapters/featuretools.py index 498deae6..c2250bde 100644 --- a/mlprimitives/adapters/featuretools.py +++ b/mlprimitives/adapters/featuretools.py @@ -8,7 +8,8 @@ class DFS(object): features = None - def __init__(self, max_depth=None, encode=True, remove_low_information=True): + def __init__(self, max_depth=None, encode=True, remove_low_information=True, copy=False): + self.copy = copy self.max_depth = max_depth self.encode = encode self.remove_low_information = remove_low_information @@ -19,10 +20,22 @@ def __repr__(self): " remove_low_information={remove_low_information})" ).format(**self.__dict__) + def _get_index(self, X): + if self.copy: + X = X.copy() + + index = X.index.name or 'index' + while 
index in X.columns: + index = '_' + index + + X.index.name = index + X.reset_index(inplace=True) + + return X, index + def _get_entityset(self, X, target_entity, entities, relationships): if entities is None: - index = X.index.name - X = X.reset_index() + X, index = self._get_index(X) entities = { target_entity: (X, index) } @@ -32,7 +45,7 @@ def _get_entityset(self, X, target_entity, entities, relationships): return ft.EntitySet('entityset', entities, relationships) - def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relationships=None): + def dfs(self, X=None, target_entity='X', entityset=None, entities=None, relationships=None): if entityset is None: entityset = self._get_entityset(X, target_entity, entities, relationships) @@ -44,7 +57,7 @@ def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relatio if time_index: cutoff_time = target.df[[index, time_index]] - instance_ids = X.index.values.copy() + instance_ids = X[index].values.copy() self.features = ft.dfs( cutoff_time=cutoff_time, diff --git a/setup.py b/setup.py index ece8de69..220c6655 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ tests_require = [ - 'mlblocks>=0.2.0', + 'mlblocks>=0.2.4', 'pytest>=3.4.2', 'google-compute-engine==2.8.12', # required by travis ] From c05dcee01b2516242fa92ed38f6611d3e4858747 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 3 Jan 2019 19:48:52 +0100 Subject: [PATCH 31/32] Fix stopwrods typo --- mlblocks_primitives/mlprimitives.text.TextCleaner.json | 2 +- mlprimitives/text.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlblocks_primitives/mlprimitives.text.TextCleaner.json b/mlblocks_primitives/mlprimitives.text.TextCleaner.json index d9b3e77e..11849764 100644 --- a/mlblocks_primitives/mlprimitives.text.TextCleaner.json +++ b/mlblocks_primitives/mlprimitives.text.TextCleaner.json @@ -52,7 +52,7 @@ "type": "bool", "default": true }, - "stopwrods": { + "stopwords": { "type": "bool", "default": 
true }, diff --git a/mlprimitives/text.py b/mlprimitives/text.py index 4bdcfea6..1120272e 100644 --- a/mlprimitives/text.py +++ b/mlprimitives/text.py @@ -24,7 +24,7 @@ class TextCleaner(object): STOPWORDS = dict() def __init__(self, column=None, language='multi', lower=True, accents=True, - stopwrods=True, non_alpha=True, single_chars=True): + stopwords=True, non_alpha=True, single_chars=True): self.column = column self.language = language self.language_code = None From e30267fc0ea907a505b431204d72c2d689291685 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 3 Jan 2019 20:40:04 +0100 Subject: [PATCH 32/32] Fix SingleLayerCNNImageClassifier annotation --- ...ential.SingleLayerCNNImagelClassifier.json | 33 +++++++++++++++++++ ...uential.SingleLayerCNNImageClassifier.json | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 mlblocks_pipelines/keras.Sequential.SingleLayerCNNImagelClassifier.json diff --git a/mlblocks_pipelines/keras.Sequential.SingleLayerCNNImagelClassifier.json b/mlblocks_pipelines/keras.Sequential.SingleLayerCNNImagelClassifier.json new file mode 100644 index 00000000..0ae8cd78 --- /dev/null +++ b/mlblocks_pipelines/keras.Sequential.SingleLayerCNNImagelClassifier.json @@ -0,0 +1,33 @@ +{ + "metadata": { + "name": "keras.Sequential.SingleLayerCNNImageClassifier", + "data_type": "image", + "task_type": "classification" + }, + "validation": { + "dataset": "usps", + "context": {} + }, + "primitives": [ + "mlprimitives.counters.UniqueCounter", + "keras.Sequential.SingleLayerCNNImageClassifier" + ], + "input_names": { + "mlprimitives.counters.UniqueCounter#1": { + "X": "y" + } + }, + "output_names": { + "mlprimitives.counters.UniqueCounter#1": { + "counts": "classes" + } + }, + "init_params": { + "mlprimitives.counters.UniqueCounter#1": { + "add": 1 + }, + "keras.Sequential.SingleLayerCNNImageClassifier#1": { + "epochs": 5 + } + } +} diff --git a/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json 
b/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json index 21f7c166..8141c508 100644 --- a/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json +++ b/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json @@ -110,7 +110,7 @@ { "class": "keras.layers.Dense", "parameters": { - "units": "dense_units", + "units": "classes", "activation": "dense_activation" } }