diff --git a/mlblocks_pipelines/keras.Sequential.SingleLayerCNNImagelClassifier.json b/mlblocks_pipelines/keras.Sequential.SingleLayerCNNImagelClassifier.json new file mode 100644 index 00000000..0ae8cd78 --- /dev/null +++ b/mlblocks_pipelines/keras.Sequential.SingleLayerCNNImagelClassifier.json @@ -0,0 +1,33 @@ +{ + "metadata": { + "name": "keras.Sequential.SingleLayerCNNImageClassifier", + "data_type": "image", + "task_type": "classification" + }, + "validation": { + "dataset": "usps", + "context": {} + }, + "primitives": [ + "mlprimitives.counters.UniqueCounter", + "keras.Sequential.SingleLayerCNNImageClassifier" + ], + "input_names": { + "mlprimitives.counters.UniqueCounter#1": { + "X": "y" + } + }, + "output_names": { + "mlprimitives.counters.UniqueCounter#1": { + "counts": "classes" + } + }, + "init_params": { + "mlprimitives.counters.UniqueCounter#1": { + "add": 1 + }, + "keras.Sequential.SingleLayerCNNImageClassifier#1": { + "epochs": 5 + } + } +} diff --git a/mlblocks_pipelines/multi_table.classification.default.json b/mlblocks_pipelines/multi_table.classification.default.json new file mode 100644 index 00000000..d1f4aadd --- /dev/null +++ b/mlblocks_pipelines/multi_table.classification.default.json @@ -0,0 +1,34 @@ +{ + "metadata": { + "name": "multi_table/classification/default", + "data_type": "multi_table", + "task_type": "classification" + }, + "validation": { + "dataset": "wikiqa", + "context": { + "entities": "$entities", + "relationships": "$relationships", + "target_entity": "data" + } + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "featuretools.dfs", + "xgboost.XGBClassifier", + "mlprimitives.preprocessing.ClassDecoder" + ], + "hyperparameters": { + "featuretools.dfs#1": { + "encode": true + }, + "xgboost.XGBClassifier#1": { + "n_jobs": -1, + "learning_rate": 0.1, + "n_estimators": 300, + "max_depth": 3, + "gamma": 0, + "min_child_weight": 1 + } + } +} diff --git a/mlblocks_pipelines/tabular.classification.default.json 
b/mlblocks_pipelines/single_table.classification.default.json similarity index 87% rename from mlblocks_pipelines/tabular.classification.default.json rename to mlblocks_pipelines/single_table.classification.default.json index 6c4f00cf..b5b8830b 100644 --- a/mlblocks_pipelines/tabular.classification.default.json +++ b/mlblocks_pipelines/single_table.classification.default.json @@ -1,7 +1,7 @@ { "metadata": { - "name": "tabular/classification/default", - "data_type": "tabular", + "name": "single_table/classification/default", + "data_type": "single_table", "task_type": "classification" }, "validation": { diff --git a/mlblocks_pipelines/sklearn.decomposition.DictionaryLearning.json b/mlblocks_pipelines/sklearn.decomposition.DictionaryLearning.json new file mode 100644 index 00000000..3658d1da --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.DictionaryLearning.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "FactorAnalysis/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.FactorAnalysis", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.FactorAnalysis#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.FactorAnalysis.json b/mlblocks_pipelines/sklearn.decomposition.FactorAnalysis.json new file mode 100644 index 00000000..3658d1da --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.FactorAnalysis.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "FactorAnalysis/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.FactorAnalysis", + "sklearn.preprocessing.StandardScaler", + 
"sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.FactorAnalysis#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.FastICA.json b/mlblocks_pipelines/sklearn.decomposition.FastICA.json new file mode 100644 index 00000000..488b1751 --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.FastICA.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "FastICA/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.FastICA", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.FastICA#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.KernelPCA.json b/mlblocks_pipelines/sklearn.decomposition.KernelPCA.json new file mode 100644 index 00000000..13e2c99f --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.KernelPCA.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "KernelPCA/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.KernelPCA", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.KernelPCA#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.PCA.json b/mlblocks_pipelines/sklearn.decomposition.PCA.json new file mode 100644 index 00000000..1cfb9084 --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.PCA.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "PCA/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.PCA", + 
"sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.PCA#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.decomposition.TruncatedSVD.json b/mlblocks_pipelines/sklearn.decomposition.TruncatedSVD.json new file mode 100644 index 00000000..ccecc11f --- /dev/null +++ b/mlblocks_pipelines/sklearn.decomposition.TruncatedSVD.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "TruncatedSVD/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "sklearn.decomposition.DictionaryLearning", + "sklearn.decomposition.TruncatedSVD", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.decomposition.TruncatedSVD#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json b/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json new file mode 100644 index 00000000..414524fe --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.AdaBoostClassifier.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "AdaBoostClassifier/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.AdaBoostClassifier" + ], + "hyperparameters": { + "sklearn.ensemble.AdaBoostClassifier#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json b/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json new file mode 100644 index 00000000..74277d7f --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.AdaBoostRegressor.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "AdaBoostRegressor/regression/default", + "data_type": "tabular", + 
"task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.AdaBoostRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.AdaBoostRegressor#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json b/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json new file mode 100644 index 00000000..c3cd98b4 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.BaggingClassifier.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "BaggingClassifier/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.BaggingClassifier" + ], + "hyperparameters": { + "sklearn.ensemble.BaggingClassifier#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json b/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json new file mode 100644 index 00000000..4f6dfdb4 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.BaggingRegressor.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "BaggingRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.BaggingRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.BaggingRegressor#1": {} + } +} + diff --git a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json new file mode 100644 index 00000000..49790e86 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesClassifier.json @@ -0,0 +1,20 @@ +{ 
+ "metadata": { + "name": "ExtraTreesClassifier/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.ExtraTreesClassifier" + ], + "hyperparameters": { + "sklearn.ensemble.ExtraTreesClassifier#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json new file mode 100644 index 00000000..00b305c5 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.ExtraTreesRegressor.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "ExtraTreesRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.ExtraTreesRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.ExtraTreesRegressor#1": {} + } +} \ No newline at end of file diff --git a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json new file mode 100644 index 00000000..7a92adfa --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingClassifier.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "GradientBoosting/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.GradientBoostingClassifier" + ], + "hyperparameters": { + "sklearn.ensemble.GradientBoostingClassifier#1": {} + } +} diff --git 
a/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json new file mode 100644 index 00000000..6d4ff0d7 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.GradientBoostingRegressor.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "GradientBoostingRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.GradientBoostingRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.GradientBoostingRegressor#1": {} + } +} \ No newline at end of file diff --git a/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json b/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json new file mode 100644 index 00000000..9b684039 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.IsolationForest.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "IsolationForest/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.IsolationForest" + ], + "hyperparameters": { + "sklearn.ensemble.IsolationForest#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json new file mode 100644 index 00000000..747f6118 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.RandomForestClassifier.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "name": "RandomForestClassifier/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + 
"mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.ensemble.RandomForestClassifier#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json b/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json new file mode 100644 index 00000000..0c9985e5 --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.RandomForestRegressor.json @@ -0,0 +1,19 @@ +{ + "metadata": { + "name": "RandomForestRegressor/regression/default", + "data_type": "tabular", + "task_type": "regression" + }, + "validation": { + "dataset": "boston", + "context": {} + }, + "primitives": [ + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomForestRegressor" + ], + "hyperparameters": { + "sklearn.ensemble.RandomForestRegressor#1": {} + } +} diff --git a/mlblocks_pipelines/sklearn.ensemble.RandomTreesEmbedding.json b/mlblocks_pipelines/sklearn.ensemble.RandomTreesEmbedding.json new file mode 100644 index 00000000..d78157de --- /dev/null +++ b/mlblocks_pipelines/sklearn.ensemble.RandomTreesEmbedding.json @@ -0,0 +1,21 @@ +{ + "metadata": { + "name": "RandomTreesEmbedding/classification/default", + "data_type": "tabular", + "task_type": "classification" + }, + "validation": { + "dataset": "iris", + "context": {} + }, + "primitives": [ + "mlprimitives.preprocessing.ClassEncoder", + "sklearn.preprocessing.Imputer", + "sklearn.preprocessing.StandardScaler", + "sklearn.ensemble.RandomTreesEmbedding", + "sklearn.ensemble.RandomForestClassifier" + ], + "hyperparameters": { + "sklearn.ensemble.RandomTreesEmbedding#1": {} + } +} diff --git a/mlblocks_primitives/featuretools.dfs.json b/mlblocks_primitives/featuretools.dfs.json index 4c63e729..d79bab43 100644 --- a/mlblocks_primitives/featuretools.dfs.json +++ b/mlblocks_primitives/featuretools.dfs.json @@ -75,6 +75,12 @@ ] }, 
"hyperparameters": { + "fixed": { + "copy": { + "type": "bool", + "default": false + } + }, "tunable": { "max_depth": { "type": "int", diff --git a/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json b/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json new file mode 100644 index 00000000..a97e27ce --- /dev/null +++ b/mlblocks_primitives/keras.Sequential.LSTMTimeSeriesRegressor.json @@ -0,0 +1,123 @@ +{ + "name": "keras.Sequential.LSTMTimeSeriesRegressor", + "author": "Ihssan Tinawi ", + "documentation": "", + "description": "This primitive consists of multiple Keras layers that can pass time-series data through an LSTM in order to predict the next n values.", + "classifiers": { + "type": "estimator", + "subtype": "regressor" + }, + "modalities": [], + "primitive": "mlprimitives.adapters.keras.Sequential", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "array" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "array" + } + ] + }, + "hyperparameters": { + "fixed": { + "input_length": { + "type": "int", + "default": 1500 + }, + "dense_units": { + "type": "int", + "description": "Number of values ahead to predict", + "default": 1 + }, + "classification": { + "type": "bool", + "default": false + }, + "dense_activation": { + "type": "str", + "default": "tanh" + }, + "optimizer": { + "type": "str", + "default": "keras.optimizers.Adam" + }, + "loss": { + "type": "str", + "default": "keras.losses.mean_squared_error" + }, + "metrics": { + "type": "list", + "default": [ + "accuracy" + ] + }, + "layers": { + "type": "list", + "default": [ + { + "class": "keras.layers.Input", + "parameters": { + "shape": "input_shape" + } + }, + { + "class": "keras.layers.Dropout", + "parameters": { + "rate": "dropout_rate" + } + }, + { + "class": "keras.layers.LSTM", + "parameters": { + "units": 
"lstm_units" + } + }, + { + "class": "keras.layers.Dense", + "parameters": { + "units": "dense_units", + "activation": "dense_activation" + } + } + ] + } + }, + "tunable": { + "lstm_units": { + "type": "int", + "default": 50, + "range": [ + 1, + 500 + ] + }, + "dropout_rate": { + "type": "float", + "default": 0.1, + "range": [ + 0.01, + 0.75 + ] + } + } + } +} diff --git a/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json b/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json index 21f7c166..8141c508 100644 --- a/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json +++ b/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json @@ -110,7 +110,7 @@ { "class": "keras.layers.Dense", "parameters": { - "units": "dense_units", + "units": "classes", "activation": "dense_activation" } } diff --git a/mlblocks_primitives/mlprimitives.text.TextCleaner.json b/mlblocks_primitives/mlprimitives.text.TextCleaner.json index d9b3e77e..11849764 100644 --- a/mlblocks_primitives/mlprimitives.text.TextCleaner.json +++ b/mlblocks_primitives/mlprimitives.text.TextCleaner.json @@ -52,7 +52,7 @@ "type": "bool", "default": true }, - "stopwrods": { + "stopwords": { "type": "bool", "default": true }, diff --git a/mlblocks_primitives/mlprimitives.timeseries.rolling_window_sequences.json b/mlblocks_primitives/mlprimitives.timeseries.rolling_window_sequences.json new file mode 100644 index 00000000..92b4f034 --- /dev/null +++ b/mlblocks_primitives/mlprimitives.timeseries.rolling_window_sequences.json @@ -0,0 +1,43 @@ +{ + "name": "mlprimitives.timeseries.rolling_window_sequences", + "author": "Ihssan Tinawi ", + "description": "mlprimitives.timeseries.rolling_window_sequences", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_extractor" + }, + "modalities": ["timeseries"], + "primitive": "mlprimitives.timeseries.rolling_window_sequences", + "produce": { + "args": [ + { + "name": "X", + "keyword": 
"time_value", + "type": "pandas.DataFrame" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "Y", + "type": "ndarray" + }, + { + "name": "time", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "window_size": { + "type": "int", + "default": 50 + } + }, + "tunable": {} + } +} diff --git a/mlblocks_primitives/mlprimitives.timeseries.time_segments_average.json b/mlblocks_primitives/mlprimitives.timeseries.time_segments_average.json new file mode 100644 index 00000000..1703e604 --- /dev/null +++ b/mlblocks_primitives/mlprimitives.timeseries.time_segments_average.json @@ -0,0 +1,43 @@ +{ + "name": "mlprimitives.timeseries.time_segments_average", + "author": "Ihssan Tinawi ", + "description": "mlprimitives.timeseries.time_segments_average", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_extractor" + }, + "modalities": ["timeseries"], + "primitive": "mlprimitives.timeseries.time_segments_average", + "produce": { + "args": [ + { + "name": "X", + "keyword": "time_value", + "type": "pandas.DataFrame" + }, + { + "name": "value_column", + "type": "str" + }, + { + "name": "name_column", + "type": "str" + } + ], + "output": [ + { + "name": "aggregated_df", + "type": "pandas.DataFrame" + } + ] + }, + "hyperparameters": { + "fixed": { + "interval": { + "type": "int", + "default": 3600 + } + }, + "tunable": {} + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json b/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json new file mode 100644 index 00000000..9515a05a --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.DictionaryLearning.json @@ -0,0 +1,109 @@ +{ + "name": "sklearn.decomposition.DictionaryLearning", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.DictionaryLearning.html", + "description": "Dictionary learning.", + "classifiers": { + "type": 
"preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.DictionaryLearning", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": null + }, + "code_init": { + "type": "iterable", + "default": null + }, + "dict_init": { + "type": "iterable", + "default": null + }, + "verbose": { + "type": "bool", + "default": false + }, + "positive_code": { + "type": "bool", + "default": false + }, + "positive_dict": { + "type": "bool", + "default": false + } + }, + "tunable": { + "n_components": { + "type": "int", + "default": null + }, + "alpha": { + "type": "float", + "default": 1.0, + "range": [0.0, 10.0] + }, + "max_iter": { + "type": "int", + "default": 1000, + "range": [0, 10000] + }, + "tol": { + "type": "float", + "default": 1e-08, + "range": [0.0, 1.0] + }, + "fit_algorithm": { + "type": "str", + "default": "lars", + "values": ["lars", "cd"] + }, + "transform_algorithm": { + "type": "str", + "default": "omp", + "values": ["lasso_lars", "lasso_cd", "lars", "omp", "threshold"] + }, + "transform_n_nonzero_coefs": { + "type": "int", + "default": null, + "range": [0, 100] + }, + "transform_alpha": { + "type": "float", + "default": 1.0, + "range": [0.0, 10.0] + }, + "split_sign": { + "type": "bool", + "default": false + } + } + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json b/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json new file mode 100644 index 00000000..9ad0d89a --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.FactorAnalysis.json @@ -0,0 +1,75 @@ +{ + "name": "sklearn.decomposition.FactorAnalysis", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": 
"http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html", + "description": "Factor Analysis. A simple linear generative model with Gaussian latent variables.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.FactorAnalysis", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "copy": { + "type": "bool", + "default": true + }, + "noise_variance_init": { + "type": "iterable", + "default": null + } + }, + "tunable": { + "n_components": { + "type": "int", + "default": null, + "range": [0, 500] + }, + "tol": { + "type": "float", + "default": 0.01, + "range": [0.0, 0.5] + }, + "max_iter": { + "type": "int", + "default": 1000, + "range": [10, 10000] + }, + "svd_method": { + "type": "str", + "default": "randomized", + "values": ["lapack", "randomized"] + }, + "iterated_power": { + "type": "int", + "default": 3, + "range": [0, 10] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.FastICA.json b/mlblocks_primitives/sklearn.decomposition.FastICA.json new file mode 100644 index 00000000..5b081c05 --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.FastICA.json @@ -0,0 +1,79 @@ +{ + "name": "sklearn.decomposition.FastICA", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html", + "description": "FastICA: a fast algorithm for Independent Component Analysis.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.FastICA", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": 
"ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "w_init": { + "type": "iterable", + "default": null + }, + "fun_args": { + "type": "iterable", + "default": null + } + }, + "tunable": { + "n_components": { + "type": "int", + "default": null, + "range": [1, 500] + }, + "algorithm": { + "type": "str", + "default": "parallel", + "values": ["parallel", "deflation"] + }, + "whiten": { + "type": "bool", + "default": true + }, + "fun": { + "type": "string", + "default": "logcosh", + "values": ["logcosh", "exp", "cube"] + }, + "max_iter": { + "type": "int", + "default": 200, + "range": [1, 1000] + }, + "tol": { + "type": "float", + "default": 0.0001, + "range": [0.00001, 0.5] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.KernelPCA.json b/mlblocks_primitives/sklearn.decomposition.KernelPCA.json new file mode 100644 index 00000000..3fc2cbde --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.KernelPCA.json @@ -0,0 +1,109 @@ +{ + "name": "sklearn.decomposition.KernelPCA", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html", + "description": "Kernel Principal Component Analysis.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.KernelPCA", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": null + }, + "copy_X": { + "type": "bool", + "default": true + }, + "kernel_params": { + "type": 
"str", + "default": null + } + }, + "tunable": { + "n_components": { + "type": "int", + "default": null, + "range": [1, 500] + }, + "kernel": { + "type": "str", + "default": "linear", + "values": [ + "linear", + "poly", + "rbf", + "sigmoid", + "cosine", + "precomputed" + ] + }, + "gamma": { + "type": "float", + "default": null, + "range": [0.0, 0.5] + }, + "coef0": { + "type": "float", + "default": 1.0, + "range": [0.0, 10.0] + }, + "alpha": { + "type": "int", + "default": 1, + "range": [0, 10] + }, + "fit_inverse_transform": { + "type": "bool", + "default": false + }, + "eigen_solver": { + "type": "str", + "default": "auto", + "values": ["auto", "arpack", "dense"] + }, + "tol": { + "type": "float", + "default": 0.0, + "range": [0.0, 10.0] + }, + "max_iter": { + "type": "int", + "default": null, + "range": [0, 100] + }, + "remove_zero_eig": { + "type": "bool", + "default": false + } + } + } +} diff --git a/mlblocks_primitives/sklearn.decomposition.PCA.json b/mlblocks_primitives/sklearn.decomposition.PCA.json index 984ad75d..83287d69 100644 --- a/mlblocks_primitives/sklearn.decomposition.PCA.json +++ b/mlblocks_primitives/sklearn.decomposition.PCA.json @@ -1,6 +1,6 @@ { "name": "sklearn.decomposition.PCA", - "author": "Carles Sala ", + "contributors": ["Carles Sala ", "Plamen Valentinov "], "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html", "description": "Principal component analysis (PCA)", "classifiers": { @@ -34,10 +34,32 @@ ] }, "hyperparameters": { - "FIXME": "This needs to be reviewed", "fixed": { + "copy": { + "type": "bool", + "default": true + } }, "tunable": { + "tol": { + "type": "float", + "default": 0.0, + "range": [0.0, 100.0] + }, + "iterated_power":{ + "type": "int", + "default": "auto", + "range": [0, 1000] + }, + "whiten": { + "type": "bool", + "default": false + }, + "svd_solver": { + "type": "str", + "default": "auto", + "values": ["auto", "arpack", "full", "randomized"] + } } } } diff --git 
a/mlblocks_primitives/sklearn.decomposition.TruncatedSVD.json b/mlblocks_primitives/sklearn.decomposition.TruncatedSVD.json new file mode 100644 index 00000000..6342162e --- /dev/null +++ b/mlblocks_primitives/sklearn.decomposition.TruncatedSVD.json @@ -0,0 +1,61 @@ +{ + "name": "sklearn.decomposition.TruncatedSVD", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html", + "description": "Dimensionality reduction using truncated SVD.", + "classifiers": { + "type": "preprocessor", + "subtype": "feature_selector" + }, + "modalities": [], + "primitive": "sklearn.decomposition.TruncatedSVD", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": {}, + "tunable": { + "n_components": { + "type": "int", + "default": 2, + "range": [1, 500] + }, + "algorithm": { + "type": "str", + "default": "randomized", + "values": ["arpack", "randomized"] + }, + "n_iter": { + "type": "int", + "default": 5, + "range": [1, 100] + }, + "tol": { + "type": "float", + "default": 0.0, + "range": [0.001, 0.5] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json b/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json new file mode 100644 index 00000000..c5e92992 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.AdaBoostClassifier.json @@ -0,0 +1,65 @@ +{ + "name": "sklearn.ensemble.AdaBoostClassifier", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html", + "description": "Scikit-learn AdaBoostClassifier.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + 
"modalities": [], + "primitive": "sklearn.ensemble.AdaBoostClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "base_estimator": { + "type": "object", + "default": null + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 50, + "range": [1, 500] + }, + "learning_rate": { + "type": "float", + "default": 1.0, + "range": [1.0, 10.0] + }, + "algorithm": { + "type": "str", + "default": "SAMME.R", + "values": ["SAMME", "SAMME.R"] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json b/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json new file mode 100644 index 00000000..459fb3d6 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.AdaBoostRegressor.json @@ -0,0 +1,65 @@ +{ + "name": "sklearn.ensemble.AdaBoostRegressor", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html", + "description": "Scikit-learn AdaBoostRegressor.", + "classifiers": { + "type": "estimator", + "subtype": "regressor" + }, + "modalities": [], + "primitive": "sklearn.ensemble.AdaBoostRegressor", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "base_estimator": { + "type": "object", + "default": null + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 50, + "range": [1, 500] + }, + "learning_rate": { + "type": 
"float", + "default": 1.0, + "range": [1.0, 10.0] + }, + "loss": { + "type": "str", + "default": "linear", + "values": ["linear", "square", "exponential"] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json new file mode 100644 index 00000000..55757a7a --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.BaggingClassifier.json @@ -0,0 +1,89 @@ +{ + "name": "sklearn.ensemble.BaggingClassifier", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html", + "description": "Scikit-learn BaggingClassifier.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.BaggingClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "base_estimator": { + "type": "object", + "default": null + }, + "n_jobs": { + "type": "int", + "default": null + }, + "warm_start": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0 + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 50, + "range": [1, 500] + }, + "max_samples": { + "type": "float", + "default": 1.0, + "range": [1.0, 100.0] + }, + "max_features": { + "type": "int", + "default": 1.0, + "range": [1.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "bootstrap_features": { + "type": "bool", + "default": false + }, + "oob_score": { + "type": "bool", + "default": false + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json 
b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json new file mode 100644 index 00000000..eb16c812 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.BaggingRegressor.json @@ -0,0 +1,89 @@ +{ + "name": "sklearn.ensemble.BaggingRegressor", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html", + "description": "Scikit-learn BaggingRegressor.", + "classifiers": { + "type": "estimator", + "subtype": "regressor" + }, + "modalities": [], + "primitive": "sklearn.ensemble.BaggingRegressor", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "base_estimator": { + "type": "object", + "default": null + }, + "n_jobs": { + "type": "int", + "default": null + }, + "warm_start": { + "type": "bool", + "default": false + }, + "verbose": { + "type": "int", + "default": 0 + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "max_samples": { + "type": "float", + "default": 1.0, + "range": [1.0, 100.0] + }, + "max_features": { + "type": "float", + "default": 1.0, + "range": [1.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "bootstrap_features": { + "type": "bool", + "default": false + }, + "oob_score": { + "type": "bool", + "default": false + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json new file mode 100644 index 00000000..f061fc58 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesClassifier.json @@ -0,0 +1,116 @@ +{ + "name": "sklearn.ensemble.ExtraTreesClassifier", + 
"contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html", + "description": "Scikit-learn ExtraTreesClassifier. Implements a meta estimator that fits a number of randomized decision trees.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.ExtraTreesClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": null + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 1000] + }, + "warm_start": { + "type": "bool", + "default": false + }, + "class_weight": { + "type": "iterable", + "default": null + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "criterion": { + "type": "str", + "default": "gini", + "values": ["entropy", "gini"] + }, + "max_features": { + "type": "str", + "default": null, + "values": [null, "auto", "log2", "sqrt"] + }, + "max_depth": { + "type": "int", + "default": null, + "range": [1, 30] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 0.5] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": false + }, + "oob_score": { + "type": "bool", + "default": false + } + } + } +} diff --git 
a/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json new file mode 100644 index 00000000..5cf7cbd3 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.ExtraTreesRegressor.json @@ -0,0 +1,111 @@ +{ + "name": "sklearn.ensemble.ExtraTreesRegressor", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html", + "description": "Scikit-learn ExtraTreesRegressor. Implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting.", + "classifiers": { + "type": "estimator", + "subtype": "regressor" + }, + "modalities": [], + "primitive": "sklearn.ensemble.ExtraTreesRegressor", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": null + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "criterion": { + "type": "str", + "default": "mse", + "values": ["mae", "mse"] + }, + "max_features": { + "type": "str", + "default": null, + "values": [null, "auto", "log2", "sqrt"] + }, + "max_depth": { + "type": "int", + "default": null, + "range": [1, 30] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + 
"default": 0.0, + "range": [0.0, 0.5] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": false + }, + "oob_score": { + "type": "bool", + "default": false + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json new file mode 100644 index 00000000..574bcaa0 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingClassifier.json @@ -0,0 +1,137 @@ +{ + "name": "sklearn.ensemble.GradientBoostingClassifier", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html", + "description": "Scikit-learn GradientBoostingClassifier.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.GradientBoostingClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "warm_start": { + "type": "bool", + "default": false + }, + "init": { + "type": "object", + "default": null + }, + "verbose": { + "type": "int", + "default": 0 + }, + "presort": { + "type": "bool", + "default": false + } + }, + "tunable": { + "loss": { + "type": "str", + "default": "deviance", + "values": ["deviance", "exponential"] + }, + "learning_rate": { + "type": "float", + "default": 0.1, + "range": [0.01, 10.0] + }, + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "max_depth": { + "type": "int", + 
"default": 3, + "range": [1, 30] + }, + "criterion": { + "type": "str", + "default": "friedman_mse", + "values": ["friedman_mse", "friedman_mae"] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 10.0] + }, + "subsample": { + "type": "float", + "default": 1.0, + "range": [0.001, 100.0] + }, + "max_features": { + "type": "str", + "default": null, + "values": [null, "auto", "log2", "sqrt"] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "validation_fraction": { + "type": "float", + "default": 0.1, + "range": [0.0, 1.0] + }, + "n_iter_no_change": { + "type": "int", + "default": null, + "range": [1, 1000] + }, + "tol": { + "type": "float", + "default": 1e-4, + "range": [0.0, 2.0] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json new file mode 100644 index 00000000..8659dd74 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.GradientBoostingRegressor.json @@ -0,0 +1,142 @@ +{ + "name": "sklearn.ensemble.GradientBoostingRegressor", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html", + "description": "Scikit-learn GradientBoostingRegressor.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.GradientBoostingRegressor", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": 
"X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "init": { + "type": "object", + "default": null + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false + }, + "presort": { + "type": "bool", + "default": false + } + }, + "tunable": { + "loss": { + "type": "str", + "default": "ls", + "values": ["ls", "lad", "huber", "quantile"] + }, + "learning_rate": { + "type": "float", + "default": 0.1, + "range": [0.01, 10.0] + }, + "n_estimators": { + "type": "int", + "default": 100, + "range": [1, 500] + }, + "max_depth": { + "type": "int", + "default": 3, + "range": [1, 30] + }, + "criterion": { + "type": "str", + "default": "friedman_mse", + "values": ["friedman_mse", "friedman_mae"] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 10.5] + }, + "subsample": { + "type": "float", + "default": 1.0, + "range": [0.01, 100.0] + }, + "max_features": { + "type": "str", + "default": null, + "values": [null, "auto", "log2", "sqrt"] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 100.0] + }, + "alpha": { + "type": "float", + "default": 0.9, + "range": [0.01, 10] + }, + "validation_fraction": { + "type": "float", + "default": 0.1, + "range": [0.0, 1.0] + }, + "n_iter_no_change": { + "type": "int", + "default": null, + "range": [1, 1000] + }, + "tol": { + "type": "float", + "default": 1e-4, + "range": [0.0, 2.0] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.IsolationForest.json b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json new file mode 100644 index 00000000..44090512 --- /dev/null +++ 
b/mlblocks_primitives/sklearn.ensemble.IsolationForest.json @@ -0,0 +1,78 @@ +{ + "name": "sklearn.ensemble.IsolationForest", + "contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html", + "description": "Scikit-learn IsolationForest. The IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.IsolationForest", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "predict", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": null + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 100] + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 100, + "range": [1, 500] + }, + "max_samples": { + "type": "str", + "default": "auto" + }, + "max_features": { + "type": "float", + "default": 1.0, + "range": [1.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": false + }, + "contamination": { + "type": "float", + "default": 0.1, + "range": [0.0, 0.5] + } + } + } +} diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json index 40f926cc..256278f8 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier.json @@ -1,6 +1,6 @@ { "name": "sklearn.ensemble.RandomForestClassifier", - "author": "Carles Sala ", + "contributors": ["Carles Sala ", "Plamen 
Valentinov "], "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html", "description": "Scikit-learn RandomForestClassifier.", "classifiers": { @@ -18,7 +18,7 @@ }, { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -33,7 +33,7 @@ "output": [ { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -41,44 +41,74 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": null + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false + }, + "class_weight": { + "type": "iterable", + "default": null } }, "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, "criterion": { "type": "str", - "default": "entropy", + "default": "gini", "values": ["entropy", "gini"] }, "max_features": { "type": "str", "default": null, - "range": [null, "auto", "log2"] + "values": [null, "auto", "log2", "sqrt"] }, "max_depth": { "type": "int", - "default": 10, + "default": null, "range": [1, 30] }, "min_samples_split": { - "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "type": "int", + "default": 2, + "range": [2, 100] }, "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "default": 0.0, + "range": [0.0, 0.5] }, - "n_estimators": { + "max_leaf_nodes": { "type": "int", - "default": 30, - "values": [2, 500] - }, - "class_weight": { - "type": "str", "default": null, - "range": [null, "balanced"] + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "oob_score": { + "type": "bool", + "default": false } } } diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json 
index 9645816e..a9ffdb2f 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestClassifier_proba.json @@ -1,6 +1,6 @@ { "name": "sklearn.ensemble.RandomForestClassifier_proba", - "author": "Carles Sala ", + "contributors": ["Carles Sala ", "Plamen Valentinov "], "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html", "description": "Scikit-learn RandomForestClassifier that uses predict_proba as the produce method.", "classifiers": { @@ -18,7 +18,7 @@ }, { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -33,7 +33,7 @@ "output": [ { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -41,44 +41,74 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": null + }, + "verbose": { + "type": "int", + "default": 0 + }, + "warm_start": { + "type": "bool", + "default": false + }, + "class_weight": { + "type": "iterable", + "default": null } }, "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, "criterion": { "type": "str", - "default": "entropy", + "default": "gini", "values": ["entropy", "gini"] }, "max_features": { "type": "str", "default": null, - "range": [null, "auto", "log2"] + "values": [null, "auto", "log2", "sqrt"] }, "max_depth": { "type": "int", - "default": 10, + "default": null, "range": [1, 30] }, "min_samples_split": { - "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "type": "int", + "default": 2, + "range": [2, 100] }, "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "default": 0.0, + "range": [0.0, 0.5] }, - "n_estimators": { + "max_leaf_nodes": { "type": "int", - "default": 30, - "values": [2, 500] - }, - "class_weight": { - "type": "str", "default": null, - "range": [null, "balanced"] + "range": [2, 
1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "oob_score": { + "type": "bool", + "default": false } } } diff --git a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json index c9b64ebb..4db89c45 100644 --- a/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json +++ b/mlblocks_primitives/sklearn.ensemble.RandomForestRegressor.json @@ -1,6 +1,6 @@ { "name": "sklearn.ensemble.RandomForestRegressor", - "author": "Carles Sala ", + "contributors": ["Carles Sala ", "Plamen Valentinov "], "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html", "description": "A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting. 
The sub-sample size is always the same as the original input sample size but the samples are drawn with replacement if bootstrap=True (default).", "classifiers": { @@ -18,7 +18,7 @@ }, { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -33,7 +33,7 @@ "output": [ { "name": "y", - "type": "array" + "type": "ndarray" } ] }, @@ -41,10 +41,24 @@ "fixed": { "n_jobs": { "type": "int", - "default": -1 + "default": null + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 100] + }, + "warm_start": { + "type": "bool", + "default": false } }, "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, "criterion": { "type": "str", "default": "mse", @@ -52,28 +66,45 @@ }, "max_features": { "type": "str", - "default": null, - "range": [null, "auto", "log2"] + "default": "auto", + "range": [null, "auto", "log2", "sqrt"] }, "max_depth": { "type": "int", - "default": 10, + "default": null, "range": [1, 30] }, "min_samples_split": { - "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "type": "int", + "default": 2, + "range": [2, 1000] }, "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 1000] + }, + "min_weight_fraction_leaf": { "type": "float", - "default": 0.1, - "range": [0.0001, 0.5] + "default": 0.0, + "range": [0.0, 100.0] }, - "n_estimators": { + "max_leaf_nodes": { "type": "int", - "default": 30, - "values": [2, 500] + "default": null + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 10.0] + }, + "bootstrap": { + "type": "bool", + "default": true + }, + "oob_score": { + "type": "bool", + "default": false } } } diff --git a/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json new file mode 100644 index 00000000..6a9168d9 --- /dev/null +++ b/mlblocks_primitives/sklearn.ensemble.RandomTreesEmbedding.json @@ -0,0 +1,98 @@ +{ + "name": "sklearn.ensemble.RandomTreesEmbedding", + 
"contributors": ["Carles Sala ", "Plamen Valentinov "], + "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomTreesEmbedding.html", + "description": "Scikit-learn RandomTreesEmbedding. An unsupervised transformation of a dataset to a high-dimensional sparse representation.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "sklearn.ensemble.RandomTreesEmbedding", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "ndarray" + } + ] + }, + "produce": { + "method": "transform", + "args": [ + { + "name": "X", + "type": "ndarray" + } + ], + "output": [ + { + "name": "X", + "type": "Sparse" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": null + }, + "sparse_output": { + "type": "bool", + "default": true + }, + "verbose": { + "type": "int", + "default": 0, + "range": [0, 1000] + }, + "warm_start": { + "type": "bool", + "default": false + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 10, + "range": [1, 500] + }, + "max_depth": { + "type": "int", + "default": 5, + "range": [1, 30] + }, + "min_samples_split": { + "type": "int", + "default": 2, + "range": [2, 100] + }, + "min_samples_leaf": { + "type": "int", + "default": 1, + "range": [1, 100] + }, + "min_weight_fraction_leaf": { + "type": "float", + "default": 0.0, + "range": [0.0, 0.5] + }, + "max_leaf_nodes": { + "type": "int", + "default": null, + "range": [2, 1000] + }, + "min_impurity_decrease": { + "type": "float", + "default": 0.0, + "range": [0.0, 1000.0] + } + } + } +} diff --git a/mlprimitives/adapters/featuretools.py b/mlprimitives/adapters/featuretools.py index 498deae6..c2250bde 100644 --- a/mlprimitives/adapters/featuretools.py +++ b/mlprimitives/adapters/featuretools.py @@ -8,7 +8,8 @@ class DFS(object): features = None - def __init__(self, max_depth=None, encode=True, 
remove_low_information=True): + def __init__(self, max_depth=None, encode=True, remove_low_information=True, copy=False): + self.copy = copy self.max_depth = max_depth self.encode = encode self.remove_low_information = remove_low_information @@ -19,10 +20,22 @@ def __repr__(self): " remove_low_information={remove_low_information})" ).format(**self.__dict__) + def _get_index(self, X): + if self.copy: + X = X.copy() + + index = X.index.name or 'index' + while index in X.columns: + index = '_' + index + + X.index.name = index + X.reset_index(inplace=True) + + return X, index + def _get_entityset(self, X, target_entity, entities, relationships): if entities is None: - index = X.index.name - X = X.reset_index() + X, index = self._get_index(X) entities = { target_entity: (X, index) } @@ -32,7 +45,7 @@ def _get_entityset(self, X, target_entity, entities, relationships): return ft.EntitySet('entityset', entities, relationships) - def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relationships=None): + def dfs(self, X=None, target_entity='X', entityset=None, entities=None, relationships=None): if entityset is None: entityset = self._get_entityset(X, target_entity, entities, relationships) @@ -44,7 +57,7 @@ def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relatio if time_index: cutoff_time = target.df[[index, time_index]] - instance_ids = X.index.values.copy() + instance_ids = X[index].values.copy() self.features = ft.dfs( cutoff_time=cutoff_time, diff --git a/mlprimitives/text.py b/mlprimitives/text.py index 4bdcfea6..1120272e 100644 --- a/mlprimitives/text.py +++ b/mlprimitives/text.py @@ -24,7 +24,7 @@ class TextCleaner(object): STOPWORDS = dict() def __init__(self, column=None, language='multi', lower=True, accents=True, - stopwrods=True, non_alpha=True, single_chars=True): + stopwords=True, non_alpha=True, single_chars=True): self.column = column self.language = language self.language_code = None diff --git 
a/mlprimitives/timeseries.py b/mlprimitives/timeseries.py
new file mode 100644
index 00000000..9b0750d8
--- /dev/null
+++ b/mlprimitives/timeseries.py
@@ -0,0 +1,67 @@
+import numpy as np
+import pandas as pd
+
+
+def rolling_window_sequences(X, window_size, value_column, time_column):
+    """
+    Function that takes in a pandas.DataFrame and a window_size then creates
+    output arrays that correspond to a timeseries sequence with window_size overlap.
+    The output arrays can be fed into a timeseries forecasting model.
+    Assumes the input is timeseries sorted.
+    Args:
+        X (pandas.DataFrame): a pandas dataframe which has 'timestamp'
+            and 'value' columns, and is sorted based on timestamp.
+            The timestamp column is in UNIX format (in seconds).
+        window_size (int): number of values that overlap to create the sequence.
+        value_column (string): name of column that has the value field.
+        time_column (string): name of column that has the time field.
+    Returns:
+        (numpy.ndarray): contains the time series sequenced data with each
+            entry having window_size rows.
+        (numpy.ndarray): acts as the label for the forecasting problem with
+            each entry having a single row, the value right after the window.
+        (numpy.ndarray): the corresponding timestamps series.
+    """
+    output_X = []
+    y = []
+    time = []
+    for i in range(len(X) - window_size):
+        # positional row slices; a bare X[n] would be a column lookup (KeyError)
+        output_X.append(X[i: i + window_size][value_column].values.reshape([-1, 1]))
+        y.append(X[i + window_size: i + window_size + 1][value_column].values.reshape([-1, 1]))
+        time.append(X.iloc[i + window_size][time_column])
+
+    return np.asarray(output_X), np.asarray(y), np.asarray(time)
+
+
+def time_segments_average(X, interval, value_column, time_column):
+    """
+    Function that aggregates data in a pandas DataFrame by averaging over a given interval.
+    It starts averaging from the smallest timestamp in the dataframe and ends at the
+    largest timestamp. Assumes the input is timeseries sorted.
+    Args:
+        X (pandas.DataFrame): a pandas dataframe which has 'timestamp'
+            and 'value' columns, and is sorted based on timestamp. The timestamp
+            column is in UNIX format (in seconds).
+        interval (int): an integer denoting the number of seconds
+            in the desired interval.
+        value_column (string): name of column that has the value field.
+        time_column (string): name of column that has the time field.
+    Returns:
+        pandas.DataFrame: a pandas dataframe with two columns
+            ('timestamp' and 'value'), where each `timestamp` is the starting time of
+            an interval and the `value` is the result of aggregation.
+    """
+    start_ts = X[time_column].iloc[0]  # min value
+    end_time = X[time_column].iloc[-1]  # max value in dataframe
+    accepted_points = []
+    while start_ts <= end_time:
+        # average the values in the half-open segment [start_ts, start_ts + interval)
+        upper_ts = start_ts + interval
+        mask = (X[time_column] >= start_ts) & (X[time_column] < upper_ts)
+        average_value = X.loc[mask][value_column].mean(skipna=True)
+
+        accepted_points.append([start_ts, average_value])
+        start_ts = upper_ts  # update the timestamp
+
+    return pd.DataFrame(accepted_points, columns=[time_column, value_column])
diff --git a/setup.py b/setup.py index 4d028100..220c6655 100644 --- a/setup.py +++ b/setup.py @@ -37,8 +37,9 @@ tests_require = [ - 'mlblocks>=0.2.0', + 'mlblocks>=0.2.4', 'pytest>=3.4.2', + 'google-compute-engine==2.8.12', # required by travis ]