From d4628c4902d054651eaefdcf73980625dbcaf6c6 Mon Sep 17 00:00:00 2001 From: Kshitijaa Jaglan <29124655+deutranium@users.noreply.github.com> Date: Tue, 23 Aug 2022 00:36:11 +0530 Subject: [PATCH] Added tests for feature selection component (#149) * Initial commit with working tests * cleaned and ensured the file is working with pre-commit * Testing checkpoint * Added support for module_file * Improved documentation * Fixed test * Fixed pre-commit errors * Added data files for component_test.py * Added tests for artifact count by type * Fixed minor bug * Added test to check if correct features are being selected * Update dependencies for feature_selection * Update tfx_addons/version.py * Update tfx_addons/version.py * Update tfx_addons/version.py Co-authored-by: Gerard Casas Saez --- tfx_addons/feature_selection/component.py | 40 ++++- .../feature_selection/component_test.py | 169 +++++++++++++++--- tfx_addons/feature_selection/requirements.txt | 4 + tfx_addons/feature_selection/test/iris.csv | 151 ++++++++++++++++ tfx_addons/version.py | 5 +- 5 files changed, 333 insertions(+), 36 deletions(-) create mode 100644 tfx_addons/feature_selection/requirements.txt create mode 100644 tfx_addons/feature_selection/test/iris.csv diff --git a/tfx_addons/feature_selection/component.py b/tfx_addons/feature_selection/component.py index 6892a83b..20d17f1d 100644 --- a/tfx_addons/feature_selection/component.py +++ b/tfx_addons/feature_selection/component.py @@ -1,4 +1,4 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -93,20 +93,46 @@ def _get_file_list(dir_path): @component def FeatureSelection( # pylint: disable=C0103 - module_file: Parameter[str], orig_examples: InputArtifact[Examples], + orig_examples: InputArtifact[Examples], feature_selection: OutputArtifact[FeatureSelectionArtifact], - updated_data: OutputArtifact[Examples]): - """Feature Selection component - Args (from the module file): + updated_data: OutputArtifact[Examples], + module_file: Parameter[str] = None, + module_path: Parameter[str] = None, +): + """Runs a user-specified feature selection algorithm on an `Examples` artifact + Args: + - orig_examples: An `Examples` input artifact with the data to be + processed + - module_file: Python module file containing the configuration + Example: `modules_files.module_file_a` + Exactly one of `module_file` and `module_path` should be passed. + If both are used, module_file would be preferred + - module_path: Python module path containing the configuration + Example: `absolute_path/module_files/module_file_a.py` or + `./module_files/module_file_a.py` + Exactly one of `module_file` and `module_path` should be passed. 
+ If both are used, module_file would be preferred + + Module file configuration: - SELECTOR_PARAMS: Parameters for SelectorFunc in the form of a kwargs dictionary + Example: {"score_func": chi2, "k": 2} + Here, `chi2` has been imported from sklearn.feature_selection - TARGET_FEATURE: Name of the feature containing target data - SelectorFunc: Selector function for univariate feature selection - example: SelectKBest, SelectPercentile from sklearn.feature_selection + Example: SelectKBest, SelectPercentile from sklearn.feature_selection """ # importing the required functions and variables from the module file - modules = importlib.import_module(module_file) + + if module_file: + modules = importlib.import_module(module_file) + elif module_path: + module_spec = importlib.util.spec_from_file_location( + "all_modules", module_path) + modules = importlib.util.module_from_spec(module_spec) + module_spec.loader.exec_module(modules) + mod_names = ["SELECTOR_PARAMS", "TARGET_FEATURE", "SelectorFunc"] selector_params, target_feature, selector_func = [ getattr(modules, i) for i in mod_names diff --git a/tfx_addons/feature_selection/component_test.py b/tfx_addons/feature_selection/component_test.py index fd6d83ef..4dc1cffe 100644 --- a/tfx_addons/feature_selection/component_test.py +++ b/tfx_addons/feature_selection/component_test.py @@ -1,4 +1,4 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,38 +12,155 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for HelloComponent.""" +"""Tests for tfx_addons.feature_selection.component""" -import json +import csv +import importlib +import os +from typing import List, Optional, Text import tensorflow as tf -from tfx.examples.custom_components.hello_world.hello_component import \ - component -from tfx.types import artifact, channel_utils, standard_artifacts +import tfx +from tfx.orchestration import metadata +from tfx_addons.feature_selection import component -class HelloComponentTest(tf.test.TestCase): + +def _get_selected_features(module_file, data_path): + """Get the correct selected features for testing""" + + data = [] + + # importing required configurations + modules = importlib.import_module(module_file) + mod_names = ["SELECTOR_PARAMS", "TARGET_FEATURE", "SelectorFunc"] + selector_params, target_feature, selector_func = [ + getattr(modules, i) for i in mod_names + ] + + # getting the data + with open(data_path, 'r') as file: + my_reader = csv.reader(file, delimiter=',') + for row in my_reader: + data.append(row) + + # splitting X (input) and Y (output) from CSV data + target_idx = data[0].index(target_feature) + target_data = [i.pop(target_idx) for i in data] + + # running the selector function for feature selection + selector = selector_func(**selector_params) + selector.fit_transform(data[1:], target_data[1:]) + + # getting selected feature names + selected_indices = selector.get_support(indices=True) + final_features = set(data[0][idx] for idx in selected_indices) + + return final_features + + +def _create_pipeline( + pipeline_name: Text, + pipeline_root: Text, + data_root: Text, + module_path: Text, + metadata_path: Text, + beam_pipeline_args: Optional[List[Text]] = None) -> tfx.v1.dsl.Pipeline: + """Creating sample pipeline with two components: CsvExampleGen and + FeatureSelection""" + + # specifying the pipeline components + example_gen = 
tfx.components.CsvExampleGen(input_base=data_root) + feature_selection = component.FeatureSelection( + orig_examples=example_gen.outputs['examples'], module_path=module_path) + + components = [example_gen, feature_selection] + + return tfx.v1.dsl.Pipeline( + pipeline_name=pipeline_name, + pipeline_root=pipeline_root, + components=components, + metadata_connection_config=metadata.sqlite_metadata_connection_config( + metadata_path), + beam_pipeline_args=beam_pipeline_args) + + +class FeatureSelectionTest(tf.test.TestCase): def setUp(self): - super(HelloComponentTest, self).setUp() - self.name = 'HelloWorld' - - def testConstruct(self): - input_data = standard_artifacts.Examples() - input_data.split_names = json.dumps(artifact.DEFAULT_EXAMPLE_SPLITS) - output_data = standard_artifacts.Examples() - output_data.split_names = json.dumps(artifact.DEFAULT_EXAMPLE_SPLITS) - this_component = component.HelloComponent( - input_data=channel_utils.as_channel([input_data]), - output_data=channel_utils.as_channel([output_data]), - name=u'Testing123') - self.assertEqual(standard_artifacts.Examples.TYPE_NAME, - this_component.outputs['output_data'].type_name) - artifact_collection = this_component.outputs['output_data'].get() - for artifacts in artifact_collection: - split_list = json.loads(artifacts.split_names) - self.assertEqual(artifact.DEFAULT_EXAMPLE_SPLITS.sort(), - split_list.sort()) + super().setUp() + self._test_dir = os.path.join( + os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()), + self._testMethodName) + self._feature_selection_root = os.path.dirname(__file__) + self._pipeline_name = 'feature_selection' + self._data_root = os.path.join(self._feature_selection_root, 'test') + self._data_path = os.path.join(self._data_root, 'iris.csv') + self._module_path = os.path.join(self._feature_selection_root, 'example', + 'modules', 'iris_module_file.py') + self._module_file = "tfx_addons.feature_selection.example.modules.\ +iris_module_file" + + 
self._pipeline_root = os.path.join(self._test_dir, 'tfx', 'pipelines', + self._pipeline_name) + self._metadata_path = os.path.join(self._test_dir, 'tfx', 'metadata', + self._pipeline_name, 'metadata.db') + + def assertExecutedOnce(self, component: Text) -> None: # pylint: disable=W0621 + """Check the component is executed exactly once.""" + component_path = os.path.join(self._pipeline_root, component) + self.assertTrue(tfx.dsl.io.fileio.exists(component_path)) + execution_path = os.path.join(component_path, '.system', + 'executor_execution') + execution = tfx.dsl.io.fileio.listdir(execution_path) + self.assertLen(execution, 1) + + def assertPipelineExecution(self) -> None: + self.assertExecutedOnce('CsvExampleGen') + self.assertExecutedOnce('FeatureSelection') + + def testFeatureSelectionPipelineLocal(self): + tfx.v1.orchestration.LocalDagRunner().run( + _create_pipeline(pipeline_name=self._pipeline_name, + pipeline_root=self._pipeline_root, + data_root=self._data_root, + module_path=self._module_path, + metadata_path=self._metadata_path)) + + expected_execution_count = 2 # one each for CsvExampleGen and Feature Selection + true_selected_features = _get_selected_features(self._module_file, + self._data_path) + + metadata_config = ( + tfx.orchestration.metadata.sqlite_metadata_connection_config( + self._metadata_path)) + with metadata.Metadata(metadata_config) as m: + execution_count = len(m.store.get_executions()) + selected_features_struct = list( + m.store.get_artifacts_by_type( + "Feature Selection")[0].properties["selected_features"]. 
+ struct_value.fields.values.__self__["__value__"].list_value.values) + component_selected_features = set( + feature.string_value for feature in selected_features_struct) + + # TEST: execution count + self.assertEqual(expected_execution_count, execution_count) + + # TEST: number of artifacts with TYPE_NAME `Feature Selection` + self.assertEqual(1, + len(m.store.get_artifacts_by_type("Feature Selection"))) + + # TEST: number of artifacts with TYPE_NAME `Examples` + # (one each from CsvExampleGen and FeatureSelection) + self.assertEqual(2, len(m.store.get_artifacts_by_type("Examples"))) + + # TEST: if the features selected by component are correct + self.assertEqual(component_selected_features, true_selected_features) + + self.assertPipelineExecution() if __name__ == '__main__': + tf.compat.v1.enable_v2_behavior() tf.test.main() + +# _disabled pylint warning `W0621: Redefining name 'component' from outer scope` till an alternate way is found diff --git a/tfx_addons/feature_selection/requirements.txt b/tfx_addons/feature_selection/requirements.txt new file mode 100644 index 00000000..bec56781 --- /dev/null +++ b/tfx_addons/feature_selection/requirements.txt @@ -0,0 +1,4 @@ +scikit_learn==1.1.2 +tensorflow +tfx +tfx_bsl==1.9.0 diff --git a/tfx_addons/feature_selection/test/iris.csv b/tfx_addons/feature_selection/test/iris.csv new file mode 100644 index 00000000..20bd6ee5 --- /dev/null +++ b/tfx_addons/feature_selection/test/iris.csv @@ -0,0 +1,151 @@ +sepal_length,sepal_width,petal_length,petal_width,species +5.1,3.5,1.4,0.2,setosa +4.9,3.0,1.4,0.2,setosa +4.7,3.2,1.3,0.2,setosa +4.6,3.1,1.5,0.2,setosa +5.0,3.6,1.4,0.2,setosa +5.4,3.9,1.7,0.4,setosa +4.6,3.4,1.4,0.3,setosa +5.0,3.4,1.5,0.2,setosa +4.4,2.9,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5.4,3.7,1.5,0.2,setosa +4.8,3.4,1.6,0.2,setosa +4.8,3.0,1.4,0.1,setosa +4.3,3.0,1.1,0.1,setosa +5.8,4.0,1.2,0.2,setosa +5.7,4.4,1.5,0.4,setosa +5.4,3.9,1.3,0.4,setosa +5.1,3.5,1.4,0.3,setosa +5.7,3.8,1.7,0.3,setosa 
+5.1,3.8,1.5,0.3,setosa +5.4,3.4,1.7,0.2,setosa +5.1,3.7,1.5,0.4,setosa +4.6,3.6,1.0,0.2,setosa +5.1,3.3,1.7,0.5,setosa +4.8,3.4,1.9,0.2,setosa +5.0,3.0,1.6,0.2,setosa +5.0,3.4,1.6,0.4,setosa +5.2,3.5,1.5,0.2,setosa +5.2,3.4,1.4,0.2,setosa +4.7,3.2,1.6,0.2,setosa +4.8,3.1,1.6,0.2,setosa +5.4,3.4,1.5,0.4,setosa +5.2,4.1,1.5,0.1,setosa +5.5,4.2,1.4,0.2,setosa +4.9,3.1,1.5,0.2,setosa +5.0,3.2,1.2,0.2,setosa +5.5,3.5,1.3,0.2,setosa +4.9,3.6,1.4,0.1,setosa +4.4,3.0,1.3,0.2,setosa +5.1,3.4,1.5,0.2,setosa +5.0,3.5,1.3,0.3,setosa +4.5,2.3,1.3,0.3,setosa +4.4,3.2,1.3,0.2,setosa +5.0,3.5,1.6,0.6,setosa +5.1,3.8,1.9,0.4,setosa +4.8,3.0,1.4,0.3,setosa +5.1,3.8,1.6,0.2,setosa +4.6,3.2,1.4,0.2,setosa +5.3,3.7,1.5,0.2,setosa +5.0,3.3,1.4,0.2,setosa +7.0,3.2,4.7,1.4,versicolor +6.4,3.2,4.5,1.5,versicolor +6.9,3.1,4.9,1.5,versicolor +5.5,2.3,4.0,1.3,versicolor +6.5,2.8,4.6,1.5,versicolor +5.7,2.8,4.5,1.3,versicolor +6.3,3.3,4.7,1.6,versicolor +4.9,2.4,3.3,1.0,versicolor +6.6,2.9,4.6,1.3,versicolor +5.2,2.7,3.9,1.4,versicolor +5.0,2.0,3.5,1.0,versicolor +5.9,3.0,4.2,1.5,versicolor +6.0,2.2,4.0,1.0,versicolor +6.1,2.9,4.7,1.4,versicolor +5.6,2.9,3.6,1.3,versicolor +6.7,3.1,4.4,1.4,versicolor +5.6,3.0,4.5,1.5,versicolor +5.8,2.7,4.1,1.0,versicolor +6.2,2.2,4.5,1.5,versicolor +5.6,2.5,3.9,1.1,versicolor +5.9,3.2,4.8,1.8,versicolor +6.1,2.8,4.0,1.3,versicolor +6.3,2.5,4.9,1.5,versicolor +6.1,2.8,4.7,1.2,versicolor +6.4,2.9,4.3,1.3,versicolor +6.6,3.0,4.4,1.4,versicolor +6.8,2.8,4.8,1.4,versicolor +6.7,3.0,5.0,1.7,versicolor +6.0,2.9,4.5,1.5,versicolor +5.7,2.6,3.5,1.0,versicolor +5.5,2.4,3.8,1.1,versicolor +5.5,2.4,3.7,1.0,versicolor +5.8,2.7,3.9,1.2,versicolor +6.0,2.7,5.1,1.6,versicolor +5.4,3.0,4.5,1.5,versicolor +6.0,3.4,4.5,1.6,versicolor +6.7,3.1,4.7,1.5,versicolor +6.3,2.3,4.4,1.3,versicolor +5.6,3.0,4.1,1.3,versicolor +5.5,2.5,4.0,1.3,versicolor +5.5,2.6,4.4,1.2,versicolor +6.1,3.0,4.6,1.4,versicolor +5.8,2.6,4.0,1.2,versicolor +5.0,2.3,3.3,1.0,versicolor 
+5.6,2.7,4.2,1.3,versicolor +5.7,3.0,4.2,1.2,versicolor +5.7,2.9,4.2,1.3,versicolor +6.2,2.9,4.3,1.3,versicolor +5.1,2.5,3.0,1.1,versicolor +5.7,2.8,4.1,1.3,versicolor +6.3,3.3,6.0,2.5,virginica +5.8,2.7,5.1,1.9,virginica +7.1,3.0,5.9,2.1,virginica +6.3,2.9,5.6,1.8,virginica +6.5,3.0,5.8,2.2,virginica +7.6,3.0,6.6,2.1,virginica +4.9,2.5,4.5,1.7,virginica +7.3,2.9,6.3,1.8,virginica +6.7,2.5,5.8,1.8,virginica +7.2,3.6,6.1,2.5,virginica +6.5,3.2,5.1,2.0,virginica +6.4,2.7,5.3,1.9,virginica +6.8,3.0,5.5,2.1,virginica +5.7,2.5,5.0,2.0,virginica +5.8,2.8,5.1,2.4,virginica +6.4,3.2,5.3,2.3,virginica +6.5,3.0,5.5,1.8,virginica +7.7,3.8,6.7,2.2,virginica +7.7,2.6,6.9,2.3,virginica +6.0,2.2,5.0,1.5,virginica +6.9,3.2,5.7,2.3,virginica +5.6,2.8,4.9,2.0,virginica +7.7,2.8,6.7,2.0,virginica +6.3,2.7,4.9,1.8,virginica +6.7,3.3,5.7,2.1,virginica +7.2,3.2,6.0,1.8,virginica +6.2,2.8,4.8,1.8,virginica +6.1,3.0,4.9,1.8,virginica +6.4,2.8,5.6,2.1,virginica +7.2,3.0,5.8,1.6,virginica +7.4,2.8,6.1,1.9,virginica +7.9,3.8,6.4,2.0,virginica +6.4,2.8,5.6,2.2,virginica +6.3,2.8,5.1,1.5,virginica +6.1,2.6,5.6,1.4,virginica +7.7,3.0,6.1,2.3,virginica +6.3,3.4,5.6,2.4,virginica +6.4,3.1,5.5,1.8,virginica +6.0,3.0,4.8,1.8,virginica +6.9,3.1,5.4,2.1,virginica +6.7,3.1,5.6,2.4,virginica +6.9,3.1,5.1,2.3,virginica +5.8,2.7,5.1,1.9,virginica +6.8,3.2,5.9,2.3,virginica +6.7,3.3,5.7,2.5,virginica +6.7,3.0,5.2,2.3,virginica +6.3,2.5,5.0,1.9,virginica +6.5,3.0,5.2,2.0,virginica +6.2,3.4,5.4,2.3,virginica +5.9,3.0,5.1,1.8,virginica diff --git a/tfx_addons/version.py b/tfx_addons/version.py index 89a1eb74..581b81f4 100644 --- a/tfx_addons/version.py +++ b/tfx_addons/version.py @@ -54,9 +54,8 @@ "schema_curation": [ f"tfx{_TFXVERSION_CONSTRAINT}", ], - "feature_selection": [ - f"tfx{_TFXVERSION_CONSTRAINT}", - ], + "feature_selection": + [f"tfx{_TFXVERSION_CONSTRAINT}", "scikit_learn>=1.0.2,<2.0.0"], "feast_examplegen": [ f"tfx{_TFXVERSION_CONSTRAINT}", "feast>=0.21.3,<1.0.0",