From bfe5c747011f23647a97d3266825fa0eb818de7a Mon Sep 17 00:00:00 2001 From: MoooCat Date: Fri, 12 Jul 2024 17:19:58 +0800 Subject: [PATCH 01/30] add value_fields in metadata --- sdgx/data_models/metadata.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index f1df8d9f..ed16e1d1 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -46,7 +46,6 @@ class Metadata(BaseModel): """ @field_validator("column_list") - @classmethod def check_column_list(cls, value) -> Any: # check if v has duplicate element if len(value) == len(set(value)): @@ -101,6 +100,17 @@ def format_fields(self) -> Iterable[str]: (k for k in self.model_fields if k.endswith("_format")), (k for k in self._extend.keys() if k.endswith("_format")), ) + + @property + def value_fields(self) -> Iterable[str]: + """ + Return all tag fields in this metadata. + """ + + return chain( + (k for k in self.model_fields if k.endswith("_values")), + (k for k in self._extend.keys() if k.endswith("_values")), + ) def __eq__(self, other): if not isinstance(other, Metadata): @@ -115,6 +125,10 @@ def __eq__(self, other): self.get(key) == other.get(key) for key in set(chain(self.format_fields, other.format_fields)) ) + and all( + self.get(key) == other.get(key) + for key in set(chain(self.value_fields, other.value_fields)) + ) and self.version == other.version ) From f8cf69accc6df30abb4c050b98f3494ca71e2ab8 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Fri, 12 Jul 2024 17:20:06 +0800 Subject: [PATCH 02/30] add ConstInspector --- sdgx/data_models/inspectors/const.py | 50 ++++++++++++++++++++++ tests/data_models/inspector/test_const.py | 51 +++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 sdgx/data_models/inspectors/const.py create mode 100644 tests/data_models/inspector/test_const.py diff --git a/sdgx/data_models/inspectors/const.py b/sdgx/data_models/inspectors/const.py new file mode 100644 index 00000000..122713d6 --- /dev/null +++ b/sdgx/data_models/inspectors/const.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from typing import Any +import pandas as pd + +from sdgx.data_models.inspectors.base import Inspector +from sdgx.data_models.inspectors.extension import hookimpl + + +class ConstInspector(Inspector): + + const_columns: set[str] = set() + + const_values : dict[Any] = {} + + _inspect_level = 80 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def fit(self, raw_data: pd.DataFrame, *args, **kwargs): + """Fit the inspector. + + Gets the list of const columns from the raw data. + + Args: + raw_data (pd.DataFrame): Raw data + """ + + # Identify columns where the const rate exceeds the threshold + self.const_columns = set() + + # iterate each column + for column in raw_data.columns: + if len(raw_data[column].value_counts(normalize=True)) == 1: + self.const_columns.add(column) + self.const_values[column] = raw_data[column][0] + + self.ready = True + + def inspect(self, *args, **kwargs) -> dict[str, Any]: + """Inspect raw data and generate metadata.""" + + return {"const_columns": list(self.const_columns), + "const_values": self.const_values} + + +@hookimpl +def register(manager): + manager.register("ConstInspector", ConstInspector) diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py new file mode 100644 index 00000000..93b8b5c0 --- /dev/null +++ b/tests/data_models/inspector/test_const.py @@ -0,0 +1,51 @@ +import pandas as pd +import pytest + +from sdgx.data_models.inspectors.const import ConstInspector + +@pytest.fixture +def inspector(): + yield ConstInspector() + + +@pytest.fixture +def raw_data(demo_single_table_path): + yield pd.read_csv(demo_single_table_path) + + +@pytest.fixture +def test_const_data(raw_data: pd.DataFrame): + # Convert the columns to float to allow None values + raw_data["age"] = raw_data["age"].astype(float) + raw_data["fnlwgt"] = raw_data["fnlwgt"].astype(float) + + # Set the values to None + raw_data["age"].values[:] = 100 + raw_data["fnlwgt"].values[:] = 3.14 + raw_data["workclass"].values[:] = "President" + + yield raw_data + + +def test_inspector(inspector: ConstInspector, test_const_data): + inspector.fit(test_const_data) + assert inspector.ready + assert inspector.const_columns + assert sorted(inspector.inspect()["const_columns"]) == sorted( + [ + "age", + "fnlwgt","workclass" + ] + ) + + assert inspector.inspect_level == 80 + assert sorted(list(inspector.const_values.keys())) == sorted(["age", + "fnlwgt","workclass"]) + assert inspector.const_values["age"] == 100 + assert inspector.const_values["fnlwgt"] == 3.14 + assert inspector.const_values["workclass"] == 'President' + + + +if __name__ == "__main__": + pytest.main(["-vv", "-s", __file__]) From 159b708e381b82820eba6f2ff446630ef8f6e589 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Fri, 12 Jul 2024 17:57:42 +0800 Subject: [PATCH 03/30] add ConstValueTransformer and its testcase --- sdgx/data_processors/manager.py | 2 +- sdgx/data_processors/transformers/const.py | 105 ++++++++++++++++++ .../transformers/test_transformers_const.py | 73 ++++++++++++ 3 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 sdgx/data_processors/transformers/const.py create mode 100644 tests/data_processors/transformers/test_transformers_const.py diff --git a/sdgx/data_processors/manager.py b/sdgx/data_processors/manager.py index a3544609..a32ee85b 100644 --- a/sdgx/data_processors/manager.py +++ b/sdgx/data_processors/manager.py @@ -52,7 +52,7 @@ class DataProcessorManager(Manager): "IntValueFormatter", "DatetimeFormatter", ] - ] + ["EmptyTransformer".lower(), "ColumnOrderTransformer".lower()] + ] + ["ConstValueTransformer".lower(), "EmptyTransformer".lower(), "ColumnOrderTransformer".lower()] """ preset_defalut_processors list stores the lowercase names of the transformers loaded by default. When using the synthesizer, they will be loaded by default to facilitate user operations. diff --git a/sdgx/data_processors/transformers/const.py b/sdgx/data_processors/transformers/const.py new file mode 100644 index 00000000..fef4b084 --- /dev/null +++ b/sdgx/data_processors/transformers/const.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +from typing import Any +import pandas as pd + +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.extension import hookimpl +from sdgx.data_processors.transformers.base import Transformer +from sdgx.utils import logger + + +class ConstValueTransformer(Transformer): + """ + A transformer that replaces the input with a constant value. + + This class is used to transform any input data into a predefined constant value. + It is particularly useful in scenarios where a consistent output is required regardless of the input. + + Attributes: + const_value (dict[Any]): The constant value that will be returned. + """ + + const_columns: list = [] + + const_values: dict[Any] = {} + + def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]): + """ + Fit method for the transformer. + + This method processes the metadata to identify columns that should be replaced with a constant value. + It updates the internal state of the transformer with the columns and their corresponding constant values. + + Args: + metadata (Metadata | None): The metadata object containing information about the columns and their data types. + **kwargs (dict[str, Any]): Additional keyword arguments. + + Returns: + None + """ + + for each_col in metadata.column_list: + if metadata.get_column_data_type(each_col) == "const": + self.const_columns.append(each_col) + self.const_values[each_col] = metadata.get('const_values')[each_col] + + logger.info("ConstValueTransformer Fitted.") + + self.fitted = True + + def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: + """ + Convert method to handle missing values in the input data by replacing specified columns with constant values. + + This method iterates over the columns identified for replacement with constant values and removes them from the input DataFrame. + The removal is based on the columns specified during the fitting process. + + Args: + raw_data (pd.DataFrame): The input DataFrame containing the data to be processed. + + Returns: + pd.DataFrame: A DataFrame with the specified columns removed. + """ + + processed_data = raw_data + + logger.info("Converting data using ConstValueTransformer...") + + for each_col in self.const_columns: + processed_data = self.remove_columns(processed_data, [each_col]) + + logger.info("Converting data using ConstValueTransformer... Finished.") + + return processed_data + + def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame: + """ + Reverse_convert method for the transformer. + + This method restores the original columns that were replaced with constant values during the conversion process. + It iterates over the columns identified for replacement with constant values and adds them back to the DataFrame + with the predefined constant values. + + Args: + processed_data (pd.DataFrame): The input DataFrame containing the processed data. + + Returns: + pd.DataFrame: A DataFrame with the original columns restored, filled with their corresponding constant values. + """ + df_length = processed_data.shape[0] + + for each_col_name in self.const_columns: + each_value = self.const_values[each_col_name] + each_const_col = [each_value for _ in range(df_length)] + each_const_df = pd.DataFrame({each_col_name: each_const_col}) + processed_data = self.attach_columns(processed_data, each_const_df) + + logger.info("Data reverse-converted by ConstValueTransformer.") + + return processed_data + + +@hookimpl +def register(manager): + manager.register("ConstValueTransformer", ConstValueTransformer) diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py new file mode 100644 index 00000000..b628d22f --- /dev/null +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -0,0 +1,73 @@ +import numpy as np +import pandas as pd +import pytest + +from sdgx.data_models.metadata import Metadata +from sdgx.data_processors.transformers.const import ConstValueTransformer + + +@pytest.fixture +def raw_data(demo_single_table_path): + yield pd.read_csv(demo_single_table_path) + + +@pytest.fixture +def test_const_data(raw_data: pd.DataFrame): + # Convert the columns to float to allow None values + raw_data["age"] = raw_data["age"].astype(float) + raw_data["fnlwgt"] = raw_data["fnlwgt"].astype(float) + + # Set the values to None + raw_data["age"].values[:] = 100 + raw_data["fnlwgt"].values[:] = 1.41421 + + yield raw_data + + +def test_const_handling_test_df(test_const_data: pd.DataFrame): + """ + Test the handling of const columns in a DataFrame. + This function tests the behavior of a DataFrame when it contains const columns. + It is designed to be used in a testing environment, where the DataFrame is passed as an argument. + + Parameters: + test_const_data (pd.DataFrame): The DataFrame to test. + + Returns: + None + + Raises: + AssertionError: If the DataFrame does not handle const columns as expected. + """ + + metadata = Metadata.from_dataframe(test_const_data) + + # Initialize the ConstValueTransformer. + const_transformer = ConstValueTransformer() + # Check if the transformer has not been fitted yet. + assert const_transformer.fitted is False + + # Fit the transformer with the DataFrame. + const_transformer.fit(metadata) + + # Check if the transformer has been fitted after the fit operation. + assert const_transformer.fitted + + # Check the const column + assert sorted(const_transformer.const_columns) == ["age", "fnlwgt"] + + # Transform the DataFrame using the transformer. + transformed_df = const_transformer.convert(test_const_data) + + # Check if the transformed DataFrame does not contain any const columns. + # assert not df_has_const_col(transformed_df) + processed_metadata = Metadata.from_dataframe(transformed_df) + assert not processed_metadata.get("const_columns") + + # reverse convert the df + reverse_converted_df = const_transformer.reverse_convert(transformed_df) + reverse_converted_metadata = Metadata.from_dataframe(reverse_converted_df) + assert reverse_converted_metadata.get("const_columns") == {"age", "fnlwgt"} + assert reverse_converted_df['age'][0] == 100 + assert reverse_converted_df['fnlwgt'][0] == 1.41421 + From 6f6d87a707beb1a3b4a48a9ffda617c7c256764d Mon Sep 17 00:00:00 2001 From: MoooCat Date: Fri, 12 Jul 2024 18:03:52 +0800 Subject: [PATCH 04/30] update comments and validator parameter name --- sdgx/data_models/inspectors/const.py | 34 +++++++++++++++++++++------- sdgx/data_models/metadata.py | 28 +++++++++++++++++------ 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/sdgx/data_models/inspectors/const.py b/sdgx/data_models/inspectors/const.py index 122713d6..906a042d 100644 --- a/sdgx/data_models/inspectors/const.py +++ b/sdgx/data_models/inspectors/const.py @@ -8,28 +8,46 @@ class ConstInspector(Inspector): + """ + ConstInspector is a class designed to identify columns in a DataFrame that contain constant values. + It extends the base Inspector class and is used to fit the data and inspect it for constant columns. + + Attributes: + const_columns (set[str]): A set of column names that contain constant values. + const_values (dict[Any]): A dictionary mapping column names to their constant values. + _inspect_level (int): The inspection level for this inspector, set to 80. + """ const_columns: set[str] = set() + """ + A set of column names that contain constant values. This attribute is populated during the fit method by identifying columns in the DataFrame where all values are the same. + """ const_values : dict[Any] = {} - + """ + A dictionary mapping column names to their constant values. This attribute is populated during the fit method by storing the unique value found in each constant column. + """ + _inspect_level = 80 + """ + The inspection level for this inspector, set to 80. This attribute indicates the priority or depth of inspection that this inspector performs relative to other inspectors. + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def fit(self, raw_data: pd.DataFrame, *args, **kwargs): - """Fit the inspector. + """ + Fit the inspector to the raw data. - Gets the list of const columns from the raw data. + This method identifies columns in the DataFrame that contain constant values. It populates the `const_columns` set with the names of these columns and the `const_values` dictionary with the constant values found in each column. Args: - raw_data (pd.DataFrame): Raw data - """ - - # Identify columns where the const rate exceeds the threshold - self.const_columns = set() + raw_data (pd.DataFrame): The raw data to be inspected. + Returns: + None + """ # iterate each column for column in raw_data.columns: if len(raw_data[column].value_counts(normalize=True)) == 1: diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index ed16e1d1..031d2bc8 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -40,16 +40,15 @@ class Metadata(BaseModel): """ column_list: List[str] = Field(default_factory=list, title="The List of Column Names") - """" column_list is the actual value of self.column_list """ @field_validator("column_list") - def check_column_list(cls, value) -> Any: + def check_column_list(cls, v) -> Any: # check if v has duplicate element - if len(value) == len(set(value)): - return value + if len(v) == len(set(v)): + return v raise MetadataInitError("column_list has duplicate element!") column_inspect_level: Dict[str, int] = defaultdict(lambda: 10) @@ -82,7 +81,12 @@ def check_column_list(cls, value) -> Any: @property def tag_fields(self) -> Iterable[str]: """ - Return all tag fields in this metadata. + Returns a list of fields that represent tags or labels associated with the data. + + These fields might include labels used for categorization, such as tags for machine learning labels or data classification. + + Returns: + list[str]: List of field names that represent tags or labels. """ return chain( @@ -93,7 +97,12 @@ def tag_fields(self) -> Iterable[str]: @property def format_fields(self) -> Iterable[str]: """ - Return all tag fields in this metadata. + Returns a list of fields that represent the format or structure of the data. + + These fields might include metadata about how the data is structured, such as date formats, delimiters, etc. + + Returns: + list[str]: List of field names that represent data formats. """ return chain( @@ -104,7 +113,12 @@ def format_fields(self) -> Iterable[str]: @property def value_fields(self) -> Iterable[str]: """ - Return all tag fields in this metadata. + Returns a list of fields that represent the values in the dataset. + + These fields are typically numeric or categorical values that are used for analysis or modeling. + + Returns: + list[str]: List of field names that represent values. """ return chain( From 24dfaab91a0041dade807b71db25ed07e76aea2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jul 2024 10:13:39 +0000 Subject: [PATCH 05/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_models/inspectors/const.py | 10 +++++----- sdgx/data_models/metadata.py | 14 +++++++------- sdgx/data_processors/manager.py | 6 +++++- sdgx/data_processors/transformers/const.py | 5 +++-- tests/data_models/inspector/test_const.py | 16 +++++----------- .../transformers/test_transformers_const.py | 5 ++--- 6 files changed, 27 insertions(+), 29 deletions(-) diff --git a/sdgx/data_models/inspectors/const.py b/sdgx/data_models/inspectors/const.py index 906a042d..5c1e1364 100644 --- a/sdgx/data_models/inspectors/const.py +++ b/sdgx/data_models/inspectors/const.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import Any + import pandas as pd from sdgx.data_models.inspectors.base import Inspector @@ -23,7 +24,7 @@ class ConstInspector(Inspector): A set of column names that contain constant values. This attribute is populated during the fit method by identifying columns in the DataFrame where all values are the same. """ - const_values : dict[Any] = {} + const_values: dict[Any] = {} """ A dictionary mapping column names to their constant values. This attribute is populated during the fit method by storing the unique value found in each constant column. """ @@ -32,7 +33,7 @@ class ConstInspector(Inspector): """ The inspection level for this inspector, set to 80. This attribute indicates the priority or depth of inspection that this inspector performs relative to other inspectors. """ - + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -48,7 +49,7 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): Returns: None """ - # iterate each column + # iterate each column for column in raw_data.columns: if len(raw_data[column].value_counts(normalize=True)) == 1: self.const_columns.add(column) @@ -59,8 +60,7 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): def inspect(self, *args, **kwargs) -> dict[str, Any]: """Inspect raw data and generate metadata.""" - return {"const_columns": list(self.const_columns), - "const_values": self.const_values} + return {"const_columns": list(self.const_columns), "const_values": self.const_values} @hookimpl diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index 031d2bc8..d33ce8fd 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -82,9 +82,9 @@ def check_column_list(cls, v) -> Any: def tag_fields(self) -> Iterable[str]: """ Returns a list of fields that represent tags or labels associated with the data. - + These fields might include labels used for categorization, such as tags for machine learning labels or data classification. - + Returns: list[str]: List of field names that represent tags or labels. """ @@ -98,9 +98,9 @@ def tag_fields(self) -> Iterable[str]: def format_fields(self) -> Iterable[str]: """ Returns a list of fields that represent the format or structure of the data. - + These fields might include metadata about how the data is structured, such as date formats, delimiters, etc. - + Returns: list[str]: List of field names that represent data formats. """ @@ -109,14 +109,14 @@ def format_fields(self) -> Iterable[str]: (k for k in self.model_fields if k.endswith("_format")), (k for k in self._extend.keys() if k.endswith("_format")), ) - + @property def value_fields(self) -> Iterable[str]: """ Returns a list of fields that represent the values in the dataset. - + These fields are typically numeric or categorical values that are used for analysis or modeling. - + Returns: list[str]: List of field names that represent values. """ diff --git a/sdgx/data_processors/manager.py b/sdgx/data_processors/manager.py index a32ee85b..9c331cae 100644 --- a/sdgx/data_processors/manager.py +++ b/sdgx/data_processors/manager.py @@ -52,7 +52,11 @@ class DataProcessorManager(Manager): "IntValueFormatter", "DatetimeFormatter", ] - ] + ["ConstValueTransformer".lower(), "EmptyTransformer".lower(), "ColumnOrderTransformer".lower()] + ] + [ + "ConstValueTransformer".lower(), + "EmptyTransformer".lower(), + "ColumnOrderTransformer".lower(), + ] """ preset_defalut_processors list stores the lowercase names of the transformers loaded by default. When using the synthesizer, they will be loaded by default to facilitate user operations. diff --git a/sdgx/data_processors/transformers/const.py b/sdgx/data_processors/transformers/const.py index fef4b084..cba6846c 100644 --- a/sdgx/data_processors/transformers/const.py +++ b/sdgx/data_processors/transformers/const.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import Any + import pandas as pd from sdgx.data_models.metadata import Metadata @@ -19,7 +20,7 @@ class ConstValueTransformer(Transformer): Attributes: const_value (dict[Any]): The constant value that will be returned. """ - + const_columns: list = [] const_values: dict[Any] = {} @@ -42,7 +43,7 @@ def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]): for each_col in metadata.column_list: if metadata.get_column_data_type(each_col) == "const": self.const_columns.append(each_col) - self.const_values[each_col] = metadata.get('const_values')[each_col] + self.const_values[each_col] = metadata.get("const_values")[each_col] logger.info("ConstValueTransformer Fitted.") diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py index 93b8b5c0..cc1a940f 100644 --- a/tests/data_models/inspector/test_const.py +++ b/tests/data_models/inspector/test_const.py @@ -3,6 +3,7 @@ from sdgx.data_models.inspectors.const import ConstInspector + @pytest.fixture def inspector(): yield ConstInspector() @@ -31,20 +32,13 @@ def test_inspector(inspector: ConstInspector, test_const_data): inspector.fit(test_const_data) assert inspector.ready assert inspector.const_columns - assert sorted(inspector.inspect()["const_columns"]) == sorted( - [ - "age", - "fnlwgt","workclass" - ] - ) - + assert sorted(inspector.inspect()["const_columns"]) == sorted(["age", "fnlwgt", "workclass"]) + assert inspector.inspect_level == 80 - assert sorted(list(inspector.const_values.keys())) == sorted(["age", - "fnlwgt","workclass"]) + assert sorted(list(inspector.const_values.keys())) == sorted(["age", "fnlwgt", "workclass"]) assert inspector.const_values["age"] == 100 assert inspector.const_values["fnlwgt"] == 3.14 - assert inspector.const_values["workclass"] == 'President' - + assert inspector.const_values["workclass"] == "President" if __name__ == "__main__": diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py index b628d22f..6c1bc992 100644 --- a/tests/data_processors/transformers/test_transformers_const.py +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -68,6 +68,5 @@ def test_const_handling_test_df(test_const_data: pd.DataFrame): reverse_converted_df = const_transformer.reverse_convert(transformed_df) reverse_converted_metadata = Metadata.from_dataframe(reverse_converted_df) assert reverse_converted_metadata.get("const_columns") == {"age", "fnlwgt"} - assert reverse_converted_df['age'][0] == 100 - assert reverse_converted_df['fnlwgt'][0] == 1.41421 - + assert reverse_converted_df["age"][0] == 100 + assert reverse_converted_df["fnlwgt"][0] == 1.41421 From 67c5a6ec37973f26e23cac5aa2f4a82794fc102a Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 15 Jul 2024 20:38:40 +0800 Subject: [PATCH 06/30] update typo in test case --- .../data_processors/transformers/test_transformers_const.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py index 6c1bc992..845fb0bd 100644 --- a/tests/data_processors/transformers/test_transformers_const.py +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -20,6 +20,7 @@ def test_const_data(raw_data: pd.DataFrame): # Set the values to None raw_data["age"].values[:] = 100 raw_data["fnlwgt"].values[:] = 1.41421 + raw_data["workclass"].values[:] = "President" yield raw_data @@ -54,7 +55,7 @@ def test_const_handling_test_df(test_const_data: pd.DataFrame): assert const_transformer.fitted # Check the const column - assert sorted(const_transformer.const_columns) == ["age", "fnlwgt"] + assert sorted(const_transformer.const_columns) == ["age", "fnlwgt", 'workclass',] # Transform the DataFrame using the transformer. transformed_df = const_transformer.convert(test_const_data) @@ -67,6 +68,7 @@ def test_const_handling_test_df(test_const_data: pd.DataFrame): # reverse convert the df reverse_converted_df = const_transformer.reverse_convert(transformed_df) reverse_converted_metadata = Metadata.from_dataframe(reverse_converted_df) - assert reverse_converted_metadata.get("const_columns") == {"age", "fnlwgt"} + assert reverse_converted_metadata.get("const_columns") == {"age", "fnlwgt", 'workclass'} assert reverse_converted_df["age"][0] == 100 assert reverse_converted_df["fnlwgt"][0] == 1.41421 + assert reverse_converted_df["fnlwgt"][0] == "President" From 01f156e643980bce85a99adb4514e02b37c87351 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:39:02 +0000 Subject: [PATCH 07/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../transformers/test_transformers_const.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py index 845fb0bd..7b961acf 100644 --- a/tests/data_processors/transformers/test_transformers_const.py +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -55,7 +55,11 @@ def test_const_handling_test_df(test_const_data: pd.DataFrame): assert const_transformer.fitted # Check the const column - assert sorted(const_transformer.const_columns) == ["age", "fnlwgt", 'workclass',] + assert sorted(const_transformer.const_columns) == [ + "age", + "fnlwgt", + "workclass", + ] # Transform the DataFrame using the transformer. transformed_df = const_transformer.convert(test_const_data) @@ -68,7 +72,7 @@ def test_const_handling_test_df(test_const_data: pd.DataFrame): # reverse convert the df reverse_converted_df = const_transformer.reverse_convert(transformed_df) reverse_converted_metadata = Metadata.from_dataframe(reverse_converted_df) - assert reverse_converted_metadata.get("const_columns") == {"age", "fnlwgt", 'workclass'} + assert reverse_converted_metadata.get("const_columns") == {"age", "fnlwgt", "workclass"} assert reverse_converted_df["age"][0] == 100 assert reverse_converted_df["fnlwgt"][0] == 1.41421 assert reverse_converted_df["fnlwgt"][0] == "President" From 085dfb2148b497163cec9e3888b755a8f1eefb48 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 15 Jul 2024 20:54:57 +0800 Subject: [PATCH 08/30] fix metadata test error of value fields The management of metadata fields may be flawed, necessitating an examination of the eq method or the manner in which fields are retrieved. We will open a separate pull request to address this issue. --- sdgx/data_models/metadata.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index d33ce8fd..975aa6fe 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -45,6 +45,7 @@ class Metadata(BaseModel): """ @field_validator("column_list") + # @classmethod def check_column_list(cls, v) -> Any: # check if v has duplicate element if len(v) == len(set(v)): @@ -70,6 +71,7 @@ def check_column_list(cls, v) -> Any: discrete_columns: Set[str] = set() datetime_columns: Set[str] = set() datetime_format: Dict = defaultdict(str) + const_values: Dict = defaultdict(str) # version info version: str = "1.0" @@ -206,6 +208,7 @@ def set(self, key: str, value: Any): key in self.model_fields and key not in self.tag_fields and key not in self.format_fields + and key not in self.value_fields ): raise MetadataInitError( f"Set {key} not in tag_fields, try set it directly as m.{key} = value" @@ -252,6 +255,9 @@ def add(self, key: str, values: str | Iterable[str]): # already in fields that contains dict if key in list(self.format_fields): self.get(key).update(values) + + if key in list(self.value_fields): + self.get(key).update(values) # in extend if self._extend.get(key, None) is None: From f0472e92d6e693029564691516554e3cfcddc739 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:56:37 +0000 Subject: [PATCH 09/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_models/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index 975aa6fe..e1d0445e 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -255,7 +255,7 @@ def add(self, key: str, values: str | Iterable[str]): # already in fields that contains dict if key in list(self.format_fields): self.get(key).update(values) - + if key in list(self.value_fields): self.get(key).update(values) From 29b7bfaa40aa3d723f1e0e1feb4d32c2c4f5619d Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 15 Jul 2024 21:13:11 +0800 Subject: [PATCH 10/30] add const_columns in default --- sdgx/data_models/metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index e1d0445e..5045bfae 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -70,6 +70,7 @@ def check_column_list(cls, v) -> Any: bool_columns: Set[str] = set() discrete_columns: Set[str] = set() datetime_columns: Set[str] = set() + const_columns: Set[str] = set() datetime_format: Dict = defaultdict(str) const_values: Dict = defaultdict(str) From c23f632461bb623d3a6afe2d5673ba7716cbe7cc Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 15 Jul 2024 21:18:01 +0800 Subject: [PATCH 11/30] Refreshing the test cases Addressing issues in pytest where erroneous references to certain pytest.fixture instances arise can be resolved through the utilization of deepcopy. --- tests/data_models/inspector/test_const.py | 15 +++++++++------ .../transformers/test_transformers_const.py | 15 +++++++++------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py index cc1a940f..7790b4f1 100644 --- a/tests/data_models/inspector/test_const.py +++ b/tests/data_models/inspector/test_const.py @@ -1,5 +1,6 @@ import pandas as pd import pytest +import copy from sdgx.data_models.inspectors.const import ConstInspector @@ -16,16 +17,18 @@ def raw_data(demo_single_table_path): @pytest.fixture def test_const_data(raw_data: pd.DataFrame): + const_col_df = copy.deepcopy(raw_data) + # Convert the columns to float to allow None values - raw_data["age"] = raw_data["age"].astype(float) - raw_data["fnlwgt"] = raw_data["fnlwgt"].astype(float) + const_col_df["age"] = const_col_df["age"].astype(float) + const_col_df["fnlwgt"] = const_col_df["fnlwgt"].astype(float) # Set the values to None - raw_data["age"].values[:] = 100 - raw_data["fnlwgt"].values[:] = 3.14 - raw_data["workclass"].values[:] = "President" + const_col_df["age"].values[:] = 100 + const_col_df["fnlwgt"].values[:] = 3.14 + const_col_df["workclass"].values[:] = "President" - yield raw_data + yield const_col_df def test_inspector(inspector: ConstInspector, test_const_data): diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py index 7b961acf..0e75f9ed 100644 --- a/tests/data_processors/transformers/test_transformers_const.py +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest +import copy from sdgx.data_models.metadata import Metadata from sdgx.data_processors.transformers.const import ConstValueTransformer @@ -13,16 +14,18 @@ def raw_data(demo_single_table_path): @pytest.fixture def test_const_data(raw_data: pd.DataFrame): + + const_col_df = copy.deepcopy(raw_data) # Convert the columns to float to allow None values - raw_data["age"] = raw_data["age"].astype(float) - raw_data["fnlwgt"] = raw_data["fnlwgt"].astype(float) + const_col_df["age"] = const_col_df["age"].astype(float) + const_col_df["fnlwgt"] = const_col_df["fnlwgt"].astype(float) # Set the values to None - raw_data["age"].values[:] = 100 - raw_data["fnlwgt"].values[:] = 1.41421 - raw_data["workclass"].values[:] = "President" + const_col_df["age"].values[:] = 100 + const_col_df["fnlwgt"].values[:] = 1.41421 + const_col_df["workclass"].values[:] = "President" - yield raw_data + yield const_col_df def test_const_handling_test_df(test_const_data: pd.DataFrame): From d0c5733e5ea118fb5c37063a5f2f8f0192b27b85 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 13:18:49 +0000 Subject: [PATCH 12/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/data_models/inspector/test_const.py | 5 +++-- .../data_processors/transformers/test_transformers_const.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py index 7790b4f1..7fb968cd 100644 --- a/tests/data_models/inspector/test_const.py +++ b/tests/data_models/inspector/test_const.py @@ -1,6 +1,7 @@ +import copy + import pandas as pd import pytest -import copy from sdgx.data_models.inspectors.const import ConstInspector @@ -18,7 +19,7 @@ def raw_data(demo_single_table_path): @pytest.fixture def test_const_data(raw_data: pd.DataFrame): const_col_df = copy.deepcopy(raw_data) - + # Convert the columns to float to allow None values const_col_df["age"] = const_col_df["age"].astype(float) const_col_df["fnlwgt"] = const_col_df["fnlwgt"].astype(float) diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py index 0e75f9ed..8945d1f5 100644 --- a/tests/data_processors/transformers/test_transformers_const.py +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -1,7 +1,8 @@ +import copy + import numpy as np import pandas as pd import pytest -import copy from sdgx.data_models.metadata import Metadata from sdgx.data_processors.transformers.const import ConstValueTransformer From 029d5ba5de678d234dcf5d97b58a4d6b77d3aa25 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 15 Jul 2024 21:28:08 +0800 Subject: [PATCH 13/30] Revise the unit tests for the module handling two constant collections to ensure they are comprehensive and reflect the latest functionality. --- tests/data_models/inspector/test_const.py | 18 ++++-------------- .../transformers/test_transformers_const.py | 9 ++------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py index 7fb968cd..0df8f3a5 100644 --- a/tests/data_models/inspector/test_const.py +++ b/tests/data_models/inspector/test_const.py @@ -5,20 +5,9 @@ from sdgx.data_models.inspectors.const import ConstInspector - -@pytest.fixture -def inspector(): - yield ConstInspector() - - -@pytest.fixture -def raw_data(demo_single_table_path): - yield pd.read_csv(demo_single_table_path) - - @pytest.fixture -def test_const_data(raw_data: pd.DataFrame): - const_col_df = copy.deepcopy(raw_data) +def test_const_data(demo_single_table_path): + const_col_df = pd.read_csv(demo_single_table_path) # Convert the columns to float to allow None values const_col_df["age"] = const_col_df["age"].astype(float) @@ -32,7 +21,8 @@ def test_const_data(raw_data: pd.DataFrame): yield const_col_df -def test_inspector(inspector: ConstInspector, test_const_data): +def test_inspector(test_const_data): + inspector = ConstInspector() inspector.fit(test_const_data) assert inspector.ready assert inspector.const_columns diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py index 8945d1f5..922d7a27 100644 --- a/tests/data_processors/transformers/test_transformers_const.py +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -9,14 +9,9 @@ @pytest.fixture -def raw_data(demo_single_table_path): - yield pd.read_csv(demo_single_table_path) +def test_const_data(demo_single_table_path): - -@pytest.fixture -def test_const_data(raw_data: pd.DataFrame): - - const_col_df = copy.deepcopy(raw_data) + const_col_df = pd.read_csv(demo_single_table_path) # Convert the columns to float to allow None values const_col_df["age"] = const_col_df["age"].astype(float) const_col_df["fnlwgt"] = const_col_df["fnlwgt"].astype(float) From 63d0b9546a0765c32194b37e1d250c01fad453fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 13:28:24 +0000 Subject: [PATCH 14/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/data_models/inspector/test_const.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py index 0df8f3a5..ec8fe18e 100644 --- a/tests/data_models/inspector/test_const.py +++ b/tests/data_models/inspector/test_const.py @@ -5,6 +5,7 @@ from sdgx.data_models.inspectors.const import ConstInspector + @pytest.fixture def test_const_data(demo_single_table_path): const_col_df = pd.read_csv(demo_single_table_path) From 9324b45c5143f6451e55a51f8bfa011492549f5c Mon Sep 17 00:00:00 2001 From: MoooCat Date: Mon, 15 Jul 2024 21:55:09 +0800 Subject: [PATCH 15/30] add deepcopy in const.py --- sdgx/data_processors/transformers/const.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdgx/data_processors/transformers/const.py b/sdgx/data_processors/transformers/const.py index cba6846c..c8a60cbe 100644 --- a/sdgx/data_processors/transformers/const.py +++ b/sdgx/data_processors/transformers/const.py @@ -3,6 +3,7 @@ from typing import Any import pandas as pd +import copy from sdgx.data_models.metadata import Metadata from sdgx.data_processors.extension import hookimpl @@ -63,7 +64,7 @@ def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: pd.DataFrame: A DataFrame with the specified columns removed. """ - processed_data = raw_data + processed_data = copy.deepcopy(raw_data) logger.info("Converting data using ConstValueTransformer...") From fb1d3b85a58e348b6f15e8b271ed09e9f9f2bf19 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 13:55:25 +0000 Subject: [PATCH 16/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_processors/transformers/const.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdgx/data_processors/transformers/const.py b/sdgx/data_processors/transformers/const.py index c8a60cbe..f0ef99c0 100644 --- a/sdgx/data_processors/transformers/const.py +++ b/sdgx/data_processors/transformers/const.py @@ -1,9 +1,9 @@ from __future__ import annotations +import copy from typing import Any import pandas as pd -import copy from sdgx.data_models.metadata import Metadata from sdgx.data_processors.extension import hookimpl From 2fa14eb70c1b5ac219fc09b2e9bf945a8264b20b Mon Sep 17 00:00:00 2001 From: MoooCat Date: Thu, 18 Jul 2024 14:05:12 +0800 Subject: [PATCH 17/30] restore metadata, modify const inspector and transformer --- sdgx/data_models/inspectors/const.py | 4 +- sdgx/data_models/metadata.py | 48 ++++------------------ sdgx/data_processors/transformers/const.py | 5 ++- 3 files changed, 13 insertions(+), 44 deletions(-) diff --git a/sdgx/data_models/inspectors/const.py b/sdgx/data_models/inspectors/const.py index 5c1e1364..f4531031 100644 --- a/sdgx/data_models/inspectors/const.py +++ b/sdgx/data_models/inspectors/const.py @@ -53,14 +53,14 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): for column in raw_data.columns: if len(raw_data[column].value_counts(normalize=True)) == 1: self.const_columns.add(column) - self.const_values[column] = raw_data[column][0] + # self.const_values[column] = raw_data[column][0] self.ready = True def inspect(self, *args, **kwargs) -> dict[str, Any]: """Inspect raw data and generate metadata.""" - return {"const_columns": list(self.const_columns), "const_values": self.const_values} + return {"const_columns": list(self.const_columns)} @hookimpl diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index 5045bfae..eac72415 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -40,16 +40,17 @@ class Metadata(BaseModel): """ column_list: List[str] = Field(default_factory=list, title="The List of Column Names") + """" column_list is the actual value of self.column_list """ @field_validator("column_list") - # @classmethod - def check_column_list(cls, v) -> Any: + @classmethod + def check_column_list(cls, value) -> Any: # check if v has duplicate element - if len(v) == len(set(v)): - return v + if len(value) == len(set(value)): + return value raise MetadataInitError("column_list has duplicate element!") column_inspect_level: Dict[str, int] = defaultdict(lambda: 10) @@ -72,7 +73,6 @@ def check_column_list(cls, v) -> Any: datetime_columns: Set[str] = set() const_columns: Set[str] = set() datetime_format: Dict = defaultdict(str) - const_values: Dict = defaultdict(str) # version info version: str = "1.0" @@ -84,12 +84,7 @@ def check_column_list(cls, v) -> Any: @property def tag_fields(self) -> Iterable[str]: """ - Returns a list of fields that represent tags or labels associated with the data. - - These fields might include labels used for categorization, such as tags for machine learning labels or data classification. - - Returns: - list[str]: List of field names that represent tags or labels. + Return all tag fields in this metadata. """ return chain( @@ -100,12 +95,7 @@ def tag_fields(self) -> Iterable[str]: @property def format_fields(self) -> Iterable[str]: """ - Returns a list of fields that represent the format or structure of the data. - - These fields might include metadata about how the data is structured, such as date formats, delimiters, etc. - - Returns: - list[str]: List of field names that represent data formats. + Return all tag fields in this metadata. """ return chain( @@ -113,22 +103,6 @@ def format_fields(self) -> Iterable[str]: (k for k in self._extend.keys() if k.endswith("_format")), ) - @property - def value_fields(self) -> Iterable[str]: - """ - Returns a list of fields that represent the values in the dataset. - - These fields are typically numeric or categorical values that are used for analysis or modeling. - - Returns: - list[str]: List of field names that represent values. - """ - - return chain( - (k for k in self.model_fields if k.endswith("_values")), - (k for k in self._extend.keys() if k.endswith("_values")), - ) - def __eq__(self, other): if not isinstance(other, Metadata): return super().__eq__(other) @@ -142,10 +116,6 @@ def __eq__(self, other): self.get(key) == other.get(key) for key in set(chain(self.format_fields, other.format_fields)) ) - and all( - self.get(key) == other.get(key) - for key in set(chain(self.value_fields, other.value_fields)) - ) and self.version == other.version ) @@ -209,7 +179,6 @@ def set(self, key: str, value: Any): key in self.model_fields and key not in self.tag_fields and key not in self.format_fields - and key not in self.value_fields ): raise MetadataInitError( f"Set {key} not in tag_fields, try set it directly as m.{key} = value" @@ -257,9 +226,6 @@ def add(self, key: str, values: str | Iterable[str]): if key in list(self.format_fields): self.get(key).update(values) - if key in list(self.value_fields): - self.get(key).update(values) - # in extend if self._extend.get(key, None) is None: self._extend[key] = values diff --git a/sdgx/data_processors/transformers/const.py b/sdgx/data_processors/transformers/const.py index f0ef99c0..50cfcbff 100644 --- a/sdgx/data_processors/transformers/const.py +++ b/sdgx/data_processors/transformers/const.py @@ -44,7 +44,7 @@ def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]): for each_col in metadata.column_list: if metadata.get_column_data_type(each_col) == "const": self.const_columns.append(each_col) - self.const_values[each_col] = metadata.get("const_values")[each_col] + # self.const_values[each_col] = metadata.get("const_values")[each_col] logger.info("ConstValueTransformer Fitted.") @@ -69,6 +69,9 @@ def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: logger.info("Converting data using ConstValueTransformer...") for each_col in self.const_columns: + # record values here + if each_col not in self.const_values.keys(): + self.const_values[each_col] = processed_data[each_col].unique()[0] processed_data = self.remove_columns(processed_data, [each_col]) logger.info("Converting data using ConstValueTransformer... Finished.") From 7d0d5c71b22392395220f2592ca46ac5a535e277 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 06:05:30 +0000 Subject: [PATCH 18/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_processors/transformers/const.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdgx/data_processors/transformers/const.py b/sdgx/data_processors/transformers/const.py index 50cfcbff..4af042d8 100644 --- a/sdgx/data_processors/transformers/const.py +++ b/sdgx/data_processors/transformers/const.py @@ -69,7 +69,7 @@ def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame: logger.info("Converting data using ConstValueTransformer...") for each_col in self.const_columns: - # record values here + # record values here if each_col not in self.const_values.keys(): self.const_values[each_col] = processed_data[each_col].unique()[0] processed_data = self.remove_columns(processed_data, [each_col]) From 327bee1753a7cf25f2c93e600e08cbfa8cb3b360 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Thu, 18 Jul 2024 14:36:46 +0800 Subject: [PATCH 19/30] remove const_values in inspector's unit test --- tests/data_models/inspector/test_const.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py index ec8fe18e..d830d04a 100644 --- a/tests/data_models/inspector/test_const.py +++ b/tests/data_models/inspector/test_const.py @@ -22,19 +22,14 @@ def test_const_data(demo_single_table_path): yield const_col_df -def test_inspector(test_const_data): +def test_inspector(test_const_data: pd.DataFrame): inspector = ConstInspector() inspector.fit(test_const_data) assert inspector.ready assert inspector.const_columns - assert sorted(inspector.inspect()["const_columns"]) == sorted(["age", "fnlwgt", "workclass"]) + assert sorted(inspector.inspect()["const_columns"]) == sorted(["age", "fnlwgt", "workclass"]) assert inspector.inspect_level == 80 - assert sorted(list(inspector.const_values.keys())) == sorted(["age", "fnlwgt", "workclass"]) - assert inspector.const_values["age"] == 100 - assert inspector.const_values["fnlwgt"] == 3.14 - assert inspector.const_values["workclass"] == "President" - if __name__ == "__main__": pytest.main(["-vv", "-s", __file__]) From bd23517c3c2e6f08d6dbdbf7cf6e32c5adf0a933 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 06:37:07 +0000 Subject: [PATCH 20/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/data_models/inspector/test_const.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py index d830d04a..d3ffe422 100644 --- a/tests/data_models/inspector/test_const.py +++ b/tests/data_models/inspector/test_const.py @@ -31,5 +31,6 @@ def test_inspector(test_const_data: pd.DataFrame): assert sorted(inspector.inspect()["const_columns"]) == sorted(["age", "fnlwgt", "workclass"]) assert inspector.inspect_level == 80 + if __name__ == "__main__": pytest.main(["-vv", "-s", __file__]) From dbad60f22967b539807bbb9b7a28a4f2c8d8b035 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Thu, 18 Jul 2024 16:18:39 +0800 Subject: [PATCH 21/30] remove multiple metadata in unit test --- .../transformers/test_transformers_const.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py index 922d7a27..53d04638 100644 --- a/tests/data_processors/transformers/test_transformers_const.py +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -63,15 +63,21 @@ def test_const_handling_test_df(test_const_data: pd.DataFrame): # Transform the DataFrame using the transformer. transformed_df = const_transformer.convert(test_const_data) - # Check if the transformed DataFrame does not contain any const columns. - # assert not df_has_const_col(transformed_df) - processed_metadata = Metadata.from_dataframe(transformed_df) - assert not processed_metadata.get("const_columns") + assert 'age' not in transformed_df.columns + assert 'fnlwgt' not in transformed_df.columns + assert 'workclass' not in transformed_df.columns # reverse convert the df reverse_converted_df = const_transformer.reverse_convert(transformed_df) - reverse_converted_metadata = Metadata.from_dataframe(reverse_converted_df) - assert reverse_converted_metadata.get("const_columns") == {"age", "fnlwgt", "workclass"} + + assert 'age' in reverse_converted_df.columns + assert 'fnlwgt' in reverse_converted_df.columns + assert 'workclass' in reverse_converted_df.columns + assert reverse_converted_df["age"][0] == 100 assert reverse_converted_df["fnlwgt"][0] == 1.41421 - assert reverse_converted_df["fnlwgt"][0] == "President" + assert reverse_converted_df["workclass"][0] == "President" + + assert len(reverse_converted_df["age"].unique()) == 1 + assert len(reverse_converted_df["fnlwgt"].unique()) == 1 + assert len(reverse_converted_df["workclass"].unique()) == 1 From 9357e5b31ab01ff17a793a6babd867a9266e015d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 08:20:13 +0000 Subject: [PATCH 22/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../transformers/test_transformers_const.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/data_processors/transformers/test_transformers_const.py b/tests/data_processors/transformers/test_transformers_const.py index 53d04638..585b4056 100644 --- a/tests/data_processors/transformers/test_transformers_const.py +++ b/tests/data_processors/transformers/test_transformers_const.py @@ -63,16 +63,16 @@ def test_const_handling_test_df(test_const_data: pd.DataFrame): # Transform the DataFrame using the transformer. transformed_df = const_transformer.convert(test_const_data) - assert 'age' not in transformed_df.columns - assert 'fnlwgt' not in transformed_df.columns - assert 'workclass' not in transformed_df.columns + assert "age" not in transformed_df.columns + assert "fnlwgt" not in transformed_df.columns + assert "workclass" not in transformed_df.columns # reverse convert the df reverse_converted_df = const_transformer.reverse_convert(transformed_df) - assert 'age' in reverse_converted_df.columns - assert 'fnlwgt' in reverse_converted_df.columns - assert 'workclass' in reverse_converted_df.columns + assert "age" in reverse_converted_df.columns + assert "fnlwgt" in reverse_converted_df.columns + assert "workclass" in reverse_converted_df.columns assert reverse_converted_df["age"][0] == 100 assert reverse_converted_df["fnlwgt"][0] == 1.41421 From 23dd46186092767c5dcc08990b4d4576392b2a93 Mon Sep 17 00:00:00 2001 From: MoooCat Date: Thu, 18 Jul 2024 16:59:35 +0800 Subject: [PATCH 23/30] set all inspectors not ready before fit chunk --- sdgx/data_models/metadata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index eac72415..cffd0932 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -299,6 +299,9 @@ def from_dataloader( inspectors = im.init_inspcetors( include_inspectors, exclude_inspectors, **(inspector_init_kwargs or {}) ) + # set all inspectors not ready + for inspector in inspectors: + inspector.ready = False for i, chunk in enumerate(dataloader.iter()): for inspector in inspectors: if not inspector.ready: From 18d6797d9b186f771d100d436d8598e5a62e37b4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 09:00:25 +0000 Subject: [PATCH 24/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_models/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py index cffd0932..a3ca264b 100644 --- a/sdgx/data_models/metadata.py +++ b/sdgx/data_models/metadata.py @@ -299,7 +299,7 @@ def from_dataloader( inspectors = im.init_inspcetors( include_inspectors, exclude_inspectors, **(inspector_init_kwargs or {}) ) - # set all inspectors not ready + # set all inspectors not ready for inspector in inspectors: inspector.ready = False for i, chunk in enumerate(dataloader.iter()): From 966641bd22add8c0293bf22a25a0af943a8a136b Mon Sep 17 00:00:00 2001 From: MoooCat Date: Fri, 19 Jul 2024 16:26:44 +0800 Subject: [PATCH 25/30] try reset the const columns after inspect --- sdgx/data_models/inspectors/const.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sdgx/data_models/inspectors/const.py b/sdgx/data_models/inspectors/const.py index f4531031..44c58f97 100644 --- a/sdgx/data_models/inspectors/const.py +++ b/sdgx/data_models/inspectors/const.py @@ -1,8 +1,8 @@ from __future__ import annotations from typing import Any - import pandas as pd +import copy from sdgx.data_models.inspectors.base import Inspector from sdgx.data_models.inspectors.extension import hookimpl @@ -60,7 +60,10 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): def inspect(self, *args, **kwargs) -> dict[str, Any]: """Inspect raw data and generate metadata.""" - return {"const_columns": list(self.const_columns)} + res_const_columns = copy.deepcopy(list(self.const_columns)) + # reset the const columns after inspect + self.const_columns = set() + return {"const_columns": res_const_columns} @hookimpl From accf17f8d34b6c3f1a0698250e146f6aae0d15d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 19 Jul 2024 08:27:08 +0000 Subject: [PATCH 26/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_models/inspectors/const.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdgx/data_models/inspectors/const.py b/sdgx/data_models/inspectors/const.py index 44c58f97..56400939 100644 --- a/sdgx/data_models/inspectors/const.py +++ b/sdgx/data_models/inspectors/const.py @@ -1,8 +1,9 @@ from __future__ import annotations +import copy from typing import Any + import pandas as pd -import copy from sdgx.data_models.inspectors.base import Inspector from sdgx.data_models.inspectors.extension import hookimpl @@ -61,7 +62,7 @@ def inspect(self, *args, **kwargs) -> dict[str, Any]: """Inspect raw data and generate metadata.""" res_const_columns = copy.deepcopy(list(self.const_columns)) - # reset the const columns after inspect + # reset the const columns after inspect self.const_columns = set() return {"const_columns": res_const_columns} From afa0dbdb3e25ddb20241e3e13f9a2e54fc30a27b Mon Sep 17 00:00:00 2001 From: MoooCat <141886018+MooooCat@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:39:51 +0800 Subject: [PATCH 27/30] clear column set before inspector fit --- sdgx/data_models/inspectors/bool.py | 1 + sdgx/data_models/inspectors/const.py | 6 ++---- sdgx/data_models/inspectors/datetime.py | 2 ++ sdgx/data_models/inspectors/discrete.py | 1 + sdgx/data_models/inspectors/i_id.py | 2 ++ sdgx/data_models/inspectors/numeric.py | 3 +++ 6 files changed, 11 insertions(+), 4 deletions(-) diff --git a/sdgx/data_models/inspectors/bool.py b/sdgx/data_models/inspectors/bool.py index 53bc7b12..acf0e6a0 100644 --- a/sdgx/data_models/inspectors/bool.py +++ b/sdgx/data_models/inspectors/bool.py @@ -22,6 +22,7 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): Args: raw_data (pd.DataFrame): Raw data """ + self.bool_columns = set() self.bool_columns = self.bool_columns.union( set(raw_data.infer_objects().select_dtypes(include=["bool"]).columns) ) diff --git a/sdgx/data_models/inspectors/const.py b/sdgx/data_models/inspectors/const.py index 56400939..8cfc75f7 100644 --- a/sdgx/data_models/inspectors/const.py +++ b/sdgx/data_models/inspectors/const.py @@ -50,6 +50,7 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): Returns: None """ + self.const_columns = set() # iterate each column for column in raw_data.columns: if len(raw_data[column].value_counts(normalize=True)) == 1: @@ -61,10 +62,7 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): def inspect(self, *args, **kwargs) -> dict[str, Any]: """Inspect raw data and generate metadata.""" - res_const_columns = copy.deepcopy(list(self.const_columns)) - # reset the const columns after inspect - self.const_columns = set() - return {"const_columns": res_const_columns} + return {"const_columns": self.const_columns} @hookimpl diff --git a/sdgx/data_models/inspectors/datetime.py b/sdgx/data_models/inspectors/datetime.py index 84c8ee03..a43c6232 100644 --- a/sdgx/data_models/inspectors/datetime.py +++ b/sdgx/data_models/inspectors/datetime.py @@ -62,6 +62,8 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): Args: raw_data (pd.DataFrame): Raw data """ + self.datetime_columns = set() + self.datetime_columns = self.datetime_columns.union( set(raw_data.infer_objects().select_dtypes(include=["datetime64"]).columns) ) diff --git a/sdgx/data_models/inspectors/discrete.py b/sdgx/data_models/inspectors/discrete.py index 8f7e229c..d38923d0 100644 --- a/sdgx/data_models/inspectors/discrete.py +++ b/sdgx/data_models/inspectors/discrete.py @@ -21,6 +21,7 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): Args: raw_data (pd.DataFrame): Raw data """ + self.discrete_columns = set() self.discrete_columns = self.discrete_columns.union( set(raw_data.select_dtypes(include="object").columns) diff --git a/sdgx/data_models/inspectors/i_id.py b/sdgx/data_models/inspectors/i_id.py index 7ee91065..b21c8daa 100644 --- a/sdgx/data_models/inspectors/i_id.py +++ b/sdgx/data_models/inspectors/i_id.py @@ -29,6 +29,8 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): raw_data (pd.DataFrame): Raw data """ + self.ID_columns = set() + df_length = len(raw_data) candidate_columns = set(raw_data.select_dtypes(include=["object", "int64"]).columns) diff --git a/sdgx/data_models/inspectors/numeric.py b/sdgx/data_models/inspectors/numeric.py index 2147589d..d71661d4 100644 --- a/sdgx/data_models/inspectors/numeric.py +++ b/sdgx/data_models/inspectors/numeric.py @@ -87,6 +87,9 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): raw_data (pd.DataFrame): Raw data """ + self.int_columns = set() + self.float_columns = set() + self.df_length = len(raw_data) float_candidate = self.float_columns.union( From f32311e86c99570eb3e851e76a520de4e9437a5c Mon Sep 17 00:00:00 2001 From: MoooCat <141886018+MooooCat@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:40:05 +0800 Subject: [PATCH 28/30] change test func name --- tests/data_models/inspector/test_const.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data_models/inspector/test_const.py b/tests/data_models/inspector/test_const.py index d3ffe422..49fe0f62 100644 --- a/tests/data_models/inspector/test_const.py +++ b/tests/data_models/inspector/test_const.py @@ -22,7 +22,7 @@ def test_const_data(demo_single_table_path): yield const_col_df -def test_inspector(test_const_data: pd.DataFrame): +def test_const_inspector(test_const_data: pd.DataFrame): inspector = ConstInspector() inspector.fit(test_const_data) assert inspector.ready From 14097f4af4d1ceaf9b53a521d112b7ab3ea47071 Mon Sep 17 00:00:00 2001 From: MoooCat <141886018+MooooCat@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:40:21 +0800 Subject: [PATCH 29/30] add const type in test case --- tests/data_models/test_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data_models/test_metadata.py b/tests/data_models/test_metadata.py index 379f75e8..db8421b0 100644 --- a/tests/data_models/test_metadata.py +++ b/tests/data_models/test_metadata.py @@ -90,7 +90,7 @@ def test_demo_multi_table_data_metadata_child(demo_multi_data_child_matadata): assert demo_multi_data_child_matadata.get_column_data_type("Store") == "int" assert demo_multi_data_child_matadata.get_column_data_type("Date") == "datetime" assert demo_multi_data_child_matadata.get_column_data_type("Customers") == "int" - assert demo_multi_data_child_matadata.get_column_data_type("StateHoliday") == "int" + assert demo_multi_data_child_matadata.get_column_data_type("StateHoliday") == "const" assert demo_multi_data_child_matadata.get_column_data_type("Sales") == "int" assert demo_multi_data_child_matadata.get_column_data_type("Promo") == "int" assert demo_multi_data_child_matadata.get_column_data_type("DayOfWeek") == "int" From fafa39f05ee74fe68b656161249ff60a45627915 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:40:46 +0000 Subject: [PATCH 30/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sdgx/data_models/inspectors/datetime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdgx/data_models/inspectors/datetime.py b/sdgx/data_models/inspectors/datetime.py index a43c6232..cecc5bb5 100644 --- a/sdgx/data_models/inspectors/datetime.py +++ b/sdgx/data_models/inspectors/datetime.py @@ -62,8 +62,8 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs): Args: raw_data (pd.DataFrame): Raw data """ - self.datetime_columns = set() - + self.datetime_columns = set() + self.datetime_columns = self.datetime_columns.union( set(raw_data.infer_objects().select_dtypes(include=["datetime64"]).columns) )