From 1b4220563f8738791406501ccd0279756e289f03 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 29 Sep 2021 11:56:26 -0400 Subject: [PATCH 1/4] Use a column to store categories --- protocol/dataframe_protocol.py | 20 +++++++++----------- protocol/pandas_implementation.py | 27 +++++++++++++++++---------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 8bbf3327..4371aa67 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -175,7 +175,8 @@ def describe_categorical(self) -> dict[bool, bool, Optional[dict]]: If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate dictionary-style encoding for categorical values. + - The data buffer stores encoded values, while the (single) + child column stores the categorical values themselves. Raises RuntimeError if the dtype is not categorical @@ -183,10 +184,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[dict]]: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of - categorical values to other objects exists - - "mapping" : dict, Python-level only (e.g. ``{int: str}``). - None if not a dictionary-style categorical. + - "is_dictionary" : bool, whether the data is integer encoded TBD: are there any other in-memory representations that are needed? """ @@ -265,12 +263,12 @@ def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], """ pass -# def get_children(self) -> Iterable[Column]: -# """ -# Children columns underneath the column, each object in this iterator -# must adhere to the column specification. -# """ -# pass + def get_children(self) -> Iterable[Column]: + """ + Children columns underneath the column, each object in this iterator + must adhere to the column specification. + """ + pass class DataFrame: diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index a016d25c..066f75e1 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -145,14 +145,15 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: """ Convert a categorical column to a Series instance. """ - ordered, is_dict, mapping = col.describe_categorical + ordered, is_dict = col.describe_categorical if not is_dict: raise NotImplementedError('Non-dictionary categoricals not supported yet') # If you want to cheat for testing (can't use `_col` in real-world code): # categories = col._col.values.categories.values # codes = col._col.values.codes - categories = np.asarray(list(mapping.values())) + categories_column, = col.get_children() # need to keep a reference to the child + categories = convert_column_to_ndarray(categories_column)[0] codes_buffer, codes_dtype = col.get_buffers()["data"] codes = buffer_to_ndarray(codes_buffer, codes_dtype) values = categories[codes] @@ -446,7 +447,8 @@ def describe_categorical(self) -> Dict[str, Any]: If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate dictionary-style encoding for categorical values. + - The data buffer stores encoded values, while the (single) + child column stores the categorical values themselves. Raises RuntimeError if the dtype is not categorical @@ -454,10 +456,8 @@ def describe_categorical(self) -> Dict[str, Any]: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of - categorical values to other objects exists - - "mapping" : dict, Python-level only (e.g. ``{int: str}``). - None if not a dictionary-style categorical. + - "is_dictionary" : bool, whether the data is integer encoded + """ if not self.dtype[0] == _DtypeKind.CATEGORICAL: raise TypeError("`describe_categorical only works on a column with " @@ -470,8 +470,7 @@ def describe_categorical(self) -> Dict[str, Any]: codes = self._col.values.codes # ndarray, length `self.size` # categories.values is ndarray of length n_categories categories = self._col.values.categories.values - mapping = {ix: val for ix, val in enumerate(categories)} - return ordered, is_dictionary, mapping + return ordered, is_dictionary @property def describe_null(self) -> Tuple[int, Any]: @@ -693,6 +692,14 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: return buffer, dtype + def get_children(self): + if self.dtype[0] == _DtypeKind.CATEGORICAL: + if self.describe_categorical[1]: + # return the categories as a child Column + return (_PandasColumn(self._col.dtype.categories.to_series()),) + else: + return tuple() + class _PandasDataFrame: """ @@ -840,7 +847,7 @@ def test_categorical_dtype(): assert col.null_count == 1 assert col.describe_null == (2, -1) # sentinel value -1 assert col.num_chunks() == 1 - assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + assert col.describe_categorical == (False, True) df2 = from_dataframe(df) assert_dataframe_equal(df.__dataframe__(), df) From 902ba7c34d60911059d16228b85be7400ba60c19 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Wed, 1 Jun 2022 13:11:16 +0300 Subject: [PATCH 2/4] Make .describe_categoricals["mapping"] a Column --- protocol/dataframe_protocol.py | 22 ++++++++++++---------- protocol/pandas_implementation.py | 29 ++++++++++------------------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 4371aa67..bc6f2080 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -170,13 +170,12 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: pass @property - def describe_categorical(self) -> dict[bool, bool, Optional[dict]]: + def describe_categorical(self) -> dict[bool, bool, Optional[Column]]: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - The data buffer stores encoded values, while the (single) - child column stores the categorical values themselves. + - There is a separate non-categortical Column encoding categorical values. Raises RuntimeError if the dtype is not categorical @@ -184,7 +183,10 @@ def describe_categorical(self) -> dict[bool, bool, Optional[dict]]: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - - "is_dictionary" : bool, whether the data is integer encoded + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "mapping" : Column representing the mapping of indices to category values. + None if not a dictionary-style categorical. TBD: are there any other in-memory representations that are needed? """ @@ -263,12 +265,12 @@ def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], """ pass - def get_children(self) -> Iterable[Column]: - """ - Children columns underneath the column, each object in this iterator - must adhere to the column specification. - """ - pass +# def get_children(self) -> Iterable[Column]: +# """ +# Children columns underneath the column, each object in this iterator +# must adhere to the column specification. +# """ +# pass class DataFrame: diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 066f75e1..aa395083 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -145,15 +145,14 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: """ Convert a categorical column to a Series instance. """ - ordered, is_dict = col.describe_categorical + ordered, is_dict, mapping = col.describe_categorical if not is_dict: raise NotImplementedError('Non-dictionary categoricals not supported yet') # If you want to cheat for testing (can't use `_col` in real-world code): # categories = col._col.values.categories.values # codes = col._col.values.codes - categories_column, = col.get_children() # need to keep a reference to the child - categories = convert_column_to_ndarray(categories_column)[0] + categories = convert_column_to_ndarray(mapping) codes_buffer, codes_dtype = col.get_buffers()["data"] codes = buffer_to_ndarray(codes_buffer, codes_dtype) values = categories[codes] @@ -457,7 +456,8 @@ def describe_categorical(self) -> Dict[str, Any]: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - "is_dictionary" : bool, whether the data is integer encoded - + - "mapping" : Column representing the mapping of indices to category values. + None if not a dictionary-style categorical. """ if not self.dtype[0] == _DtypeKind.CATEGORICAL: raise TypeError("`describe_categorical only works on a column with " @@ -465,12 +465,10 @@ def describe_categorical(self) -> Dict[str, Any]: ordered = self._col.dtype.ordered is_dictionary = True - # NOTE: this shows the children approach is better, transforming - # `categories` to a "mapping" dict is inefficient - codes = self._col.values.codes # ndarray, length `self.size` - # categories.values is ndarray of length n_categories - categories = self._col.values.categories.values - return ordered, is_dictionary + categories = _PandasColumn(self._col.dtype.categories.to_series()) + return {"is_ordered": ordered, + "is_dictionary": is_dictionary, + "mapping": categories} @property def describe_null(self) -> Tuple[int, Any]: @@ -692,14 +690,6 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: return buffer, dtype - def get_children(self): - if self.dtype[0] == _DtypeKind.CATEGORICAL: - if self.describe_categorical[1]: - # return the categories as a child Column - return (_PandasColumn(self._col.dtype.categories.to_series()),) - else: - return tuple() - class _PandasDataFrame: """ @@ -847,7 +837,8 @@ def test_categorical_dtype(): assert col.null_count == 1 assert col.describe_null == (2, -1) # sentinel value -1 assert col.num_chunks() == 1 - assert col.describe_categorical == (False, True) + assert col.describe_categorical["is_ordered"] == False + assert col.describe_categorical["is_dictionary"] == True df2 = from_dataframe(df) assert_dataframe_equal(df.__dataframe__(), df) From b0b2526bd0dddaa31a2bdaeb4125c15585035560 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 7 Jul 2022 15:07:26 +0300 Subject: [PATCH 3/4] Rename "mapping" -> "categories" in .describe_categorical Signed-off-by: Vasily Litvinov --- protocol/dataframe_protocol.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index bc6f2080..de9004c4 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -185,8 +185,9 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]: semantically meaningful. - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - - "mapping" : Column representing the mapping of indices to category values. - None if not a dictionary-style categorical. + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. TBD: are there any other in-memory representations that are needed? """ From 38698278d194b1d72bf2107dd954cd788778cc99 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 7 Jul 2022 18:09:13 +0300 Subject: [PATCH 4/4] Update protocol/dataframe_protocol.py Co-authored-by: Keith Kraus --- protocol/dataframe_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index de9004c4..14854133 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -175,7 +175,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]: If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate non-categortical Column encoding categorical values. + - There is a separate non-categorical Column encoding categorical values. Raises RuntimeError if the dtype is not categorical