From 1b4220563f8738791406501ccd0279756e289f03 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Wed, 29 Sep 2021 11:56:26 -0400
Subject: [PATCH 1/4] Use a column to store categories

---
 protocol/dataframe_protocol.py    | 20 +++++++++-----------
 protocol/pandas_implementation.py | 27 +++++++++++++++++----------
 2 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
index 8bbf3327..4371aa67 100644
--- a/protocol/dataframe_protocol.py
+++ b/protocol/dataframe_protocol.py
@@ -175,7 +175,8 @@ def describe_categorical(self) -> dict[bool, bool, Optional[dict]]:
         If the dtype is categorical, there are two options:
 
         - There are only values in the data buffer.
-        - There is a separate dictionary-style encoding for categorical values.
+        - The data buffer stores encoded values, while the (single)
+          child column stores the categorical values themselves.
 
         Raises RuntimeError if the dtype is not categorical
 
@@ -183,10 +184,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[dict]]:
 
             - "is_ordered" : bool, whether the ordering of dictionary indices is
                              semantically meaningful.
-            - "is_dictionary" : bool, whether a dictionary-style mapping of
-                                categorical values to other objects exists
-            - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
-                          None if not a dictionary-style categorical.
+            - "is_dictionary" : bool, whether the data is integer encoded
 
         TBD: are there any other in-memory representations that are needed?
         """
@@ -265,12 +263,12 @@ def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
         """
         pass
 
-#    def get_children(self) -> Iterable[Column]:
-#        """
-#        Children columns underneath the column, each object in this iterator
-#        must adhere to the column specification.
-#        """
-#        pass
+   def get_children(self) -> Iterable[Column]:
+       """
+       Children columns underneath the column, each object in this iterator
+       must adhere to the column specification.
+       """
+       pass
 
 
 class DataFrame:
diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index a016d25c..066f75e1 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -145,14 +145,15 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
     """
     Convert a categorical column to a Series instance.
     """
-    ordered, is_dict, mapping = col.describe_categorical
+    ordered, is_dict = col.describe_categorical
     if not is_dict:
         raise NotImplementedError('Non-dictionary categoricals not supported yet')
 
     # If you want to cheat for testing (can't use `_col` in real-world code):
     #    categories = col._col.values.categories.values
     #    codes = col._col.values.codes
-    categories = np.asarray(list(mapping.values()))
+    categories_column, = col.get_children()  # need to keep a reference to the child
+    categories = convert_column_to_ndarray(categories_column)[0]
     codes_buffer, codes_dtype = col.get_buffers()["data"]
     codes = buffer_to_ndarray(codes_buffer, codes_dtype)
     values = categories[codes]
@@ -446,7 +447,8 @@ def describe_categorical(self) -> Dict[str, Any]:
         If the dtype is categorical, there are two options:
 
         - There are only values in the data buffer.
-        - There is a separate dictionary-style encoding for categorical values.
+        - The data buffer stores encoded values, while the (single)
+          child column stores the categorical values themselves.
 
         Raises RuntimeError if the dtype is not categorical
 
@@ -454,10 +456,8 @@ def describe_categorical(self) -> Dict[str, Any]:
 
             - "is_ordered" : bool, whether the ordering of dictionary indices is
                              semantically meaningful.
-            - "is_dictionary" : bool, whether a dictionary-style mapping of
-                                categorical values to other objects exists
-            - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
-                          None if not a dictionary-style categorical.
+            - "is_dictionary" : bool, whether the data is integer encoded
+
         """
         if not self.dtype[0] == _DtypeKind.CATEGORICAL:
             raise TypeError("`describe_categorical only works on a column with "
@@ -470,8 +470,7 @@ def describe_categorical(self) -> Dict[str, Any]:
         codes = self._col.values.codes  # ndarray, length `self.size`
         # categories.values is ndarray of length n_categories
         categories = self._col.values.categories.values
-        mapping = {ix: val for ix, val in enumerate(categories)}
-        return ordered, is_dictionary, mapping
+        return ordered, is_dictionary
 
     @property
     def describe_null(self) -> Tuple[int, Any]:
@@ -693,6 +692,14 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
 
         return buffer, dtype
 
+    def get_children(self):
+        if self.dtype[0] == _DtypeKind.CATEGORICAL:
+            if self.describe_categorical[1]:
+                # return the categories as a child Column
+                return (_PandasColumn(self._col.dtype.categories.to_series()),)
+        else:
+            return tuple()
+            
 
 class _PandasDataFrame:
     """
@@ -840,7 +847,7 @@ def test_categorical_dtype():
     assert col.null_count == 1
     assert col.describe_null == (2, -1)  # sentinel value -1
     assert col.num_chunks() == 1
-    assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
+    assert col.describe_categorical == (False, True)
 
     df2 = from_dataframe(df)
     assert_dataframe_equal(df.__dataframe__(), df)

From 902ba7c34d60911059d16228b85be7400ba60c19 Mon Sep 17 00:00:00 2001
From: Vasily Litvinov <fam1ly.n4me@yandex.ru>
Date: Wed, 1 Jun 2022 13:11:16 +0300
Subject: [PATCH 2/4] Make .describe_categorical["mapping"] a Column

---
 protocol/dataframe_protocol.py    | 22 ++++++++++++----------
 protocol/pandas_implementation.py | 29 ++++++++++-------------------
 2 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
index 4371aa67..bc6f2080 100644
--- a/protocol/dataframe_protocol.py
+++ b/protocol/dataframe_protocol.py
@@ -170,13 +170,12 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
         pass
 
     @property
-    def describe_categorical(self) -> dict[bool, bool, Optional[dict]]:
+    def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
         """
         If the dtype is categorical, there are two options:
 
         - There are only values in the data buffer.
-        - The data buffer stores encoded values, while the (single)
-          child column stores the categorical values themselves.
+        - There is a separate non-categortical Column encoding categorical values.
 
         Raises RuntimeError if the dtype is not categorical
 
@@ -184,7 +183,10 @@ def describe_categorical(self) -> dict[bool, bool, Optional[dict]]:
 
             - "is_ordered" : bool, whether the ordering of dictionary indices is
                              semantically meaningful.
-            - "is_dictionary" : bool, whether the data is integer encoded
+            - "is_dictionary" : bool, whether a mapping of
+                                categorical values to other objects exists
+            - "mapping" : Column representing the mapping of indices to category values.
+                          None if not a dictionary-style categorical.
 
         TBD: are there any other in-memory representations that are needed?
         """
@@ -263,12 +265,12 @@ def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
         """
         pass
 
-   def get_children(self) -> Iterable[Column]:
-       """
-       Children columns underneath the column, each object in this iterator
-       must adhere to the column specification.
-       """
-       pass
+#    def get_children(self) -> Iterable[Column]:
+#        """
+#        Children columns underneath the column, each object in this iterator
+#        must adhere to the column specification.
+#        """
+#        pass
 
 
 class DataFrame:
diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index 066f75e1..aa395083 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -145,15 +145,14 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
     """
     Convert a categorical column to a Series instance.
     """
-    ordered, is_dict = col.describe_categorical
+    ordered, is_dict, mapping = map(col.describe_categorical.__getitem__, ("is_ordered", "is_dictionary", "mapping"))
     if not is_dict:
         raise NotImplementedError('Non-dictionary categoricals not supported yet')
 
     # If you want to cheat for testing (can't use `_col` in real-world code):
     #    categories = col._col.values.categories.values
     #    codes = col._col.values.codes
-    categories_column, = col.get_children()  # need to keep a reference to the child
-    categories = convert_column_to_ndarray(categories_column)[0]
+    categories = convert_column_to_ndarray(mapping)[0]  # first item is the ndarray; `mapping` keeps the Column alive
     codes_buffer, codes_dtype = col.get_buffers()["data"]
     codes = buffer_to_ndarray(codes_buffer, codes_dtype)
     values = categories[codes]
@@ -457,7 +456,8 @@ def describe_categorical(self) -> Dict[str, Any]:
             - "is_ordered" : bool, whether the ordering of dictionary indices is
                              semantically meaningful.
             - "is_dictionary" : bool, whether the data is integer encoded
-
+            - "mapping" : Column representing the mapping of indices to category values.
+                          None if not a dictionary-style categorical.
         """
         if not self.dtype[0] == _DtypeKind.CATEGORICAL:
             raise TypeError("`describe_categorical only works on a column with "
@@ -465,12 +465,10 @@ def describe_categorical(self) -> Dict[str, Any]:
 
         ordered = self._col.dtype.ordered
         is_dictionary = True
-        # NOTE: this shows the children approach is better, transforming
-        # `categories` to a "mapping" dict is inefficient
-        codes = self._col.values.codes  # ndarray, length `self.size`
-        # categories.values is ndarray of length n_categories
-        categories = self._col.values.categories.values
-        return ordered, is_dictionary
+        categories = _PandasColumn(self._col.dtype.categories.to_series())
+        return {"is_ordered": ordered,
+                "is_dictionary": is_dictionary,
+                "mapping": categories}
 
     @property
     def describe_null(self) -> Tuple[int, Any]:
@@ -692,14 +690,6 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
 
         return buffer, dtype
 
-    def get_children(self):
-        if self.dtype[0] == _DtypeKind.CATEGORICAL:
-            if self.describe_categorical[1]:
-                # return the categories as a child Column
-                return (_PandasColumn(self._col.dtype.categories.to_series()),)
-        else:
-            return tuple()
-            
 
 class _PandasDataFrame:
     """
@@ -847,7 +837,8 @@ def test_categorical_dtype():
     assert col.null_count == 1
     assert col.describe_null == (2, -1)  # sentinel value -1
     assert col.num_chunks() == 1
-    assert col.describe_categorical == (False, True)
+    assert col.describe_categorical["is_ordered"] == False
+    assert col.describe_categorical["is_dictionary"] == True
 
     df2 = from_dataframe(df)
     assert_dataframe_equal(df.__dataframe__(), df)

From b0b2526bd0dddaa31a2bdaeb4125c15585035560 Mon Sep 17 00:00:00 2001
From: Vasily Litvinov <fam1ly.n4me@yandex.ru>
Date: Thu, 7 Jul 2022 15:07:26 +0300
Subject: [PATCH 3/4] Rename "mapping" -> "categories" in .describe_categorical

Signed-off-by: Vasily Litvinov <fam1ly.n4me@yandex.ru>
---
 protocol/dataframe_protocol.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
index bc6f2080..de9004c4 100644
--- a/protocol/dataframe_protocol.py
+++ b/protocol/dataframe_protocol.py
@@ -185,8 +185,9 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
                              semantically meaningful.
             - "is_dictionary" : bool, whether a mapping of
                                 categorical values to other objects exists
-            - "mapping" : Column representing the mapping of indices to category values.
-                          None if not a dictionary-style categorical.
+            - "categories" : Column representing the (implicit) mapping of indices to
+                             category values (e.g. an array of cat1, cat2, ...).
+                             None if not a dictionary-style categorical.
 
         TBD: are there any other in-memory representations that are needed?
         """

From 38698278d194b1d72bf2107dd954cd788778cc99 Mon Sep 17 00:00:00 2001
From: Vasily Litvinov <fam1ly.n4me@yandex.ru>
Date: Thu, 7 Jul 2022 18:09:13 +0300
Subject: [PATCH 4/4] Fix "non-categortical" typo in describe_categorical docstring

Co-authored-by: Keith Kraus <keith.j.kraus@gmail.com>
---
 protocol/dataframe_protocol.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
index de9004c4..14854133 100644
--- a/protocol/dataframe_protocol.py
+++ b/protocol/dataframe_protocol.py
@@ -175,7 +175,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
         If the dtype is categorical, there are two options:
 
         - There are only values in the data buffer.
-        - There is a separate non-categortical Column encoding categorical values.
+        - There is a separate non-categorical Column encoding categorical values.
 
         Raises RuntimeError if the dtype is not categorical