rapidsai · rapids-bot · May 27, 2021 · May 24, 2021 · May 24, 2021 · May 24, 2021
@@ -1 +1,3 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
+
+from cudf.api import types
@@ -0,0 +1,3 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from .categoricals import _union_categoricals
@@ -0,0 +1,40 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+from typing import List, Union
+
+import cudf
+
+# Alias for the cudf containers this module works with: either a
+# ``cudf.Series`` or a ``cudf.Index`` (quoted so the names are resolved
+# lazily). NOTE(review): not referenced elsewhere in this hunk -- confirm
+# it is still needed.
+ParentType = Union["cudf.Series", "cudf.Index"]
+
+
+def _union_categoricals(
+    to_union: List[Union[cudf.Series, cudf.Index]],
+    sort_categories: bool = False,
+    ignore_order: bool = False,
+):
+    """
+    This is an internal API which combines categorical data.
+    """
+
+    if ignore_order:
+        raise TypeError("ignore_order is not yet implemented")
+
+    result_col = cudf.core.column.CategoricalColumn._concat(
+        [obj._column for obj in to_union]
+    )
+    if sort_categories:
+        sorted_categories = (
+            cudf.Series(result_col.categories)
+            .sort_values(ascending=True, ignore_index=True)
+            ._column
+        )
+        result_col = result_col.cat().reorder_categories(
+            new_categories=sorted_categories
+        )
+
+    # TODO: The return type needs to be changed
+    # to cudf.Categorical once it is implemented.
+
+    return cudf.Index(result_col)
@@ -7,6 +7,7 @@
 
 from dask.dataframe.categorical import categorical_dtype_dispatch
 from dask.dataframe.core import get_parallel_type, make_meta, meta_nonempty
+from dask.dataframe.dispatch import union_categoricals_dispatch
 from dask.dataframe.methods import (
     concat_dispatch,
     is_categorical_dtype_dispatch,
@@ -242,6 +243,15 @@ def is_categorical_dtype_cudf(obj):
     return cudf.utils.dtypes.is_categorical_dtype(obj)
 
 
+@union_categoricals_dispatch.register((cudf.Series, cudf.Index))
+def union_categoricals_cudf(
+    to_union, sort_categories=False, ignore_order=False
+):
+    return cudf.api.types._union_categoricals(
+        to_union, sort_categories=False, ignore_order=False
+    )
+
+
 try:
 
     from dask.dataframe.utils import group_split_dispatch, hash_object_dispatch

@@ -777,3 +777,34 @@ def test_index_map_partitions():
     mins_gd = gddf.index.map_partitions(M.min, meta=gddf.index).compute()
 
     dd.assert_eq(mins_pd, mins_gd)
+
+
+def test_merging_categorical_columns():
+    df_1 = cudf.DataFrame(
+        {"id_1": [0, 1, 2, 3], "cat_col": ["a", "b", "f", "f"]}
+    )
+
+    ddf_1 = dgd.from_cudf(df_1, npartitions=2)
+
+    ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"])
+
+    df_2 = cudf.DataFrame(
+        {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]}
+    )
+
+    ddf_2 = dgd.from_cudf(df_2, npartitions=2)
+
+    ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"])
+    expected = cudf.DataFrame(
+        {
+            "id_1": [2, 3],
+            "cat_col": cudf.Series(
+                ["f", "f"],
+                dtype=cudf.CategoricalDtype(
+                    categories=["a", "b", "f", "g", "h"], ordered=False
+                ),
+            ),
+            "id_2": [113, 113],
+        }
+    )
+    dd.assert_eq(ddf_1.merge(ddf_2), expected)