rapidsai · rapids-bot · May 27, 2021 · May 24, 2021 · May 24, 2021 · May 24, 2021
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -8,6 +8,7 @@
     TYPE_CHECKING,
     Any,
     Dict,
+    List,
     Mapping,
     Optional,
     Sequence,
@@ -1526,6 +1527,28 @@ def _create_empty_categorical_column(
     )
 
 
+def _union_categoricals(
+    to_union: List[Union[cudf.Series, cudf.Index]],
+    sort_categories: bool = False,
+    ignore_order: bool = False,
+):
+    """Concatenate categorical Series/Index objects into a single Index."""
+    if ignore_order:
+        raise TypeError("ignore_order is not yet implemented")
+
+    # Concatenate the underlying columns of every input object.
+    columns = [obj._column for obj in to_union]
+    merged = CategoricalColumn._concat(columns)
+    if not sort_categories:
+        return cudf.Index(merged)
+    # Reorder the combined categories into ascending order.
+    ordered = cudf.Series(merged.categories).sort_values(
+        ascending=True, ignore_index=True
+    )
+    merged = merged.cat().reorder_categories(new_categories=ordered._column)
+    return cudf.Index(merged)
+
+
 def pandas_categorical_as_column(
     categorical: ColumnLike, codes: ColumnLike = None
 ) -> CategoricalColumn:

@@ -12,6 +12,7 @@
     is_categorical_dtype_dispatch,
     tolist_dispatch,
 )
+from dask.dataframe.multi import union_categoricals_dispatch
 from dask.dataframe.utils import (
     UNKNOWN_CATEGORIES,
     _nonempty_scalar,
@@ -242,6 +243,15 @@ def is_categorical_dtype_cudf(obj):
     return cudf.utils.dtypes.is_categorical_dtype(obj)
 
 
+@union_categoricals_dispatch.register((cudf.Series, cudf.Index))
+def union_categoricals_cudf(
+    to_union, sort_categories=False, ignore_order=False
+):
+    """Dispatch union_categoricals to cudf, forwarding the caller's flags."""
+    impl = cudf.core.column.categorical._union_categoricals
+    return impl(to_union, sort_categories, ignore_order)
+
+
 try:
 
     from dask.dataframe.utils import group_split_dispatch, hash_object_dispatch

@@ -777,3 +777,34 @@ def test_index_map_partitions():
     mins_gd = gddf.index.map_partitions(M.min, meta=gddf.index).compute()
 
     dd.assert_eq(mins_pd, mins_gd)
+
+
+def test_merging_categorical_columns():
+    """Merging categorized dask frames unions their categories."""
+    left = cudf.DataFrame(
+        {"id_1": [0, 1, 2, 3], "cat_col": ["a", "b", "f", "f"]}
+    )
+    right = cudf.DataFrame(
+        {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]}
+    )
+
+    # Split each frame over two partitions, then categorize the join key.
+    left_ddf = dd.categorical.categorize(
+        dgd.from_cudf(left, npartitions=2), columns=["cat_col"]
+    )
+    right_ddf = dd.categorical.categorize(
+        dgd.from_cudf(right, npartitions=2), columns=["cat_col"]
+    )
+
+    # Only "f" appears on both sides; the dtype carries all five categories.
+    merged_dtype = cudf.CategoricalDtype(
+        categories=["a", "b", "f", "g", "h"], ordered=False
+    )
+    expected = cudf.DataFrame(
+        {
+            "id_1": [2, 3],
+            "cat_col": cudf.Series(["f", "f"], dtype=merged_dtype),
+            "id_2": [113, 113],
+        }
+    )
+    dd.assert_eq(left_ddf.merge(right_ddf), expected)