rapidsai · rapids-bot · Apr 20, 2022 · Apr 11, 2022 · Apr 11, 2022 · Apr 16, 2022
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -88,9 +88,23 @@ async def _extract_partitions(dask_obj, client=None, batch_enabled=False):
         if batch_enabled:
             persisted = client.persist(dask_obj, workers=worker_list[0])
         else:
+            # Have the first n workers persisting the n partitions
+            # Ideally, there would be as many partitions as there are workers
             persisted = [client.persist(
                 dask_obj.get_partition(p), workers=w) for p, w in enumerate(
-                    worker_list)]
+                    worker_list[:dask_obj.npartitions])]
+            # Persist empty dataframe with the remaining workers if there are
+            # less partitions than workers
+            if dask_obj.npartitions < len(worker_list):
+                # The empty df should have the same column names and dtypes as
+                # dask_obj
+                empty_df = cudf.DataFrame(columns=list(dask_obj.columns))
+                empty_df = empty_df.astype(dict(zip(
+                    dask_obj.columns, dask_obj.dtypes)))
+                for p, w in enumerate(worker_list[dask_obj.npartitions:]):
+                    empty_ddf = dask_cudf.from_cudf(empty_df, npartitions=1)
+                    persisted.append(client.persist(empty_ddf, workers=w))
+
         parts = futures_of(persisted)
     # iterable of dask collections (need to colocate them)
     elif isinstance(dask_obj, collections.abc.Sequence):

@@ -53,11 +53,11 @@ cdef renumber_helper(shuffled_vertices_t* ptr_maj_min_w, vertex_t, weights):
     # vertex_t or weight_t. Failing to do that will create am empty column of type object
     # which is not supported by '__cuda_array_interface__'
     if shuffled_major_series is None:
-        shuffled_major_series = cudf.Series(dtype=vertex_t)
+        shuffled_df['major_vertices'] = cudf.Series(dtype=vertex_t)
     else:
         shuffled_df['major_vertices']= shuffled_major_series
     if shuffled_minor_series is None:
-        shuffled_minor_series = cudf.Series(dtype=vertex_t)
+        shuffled_df['minor_vertices'] = cudf.Series(dtype=vertex_t)
     else:
         shuffled_df['minor_vertices']= shuffled_minor_series