-
Notifications
You must be signed in to change notification settings - Fork 93
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Individual CUDA object spilling (#451)
This PR introduces a new _device host file_ that uses `ProxyObject` to implement spilling of individual CUDA objects as opposed to the current host file, which spills entire keys. - [x] Implement spilling of individual objects - [x] Handle task level aliasing - [x] Handle shared device buffers - [x] Write docs To use, set `DASK_JIT_UNSPILL=True` ## Motivation ### Aliases at the task level Consider the following two tasks: ```python def task1(): # Create list of dataframes df1 = cudf.DataFrame({"a": range(10)}) df2 = cudf.DataFrame({"a": range(10)}) return [df1, df2] def task2(dfs): # Get the second item return dfs[1] ``` Running the two tasks on a worker we get something like: ```python >>> data["k1"] = task1() >>> data["k2"] = task2(data["k1"]) >>> data { "k1": [df1, df2], "k2": df2, } ``` Since the current implementation of spilling works on keys and handles each key separately, it overestimates the device memory used: `sizeof(df)*3`. But even worse, if it decides to spill `k2` no device memory is freed since `k1` still holds a reference to `df2`! The new spilling implementation fixes this issue by wrapping identical CUDA objects in a shared `ProxyObject`; thus in this case `df2` in both `k1` and `k2` will refer to the same `ProxyObject`. ### Sharing device buffers Consider the following code snippet: ```python >>> data["df"] = cudf.DataFrame({"a": range(10)}) >>> data["grouped"] = shuffle_group(data["df"], "a", 0, 2, 2, False, 2) >>> data["v1"] = data["grouped"][0] >>> data["v2"] = data["grouped"][1] ``` In this case `v1` and `v2` are separate objects and are handled separately both in the current and the new spilling implementation. However, the `shuffle_group()` in cudf actually returns a single device memory buffer such that `v1` and `v2` point to the same underlying memory buffer. Thus the current implementation will again overestimate the memory use and spill one of the dataframes without any effect. 
The new implementation takes this into account when estimating memory usage and makes sure that either both dataframes are spilled or none of them are. cc. @beckernick, @VibhuJawa xref: dask/distributed#3756 Authors: - Mads R. B. Kristensen <[email protected]> Approvers: - Peter Andreas Entschev URL: #451
- Loading branch information
Showing
11 changed files
with
970 additions
and
143 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
from typing import Any, Set | ||
|
||
from dask.sizeof import sizeof | ||
from dask.utils import Dispatch | ||
|
||
# Type-based dispatch registry: maps an object's type to a function that
# returns the CUDA device memory objects it references.
dispatch = Dispatch(name="get_device_memory_objects")
|
||
|
||
def get_device_memory_objects(obj: Any) -> Set:
    """Return the set of CUDA device objects referenced by `obj`.

    Recursively searches through `obj` for CUDA device objects, i.e.
    objects that are either registered with `dispatch` or expose
    `__cuda_array_interface__`. The found objects must be hashable,
    since they are collected into a set.

    Parameters
    ----------
    obj: Any
        Object to search through

    Returns
    -------
    ret: set
        Set of CUDA device memory objects
    """
    found = dispatch(obj)
    return set(found)
|
||
|
||
@dispatch.register(object)
def get_device_memory_objects_default(obj):
    """Fallback handler: probe well-known attributes for device memory."""
    # Sentinel so that an attribute explicitly set to None is still seen
    # as "present" where the original semantics require it.
    missing = object()

    # Proxy objects: recurse into the wrapped object only while it is
    # unserialized (serializers is None), i.e. still resident on device.
    pxy = getattr(obj, "_obj_pxy", missing)
    if pxy is not missing:
        if pxy["serializers"] is None:
            return dispatch(pxy["obj"])
        return []

    # Objects exposing a `.data` attribute (e.g. column-like wrappers).
    data = getattr(obj, "data", missing)
    if data is not missing:
        return dispatch(data)

    # Buffer views that reference an owning allocation.
    owner = getattr(obj, "_owner", None)
    if owner is not None:
        return dispatch(owner)

    # A raw CUDA array itself.
    if hasattr(obj, "__cuda_array_interface__"):
        return [obj]
    return []
|
||
|
||
@dispatch.register(list)
@dispatch.register(tuple)
@dispatch.register(set)
@dispatch.register(frozenset)
def get_device_memory_objects_python_sequence(seq):
    """Flatten the device memory objects found in each element of `seq`."""
    return [found for item in seq for found in dispatch(item)]
|
||
|
||
@dispatch.register(dict)
def get_device_memory_objects_python_dict(seq):
    """Flatten the device memory objects found in each value of `seq`.

    Keys are ignored; only the values can hold device memory.
    """
    return [found for value in seq.values() for found in dispatch(value)]
|
||
|
||
@dispatch.register_lazy("cupy")
def get_device_memory_objects_register_cupy():
    """Register the CuPy handler lazily (only once CuPy is in use)."""
    from cupy.cuda.memory import MemoryPointer

    @dispatch.register(MemoryPointer)
    def get_device_memory_objects_cupy(obj):
        # A MemoryPointer is a view into an allocation; report the
        # owning memory object (`obj.mem`) rather than the pointer.
        return [obj.mem]
|
||
|
||
@dispatch.register_lazy("cudf")
def get_device_memory_objects_register_cudf():
    """Register cuDF handlers lazily (only once cuDF is in use)."""
    import cudf.core.dataframe
    import cudf.core.index
    import cudf.core.multiindex
    import cudf.core.series

    @dispatch.register(cudf.core.dataframe.DataFrame)
    def get_device_memory_objects_cudf_dataframe(obj):
        # A DataFrame's device memory is its index plus every column.
        found = dispatch(obj._index)
        for column in obj._data.columns:
            found = found + dispatch(column)
        return found

    @dispatch.register(cudf.core.series.Series)
    def get_device_memory_objects_cudf_series(obj):
        index_objects = dispatch(obj._index)
        column_objects = dispatch(obj._column)
        return index_objects + column_objects

    @dispatch.register(cudf.core.index.RangeIndex)
    def get_device_memory_objects_cudf_range_index(obj):
        # Avoid materializing the RangeIndex. This introduces some
        # inaccuracy in the total device memory usage, but we accept it
        # because the memory use of RangeIndexes is limited.
        return []

    @dispatch.register(cudf.core.index.Index)
    def get_device_memory_objects_cudf_index(obj):
        return dispatch(obj._values)

    @dispatch.register(cudf.core.multiindex.MultiIndex)
    def get_device_memory_objects_cudf_multiindex(obj):
        return dispatch(obj._columns)
|
||
|
||
@sizeof.register_lazy("cupy")
def register_cupy():  # NB: this overwrites dask.sizeof.register_cupy()
    """Register CuPy `sizeof` handlers that report device memory footprint."""
    import cupy.cuda.memory

    @sizeof.register(cupy.cuda.memory.BaseMemory)
    def sizeof_cupy_base_memory(mem):
        # Size of the raw device allocation, in bytes.
        return int(mem.size)

    @sizeof.register(cupy.ndarray)
    def sizeof_cupy_ndarray(arr):
        # Total bytes consumed by the array's elements.
        return int(arr.nbytes)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.