From ec6f27d27b9e5926588c78e123c8b9e3d998fe27 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 6 Nov 2020 11:58:02 -0800 Subject: [PATCH 1/9] Add new parse_device_memory_limit util function --- dask_cuda/utils.py | 49 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index b2c7ccf40..c3cf203e7 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -8,7 +8,8 @@ import pynvml import toolz -from dask.distributed import wait +from distributed import wait +from distributed.utils import parse_bytes try: from nvtx import annotate as nvtx_annotate @@ -442,3 +443,49 @@ def cuda_visible_devices(i, visible=None): L = visible[i:] + visible[:i] return ",".join(map(str, L)) + + +def parse_device_memory_limit(device_memory_limit, device_index=0): + """Parse memory limit to be used by a CUDA device. + + + Parameters + ---------- + device_memory_limit: float, int, str or None + This can be a float (fraction of total device memory), an integer (bytes), + a string (like 5GB or 5000M), and "auto", 0 or None for the total device + size. + device_index: int + The index of device from which to obtain the total memory amount. + + Examples + -------- + >>> # On a 32GB CUDA device + >>> parse_device_memory_limit(None) + 34089730048 + >>> parse_device_memory_limit(0.8) + 27271784038 + >>> parse_device_memory_limit(1000000000) + 1000000000 + >>> parse_device_memory_limit("1GB") + 1000000000 + """ + if any(device_memory_limit == v for v in [0, None, "auto"]): + return get_device_total_memory(device_index) + elif isinstance(device_memory_limit, int): + return device_memory_limit + elif isinstance(device_memory_limit, float): + if device_memory_limit < 0.0 or device_memory_limit > 1.0: + raise ValueError( + "When `device_memory_limit` is float, it must meet the " + "`0.0 <= device_memory_limit <= 1.0` condition, where that is " + "fraction of the total GPU memory." + ) + return int(get_device_total_memory(device_index) * device_memory_limit) + elif isinstance(device_memory_limit, str): + return parse_bytes(device_memory_limit) + else: + raise ValueError( + "Invalid value for `device_memory_limit`, valid values are " + "floats, integers, strings, 0, None or 'auto'." + ) From 2e9e14c1163daf403cd9c3ef91622611f9f49deb Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 6 Nov 2020 12:13:52 -0800 Subject: [PATCH 2/9] Use parse_device_memory_limit and set default to 0.8 --- dask_cuda/cli/dask_cuda_worker.py | 15 ++++++++------- dask_cuda/cuda_worker.py | 8 ++------ dask_cuda/local_cuda_cluster.py | 31 +++++++++++++++---------------- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index 92c7ede23..bc3910e1f 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -84,13 +84,14 @@ ) @click.option( "--device-memory-limit", - default="auto", - help="Bytes of memory per CUDA device that the worker can use. " - "This can be an integer (bytes), " - "float (fraction of total device memory), " - "string (like 5GB or 5000M), " - "'auto', or zero for no memory management " - "(i.e., allow full device memory usage).", + default=0.8, + help="Specifies the size of the CUDA device LRU cache, which " + "is used to determine when the worker starts spilling to host " + "memory. 
This can be a float (fraction of total device " + "memory), an integer (bytes), a string (like 5GB or 5000M), " + "and 'auto' or 0 to disable spilling to host (i.e., allow " + "full device memory usage). Default is 0.8, 80% of the " + "worker's total device memory.", ) @click.option( "--rmm-pool-size", diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 90cf99c17..ef755a706 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -29,6 +29,7 @@ get_n_gpus, get_ucx_config, get_ucx_net_devices, + parse_device_memory_limit, ) @@ -211,12 +212,7 @@ def del_pid_file(): data=( DeviceHostFile, { - "device_memory_limit": get_device_total_memory(index=i) - if ( - device_memory_limit == "auto" - or device_memory_limit == int(0) - ) - else parse_bytes(device_memory_limit), + "device_memory_limit": parse_device_memory_limit(device_memory_limit, device_index=i), "memory_limit": memory_limit, "local_directory": local_directory, }, diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 60fbfb059..6932b90eb 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -38,18 +38,22 @@ class LocalCUDACluster(LocalCluster): Parameters ---------- - CUDA_VISIBLE_DEVICES: str - String like ``"0,1,2,3"`` or ``[0, 1, 2, 3]`` to restrict activity to - different GPUs + CUDA_VISIBLE_DEVICES: str or list + String or list ``"0,1,2,3"`` or ``[0, 1, 2, 3]`` to restrict activity to + different GPUs. + device_memory_limit: int, float or str + Specifies the size of the CUDA device LRU cache, which is used to + determine when the worker starts spilling to host memory. This can be + a float (fraction of total device memory), an integer (bytes), a string + (like 5GB or 5000M), and "auto", 0 or None to disable spilling to + host (i.e., allow full device memory usage). Default is 0.8, 80% of the + worker's total device memory. interface: str The external interface used to connect to the scheduler, usually an ethernet interface is used for connection, and not an InfiniBand interface (if one is available). threads_per_worker: int Number of threads to be used for each CUDA worker process. - CUDA_VISIBLE_DEVICES: str or list - String or list ``"0,1,2,3"`` or ``[0, 1, 2, 3]`` to restrict activity to - different GPUs. protocol: str Protocol to use for communication, e.g., "tcp" or "ucx". 
enable_tcp_over_ucx: bool @@ -117,7 +121,7 @@ def __init__( threads_per_worker=1, processes=True, memory_limit="auto", - device_memory_limit=None, + device_memory_limit=0.8, CUDA_VISIBLE_DEVICES=None, data=None, local_directory=None, @@ -147,7 +151,9 @@ def __init__( self.host_memory_limit = parse_memory_limit( memory_limit, threads_per_worker, n_workers ) - self.device_memory_limit = device_memory_limit + self.device_memory_limit = parse_device_memory_limit( + device_memory_limit, device_index=0 + ) self.rmm_pool_size = rmm_pool_size self.rmm_managed_memory = rmm_managed_memory @@ -176,11 +182,6 @@ def __init__( "Processes are necessary in order to use multiple GPUs with Dask" ) - if self.device_memory_limit is None: - self.device_memory_limit = get_device_total_memory(0) - elif isinstance(self.device_memory_limit, str): - self.device_memory_limit = parse_bytes(self.device_memory_limit) - if data is None: data = ( DeviceHostFile, @@ -265,9 +266,7 @@ def new_worker_spec(self): visible_devices = cuda_visible_devices(worker_count, self.cuda_visible_devices) spec["options"].update( { - "env": { - "CUDA_VISIBLE_DEVICES": visible_devices, - }, + "env": {"CUDA_VISIBLE_DEVICES": visible_devices,}, "plugins": { CPUAffinity(get_cpu_affinity(worker_count)), RMMSetup(self.rmm_pool_size, self.rmm_managed_memory), From 996618e408548ca81dbf9f51ec6fa27b8a17dc7f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 6 Nov 2020 12:22:08 -0800 Subject: [PATCH 3/9] Add test for parse_device_memory_limit --- dask_cuda/tests/test_utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 3700bd127..b56e1b6c1 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -13,6 +13,7 @@ get_ucx_config, get_ucx_net_devices, parse_cuda_visible_device, + parse_device_memory_limit, unpack_bitmask, ) @@ -219,3 +220,15 @@ def test_parse_visible_devices(): with pytest.raises(TypeError): parse_cuda_visible_device(None) parse_cuda_visible_device([]) + + +def test_parse_device_memory_limit(): + total = get_device_total_memory(0) + + assert parse_device_memory_limit(None) == total + assert parse_device_memory_limit(0) == total + assert parse_device_memory_limit("auto") == total + + assert parse_device_memory_limit(0.8) == int(total * 0.8) + assert parse_device_memory_limit(1000000000) == 1000000000 + assert parse_device_memory_limit("1GB") == 1000000000 From 45d85396468a7ed6602b949d89d6a6da7b633c5d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 6 Nov 2020 12:34:51 -0800 Subject: [PATCH 4/9] Fix formatting --- dask_cuda/cuda_worker.py | 5 +++-- dask_cuda/local_cuda_cluster.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index ef755a706..39c464355 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -25,7 +25,6 @@ RMMSetup, cuda_visible_devices, get_cpu_affinity, - get_device_total_memory, get_n_gpus, get_ucx_config, get_ucx_net_devices, @@ -212,7 +211,9 @@ def del_pid_file(): data=( DeviceHostFile, { - "device_memory_limit": parse_device_memory_limit(device_memory_limit, device_index=i), + "device_memory_limit": parse_device_memory_limit( + device_memory_limit, device_index=i + ), "memory_limit": memory_limit, "local_directory": local_directory, }, diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 6932b90eb..9acbc1035 100644 --- a/dask_cuda/local_cuda_cluster.py +++ 
b/dask_cuda/local_cuda_cluster.py @@ -14,10 +14,10 @@ RMMSetup, cuda_visible_devices, get_cpu_affinity, - get_device_total_memory, get_ucx_config, get_ucx_net_devices, parse_cuda_visible_device, + parse_device_memory_limit, ) From 75be026b79c00edefee870e1a027535bd366674b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 6 Nov 2020 14:52:05 -0800 Subject: [PATCH 5/9] Update test_spill with correct memory limit parsing --- dask_cuda/tests/test_spill.py | 40 +++++++++++++++++------------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index 0bef9e8d8..dea4b3d3d 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -85,23 +85,23 @@ def delayed_worker_assert(total_size, device_chunk_overhead, serialized_chunk_ov "params", [ { - "device_memory_limit": 200e6, - "memory_limit": 800e6, + "device_memory_limit": int(200e6), + "memory_limit": int(800e6), "host_target": 0.0, "host_spill": 0.0, "host_pause": None, "spills_to_disk": False, }, { - "device_memory_limit": 200e6, - "memory_limit": 200e6, + "device_memory_limit": int(200e6), + "memory_limit": int(200e6), "host_target": 0.0, "host_spill": 0.0, "host_pause": None, "spills_to_disk": True, }, { - "device_memory_limit": 200e6, + "device_memory_limit": int(200e6), "memory_limit": 0, "host_target": 0.0, "host_spill": 0.0, @@ -163,23 +163,23 @@ def test_device_spill(client, scheduler, worker): "params", [ { - "device_memory_limit": 200e6, - "memory_limit": 800e6, + "device_memory_limit": int(200e6), + "memory_limit": int(800e6), "host_target": 0.0, "host_spill": 0.0, "host_pause": None, "spills_to_disk": False, }, { - "device_memory_limit": 200e6, - "memory_limit": 200e6, + "device_memory_limit": int(200e6), + "memory_limit": int(200e6), "host_target": 0.0, "host_spill": 0.0, "host_pause": None, "spills_to_disk": True, }, { - "device_memory_limit": 200e6, + "device_memory_limit": int(200e6), "memory_limit": 0, "host_target": 0.0, "host_spill": 0.0, @@ -241,23 +241,23 @@ async def test_cupy_cluster_device_spill(params): "params", [ { - "device_memory_limit": 200e6, - "memory_limit": 800e6, + "device_memory_limit": int(200e6), + "memory_limit": int(800e6), "host_target": 0.0, "host_spill": 0.0, "host_pause": None, "spills_to_disk": False, }, { - "device_memory_limit": 200e6, - "memory_limit": 200e6, + "device_memory_limit": int(200e6), + "memory_limit": int(200e6), "host_target": 0.0, "host_spill": 0.0, "host_pause": None, "spills_to_disk": True, }, { - "device_memory_limit": 200e6, + "device_memory_limit": int(200e6), "memory_limit": 0, "host_target": 0.0, "host_spill": 0.0, @@ -330,23 +330,23 @@ def test_device_spill(client, scheduler, worker): "params", [ { - "device_memory_limit": 200e6, - "memory_limit": 800e6, + "device_memory_limit": int(200e6), + "memory_limit": int(800e6), "host_target": 0.0, "host_spill": 0.0, "host_pause": None, "spills_to_disk": False, }, { - "device_memory_limit": 200e6, - "memory_limit": 200e6, + "device_memory_limit": int(200e6), + "memory_limit": int(200e6), "host_target": 0.0, "host_spill": 0.0, "host_pause": None, "spills_to_disk": True, }, { - "device_memory_limit": 200e6, + "device_memory_limit": int(200e6), "memory_limit": 0, "host_target": 0.0, "host_spill": 0.0, From 1f61a240da98bcd6278d36a962f0091b83955614 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 6 Nov 2020 15:16:21 -0800 Subject: [PATCH 6/9] Better handling of strings in parse_device_memory_limit Based off of 
distributed.worker.parse_memory_limit --- dask_cuda/utils.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index c3cf203e7..7cd1dae56 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -2,6 +2,7 @@ import os import time import warnings +from contextlib import suppress from multiprocessing import cpu_count import numpy as np @@ -470,22 +471,15 @@ def parse_device_memory_limit(device_memory_limit, device_index=0): >>> parse_device_memory_limit("1GB") 1000000000 """ - if any(device_memory_limit == v for v in [0, None, "auto"]): + if any(device_memory_limit == v for v in [0, "0", None, "auto"]): return get_device_total_memory(device_index) - elif isinstance(device_memory_limit, int): - return device_memory_limit - elif isinstance(device_memory_limit, float): - if device_memory_limit < 0.0 or device_memory_limit > 1.0: - raise ValueError( - "When `device_memory_limit` is float, it must meet the " - "`0.0 <= device_memory_limit <= 1.0` condition, where that is " - "fraction of the total GPU memory." - ) - return int(get_device_total_memory(device_index) * device_memory_limit) - elif isinstance(device_memory_limit, str): + + with suppress(ValueError, TypeError): + device_memory_limit = float(device_memory_limit) + if isinstance(device_memory_limit, float) and device_memory_limit <= 1: + return int(get_device_total_memory(device_index) * device_memory_limit) + + if isinstance(device_memory_limit, str): return parse_bytes(device_memory_limit) else: - raise ValueError( - "Invalid value for `device_memory_limit`, valid values are " - "floats, integers, strings, 0, None or 'auto'." - ) + return int(device_memory_limit) From 1551d9577942451b26ce5b193fb89474778be16f Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 6 Nov 2020 15:17:17 -0800 Subject: [PATCH 7/9] Change --device-memory-limit default to "0.8" --- dask_cuda/cli/dask_cuda_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index bc3910e1f..65cef580c 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -84,7 +84,7 @@ ) @click.option( "--device-memory-limit", - default=0.8, + default="0.8", help="Specifies the size of the CUDA device LRU cache, which " "is used to determine when the worker starts spilling to host " "memory. This can be a float (fraction of total device " From 6e51a2a3b11518b6c44907ff57ce3984d993bf67 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 9 Nov 2020 01:26:02 -0800 Subject: [PATCH 8/9] Update spilling docs with info on new default --- docs/source/specializations.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/specializations.rst b/docs/source/specializations.rst index e044e312e..0a158718b 100644 --- a/docs/source/specializations.rst +++ b/docs/source/specializations.rst @@ -13,6 +13,8 @@ Spilling From Device For applications that do not fit in GPU memory, Dask-CUDA supports spilling from device memory to host memory when the GPU can't fit more data. The spilling mechanism is automatically triggered once the user-defined limit is reached, such limit can be set via the ``--device-memory-limit`` and ``device_memory_limit`` arguments for ``dask-cuda-worker`` and ``LocalCUDACluster``, respectively. 
+Previously, spilling was disabled by default, but since Dask-CUDA 0.17 the default has been changed to ``0.8`` -- spilling will begin when Dask-CUDA device memory utilization reaches 80% of the device's total memory. The old behavior can still be set with ``--device-memory-limit=0`` or ``device_memory_limit=0``. + CPU Affinity ------------ From 8c14e3dfab8a44ef6d64c7b1ac12f91cfcb0bcc1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 9 Nov 2020 16:09:43 +0100 Subject: [PATCH 9/9] Update docs/source/specializations.rst Co-authored-by: Benjamin Zaitlen --- docs/source/specializations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/specializations.rst b/docs/source/specializations.rst index 0a158718b..3d5bdac79 100644 --- a/docs/source/specializations.rst +++ b/docs/source/specializations.rst @@ -13,7 +13,7 @@ Spilling From Device For applications that do not fit in GPU memory, Dask-CUDA supports spilling from device memory to host memory when the GPU can't fit more data. The spilling mechanism is automatically triggered once the user-defined limit is reached, such limit can be set via the ``--device-memory-limit`` and ``device_memory_limit`` arguments for ``dask-cuda-worker`` and ``LocalCUDACluster``, respectively. -Previously, spilling was disabled by default, but since Dask-CUDA 0.17 the default has been changed to ``0.8`` -- spilling will begin when Dask-CUDA device memory utilization reaches 80% of the device's total memory. The old behavior can still be set with ``--device-memory-limit=0`` or ``device_memory_limit=0``. +Previously, spilling was disabled by default, but since Dask-CUDA 0.17 the default has been changed to ``0.8`` -- spilling will begin when Dask-CUDA device memory utilization reaches 80% of the device's total memory. Behavior can be configured with the ``--device-memory-limit`` flag. Users can disable spilling by setting ``--device-memory-limit=0`` or ``device_memory_limit=0``. CPU Affinity ------------
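Usage note: a minimal sketch of the behavior introduced by this series, assuming a Dask-CUDA build with these patches applied and a CUDA-capable GPU (exact byte values depend on the device); it is illustrative only and not part of the patches above.

    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from dask_cuda.utils import parse_device_memory_limit

    # With the new default of 0.8, each worker starts spilling to host memory
    # once device memory utilization reaches 80% of that GPU's total memory.
    cluster = LocalCUDACluster()  # equivalent to device_memory_limit=0.8
    client = Client(cluster)

    # An explicit limit can still be passed, e.g.
    # LocalCUDACluster(device_memory_limit="10GB") or, on the command line,
    # dask-cuda-worker --device-memory-limit 10GB

    # Values accepted by the new parser (0, "0", None and "auto" all map to the
    # device's total memory, i.e. full device memory usage is allowed):
    total = parse_device_memory_limit(None)
    assert parse_device_memory_limit("auto") == total
    assert parse_device_memory_limit(0.8) == int(total * 0.8)    # fraction of total memory
    assert parse_device_memory_limit("1GB") == 1000000000        # human-readable string
    assert parse_device_memory_limit(1000000000) == 1000000000   # absolute size in bytes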