compile_udf: Cache PTX for similar functions (#7371)
Compiling a UDF generated in a loop results in a distinct compilation for each loop iteration, because each new definition of the UDF does not compare equal to any previous definition, so the `lru_cache` around `compile_udf` misses. Furthermore, each new compilation returns PTX that differs only trivially (the generated code is the same, but the function names differ), so JITify's cache also misses.

For example:

```python
import numpy as np

from cudf import Series

for data_size in range(3):
    data = Series([3] * (2 ** data_size), dtype=np.float64)
    for i in range(3):
        data.applymap(lambda x: x + 1)
```

results in nine compilations when one would have sufficed.

This commit adds an additional cache to `compile_udf`, keyed on the signature, code, and closure variables of the function, which can hit for distinct definitions of the same function. The existing `lru_cache` wrapping `compile_udf` is left in place because it can hash the function much more quickly, though I don't know whether this has a noticeable impact on performance; perhaps it would be worth removing it for simplicity, so that there is only one level of caching.
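As a minimal standalone illustration (plain Python, not part of the change) of why the two cache levels behave differently: two textually identical lambdas are distinct objects, so a cache keyed on the function itself misses, while their bytecode is identical, so a cache keyed on code, signature, and closure variables can hit.

```python
# Two textually identical lambdas are distinct function objects
f = lambda x: x + 1
g = lambda x: x + 1

# Distinct definitions never compare equal, even with identical source,
# so functools.lru_cache keyed on the function object misses:
assert f != g

# But the compiled bytecode is identical, so a key built from the code
# (plus signature and closure variables) matches:
assert f.__code__.co_code == g.__code__.co_code
```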

Authors:
  - Graham Markall (@gmarkall)

Approvers:
  - Keith Kraus (@kkraus14)
  - AJ Schmidt (@ajschmidt8)

URL: #7371
gmarkall authored Feb 26, 2021
1 parent 862559f commit 7526be7
Showing 6 changed files with 102 additions and 4 deletions.
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda10.1.yml
```diff
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
```
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda10.2.yml
```diff
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
```
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda11.0.yml
```diff
@@ -60,6 +60,7 @@ dependencies:
   - packaging
   - protobuf
   - nvtx>=0.2.1
+  - cachetools
   - pip:
       - git+https://github.com/dask/dask.git@master
       - git+https://github.com/dask/distributed.git@master
```
1 change: 1 addition & 0 deletions conda/recipes/cudf/meta.yaml
```diff
@@ -45,6 +45,7 @@ requirements:
     - fsspec>=0.6.0
     - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
     - nvtx >=0.2.1
+    - cachetools

 test:
   requires:
```
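For background, here is a short sketch of the `cachetools.LRUCache` behaviour that the new cache and the tests below rely on: a bounded, dict-like mapping with least-recently-used eviction, a `currsize` attribute, and a `clear()` method. This is illustrative only, not part of the diff.

```python
import cachetools

cache = cachetools.LRUCache(maxsize=2)
cache["a"] = 1
cache["b"] = 2
assert cache.currsize == 2

# Inserting a third item exceeds maxsize, evicting the least recently
# used entry ("a"):
cache["c"] = 3
assert cache.get("a") is None
assert cache.get("b") == 2

# The tests below empty the cache like this in setup_function:
cache.clear()
assert cache.currsize == 0
```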
66 changes: 66 additions & 0 deletions python/cudf/cudf/tests/test_compile_udf.py
```diff
@@ -0,0 +1,66 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from cudf.utils import cudautils
+from numba import types
+
+
+def setup_function():
+    cudautils._udf_code_cache.clear()
+
+
+def assert_cache_size(size):
+    assert cudautils._udf_code_cache.currsize == size
+
+
+def test_first_compile_sets_cache_entry():
+    # The first compilation should put an entry in the cache
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+
+def test_code_cache_same_code_different_function_hit():
+    # Compilation of a distinct function with the same code and signature
+    # should reuse the cached entry
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+
+def test_code_cache_different_types_miss():
+    # Compilation of a distinct function with the same code but different
+    # types should create an additional cache entry
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(lambda x: x + 1, (types.float64,))
+    assert_cache_size(2)
+
+
+def test_code_cache_different_cvars_miss():
+    # Compilation of a distinct function with the same types and code as an
+    # existing entry but different closure variables should create an
+    # additional cache entry
+
+    def gen_closure(y):
+        return lambda x: x + y
+
+    cudautils.compile_udf(gen_closure(1), (types.float32,))
+    assert_cache_size(1)
+
+    cudautils.compile_udf(gen_closure(2), (types.float32,))
+    assert_cache_size(2)
+
+
+def test_lambda_in_loop_code_cached():
+    # Compiling a UDF defined in a loop should result in the code cache being
+    # reused for each loop iteration after the first. We check for this by
+    # ensuring that there is only one entry in the code cache after the loop.
+
+    for i in range(3):
+        cudautils.compile_udf(lambda x: x + 1, (types.float32,))
+
+    assert_cache_size(1)
```
36 changes: 32 additions & 4 deletions python/cudf/cudf/utils/cudautils.py
```diff
@@ -1,9 +1,9 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
 from functools import lru_cache
+from pickle import dumps

+import cachetools
 import cupy
 import numpy as np
 from numba import cuda

 import cudf
 from cudf.utils.utils import check_equals_float, check_equals_int
```
```diff
@@ -235,7 +235,13 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset):
     return window_sizes


+# This cache is keyed on the (signature, code, closure variables) of UDFs, so
+# it can hit for distinct functions that are similar. The lru_cache wrapping
+# compile_udf misses for these similar functions, but doesn't need to serialize
+# closure variables to check for a hit.
+_udf_code_cache = cachetools.LRUCache(maxsize=32)
+
+
 @lru_cache(maxsize=32)
 def compile_udf(udf, type_signature):
     """Compile ``udf`` with `numba`
```
```diff
@@ -266,8 +272,30 @@ def compile_udf(udf, type_signature):
         An numpy type
     """

+    # Check if we've already compiled a similar (but possibly distinct)
+    # function before
+    codebytes = udf.__code__.co_code
+    if udf.__closure__ is not None:
+        cvars = tuple([x.cell_contents for x in udf.__closure__])
+        cvarbytes = dumps(cvars)
+    else:
+        cvarbytes = b""
+
+    key = (type_signature, codebytes, cvarbytes)
+    res = _udf_code_cache.get(key)
+    if res:
+        return res
+
+    # We haven't compiled a function like this before, so need to fall back to
+    # compilation with Numba
     ptx_code, return_type = cuda.compile_ptx_for_current_device(
         udf, type_signature, device=True
     )
     output_type = numpy_support.as_dtype(return_type)
-    return (ptx_code, output_type.type)
+
+    # Populate the cache for this function
+    res = (ptx_code, output_type.type)
+    _udf_code_cache[key] = res
+
+    return res
```
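To make the keying behaviour concrete, here is a standalone sketch that mirrors the key construction above (the `make_key` helper and the string signature are illustrative stand-ins, not part of the commit):

```python
from pickle import dumps


def make_key(udf, type_signature):
    # Mirror of the (signature, code, closure variables) key built in
    # compile_udf above
    codebytes = udf.__code__.co_code
    if udf.__closure__ is not None:
        cvarbytes = dumps(tuple(c.cell_contents for c in udf.__closure__))
    else:
        cvarbytes = b""
    return (type_signature, codebytes, cvarbytes)


def gen_closure(y):
    return lambda x: x + y


sig = ("float32",)  # illustrative stand-in for a Numba type signature

# Same code and closure values produce equal keys, so a second
# compilation of a distinct-but-identical function hits the cache:
assert make_key(gen_closure(1), sig) == make_key(gen_closure(1), sig)

# Different closure values produce different keys, so the cache misses
# (cf. test_code_cache_different_cvars_miss):
assert make_key(gen_closure(1), sig) != make_key(gen_closure(2), sig)
```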
