From d78060668a98b4b8987db2ce97fe398ffffbdff8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 23 Jun 2022 06:52:35 -0700 Subject: [PATCH 001/124] initial commit --- python/cudf/cudf/_lib/column.pyx | 12 +++++++++--- python/cudf/cudf/_lib/transpose.pyx | 4 +++- python/cudf/cudf/_lib/unary.pyx | 2 +- python/cudf/cudf/_lib/utils.pyx | 5 ++++- python/cudf/cudf/core/column/column.py | 7 ++++++- python/cudf/cudf/core/column_accessor.py | 7 ++++++- python/cudf/cudf/core/dataframe.py | 8 +++++--- python/cudf/cudf/core/series.py | 7 ++++++- python/cudf/cudf/tests/test_dataframe.py | 18 +++++++++--------- 9 files changed, 49 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 1f9b4c1596a..2280f942d1a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -490,6 +490,7 @@ cdef class Column: ``Buffer`` from the ``owner`` ``cudf.Column``. If ``owner`` is ``None``, we allocate new memory for the resulting ``cudf.Column``. 
""" + print("493") column_owner = isinstance(owner, Column) mask_owner = owner if column_owner and is_categorical_dtype(owner.dtype): @@ -522,13 +523,17 @@ cdef class Column: data = Buffer( rmm.DeviceBuffer(ptr=data_ptr, size=0) ) - + print("526") mask_ptr = (cv.null_mask()) + print("528") mask = None if mask_ptr: + print("531") if column_owner: + print("533") mask_owner = mask_owner.base_mask if mask_owner is None: + print("536", mask_ptr) mask = Buffer( rmm.DeviceBuffer( ptr=mask_ptr, @@ -536,12 +541,13 @@ cdef class Column: ) ) else: + print("544") mask = Buffer( data=mask_ptr, size=bitmask_allocation_size_bytes(base_size), owner=mask_owner ) - + print("545") if cv.has_nulls(): null_count = cv.null_count() else: @@ -559,7 +565,7 @@ cdef class Column: ) ) children = tuple(children) - + print("563") result = cudf.core.column.build_column( data=data, dtype=dtype, diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index b9eea6169bd..53107552997 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -15,12 +15,14 @@ def transpose(list source_columns): """Transpose m n-row columns into n m-row columns """ cdef pair[unique_ptr[column], table_view] c_result + print("18") cdef table_view c_input = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_transpose(c_input)) - + print("23") result_owner = Column.from_unique_ptr(move(c_result.first)) + print("25") return columns_from_table_view( c_result.second, owners=[result_owner] * c_result.second.num_columns() diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx index acca61cf9d1..cd89595b002 100644 --- a/python/cudf/cudf/_lib/unary.pyx +++ b/python/cudf/cudf/_lib/unary.pyx @@ -12,7 +12,7 @@ import numpy as np from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view +from 
cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 643a1adca9f..05a4f708511 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -7,6 +7,7 @@ import cudf from cython.operator cimport dereference from libc.stdint cimport uint8_t +from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -42,6 +43,8 @@ cdef table_view table_view_from_columns(columns) except*: cdef Column col for col in columns: + mask_ptr = (col.view().null_mask()) + print("45", mask_ptr) column_views.push_back(col.view()) return table_view(column_views) @@ -323,7 +326,7 @@ cdef columns_from_table_view( in the table view is ``owners[i]``. For more about memory ownership, see ``Column.from_column_view``. """ - + print("326") return [ Column.from_column_view( tv.column(i), owners[i] if isinstance(owners, list) else None diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ffd17cb7d31..7df25110b5b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -7,6 +7,7 @@ from functools import cached_property from itertools import chain from types import SimpleNamespace +import weakref from typing import ( Any, Dict, @@ -358,6 +359,10 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view + def custom_deep_copy(self: T)-> T: + result = libcudf.copying.copy_column(self) + return cast(T, result._with_type_metadata(self.dtype)) + def copy(self: T, deep: bool = True) -> T: """Columns are immutable, so a deep copy produces a copy of the underlying data and mask and a shallow copy creates a new column and @@ -483,7 +488,7 @@ def __setitem__(self, key: Any, value: Any): If ``value`` and ``self`` are of different types, 
``value`` is coerced to ``self.dtype``. Assumes ``self`` and ``value`` are index-aligned. """ - + raise TypeError("hi") # Normalize value to scalar/column value_normalized = ( cudf.Scalar(value, dtype=self.dtype) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 34236a8c09e..7aeafccad6a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,6 +3,7 @@ from __future__ import annotations import itertools +import weakref from collections import abc from functools import cached_property, reduce from typing import ( @@ -132,6 +133,7 @@ def __init__( self.multiindex = multiindex self._level_names = level_names + self._weak_ref = {} @classmethod def _create_unsafe( @@ -308,8 +310,11 @@ def copy(self, deep=False) -> ColumnAccessor: Make a copy of this ColumnAccessor. """ if deep: + # import pdb;pdb.set_trace() + self._weak_ref = {k: weakref.ref(v) for k, v in self._data.items()} return self.__class__( - {k: v.copy(deep=True) for k, v in self._data.items()}, + # {k: v.copy(deep=True) for k, v in self._data.items()}, + self._data.copy(), multiindex=self.multiindex, level_names=self.level_names, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index dd5e47a0733..ff7c6900548 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3367,6 +3367,7 @@ def transpose(self): # No column from index is transposed with libcudf. 
source_columns = [*self._columns] + # import pdb;pdb.set_trace() source_dtype = source_columns[0].dtype if is_categorical_dtype(source_dtype): if any(not is_categorical_dtype(c.dtype) for c in source_columns): @@ -3380,7 +3381,7 @@ def transpose(self): if any(c.dtype != source_columns[0].dtype for c in source_columns): raise ValueError("Columns must all have the same dtype") - + # import pdb;pdb.set_trace() result_columns = libcudf.transpose.transpose(source_columns) if is_categorical_dtype(source_dtype): @@ -4637,9 +4638,10 @@ def from_pandas(cls, dataframe, nan_as_null=None): # Set index index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) - result = df.set_index(index) + df._index = index + #set_index(index, inplace=True) - return result + return df @classmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0cb63fb99a4..dc5ecbfc418 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,6 +9,7 @@ from collections import abc from shutil import get_terminal_size from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union +import weakref import cupy import numpy as np @@ -144,7 +145,11 @@ def __setitem__(self, key, value): self._frame._column.astype(to_dtype), inplace=True ) - self._frame._column[key] = value + if weakref.getweakrefcount(self._frame._column) == 0: + self._frame._column[key] = value + else: + self._frame._column = self._frame._column.custom_deep_copy() + self._frame._column[key] = value class _SeriesLocIndexer(_FrameIndexer): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 06e054a9e85..de2a02665d5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4555,15 +4555,15 @@ def test_df_constructor_dtype(dtype): @pytest.mark.parametrize( "data", [ - cudf.datasets.randomdata( - nrows=10, dtypes={"a": "category", "b": int, "c": 
float, "d": int} - ), - cudf.datasets.randomdata( - nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": str} - ), - cudf.datasets.randomdata( - nrows=10, dtypes={"a": bool, "b": int, "c": float, "d": str} - ), + # cudf.datasets.randomdata( + # nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": int} + # ), + # cudf.datasets.randomdata( + # nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": str} + # ), + # cudf.datasets.randomdata( + # nrows=10, dtypes={"a": bool, "b": int, "c": float, "d": str} + # ), cudf.DataFrame(), cudf.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), cudf.DataFrame( From 9baaa0879c8bb1ca76bc1ca2193072b995bec570 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 23 Jun 2022 06:55:11 -0700 Subject: [PATCH 002/124] initial commit --- python/cudf/cudf/_lib/utils.pyx | 3 +-- python/cudf/cudf/core/column/column.py | 6 +++--- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/series.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 05a4f708511..a0fda12a0bf 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -6,8 +6,7 @@ import pyarrow as pa import cudf from cython.operator cimport dereference -from libc.stdint cimport uint8_t -from libc.stdint cimport uintptr_t +from libc.stdint cimport uint8_t, uintptr_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7df25110b5b..d0792a65c7e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,10 +4,10 @@ import pickle import warnings +import weakref from functools import cached_property from itertools import chain from types import SimpleNamespace -import weakref from typing import ( Any, Dict, @@ -359,10 +359,10 @@ def nullmask(self) -> 
Buffer: raise ValueError("Column has no null mask") return self.mask_array_view - def custom_deep_copy(self: T)-> T: + def custom_deep_copy(self: T) -> T: result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) - + def copy(self: T, deep: bool = True) -> T: """Columns are immutable, so a deep copy produces a copy of the underlying data and mask and a shallow copy creates a new column and diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ff7c6900548..3ec1816be28 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4639,7 +4639,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): # Set index index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) df._index = index - #set_index(index, inplace=True) + # set_index(index, inplace=True) return df diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index dc5ecbfc418..38b21021e81 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6,10 +6,10 @@ import inspect import pickle import warnings +import weakref from collections import abc from shutil import get_terminal_size from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union -import weakref import cupy import numpy as np From 98d3cae98c8e876de3ad04c3899032cf221b8eec Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 29 Jun 2022 13:54:52 -0700 Subject: [PATCH 003/124] fix --- python/cudf/cudf/_lib/column.pyx | 13 ++++--------- python/cudf/cudf/_lib/transpose.pyx | 3 --- python/cudf/cudf/_lib/utils.pyx | 3 --- python/cudf/cudf/core/column/column.py | 1 - python/cudf/cudf/core/frame.py | 4 ++++ 5 files changed, 8 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 2280f942d1a..fda354928eb 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -490,7 +490,6 
@@ cdef class Column: ``Buffer`` from the ``owner`` ``cudf.Column``. If ``owner`` is ``None``, we allocate new memory for the resulting ``cudf.Column``. """ - print("493") column_owner = isinstance(owner, Column) mask_owner = owner if column_owner and is_categorical_dtype(owner.dtype): @@ -523,17 +522,14 @@ cdef class Column: data = Buffer( rmm.DeviceBuffer(ptr=data_ptr, size=0) ) - print("526") + mask_ptr = (cv.null_mask()) - print("528") + mask = None if mask_ptr: - print("531") if column_owner: - print("533") mask_owner = mask_owner.base_mask if mask_owner is None: - print("536", mask_ptr) mask = Buffer( rmm.DeviceBuffer( ptr=mask_ptr, @@ -541,13 +537,12 @@ cdef class Column: ) ) else: - print("544") mask = Buffer( data=mask_ptr, size=bitmask_allocation_size_bytes(base_size), owner=mask_owner ) - print("545") + if cv.has_nulls(): null_count = cv.null_count() else: @@ -565,7 +560,7 @@ cdef class Column: ) ) children = tuple(children) - print("563") + result = cudf.core.column.build_column( data=data, dtype=dtype, diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 53107552997..5bfaf1da815 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -15,14 +15,11 @@ def transpose(list source_columns): """Transpose m n-row columns into n m-row columns """ cdef pair[unique_ptr[column], table_view] c_result - print("18") cdef table_view c_input = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_transpose(c_input)) - print("23") result_owner = Column.from_unique_ptr(move(c_result.first)) - print("25") return columns_from_table_view( c_result.second, owners=[result_owner] * c_result.second.num_columns() diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index a0fda12a0bf..9b5226052d6 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -42,8 +42,6 @@ cdef table_view table_view_from_columns(columns) except*: cdef 
Column col for col in columns: - mask_ptr = (col.view().null_mask()) - print("45", mask_ptr) column_views.push_back(col.view()) return table_view(column_views) @@ -325,7 +323,6 @@ cdef columns_from_table_view( in the table view is ``owners[i]``. For more about memory ownership, see ``Column.from_column_view``. """ - print("326") return [ Column.from_column_view( tv.column(i), owners[i] if isinstance(owners, list) else None diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5bf730794b5..045d04910e0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -486,7 +486,6 @@ def __setitem__(self, key: Any, value: Any): If ``value`` and ``self`` are of different types, ``value`` is coerced to ``self.dtype``. Assumes ``self`` and ``value`` are index-aligned. """ - raise TypeError("hi") # Normalize value to scalar/column value_normalized = ( cudf.Scalar(value, dtype=self.dtype) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index db5f1f77f4b..7d1eb709660 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -7,6 +7,7 @@ import pickle import warnings from collections import abc +import weakref from typing import ( Any, Callable, @@ -157,6 +158,9 @@ def _mimic_inplace( if inplace: for col in self._data: if col in result._data: + if weakref.getweakrefcount(self._data[col]) > 0: + self._data[col] = self._data[col].custom_deep_copy() + self._data[col]._mimic_inplace( result._data[col], inplace=True ) From 5bd9dfa5b907fa5ebaeeecc8709d1daf84a0f225 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 29 Jun 2022 14:51:28 -0700 Subject: [PATCH 004/124] import --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7d1eb709660..b276d4fc1c8 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py 
@@ -6,8 +6,8 @@ import operator import pickle import warnings -from collections import abc import weakref +from collections import abc from typing import ( Any, Callable, From 6a941a45eee4433c08331fe12417dbad26a4da1f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 1 Jul 2022 11:20:30 -0700 Subject: [PATCH 005/124] fix --- python/cudf/cudf/core/column/column.py | 1 + python/cudf/cudf/core/frame.py | 1 + python/cudf/cudf/core/series.py | 1 + 3 files changed, 3 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 045d04910e0..00c01f1fd25 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -367,6 +367,7 @@ def copy(self: T, deep: bool = True) -> T: copies the references of the data and mask. """ if deep: + import pdb;pdb.set_trace() result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) else: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b276d4fc1c8..a3508bfd365 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -160,6 +160,7 @@ def _mimic_inplace( if col in result._data: if weakref.getweakrefcount(self._data[col]) > 0: self._data[col] = self._data[col].custom_deep_copy() + self._weak_ref[col] = weakref.ref(self._data[col]) self._data[col]._mimic_inplace( result._data[col], inplace=True diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 38b21021e81..de2cf6b90f3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -149,6 +149,7 @@ def __setitem__(self, key, value): self._frame._column[key] = value else: self._frame._column = self._frame._column.custom_deep_copy() + self._frame._data._weak_ref[self._frame.name] = weakref.ref(self._frame._column) self._frame._column[key] = value From 98426c3a4eac6c98fe63bc148fb56fb4c7df75ec Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 
Jul 2022 10:07:20 -0700 Subject: [PATCH 006/124] fix --- python/cudf/cudf/core/column/column.py | 1 - python/cudf/cudf/core/column_accessor.py | 1 + python/cudf/cudf/core/frame.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 00c01f1fd25..045d04910e0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -367,7 +367,6 @@ def copy(self: T, deep: bool = True) -> T: copies the references of the data and mask. """ if deep: - import pdb;pdb.set_trace() result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) else: diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 7aeafccad6a..9d988dda8dd 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -113,6 +113,7 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names + self._weak_ref = data._weak_ref else: # This code path is performance-critical for copies and should be # modified with care. 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2e30ce778a8..9fb7d992f59 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -159,7 +159,7 @@ def _mimic_inplace( if col in result._data: if weakref.getweakrefcount(self._data[col]) > 0: self._data[col] = self._data[col].custom_deep_copy() - self._weak_ref[col] = weakref.ref(self._data[col]) + self._data._weak_ref[col] = weakref.ref(self._data[col]) self._data[col]._mimic_inplace( result._data[col], inplace=True From 827f52aea6d1b2d6765fd21359d4a1f07c553508 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 7 Jul 2022 08:13:45 -0700 Subject: [PATCH 007/124] push down to column --- python/cudf/cudf/_lib/column.pyx | 24 ++++++++++++++------- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 21 ++++++++++++++---- python/cudf/cudf/core/column/struct.py | 1 + python/cudf/cudf/core/column_accessor.py | 10 ++++----- python/cudf/cudf/core/dataframe.py | 7 ++++-- python/cudf/cudf/core/frame.py | 15 ++++++++----- python/cudf/cudf/core/series.py | 7 +++--- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_index.py | 3 ++- python/cudf/cudf/tests/test_parquet.py | 2 +- 11 files changed, 63 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index fda354928eb..559f2047d3f 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -11,6 +11,7 @@ import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype, is_list_dtype, is_struct_dtype from cudf.core.buffer import Buffer +import weakref from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t from libcpp cimport bool @@ -70,7 +71,8 @@ cdef class Column: object mask=None, int offset=0, object null_count=None, - object children=() + object children=(), + object weak_ref=None, ): self._size = size @@ -81,6 +83,7 @@ 
cdef class Column: self.set_base_children(children) self.set_base_data(data) self.set_base_mask(mask) + self._weak_ref = weak_ref @property def base_size(self): @@ -319,7 +322,7 @@ cdef class Column: self._children = None self._base_children = value - def _mimic_inplace(self, other_col, inplace=False): + def _temp_mimic_inplace(self, other_col, inplace=False): """ Given another column, update the attributes of this column to mimic an inplace operation. This does not modify the memory of Buffers, but @@ -327,12 +330,17 @@ cdef class Column: object with the Buffers and attributes from the other column. """ if inplace: - self._offset = other_col.offset - self._size = other_col.size - self._dtype = other_col._dtype - self.set_base_data(other_col.base_data) - self.set_base_children(other_col.base_children) - self.set_base_mask(other_col.base_mask) + if weakref.getweakrefcount(other_col) > 0: + new_col = other_col.custom_deep_copy() + else: + new_col = other_col + + self._offset = new_col.offset + self._size = new_col.size + self._dtype = new_col._dtype + self.set_base_data(new_col.base_data) + self.set_base_children(new_col.base_children) + self.set_base_mask(new_col.base_mask) else: return other_col diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c04e2e45461..af3ce26ebeb 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1312,7 +1312,7 @@ def memory_usage(self) -> int: def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False ) -> Optional[ColumnBase]: - out = super()._mimic_inplace(other_col, inplace=inplace) + out = super()._temp_mimic_inplace(other_col, inplace=inplace) if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes return out diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 045d04910e0..298d77e4b3f 100644 --- 
a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -329,7 +329,7 @@ def _fill( return libcudf.filling.fill(self, begin, end, slr.device_value) if is_string_dtype(self.dtype): - return self._mimic_inplace( + return self._temp_mimic_inplace( libcudf.filling.fill(self, begin, end, slr.device_value), inplace=True, ) @@ -367,8 +367,21 @@ def copy(self: T, deep: bool = True) -> T: copies the references of the data and mask. """ if deep: - result = libcudf.copying.copy_column(self) - return cast(T, result._with_type_metadata(self.dtype)) + copied_col = cast( + T, + build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ), + ) + # result = libcudf.copying.copy_column(self) + # return cast(T, result._with_type_metadata(self.dtype)) + copied_col._weak_ref = weakref.ref(self) + return copied_col else: return cast( T, @@ -503,7 +516,7 @@ def __setitem__(self, key: Any, value: Any): out = self._scatter_by_column(key, value_normalized) if out: - self._mimic_inplace(out, inplace=True) + self._temp_mimic_inplace(out, inplace=True) def _wrap_binop_normalization(self, other): if other is NA or other is None: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index e34cc0cb9d9..c1b02182c17 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -105,6 +105,7 @@ def _rename_fields(self, names): offset=self.offset, null_count=self.null_count, children=self.base_children, + weak_ref=self._weak_ref, ) @property diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 9d988dda8dd..ff6531c5e5a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -113,7 +113,7 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names - self._weak_ref = 
data._weak_ref + # self._weak_ref = data._weak_ref else: # This code path is performance-critical for copies and should be # modified with care. @@ -134,7 +134,7 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._weak_ref = {} + # self._weak_ref = {} @classmethod def _create_unsafe( @@ -312,10 +312,10 @@ def copy(self, deep=False) -> ColumnAccessor: """ if deep: # import pdb;pdb.set_trace() - self._weak_ref = {k: weakref.ref(v) for k, v in self._data.items()} + # self._weaks_ref = {k: weakref.ref(v) for k, v in self._data.items()} return self.__class__( - # {k: v.copy(deep=True) for k, v in self._data.items()}, - self._data.copy(), + {k: v.copy(deep=True) for k, v in self._data.items()}, + # self._data.copy(), multiindex=self.multiindex, level_names=self.level_names, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 71e31f23a78..3b1836deab2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3137,7 +3137,7 @@ def rename( columns = ( mapper if columns is None and axis in (1, "columns") else columns ) - + # import pdb;pdb.set_trace() if index: if ( any(type(item) == str for item in index.values()) @@ -3155,6 +3155,9 @@ def rename( value=list(index.values()), inplace=True, ) + # for t_col in out_index_frame._data.names: + # out_index._data[t_col] = out_index_frame._data[t_col] + # out_index._data[level] = out_index_frame._data[level] out = DataFrame(index=out_index) else: to_replace = list(index.keys()) @@ -6128,7 +6131,7 @@ def to_struct(self, name=None): field_names = [str(name) for name in self._data.names] col = cudf.core.column.build_struct_column( - names=field_names, children=self._data.columns, size=len(self) + names=field_names, children=tuple([col.custom_deep_copy() for col in self._data.columns]), size=len(self) ) return cudf.Series._from_data( cudf.core.column_accessor.ColumnAccessor( diff --git a/python/cudf/cudf/core/frame.py 
b/python/cudf/cudf/core/frame.py index 9fb7d992f59..4c52408b7da 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -157,11 +157,16 @@ def _mimic_inplace( if inplace: for col in self._data: if col in result._data: - if weakref.getweakrefcount(self._data[col]) > 0: - self._data[col] = self._data[col].custom_deep_copy() - self._data._weak_ref[col] = weakref.ref(self._data[col]) - - self._data[col]._mimic_inplace( + # self._data.set_by_label(col, result._data[col], validate=False) + # self._data[col] = self._data[col].custom_deep_copy() + # self._data[col] = result._data[col] + # if weakref.getweakrefcount(self._data[col]) > 0: + # self._data[col] = self._data[col].custom_deep_copy() + # self._data._weak_ref[col] = weakref.ref( + # self._data[col] + # ) + + self._data[col]._temp_mimic_inplace( result._data[col], inplace=True ) self._data = result._data diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e52a54655b3..fc00af90cdc 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -84,7 +84,7 @@ def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): val_col = as_column(value, dtype=to_type) old_col = col.astype(to_type) - col._mimic_inplace(concat_columns([old_col, val_col]), inplace=True) + col._temp_mimic_inplace(concat_columns([old_col, val_col]), inplace=True) class _SeriesIlocIndexer(_FrameIndexer): @@ -141,15 +141,16 @@ def __setitem__(self, key, value): value.dtype, self._frame._column.dtype ) value = value.astype(to_dtype) - self._frame._column._mimic_inplace( + self._frame._column._temp_mimic_inplace( self._frame._column.astype(to_dtype), inplace=True ) if weakref.getweakrefcount(self._frame._column) == 0: self._frame._column[key] = value else: + prev_col = self._frame._column self._frame._column = self._frame._column.custom_deep_copy() - self._frame._data._weak_ref[self._frame.name] = weakref.ref(self._frame._column) + self._frame._column._weak_ref = 
weakref.ref(prev_col) self._frame._column[key] = value diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9145c72e389..5c5ead57d23 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8560,7 +8560,7 @@ def test_rename_for_level_MultiIndex_dataframe(data, index, level): expect = pdf.rename(index=index, level=level) got = gdf.rename(index=index, level=level) - + # import pdb;pdb.set_trace() assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index d81a9f30cfa..8ff17ac648d 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -255,7 +255,8 @@ def test_index_rename_inplace(): # inplace=False should yield a deep copy gds_renamed_deep = gds.rename("new_name", inplace=False) - assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr + # import pdb;pdb.set_trace() + # assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr # inplace=True returns none expected_ptr = gds._values.data_ptr diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index dff871276a8..81f7feeb037 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -438,7 +438,7 @@ def num_row_groups(rows, group_size): fname = tmpdir.join("metadata.parquet") row_group_size = 5 pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) - + import pdb;pdb.set_trace() num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname) assert num_rows == len(pdf.index) From 5a81c2df667dc57d6a42e4e3ebcc59796e73edfc Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 Jul 2022 08:42:44 -0700 Subject: [PATCH 008/124] cleanup --- python/cudf/cudf/_lib/unary.pyx | 2 +- python/cudf/cudf/tests/test_parquet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx index cd89595b002..6d6f4b5bf3f 100644 --- a/python/cudf/cudf/_lib/unary.pyx +++ b/python/cudf/cudf/_lib/unary.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from enum import IntEnum diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 81f7feeb037..dff871276a8 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -438,7 +438,7 @@ def num_row_groups(rows, group_size): fname = tmpdir.join("metadata.parquet") row_group_size = 5 pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) - import pdb;pdb.set_trace() + num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname) assert num_rows == len(pdf.index) From c10fb77788cd98432e39f5a3df52e5d458ee7d0b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 Jul 2022 08:54:44 -0700 Subject: [PATCH 009/124] cleanup --- python/cudf/cudf/tests/test_dataframe.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5c5ead57d23..c9b8d9c0906 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4597,15 +4597,15 @@ def test_df_constructor_dtype(dtype): @pytest.mark.parametrize( "data", [ - # cudf.datasets.randomdata( - # nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": int} - # ), - # cudf.datasets.randomdata( - # nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": str} - # ), - # cudf.datasets.randomdata( - # nrows=10, dtypes={"a": bool, "b": int, "c": float, "d": str} - # ), + cudf.datasets.randomdata( + nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": int} + ), + cudf.datasets.randomdata( + nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": str} + ), + 
cudf.datasets.randomdata( + nrows=10, dtypes={"a": bool, "b": int, "c": float, "d": str} + ), cudf.DataFrame(), cudf.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), cudf.DataFrame( From f6d1003c157fe43ed67f93c84974344e889aed84 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 Jul 2022 10:27:13 -0700 Subject: [PATCH 010/124] changes --- python/cudf/cudf/_lib/column.pyx | 1 + python/cudf/cudf/core/column/column.py | 8 +++++++- python/cudf/cudf/core/series.py | 15 ++++++++------- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 559f2047d3f..b7d27fda86b 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -215,6 +215,7 @@ cdef class Column: # `self.memory_usage` was never called before, So ignore. pass self._null_count = None + self._weak_ref = None def set_mask(self, value): """ diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 298d77e4b3f..e1209c675e3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -537,11 +537,17 @@ def _scatter_by_slice( num_keys = (stop - start) // step self._check_scatter_key_length(num_keys, value) - + # import pdb;pdb.set_trace() if step == 1: if isinstance(value, cudf.core.scalar.Scalar): return self._fill(value, start, stop, inplace=True) else: + # import pdb;pdb.set_trace() + # if weakref.getweakrefcount(self) == 0: + # pass + # else: + # true_deep_copied_col = self.custom_deep_copy() + # self._temp_mimic_inplace(true_deep_copied_col, inplace=True) return libcudf.copying.copy_range( value, self, 0, num_keys, start, stop, False ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fc00af90cdc..6874edb975b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -145,13 +145,14 @@ def __setitem__(self, key, value): self._frame._column.astype(to_dtype), 
inplace=True ) - if weakref.getweakrefcount(self._frame._column) == 0: - self._frame._column[key] = value - else: - prev_col = self._frame._column - self._frame._column = self._frame._column.custom_deep_copy() - self._frame._column._weak_ref = weakref.ref(prev_col) - self._frame._column[key] = value + # if weakref.getweakrefcount(self._frame._column) == 0: + # import pdb;pdb.set_trace() + self._frame._column[key] = value + # else: + # prev_col = self._frame._column + # self._frame._column = self._frame._column.custom_deep_copy() + # self._frame._column._weak_ref = weakref.ref(prev_col) + # self._frame._column[key] = value class _SeriesLocIndexer(_FrameIndexer): From 212af2e8440b9ce79bbb67f8cc45a8e549a9bad3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 Jul 2022 10:45:13 -0700 Subject: [PATCH 011/124] cleanup --- python/cudf/cudf/_lib/column.pyx | 4 ++-- python/cudf/cudf/_lib/transpose.pyx | 1 + python/cudf/cudf/_lib/utils.pyx | 1 - python/cudf/cudf/core/column/column.py | 9 ++------- python/cudf/cudf/core/column_accessor.py | 6 ------ python/cudf/cudf/core/dataframe.py | 15 ++++++++------- python/cudf/cudf/core/frame.py | 10 ---------- python/cudf/cudf/core/series.py | 8 -------- 8 files changed, 13 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index b7d27fda86b..8c376aa9587 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+import weakref + import cupy as cp import numpy as np import pandas as pd @@ -11,7 +13,6 @@ import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype, is_list_dtype, is_struct_dtype from cudf.core.buffer import Buffer -import weakref from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t from libcpp cimport bool @@ -533,7 +534,6 @@ cdef class Column: ) mask_ptr = (cv.null_mask()) - mask = None if mask_ptr: if column_owner: diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 5bfaf1da815..b9eea6169bd 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -19,6 +19,7 @@ def transpose(list source_columns): with nogil: c_result = move(cpp_transpose(c_input)) + result_owner = Column.from_unique_ptr(move(c_result.first)) return columns_from_table_view( c_result.second, diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 9b5226052d6..2fe3af3ae8a 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -6,7 +6,6 @@ import pyarrow as pa import cudf from cython.operator cimport dereference -from libc.stdint cimport uint8_t, uintptr_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e1209c675e3..f8372bec0ad 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -499,6 +499,7 @@ def __setitem__(self, key: Any, value: Any): If ``value`` and ``self`` are of different types, ``value`` is coerced to ``self.dtype``. Assumes ``self`` and ``value`` are index-aligned. 
""" + # Normalize value to scalar/column value_normalized = ( cudf.Scalar(value, dtype=self.dtype) @@ -537,17 +538,11 @@ def _scatter_by_slice( num_keys = (stop - start) // step self._check_scatter_key_length(num_keys, value) - # import pdb;pdb.set_trace() + if step == 1: if isinstance(value, cudf.core.scalar.Scalar): return self._fill(value, start, stop, inplace=True) else: - # import pdb;pdb.set_trace() - # if weakref.getweakrefcount(self) == 0: - # pass - # else: - # true_deep_copied_col = self.custom_deep_copy() - # self._temp_mimic_inplace(true_deep_copied_col, inplace=True) return libcudf.copying.copy_range( value, self, 0, num_keys, start, stop, False ) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index ff6531c5e5a..34236a8c09e 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,7 +3,6 @@ from __future__ import annotations import itertools -import weakref from collections import abc from functools import cached_property, reduce from typing import ( @@ -113,7 +112,6 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names - # self._weak_ref = data._weak_ref else: # This code path is performance-critical for copies and should be # modified with care. @@ -134,7 +132,6 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - # self._weak_ref = {} @classmethod def _create_unsafe( @@ -311,11 +308,8 @@ def copy(self, deep=False) -> ColumnAccessor: Make a copy of this ColumnAccessor. 
""" if deep: - # import pdb;pdb.set_trace() - # self._weaks_ref = {k: weakref.ref(v) for k, v in self._data.items()} return self.__class__( {k: v.copy(deep=True) for k, v in self._data.items()}, - # self._data.copy(), multiindex=self.multiindex, level_names=self.level_names, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a072a52fc58..db7bc34cd36 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3136,7 +3136,7 @@ def rename( columns = ( mapper if columns is None and axis in (1, "columns") else columns ) - # import pdb;pdb.set_trace() + if index: if ( any(type(item) == str for item in index.values()) @@ -3154,9 +3154,6 @@ def rename( value=list(index.values()), inplace=True, ) - # for t_col in out_index_frame._data.names: - # out_index._data[t_col] = out_index_frame._data[t_col] - # out_index._data[level] = out_index_frame._data[level] out = DataFrame(index=out_index) else: to_replace = list(index.keys()) @@ -3488,7 +3485,7 @@ def transpose(self): # No column from index is transposed with libcudf. 
source_columns = [*self._columns] - # import pdb;pdb.set_trace() + source_dtype = source_columns[0].dtype if is_categorical_dtype(source_dtype): if any(not is_categorical_dtype(c.dtype) for c in source_columns): @@ -3502,7 +3499,7 @@ def transpose(self): if any(c.dtype != source_columns[0].dtype for c in source_columns): raise ValueError("Columns must all have the same dtype") - # import pdb;pdb.set_trace() + result_columns = libcudf.transpose.transpose(source_columns) if is_categorical_dtype(source_dtype): @@ -6130,7 +6127,11 @@ def to_struct(self, name=None): field_names = [str(name) for name in self._data.names] col = cudf.core.column.build_struct_column( - names=field_names, children=tuple([col.custom_deep_copy() for col in self._data.columns]), size=len(self) + names=field_names, + children=tuple( + [col.custom_deep_copy() for col in self._data.columns] + ), + size=len(self), ) return cudf.Series._from_data( cudf.core.column_accessor.ColumnAccessor( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c56c95761d2..ebddc24a783 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,6 @@ import operator import pickle import warnings -import weakref from collections import abc from typing import ( Any, @@ -142,15 +141,6 @@ def _mimic_inplace( if inplace: for col in self._data: if col in result._data: - # self._data.set_by_label(col, result._data[col], validate=False) - # self._data[col] = self._data[col].custom_deep_copy() - # self._data[col] = result._data[col] - # if weakref.getweakrefcount(self._data[col]) > 0: - # self._data[col] = self._data[col].custom_deep_copy() - # self._data._weak_ref[col] = weakref.ref( - # self._data[col] - # ) - self._data[col]._temp_mimic_inplace( result._data[col], inplace=True ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6874edb975b..5bb76413289 100644 --- a/python/cudf/cudf/core/series.py +++ 
b/python/cudf/cudf/core/series.py @@ -6,7 +6,6 @@ import inspect import pickle import warnings -import weakref from collections import abc from shutil import get_terminal_size from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union @@ -145,14 +144,7 @@ def __setitem__(self, key, value): self._frame._column.astype(to_dtype), inplace=True ) - # if weakref.getweakrefcount(self._frame._column) == 0: - # import pdb;pdb.set_trace() self._frame._column[key] = value - # else: - # prev_col = self._frame._column - # self._frame._column = self._frame._column.custom_deep_copy() - # self._frame._column._weak_ref = weakref.ref(prev_col) - # self._frame._column[key] = value class _SeriesLocIndexer(_FrameIndexer): From 744446ffe93f0dc60e4a982fe44c9a2de6761082 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 11 Jul 2022 13:18:57 -0700 Subject: [PATCH 012/124] use base_data --- python/cudf/cudf/_lib/column.pyx | 41 +++++++++++++++++++++++++- python/cudf/cudf/core/column/column.py | 17 ++++++++++- python/cudf/cudf/core/series.py | 7 +++-- 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 8c376aa9587..7c254a3f421 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -323,6 +323,41 @@ cdef class Column: self._children = None self._base_children = value + + def has_a_weakref(self): + weakref_count = weakref.getweakrefcount(self.base_data) + #print("weakref_count", weakref_count) + if weakref_count == 0: + #print("330") + return False + elif weakref_count == 1: + #print("333", weakref.getweakrefs(self.base_data)[0]() is self.base_data) + return not (weakref.getweakrefs(self.base_data)[0]() is self.base_data) + #return True + else: + #print("336") + return True + + def detach_refs(self): + """ + Given another column, update the attributes of this column to mimic an + inplace operation. 
This does not modify the memory of Buffers, but + instead replaces the Buffers and other attributes underneath the column + object with the Buffers and attributes from the other column. + """ + #print("334") + if self.has_a_weakref(): + #print("335") + new_col = self.custom_deep_copy() + + self._offset = new_col.offset + self._size = new_col.size + self._dtype = new_col._dtype + self.set_base_data(new_col.base_data) + self.set_base_children(new_col.base_children) + self.set_base_mask(new_col.base_mask) + self._weak_ref = None + def _temp_mimic_inplace(self, other_col, inplace=False): """ @@ -331,10 +366,13 @@ cdef class Column: instead replaces the Buffers and other attributes underneath the column object with the Buffers and attributes from the other column. """ + #print("369") if inplace: - if weakref.getweakrefcount(other_col) > 0: + if other_col.has_a_weakref(): + #print("355") new_col = other_col.custom_deep_copy() else: + #print("358") new_col = other_col self._offset = new_col.offset @@ -343,6 +381,7 @@ cdef class Column: self.set_base_data(new_col.base_data) self.set_base_children(new_col.base_children) self.set_base_mask(new_col.base_mask) + self._weak_ref = other_col._weak_ref else: return other_col diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f8372bec0ad..afaaf554191 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -86,6 +86,8 @@ # method in ColumnBase. 
Slice = TypeVar("Slice", bound=slice) +def custom_weakref_callback(ref): + pass class ColumnBase(Column, Serializable, BinaryOperand, Reducible): _VALID_REDUCTIONS = { @@ -338,6 +340,8 @@ def _fill( mask = create_null_mask(self.size, state=MaskState.ALL_VALID) self.set_base_mask(mask) + # import pdb;pdb.set_trace() + self.detach_refs() libcudf.filling.fill_in_place(self, begin, end, slr.device_value) return self @@ -358,6 +362,7 @@ def nullmask(self) -> Buffer: return self.mask_array_view def custom_deep_copy(self: T) -> T: + # print("CUSTOM COPYING") result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) @@ -380,7 +385,17 @@ def copy(self: T, deep: bool = True) -> T: ) # result = libcudf.copying.copy_column(self) # return cast(T, result._with_type_metadata(self.dtype)) - copied_col._weak_ref = weakref.ref(self) + # copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) + if self._weak_ref is None: + self._weak_ref = weakref.ref(copied_col.base_data, custom_weakref_callback) + copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) + else: + if self.has_a_weakref(): + copied_col._weak_ref = self._weak_ref + self._weak_ref = weakref.ref(copied_col.base_data, custom_weakref_callback) + else: + self._weak_ref = weakref.ref(copied_col.base_data, custom_weakref_callback) + copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) return copied_col else: return cast( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5bb76413289..58c971e9169 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -140,9 +140,10 @@ def __setitem__(self, key, value): value.dtype, self._frame._column.dtype ) value = value.astype(to_dtype) - self._frame._column._temp_mimic_inplace( - self._frame._column.astype(to_dtype), inplace=True - ) + if self._frame._column.dtype != to_dtype: + self._frame._column._temp_mimic_inplace( + 
self._frame._column.astype(to_dtype), inplace=True + ) self._frame._column[key] = value From d869dda0eee3e87bbc8ab17a5f4e4a3f9b08e86a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 11 Jul 2022 16:49:39 -0700 Subject: [PATCH 013/124] handle strings --- python/cudf/cudf/_lib/column.pyx | 6 ++-- python/cudf/cudf/core/column/column.py | 16 ++++++---- python/cudf/cudf/core/column/string.py | 44 ++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7c254a3f421..09ada19e4f5 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -366,13 +366,13 @@ cdef class Column: instead replaces the Buffers and other attributes underneath the column object with the Buffers and attributes from the other column. """ - #print("369") + print("369") if inplace: if other_col.has_a_weakref(): - #print("355") + print("355") new_col = other_col.custom_deep_copy() else: - #print("358") + print("358") new_col = other_col self._offset = new_col.offset diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index afaaf554191..4d3b70efbd5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -336,12 +336,11 @@ def _fill( inplace=True, ) + self.detach_refs() if not slr.is_valid() and not self.nullable: mask = create_null_mask(self.size, state=MaskState.ALL_VALID) self.set_base_mask(mask) - # import pdb;pdb.set_trace() - self.detach_refs() libcudf.filling.fill_in_place(self, begin, end, slr.device_value) return self @@ -365,6 +364,9 @@ def custom_deep_copy(self: T) -> T: # print("CUSTOM COPYING") result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) + + def get_weakref(self): + return weakref.ref(self.base_data, custom_weakref_callback) def copy(self: T, deep: bool = True) -> T: """Columns are immutable, so a deep copy 
produces a copy of the @@ -387,15 +389,15 @@ def copy(self: T, deep: bool = True) -> T: # return cast(T, result._with_type_metadata(self.dtype)) # copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) if self._weak_ref is None: - self._weak_ref = weakref.ref(copied_col.base_data, custom_weakref_callback) - copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) + self._weak_ref = copied_col.get_weakref() + copied_col._weak_ref = self.get_weakref() else: if self.has_a_weakref(): copied_col._weak_ref = self._weak_ref - self._weak_ref = weakref.ref(copied_col.base_data, custom_weakref_callback) + self._weak_ref = copied_col.get_weakref() else: - self._weak_ref = weakref.ref(copied_col.base_data, custom_weakref_callback) - copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) + self._weak_ref = copied_col.get_weakref() + copied_col._weak_ref = self.get_weakref() return copied_col else: return cast( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f44563f924e..6d2a9f68d1c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -15,6 +15,7 @@ cast, overload, ) +import weakref import cupy import numpy as np @@ -5098,6 +5099,49 @@ def __init__( self._start_offset = None self._end_offset = None + def get_weakref(self): + return weakref.ref(self, column.custom_weakref_callback) + + def has_a_weakref(self): + weakref_count = weakref.getweakrefcount(self) + #print("weakref_count", weakref_count) + if weakref_count == 0: + #print("330") + return False + elif weakref_count == 1: + #print("333", weakref.getweakrefs(self.base_data)[0]() is self.base_data) + return not (weakref.getweakrefs(self)[0]() is self) + #return True + else: + #print("336") + return True + + def copy(self, deep: bool = True): + """Columns are immutable, so a deep copy produces a copy of the + underlying data and mask and a shallow copy creates a new 
column and + copies the references of the data and mask. + """ + if deep: + copied_col = column.build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ) + return copied_col + else: + return column.build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ) + + @property def start_offset(self) -> int: if self._start_offset is None: From 219ee1b9d9ec67fbc71c94c7a6998c690aca2195 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 14 Jul 2022 13:42:10 -0700 Subject: [PATCH 014/124] struct & list --- python/cudf/cudf/core/column/lists.py | 38 ++++++++++++++++++++++++++ python/cudf/cudf/core/column/struct.py | 10 ++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c6a19f374bd..8dcddfd1027 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -193,6 +193,44 @@ def _with_type_metadata( return self + def copy(self, deep: bool = True): + """Columns are immutable, so a deep copy produces a copy of the + underlying data and mask and a shallow copy creates a new column and + copies the references of the data and mask. 
+ """ + if deep: + copied_col = column.build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ) + # result = libcudf.copying.copy_column(self) + # return cast(T, result._with_type_metadata(self.dtype)) + # copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) + # if self._weak_ref is None: + # self._weak_ref = copied_col.get_weakref() + # copied_col._weak_ref = self.get_weakref() + # else: + # if self.has_a_weakref(): + # copied_col._weak_ref = self._weak_ref + # self._weak_ref = copied_col.get_weakref() + # else: + # self._weak_ref = copied_col.get_weakref() + # copied_col._weak_ref = self.get_weakref() + return copied_col + else: + return column.build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ) + def leaves(self): if isinstance(self.elements, ListColumn): return self.elements.leaves() diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index c1b02182c17..c85e07d6331 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -84,7 +84,15 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def copy(self, deep=True): - result = super().copy(deep=deep) + # result = super().copy(deep=deep) + result = cudf.core.column.build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ) if deep: result = result._rename_fields(self.dtype.fields.keys()) return result From 4b423a4aeb84840fe0bff655e99a18c0cb12aac0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 19 Jul 2022 09:51:07 -0700 Subject: [PATCH 015/124] cleanup --- python/cudf/cudf/_lib/column.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx 
index 09ada19e4f5..7c254a3f421 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -366,13 +366,13 @@ cdef class Column: instead replaces the Buffers and other attributes underneath the column object with the Buffers and attributes from the other column. """ - print("369") + #print("369") if inplace: if other_col.has_a_weakref(): - print("355") + #print("355") new_col = other_col.custom_deep_copy() else: - print("358") + #print("358") new_col = other_col self._offset = new_col.offset From ff5985665a064d474574ca5b192e50d0144f12f9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 19 Sep 2022 10:33:47 -0700 Subject: [PATCH 016/124] add copy_on_write option --- python/cudf/cudf/_lib/column.pyx | 16 ++----- python/cudf/cudf/core/column/column.py | 55 +++++++++++++----------- python/cudf/cudf/core/column/string.py | 45 +++++++------------ python/cudf/cudf/core/dataframe.py | 3 +- python/cudf/cudf/options.py | 17 ++++++++ python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_index.py | 1 - 7 files changed, 67 insertions(+), 72 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 5ace2426755..8a41141a9a2 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -320,16 +320,12 @@ cdef class Column: def has_a_weakref(self): weakref_count = weakref.getweakrefcount(self.base_data) - #print("weakref_count", weakref_count) + if weakref_count == 0: - #print("330") return False elif weakref_count == 1: - #print("333", weakref.getweakrefs(self.base_data)[0]() is self.base_data) return not (weakref.getweakrefs(self.base_data)[0]() is self.base_data) - #return True else: - #print("336") return True def detach_refs(self): @@ -339,10 +335,8 @@ cdef class Column: instead replaces the Buffers and other attributes underneath the column object with the Buffers and attributes from the other column. 
""" - #print("334") if self.has_a_weakref(): - #print("335") - new_col = self.custom_deep_copy() + new_col = self.force_deep_copy() self._offset = new_col.offset self._size = new_col.size @@ -360,13 +354,11 @@ cdef class Column: instead replaces the Buffers and other attributes underneath the column object with the Buffers and attributes from the other column. """ - #print("369") + if inplace: if other_col.has_a_weakref(): - #print("355") - new_col = other_col.custom_deep_copy() + new_col = other_col.force_deep_copy() else: - #print("358") new_col = other_col self._offset = new_col.offset diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 857b06f19d7..6aad85872eb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -89,9 +89,11 @@ # method in ColumnBase. Slice = TypeVar("Slice", bound=slice) + def custom_weakref_callback(ref): pass + class ColumnBase(Column, Serializable, BinaryOperand, Reducible): _VALID_REDUCTIONS = { "any", @@ -363,11 +365,10 @@ def nullmask(self) -> DeviceBufferLike: raise ValueError("Column has no null mask") return self.mask_array_view - def custom_deep_copy(self: T) -> T: - # print("CUSTOM COPYING") + def force_deep_copy(self: T) -> T: result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) - + def get_weakref(self): return weakref.ref(self.base_data, custom_weakref_callback) @@ -377,31 +378,33 @@ def copy(self: T, deep: bool = True) -> T: copies the references of the data and mask. 
""" if deep: - copied_col = cast( - T, - build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ), - ) - # result = libcudf.copying.copy_column(self) - # return cast(T, result._with_type_metadata(self.dtype)) - # copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) - if self._weak_ref is None: - self._weak_ref = copied_col.get_weakref() - copied_col._weak_ref = self.get_weakref() - else: - if self.has_a_weakref(): - copied_col._weak_ref = self._weak_ref - self._weak_ref = copied_col.get_weakref() - else: + if cudf.get_option("copy_on_write"): + copied_col = cast( + T, + build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ), + ) + + if self._weak_ref is None: self._weak_ref = copied_col.get_weakref() copied_col._weak_ref = self.get_weakref() - return copied_col + else: + if self.has_a_weakref(): + copied_col._weak_ref = self._weak_ref + self._weak_ref = copied_col.get_weakref() + else: + self._weak_ref = copied_col.get_weakref() + copied_col._weak_ref = self.get_weakref() + return copied_col + else: + result = libcudf.copying.copy_column(self) + return cast(T, result._with_type_metadata(self.dtype)) else: return cast( T, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7eeb1efea67..12f9c80ddfe 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4,6 +4,7 @@ import re import warnings +import weakref from functools import cached_property from typing import ( TYPE_CHECKING, @@ -15,7 +16,6 @@ cast, overload, ) -import weakref import cupy import numpy as np @@ -5188,43 +5188,28 @@ def get_weakref(self): def has_a_weakref(self): weakref_count = weakref.getweakrefcount(self) - #print("weakref_count", weakref_count) + if weakref_count == 0: - #print("330") return False elif 
weakref_count == 1: - #print("333", weakref.getweakrefs(self.base_data)[0]() is self.base_data) return not (weakref.getweakrefs(self)[0]() is self) - #return True else: - #print("336") return True def copy(self, deep: bool = True): - """Columns are immutable, so a deep copy produces a copy of the - underlying data and mask and a shallow copy creates a new column and - copies the references of the data and mask. - """ - if deep: - copied_col = column.build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ) - return copied_col - else: - return column.build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ) - + """String Columns are immutable, so a deep/shallow copy + produces a new column and copies the references of the + data and mask. + """ + + return column.build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ) @property def start_offset(self) -> int: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 52e7f645e2d..378e2929cd7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3550,7 +3550,6 @@ def transpose(self): # No column from index is transposed with libcudf. 
source_columns = [*self._columns] - source_dtype = source_columns[0].dtype if is_categorical_dtype(source_dtype): if any(not is_categorical_dtype(c.dtype) for c in source_columns): @@ -6171,7 +6170,7 @@ def to_struct(self, name=None): col = cudf.core.column.build_struct_column( names=field_names, children=tuple( - [col.custom_deep_copy() for col in self._data.columns] + [col.force_deep_copy() for col in self._data.columns] ), size=len(self), ) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 7f6a6f10e25..5a791575e5b 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -163,3 +163,20 @@ def _validator(val): ), _make_contains_validator([None, 32, 64]), ) + +_register_option( + "copy_on_write", + True, + textwrap.dedent( + """ + Default behavior of performing deep copies. + If set to `False`, each deep copy will perform a true deep copy. + If set to `True`, each deep copy will perform a shallow copy + with underlying data actually referring to the actual column, in this + case a copy is only made when there is a write operation performed on + the column. + \tValid values are True or False. Default is False. 
+ """ + ), + _make_contains_validator([False, True]), +) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 375b6020274..fbc4fd619a1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8641,7 +8641,7 @@ def test_rename_for_level_MultiIndex_dataframe(data, index, level): expect = pdf.rename(index=index, level=level) got = gdf.rename(index=index, level=level) - # import pdb;pdb.set_trace() + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index e119db82f44..675bb604519 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -255,7 +255,6 @@ def test_index_rename_inplace(): # inplace=False should yield a deep copy gds_renamed_deep = gds.rename("new_name", inplace=False) - # import pdb;pdb.set_trace() # assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr # inplace=True returns none From 3d177e08b56e4250dfd1bec2b6143ac68ab397ed Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 19 Sep 2022 11:19:44 -0700 Subject: [PATCH 017/124] simply has_a_weakref --- python/cudf/cudf/_lib/column.pyx | 6 ++---- python/cudf/cudf/core/column/string.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 8a41141a9a2..dee094ac40e 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -321,12 +321,10 @@ cdef class Column: def has_a_weakref(self): weakref_count = weakref.getweakrefcount(self.base_data) - if weakref_count == 0: - return False - elif weakref_count == 1: + if weakref_count == 1: return not (weakref.getweakrefs(self.base_data)[0]() is self.base_data) else: - return True + return weakref_count > 0 def detach_refs(self): """ diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 
12f9c80ddfe..798e724c59e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5189,12 +5189,10 @@ def get_weakref(self): def has_a_weakref(self): weakref_count = weakref.getweakrefcount(self) - if weakref_count == 0: - return False - elif weakref_count == 1: + if weakref_count == 1: return not (weakref.getweakrefs(self)[0]() is self) else: - return True + return weakref_count > 0 def copy(self, deep: bool = True): """String Columns are immutable, so a deep/shallow copy From 56e4af546e873c0343bc9851c0d3fbe03fa39bed Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 19 Sep 2022 11:26:04 -0700 Subject: [PATCH 018/124] internalize detach_refs --- python/cudf/cudf/_lib/column.pyx | 8 +------- python/cudf/cudf/core/column/column.py | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index dee094ac40e..03e287fdee3 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -326,13 +326,7 @@ cdef class Column: else: return weakref_count > 0 - def detach_refs(self): - """ - Given another column, update the attributes of this column to mimic an - inplace operation. This does not modify the memory of Buffers, but - instead replaces the Buffers and other attributes underneath the column - object with the Buffers and attributes from the other column. 
- """ + def _detach_refs(self): if self.has_a_weakref(): new_col = self.force_deep_copy() diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6aad85872eb..9747d442396 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -341,7 +341,7 @@ def _fill( inplace=True, ) - self.detach_refs() + self._detach_refs() if not slr.is_valid() and not self.nullable: mask = create_null_mask(self.size, state=MaskState.ALL_VALID) self.set_base_mask(mask) From 49994d55e84ee53d464dcdce737f17f33d0d5a45 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 19 Sep 2022 12:27:23 -0700 Subject: [PATCH 019/124] cleanup --- python/cudf/cudf/_lib/column.pyx | 19 +++++++------------ python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/series.py | 4 ++-- 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 03e287fdee3..2550ed240bd 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -339,7 +339,7 @@ cdef class Column: self._weak_ref = None - def _temp_mimic_inplace(self, other_col, inplace=False): + def _mimic_inplace(self, other_col, inplace=False): """ Given another column, update the attributes of this column to mimic an inplace operation. 
This does not modify the memory of Buffers, but @@ -348,18 +348,13 @@ cdef class Column: """ if inplace: - if other_col.has_a_weakref(): - new_col = other_col.force_deep_copy() - else: - new_col = other_col - - self._offset = new_col.offset - self._size = new_col.size - self._dtype = new_col._dtype - self.set_base_data(new_col.base_data) - self.set_base_children(new_col.base_children) - self.set_base_mask(new_col.base_mask) + self._offset = other_col.offset + self._size = other_col.size + self._dtype = other_col._dtype self._weak_ref = other_col._weak_ref + self.set_base_data(other_col.base_data) + self.set_base_children(other_col.base_children) + self.set_base_mask(other_col.base_mask) else: return other_col diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 8db74197a12..d438f47e1c4 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1293,7 +1293,7 @@ def memory_usage(self) -> int: def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False ) -> Optional[ColumnBase]: - out = super()._temp_mimic_inplace(other_col, inplace=inplace) + out = super()._mimic_inplace(other_col, inplace=inplace) if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes return out diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9747d442396..a546cdd008c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -336,7 +336,7 @@ def _fill( return libcudf.filling.fill(self, begin, end, slr.device_value) if is_string_dtype(self.dtype): - return self._temp_mimic_inplace( + return self._mimic_inplace( libcudf.filling.fill(self, begin, end, slr.device_value), inplace=True, ) @@ -530,7 +530,7 @@ def __setitem__(self, key: Any, value: Any): out = self._scatter_by_column(key, value_normalized) if out: - self._temp_mimic_inplace(out, inplace=True) + 
self._mimic_inplace(out, inplace=True) def _wrap_binop_normalization(self, other): if other is NA or other is None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1706fef10c0..c4aa6f4663c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -147,7 +147,7 @@ def _mimic_inplace( if inplace: for col in self._data: if col in result._data: - self._data[col]._temp_mimic_inplace( + self._data[col]._mimic_inplace( result._data[col], inplace=True ) self._data = result._data diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e05953d49c4..d8da5c53aee 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -170,7 +170,7 @@ def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): val_col = as_column(value, dtype=to_type) old_col = col.astype(to_type) - col._temp_mimic_inplace(concat_columns([old_col, val_col]), inplace=True) + col._mimic_inplace(concat_columns([old_col, val_col]), inplace=True) class _SeriesIlocIndexer(_FrameIndexer): @@ -228,7 +228,7 @@ def __setitem__(self, key, value): ) value = value.astype(to_dtype) if self._frame._column.dtype != to_dtype: - self._frame._column._temp_mimic_inplace( + self._frame._column._mimic_inplace( self._frame._column.astype(to_dtype), inplace=True ) From de6c9dc1bba47858b9e8d636a4282c3d94382fd4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 19 Sep 2022 18:49:21 -0700 Subject: [PATCH 020/124] fix --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 465cfed4257..d5bffd707e1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6170,7 +6170,7 @@ def to_struct(self, name=None): col = cudf.core.column.build_struct_column( names=field_names, children=tuple( - [col.force_deep_copy() for col in self._data.columns] + 
[col.copy(deep=True) for col in self._data.columns] ), size=len(self), ) From e2b746ea2e311cb1fcfe5ea6bf294cc55a72fb66 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 19 Sep 2022 18:51:24 -0700 Subject: [PATCH 021/124] cleanup --- python/cudf/cudf/core/column/lists.py | 40 ++++++-------------------- python/cudf/cudf/core/column/struct.py | 16 +++++------ 2 files changed, 16 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 90b03be8b46..f021b75d803 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -202,38 +202,14 @@ def copy(self, deep: bool = True): underlying data and mask and a shallow copy creates a new column and copies the references of the data and mask. """ - if deep: - copied_col = column.build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ) - # result = libcudf.copying.copy_column(self) - # return cast(T, result._with_type_metadata(self.dtype)) - # copied_col._weak_ref = weakref.ref(self.base_data, custom_weakref_callback) - # if self._weak_ref is None: - # self._weak_ref = copied_col.get_weakref() - # copied_col._weak_ref = self.get_weakref() - # else: - # if self.has_a_weakref(): - # copied_col._weak_ref = self._weak_ref - # self._weak_ref = copied_col.get_weakref() - # else: - # self._weak_ref = copied_col.get_weakref() - # copied_col._weak_ref = self.get_weakref() - return copied_col - else: - return column.build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ) + return column.build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ) def leaves(self): if isinstance(self.elements, ListColumn): diff --git a/python/cudf/cudf/core/column/struct.py 
b/python/cudf/cudf/core/column/struct.py index 2eaec2caa47..24cefe3ed30 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -82,15 +82,15 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def copy(self, deep=True): - # result = super().copy(deep=deep) + result = cudf.core.column.build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ) + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ) if deep: result = result._rename_fields(self.dtype.fields.keys()) return result From 3bbc9f95a661582da9cf9f13d8c7250acc16f18b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 28 Sep 2022 13:11:27 -0700 Subject: [PATCH 022/124] Fix non cow tests --- python/cudf/cudf/options.py | 4 ++-- python/cudf/cudf/tests/test_index.py | 6 +++++- python/cudf/cudf/tests/test_multiindex.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 5a791575e5b..1f09c6fea79 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -4,7 +4,7 @@ from collections.abc import Container from dataclasses import dataclass from typing import Any, Callable, Dict, Optional - +import os @dataclass class Option: @@ -166,7 +166,7 @@ def _validator(val): _register_option( "copy_on_write", - True, + os.environ.get("CUDF_COPY_ON_WRITE", "0") == "1", textwrap.dedent( """ Default behavior of performing deep copies. 
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index e11683a103f..a11cb54ef77 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -396,7 +396,11 @@ def test_index_copy_category(name, dtype, deep=True): def test_index_copy_deep(idx, deep): """Test if deep copy creates a new instance for device data.""" idx_copy = idx.copy(deep=deep) - if not deep: + if isinstance(idx, cudf.StringIndex): + # StringColumn is immutable hence, deep copies of a + # StringIndex will share the same StringColumn. + assert_column_memory_eq(idx._values, idx_copy._values) + elif not deep: assert_column_memory_eq(idx._values, idx_copy._values) else: assert_column_memory_ne(idx._values, idx_copy._values) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 37e1c353efd..a572b04c33f 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -786,7 +786,7 @@ def test_multiindex_copy_deep(data, deep): lptrs = [child.base_data.ptr for child in lchildren] rptrs = [child.base_data.ptr for child in rchildren] - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + assert all((x == y) == True for x, y in zip(lptrs, rptrs)) elif isinstance(data, cudf.MultiIndex): mi1 = data From 79c5f17eceebae80f26cb2f6404e482893a6ce57 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 28 Sep 2022 13:11:41 -0700 Subject: [PATCH 023/124] Fix non cow tests --- python/cudf/cudf/options.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 1f09c6fea79..00561ed782c 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,10 +1,11 @@ # Copyright (c) 2022, NVIDIA CORPORATION. 
+import os import textwrap from collections.abc import Container from dataclasses import dataclass from typing import Any, Callable, Dict, Optional -import os + @dataclass class Option: From 79cc09ff84fe20d601de730147a0e6f879488a4a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 28 Sep 2022 13:59:27 -0700 Subject: [PATCH 024/124] pytest fix --- python/cudf/cudf/tests/test_dataframe_copy.py | 6 +++++- python/cudf/cudf/tests/test_index.py | 9 ++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 1a9098c70db..70644a33266 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -6,6 +6,7 @@ import pytest from numba import cuda +import cudf from cudf.core.dataframe import DataFrame from cudf.testing._utils import ALL_TYPES, assert_eq @@ -161,7 +162,10 @@ def test_kernel_deep_copy(): sr = gdf["b"] add_one[1, len(sr)](sr._column.data_array_view) - assert not gdf.to_string().split() == cdf.to_string().split() + if cudf.get_option("copy_on_write"): + assert gdf.to_string().split() == cdf.to_string().split() + else: + assert not gdf.to_string().split() == cdf.to_string().split() def test_kernel_shallow_copy(): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index a11cb54ef77..4a3e94c81d5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -396,11 +396,14 @@ def test_index_copy_category(name, dtype, deep=True): def test_index_copy_deep(idx, deep): """Test if deep copy creates a new instance for device data.""" idx_copy = idx.copy(deep=deep) - if isinstance(idx, cudf.StringIndex): + + if isinstance(idx, cudf.StringIndex) or not deep or cudf.get_option("copy_on_write"): # StringColumn is immutable hence, deep copies of a # StringIndex will share the same StringColumn. 
- assert_column_memory_eq(idx._values, idx_copy._values) - elif not deep: + + # When `copy_on_write` is turned on, Index objects will + # have unique column object but they all point to same + # data pointers. assert_column_memory_eq(idx._values, idx_copy._values) else: assert_column_memory_ne(idx._values, idx_copy._values) From 385b7dec985506652977bf94ae19de52961318eb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 6 Oct 2022 13:51:20 -0700 Subject: [PATCH 025/124] Handle categoricals --- python/cudf/cudf/_lib/column.pyi | 4 +++ python/cudf/cudf/core/column/categorical.py | 34 +++++-------------- python/cudf/cudf/tests/test_dataframe_copy.py | 2 +- python/cudf/cudf/tests/test_index.py | 12 +++++-- python/cudf/cudf/tests/test_multiindex.py | 4 +-- 5 files changed, 24 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index fd9aab038d4..3c14c49506d 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,6 +2,7 @@ from __future__ import annotations +import weakref from typing import Dict, Optional, Tuple, TypeVar from cudf._typing import Dtype, DtypeObj, ScalarLike @@ -22,6 +23,7 @@ class Column: _children: Tuple[ColumnBase, ...] _base_children: Tuple[ColumnBase, ...] _distinct_count: Dict[bool, int] + _weak_ref: Optional[weakref.ref] def __init__( self, @@ -70,6 +72,8 @@ class Column: @property def children(self) -> Tuple[ColumnBase, ...]: ... def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... + def _detach_refs(self) -> None: ... + def has_a_weakref(self) -> bool: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False ) -> Optional[ColumnBase]: ... 
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 601ad707ba6..d25ef9a4e8b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -2,6 +2,7 @@ from __future__ import annotations +import weakref from collections import abc from functools import cached_property from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast @@ -26,6 +27,11 @@ min_unsigned_type, ) + +def custom_weakref_callback(ref): + pass + + if TYPE_CHECKING: from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex from cudf.core.column import ( @@ -1259,32 +1265,8 @@ def _get_decategorized_column(self) -> ColumnBase: out = out.set_mask(self.mask) return out - def copy(self, deep: bool = True) -> CategoricalColumn: - if deep: - copied_col = libcudf.copying.copy_column(self) - copied_cat = libcudf.copying.copy_column(self.dtype._categories) - - return column.build_categorical_column( - categories=copied_cat, - codes=column.build_column( - copied_col.base_data, dtype=copied_col.dtype - ), - offset=copied_col.offset, - size=copied_col.size, - mask=copied_col.base_mask, - ordered=self.dtype.ordered, - ) - else: - return column.build_categorical_column( - categories=self.dtype.categories._values, - codes=column.build_column( - self.codes.base_data, dtype=self.codes.dtype - ), - mask=self.base_mask, - ordered=self.dtype.ordered, - offset=self.offset, - size=self.size, - ) + def get_weakref(self): + return weakref.ref(self.codes.base_data, custom_weakref_callback) @cached_property def memory_usage(self) -> int: diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 70644a33266..6e9c85cc9dc 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
from copy import copy, deepcopy import numpy as np diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 4a3e94c81d5..663630da052 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -255,7 +255,9 @@ def test_index_rename_inplace(): # inplace=False should yield a deep copy gds_renamed_deep = gds.rename("new_name", inplace=False) - # assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr + assert ( + gds_renamed_deep._values.data_ptr == gds._values.data_ptr + ) == cudf.get_option("copy_on_write") # inplace=True returns none expected_ptr = gds._values.data_ptr @@ -396,8 +398,12 @@ def test_index_copy_category(name, dtype, deep=True): def test_index_copy_deep(idx, deep): """Test if deep copy creates a new instance for device data.""" idx_copy = idx.copy(deep=deep) - - if isinstance(idx, cudf.StringIndex) or not deep or cudf.get_option("copy_on_write"): + # import pdb;pdb.set_trace() + if ( + isinstance(idx, cudf.StringIndex) + or not deep + or cudf.get_option("copy_on_write") + ): # StringColumn is immutable hence, deep copies of a # StringIndex will share the same StringColumn. 
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a572b04c33f..f46f30d583f 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -766,7 +766,7 @@ def test_multiindex_copy_deep(data, deep): Case1: Constructed from GroupBy, StringColumns Case2: Constructed from MultiIndex, NumericColumns """ - same_ref = not deep + same_ref = (not deep) or cudf.get_option("copy_on_write") if isinstance(data, dict): import operator @@ -786,7 +786,7 @@ def test_multiindex_copy_deep(data, deep): lptrs = [child.base_data.ptr for child in lchildren] rptrs = [child.base_data.ptr for child in rchildren] - assert all((x == y) == True for x, y in zip(lptrs, rptrs)) + assert all((x == y) for x, y in zip(lptrs, rptrs)) elif isinstance(data, cudf.MultiIndex): mi1 = data From 2d2488206990b2e743edf1888f05a81aa600e0f4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 6 Oct 2022 13:59:05 -0700 Subject: [PATCH 026/124] style --- python/cudf/cudf/tests/test_dataframe_copy.py | 1 + python/cudf/cudf/tests/test_index.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 6e9c85cc9dc..5f4347e2956 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -1,4 +1,5 @@ # Copyright (c) 2018-2022, NVIDIA CORPORATION. 
+ from copy import copy, deepcopy import numpy as np diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 663630da052..8f74daf3028 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -398,7 +398,7 @@ def test_index_copy_category(name, dtype, deep=True): def test_index_copy_deep(idx, deep): """Test if deep copy creates a new instance for device data.""" idx_copy = idx.copy(deep=deep) - # import pdb;pdb.set_trace() + if ( isinstance(idx, cudf.StringIndex) or not deep From f755758ec6e81f41d06a588d2c7433db7f1498d2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 6 Oct 2022 14:02:15 -0700 Subject: [PATCH 027/124] style --- python/cudf/cudf/_lib/column.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 2550ed240bd..f46fd6d0a0f 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -317,12 +317,14 @@ cdef class Column: self._children = None self._base_children = value - + def has_a_weakref(self): weakref_count = weakref.getweakrefcount(self.base_data) if weakref_count == 1: - return not (weakref.getweakrefs(self.base_data)[0]() is self.base_data) + return not ( + weakref.getweakrefs(self.base_data)[0]() is self.base_data + ) else: return weakref_count > 0 @@ -338,7 +340,6 @@ cdef class Column: self.set_base_mask(new_col.base_mask) self._weak_ref = None - def _mimic_inplace(self, other_col, inplace=False): """ Given another column, update the attributes of this column to mimic an From 1cebe44b86a995039838481df3c710a79116d691 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 6 Oct 2022 15:33:08 -0700 Subject: [PATCH 028/124] struct fix --- python/cudf/cudf/core/dataframe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2a132ceada..dfe2b356751 
100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6216,7 +6216,10 @@ def to_struct(self, name=None): col = cudf.core.column.build_struct_column( names=field_names, children=tuple( - [col.copy(deep=True) for col in self._data.columns] + [ + col.copy(deep=cudf.get_option("copy_on_write")) + for col in self._data.columns + ] ), size=len(self), ) From 03267632cd913bb7b3ac7897ca7e77a0dc0b2f19 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 7 Oct 2022 12:31:25 -0700 Subject: [PATCH 029/124] detach in CAI --- python/cudf/cudf/_lib/column.pyx | 3 ++- python/cudf/cudf/core/column/column.py | 3 +++ python/cudf/cudf/core/column/datetime.py | 3 +++ python/cudf/cudf/core/column/numerical.py | 3 +++ python/cudf/cudf/tests/test_dataframe_copy.py | 6 +----- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f46fd6d0a0f..8fa71abad92 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -85,6 +85,7 @@ cdef class Column: self.set_base_data(data) self.set_base_mask(mask) self._weak_ref = weak_ref + self._zero_copied = False @property def base_size(self): @@ -329,7 +330,7 @@ cdef class Column: return weakref_count > 0 def _detach_refs(self): - if self.has_a_weakref(): + if not self._zero_copied and self.has_a_weakref(): new_col = self.force_deep_copy() self._offset = new_col.offset diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f4abb80f311..35060331cae 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -121,6 +121,9 @@ def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object """ + self._detach_refs() + self._zero_copied = True + return cuda.as_cuda_array(self.data).view(self.dtype) @property diff --git a/python/cudf/cudf/core/column/datetime.py 
b/python/cudf/cudf/core/column/datetime.py index 1419b14e8c6..a67a5ea7eec 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -282,6 +282,9 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": @property def __cuda_array_interface__(self) -> Mapping[str, Any]: + self._detach_refs() + self._zero_copied = True + output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a66c11c8bdc..9d95776eff7 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -168,6 +168,9 @@ def __setitem__(self, key: Any, value: Any): @property def __cuda_array_interface__(self) -> Mapping[str, Any]: + self._detach_refs() + self._zero_copied = True + output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 5f4347e2956..3de12c6a1a0 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -7,7 +7,6 @@ import pytest from numba import cuda -import cudf from cudf.core.dataframe import DataFrame from cudf.testing._utils import ALL_TYPES, assert_eq @@ -163,10 +162,7 @@ def test_kernel_deep_copy(): sr = gdf["b"] add_one[1, len(sr)](sr._column.data_array_view) - if cudf.get_option("copy_on_write"): - assert gdf.to_string().split() == cdf.to_string().split() - else: - assert not gdf.to_string().split() == cdf.to_string().split() + assert not gdf.to_string().split() == cdf.to_string().split() def test_kernel_shallow_copy(): From 3ef8ae424fa27bf4c27d05bef28c89c9113a869d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 11 Oct 2022 07:43:30 -0700 Subject: [PATCH 030/124] add Buffer._detach --- python/cudf/cudf/_lib/column.pyx | 4 ++-- python/cudf/cudf/core/buffer.py | 9 +++++++++ 2 files changed, 
11 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 8fa71abad92..9af157097fd 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -128,7 +128,7 @@ cdef class Column: return self.data.ptr def set_base_data(self, value): - if value is not None and not isinstance(value, DeviceBufferLike): + if value is not None and not isinstance(value, (Buffer, DeviceBufferLike)): raise TypeError( "Expected a DeviceBufferLike or None for data, " f"got {type(value).__name__}" @@ -177,7 +177,7 @@ cdef class Column: modify size or offset in any way, so the passed mask is expected to be compatible with the current offset. """ - if value is not None and not isinstance(value, DeviceBufferLike): + if value is not None and not isinstance(value, (Buffer, DeviceBufferLike)): raise TypeError( "Expected a DeviceBufferLike or None for mask, " f"got {type(value).__name__}" diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 647e747e127..180a68fef09 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -226,6 +226,15 @@ def __cuda_array_interface__(self) -> dict: "typestr": "|u1", "version": 0, } + + def _detach(self): + # make a deep copy of existing DeviceBuffer + # and replace pointer to it. 
+ current_buf = rmm.DeviceBuffer(ptr=self.ptr, size=self.size) + new_buf = current_buf.copy() + self._ptr = new_buf.ptr + self._size = new_buf.size + self._owner = new_buf def memoryview(self) -> memoryview: host_buf = bytearray(self.size) From e62c9bef6c6ef04631b1a88bd9ed8c0092b920f4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 11 Oct 2022 07:43:40 -0700 Subject: [PATCH 031/124] add Buffer._detach --- python/cudf/cudf/core/buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 180a68fef09..af3df4586dc 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -226,7 +226,7 @@ def __cuda_array_interface__(self) -> dict: "typestr": "|u1", "version": 0, } - + def _detach(self): # make a deep copy of existing DeviceBuffer # and replace pointer to it. From a0f90d3dc355afaf9810fbb0ea315425d5b8fe9b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 20 Oct 2022 19:58:25 -0700 Subject: [PATCH 032/124] Lower weakref to Buffer class --- python/cudf/cudf/_lib/column.pyi | 1 + python/cudf/cudf/_lib/column.pyx | 16 +- python/cudf/cudf/_lib/transform.pyx | 2 +- python/cudf/cudf/core/abc.py | 9 +- python/cudf/cudf/core/buffer.py | 118 ++++++++++-- python/cudf/cudf/core/column/categorical.py | 3 - python/cudf/cudf/core/column/column.py | 73 +++++--- python/cudf/cudf/core/column/numerical.py | 5 +- python/cudf/cudf/core/column/string.py | 10 +- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/dataframe.py | 9 +- python/cudf/cudf/core/frame.py | 8 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/reshape.py | 4 +- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/tests/test_copying.py | 195 ++++++++++++++++++++ python/cudf/cudf/utils/applyutils.py | 2 +- 17 files changed, 386 insertions(+), 77 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 
3c14c49506d..aa0de73e506 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -74,6 +74,7 @@ class Column: def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... def _detach_refs(self) -> None: ... def has_a_weakref(self) -> bool: ... + def _is_cai_zero_copied(self) -> bool: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False ) -> Optional[ColumnBase]: ... diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 9af157097fd..2b14d9d126b 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. import weakref +from types import SimpleNamespace import cupy as cp import numpy as np @@ -228,9 +229,12 @@ cdef class Column: ) if value is None: mask = None - elif hasattr(value, "__cuda_array_interface__"): + elif isinstance(value, Buffer) or hasattr(value, "__cuda_array_interface__"): + if isinstance(value, Buffer): + value = SimpleNamespace(__cuda_array_interface__=value._cai, owner=value if value._owner is None else value._owner) if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): if isinstance(value, Column): + # TODO : PREM value = value.data_array_view value = cp.asarray(value).view('|u1') mask = as_device_buffer_like(value) @@ -320,14 +324,10 @@ cdef class Column: self._base_children = value def has_a_weakref(self): - weakref_count = weakref.getweakrefcount(self.base_data) + return self.base_data.has_a_weakref() or (self.base_mask.has_a_weakref() if self.base_mask else False) - if weakref_count == 1: - return not ( - weakref.getweakrefs(self.base_data)[0]() is self.base_data - ) - else: - return weakref_count > 0 + def _is_cai_zero_copied(self): + return self._zero_copied or (self.base_data is not None and self.base_data._is_cai_zero_copied()) or (self.base_mask is not None and self.base_mask._is_cai_zero_copied()) def _detach_refs(self): if not self._zero_copied and 
self.has_a_weakref(): diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 5fa45f68357..ff78ed88ce9 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -61,7 +61,7 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): Given a mask buffer, returns a boolean column representng bit 0 -> False and 1 -> True within range of [begin_bit, end_bit), """ - if not isinstance(mask_buffer, cudf.core.buffer.DeviceBufferLike): + if not isinstance(mask_buffer, (cudf.core.buffer.Buffer, cudf.core.buffer.DeviceBufferLike)): raise TypeError("mask_buffer is not an instance of " "cudf.core.buffer.DeviceBufferLike") cdef bitmask_type* bit_mask = (mask_buffer.ptr) diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index dcbf96313a7..3a3f77654b2 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -97,7 +97,14 @@ def device_serialize(self): """ header, frames = self.serialize() assert all( - isinstance(f, (cudf.core.buffer.DeviceBufferLike, memoryview)) + isinstance( + f, + ( + cudf.core.buffer.Buffer, + cudf.core.buffer.DeviceBufferLike, + memoryview, + ), + ) for f in frames ) header["type-serialized"] = pickle.dumps(type(self)) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index af3df4586dc..f0d683be02b 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -2,8 +2,10 @@ from __future__ import annotations +import copy import math import pickle +import weakref from typing import ( Any, Dict, @@ -125,6 +127,16 @@ def as_device_buffer_like(obj: Any) -> DeviceBufferLike: return Buffer(obj) +class BufferWeakref(object): + def __init__(self, ptr, size) -> None: + self.ptr = ptr + self.size = size + + +def custom_weakref_callback(ref): + pass + + class Buffer(Serializable): """ A Buffer represents device memory. 
@@ -150,10 +162,15 @@ class Buffer(Serializable): _ptr: int _size: int _owner: object + _refs = {} def __init__( self, data: Union[int, Any], *, size: int = None, owner: object = None ): + self._weak_ref = None + self._temp_ref = None + self._zero_copied = False + if isinstance(data, int): if size is None: raise ValueError( @@ -164,6 +181,7 @@ def __init__( self._ptr = data self._size = size self._owner = owner + self._update_ref() else: if size is not None or owner is not None: raise ValueError( @@ -173,10 +191,11 @@ def __init__( # `data` is a buffer-like object buf: Any = data - if isinstance(buf, rmm.DeviceBuffer): + if isinstance(buf, (Buffer, rmm.DeviceBuffer)): self._ptr = buf.ptr self._size = buf.size self._owner = buf + self._update_ref() return iface = getattr(buf, "__cuda_array_interface__", None) if iface: @@ -184,12 +203,14 @@ def __init__( self._ptr = ptr self._size = size self._owner = buf + self._update_ref() return ptr, size = get_ptr_and_size(np.asarray(buf).__array_interface__) buf = rmm.DeviceBuffer(ptr=ptr, size=size) self._ptr = buf.ptr self._size = buf.size self._owner = buf + self._update_ref() def __getitem__(self, key: slice) -> Buffer: if not isinstance(key, slice): @@ -201,6 +222,70 @@ def __getitem__(self, key: slice) -> Buffer: data=self.ptr + start, size=stop - start, owner=self.owner ) + def _is_cai_zero_copied(self): + return self._zero_copied + + def _update_ref(self): + if (self._ptr, self._size) not in Buffer._refs: + Buffer._refs[(self._ptr, self._size)] = BufferWeakref( + self._ptr, self._size + ) + self._temp_ref = Buffer._refs[(self._ptr, self._size)] + + def get_ref(self): + if self._temp_ref is None: + self._update_ref() + return self._temp_ref + + def has_a_weakref(self): + weakref_count = weakref.getweakrefcount(self.get_ref()) + + if weakref_count == 1: + return ( + not weakref.getweakrefs(self.get_ref())[0]() + is not self.get_ref() + ) + else: + return weakref_count > 0 + + def get_weakref(self): + return 
weakref.ref(self.get_ref(), custom_weakref_callback) + + def copy(self, deep: bool = True): + if deep: + if ( + cudf.get_option("copy_on_write") + and not self._is_cai_zero_copied() + ): + copied_buf = Buffer.__new__(Buffer) + copied_buf._ptr = self._ptr + copied_buf._size = self._size + copied_buf._owner = self._owner + copied_buf._temp_ref = None + copied_buf._weak_ref = None + copied_buf._zero_copied = False + + if self._weak_ref is None: + self._weak_ref = copied_buf.get_weakref() + copied_buf._weak_ref = self.get_weakref() + else: + if self.has_a_weakref(): + copied_buf._weak_ref = self._weak_ref + self._weak_ref = copied_buf.get_weakref() + else: + self._weak_ref = copied_buf.get_weakref() + copied_buf._weak_ref = self.get_weakref() + return copied_buf + else: + owner_copy = copy.copy(self._owner) + return Buffer(data=None, size=None, owner=owner_copy) + else: + shallow_copy = Buffer.__new__(Buffer) + shallow_copy._ptr = self._ptr + shallow_copy._size = self._size + shallow_copy._owner = self._owner + return shallow_copy + @property def size(self) -> int: return self._size @@ -218,7 +303,7 @@ def owner(self) -> Any: return self._owner @property - def __cuda_array_interface__(self) -> dict: + def _cai(self) -> dict: return { "data": (self.ptr, False), "shape": (self.size,), @@ -227,14 +312,21 @@ def __cuda_array_interface__(self) -> dict: "version": 0, } - def _detach(self): - # make a deep copy of existing DeviceBuffer - # and replace pointer to it. - current_buf = rmm.DeviceBuffer(ptr=self.ptr, size=self.size) - new_buf = current_buf.copy() - self._ptr = new_buf.ptr - self._size = new_buf.size - self._owner = new_buf + @property + def __cuda_array_interface__(self) -> dict: + self._detach_refs() + self._zero_copied = True + return self._cai + + def _detach_refs(self): + if not self._zero_copied and self.has_a_weakref(): + # make a deep copy of existing DeviceBuffer + # and replace pointer to it. 
+ current_buf = rmm.DeviceBuffer(ptr=self.ptr, size=self.size) + new_buf = current_buf.copy() + self._ptr = new_buf.ptr + self._size = new_buf.size + self._owner = new_buf def memoryview(self) -> memoryview: host_buf = bytearray(self.size) @@ -245,7 +337,7 @@ def serialize(self) -> Tuple[dict, list]: header = {} # type: Dict[Any, Any] header["type-serialized"] = pickle.dumps(type(self)) header["constructor-kwargs"] = {} - header["desc"] = self.__cuda_array_interface__.copy() + header["desc"] = self._cai.copy() header["desc"]["strides"] = (1,) header["frame_count"] = 1 frames = [self] @@ -258,11 +350,11 @@ def deserialize(cls, header: dict, frames: list) -> Buffer: ), "Only expecting to deserialize Buffer with a single frame." buf = cls(frames[0], **header["constructor-kwargs"]) - if header["desc"]["shape"] != buf.__cuda_array_interface__["shape"]: + if header["desc"]["shape"] != buf._cai["shape"]: raise ValueError( f"Received a `Buffer` with the wrong size." f" Expected {header['desc']['shape']}, " - f"but got {buf.__cuda_array_interface__['shape']}" + f"but got {buf._cai['shape']}" ) return buf diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d066e823f4a..913799d87a2 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1265,9 +1265,6 @@ def _get_decategorized_column(self) -> ColumnBase: out = out.set_mask(self.mask) return out - def get_weakref(self): - return weakref.ref(self.codes.base_data, custom_weakref_callback) - @cached_property def memory_usage(self) -> int: return self.categories.memory_usage + self.codes.memory_usage diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 35060331cae..5ab9b2be6dc 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -124,7 +124,35 @@ def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": self._detach_refs() 
self._zero_copied = True - return cuda.as_cuda_array(self.data).view(self.dtype) + return self._data_array_view + + @property + def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": + """ + View the data as a device array object + """ + return cuda.as_cuda_array( + SimpleNamespace( + __cuda_array_interface__=self.data._cai, + owner=self.data + if self.data._owner is None + else self.data._owner, + ) + ).view(self.dtype) + + @property + def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": + """ + View the mask as a device array + """ + return cuda.as_cuda_array( + SimpleNamespace( + __cuda_array_interface__=self.mask._cai, + owner=self.mask + if self.mask._owner is None + else self.mask._owner, + ) + ).view(mask_dtype) @property def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -169,7 +197,7 @@ def values_host(self) -> "np.ndarray": if self.has_nulls(): raise ValueError("Column must have no nulls.") - return self.data_array_view.copy_to_host() + return self._data_array_view.copy_to_host() @property def values(self) -> "cupy.ndarray": @@ -372,42 +400,44 @@ def nullmask(self) -> DeviceBufferLike: raise ValueError("Column has no null mask") return self.mask_array_view + @property + def _nullmask(self) -> DeviceBufferLike: + """The gpu buffer for the null-mask""" + if not self.nullable: + raise ValueError("Column has no null mask") + return self._mask_array_view + def force_deep_copy(self: T) -> T: result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) - def get_weakref(self): - return weakref.ref(self.base_data, custom_weakref_callback) - def copy(self: T, deep: bool = True) -> T: """Columns are immutable, so a deep copy produces a copy of the underlying data and mask and a shallow copy creates a new column and copies the references of the data and mask. 
""" if deep: - if cudf.get_option("copy_on_write"): + if ( + cudf.get_option("copy_on_write") + and not self._is_cai_zero_copied() + ): copied_col = cast( T, build_column( - self.base_data, + self.base_data + if self.base_data is None + else self.base_data.copy(deep=deep), self.dtype, - mask=self.base_mask, + mask=self.base_mask + if self.base_mask is None + else self.base_mask.copy(deep=deep), size=self.size, offset=self.offset, - children=self.base_children, + children=tuple( + col.copy(deep=True) for col in self.base_children + ), ), ) - - if self._weak_ref is None: - self._weak_ref = copied_col.get_weakref() - copied_col._weak_ref = self.get_weakref() - else: - if self.has_a_weakref(): - copied_col._weak_ref = self._weak_ref - self._weak_ref = copied_col.get_weakref() - else: - self._weak_ref = copied_col.get_weakref() - copied_col._weak_ref = self.get_weakref() return copied_col else: result = libcudf.copying.copy_column(self) @@ -1279,11 +1309,12 @@ def column_empty_like( ): column = cast("cudf.core.column.CategoricalColumn", column) codes = column_empty_like(column.codes, masked=masked, newsize=newsize) + return build_column( data=None, dtype=dtype, mask=codes.base_mask, - children=(as_column(codes.base_data, dtype=codes.dtype),), + children=(column_empty_like(codes, dtype=codes.dtype),), size=codes.size, ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 9d95776eff7..5ae0d3a604a 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -113,8 +113,8 @@ def __contains__(self, item: ScalarLike) -> bool: # Handles improper item types # Fails if item is of type None, so the handler. 
try: - if np.can_cast(item, self.data_array_view.dtype): - item = self.data_array_view.dtype.type(item) + if np.can_cast(item, self.dtype): + item = self.dtype.type(item) else: return False except (TypeError, ValueError): @@ -569,6 +569,7 @@ def fillna( def _find_value( self, value: ScalarLike, closest: bool, find: Callable, compare: str ) -> int: + # TODO : PREM value = to_cudf_compatible_scalar(value) if not is_number(value): raise ValueError("Expected a numeric value") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ffd0c11c7de..11ad51ff638 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5250,16 +5250,8 @@ def __init__( self._start_offset = None self._end_offset = None - def get_weakref(self): - return weakref.ref(self, column.custom_weakref_callback) - def has_a_weakref(self): - weakref_count = weakref.getweakrefcount(self) - - if weakref_count == 1: - return not (weakref.getweakrefs(self)[0]() is self) - else: - return weakref_count > 0 + return any(child.has_a_weakref() for child in self.children) def copy(self, deep: bool = True): """String Columns are immutable, so a deep/shallow copy diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e6d688014fa..feca68a868a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -131,7 +131,7 @@ def to_arrow(self) -> pa.Array: mask = None if self.nullable: mask = pa.py_buffer(self.mask_array_view.copy_to_host()) - data = pa.py_buffer(self.as_numerical.data_array_view.copy_to_host()) + data = pa.py_buffer(self.as_numerical.values_host) pa_dtype = np_to_pa_dtype(self.dtype) return pa.Array.from_buffers( type=pa_dtype, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index dfe2b356751..8f97c66b6c5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py 
@@ -6216,17 +6216,12 @@ def to_struct(self, name=None): col = cudf.core.column.build_struct_column( names=field_names, children=tuple( - [ - col.copy(deep=cudf.get_option("copy_on_write")) - for col in self._data.columns - ] + [col.copy(deep=True) for col in self._data.columns] ), size=len(self), ) return cudf.Series._from_data( - cudf.core.column_accessor.ColumnAccessor( - {name: col.copy(deep=True)} - ), + cudf.core.column_accessor.ColumnAccessor({name: col}), index=self.index, name=name, ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4fb914a6409..80dd086e233 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1444,7 +1444,7 @@ def searchsorted( # Return result as cupy array if the values is non-scalar # If values is scalar, result is expected to be scalar. - result = cupy.asarray(outcol.data_array_view) + result = cupy.asarray(outcol._data_array_view) if scalar_flag: return result[0].item() else: @@ -1697,8 +1697,8 @@ def _colwise_binop( # that nulls that are present in both left_column and # right_column are not filled. if left_column.nullable and right_column.nullable: - lmask = as_column(left_column.nullmask) - rmask = as_column(right_column.nullmask) + lmask = as_column(left_column._nullmask) + rmask = as_column(right_column._nullmask) output_mask = (lmask | rmask).data left_column = left_column.fillna(fill_value) right_column = right_column.fillna(fill_value) @@ -1752,7 +1752,7 @@ def _apply_cupy_ufunc_to_operands( cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): if isinstance(inp, ColumnBase) and inp.has_nulls(): - new_mask = as_column(inp.nullmask) + new_mask = as_column(inp._nullmask) # TODO: This is a hackish way to perform a bitwise and # of bitmasks. 
Once we expose diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 650d1d0d83a..3dfc0ccb769 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -155,7 +155,7 @@ def __init__( source_data = {} for i, (column_name, col) in enumerate(codes._data.items()): - if -1 in col.values: + if -1 in col: level = cudf.DataFrame( {column_name: [None] + list(levels[i])}, index=range(-1, len(levels[i])), diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 8e5d0ece729..8520adb9ba6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -577,9 +577,7 @@ def _tile(A, reps): mdata[var_name] = cudf.Series( cudf.core.column.build_categorical_column( categories=value_vars, - codes=cudf.core.column.as_column( - temp._column.base_data, dtype=temp._column.dtype - ), + codes=temp._column, mask=temp._column.base_mask, size=temp._column.size, offset=temp._column.offset, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 84aa4ae5894..63e95b96f9f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4832,10 +4832,10 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): index = as_index(a.index) a_col = column.as_column(a) - a_array = cupy.asarray(a_col.data_array_view) + a_array = cupy.asarray(a_col._data_array_view) b_col = column.as_column(b) - b_array = cupy.asarray(b_col.data_array_view) + b_array = cupy.asarray(b_col._data_array_view) result = cupy.isclose( a=a_array, b=b_array, rtol=rtol, atol=atol, equal_nan=equal_nan diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 1d3d9e91ae2..70b0b90566b 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+import cupy as cp import numpy as np import pandas as pd import pytest @@ -52,3 +53,197 @@ def test_null_copy(): col = Series(np.arange(2049)) col[:] = None assert len(col) == 2049 + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_setitem_cow(copy_on_write): + cudf.set_option("copy_on_write", copy_on_write) + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=True) + + actual[1] = 100 + assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=True) + + actual[slice(0, 2, 1)] = 100 + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=True) + + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + + +def test_multiple_series_cow(): + cudf.set_option("copy_on_write", True) + s = cudf.Series([10, 20, 30, 40, 50]) + s1 = s.copy(deep=True) + s2 = s.copy(deep=True) + s3 = s.copy(deep=True) + s4 = s2.copy(deep=True) + s5 = s4.copy(deep=True) + s6 = s3.copy(deep=True) + + s1[0:3] = 10000 + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + for ser in [s, s2, s3, s4, s5, s6]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s6[0:3] = 3000 + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s2, s3, s4, s5]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s2[1:4] = 4000 + assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s3, s4, s5]: 
+ assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s4[2:4] = 5000 + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s3, s5]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s5[2:4] = 6000 + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s3]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s7 = s5.copy(deep=True) + assert_eq(s7, cudf.Series([10, 20, 6000, 6000, 50])) + s7[1:3] = 55 + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s3]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + del s2 + + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s4 + del s1 + + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s + del s6 + + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s5 + + assert_eq(s3, 
cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s3 + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_zero_copy(copy_on_write): + cudf.set_option("copy_on_write", copy_on_write) + s = cudf.Series([1, 2, 3, 4, 5]) + s1 = s.copy(deep=True) + cp_array = cp.asarray(s) + + assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) + + cp_array[0:3] = 10 + + assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) + + s2 = cudf.Series(cp_array) + assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) + s3 = s2.copy(deep=True) + cp_array[0] = 20 + + assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) + assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_str_copy(copy_on_write): + cudf.set_option("copy_on_write", copy_on_write) + s = cudf.Series(["a", "b", "c", "d", "e"]) + s1 = s.copy(deep=True) + s2 = s.copy(deep=True) + + assert_eq(s, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) + + s[0:3] = "abc" + + assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) + + s2[1:4] = "xyz" + + assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_cat_copy(copy_on_write): + 
cudf.set_option("copy_on_write", copy_on_write) + s = cudf.Series([10, 20, 30, 40, 50], dtype="category") + s1 = s.copy(deep=True) + s2 = s1.copy(deep=True) + s3 = s1.copy(deep=True) + + s[0] = 50 + assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + assert_eq(s2, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + + s2[3] = 10 + s3[2:5] = 20 + assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) + assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index 89331b933a8..a012d73fe67 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -110,7 +110,7 @@ def make_aggregate_nullmask(df, columns=None, op="__and__"): col = cudf.core.dataframe.extract_col(df, k) if not col.nullable: continue - nullmask = df[k].nullmask + nullmask = cudf.Series(df[k]._column._nullmask) if out_mask is None: out_mask = column.as_column( From 6028691e687fb7634e09822792154b02ef9d7e1f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 24 Oct 2022 10:46:21 -0700 Subject: [PATCH 033/124] cleanup --- python/cudf/cudf/_lib/column.pyx | 38 ++++++++++++++++++----- python/cudf/cudf/core/buffer.py | 9 +++++- python/cudf/cudf/core/column/column.py | 17 +++++----- python/cudf/cudf/core/column/numerical.py | 5 ++- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 2b14d9d126b..42a749913c7 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -129,7 +129,8 @@ cdef class Column: return self.data.ptr def set_base_data(self, value): - if value is not None 
and not isinstance(value, (Buffer, DeviceBufferLike)): + if value is not None and \ + not isinstance(value, (Buffer, DeviceBufferLike)): raise TypeError( "Expected a DeviceBufferLike or None for data, " f"got {type(value).__name__}" @@ -178,7 +179,8 @@ cdef class Column: modify size or offset in any way, so the passed mask is expected to be compatible with the current offset. """ - if value is not None and not isinstance(value, (Buffer, DeviceBufferLike)): + if value is not None and \ + not isinstance(value, (Buffer, DeviceBufferLike)): raise TypeError( "Expected a DeviceBufferLike or None for mask, " f"got {type(value).__name__}" @@ -229,12 +231,17 @@ cdef class Column: ) if value is None: mask = None - elif isinstance(value, Buffer) or hasattr(value, "__cuda_array_interface__"): + elif ( + isinstance(value, Buffer) or + hasattr(value, "__cuda_array_interface__") + ): if isinstance(value, Buffer): - value = SimpleNamespace(__cuda_array_interface__=value._cai, owner=value if value._owner is None else value._owner) + value = SimpleNamespace( + __cuda_array_interface__=value._cai, + owner=value + ) if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): if isinstance(value, Column): - # TODO : PREM value = value.data_array_view value = cp.asarray(value).view('|u1') mask = as_device_buffer_like(value) @@ -324,10 +331,27 @@ cdef class Column: self._base_children = value def has_a_weakref(self): - return self.base_data.has_a_weakref() or (self.base_mask.has_a_weakref() if self.base_mask else False) + return ( + self.base_data.has_a_weakref() or + ( + self.base_mask.has_a_weakref() + if self.base_mask + else False + ) + ) def _is_cai_zero_copied(self): - return self._zero_copied or (self.base_data is not None and self.base_data._is_cai_zero_copied()) or (self.base_mask is not None and self.base_mask._is_cai_zero_copied()) + return ( + self._zero_copied or + ( + self.base_data is not None and + self.base_data._is_cai_zero_copied() + ) or + ( + self.base_mask 
is not None and + self.base_mask._is_cai_zero_copied() + ) + ) def _detach_refs(self): if not self._zero_copied and self.has_a_weakref(): diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index f0d683be02b..5a36df7cd9e 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -55,6 +55,13 @@ def owner(self) -> Any: def __cuda_array_interface__(self) -> Mapping: """Implementation of the CUDA Array Interface.""" + @property + def _cai(self) -> Mapping: + """""" + + def copy(self, deep: bool = True) -> DeviceBufferLike: + """Make a copy of Buffer.""" + def memoryview(self) -> memoryview: """Read-only access to the buffer through host memory.""" @@ -162,7 +169,7 @@ class Buffer(Serializable): _ptr: int _size: int _owner: object - _refs = {} + _refs: dict = {} def __init__( self, data: Union[int, Any], *, size: int = None, owner: object = None diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index bd47ada4cfd..ab4eda59403 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,7 +4,6 @@ import pickle import warnings -import weakref from functools import cached_property from itertools import chain from types import SimpleNamespace @@ -133,10 +132,10 @@ def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ return cuda.as_cuda_array( SimpleNamespace( - __cuda_array_interface__=self.data._cai, - owner=self.data - if self.data._owner is None - else self.data._owner, + __cuda_array_interface__=self.data._cai + if self.data is not None + else None, + owner=self.data, ) ).view(self.dtype) @@ -147,10 +146,10 @@ def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ return cuda.as_cuda_array( SimpleNamespace( - __cuda_array_interface__=self.mask._cai, - owner=self.mask - if self.mask._owner is None - else self.mask._owner, + __cuda_array_interface__=self.mask._cai + if self.mask is not None + else None, + 
owner=self.mask, ) ).view(mask_dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 5ae0d3a604a..6e4d4e67513 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -569,21 +569,20 @@ def fillna( def _find_value( self, value: ScalarLike, closest: bool, find: Callable, compare: str ) -> int: - # TODO : PREM value = to_cudf_compatible_scalar(value) if not is_number(value): raise ValueError("Expected a numeric value") found = 0 if len(self): found = find( - self.data_array_view, + self._data_array_view, value, mask=self.mask, ) if found == -1: if self.is_monotonic_increasing and closest: found = find( - self.data_array_view, + self._data_array_view, value, mask=self.mask, compare=compare, From 5904c8e0605e5004f03ef08d66eaedb95c193130 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 24 Oct 2022 14:44:23 -0700 Subject: [PATCH 034/124] cleanup --- python/cudf/cudf/_lib/column.pyi | 1 - python/cudf/cudf/_lib/column.pyx | 5 ----- python/cudf/cudf/core/column/struct.py | 1 - 3 files changed, 7 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index aa0de73e506..27b43ff14c3 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -23,7 +23,6 @@ class Column: _children: Tuple[ColumnBase, ...] _base_children: Tuple[ColumnBase, ...] 
_distinct_count: Dict[bool, int] - _weak_ref: Optional[weakref.ref] def __init__( self, diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 42a749913c7..913bddbab42 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -74,7 +74,6 @@ cdef class Column: int offset=0, object null_count=None, object children=(), - object weak_ref=None, ): self._size = size @@ -85,7 +84,6 @@ cdef class Column: self.set_base_children(children) self.set_base_data(data) self.set_base_mask(mask) - self._weak_ref = weak_ref self._zero_copied = False @property @@ -214,7 +212,6 @@ cdef class Column: # `self.memory_usage` was never called before, So ignore. pass self._null_count = None - self._weak_ref = None def set_mask(self, value): """ @@ -363,7 +360,6 @@ cdef class Column: self.set_base_data(new_col.base_data) self.set_base_children(new_col.base_children) self.set_base_mask(new_col.base_mask) - self._weak_ref = None def _mimic_inplace(self, other_col, inplace=False): """ @@ -377,7 +373,6 @@ cdef class Column: self._offset = other_col.offset self._size = other_col.size self._dtype = other_col._dtype - self._weak_ref = other_col._weak_ref self.set_base_data(other_col.base_data) self.set_base_children(other_col.base_children) self.set_base_mask(other_col.base_mask) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 24cefe3ed30..51a10daf4fa 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -111,7 +111,6 @@ def _rename_fields(self, names): offset=self.offset, null_count=self.null_count, children=self.base_children, - weak_ref=self._weak_ref, ) @property From 0e4ce2664b140bd9a0c5c4daaf75629cd65948be Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 24 Oct 2022 14:54:33 -0700 Subject: [PATCH 035/124] cleanup --- python/cudf/cudf/_lib/column.pyi | 1 - python/cudf/cudf/_lib/column.pyx | 19 +++++++++++++++---- 
python/cudf/cudf/_lib/transform.pyx | 4 +++- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 27b43ff14c3..0b2904ade94 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,7 +2,6 @@ from __future__ import annotations -import weakref from typing import Dict, Optional, Tuple, TypeVar from cudf._typing import Dtype, DtypeObj, ScalarLike diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 913bddbab42..9d0456502e2 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -73,7 +73,7 @@ cdef class Column: object mask=None, int offset=0, object null_count=None, - object children=(), + object children=() ): self._size = size @@ -327,7 +327,11 @@ cdef class Column: self._children = None self._base_children = value - def has_a_weakref(self): + def has_a_weakref(self) -> bool: + """ + Determines if the column has a weak reference. + """ + return ( self.base_data.has_a_weakref() or ( @@ -337,7 +341,10 @@ cdef class Column: ) ) - def _is_cai_zero_copied(self): + def _is_cai_zero_copied(self) -> bool: + """ + Determines if the column is zero copied. + """ return ( self._zero_copied or ( @@ -351,7 +358,11 @@ cdef class Column: ) def _detach_refs(self): - if not self._zero_copied and self.has_a_weakref(): + """ + Detaches a column from it's current Buffers by making + a true deep-copy. 
+ """ + if not self._is_cai_zero_copied() and self.has_a_weakref(): new_col = self.force_deep_copy() self._offset = new_col.offset diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index ff78ed88ce9..87c3f24803a 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -61,7 +61,9 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): Given a mask buffer, returns a boolean column representng bit 0 -> False and 1 -> True within range of [begin_bit, end_bit), """ - if not isinstance(mask_buffer, (cudf.core.buffer.Buffer, cudf.core.buffer.DeviceBufferLike)): + if not isinstance(mask_buffer, ( + cudf.core.buffer.Buffer, cudf.core.buffer.DeviceBufferLike + )): raise TypeError("mask_buffer is not an instance of " "cudf.core.buffer.DeviceBufferLike") cdef bitmask_type* bit_mask = (mask_buffer.ptr) From 995b66c7e369673b5e8d5282d54f92100ec49650 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 24 Oct 2022 16:12:47 -0700 Subject: [PATCH 036/124] docstrings --- python/cudf/cudf/core/buffer.py | 143 +++++++++++++++++++- python/cudf/cudf/core/column/categorical.py | 6 - python/cudf/cudf/core/column/column.py | 10 +- 3 files changed, 141 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 5a36df7cd9e..93d96b28062 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -57,7 +57,10 @@ def __cuda_array_interface__(self) -> Mapping: @property def _cai(self) -> Mapping: - """""" + """ + Internal Implementation for the CUDA Array Interface without + triggering a deepcopy. + """ def copy(self, deep: bool = True) -> DeviceBufferLike: """Make a copy of Buffer.""" @@ -135,12 +138,91 @@ def as_device_buffer_like(obj: Any) -> DeviceBufferLike: class BufferWeakref(object): + """ + A proxy class to be used by ``Buffer`` for generating weakreferences. 
+ """ + def __init__(self, ptr, size) -> None: self.ptr = ptr self.size = size def custom_weakref_callback(ref): + """ + A callback for ``weakref.ref`` API to generate unique + weakref instances that can be counted correctly. + + Example below shows why this is necessary: + + In [1]: import cudf + In [2]: import weakref + + Let's create an object ``x`` that we are going to weakref: + + In [3]: x = cudf.core.buffer.BufferWeakref(1, 2) + + Now generate three weak-references of it: + + In [4]: a = weakref.ref(x) + In [5]: b = weakref.ref(x) + In [6]: c = weakref.ref(x) + + ``weakref.ref`` actually returns the same singleton object: + + In [7]: a + Out[7]: + In [8]: b + Out[8]: + In [9]: c + Out[9]: + + In [10]: a is b + Out[10]: True + In [11]: b is c + Out[11]: True + + This will be problematic as we cannot determine what is the count + of weak-references: + + In [12]: weakref.getweakrefcount(x) + Out[12]: 1 + + Notice, though we want ``weakref.getweakrefcount`` to return ``3``, it + returns ``1``. 
So we need to work-around this by using an empty/no-op + callback: + + In [13]: def custom_weakref_callback(ref): + ...: pass + ...: + + + In [14]: d = weakref.ref(x, custom_weakref_callback) + In [15]: e = weakref.ref(x, custom_weakref_callback) + In [16]: f = weakref.ref(x, custom_weakref_callback) + + Now there is an each unique weak-reference created: + + In [17]: d + Out[17]: + In [18]: e + Out[18]: + In [19]: f + Out[19]: + + Now calling ``weakref.getweakrefcount`` will result in ``4``, which is correct: + + In [20]: weakref.getweakrefcount(x) + Out[20]: 4 + + In [21]: d is not e + Out[21]: True + + In [22]: d is not f + Out[22]: True + + In [23]: e is not f + Out[23]: True + """ # noqa: E501 pass @@ -175,7 +257,7 @@ def __init__( self, data: Union[int, Any], *, size: int = None, owner: object = None ): self._weak_ref = None - self._temp_ref = None + self._proxy_ref = None self._zero_copied = False if isinstance(data, int): @@ -230,24 +312,40 @@ def __getitem__(self, key: slice) -> Buffer: ) def _is_cai_zero_copied(self): + """ + Returns a flag, that indicates if the Buffer has been zero-copied. + """ return self._zero_copied def _update_ref(self): + """ + Generate the new proxy reference. + """ if (self._ptr, self._size) not in Buffer._refs: Buffer._refs[(self._ptr, self._size)] = BufferWeakref( self._ptr, self._size ) - self._temp_ref = Buffer._refs[(self._ptr, self._size)] + self._proxy_ref = Buffer._refs[(self._ptr, self._size)] def get_ref(self): - if self._temp_ref is None: + """ + Returns the proxy reference. + """ + if self._proxy_ref is None: self._update_ref() - return self._temp_ref + return self._proxy_ref def has_a_weakref(self): + """ + Checks if the Buffer has a weak-reference. 
+ """ weakref_count = weakref.getweakrefcount(self.get_ref()) if weakref_count == 1: + # When the weakref_count is 1, it could be a possibility + # that a copied Buffer got destroyed and hence this + # method should return False in that case as there is only + # one Buffer pointing to the device memory. return ( not weakref.getweakrefs(self.get_ref())[0]() is not self.get_ref() @@ -256,9 +354,26 @@ def has_a_weakref(self): return weakref_count > 0 def get_weakref(self): + """ + Returns a weak-reference for the Buffer. + """ return weakref.ref(self.get_ref(), custom_weakref_callback) def copy(self, deep: bool = True): + """ + Return a copy of Buffer. + + Parameters + ---------- + deep : bool, default True + If True, returns a deep-copy of the underlying Buffer data. + If False, returns a shallow-copy of the Buffer pointing to + the same underlying data. + + Returns + ------- + Buffer + """ if deep: if ( cudf.get_option("copy_on_write") @@ -268,7 +383,7 @@ def copy(self, deep: bool = True): copied_buf._ptr = self._ptr copied_buf._size = self._size copied_buf._owner = self._owner - copied_buf._temp_ref = None + copied_buf._proxy_ref = None copied_buf._weak_ref = None copied_buf._zero_copied = False @@ -311,6 +426,10 @@ def owner(self) -> Any: @property def _cai(self) -> dict: + """ + Internal Implementation for the CUDA Array Interface without + triggering a deepcopy. + """ return { "data": (self.ptr, False), "shape": (self.size,), @@ -321,11 +440,23 @@ def _cai(self) -> dict: @property def __cuda_array_interface__(self) -> dict: + # Detach if there are any weak-references. self._detach_refs() + # Mark the Buffer as ``_zero_copied=True``, + # which will prevent any copy-on-write + # mechanism post this operation. + # This is done because we don't have any + # control over knowing if a third-party library + # has modified the data this Buffer is + # pointing to. 
self._zero_copied = True return self._cai def _detach_refs(self): + """ + Detaches a Buffer from it's weak-references by making + a true deep-copy. + """ if not self._zero_copied and self.has_a_weakref(): # make a deep copy of existing DeviceBuffer # and replace pointer to it. diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 913799d87a2..b435e78a6cb 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -2,7 +2,6 @@ from __future__ import annotations -import weakref from collections import abc from functools import cached_property from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast @@ -27,11 +26,6 @@ min_unsigned_type, ) - -def custom_weakref_callback(ref): - pass - - if TYPE_CHECKING: from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex from cudf.core.column import ( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ab4eda59403..9d1301317de 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -95,10 +95,6 @@ Slice = TypeVar("Slice", bound=slice) -def custom_weakref_callback(ref): - pass - - class ColumnBase(Column, Serializable, BinaryOperand, Reducible): _VALID_REDUCTIONS = { "any", @@ -128,7 +124,8 @@ def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": @property def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ - View the data as a device array object + Internal implementation for viewing the data as a device array object + without triggering a deep-copy. 
""" return cuda.as_cuda_array( SimpleNamespace( @@ -142,7 +139,8 @@ def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": @property def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ - View the mask as a device array + Internal implementation for viewing the mask as a device array object + without triggering a deep-copy. """ return cuda.as_cuda_array( SimpleNamespace( From ae4b5e0e5c61dfe3cfcf5e7d35fc2cfac960baca Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 24 Oct 2022 16:24:03 -0700 Subject: [PATCH 037/124] Move detach_refs to mutable_view() --- python/cudf/cudf/_lib/column.pyx | 1 + python/cudf/cudf/core/column/column.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 9d0456502e2..8a2b5ba8c72 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -394,6 +394,7 @@ cdef class Column: return self._view(libcudf_types.UNKNOWN_NULL_COUNT).null_count() cdef mutable_column_view mutable_view(self) except *: + self._detach_refs() if is_categorical_dtype(self.dtype): col = self.base_children[0] else: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9d1301317de..bf267acd87e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -373,7 +373,6 @@ def _fill( inplace=True, ) - self._detach_refs() if not slr.is_valid() and not self.nullable: mask = create_null_mask(self.size, state=MaskState.ALL_VALID) self.set_base_mask(mask) From 97584deef402d3f6c1d43bfcd06a6ea1b050c596 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 24 Oct 2022 16:28:34 -0700 Subject: [PATCH 038/124] refactor --- python/cudf/cudf/core/column/column.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index bf267acd87e..fd97c9e018d 100644 --- 
a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -404,6 +404,11 @@ def _nullmask(self) -> DeviceBufferLike: return self._mask_array_view def force_deep_copy(self: T) -> T: + """ + A method to force create a deep-copy of + a Column irrespective of whether `copy-on-write` + is enabled/disabled. + """ result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) @@ -420,10 +425,10 @@ def copy(self: T, deep: bool = True) -> T: copied_col = cast( T, build_column( - self.base_data + data=self.base_data if self.base_data is None else self.base_data.copy(deep=deep), - self.dtype, + dtype=self.dtype, mask=self.base_mask if self.base_mask is None else self.base_mask.copy(deep=deep), @@ -436,8 +441,7 @@ def copy(self: T, deep: bool = True) -> T: ) return copied_col else: - result = libcudf.copying.copy_column(self) - return cast(T, result._with_type_metadata(self.dtype)) + return self.force_deep_copy() else: return cast( T, From 804a121d0ec939334c2148806d0d6c0cf53b391e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 24 Oct 2022 19:29:12 -0700 Subject: [PATCH 039/124] changes --- python/cudf/cudf/core/column/lists.py | 7 ++++--- python/cudf/cudf/core/column/string.py | 11 ++++------- python/cudf/cudf/core/column/struct.py | 5 +++++ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index f021b75d803..f743b6bef4b 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -198,9 +198,10 @@ def _with_type_metadata( return self def copy(self, deep: bool = True): - """Columns are immutable, so a deep copy produces a copy of the - underlying data and mask and a shallow copy creates a new column and - copies the references of the data and mask. + """ + List columns are immutable, so both deep + and shallow copies share the underlying + device data and mask.
""" return column.build_column( self.base_data, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 11ad51ff638..5e845380c25 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4,7 +4,6 @@ import re import warnings -import weakref from functools import cached_property from typing import ( TYPE_CHECKING, @@ -5250,13 +5249,11 @@ def __init__( self._start_offset = None self._end_offset = None - def has_a_weakref(self): - return any(child.has_a_weakref() for child in self.children) - def copy(self, deep: bool = True): - """String Columns are immutable, so a deep/shallow copy - produces a new column and copies the references of the - data and mask. + """ + String columns are immutable, so both deep + and shallow copies share the underlying + device data and mask. """ return column.build_column( diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 51a10daf4fa..b21c6cd9968 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -82,6 +82,11 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def copy(self, deep=True): + """ + Struct columns are immutable, so both deep + and shallow copies share the underlying + device data and mask. 
+ """ result = cudf.core.column.build_column( self.base_data, From bf449bee77e99ba3d25b7ebc6006927318114e85 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 25 Oct 2022 14:40:40 -0700 Subject: [PATCH 040/124] design docs and improvements --- .../source/developer_guide/library_design.md | 54 +++++++++++++++++++ python/cudf/cudf/_lib/column.pyi | 2 +- python/cudf/cudf/_lib/column.pyx | 6 ++- python/cudf/cudf/core/buffer.py | 35 ++++++------ python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/column/datetime.py | 3 +- python/cudf/cudf/core/column/numerical.py | 3 +- python/cudf/cudf/tests/test_stats.py | 7 --- 8 files changed, 83 insertions(+), 30 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 457ae6a39ff..a575819e774 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -251,3 +251,57 @@ The pandas API also includes a number of helper objects, such as `GroupBy`, `Rol cuDF implements corresponding objects with the same APIs. Internally, these objects typically interact with cuDF objects at the Frame layer via composition. However, for performance reasons they frequently access internal attributes and methods of `Frame` and its subclasses. + + +## Copy on write + + +Copy on write is designed to reduce memory foot-print on GPUs. With this feature, a copy is only really made whenever +there is a write operation on a column. + +The core copy-on-write implementation relies in `Buffer` class. This class stores the pointer to the device memory and size. +With the help of `Buffer._ptr` and `Buffer._size` we create a unique singleton ``BufferWeakref``, which means all the new `Buffer`'s that are being created get the same ``BufferWeakref`` if they are all pointing to the same device memory. We +store this ``BufferWeakref`` in `Buffer._proxy_ref`. 
+ +When ``copy_on_write`` cudf option is set to ``True`` and someone asks for a copy of a Buffer, a shallow copy of the +``Buffer`` is created which points to the same underlying device memory(same `ptr`, `size` & `owner`). Next, a weak reference +is generated by calling ``weakref.ref``. This will later serve as an indication whether or not to make a copy when a +write operation is being performed on `Column`(more on that later). There are some different scenarios on **how** the weak-references are kept alive. + +1. If the current `Buffer`(call it `current_buf`) doesn't have any weak-reference, and we are trying to generate a copy +of it(call it `copied_buf`), then we generate weak-references of both the buffer's and assign it to each-other i.e., +`current_buf` will hold the weak-reference of `copied_buf` and `copied_buf` will hold the weak-reference of `current_buf`. +This is how we achieve this in code: + +``` +copied_buf._weak_ref = current_buf.get_weakref() +current_buf._weak_ref = copied_buf.get_weakref() +``` + +2. If the current `Buffer`(call it `current_buf`) already holds a weak-reference, this means there exists at-least one +copy of this `Buffer` already somewhere. So when we are trying to generate a copy of `current_buf`(call it `copied_buf`), +we will be storing the weak-reference that `current_buf` already has into the `copied_buf`. Next, we will generate a +weak-reference of `copied_buf` and store it in `current_buf`. This will ensure we keep the weak-references of all the +Buffer's alive(as long as `Buffer` is alive only). 
This is how we achieve it in code: +``` +copied_buf._weak_ref = current_buf._weak_ref +current_buf._weak_ref = copied_buf.get_weakref() +``` + +There is a case when `copy-on-write` mechanism will stay in-active and return true copies though `copy_on_write` cudf setting set to `True`: + +Whenever a `Column`/`Buffer` are zero-copied to a third-party library via `__cuda_array_interface__`, it +is technically not possible to know if the device data is modified without introspection. Hence whenever +someone accesses `__cuda_array_interface__` of `Column` or a `Buffer`, we trigger +`Column/Buffer._detach_refs` which will ensure a true copy of underlying device data is made and +detaches itself from pointing to the original device memory. We also mark the `Column`/`Buffer` as +`obj._zero_copied=True` thus indicating any future deep-copy requests will trigger a true-deep copy +rather than a copy-on-write shallow copy with weak-references. + + +Notes: +1. Weak-references are implemented only for fixed-width data types as these are only column +types that can be mutated in-place. +2. Deep copies of variable width data types return shallow-copies of the Columns, because these +types don't support real in-place mutations to the data. We just mimic in such a way that it looks +like an in-place operation. diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 0b2904ade94..9b06dfb9388 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -70,7 +70,7 @@ class Column: @property def children(self) -> Tuple[ColumnBase, ...]: ... def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... - def _detach_refs(self) -> None: ... + def _detach_refs(self, zero_copied=False) -> None: ... def has_a_weakref(self) -> bool: ... def _is_cai_zero_copied(self) -> bool: ... 
def _mimic_inplace( diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 8a2b5ba8c72..30530048795 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -357,7 +357,7 @@ cdef class Column: ) ) - def _detach_refs(self): + def _detach_refs(self, zero_copied=False): """ Detaches a column from it's current Buffers by making a true deep-copy. @@ -371,6 +371,10 @@ cdef class Column: self.set_base_data(new_col.base_data) self.set_base_children(new_col.base_children) self.set_base_mask(new_col.base_mask) + if self.base_data is not None: + self.base_data._zero_copied = zero_copied + if self.base_mask is not None: + self.base_mask._zero_copied = zero_copied def _mimic_inplace(self, other_col, inplace=False): """ diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 93d96b28062..72e9536e165 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -347,8 +347,7 @@ def has_a_weakref(self): # method should return False in that case as there is only # one Buffer pointing to the device memory. 
return ( - not weakref.getweakrefs(self.get_ref())[0]() - is not self.get_ref() + weakref.getweakrefs(self.get_ref())[0]() is not self.get_ref() ) else: return weakref_count > 0 @@ -387,16 +386,20 @@ def copy(self, deep: bool = True): copied_buf._weak_ref = None copied_buf._zero_copied = False - if self._weak_ref is None: - self._weak_ref = copied_buf.get_weakref() - copied_buf._weak_ref = self.get_weakref() + if self.has_a_weakref(): + # If `self` has weak-references + # we will then have to keep that + # weak-reference alive, hence + # pass it onto `copied_buf` + copied_buf._weak_ref = self._weak_ref else: - if self.has_a_weakref(): - copied_buf._weak_ref = self._weak_ref - self._weak_ref = copied_buf.get_weakref() - else: - self._weak_ref = copied_buf.get_weakref() - copied_buf._weak_ref = self.get_weakref() + # If `self` has no weak-references, + # we will have to generate a new weak-reference + # and assign it to `copied_buf` + copied_buf._weak_ref = self.get_weakref() + + self._weak_ref = copied_buf.get_weakref() + return copied_buf else: owner_copy = copy.copy(self._owner) @@ -441,18 +444,19 @@ def _cai(self) -> dict: @property def __cuda_array_interface__(self) -> dict: # Detach if there are any weak-references. - self._detach_refs() - # Mark the Buffer as ``_zero_copied=True``, + + # Mark the Buffer as ``zero_copied=True``, # which will prevent any copy-on-write # mechanism post this operation. # This is done because we don't have any # control over knowing if a third-party library # has modified the data this Buffer is # pointing to. - self._zero_copied = True + self._detach_refs(zero_copied=True) + return self._cai - def _detach_refs(self): + def _detach_refs(self, zero_copied=False): """ Detaches a Buffer from it's weak-references by making a true deep-copy. 
@@ -465,6 +469,7 @@ def _detach_refs(self): self._ptr = new_buf.ptr self._size = new_buf.size self._owner = new_buf + self._zero_copied = zero_copied def memoryview(self) -> memoryview: host_buf = bytearray(self.size) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fd97c9e018d..0b0d80e71ac 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -116,8 +116,7 @@ def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object """ - self._detach_refs() - self._zero_copied = True + self._detach_refs(zero_copied=True) return self._data_array_view diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a67a5ea7eec..64c0406b288 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -282,8 +282,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": @property def __cuda_array_interface__(self) -> Mapping[str, Any]: - self._detach_refs() - self._zero_copied = True + self._detach_refs(zero_copied=True) output = { "shape": (len(self),), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6e4d4e67513..4a3a593e1b8 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -168,8 +168,7 @@ def __setitem__(self, key: Any, value: Any): @property def __cuda_array_interface__(self) -> Mapping[str, Any]: - self._detach_refs() - self._zero_copied = True + self._detach_refs(zero_copied=True) output = { "shape": (len(self),), diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 4635d6d531b..3cc443a1526 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -495,13 +495,6 @@ def test_nans_stats(data, ops, skipna): getattr(psr, ops)(skipna=skipna), getattr(gsr, 
ops)(skipna=skipna) ) - psr = _create_pandas_series(data) - gsr = cudf.Series(data, nan_as_null=False) - # Since there is no concept of `nan_as_null` in pandas, - # nulls will be returned in the operations. So only - # testing for `skipna=True` when `nan_as_null=False` - assert_eq(getattr(psr, ops)(skipna=True), getattr(gsr, ops)(skipna=True)) - @pytest.mark.parametrize( "data", From a0d4fd4d05b93904fb790469477d2256ba096e9a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 25 Oct 2022 14:49:16 -0700 Subject: [PATCH 041/124] revert --- python/cudf/cudf/tests/test_dataframe_copy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 3de12c6a1a0..1a9098c70db 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -1,5 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - +# Copyright (c) 2018, NVIDIA CORPORATION. from copy import copy, deepcopy import numpy as np From e36e553e70bb5bcf635e26021b441edd6ee70b2a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 25 Oct 2022 17:19:34 -0700 Subject: [PATCH 042/124] Add user facing docs --- docs/cudf/source/user_guide/copy-on-write.md | 89 ++++++++++++++++++++ docs/cudf/source/user_guide/index.md | 1 + 2 files changed, 90 insertions(+) create mode 100644 docs/cudf/source/user_guide/copy-on-write.md diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md new file mode 100644 index 00000000000..b1e78cade34 --- /dev/null +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -0,0 +1,89 @@ +# Copy on write + +Copy on write enables ability to save on GPU memory usage when deep copies of a column +are made. + +## How to enable it + +i. Either by using `set_option` in `cudf`: + +```python +>>> import cudf +>>> cudf.set_option("copy_on_write", True) +``` + +ii. 
Or, by setting an environment variable ``CUDF_COPY_ON_WRITE`` to ``1``: + +```bash +export CUDF_COPY_ON_WRITE="1" +``` + + +## Making copies + +There are no additional changes required in the code to make use of copy-on-write. + +```python +>>> series = cudf.Series([1, 2, 3, 4]) +``` + +Performing a deep copy will create a new series object but pointing to the +same underlying device memory: + +```python +>>> copied_series = series.copy(deep=True) +>>> series +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> copied_series +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> +>>> +>>> series.data.ptr +140102175031296 +>>> copied_series.data.ptr +140102175031296 +``` + +But, when there is a write-operation being performed on either ``series`` or +``copied_series``, a true deep-copy is triggered: + +```python +>>> series[0:2] = 10 +>>> series +0 10 +1 10 +2 3 +3 4 +dtype: int64 +>>> copied_series +0 1 +1 2 +2 3 +3 4 +dtype: int64 +``` + +Notice that the underlying data of the two series objects now points to completely +different device objects: + +```python +>>> series.data.ptr +140102175032832 +>>> copied_series.data.ptr +140102175031296 +``` + +````{Warning} +When ``copy_on_write`` is enabled, all of the deep copies are constructed with +weak-references, and it is recommended to not hand-construct the contents of `__cuda_array_interface__`, instead please use the `series.__cuda_array_interface__` +or `series.data.__cuda_array_interface__` which will then take care of detaching any existing weak-references that a column contains.
+```` + diff --git a/docs/cudf/source/user_guide/index.md b/docs/cudf/source/user_guide/index.md index d99056f69f2..e69b4062e5e 100644 --- a/docs/cudf/source/user_guide/index.md +++ b/docs/cudf/source/user_guide/index.md @@ -14,4 +14,5 @@ cupy-interop dask-cudf options PandasCompat +copy-on-write ``` From e611c0b201cbc67fc8f7ac6386f81d49ca45bf64 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 25 Oct 2022 17:36:15 -0700 Subject: [PATCH 043/124] improvements --- docs/cudf/source/user_guide/copy-on-write.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index b1e78cade34..f0a0cb3755c 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -12,7 +12,8 @@ i. Either by using `set_option` in `cudf`: >>> cudf.set_option("copy_on_write", True) ``` -ii. Or, by setting an environment variable ``CUDF_COPY_ON_WRITE`` to ``1``: +ii. Or, by setting an environment variable ``CUDF_COPY_ON_WRITE`` to ``1`` prior to the +launch of the python interpreter: ```bash export CUDF_COPY_ON_WRITE="1" @@ -87,3 +88,12 @@ weak-references, and it is recommended to not hand-construct the contents of `__ or `series.data.__cuda_array_interface__` which will then take care of detaching any existing weak-references that a column contains. 
```` + + +## How to disable it + + +Copy on write can be disabled by setting ``copy_on_write`` cudf option to ``False``: + +```python +>>> cudf.set_option("copy_on_write", False) +``` \ No newline at end of file From 4b2fd7f3ddb65c978cfdab84b9c677b8d85061ad Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 25 Oct 2022 17:36:30 -0700 Subject: [PATCH 044/124] improvements --- docs/cudf/source/user_guide/copy-on-write.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index f0a0cb3755c..ec849417294 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -96,4 +96,4 @@ Copy on write can be disabled by setting ``copy_on_write`` cudf option to ``False ```python >>> cudf.set_option("copy_on_write", False) -``` \ No newline at end of file +``` From 27009e65aa86bb51f364564897ea46d98062d3ae Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 25 Oct 2022 17:41:13 -0700 Subject: [PATCH 045/124] update tests --- python/cudf/cudf/tests/test_index.py | 4 +++- python/cudf/cudf/tests/test_multiindex.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 4f8c35b180e..9bd44b9660a 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -248,7 +248,9 @@ def test_index_rename(): assert_eq(pds, gds) -def test_index_rename_inplace(): +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_index_rename_inplace(copy_on_write): + cudf.set_option("copy_on_write", copy_on_write) pds = pd.Index([1, 2, 3], name="asdf") gds = as_index(pds) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index f46f30d583f..f3b2f522021 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@
-760,12 +760,14 @@ def test_multiindex_copy_sem(data, levels, codes, names): ), ], ) +@pytest.mark.parametrize("copy_on_write", [True, False]) @pytest.mark.parametrize("deep", [True, False]) -def test_multiindex_copy_deep(data, deep): +def test_multiindex_copy_deep(data, copy_on_write, deep): """Test memory identity for deep copy Case1: Constructed from GroupBy, StringColumns Case2: Constructed from MultiIndex, NumericColumns """ + cudf.set_option("copy_on_write", copy_on_write) same_ref = (not deep) or cudf.get_option("copy_on_write") if isinstance(data, dict): From c8490ff72354ed74ee4bad8c2df5d668c1ed1226 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 25 Oct 2022 17:50:59 -0700 Subject: [PATCH 046/124] make get_weakref internal --- docs/cudf/source/developer_guide/library_design.md | 6 +++--- python/cudf/cudf/core/buffer.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index a575819e774..052a18b855b 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -274,8 +274,8 @@ of it(call it `copied_buf`), then we generate weak-references of both the buffer This is how we achieve this in code: ``` -copied_buf._weak_ref = current_buf.get_weakref() -current_buf._weak_ref = copied_buf.get_weakref() +copied_buf._weak_ref = current_buf._get_weakref() +current_buf._weak_ref = copied_buf._get_weakref() ``` 2. If the current `Buffer`(call it `current_buf`) already holds a weak-reference, this means there exists at-least one @@ -285,7 +285,7 @@ weak-reference of `copied_buf` and store it in `current_buf`. This will ensure w Buffer's alive(as long as `Buffer` is alive only). 
This is how we achieve it in code: ``` copied_buf._weak_ref = current_buf._weak_ref -current_buf._weak_ref = copied_buf.get_weakref() +current_buf._weak_ref = copied_buf._get_weakref() ``` There is a case when `copy-on-write` mechanism will stay in-active and return true copies though `copy_on_write` cudf setting set to `True`: diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 72e9536e165..131ea8dec84 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -352,7 +352,7 @@ def has_a_weakref(self): else: return weakref_count > 0 - def get_weakref(self): + def _get_weakref(self): """ Returns a weak-reference for the Buffer. """ @@ -396,9 +396,9 @@ def copy(self, deep: bool = True): # If `self` has no weak-references, # we will have to generate a new weak-reference # and assign it to `copied_buf` - copied_buf._weak_ref = self.get_weakref() + copied_buf._weak_ref = self._get_weakref() - self._weak_ref = copied_buf.get_weakref() + self._weak_ref = copied_buf._get_weakref() return copied_buf else: From 18146a92c431733f09924e52cfaaf0e093b8e58c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 26 Oct 2022 09:33:31 -0500 Subject: [PATCH 047/124] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- docs/cudf/source/developer_guide/library_design.md | 12 ++++++------ docs/cudf/source/user_guide/copy-on-write.md | 2 +- python/cudf/cudf/_lib/column.pyx | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 052a18b855b..72187c17be9 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -256,15 +256,15 @@ However, for performance reasons they frequently access internal attributes and ## Copy on write -Copy on write is designed to reduce memory foot-print on GPUs. 
With this feature, a copy is only really made whenever +Copy on write is designed to reduce memory footprint on GPUs. With this feature, a copy is only really made whenever there is a write operation on a column. -The core copy-on-write implementation relies in `Buffer` class. This class stores the pointer to the device memory and size. -With the help of `Buffer._ptr` and `Buffer._size` we create a unique singleton ``BufferWeakref``, which means all the new `Buffer`'s that are being created get the same ``BufferWeakref`` if they are all pointing to the same device memory. We +The core copy-on-write implementation relies on the `Buffer` class. This class stores the pointer to the device memory and size. +With the help of `Buffer._ptr` and `Buffer._size` we create a unique singleton `BufferWeakref`, which means all the new `Buffer`s that are created get the same `BufferWeakref` if they are all pointing to the same device memory. We store this ``BufferWeakref`` in `Buffer._proxy_ref`. -When ``copy_on_write`` cudf option is set to ``True`` and someone asks for a copy of a Buffer, a shallow copy of the -``Buffer`` is created which points to the same underlying device memory(same `ptr`, `size` & `owner`). Next, a weak reference +When the cudf option ``copy_on_write`` is ``True`` and a copy of a `Buffer` is obtained, a shallow copy of the +``Buffer`` is created which points to the same underlying device memory (same `ptr`, `size` & `owner`). Next, a weak reference is generated by calling ``weakref.ref``. This will later serve as an indication whether or not to make a copy when a write operation is being performed on `Column`(more on that later). There are some different scenarios on **how** the weak-references are kept alive. 
@@ -288,7 +288,7 @@ copied_buf._weak_ref = current_buf._weak_ref current_buf._weak_ref = copied_buf._get_weakref() ``` -There is a case when `copy-on-write` mechanism will stay in-active and return true copies though `copy_on_write` cudf setting set to `True`: +There is a case when copy-on-write will be inactive and return true copies even though the cudf option `copy_on_write` is `True`: Whenever a `Column`/`Buffer` are zero-copied to a third-party library via `__cuda_array_interface__`, it is technically not possible to know if the device data is modified without introspection. Hence whenever diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index ec849417294..2d44fe41a73 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -84,7 +84,7 @@ different device objects: ````{Warning} When ``copy_on_write`` is enabled, all of the deep copies are constructed with -weak-references, and it is recommended to not hand-construct the contents of `__cuda_array_interface__`, instead please use the `series.__cuda_array_interface` +weak-references, and it is recommended to not hand-construct the contents of `__cuda_array_interface__`, instead please use the `series.__cuda_array_interface__` or `series.data.__cuda_array_interface__` which will then take care of detaching any existing weak-references that a column contains. ```` diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 30530048795..90bfb80ccbc 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -359,7 +359,7 @@ cdef class Column: def _detach_refs(self, zero_copied=False): """ - Detaches a column from it's current Buffers by making + Detaches a column from its current Buffers by making a true deep-copy. 
""" if not self._is_cai_zero_copied() and self.has_a_weakref(): From 1ef83496402aceef6d4c854f7a6075a434882926 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 3 Nov 2022 05:28:19 -0700 Subject: [PATCH 048/124] align with pandas --- python/cudf/cudf/core/buffer/buffer.py | 28 ++++++++++++------------ python/cudf/cudf/core/column/column.py | 26 +++++++++++----------- python/cudf/cudf/core/column_accessor.py | 4 ++-- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 9bbcf9797da..09295529c4a 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -332,7 +332,7 @@ def copy(self, deep: bool = True): ------- Buffer """ - if deep: + if not deep: if ( cudf.get_option("copy_on_write") and not self._is_cai_zero_copied() @@ -361,20 +361,20 @@ def copy(self, deep: bool = True): return copied_buf else: - owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) - return self._from_device_memory( - cuda_array_interface_wrapper( - ptr=owner_copy.ptr, - size=owner_copy.size, - owner=owner_copy, - ) - ) + shallow_copy = Buffer.__new__(Buffer) + shallow_copy._ptr = self._ptr + shallow_copy._size = self._size + shallow_copy._owner = self._owner + return shallow_copy else: - shallow_copy = Buffer.__new__(Buffer) - shallow_copy._ptr = self._ptr - shallow_copy._size = self._size - shallow_copy._owner = self._owner - return shallow_copy + owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) + return self._from_device_memory( + cuda_array_interface_wrapper( + ptr=owner_copy.ptr, + size=owner_copy.size, + owner=owner_copy, + ) + ) @property def size(self) -> int: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a1588165ae2..424da9cb13d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -417,6 +417,8 @@ def copy(self: T, deep: bool = True) -> T: 
copies the references of the data and mask. """ if deep: + return self.force_deep_copy() + else: if ( cudf.get_option("copy_on_write") and not self._is_cai_zero_copied() @@ -440,19 +442,17 @@ def copy(self: T, deep: bool = True) -> T: ) return copied_col else: - return self.force_deep_copy() - else: - return cast( - T, - build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ), - ) + return cast( + T, + build_column( + self.base_data, + self.dtype, + mask=self.base_mask, + size=self.size, + offset=self.offset, + children=self.base_children, + ), + ) def view(self, dtype: Dtype) -> ColumnBase: """ diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 525dfde5d33..ae689b5abd7 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -307,9 +307,9 @@ def copy(self, deep=False) -> ColumnAccessor: """ Make a copy of this ColumnAccessor. 
""" - if deep: + if deep or cudf.get_option("copy_on_write"): return self.__class__( - {k: v.copy(deep=True) for k, v in self._data.items()}, + {k: v.copy(deep=deep) for k, v in self._data.items()}, multiindex=self.multiindex, level_names=self.level_names, ) From 0642c2e71d8db1fd9839ab3b4f79bcfe570b5b4d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 7 Nov 2022 09:23:26 -0800 Subject: [PATCH 049/124] fix tests --- python/cudf/cudf/tests/test_index.py | 8 ++------ python/cudf/cudf/tests/test_multiindex.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 9bd44b9660a..93063962452 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -248,18 +248,14 @@ def test_index_rename(): assert_eq(pds, gds) -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_index_rename_inplace(copy_on_write): - cudf.set_option("copy_on_write", copy_on_write) +def test_index_rename_inplace(): pds = pd.Index([1, 2, 3], name="asdf") gds = as_index(pds) # inplace=False should yield a deep copy gds_renamed_deep = gds.rename("new_name", inplace=False) - assert ( - gds_renamed_deep._values.data_ptr == gds._values.data_ptr - ) == cudf.get_option("copy_on_write") + assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr # inplace=True returns none expected_ptr = gds._values.data_ptr diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index f3b2f522021..ed7d6f1c9a8 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -768,7 +768,7 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): Case2: Constructed from MultiIndex, NumericColumns """ cudf.set_option("copy_on_write", copy_on_write) - same_ref = (not deep) or cudf.get_option("copy_on_write") + same_ref = (not deep) or (cudf.get_option("copy_on_write") and not deep) if 
isinstance(data, dict): import operator From 0dcc7cc4e2e5dcf5769332f524d285cb2c5025d5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 18 Nov 2022 08:52:36 -0800 Subject: [PATCH 050/124] changes --- docs/cudf/source/developer_guide/library_design.md | 6 +++--- docs/cudf/source/user_guide/copy-on-write.md | 10 ++++------ python/cudf/cudf/_lib/column.pyx | 4 ++++ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 3556aed6ccc..257bd7a1d1d 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -256,10 +256,10 @@ However, for performance reasons they frequently access internal attributes and ## Copy on write -Copy on write is designed to reduce memory footprint on GPUs. With this feature, a copy is only really made whenever +Copy on write is designed to reduce memory footprint on GPUs. With this feature, a copy(`.copy(deep=False)`) is only really made whenever there is a write operation on a column. -The core copy-on-write implementation relies on the `Buffer` class. This class stores the pointer to the device memory and size. +The core copy-on-write implementation relies in the `Buffer` class. This class stores the pointer to the device memory and size. With the help of `Buffer._ptr` and `Buffer._size` we create a unique singleton `BufferWeakref`, which means all the new `Buffer`s that are created get the same `BufferWeakref` if they are all pointing to the same device memory. We store this ``BufferWeakref`` in `Buffer._proxy_ref`. @@ -295,7 +295,7 @@ is technically not possible to know if the device data is modified without intro someone accesses `__cuda_array_interface__` of `Column` or a `Buffer`, we trigger `Column/Buffer._detach_refs` which will ensure a true copy of underlying device data is made and detaches itself from pointing to the original device memory. 
We also mark the `Column`/`Buffer` as -`obj._zero_copied=True` thus indicating any future deep-copy requests will trigger a true-deep copy +`obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy with weak-references. diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 2d44fe41a73..f682cf17977 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -1,6 +1,6 @@ # Copy on write -Copy on write enables ability to save on GPU memory usage when deep copies of a column +Copy on write enables ability to save on GPU memory usage when copies(`.copy(deep=False)`) of a column are made. ## How to enable it @@ -32,7 +32,7 @@ Performing a deep copy will create a new series object but pointing to the same underlying device memory: ```python ->>> copied_series = series.copy(deep=True) +>>> copied_series = series.copy(deep=False) >>> series 0 1 1 2 @@ -45,8 +45,6 @@ dtype: int64 2 3 3 4 dtype: int64 ->>> ->>> >>> series.data.ptr 140102175031296 >>> copied_series.data.ptr @@ -54,7 +52,7 @@ dtype: int64 ``` But, when there is a write-operation being performed on either ``series`` or -``copied_series``, a true deep-copy is triggered: +``copied_series``, a true physical copy of the data is created: ```python >>> series[0:2] = 10 @@ -83,7 +81,7 @@ different device objects: ``` ````{Warning} -When ``copy_on_write`` is enabled, all of the deep copies are constructed with +When ``copy_on_write`` is enabled, all of the shallow copies are constructed with weak-references, and it is recommended to not hand-construct the contents of `__cuda_array_interface__`, instead please use the `series.__cuda_array_interface__` or `series.data.__cuda_array_interface__` which will then take care of detaching any existing weak-references that a column contains. 
```` diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 1477b7065ac..561c20a5fa0 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -210,6 +210,10 @@ cdef class Column: mask = None elif ( isinstance(value, Buffer) or + # Check for Buffer instance, because + # hasattr will trigger invocation of + # `__cuda_array_interface__` which could + # be expensive. hasattr(value, "__cuda_array_interface__") ): if isinstance(value, Buffer): From c11dfde4eb8e581a73f0138c21bf80d953397ff6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 18 Nov 2022 12:57:18 -0800 Subject: [PATCH 051/124] fix --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2b3ac1052ca..d887f67ac63 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1309,7 +1309,7 @@ def column_empty_like( data=None, dtype=dtype, mask=codes.base_mask, - children=(column_empty_like(codes, dtype=codes.dtype),), + children=(codes,), size=codes.size, ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 93063962452..a3b69826d37 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -400,7 +400,7 @@ def test_index_copy_deep(idx, deep): if ( isinstance(idx, cudf.StringIndex) or not deep - or cudf.get_option("copy_on_write") + or (cudf.get_option("copy_on_write") and not deep) ): # StringColumn is immutable hence, deep copies of a # StringIndex will share the same StringColumn. 
From f36fa259114eac51aa3c92aaf795bb16981ac9c6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 18 Nov 2022 15:42:08 -0800 Subject: [PATCH 052/124] Handle more cases --- python/cudf/cudf/core/column/column.py | 18 +++++-- python/cudf/cudf/core/column/numerical.py | 3 +- python/cudf/cudf/core/series.py | 11 +++- python/cudf/cudf/tests/test_copying.py | 65 ++++++++++++++++------- 4 files changed, 70 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d887f67ac63..5f77bfd9673 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -419,10 +419,10 @@ def copy(self: T, deep: bool = True) -> T: if deep: return self.force_deep_copy() else: - if ( - cudf.get_option("copy_on_write") - and not self._is_cai_zero_copied() - ): + if cudf.get_option("copy_on_write"): + if self._is_cai_zero_copied(): + return self.force_deep_copy() + copied_col = cast( T, build_column( @@ -1754,6 +1754,8 @@ def as_column( nan_as_null: bool = None, dtype: Dtype = None, length: int = None, + copy: bool = False, + fastpath: bool = False, ): """Create a Column from an arbitrary object @@ -1838,6 +1840,14 @@ def as_column( data = as_buffer(arbitrary) col = build_column(data, dtype=current_dtype, mask=mask) + if copy: + col = col.copy(deep=True) + elif ( + fastpath + and cudf.get_option("copy_on_write") + and col.base_data is not None + ): + col.base_data._zero_copied = True if dtype is not None: col = col.astype(dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f371fa5e03f..12f8e627ccf 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -13,7 +13,6 @@ cast, ) -import cupy import numpy as np import pandas as pd @@ -726,7 +725,7 @@ def to_pandas( pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) pd_series = pd.Series(pandas_array, copy=False) elif 
str(self.dtype) in NUMERIC_TYPES and not self.has_nulls(): - pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) + pd_series = pd.Series(self.values_host, copy=False) else: pd_series = self.to_arrow().to_pandas(**kwargs) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 684f876f48e..0cfd3813b8d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -491,6 +491,7 @@ def __init__( index=None, dtype=None, name=None, + copy=False, nan_as_null=True, ): if isinstance(data, pd.Series): @@ -521,6 +522,8 @@ def __init__( if name is None: name = data.name data = data._column + if copy: + data = data.copy(deep=True) if dtype is not None: data = data.astype(dtype) @@ -539,7 +542,13 @@ def __init__( data = {} if not isinstance(data, ColumnBase): - data = column.as_column(data, nan_as_null=nan_as_null, dtype=dtype) + data = column.as_column( + data, + nan_as_null=nan_as_null, + dtype=dtype, + copy=copy, + fastpath=True, + ) else: if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 70b0b90566b..1a99e7ec829 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -59,40 +59,56 @@ def test_null_copy(): def test_series_setitem_cow(copy_on_write): cudf.set_option("copy_on_write", copy_on_write) actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=True) + new_copy = actual.copy(deep=False) actual[1] = 100 assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + if copy_on_write: + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=True) + new_copy = actual.copy(deep=False) actual[slice(0, 2, 1)] = 100 assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, 
cudf.Series([1, 2, 3, 4, 5])) + if copy_on_write: + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + if copy_on_write: + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + else: + assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) + + if copy_on_write: + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + else: + assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=True) + new_copy = actual.copy(deep=False) new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + if copy_on_write: + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) def test_multiple_series_cow(): cudf.set_option("copy_on_write", True) s = cudf.Series([10, 20, 30, 40, 50]) - s1 = s.copy(deep=True) - s2 = s.copy(deep=True) - s3 = s.copy(deep=True) - s4 = s2.copy(deep=True) - s5 = s4.copy(deep=True) - s6 = s3.copy(deep=True) + s1 = s.copy(deep=False) + s2 = s.copy(deep=False) + s3 = s.copy(deep=False) + s4 = s2.copy(deep=False) + s5 = s4.copy(deep=False) + s6 = s3.copy(deep=False) s1[0:3] = 10000 assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) @@ -129,7 +145,7 @@ def test_multiple_series_cow(): for ser in [s3]: assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - s7 = s5.copy(deep=True) + s7 = s5.copy(deep=False) assert_eq(s7, cudf.Series([10, 20, 6000, 6000, 50])) s7[1:3] = 55 assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) @@ -178,7 +194,7 @@ def test_multiple_series_cow(): def test_series_zero_copy(copy_on_write): cudf.set_option("copy_on_write", copy_on_write) s = cudf.Series([1, 2, 3, 4, 5]) - s1 = s.copy(deep=True) + s1 = s.copy(deep=False) cp_array = 
cp.asarray(s) assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) @@ -188,19 +204,28 @@ def test_series_zero_copy(copy_on_write): cp_array[0:3] = 10 assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + if copy_on_write: + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(s1, cudf.Series([10, 10, 10, 4, 5])) assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) s2 = cudf.Series(cp_array) assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) - s3 = s2.copy(deep=True) + s3 = s2.copy(deep=False) cp_array[0] = 20 assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + if copy_on_write: + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(s1, cudf.Series([20, 10, 10, 4, 5])) assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) + if copy_on_write: + assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) + else: + assert_eq(s3, cudf.Series([20, 10, 10, 4, 5])) @pytest.mark.parametrize("copy_on_write", [True, False]) From f01a017b8883411f7cb7d81509386d8ed5496ae3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 1 Dec 2022 14:39:07 -0800 Subject: [PATCH 053/124] disable spilling + cow --- python/cudf/cudf/options.py | 42 ++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index f6f056e3a45..a69b08e6b4a 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -150,6 +150,22 @@ def _validator(val): return _validator +def _make_cow_validator(valid_options): + def _validator(val): + if get_option("spill"): + raise ValueError( + "Copy on write is not supported when spilling is enabled. " + "Please set `spill` to `False`" + ) + if val not in valid_options: + raise ValueError( + f"{val} is not a valid option. " + f"Must be one of {set(valid_options)}." 
+ ) + + return _validator + + def _integer_validator(val): try: int(val) @@ -205,6 +221,19 @@ def _integer_and_none_validator(val): _make_contains_validator([None, 32, 64]), ) +_register_option( + "spill", + _env_get_bool("CUDF_SPILL", False), + textwrap.dedent( + """ + Enables spilling. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + + _register_option( "copy_on_write", os.environ.get("CUDF_COPY_ON_WRITE", "0") == "1", @@ -219,20 +248,9 @@ def _integer_and_none_validator(val): \tValid values are True or False. Default is False. """ ), - _make_contains_validator([False, True]), + _make_cow_validator([False, True]), ) -_register_option( - "spill", - _env_get_bool("CUDF_SPILL", False), - textwrap.dedent( - """ - Enables spilling. - \tValid values are True or False. Default is False. - """ - ), - _make_contains_validator([False, True]), -) _register_option( "spill_on_demand", From e18a9d900023654f698ca3bb25016e4c97c85dbb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 5 Dec 2022 09:19:52 -0800 Subject: [PATCH 054/124] fix issues --- python/cudf/cudf/_lib/column.pyi | 2 +- python/cudf/cudf/_lib/column.pyx | 8 ++++---- python/cudf/cudf/core/buffer/buffer.py | 6 +++--- python/cudf/cudf/options.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 9dfd2b41499..ce78f69be0b 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -69,7 +69,7 @@ class Column: def children(self) -> Tuple[ColumnBase, ...]: ... def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... def _detach_refs(self, zero_copied=False) -> None: ... - def has_a_weakref(self) -> bool: ... + def _has_a_weakref(self) -> bool: ... def _is_cai_zero_copied(self) -> bool: ... 
def _mimic_inplace( self, other_col: ColumnBase, inplace=False diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 784568298a9..0f56ceddc3a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -311,15 +311,15 @@ cdef class Column: self._children = None self._base_children = value - def has_a_weakref(self) -> bool: + def _has_a_weakref(self) -> bool: """ Determines if the column has a weak reference. """ return ( - self.base_data.has_a_weakref() or + self.base_data._has_a_weakref() or ( - self.base_mask.has_a_weakref() + self.base_mask._has_a_weakref() if self.base_mask else False ) @@ -346,7 +346,7 @@ cdef class Column: Detaches a column from its current Buffers by making a true deep-copy. """ - if not self._is_cai_zero_copied() and self.has_a_weakref(): + if not self._is_cai_zero_copied() and self._has_a_weakref(): new_col = self.force_deep_copy() self._offset = new_col.offset diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 09295529c4a..1c319562529 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -294,7 +294,7 @@ def get_ref(self): self._update_ref() return self._proxy_ref - def has_a_weakref(self): + def _has_a_weakref(self): """ Checks if the Buffer has a weak-reference. """ @@ -345,7 +345,7 @@ def copy(self, deep: bool = True): copied_buf._weak_ref = None copied_buf._zero_copied = False - if self.has_a_weakref(): + if self._has_a_weakref(): # If `self` has weak-references # we will then have to keep that # weak-reference alive, hence @@ -430,7 +430,7 @@ def _detach_refs(self, zero_copied=False): Detaches a Buffer from it's weak-references by making a true deep-copy. """ - if not self._zero_copied and self.has_a_weakref(): + if not self._zero_copied and self._has_a_weakref(): # make a deep copy of existing DeviceBuffer # and replace pointer to it. 
current_buf = rmm.DeviceBuffer(ptr=self.ptr, size=self.size) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index a69b08e6b4a..115128921c8 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -152,7 +152,7 @@ def _validator(val): def _make_cow_validator(valid_options): def _validator(val): - if get_option("spill"): + if get_option("spill") and val: raise ValueError( "Copy on write is not supported when spilling is enabled. " "Please set `spill` to `False`" From 7411805782fbe39e5ad66dc2fabf3987361bb4e3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 7 Dec 2022 02:23:56 -0800 Subject: [PATCH 055/124] Move copy on write logic to a separate Buffer implementation --- python/cudf/cudf/_lib/column.pyx | 12 +- python/cudf/cudf/core/buffer/__init__.py | 1 + python/cudf/cudf/core/buffer/buffer.py | 215 +------------ python/cudf/cudf/core/buffer/utils.py | 8 + .../cudf/core/buffer/weakrefable_buffer.py | 290 ++++++++++++++++++ python/cudf/cudf/core/column/column.py | 4 +- 6 files changed, 318 insertions(+), 212 deletions(-) create mode 100644 python/cudf/cudf/core/buffer/weakrefable_buffer.py diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 0f56ceddc3a..6b907163bdb 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -12,6 +12,7 @@ import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype from cudf.core.buffer import ( Buffer, + RefCountableBuffer, SpillableBuffer, SpillLock, acquire_spill_lock, @@ -317,11 +318,14 @@ cdef class Column: """ return ( - self.base_data._has_a_weakref() or ( + isinstance(self.base_data, RefCountableBuffer) and + self.base_data._has_a_weakref() + ) + or + ( + isinstance(self.base_mask, RefCountableBuffer) and self.base_mask._has_a_weakref() - if self.base_mask - else False ) ) @@ -333,10 +337,12 @@ cdef class Column: self._zero_copied or ( self.base_data is not None and + isinstance(self.base_data, 
RefCountableBuffer) and self.base_data._is_cai_zero_copied() ) or ( self.base_mask is not None and + isinstance(self.base_mask, RefCountableBuffer) and self.base_mask._is_cai_zero_copied() ) ) diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py index 49f2c57b17f..bfe1b7a4468 100644 --- a/python/cudf/cudf/core/buffer/__init__.py +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -7,3 +7,4 @@ as_buffer, get_spill_lock, ) +from cudf.core.buffer.weakrefable_buffer import RefCountableBuffer diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 1c319562529..20df9a2c7a4 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -5,7 +5,6 @@ import copy import math import pickle -import weakref from types import SimpleNamespace from typing import Any, Dict, Mapping, Sequence, Tuple, Type, TypeVar @@ -72,95 +71,6 @@ def cuda_array_interface_wrapper( ) -class BufferWeakref(object): - """ - A proxy class to be used by ``Buffer`` for generating weakreferences. - """ - - def __init__(self, ptr, size) -> None: - self.ptr = ptr - self.size = size - - -def custom_weakref_callback(ref): - """ - A callback for ``weakref.ref`` API to generate unique - weakref instances that can be counted correctly. 
- - Example below shows why this is necessary: - - In [1]: import cudf - In [2]: import weakref - - Let's create an object ``x`` that we are going to weakref: - - In [3]: x = cudf.core.buffer.BufferWeakref(1, 2) - - Now generate three weak-references of it: - - In [4]: a = weakref.ref(x) - In [5]: b = weakref.ref(x) - In [6]: c = weakref.ref(x) - - ``weakref.ref`` actually returns the same singleton object: - - In [7]: a - Out[7]: - In [8]: b - Out[8]: - In [9]: c - Out[9]: - - In [10]: a is b - Out[10]: True - In [11]: b is c - Out[11]: True - - This will be problematic as we cannot determine what is the count - of weak-references: - - In [12]: weakref.getweakrefcount(x) - Out[12]: 1 - - Notice, though we want ``weakref.getweakrefcount`` to return ``3``, it - returns ``1``. So we need to work-around this by using an empty/no-op - callback: - - In [13]: def custom_weakref_callback(ref): - ...: pass - ...: - - - In [14]: d = weakref.ref(x, custom_weakref_callback) - In [15]: e = weakref.ref(x, custom_weakref_callback) - In [16]: f = weakref.ref(x, custom_weakref_callback) - - Now there is an each unique weak-reference created: - - In [17]: d - Out[17]: - In [18]: e - Out[18]: - In [19]: f - Out[19]: - - Now calling ``weakref.getweakrefcount`` will result in ``4``, which is correct: - - In [20]: weakref.getweakrefcount(x) - Out[20]: 4 - - In [21]: d is not e - Out[21]: True - - In [22]: d is not f - Out[22]: True - - In [23]: e is not f - Out[23]: True - """ # noqa: E501 - pass - - class Buffer(Serializable): """A Buffer represents device memory. 
@@ -170,10 +80,6 @@ class Buffer(Serializable): _ptr: int _size: int _owner: object - _weak_ref: object - _proxy_ref: None | BufferWeakref - _zero_copied: bool - _refs: dict = {} def __init__(self): raise ValueError( @@ -201,9 +107,6 @@ def _from_device_memory(cls: Type[T], data: Any) -> T: # Bypass `__init__` and initialize attributes manually ret = cls.__new__(cls) ret._owner = data - ret._weak_ref = None - ret._proxy_ref = None - ret._zero_copied = False if isinstance(data, rmm.DeviceBuffer): # Common case shortcut ret._ptr = data.ptr ret._size = data.size @@ -213,7 +116,6 @@ def _from_device_memory(cls: Type[T], data: Any) -> T: ) if ret.size < 0: raise ValueError("size cannot be negative") - ret._update_ref() return ret @classmethod @@ -270,53 +172,6 @@ def __getitem__(self, key: slice) -> Buffer: raise ValueError("slice must be C-contiguous") return self._getitem(offset=start, size=stop - start) - def _is_cai_zero_copied(self): - """ - Returns a flag, that indicates if the Buffer has been zero-copied. - """ - return self._zero_copied - - def _update_ref(self): - """ - Generate the new proxy reference. - """ - if (self._ptr, self._size) not in Buffer._refs: - Buffer._refs[(self._ptr, self._size)] = BufferWeakref( - self._ptr, self._size - ) - self._proxy_ref = Buffer._refs[(self._ptr, self._size)] - - def get_ref(self): - """ - Returns the proxy reference. - """ - if self._proxy_ref is None: - self._update_ref() - return self._proxy_ref - - def _has_a_weakref(self): - """ - Checks if the Buffer has a weak-reference. - """ - weakref_count = weakref.getweakrefcount(self.get_ref()) - - if weakref_count == 1: - # When the weakref_count is 1, it could be a possibility - # that a copied Buffer got destroyed and hence this - # method should return False in that case as there is only - # one Buffer pointing to the device memory. 
- return ( - weakref.getweakrefs(self.get_ref())[0]() is not self.get_ref() - ) - else: - return weakref_count > 0 - - def _get_weakref(self): - """ - Returns a weak-reference for the Buffer. - """ - return weakref.ref(self.get_ref(), custom_weakref_callback) - def copy(self, deep: bool = True): """ Return a copy of Buffer. @@ -333,39 +188,11 @@ def copy(self, deep: bool = True): Buffer """ if not deep: - if ( - cudf.get_option("copy_on_write") - and not self._is_cai_zero_copied() - ): - copied_buf = Buffer.__new__(Buffer) - copied_buf._ptr = self._ptr - copied_buf._size = self._size - copied_buf._owner = self._owner - copied_buf._proxy_ref = None - copied_buf._weak_ref = None - copied_buf._zero_copied = False - - if self._has_a_weakref(): - # If `self` has weak-references - # we will then have to keep that - # weak-reference alive, hence - # pass it onto `copied_buf` - copied_buf._weak_ref = self._weak_ref - else: - # If `self` has no weak-references, - # we will have to generate a new weak-reference - # and assign it to `copied_buf` - copied_buf._weak_ref = self._get_weakref() - - self._weak_ref = copied_buf._get_weakref() - - return copied_buf - else: - shallow_copy = Buffer.__new__(Buffer) - shallow_copy._ptr = self._ptr - shallow_copy._size = self._size - shallow_copy._owner = self._owner - return shallow_copy + shallow_copy = Buffer.__new__(Buffer) + shallow_copy._ptr = self._ptr + shallow_copy._size = self._size + shallow_copy._owner = self._owner + return shallow_copy else: owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) return self._from_device_memory( @@ -397,7 +224,7 @@ def owner(self) -> Any: return self._owner @property - def _cai(self) -> dict: + def __cuda_array_interface__(self) -> dict: """ Internal Implementation for the CUDA Array Interface without triggering a deepcopy. @@ -411,34 +238,8 @@ def _cai(self) -> dict: } @property - def __cuda_array_interface__(self) -> dict: - # Detach if there are any weak-references. 
- - # Mark the Buffer as ``zero_copied=True``, - # which will prevent any copy-on-write - # mechanism post this operation. - # This is done because we don't have any - # control over knowing if a third-party library - # has modified the data this Buffer is - # pointing to. - self._detach_refs(zero_copied=True) - - return self._cai - - def _detach_refs(self, zero_copied=False): - """ - Detaches a Buffer from it's weak-references by making - a true deep-copy. - """ - if not self._zero_copied and self._has_a_weakref(): - # make a deep copy of existing DeviceBuffer - # and replace pointer to it. - current_buf = rmm.DeviceBuffer(ptr=self.ptr, size=self.size) - new_buf = current_buf.copy() - self._ptr = new_buf.ptr - self._size = new_buf.size - self._owner = new_buf - self._zero_copied = zero_copied + def _cai(self) -> dict: + return self.__cuda_array_interface__ def memoryview(self) -> memoryview: """Read-only access to the buffer through host memory.""" diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 062e86d0cb1..8dc1d6db194 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -9,6 +9,8 @@ from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock +from cudf.core.buffer.weakrefable_buffer import RefCountableBuffer +from cudf.options import get_option def as_buffer( @@ -71,6 +73,12 @@ def as_buffer( "`data` is a buffer-like or array-like object" ) + if get_option("copy_on_write"): + if isinstance(data, (Buffer, RefCountableBuffer)) or hasattr( + data, "__cuda_array_interface__" + ): + return RefCountableBuffer._from_device_memory(data) + return RefCountableBuffer._from_host_memory(data) if get_global_manager() is not None: if hasattr(data, "__cuda_array_interface__"): return SpillableBuffer._from_device_memory(data, 
exposed=exposed) diff --git a/python/cudf/cudf/core/buffer/weakrefable_buffer.py b/python/cudf/cudf/core/buffer/weakrefable_buffer.py new file mode 100644 index 00000000000..33c33c830c4 --- /dev/null +++ b/python/cudf/cudf/core/buffer/weakrefable_buffer.py @@ -0,0 +1,290 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from __future__ import annotations + +import copy +import weakref +from typing import Any, Type, TypeVar + +import rmm + +import cudf +from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper + +T = TypeVar("T", bound="RefCountableBuffer") + + +class BufferWeakref(object): + """ + A proxy class to be used by ``Buffer`` for generating weakreferences. + """ + + def __init__(self, ptr, size) -> None: + self.ptr = ptr + self.size = size + + +def custom_weakref_callback(ref): + """ + A callback for ``weakref.ref`` API to generate unique + weakref instances that can be counted correctly. + + Example below shows why this is necessary: + + In [1]: import cudf + In [2]: import weakref + + Let's create an object ``x`` that we are going to weakref: + + In [3]: x = cudf.core.buffer.BufferWeakref(1, 2) + + Now generate three weak-references of it: + + In [4]: a = weakref.ref(x) + In [5]: b = weakref.ref(x) + In [6]: c = weakref.ref(x) + + ``weakref.ref`` actually returns the same singleton object: + + In [7]: a + Out[7]: + In [8]: b + Out[8]: + In [9]: c + Out[9]: + + In [10]: a is b + Out[10]: True + In [11]: b is c + Out[11]: True + + This will be problematic as we cannot determine what is the count + of weak-references: + + In [12]: weakref.getweakrefcount(x) + Out[12]: 1 + + Notice, though we want ``weakref.getweakrefcount`` to return ``3``, it + returns ``1``. 
So we need to work-around this by using an empty/no-op + callback: + + In [13]: def custom_weakref_callback(ref): + ...: pass + ...: + + + In [14]: d = weakref.ref(x, custom_weakref_callback) + In [15]: e = weakref.ref(x, custom_weakref_callback) + In [16]: f = weakref.ref(x, custom_weakref_callback) + + Now there is an each unique weak-reference created: + + In [17]: d + Out[17]: + In [18]: e + Out[18]: + In [19]: f + Out[19]: + + Now calling ``weakref.getweakrefcount`` will result in ``4``, which is correct: + + In [20]: weakref.getweakrefcount(x) + Out[20]: 4 + + In [21]: d is not e + Out[21]: True + + In [22]: d is not f + Out[22]: True + + In [23]: e is not f + Out[23]: True + """ # noqa: E501 + pass + + +class RefCountableBuffer(Buffer): + """A Buffer represents device memory. + + Use the factory function `as_buffer` to create a Buffer instance. + """ + + _weak_ref: object + _proxy_ref: None | BufferWeakref + _zero_copied: bool + _refs: dict = {} + + @classmethod + def _from_device_memory(cls: Type[T], data: Any) -> T: + """Create a Buffer from an object exposing `__cuda_array_interface__`. + + No data is being copied. + + Parameters + ---------- + data : device-buffer-like + An object implementing the CUDA Array Interface. + + Returns + ------- + Buffer + Buffer representing the same device memory as `data` + """ + + # Bypass `__init__` and initialize attributes manually + ret = super()._from_device_memory(data) + ret._weak_ref = None + ret._proxy_ref = None + ret._zero_copied = False + ret._update_ref() + return ret + + def _is_cai_zero_copied(self): + """ + Returns a flag, that indicates if the Buffer has been zero-copied. + """ + return self._zero_copied + + def _update_ref(self): + """ + Generate the new proxy reference. 
+ """ + if (self._ptr, self._size) not in RefCountableBuffer._refs: + RefCountableBuffer._refs[(self._ptr, self._size)] = BufferWeakref( + self._ptr, self._size + ) + self._proxy_ref = RefCountableBuffer._refs[(self._ptr, self._size)] + + def get_ref(self): + """ + Returns the proxy reference. + """ + if self._proxy_ref is None: + self._update_ref() + return self._proxy_ref + + def _has_a_weakref(self): + """ + Checks if the Buffer has a weak-reference. + """ + weakref_count = weakref.getweakrefcount(self.get_ref()) + + if weakref_count == 1: + # When the weakref_count is 1, it could be a possibility + # that a copied Buffer got destroyed and hence this + # method should return False in that case as there is only + # one Buffer pointing to the device memory. + return ( + weakref.getweakrefs(self.get_ref())[0]() is not self.get_ref() + ) + else: + return weakref_count > 0 + + def _get_weakref(self): + """ + Returns a weak-reference for the Buffer. + """ + return weakref.ref(self.get_ref(), custom_weakref_callback) + + def copy(self, deep: bool = True): + """ + Return a copy of Buffer. + + Parameters + ---------- + deep : bool, default True + If True, returns a deep-copy of the underlying Buffer data. + If False, returns a shallow-copy of the Buffer pointing to + the same underlying data. 
+ + Returns + ------- + Buffer + """ + if not deep: + if ( + cudf.get_option("copy_on_write") + and not self._is_cai_zero_copied() + ): + copied_buf = RefCountableBuffer.__new__(RefCountableBuffer) + copied_buf._ptr = self._ptr + copied_buf._size = self._size + copied_buf._owner = self._owner + copied_buf._proxy_ref = None + copied_buf._weak_ref = None + copied_buf._zero_copied = False + + if self._has_a_weakref(): + # If `self` has weak-references + # we will then have to keep that + # weak-reference alive, hence + # pass it onto `copied_buf` + copied_buf._weak_ref = self._weak_ref + else: + # If `self` has no weak-references, + # we will have to generate a new weak-reference + # and assign it to `copied_buf` + copied_buf._weak_ref = self._get_weakref() + + self._weak_ref = copied_buf._get_weakref() + + return copied_buf + else: + shallow_copy = RefCountableBuffer.__new__(RefCountableBuffer) + shallow_copy._ptr = self._ptr + shallow_copy._size = self._size + shallow_copy._owner = self._owner + return shallow_copy + else: + owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) + return self._from_device_memory( + cuda_array_interface_wrapper( + ptr=owner_copy.ptr, + size=owner_copy.size, + owner=owner_copy, + ) + ) + + @property + def _cai(self) -> dict: + """ + Internal Implementation for the CUDA Array Interface without + triggering a deepcopy. + """ + return { + "data": (self.ptr, False), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } + + @property + def __cuda_array_interface__(self) -> dict: + # Detach if there are any weak-references. + + # Mark the Buffer as ``zero_copied=True``, + # which will prevent any copy-on-write + # mechanism post this operation. + # This is done because we don't have any + # control over knowing if a third-party library + # has modified the data this Buffer is + # pointing to. 
+ self._detach_refs(zero_copied=True) + + return self._cai + + def _detach_refs(self, zero_copied=False): + """ + Detaches a Buffer from it's weak-references by making + a true deep-copy. + """ + if not self._zero_copied and self._has_a_weakref(): + # make a deep copy of existing DeviceBuffer + # and replace pointer to it. + current_buf = rmm.DeviceBuffer(ptr=self.ptr, size=self.size) + new_buf = current_buf.copy() + self._ptr = new_buf.ptr + self._size = new_buf.size + self._owner = new_buf + self._zero_copied = zero_copied diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9c053dfe6a8..1d77d716fc3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,7 @@ ) from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer, as_buffer +from cudf.core.buffer import Buffer, RefCountableBuffer, as_buffer from cudf.core.dtypes import ( CategoricalDtype, IntervalDtype, @@ -1912,7 +1912,7 @@ def as_column( elif ( fastpath and cudf.get_option("copy_on_write") - and col.base_data is not None + and isinstance(col.base_data, RefCountableBuffer) ): col.base_data._zero_copied = True From e72eaa3b5bc6d57a39d6e9f3e3dfcaec6ee6c65a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 7 Dec 2022 07:52:01 -0800 Subject: [PATCH 056/124] Use CachedInstanceMeta to ensure BufferWeakref is a singleton --- .../cudf/core/buffer/weakrefable_buffer.py | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/buffer/weakrefable_buffer.py b/python/cudf/cudf/core/buffer/weakrefable_buffer.py index 33c33c830c4..3759407e05c 100644 --- a/python/cudf/cudf/core/buffer/weakrefable_buffer.py +++ b/python/cudf/cudf/core/buffer/weakrefable_buffer.py @@ -4,7 +4,7 @@ import copy import weakref -from typing import Any, Type, TypeVar +from typing import Any, Dict, Type, TypeVar import rmm @@ -14,7 
+14,38 @@ T = TypeVar("T", bound="RefCountableBuffer") -class BufferWeakref(object): +class CachedInstanceMeta(type): + """ + Metaclass for BufferWeakref, which will ensure creation + of singleton instance. + """ + + __instances: Dict[int, int] = {} + + def __call__(cls, ptr, size): + cache_key = (ptr, size) + try: + # try retrieving an instance from the cache: + return cls.__instances[cache_key] + except KeyError: + # if an instance couldn't be found in the cache, + # construct it and add to cache: + obj = super().__call__(ptr, size) + try: + cls.__instances[cache_key] = obj + except TypeError: + # couldn't hash the arguments, don't cache: + return obj + return obj + except TypeError: + # couldn't hash the arguments, don't cache: + return super().__call__(ptr, size) + + def _clear_instance_cache(cls): + cls.__instances.clear() + + +class BufferWeakref(object, metaclass=CachedInstanceMeta): """ A proxy class to be used by ``Buffer`` for generating weakreferences. """ From 978f3796e532f1eefea78e656a92f197825924b7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 7 Dec 2022 07:54:22 -0800 Subject: [PATCH 057/124] type --- python/cudf/cudf/core/buffer/weakrefable_buffer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/buffer/weakrefable_buffer.py b/python/cudf/cudf/core/buffer/weakrefable_buffer.py index 3759407e05c..cf71de1543a 100644 --- a/python/cudf/cudf/core/buffer/weakrefable_buffer.py +++ b/python/cudf/cudf/core/buffer/weakrefable_buffer.py @@ -4,7 +4,7 @@ import copy import weakref -from typing import Any, Dict, Type, TypeVar +from typing import Any, Dict, Tuple, Type, TypeVar import rmm @@ -20,7 +20,7 @@ class CachedInstanceMeta(type): of singleton instance. 
""" - __instances: Dict[int, int] = {} + __instances: Dict[Tuple, BufferWeakref] = {} def __call__(cls, ptr, size): cache_key = (ptr, size) From 87f7641bce818fc59e973968c3dc8a4e61bbb993 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 7 Dec 2022 08:22:13 -0800 Subject: [PATCH 058/124] read only cai --- python/cudf/cudf/core/buffer/buffer.py | 15 ++++++++++++--- .../cudf/cudf/core/buffer/weakrefable_buffer.py | 6 ++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 20df9a2c7a4..2456fab7261 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -226,8 +226,7 @@ def owner(self) -> Any: @property def __cuda_array_interface__(self) -> dict: """ - Internal Implementation for the CUDA Array Interface without - triggering a deepcopy. + Implementation for the CUDA Array Interface. """ return { "data": (self.ptr, False), @@ -239,7 +238,17 @@ def __cuda_array_interface__(self) -> dict: @property def _cai(self) -> dict: - return self.__cuda_array_interface__ + """ + Internal Implementation for the CUDA Array Interface which is + read-only. + """ + return { + "data": (self.ptr, True), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } def memoryview(self) -> memoryview: """Read-only access to the buffer through host memory.""" diff --git a/python/cudf/cudf/core/buffer/weakrefable_buffer.py b/python/cudf/cudf/core/buffer/weakrefable_buffer.py index cf71de1543a..c4e44be0084 100644 --- a/python/cudf/cudf/core/buffer/weakrefable_buffer.py +++ b/python/cudf/cudf/core/buffer/weakrefable_buffer.py @@ -283,7 +283,7 @@ def _cai(self) -> dict: triggering a deepcopy. """ return { - "data": (self.ptr, False), + "data": (self.ptr, True), "shape": (self.size,), "strides": None, "typestr": "|u1", @@ -303,7 +303,9 @@ def __cuda_array_interface__(self) -> dict: # pointing to. 
self._detach_refs(zero_copied=True) - return self._cai + result = self._cai + result["data"] = (self.ptr, False) + return result def _detach_refs(self, zero_copied=False): """ From 057967b2115efd3011d589a77e359cfefacee674 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 7 Dec 2022 09:10:58 -0800 Subject: [PATCH 059/124] add slots --- .../cudf/cudf/core/buffer/weakrefable_buffer.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/buffer/weakrefable_buffer.py b/python/cudf/cudf/core/buffer/weakrefable_buffer.py index c4e44be0084..36ab8c2de57 100644 --- a/python/cudf/cudf/core/buffer/weakrefable_buffer.py +++ b/python/cudf/cudf/core/buffer/weakrefable_buffer.py @@ -45,11 +45,13 @@ def _clear_instance_cache(cls): cls.__instances.clear() -class BufferWeakref(object, metaclass=CachedInstanceMeta): +class BufferWeakref(metaclass=CachedInstanceMeta): """ A proxy class to be used by ``Buffer`` for generating weakreferences. """ + __slots__ = ("ptr", "size", "__weakref__") + def __init__(self, ptr, size) -> None: self.ptr = ptr self.size = size @@ -142,8 +144,9 @@ class RefCountableBuffer(Buffer): _weak_ref: object _proxy_ref: None | BufferWeakref + # TODO: This is synonymous to SpillableBuffer._exposed attribute + # and has to be merged. _zero_copied: bool - _refs: dict = {} @classmethod def _from_device_memory(cls: Type[T], data: Any) -> T: @@ -180,11 +183,9 @@ def _update_ref(self): """ Generate the new proxy reference. """ - if (self._ptr, self._size) not in RefCountableBuffer._refs: - RefCountableBuffer._refs[(self._ptr, self._size)] = BufferWeakref( - self._ptr, self._size - ) - self._proxy_ref = RefCountableBuffer._refs[(self._ptr, self._size)] + # TODO: See if this can be merged into spill-lock + # once spilling and copy on write are merged. 
+ self._proxy_ref = BufferWeakref(self._ptr, self._size) def get_ref(self): """ From 082202f177bf049416bf3e9808904a2183a554be Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 7 Dec 2022 14:23:45 -0800 Subject: [PATCH 060/124] more validation --- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/options.py | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 6b907163bdb..8066858f422 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -217,7 +217,7 @@ cdef class Column: # be expensive. hasattr(value, "__cuda_array_interface__") ): - if isinstance(value, Buffer): + if isinstance(value, RefCountableBuffer): value = SimpleNamespace( __cuda_array_interface__=value._cai, owner=value diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 115128921c8..54fedb7b23e 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -166,6 +166,25 @@ def _validator(val): return _validator +def _make_spill_validator(valid_options): + def _validator(val): + try: + if get_option("copy_on_write") and val: + raise ValueError( + "Spilling is not supported when copy on write is enabled. " + "Please set `copy_on_write` to `False`" + ) + except KeyError: + pass + if val not in valid_options: + raise ValueError( + f"{val} is not a valid option. " + f"Must be one of {set(valid_options)}." + ) + + return _validator + + def _integer_validator(val): try: int(val) @@ -230,7 +249,7 @@ def _integer_and_none_validator(val): \tValid values are True or False. Default is False. 
""" ), - _make_contains_validator([False, True]), + _make_spill_validator([False, True]), ) From b55f03981c594ee03a56d741e3fe1b0bb52baebc Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 7 Dec 2022 18:09:58 -0800 Subject: [PATCH 061/124] More validation and rename cai to readonly cai --- python/cudf/cudf/_lib/column.pyx | 4 ++- python/cudf/cudf/core/buffer/buffer.py | 2 +- .../cudf/core/buffer/weakrefable_buffer.py | 4 +-- python/cudf/cudf/core/column/column.py | 30 +++++++++---------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 8066858f422..4420c2bef7a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -219,7 +219,9 @@ cdef class Column: ): if isinstance(value, RefCountableBuffer): value = SimpleNamespace( - __cuda_array_interface__=value._cai, + __cuda_array_interface__=( + value._cuda_array_interface_readonly + ), owner=value ) if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 2456fab7261..1e69bec6365 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -237,7 +237,7 @@ def __cuda_array_interface__(self) -> dict: } @property - def _cai(self) -> dict: + def _cuda_array_interface_readonly(self) -> dict: """ Internal Implementation for the CUDA Array Interface which is read-only. 
diff --git a/python/cudf/cudf/core/buffer/weakrefable_buffer.py b/python/cudf/cudf/core/buffer/weakrefable_buffer.py index 36ab8c2de57..4eefd3f991a 100644 --- a/python/cudf/cudf/core/buffer/weakrefable_buffer.py +++ b/python/cudf/cudf/core/buffer/weakrefable_buffer.py @@ -278,7 +278,7 @@ def copy(self, deep: bool = True): ) @property - def _cai(self) -> dict: + def _cuda_array_interface_readonly(self) -> dict: """ Internal Implementation for the CUDA Array Interface without triggering a deepcopy. @@ -304,7 +304,7 @@ def __cuda_array_interface__(self) -> dict: # pointing to. self._detach_refs(zero_copied=True) - result = self._cai + result = self._cuda_array_interface_readonly result["data"] = (self.ptr, False) return result diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1d77d716fc3..8a07bfc5e49 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -128,14 +128,13 @@ def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": Internal implementation for viewing the data as a device array object without triggering a deep-copy. """ - return cuda.as_cuda_array( - SimpleNamespace( - __cuda_array_interface__=self.data._cai - if self.data is not None - else None, - owner=self.data, - ) - ).view(self.dtype) + arr_obj = SimpleNamespace( + __cuda_array_interface__=self.data._cuda_array_interface_readonly + if self.data is not None + else None, + owner=self.data, + ) + return cuda.as_cuda_array(arr_obj).view(self.dtype) @property def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -143,14 +142,13 @@ def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": Internal implementation for viewing the mask as a device array object without triggering a deep-copy. 
""" - return cuda.as_cuda_array( - SimpleNamespace( - __cuda_array_interface__=self.mask._cai - if self.mask is not None - else None, - owner=self.mask, - ) - ).view(mask_dtype) + arr_obj = SimpleNamespace( + __cuda_array_interface__=self.mask._cuda_array_interface_readonly + if self.mask is not None + else None, + owner=self.mask, + ) + return cuda.as_cuda_array(arr_obj).view(mask_dtype) @property def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": From 35a64a5f1f9119eb71e44cbed6cf03f1592dca8f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 8 Dec 2022 13:22:36 -0500 Subject: [PATCH 062/124] Refactor 1 --- python/cudf/cudf/_lib/column.pyi | 4 +- python/cudf/cudf/_lib/column.pyx | 57 +++---- python/cudf/cudf/core/buffer/__init__.py | 2 +- python/cudf/cudf/core/buffer/utils.py | 8 +- .../cudf/core/buffer/weakrefable_buffer.py | 141 +++++------------- python/cudf/cudf/core/column/column.py | 6 +- 6 files changed, 71 insertions(+), 147 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index ce78f69be0b..84fd12c02b0 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -69,8 +69,8 @@ class Column: def children(self) -> Tuple[ColumnBase, ...]: ... def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... def _detach_refs(self, zero_copied=False) -> None: ... - def _has_a_weakref(self) -> bool: ... - def _is_cai_zero_copied(self) -> bool: ... + def _buffers_shallow_copied(self) -> bool: ... + def _buffers_zero_copied(self) -> bool: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False ) -> Optional[ColumnBase]: ... 
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 4420c2bef7a..ffd95f670c4 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -12,7 +12,7 @@ import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype from cudf.core.buffer import ( Buffer, - RefCountableBuffer, + CopyOnWriteBuffer, SpillableBuffer, SpillLock, acquire_spill_lock, @@ -74,7 +74,6 @@ cdef class Column: self.set_base_children(children) self.set_base_data(data) self.set_base_mask(mask) - self._zero_copied = False @property def base_size(self): @@ -217,7 +216,7 @@ cdef class Column: # be expensive. hasattr(value, "__cuda_array_interface__") ): - if isinstance(value, RefCountableBuffer): + if isinstance(value, CopyOnWriteBuffer): value = SimpleNamespace( __cuda_array_interface__=( value._cuda_array_interface_readonly @@ -314,49 +313,39 @@ cdef class Column: self._children = None self._base_children = value - def _has_a_weakref(self) -> bool: + def _buffers_shallow_copied(self) -> bool: """ - Determines if the column has a weak reference. + Determines if any of the buffers underneath the column + have been shallow copied """ - - return ( - ( - isinstance(self.base_data, RefCountableBuffer) and - self.base_data._has_a_weakref() - ) - or - ( - isinstance(self.base_mask, RefCountableBuffer) and - self.base_mask._has_a_weakref() - ) + data_shallow_copied = ( + isinstance(self.base_data, CopyOnWriteBuffer) and + self.base_data._shallow_copied() ) + mask_shallow_copied = ( + isinstance(self.base_mask, CopyOnWriteBuffer) and + self.base_mask._shallow_copied() + ) + return mask_shallow_copied or data_shallow_copied - def _is_cai_zero_copied(self) -> bool: - """ - Determines if the column is zero copied. 
- """ - return ( - self._zero_copied or - ( - self.base_data is not None and - isinstance(self.base_data, RefCountableBuffer) and - self.base_data._is_cai_zero_copied() - ) or - ( - self.base_mask is not None and - isinstance(self.base_mask, RefCountableBuffer) and - self.base_mask._is_cai_zero_copied() - ) + def _buffers_zero_copied(self): + data_zero_copied = ( + isinstance(self.base_data, CopyOnWriteBuffer) and + self.base_data._zero_copied ) + mask_zero_copied = ( + isinstance(self.base_mask, CopyOnWriteBuffer) and + self.base_mask._zero_copied + ) + return data_zero_copied or mask_zero_copied def _detach_refs(self, zero_copied=False): """ Detaches a column from its current Buffers by making a true deep-copy. """ - if not self._is_cai_zero_copied() and self._has_a_weakref(): + if not self._buffers_zero_copied() and self._buffers_shallow_copied(): new_col = self.force_deep_copy() - self._offset = new_col.offset self._size = new_col.size self._dtype = new_col._dtype diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py index bfe1b7a4468..e6c46d5f8b7 100644 --- a/python/cudf/cudf/core/buffer/__init__.py +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -7,4 +7,4 @@ as_buffer, get_spill_lock, ) -from cudf.core.buffer.weakrefable_buffer import RefCountableBuffer +from cudf.core.buffer.weakrefable_buffer import CopyOnWriteBuffer diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 8dc1d6db194..2ee1f041a2f 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -9,7 +9,7 @@ from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock -from cudf.core.buffer.weakrefable_buffer import RefCountableBuffer +from cudf.core.buffer.weakrefable_buffer import CopyOnWriteBuffer from cudf.options import 
get_option @@ -74,11 +74,11 @@ def as_buffer( ) if get_option("copy_on_write"): - if isinstance(data, (Buffer, RefCountableBuffer)) or hasattr( + if isinstance(data, (Buffer, CopyOnWriteBuffer)) or hasattr( data, "__cuda_array_interface__" ): - return RefCountableBuffer._from_device_memory(data) - return RefCountableBuffer._from_host_memory(data) + return CopyOnWriteBuffer._from_device_memory(data) + return CopyOnWriteBuffer._from_host_memory(data) if get_global_manager() is not None: if hasattr(data, "__cuda_array_interface__"): return SpillableBuffer._from_device_memory(data, exposed=exposed) diff --git a/python/cudf/cudf/core/buffer/weakrefable_buffer.py b/python/cudf/cudf/core/buffer/weakrefable_buffer.py index 4eefd3f991a..313c8474afc 100644 --- a/python/cudf/cudf/core/buffer/weakrefable_buffer.py +++ b/python/cudf/cudf/core/buffer/weakrefable_buffer.py @@ -11,43 +11,33 @@ import cudf from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper -T = TypeVar("T", bound="RefCountableBuffer") +T = TypeVar("T", bound="CopyOnWriteBuffer") -class CachedInstanceMeta(type): +class _InstanceCountableMeta(type): """ - Metaclass for BufferWeakref, which will ensure creation - of singleton instance. + Metaclass that allows counting the number of instances that are + constructed with the same `ptr` and `size`. 
""" - __instances: Dict[Tuple, BufferWeakref] = {} + __weakrefs: Dict[Tuple, Any] = {} def __call__(cls, ptr, size): - cache_key = (ptr, size) - try: - # try retrieving an instance from the cache: - return cls.__instances[cache_key] - except KeyError: - # if an instance couldn't be found in the cache, - # construct it and add to cache: - obj = super().__call__(ptr, size) - try: - cls.__instances[cache_key] = obj - except TypeError: - # couldn't hash the arguments, don't cache: - return obj - return obj - except TypeError: - # couldn't hash the arguments, don't cache: - return super().__call__(ptr, size) - - def _clear_instance_cache(cls): - cls.__instances.clear() - - -class BufferWeakref(metaclass=CachedInstanceMeta): + obj = super().__call__(ptr, size) + key = (ptr, size) + if key not in cls.__weakrefs: + cls.__weakrefs[key] = weakref.WeakSet() + cls.__weakrefs[key].add(obj) + return obj + + def _instance_count(cls, ptr, size): + return len(cls.__weakrefs[(ptr, size)]) + + +class _BufferInstanceProxy(metaclass=_InstanceCountableMeta): """ - A proxy class to be used by ``Buffer`` for generating weakreferences. + A proxy class used to count the number of instances of a `Buffer` + constructed with the same `ptr` and `size`. """ __slots__ = ("ptr", "size", "__weakref__") @@ -56,6 +46,9 @@ def __init__(self, ptr, size) -> None: self.ptr = ptr self.size = size + def instance_count(self): + return self.__class__._instance_count(self.ptr, self.size) + def custom_weakref_callback(ref): """ @@ -136,14 +129,13 @@ def custom_weakref_callback(ref): pass -class RefCountableBuffer(Buffer): +class CopyOnWriteBuffer(Buffer): """A Buffer represents device memory. Use the factory function `as_buffer` to create a Buffer instance. """ - _weak_ref: object - _proxy_ref: None | BufferWeakref + _proxy_ref: _BufferInstanceProxy # TODO: This is synonymous to SpillableBuffer._exposed attribute # and has to be merged. 
_zero_copied: bool @@ -167,56 +159,12 @@ def _from_device_memory(cls: Type[T], data: Any) -> T: # Bypass `__init__` and initialize attributes manually ret = super()._from_device_memory(data) - ret._weak_ref = None - ret._proxy_ref = None + ret._proxy_ref = _BufferInstanceProxy(ret._ptr, ret._size) ret._zero_copied = False - ret._update_ref() return ret - def _is_cai_zero_copied(self): - """ - Returns a flag, that indicates if the Buffer has been zero-copied. - """ - return self._zero_copied - - def _update_ref(self): - """ - Generate the new proxy reference. - """ - # TODO: See if this can be merged into spill-lock - # once spilling and copy on write are merged. - self._proxy_ref = BufferWeakref(self._ptr, self._size) - - def get_ref(self): - """ - Returns the proxy reference. - """ - if self._proxy_ref is None: - self._update_ref() - return self._proxy_ref - - def _has_a_weakref(self): - """ - Checks if the Buffer has a weak-reference. - """ - weakref_count = weakref.getweakrefcount(self.get_ref()) - - if weakref_count == 1: - # When the weakref_count is 1, it could be a possibility - # that a copied Buffer got destroyed and hence this - # method should return False in that case as there is only - # one Buffer pointing to the device memory. - return ( - weakref.getweakrefs(self.get_ref())[0]() is not self.get_ref() - ) - else: - return weakref_count > 0 - - def _get_weakref(self): - """ - Returns a weak-reference for the Buffer. 
- """ - return weakref.ref(self.get_ref(), custom_weakref_callback) + def _shallow_copied(self): + return self._proxy_ref.instance_count() > 1 def copy(self, deep: bool = True): """ @@ -234,38 +182,24 @@ def copy(self, deep: bool = True): Buffer """ if not deep: - if ( - cudf.get_option("copy_on_write") - and not self._is_cai_zero_copied() - ): - copied_buf = RefCountableBuffer.__new__(RefCountableBuffer) + if cudf.get_option("copy_on_write") and not self._zero_copied: + copied_buf = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) copied_buf._ptr = self._ptr copied_buf._size = self._size copied_buf._owner = self._owner - copied_buf._proxy_ref = None - copied_buf._weak_ref = None copied_buf._zero_copied = False - - if self._has_a_weakref(): - # If `self` has weak-references - # we will then have to keep that - # weak-reference alive, hence - # pass it onto `copied_buf` - copied_buf._weak_ref = self._weak_ref - else: - # If `self` has no weak-references, - # we will have to generate a new weak-reference - # and assign it to `copied_buf` - copied_buf._weak_ref = self._get_weakref() - - self._weak_ref = copied_buf._get_weakref() - + # make the `_proxy_ref` of the copy a new instance: + copied_buf._proxy_ref = _BufferInstanceProxy( + self._ptr, self._size + ) return copied_buf else: - shallow_copy = RefCountableBuffer.__new__(RefCountableBuffer) + shallow_copy = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) shallow_copy._ptr = self._ptr shallow_copy._size = self._size shallow_copy._owner = self._owner + # when shallow copying, don't make a new instance: + shallow_copy._proxy_ref = self._proxy_ref return shallow_copy else: owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) @@ -313,12 +247,13 @@ def _detach_refs(self, zero_copied=False): Detaches a Buffer from it's weak-references by making a true deep-copy. 
""" - if not self._zero_copied and self._has_a_weakref(): + if not self._zero_copied and self._shallow_copied(): # make a deep copy of existing DeviceBuffer # and replace pointer to it. current_buf = rmm.DeviceBuffer(ptr=self.ptr, size=self.size) new_buf = current_buf.copy() self._ptr = new_buf.ptr self._size = new_buf.size + self._proxy_ref = _BufferInstanceProxy(self._ptr, self._size) self._owner = new_buf self._zero_copied = zero_copied diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8a07bfc5e49..ccf902ee286 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,7 @@ ) from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer, RefCountableBuffer, as_buffer +from cudf.core.buffer import Buffer, CopyOnWriteBuffer, as_buffer from cudf.core.dtypes import ( CategoricalDtype, IntervalDtype, @@ -420,7 +420,7 @@ def copy(self: T, deep: bool = True) -> T: return self.force_deep_copy() else: if cudf.get_option("copy_on_write"): - if self._is_cai_zero_copied(): + if self._buffers_zero_copied(): return self.force_deep_copy() copied_col = cast( @@ -1910,7 +1910,7 @@ def as_column( elif ( fastpath and cudf.get_option("copy_on_write") - and isinstance(col.base_data, RefCountableBuffer) + and isinstance(col.base_data, CopyOnWriteBuffer) ): col.base_data._zero_copied = True From 1abaca08ee4f1be4a4c5b9ca8849099009bcec94 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 8 Dec 2022 13:25:50 -0500 Subject: [PATCH 063/124] Rename --- python/cudf/cudf/core/buffer/__init__.py | 2 +- .../cudf/core/buffer/{weakrefable_buffer.py => cow_buffer.py} | 4 ++-- python/cudf/cudf/core/buffer/utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename python/cudf/cudf/core/buffer/{weakrefable_buffer.py => cow_buffer.py} (98%) diff --git a/python/cudf/cudf/core/buffer/__init__.py 
b/python/cudf/cudf/core/buffer/__init__.py index e6c46d5f8b7..f92d414d797 100644 --- a/python/cudf/cudf/core/buffer/__init__.py +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -1,10 +1,10 @@ # Copyright (c) 2022, NVIDIA CORPORATION. from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper +from cudf.core.buffer.cow_buffer import CopyOnWriteBuffer from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock from cudf.core.buffer.utils import ( acquire_spill_lock, as_buffer, get_spill_lock, ) -from cudf.core.buffer.weakrefable_buffer import CopyOnWriteBuffer diff --git a/python/cudf/cudf/core/buffer/weakrefable_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py similarity index 98% rename from python/cudf/cudf/core/buffer/weakrefable_buffer.py rename to python/cudf/cudf/core/buffer/cow_buffer.py index 313c8474afc..4058c573754 100644 --- a/python/cudf/cudf/core/buffer/weakrefable_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -36,8 +36,8 @@ def _instance_count(cls, ptr, size): class _BufferInstanceProxy(metaclass=_InstanceCountableMeta): """ - A proxy class used to count the number of instances of a `Buffer` - constructed with the same `ptr` and `size`. + A proxy class used to count the number of instances of a + `CopyOnWriteBuffer` constructed with the same `ptr` and `size`. 
""" __slots__ = ("ptr", "size", "__weakref__") diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 2ee1f041a2f..c5ff4cbcd88 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -7,9 +7,9 @@ from typing import Any, Dict, Optional, Tuple, Union from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper +from cudf.core.buffer.cow_buffer import CopyOnWriteBuffer from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock -from cudf.core.buffer.weakrefable_buffer import CopyOnWriteBuffer from cudf.options import get_option From eba1525db7bd1c55c3241e517d4a30e015deafa4 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 8 Dec 2022 14:54:54 -0500 Subject: [PATCH 064/124] Refactor 2 --- python/cudf/cudf/core/buffer/cow_buffer.py | 155 +++------------------ 1 file changed, 17 insertions(+), 138 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 4058c573754..2059123f1fe 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -14,132 +14,23 @@ T = TypeVar("T", bound="CopyOnWriteBuffer") -class _InstanceCountableMeta(type): - """ - Metaclass that allows counting the number of instances that are - constructed with the same `ptr` and `size`. - """ - - __weakrefs: Dict[Tuple, Any] = {} - - def __call__(cls, ptr, size): - obj = super().__call__(ptr, size) - key = (ptr, size) - if key not in cls.__weakrefs: - cls.__weakrefs[key] = weakref.WeakSet() - cls.__weakrefs[key].add(obj) - return obj - - def _instance_count(cls, ptr, size): - return len(cls.__weakrefs[(ptr, size)]) - - -class _BufferInstanceProxy(metaclass=_InstanceCountableMeta): - """ - A proxy class used to count the number of instances of a - `CopyOnWriteBuffer` constructed with the same `ptr` and `size`. 
- """ - - __slots__ = ("ptr", "size", "__weakref__") - - def __init__(self, ptr, size) -> None: - self.ptr = ptr - self.size = size - - def instance_count(self): - return self.__class__._instance_count(self.ptr, self.size) - - -def custom_weakref_callback(ref): - """ - A callback for ``weakref.ref`` API to generate unique - weakref instances that can be counted correctly. - - Example below shows why this is necessary: - - In [1]: import cudf - In [2]: import weakref - - Let's create an object ``x`` that we are going to weakref: - - In [3]: x = cudf.core.buffer.BufferWeakref(1, 2) - - Now generate three weak-references of it: - - In [4]: a = weakref.ref(x) - In [5]: b = weakref.ref(x) - In [6]: c = weakref.ref(x) - - ``weakref.ref`` actually returns the same singleton object: - - In [7]: a - Out[7]: - In [8]: b - Out[8]: - In [9]: c - Out[9]: - - In [10]: a is b - Out[10]: True - In [11]: b is c - Out[11]: True - - This will be problematic as we cannot determine what is the count - of weak-references: - - In [12]: weakref.getweakrefcount(x) - Out[12]: 1 - - Notice, though we want ``weakref.getweakrefcount`` to return ``3``, it - returns ``1``. So we need to work-around this by using an empty/no-op - callback: - - In [13]: def custom_weakref_callback(ref): - ...: pass - ...: - - - In [14]: d = weakref.ref(x, custom_weakref_callback) - In [15]: e = weakref.ref(x, custom_weakref_callback) - In [16]: f = weakref.ref(x, custom_weakref_callback) - - Now there is an each unique weak-reference created: - - In [17]: d - Out[17]: - In [18]: e - Out[18]: - In [19]: f - Out[19]: - - Now calling ``weakref.getweakrefcount`` will result in ``4``, which is correct: - - In [20]: weakref.getweakrefcount(x) - Out[20]: 4 - - In [21]: d is not e - Out[21]: True - - In [22]: d is not f - Out[22]: True - - In [23]: e is not f - Out[23]: True - """ # noqa: E501 - pass - - class CopyOnWriteBuffer(Buffer): """A Buffer represents device memory. 
Use the factory function `as_buffer` to create a Buffer instance. """ - _proxy_ref: _BufferInstanceProxy + _weakrefs = {} + # TODO: This is synonymous to SpillableBuffer._exposed attribute # and has to be merged. _zero_copied: bool + def _track_instance(self): + if (self.ptr, self.size) not in self.__class__._weakrefs: + self.__class__._weakrefs[(self.ptr, self.size)] = weakref.WeakSet() + self.__class__._weakrefs[(self.ptr, self.size)].add(self) + @classmethod def _from_device_memory(cls: Type[T], data: Any) -> T: """Create a Buffer from an object exposing `__cuda_array_interface__`. @@ -159,12 +50,12 @@ def _from_device_memory(cls: Type[T], data: Any) -> T: # Bypass `__init__` and initialize attributes manually ret = super()._from_device_memory(data) - ret._proxy_ref = _BufferInstanceProxy(ret._ptr, ret._size) + ret._track_instance() ret._zero_copied = False return ret def _shallow_copied(self): - return self._proxy_ref.instance_count() > 1 + return len(self.__class__._weakrefs[(self.ptr, self.size)]) > 1 def copy(self, deep: bool = True): """ @@ -182,25 +73,13 @@ def copy(self, deep: bool = True): Buffer """ if not deep: - if cudf.get_option("copy_on_write") and not self._zero_copied: - copied_buf = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) - copied_buf._ptr = self._ptr - copied_buf._size = self._size - copied_buf._owner = self._owner - copied_buf._zero_copied = False - # make the `_proxy_ref` of the copy a new instance: - copied_buf._proxy_ref = _BufferInstanceProxy( - self._ptr, self._size - ) - return copied_buf - else: - shallow_copy = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) - shallow_copy._ptr = self._ptr - shallow_copy._size = self._size - shallow_copy._owner = self._owner - # when shallow copying, don't make a new instance: - shallow_copy._proxy_ref = self._proxy_ref - return shallow_copy + copied_buf = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) + copied_buf._ptr = self._ptr + copied_buf._size = self._size + copied_buf._owner = self._owner + 
copied_buf._zero_copied = False + copied_buf._track_instance() + return copied_buf else: owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) return self._from_device_memory( @@ -254,6 +133,6 @@ def _detach_refs(self, zero_copied=False): new_buf = current_buf.copy() self._ptr = new_buf.ptr self._size = new_buf.size - self._proxy_ref = _BufferInstanceProxy(self._ptr, self._size) self._owner = new_buf + self._track_instance() self._zero_copied = zero_copied From 65cb7ac1398f7129391514b442cbf07da05524a6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 8 Dec 2022 15:31:50 -0500 Subject: [PATCH 065/124] Refactor 3 --- python/cudf/cudf/core/buffer/cow_buffer.py | 41 ++++++++++++++-------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 2059123f1fe..15458264f4a 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -3,12 +3,12 @@ from __future__ import annotations import copy -import weakref -from typing import Any, Dict, Tuple, Type, TypeVar +from collections import defaultdict +from typing import Any, DefaultDict, Tuple, Type, TypeVar +from weakref import WeakSet import rmm -import cudf from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper T = TypeVar("T", bound="CopyOnWriteBuffer") @@ -20,16 +20,22 @@ class CopyOnWriteBuffer(Buffer): Use the factory function `as_buffer` to create a Buffer instance. """ - _weakrefs = {} + # This dict keeps track of all instances that have the same `ptr` + # and `size` attributes. Each key of the dict is a `(ptr, size)` + # tuple and the corresponding value is a set of weak references to + # instances with that `ptr` and `size`. + _instances: DefaultDict[Tuple, WeakSet] = defaultdict(WeakSet) # TODO: This is synonymous to SpillableBuffer._exposed attribute # and has to be merged. 
_zero_copied: bool - def _track_instance(self): - if (self.ptr, self.size) not in self.__class__._weakrefs: - self.__class__._weakrefs[(self.ptr, self.size)] = weakref.WeakSet() - self.__class__._weakrefs[(self.ptr, self.size)].add(self) + def _finalize_init(self): + # the last step in initializing a `CopyOnWriteBuffer` + # is to track it in `_instances`: + key = (self.ptr, self.size) + self.__class__._instances[key].add(self) + self._zero_copied = False @classmethod def _from_device_memory(cls: Type[T], data: Any) -> T: @@ -50,12 +56,20 @@ def _from_device_memory(cls: Type[T], data: Any) -> T: # Bypass `__init__` and initialize attributes manually ret = super()._from_device_memory(data) - ret._track_instance() - ret._zero_copied = False + ret._finalize_init() + return ret + + @classmethod + def _from_host_memory(cls: Type[T], data: Any) -> T: + ret = super()._from_host_memory(data) + ret._finalize_init() return ret def _shallow_copied(self): - return len(self.__class__._weakrefs[(self.ptr, self.size)]) > 1 + """ + Return `True` if shallow copies of `self` exist. 
+ """ + return len(self.__class__._instances[(self.ptr, self.size)]) > 1 def copy(self, deep: bool = True): """ @@ -77,8 +91,7 @@ def copy(self, deep: bool = True): copied_buf._ptr = self._ptr copied_buf._size = self._size copied_buf._owner = self._owner - copied_buf._zero_copied = False - copied_buf._track_instance() + copied_buf._finalize_init() return copied_buf else: owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) @@ -134,5 +147,5 @@ def _detach_refs(self, zero_copied=False): self._ptr = new_buf.ptr self._size = new_buf.size self._owner = new_buf - self._track_instance() + self._finalize_init() self._zero_copied = zero_copied From 2e8fc970697796b6247c5e7e08e09dc124f91935 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 13 Dec 2022 21:01:09 +0530 Subject: [PATCH 066/124] Apply suggestions from code review --- python/cudf/cudf/_lib/column.pyi | 4 ++-- python/cudf/cudf/_lib/column.pyx | 18 +++++++++--------- python/cudf/cudf/core/buffer/cow_buffer.py | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 84fd12c02b0..1ab50af6bf3 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -68,8 +68,8 @@ class Column: @property def children(self) -> Tuple[ColumnBase, ...]: ... def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... - def _detach_refs(self, zero_copied=False) -> None: ... - def _buffers_shallow_copied(self) -> bool: ... + def _unlink_shared_buffers(self, zero_copied=False) -> None: ... + def _is_shared_buffers(self) -> bool: ... def _buffers_zero_copied(self) -> bool: ... 
def _mimic_inplace( self, other_col: ColumnBase, inplace=False diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index ffd95f670c4..082ff94f9c2 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -313,20 +313,20 @@ cdef class Column: self._children = None self._base_children = value - def _buffers_shallow_copied(self) -> bool: + def _is_shared_buffers(self) -> bool: """ Determines if any of the buffers underneath the column - have been shallow copied + have been shared else-where. """ - data_shallow_copied = ( + is_data_shared = ( isinstance(self.base_data, CopyOnWriteBuffer) and - self.base_data._shallow_copied() + self.base_data._is_shared() ) - mask_shallow_copied = ( + is_mask_shared = ( isinstance(self.base_mask, CopyOnWriteBuffer) and - self.base_mask._shallow_copied() + self.base_mask._is_shared() ) - return mask_shallow_copied or data_shallow_copied + return is_mask_shared or is_data_shared def _buffers_zero_copied(self): data_zero_copied = ( @@ -339,12 +339,12 @@ cdef class Column: ) return data_zero_copied or mask_zero_copied - def _detach_refs(self, zero_copied=False): + def _unlink_shared_buffers(self, zero_copied=False): """ Detaches a column from its current Buffers by making a true deep-copy. """ - if not self._buffers_zero_copied() and self._buffers_shallow_copied(): + if not self._buffers_zero_copied() and self._is_shared_buffers(): new_col = self.force_deep_copy() self._offset = new_col.offset self._size = new_col.size diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 15458264f4a..7377c6f199e 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -65,9 +65,9 @@ def _from_host_memory(cls: Type[T], data: Any) -> T: ret._finalize_init() return ret - def _shallow_copied(self): + def _is_shared(self): """ - Return `True` if shallow copies of `self` exist. 
+ Return `True` if `self`'s memory is shared with other columns. """ return len(self.__class__._instances[(self.ptr, self.size)]) > 1 @@ -128,15 +128,15 @@ def __cuda_array_interface__(self) -> dict: # control over knowing if a third-party library # has modified the data this Buffer is # pointing to. - self._detach_refs(zero_copied=True) + self._unlink_shared_buffers(zero_copied=True) result = self._cuda_array_interface_readonly result["data"] = (self.ptr, False) return result - def _detach_refs(self, zero_copied=False): + def _unlink_shared_buffers(self, zero_copied=False): """ - Detaches a Buffer from it's weak-references by making + Unlinks a Buffer if it is shared with other buffers(i.e., weak references exist) by making a true deep-copy. """ if not self._zero_copied and self._shallow_copied(): From 2a72d9a1b3bb78258ef14fe770fa25b7f175dc20 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 01:35:27 -0800 Subject: [PATCH 067/124] fix naming errors --- docs/cudf/source/developer_guide/library_design.md | 2 +- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 733e5f82a65..fc9d628acd3 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -358,7 +358,7 @@ There is a case when copy-on-write will be inactive and return true copies even Whenever a `Column`/`Buffer` are zero-copied to a third-party library via `__cuda_array_interface__`, it is technically not possible to know if the device data is modified without introspection. 
Hence whenever someone accesses `__cuda_array_interface__` of `Column` or a `Buffer`, we trigger -`Column/Buffer._detach_refs` which will ensure a true copy of underlying device data is made and +`Column/Buffer._unlink_shared_buffers` which will ensure a true copy of underlying device data is made and detaches itself from pointing to the original device memory. We also mark the `Column`/`Buffer` as `obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy with weak-references. diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 082ff94f9c2..ee41deb2caf 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -380,7 +380,7 @@ cdef class Column: return self._view(libcudf_types.UNKNOWN_NULL_COUNT).null_count() cdef mutable_column_view mutable_view(self) except *: - self._detach_refs() + self._unlink_shared_buffers() if is_categorical_dtype(self.dtype): col = self.base_children[0] else: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ccf902ee286..dc2a0c6b0db 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -118,7 +118,7 @@ def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object """ - self._detach_refs(zero_copied=True) + self._unlink_shared_buffers(zero_copied=True) return self._data_array_view diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2b35a8a4127..e2cf2d803ad 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -279,7 +279,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": @property def __cuda_array_interface__(self) -> Mapping[str, Any]: - self._detach_refs(zero_copied=True) + self._unlink_shared_buffers(zero_copied=True) output = { 
"shape": (len(self),), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 12f8e627ccf..fd91f4e1138 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -164,7 +164,7 @@ def __setitem__(self, key: Any, value: Any): @property def __cuda_array_interface__(self) -> Mapping[str, Any]: - self._detach_refs(zero_copied=True) + self._unlink_shared_buffers(zero_copied=True) output = { "shape": (len(self),), From 37892557d95e8ca67b2ddb929c324de9f27591b9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 14 Dec 2022 15:35:20 +0530 Subject: [PATCH 068/124] Apply suggestions from code review Co-authored-by: Mads R. B. Kristensen --- python/cudf/cudf/core/buffer/buffer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 1e69bec6365..f2b9fe003d4 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -188,11 +188,7 @@ def copy(self, deep: bool = True): Buffer """ if not deep: - shallow_copy = Buffer.__new__(Buffer) - shallow_copy._ptr = self._ptr - shallow_copy._size = self._size - shallow_copy._owner = self._owner - return shallow_copy + return self[:] else: owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) return self._from_device_memory( From 7ea425c6862932ee21ec2c87d1a206decddf2855 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 02:05:47 -0800 Subject: [PATCH 069/124] address review --- python/cudf/cudf/_lib/column.pyx | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index ee41deb2caf..de83966118f 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -318,26 +318,16 @@ cdef class Column: Determines if any of the buffers underneath the column have been 
shared else-where. """ - is_data_shared = ( - isinstance(self.base_data, CopyOnWriteBuffer) and - self.base_data._is_shared() + return any( + isinstance(buf, CopyOnWriteBuffer) and buf._is_shared() + for buf in (self.base_data, self.base_mask) ) - is_mask_shared = ( - isinstance(self.base_mask, CopyOnWriteBuffer) and - self.base_mask._is_shared() - ) - return is_mask_shared or is_data_shared def _buffers_zero_copied(self): - data_zero_copied = ( - isinstance(self.base_data, CopyOnWriteBuffer) and - self.base_data._zero_copied - ) - mask_zero_copied = ( - isinstance(self.base_mask, CopyOnWriteBuffer) and - self.base_mask._zero_copied + return any( + isinstance(buf, CopyOnWriteBuffer) and buf._zero_copied + for buf in (self.base_data, self.base_mask) ) - return data_zero_copied or mask_zero_copied def _unlink_shared_buffers(self, zero_copied=False): """ From ccfd064f9b90b037bfd5468217fad2128e68db83 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 14 Dec 2022 15:51:34 +0530 Subject: [PATCH 070/124] Apply suggestions from code review Co-authored-by: Mads R. B. 
Kristensen --- python/cudf/cudf/core/buffer/buffer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index f2b9fe003d4..a6e20e48b80 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -192,11 +192,7 @@ def copy(self, deep: bool = True): else: owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) return self._from_device_memory( - cuda_array_interface_wrapper( - ptr=owner_copy.ptr, - size=owner_copy.size, - owner=owner_copy, - ) + rmm.DeviceBuffer(ptr=self.ptr, size=self.size) ) @property From 93d449e63d7a07a721fd5d3c183992c9fbe51b1d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 02:35:38 -0800 Subject: [PATCH 071/124] fix deep copies --- python/cudf/cudf/core/buffer/buffer.py | 2 -- python/cudf/cudf/core/buffer/cow_buffer.py | 14 ++++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index a6e20e48b80..b8875f50df8 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -2,7 +2,6 @@ from __future__ import annotations -import copy import math import pickle from types import SimpleNamespace @@ -190,7 +189,6 @@ def copy(self, deep: bool = True): if not deep: return self[:] else: - owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) return self._from_device_memory( rmm.DeviceBuffer(ptr=self.ptr, size=self.size) ) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 7377c6f199e..0f285c1c951 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -2,14 +2,13 @@ from __future__ import annotations -import copy from collections import defaultdict from typing import Any, DefaultDict, Tuple, Type, TypeVar from weakref import WeakSet import rmm -from 
cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper +from cudf.core.buffer.buffer import Buffer T = TypeVar("T", bound="CopyOnWriteBuffer") @@ -94,13 +93,8 @@ def copy(self, deep: bool = True): copied_buf._finalize_init() return copied_buf else: - owner_copy: rmm.DeviceBuffer = copy.copy(self._owner) return self._from_device_memory( - cuda_array_interface_wrapper( - ptr=owner_copy.ptr, - size=owner_copy.size, - owner=owner_copy, - ) + rmm.DeviceBuffer(ptr=self.ptr, size=self.size) ) @property @@ -136,8 +130,8 @@ def __cuda_array_interface__(self) -> dict: def _unlink_shared_buffers(self, zero_copied=False): """ - Unlinks a Buffer if it is shared with other buffers(i.e., weak references exist) by making - a true deep-copy. + Unlinks a Buffer if it is shared with other buffers(i.e., + weak references exist) by making a true deep-copy. """ if not self._zero_copied and self._shallow_copied(): # make a deep copy of existing DeviceBuffer From 6dbdf2f87f09ffd1eee8aff07b84bec4fc6d536e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 02:47:44 -0800 Subject: [PATCH 072/124] Make _is_shared a property --- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/core/buffer/cow_buffer.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index de83966118f..13d38d85ece 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -319,7 +319,7 @@ cdef class Column: have been shared else-where. 
""" return any( - isinstance(buf, CopyOnWriteBuffer) and buf._is_shared() + isinstance(buf, CopyOnWriteBuffer) and buf._is_shared for buf in (self.base_data, self.base_mask) ) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 0f285c1c951..e1a25920561 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -64,6 +64,7 @@ def _from_host_memory(cls: Type[T], data: Any) -> T: ret._finalize_init() return ret + @property def _is_shared(self): """ Return `True` if `self`'s memory is shared with other columns. From 4376702b0d085d427a2e8357b15794b4066afa8c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 04:46:26 -0800 Subject: [PATCH 073/124] use WeakKeyDictionary --- python/cudf/cudf/core/buffer/cow_buffer.py | 35 +++++++++++++++++----- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index e1a25920561..a98e25b7a34 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -2,9 +2,8 @@ from __future__ import annotations -from collections import defaultdict -from typing import Any, DefaultDict, Tuple, Type, TypeVar -from weakref import WeakSet +from typing import Any, Type, TypeVar +from weakref import WeakKeyDictionary, WeakSet import rmm @@ -13,6 +12,25 @@ T = TypeVar("T", bound="CopyOnWriteBuffer") +class _PtrSize: + __slots__ = ("t", "__weakref__") + + def __init__(self, ptr, size): + self.t = (ptr, size) + + def __hash__(self): + return hash(self.t) + + def __eq__(self, o): + return type(o) is type(self) and o.t == self.t + + def __repr__(self): + return f"PtrSize{self.t!r}" + + def __str__(self): + return f"PtrSize{self.t}" + + class CopyOnWriteBuffer(Buffer): """A Buffer represents device memory. @@ -23,17 +41,20 @@ class CopyOnWriteBuffer(Buffer): # and `size` attributes. 
Each key of the dict is a `(ptr, size)` # tuple and the corresponding value is a set of weak references to # instances with that `ptr` and `size`. - _instances: DefaultDict[Tuple, WeakSet] = defaultdict(WeakSet) + _instances: WeakKeyDictionary = WeakKeyDictionary() # TODO: This is synonymous to SpillableBuffer._exposed attribute # and has to be merged. _zero_copied: bool + _ptrsize: _PtrSize def _finalize_init(self): # the last step in initializing a `CopyOnWriteBuffer` # is to track it in `_instances`: - key = (self.ptr, self.size) - self.__class__._instances[key].add(self) + self._ptrsize = _PtrSize(self.ptr, self.size) + self.__class__._instances.setdefault(self._ptrsize, WeakSet()).add( + self + ) self._zero_copied = False @classmethod @@ -69,7 +90,7 @@ def _is_shared(self): """ Return `True` if `self`'s memory is shared with other columns. """ - return len(self.__class__._instances[(self.ptr, self.size)]) > 1 + return len(self.__class__._instances[self._ptrsize]) > 1 def copy(self, deep: bool = True): """ From 87cadfe6014c66d917d46b29ec4d321602fdc3ca Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 04:47:56 -0800 Subject: [PATCH 074/124] rename --- python/cudf/cudf/core/buffer/cow_buffer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index a98e25b7a34..021ae487d3b 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -12,7 +12,7 @@ T = TypeVar("T", bound="CopyOnWriteBuffer") -class _PtrSize: +class _PtrAndSize: __slots__ = ("t", "__weakref__") def __init__(self, ptr, size): @@ -46,13 +46,13 @@ class CopyOnWriteBuffer(Buffer): # TODO: This is synonymous to SpillableBuffer._exposed attribute # and has to be merged. 
_zero_copied: bool - _ptrsize: _PtrSize + _PtrAndSize: _PtrAndSize def _finalize_init(self): # the last step in initializing a `CopyOnWriteBuffer` # is to track it in `_instances`: - self._ptrsize = _PtrSize(self.ptr, self.size) - self.__class__._instances.setdefault(self._ptrsize, WeakSet()).add( + self._PtrAndSize = _PtrAndSize(self.ptr, self.size) + self.__class__._instances.setdefault(self._PtrAndSize, WeakSet()).add( self ) self._zero_copied = False @@ -90,7 +90,7 @@ def _is_shared(self): """ Return `True` if `self`'s memory is shared with other columns. """ - return len(self.__class__._instances[self._ptrsize]) > 1 + return len(self.__class__._instances[self._PtrAndSize]) > 1 def copy(self, deep: bool = True): """ From 61260daf44587035bcf3957fc6d4919e7d79631c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 05:05:25 -0800 Subject: [PATCH 075/124] rename to _is_internally_referenced and _is_externally_referenced --- python/cudf/cudf/_lib/column.pyi | 4 ++-- python/cudf/cudf/_lib/column.pyx | 7 ++++--- python/cudf/cudf/core/column/column.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 1ab50af6bf3..8948e1ae453 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -69,8 +69,8 @@ class Column: def children(self) -> Tuple[ColumnBase, ...]: ... def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... def _unlink_shared_buffers(self, zero_copied=False) -> None: ... - def _is_shared_buffers(self) -> bool: ... - def _buffers_zero_copied(self) -> bool: ... + def _is_internally_referenced(self) -> bool: ... + def _is_externally_referenced(self) -> bool: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False ) -> Optional[ColumnBase]: ... 
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 13d38d85ece..78ad1525e57 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -313,7 +313,7 @@ cdef class Column: self._children = None self._base_children = value - def _is_shared_buffers(self) -> bool: + def _is_internally_referenced(self) -> bool: """ Determines if any of the buffers underneath the column have been shared else-where. @@ -323,7 +323,7 @@ cdef class Column: for buf in (self.base_data, self.base_mask) ) - def _buffers_zero_copied(self): + def _is_externally_referenced(self): return any( isinstance(buf, CopyOnWriteBuffer) and buf._zero_copied for buf in (self.base_data, self.base_mask) @@ -334,7 +334,8 @@ cdef class Column: Detaches a column from its current Buffers by making a true deep-copy. """ - if not self._buffers_zero_copied() and self._is_shared_buffers(): + if not self._is_externally_referenced() \ + and self._is_internally_referenced(): new_col = self.force_deep_copy() self._offset = new_col.offset self._size = new_col.size diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dc2a0c6b0db..d219c103b5c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -420,7 +420,7 @@ def copy(self: T, deep: bool = True) -> T: return self.force_deep_copy() else: if cudf.get_option("copy_on_write"): - if self._buffers_zero_copied(): + if self._is_externally_referenced(): return self.force_deep_copy() copied_col = cast( From 499c902e359fa3e60c348c75b7c0ca725fe36031 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 05:13:28 -0800 Subject: [PATCH 076/124] make methods with no params as properties --- python/cudf/cudf/_lib/column.pyi | 2 ++ python/cudf/cudf/_lib/column.pyx | 12 +++++++++--- python/cudf/cudf/core/column/column.py | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi 
b/python/cudf/cudf/_lib/column.pyi index 8948e1ae453..557a66c85c7 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -69,7 +69,9 @@ class Column: def children(self) -> Tuple[ColumnBase, ...]: ... def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... def _unlink_shared_buffers(self, zero_copied=False) -> None: ... + @property def _is_internally_referenced(self) -> bool: ... + @property def _is_externally_referenced(self) -> bool: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 78ad1525e57..0203b865ee3 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -313,17 +313,23 @@ cdef class Column: self._children = None self._base_children = value + @property def _is_internally_referenced(self) -> bool: """ Determines if any of the buffers underneath the column - have been shared else-where. + have been shared internally(i.e., between other columns). """ return any( isinstance(buf, CopyOnWriteBuffer) and buf._is_shared for buf in (self.base_data, self.base_mask) ) + @property def _is_externally_referenced(self): + """ + Determines if any of the buffers underneath the column + have been shared externally(i.e., via __cuda_array_interface__). + """ return any( isinstance(buf, CopyOnWriteBuffer) and buf._zero_copied for buf in (self.base_data, self.base_mask) @@ -334,8 +340,8 @@ cdef class Column: Detaches a column from its current Buffers by making a true deep-copy. 
""" - if not self._is_externally_referenced() \ - and self._is_internally_referenced(): + if not self._is_externally_referenced \ + and self._is_internally_referenced: new_col = self.force_deep_copy() self._offset = new_col.offset self._size = new_col.size diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d219c103b5c..9fb1fbbace1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -420,7 +420,7 @@ def copy(self: T, deep: bool = True) -> T: return self.force_deep_copy() else: if cudf.get_option("copy_on_write"): - if self._is_externally_referenced(): + if self._is_externally_referenced: return self.force_deep_copy() copied_col = cast( From 4dd8927ee1530d2a0f9f3db9f1f8f527da4282e4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 06:38:15 -0800 Subject: [PATCH 077/124] revert --- python/cudf/cudf/tests/test_stats.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 08246f9cde6..88e63eccf69 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -531,6 +531,12 @@ def test_nans_stats(data, ops, skipna): getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) + gsr = cudf.Series(data, nan_as_null=False) + # Since there is no concept of `nan_as_null` in pandas, + # nulls will be returned in the operations. 
So only + # testing for `skipna=True` when `nan_as_null=False` + assert_eq(getattr(psr, ops)(skipna=True), getattr(gsr, ops)(skipna=True)) + @pytest.mark.parametrize( "data", From 25e1be899d7feee05bb17e9b847fd12410c6d3d5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 06:42:33 -0800 Subject: [PATCH 078/124] updates --- python/cudf/cudf/options.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 54fedb7b23e..59cd25cccb9 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -258,9 +258,9 @@ def _integer_and_none_validator(val): os.environ.get("CUDF_COPY_ON_WRITE", "0") == "1", textwrap.dedent( """ - Default behavior of performing deep copies. - If set to `False`, each deep copy will perform a true deep copy. - If set to `True`, each deep copy will perform a shallow copy + Default behavior of performing shallow copies. + If set to `False`, each shallow copy will perform a true shallow copy. + If set to `True`, each shallow copy will perform a shallow copy with underlying data actually referring to the actual column, in this case a copy is only made when there is a write operation performed on the column. From 35d513dbd40693ee7bb3830328e8b3011514e1f3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 07:13:06 -0800 Subject: [PATCH 079/124] add Series copy tests --- python/cudf/cudf/core/series.py | 3 +++ python/cudf/cudf/tests/test_series.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 15457ebddfd..37392f423a1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -365,6 +365,9 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable): name : str, optional The name to give to the Series. + copy : bool, default False + Copy input data. Only affects Series or 1d ndarray input. See examples. 
+ nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to ``null`` values. diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 0f6e9f48e10..fc9dddaa1b5 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2054,3 +2054,25 @@ def test_series_duplicated(data, index, keep): ps = gs.to_pandas() assert_eq(gs.duplicated(keep=keep), ps.duplicated(keep=keep)) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4], + [10, 20, None, None], + ], +) +@pytest.mark.parametrize("copy", [True, False]) +def test_series_copy(data, copy): + psr = pd.Series(data) + gsr = cudf.from_pandas(psr) + + new_psr = pd.Series(psr, copy=copy) + new_gsr = cudf.Series(gsr, copy=copy) + + new_psr.iloc[0] = 999 + new_gsr.iloc[0] = 999 + + assert_eq(psr, gsr) + assert_eq(new_psr, new_gsr) From d4a9a3f5a9ba3c90e2815f8e4caf63fd87ef58ce Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 07:13:42 -0800 Subject: [PATCH 080/124] docs --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 37392f423a1..f5537a06486 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -366,7 +366,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable): The name to give to the Series. copy : bool, default False - Copy input data. Only affects Series or 1d ndarray input. See examples. + Copy input data. Only affects Series or 1d ndarray input. 
nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to From dd2f61a468708115a83699b8546db213168716ae Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 07:23:08 -0800 Subject: [PATCH 081/124] docs --- python/cudf/cudf/core/buffer/cow_buffer.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 021ae487d3b..3252d3bb2ef 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -38,8 +38,8 @@ class CopyOnWriteBuffer(Buffer): """ # This dict keeps track of all instances that have the same `ptr` - # and `size` attributes. Each key of the dict is a `(ptr, size)` - # tuple and the corresponding value is a set of weak references to + # and `size` attributes. Each key of the dict is a `self._PtrAndSize` + # object and the corresponding value is a set of weak references to # instances with that `ptr` and `size`. _instances: WeakKeyDictionary = WeakKeyDictionary() @@ -119,23 +119,9 @@ def copy(self, deep: bool = True): rmm.DeviceBuffer(ptr=self.ptr, size=self.size) ) - @property - def _cuda_array_interface_readonly(self) -> dict: - """ - Internal Implementation for the CUDA Array Interface without - triggering a deepcopy. - """ - return { - "data": (self.ptr, True), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } - @property def __cuda_array_interface__(self) -> dict: - # Detach if there are any weak-references. + # Unlink if there are any weak-references. 
# Mark the Buffer as ``zero_copied=True``, # which will prevent any copy-on-write From 71d7d88a4a81e2ba15557a515d97bd30b7897e42 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 23:44:50 -0800 Subject: [PATCH 082/124] update library design doc --- .../source/developer_guide/library_design.md | 40 +++++-------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index fc9d628acd3..aecd0c872e7 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -324,42 +324,24 @@ However, for performance reasons they frequently access internal attributes and Copy on write is designed to reduce memory footprint on GPUs. With this feature, a copy(`.copy(deep=False)`) is only really made whenever there is a write operation on a column. -The core copy-on-write implementation relies in the `Buffer` class. This class stores the pointer to the device memory and size. -With the help of `Buffer._ptr` and `Buffer._size` we create a unique singleton `BufferWeakref`, which means all the new `Buffer`s that are created get the same `BufferWeakref` if they are all pointing to the same device memory. We -store this ``BufferWeakref`` in `Buffer._proxy_ref`. +The core copy-on-write implementation relies in the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. +With the help of `CopyOnWriteBuffer._PtrAndSize` we generate weakreferences of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`, +this is a `WeakKeyDictionary` whose key-value pairs consist of `_PtrAndSize` & a `WeakSet` containing weakreferences to `CopyOnWriteBuffer`. This +means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` & `.size` +i.e., if they are all pointing to the same device memory. 
-When the cudf option ``copy_on_write`` is ``True`` and a copy of a `Buffer` is obtained, a shallow copy of the -``Buffer`` is created which points to the same underlying device memory (same `ptr`, `size` & `owner`). Next, a weak reference -is generated by calling ``weakref.ref``. This will later serve as an indication whether or not to make a copy when a -write operation is being performed on `Column`(more on that later). There are some different scenarios on **how** the weak-references are kept alive. +When the cudf option ``copy_on_write`` is ``True``, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the +mechanism to enable copy-on-write for all Buffers. When a `CopyOnWriteBuffer` is created, it's weak-reference is generated and added to the `WeakSet` which is inturn stored in the `WeakKeyDictionary`. This will later serve as an indication whether or not to make a copy when a +write operation is being performed on `Column`(more on that below). -1. If the current `Buffer`(call it `current_buf`) doesn't have any weak-reference, and we are trying to generate a copy -of it(call it `copied_buf`), then we generate weak-references of both the buffer's and assign it to each-other i.e., -`current_buf` will hold the weak-reference of `copied_buf` and `copied_buf` will hold the weak-reference of `current_buf`. -This is how we achieve this in code: - -``` -copied_buf._weak_ref = current_buf._get_weakref() -current_buf._weak_ref = copied_buf._get_weakref() -``` - -2. If the current `Buffer`(call it `current_buf`) already holds a weak-reference, this means there exists at-least one -copy of this `Buffer` already somewhere. So when we are trying to generate a copy of `current_buf`(call it `copied_buf`), -we will be storing the weak-reference that `current_buf` already has into the `copied_buf`. Next, we will generate a -weak-reference of `copied_buf` and store it in `current_buf`. 
This will ensure we keep the weak-references of all the -Buffer's alive(as long as `Buffer` is alive only). This is how we achieve it in code: -``` -copied_buf._weak_ref = current_buf._weak_ref -current_buf._weak_ref = copied_buf._get_weakref() -``` There is a case when copy-on-write will be inactive and return true copies even though the cudf option `copy_on_write` is `True`: -Whenever a `Column`/`Buffer` are zero-copied to a third-party library via `__cuda_array_interface__`, it +Whenever a `Column`/`CopyOnWriteBuffer` are zero-copied to a third-party library via `__cuda_array_interface__`, it is technically not possible to know if the device data is modified without introspection. Hence whenever -someone accesses `__cuda_array_interface__` of `Column` or a `Buffer`, we trigger +someone accesses `__cuda_array_interface__` of `Column` or a `CopyOnWriteBuffer`, we trigger `Column/Buffer._unlink_shared_buffers` which will ensure a true copy of underlying device data is made and -detaches itself from pointing to the original device memory. We also mark the `Column`/`Buffer` as +unlinks itself from pointing to the original device memory. We also mark the `Column`/`CopyOnWriteBuffer` as `obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy with weak-references. 
From 2c95e85bd3937082157e83d00dd39eb8eee4674d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 14 Dec 2022 23:55:05 -0800 Subject: [PATCH 083/124] Updated end user docs --- docs/cudf/source/user_guide/copy-on-write.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index f682cf17977..0eeef4cac60 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -28,7 +28,7 @@ There are no additional changes required in the code to make use of copy-on-writ >>> series = cudf.Series([1, 2, 3, 4]) ``` -Performing a deep copy will create a new series object but pointing to the +Performing a shallow copy will create a new series object but pointing to the same underlying device memory: ```python @@ -83,9 +83,18 @@ different device objects: ````{Warning} When ``copy_on_write`` is enabled, all of the shallow copies are constructed with weak-references, and it is recommended to not hand-construct the contents of `__cuda_array_interface__`, instead please use the `series.__cuda_array_interface__` -or `series.data.__cuda_array_interface__` which will then take care of detaching any existing weak-references that a column contains. +or `series.data.__cuda_array_interface__` which will then take care of unlinking any existing weak-references that a column contains. ```` +## Notes + +When copy-on-write is enabled, there is no concept of views. i.e., modifying any view created inside cudf will not actually not modify +the original object it was viewing and thus a separate copy is created and then modified. + +## Advantages + +With copy-on-write enabled and by requesting `.copy(deep=False)`, the GPU memory usage can be reduced drastically if you are not performing +write operations on all of those copies. This will also increase the speed at which objects are created for execution of your ETL workflow. 
## How to disable it From d3bdd862cb35dfe95b91e2d8d5b9ad9c1cf84868 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 15 Dec 2022 03:15:26 -0800 Subject: [PATCH 084/124] update docs --- .../source/developer_guide/library_design.md | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index aecd0c872e7..9d2c5c94c0b 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -352,3 +352,129 @@ types that can be mutated in-place. 2. Deep copies of variable width data types return shallow-copies of the Columns, because these types don't support real in-place mutations to the data. We just mimic in such a way that it looks like an in-place operation. + + +### Examples + +When copy-on-write is enabled, take a shallow copy of a `Series` or a `DataFrame` will not +eagerly create a copy of the data. But instead, it produces a view which will be lazily +copied when a write operation is performed on any of it's copies. + +Let's create a series: + +```python +>>> import cudf +>>> cudf.set_option("copy_on_write", True) +>>> s1 = cudf.Series([1, 2, 3, 4]) +``` + +Make a copy of `s1`: +```python +>>> s2 = s1.copy(deep=False) +``` + +Make another copy, but of `s2`: +```python +>>> s3 = s2.copy(deep=False) +``` + +Viewing the data & memory addresses show that they all point to the same device memory: +```python +>>> s1 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> s2 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> s3 +0 1 +1 2 +2 3 +3 4 +dtype: int64 + +>>> s1.data.ptr +139796315897856 +>>> s2.data.ptr +139796315897856 +>>> s3.data.ptr +139796315897856 +``` + +Now, when we perform a write operation on one of them, say on `s2`. 
A new copy is created +for `s2` on device and then modified: + +```python +>>> s2[0:2] = 10 +>>> s2 +0 10 +1 10 +2 3 +3 4 +dtype: int64 +>>> s1 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> s3 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +``` + +If we inspect the memory address of the data, `s1` & `s3` will still share the same address but `s2` will have a new one: + +```python +>>> s1.data.ptr +139796315897856 +>>> s3.data.ptr +139796315897856 +>>> s2.data.ptr +139796315899392 +``` + +Now, performing write operation on `s1` will trigger a new copy on device memory as there +is a weakreference being shared in `s3`: + +```python +>>> s1[0:2] = 11 +>>> s1 +0 11 +1 11 +2 3 +3 4 +dtype: int64 +>>> s2 +0 10 +1 10 +2 3 +3 4 +dtype: int64 +>>> s3 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +``` + +If we inspect the memory address of the data, `s2` & `s3` addresses will remain un-touched, but `s1` memory address will change because of a copy operation performed during the writing: + +```python +>>> s2.data.ptr +139796315899392 +>>> s3.data.ptr +139796315897856 +>>> s1.data.ptr +139796315879723 +``` From a43f1f993321c6ad588282c87453a6a7021088f5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 15 Dec 2022 03:17:15 -0800 Subject: [PATCH 085/124] review --- python/cudf/cudf/core/buffer/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index c5ff4cbcd88..edbc57209ea 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -74,7 +74,7 @@ def as_buffer( ) if get_option("copy_on_write"): - if isinstance(data, (Buffer, CopyOnWriteBuffer)) or hasattr( + if isinstance(data, Buffer) or hasattr( data, "__cuda_array_interface__" ): return CopyOnWriteBuffer._from_device_memory(data) From 8968adb3d0f6e5d4e33ec94276b60a9161ef7669 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 15 Dec 2022 04:40:52 -0800 Subject: [PATCH 086/124] fix tracking of weakreferences 
--- .../source/developer_guide/library_design.md | 6 +-- python/cudf/cudf/core/buffer/cow_buffer.py | 46 ++++++++----------- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 9d2c5c94c0b..593aba3929a 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -325,13 +325,13 @@ Copy on write is designed to reduce memory footprint on GPUs. With this feature, there is a write operation on a column. The core copy-on-write implementation relies in the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. -With the help of `CopyOnWriteBuffer._PtrAndSize` we generate weakreferences of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`, -this is a `WeakKeyDictionary` whose key-value pairs consist of `_PtrAndSize` & a `WeakSet` containing weakreferences to `CopyOnWriteBuffer`. This +With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate weakreferences of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`, +this is a defaultdict whose key-value pairs consist of `(ptr, size)` as key and `WeakSet` as value containing weakreferences to `CopyOnWriteBuffer`. This means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` & `.size` i.e., if they are all pointing to the same device memory. When the cudf option ``copy_on_write`` is ``True``, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the -mechanism to enable copy-on-write for all Buffers. When a `CopyOnWriteBuffer` is created, it's weak-reference is generated and added to the `WeakSet` which is inturn stored in the `WeakKeyDictionary`. 
This will later serve as an indication whether or not to make a copy when a +mechanism to enable copy-on-write for all Buffers. When a `CopyOnWriteBuffer` is created, it's weak-reference is generated and added to the `WeakSet` which is inturn stored in the `defaultdict`. This will later serve as an indication whether or not to make a copy when a write operation is being performed on `Column`(more on that below). diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 3252d3bb2ef..3ff7a018c6b 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -2,8 +2,10 @@ from __future__ import annotations -from typing import Any, Type, TypeVar -from weakref import WeakKeyDictionary, WeakSet +import weakref +from collections import defaultdict +from typing import Any, DefaultDict, Tuple, Type, TypeVar +from weakref import WeakSet import rmm @@ -12,23 +14,13 @@ T = TypeVar("T", bound="CopyOnWriteBuffer") -class _PtrAndSize: - __slots__ = ("t", "__weakref__") - - def __init__(self, ptr, size): - self.t = (ptr, size) - - def __hash__(self): - return hash(self.t) - - def __eq__(self, o): - return type(o) is type(self) and o.t == self.t - - def __repr__(self): - return f"PtrSize{self.t!r}" - - def __str__(self): - return f"PtrSize{self.t}" +def _keys_cleanup(ptr, size): + weak_set_values = CopyOnWriteBuffer._instances[(ptr, size)] + if len(weak_set_values) == 1 and list(weak_set_values.data)[0]() is None: + # When the last remaining reference is being cleaned up we will still + # have a dead weak-reference in `weak_set_values`, if that is the case + # we are good to perform the key's cleanup + del CopyOnWriteBuffer._instances[(ptr, size)] class CopyOnWriteBuffer(Buffer): @@ -38,24 +30,22 @@ class CopyOnWriteBuffer(Buffer): """ # This dict keeps track of all instances that have the same `ptr` - # and `size` attributes. 
Each key of the dict is a `self._PtrAndSize` - # object and the corresponding value is a set of weak references to + # and `size` attributes. Each key of the dict is a `(ptr, size)` + # tuple and the corresponding value is a set of weak references to # instances with that `ptr` and `size`. - _instances: WeakKeyDictionary = WeakKeyDictionary() + _instances: DefaultDict[Tuple, WeakSet] = defaultdict(WeakSet) # TODO: This is synonymous to SpillableBuffer._exposed attribute # and has to be merged. _zero_copied: bool - _PtrAndSize: _PtrAndSize def _finalize_init(self): # the last step in initializing a `CopyOnWriteBuffer` # is to track it in `_instances`: - self._PtrAndSize = _PtrAndSize(self.ptr, self.size) - self.__class__._instances.setdefault(self._PtrAndSize, WeakSet()).add( - self - ) + key = (self.ptr, self.size) + self.__class__._instances[key].add(self) self._zero_copied = False + weakref.finalize(self, _keys_cleanup, self.ptr, self.size) @classmethod def _from_device_memory(cls: Type[T], data: Any) -> T: @@ -90,7 +80,7 @@ def _is_shared(self): """ Return `True` if `self`'s memory is shared with other columns. 
""" - return len(self.__class__._instances[self._PtrAndSize]) > 1 + return len(self.__class__._instances[(self.ptr, self.size)]) > 1 def copy(self, deep: bool = True): """ From 5bec54e7202d4bd83d9345e1d1e5a68ba4288445 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 16 Dec 2022 03:45:24 -0800 Subject: [PATCH 087/124] Fix data array view --- python/cudf/cudf/core/column/column.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9fb1fbbace1..bd45b6fde77 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -120,7 +120,7 @@ def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ self._unlink_shared_buffers(zero_copied=True) - return self._data_array_view + return cuda.as_cuda_array(self.data).view(self.dtype) @property def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -155,6 +155,8 @@ def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array """ + self._unlink_shared_buffers(zero_copied=True) + return cuda.as_cuda_array(self.mask).view(mask_dtype) def __len__(self) -> int: From 1edef2b8c30e6c4566e9120e064ad194a7fa1601 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 3 Jan 2023 08:29:45 -0800 Subject: [PATCH 088/124] style --- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/_lib/utils.pyx | 2 +- python/cudf/cudf/core/buffer/__init__.py | 2 +- python/cudf/cudf/core/buffer/buffer.py | 2 +- python/cudf/cudf/core/buffer/cow_buffer.py | 2 +- python/cudf/cudf/core/buffer/utils.py | 2 +- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/numerical.py | 2 +- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/column/struct.py | 2 +- 
python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/column_accessor.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/options.py | 2 +- python/cudf/cudf/tests/test_copying.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 2 +- python/cudf/cudf/tests/test_series.py | 2 +- python/cudf/cudf/tests/test_stats.py | 2 +- python/cudf/cudf/utils/applyutils.py | 2 +- 27 files changed, 27 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 0203b865ee3..26eba69bc17 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from types import SimpleNamespace diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 2fe3af3ae8a..94a8bd83903 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import numpy as np import pyarrow as pa diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py index f92d414d797..0d433509497 100644 --- a/python/cudf/cudf/core/buffer/__init__.py +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.buffer.cow_buffer import CopyOnWriteBuffer diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index b8875f50df8..5830df6f499 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 3ff7a018c6b..f6c18e0ad8b 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index edbc57209ea..e117f164963 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 0cf95eebf86..df5766f8ac5 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index bd45b6fde77..3753df84c24 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
from __future__ import annotations diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index e2cf2d803ad..d409dc79f4c 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index f743b6bef4b..a4533c40da5 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from functools import cached_property from typing import List, Optional, Sequence, Tuple, Union diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index fd91f4e1138..41b6fd54ef1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index aa11c51eb76..8ba799bc681 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 0ea3aa29c76..36bcc6f8aab 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from __future__ import annotations from functools import cached_property diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 01b9610e8d0..74cb1efb3a5 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 5493ba45d62..707eda3f5e6 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ca39d206b71..ba77f8edcdb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ad354d8752a..2e85d75eb71 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a3190bb2ae2..783c3996400 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
from __future__ import annotations diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index e3c83a9521d..fd890f21b41 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import itertools from collections import abc diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index dd886e07958..5949d26a807 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 59cd25cccb9..d9c07155813 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import os import textwrap diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 1a99e7ec829..4e974ce3017 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import cupy as cp import numpy as np diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index d72fa36b34d..cbdf79d2d19 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
""" Test related to Index diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 1619e2e369d..103f58f16c1 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. """ Test related to MultiIndex diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 7504d459033..0fe175083b4 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import hashlib import operator diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 88e63eccf69..886a38d441c 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from concurrent.futures import ThreadPoolExecutor diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index a012d73fe67..b7677f7b43e 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
import functools from typing import Any, Dict From bce978108649d82376197937063c7bf1d74fb8e1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 3 Jan 2023 12:21:47 -0600 Subject: [PATCH 089/124] Hide cow details from column --- python/cudf/cudf/_lib/column.pyx | 51 +---------------- python/cudf/cudf/core/buffer/buffer.py | 5 ++ python/cudf/cudf/core/buffer/cow_buffer.py | 55 ++++++++++++++++--- .../cudf/cudf/core/buffer/spillable_buffer.py | 6 +- python/cudf/cudf/core/column/column.py | 6 -- python/cudf/cudf/core/column/datetime.py | 1 - python/cudf/cudf/core/column/numerical.py | 1 - 7 files changed, 59 insertions(+), 66 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 26eba69bc17..5c6a7d7abbb 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -313,47 +313,6 @@ cdef class Column: self._children = None self._base_children = value - @property - def _is_internally_referenced(self) -> bool: - """ - Determines if any of the buffers underneath the column - have been shared internally(i.e., between other columns). - """ - return any( - isinstance(buf, CopyOnWriteBuffer) and buf._is_shared - for buf in (self.base_data, self.base_mask) - ) - - @property - def _is_externally_referenced(self): - """ - Determines if any of the buffers underneath the column - have been shared externally(i.e., via __cuda_array_interface__). - """ - return any( - isinstance(buf, CopyOnWriteBuffer) and buf._zero_copied - for buf in (self.base_data, self.base_mask) - ) - - def _unlink_shared_buffers(self, zero_copied=False): - """ - Detaches a column from its current Buffers by making - a true deep-copy. 
- """ - if not self._is_externally_referenced \ - and self._is_internally_referenced: - new_col = self.force_deep_copy() - self._offset = new_col.offset - self._size = new_col.size - self._dtype = new_col._dtype - self.set_base_data(new_col.base_data) - self.set_base_children(new_col.base_children) - self.set_base_mask(new_col.base_mask) - if self.base_data is not None: - self.base_data._zero_copied = zero_copied - if self.base_mask is not None: - self.base_mask._zero_copied = zero_copied - def _mimic_inplace(self, other_col, inplace=False): """ Given another column, update the attributes of this column to mimic an @@ -377,7 +336,7 @@ cdef class Column: return self._view(libcudf_types.UNKNOWN_NULL_COUNT).null_count() cdef mutable_column_view mutable_view(self) except *: - self._unlink_shared_buffers() + if is_categorical_dtype(self.dtype): col = self.base_children[0] else: @@ -391,12 +350,8 @@ cdef class Column: if col.base_data is None: data = NULL - elif isinstance(col.base_data, SpillableBuffer): - data = (col.base_data).get_ptr( - spill_lock=get_spill_lock() - ) else: - data = (col.base_data.ptr) + data = (col.base_data.mutable_ptr) cdef Column child_column if col.base_children: @@ -456,7 +411,7 @@ cdef class Column: spill_lock=get_spill_lock() ) else: - data = (col.base_data.ptr) + data = (col.base_data._ptr) cdef Column child_column if col.base_children: diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 5830df6f499..7770bc3efe7 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -208,6 +208,11 @@ def ptr(self) -> int: """Device pointer to the start of the buffer.""" return self._ptr + @property + def mutable_ptr(self) -> int: + """Device pointer to the start of the buffer.""" + return self._ptr + @property def owner(self) -> Any: """Object owning the memory of the buffer.""" diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py 
b/python/cudf/cudf/core/buffer/cow_buffer.py index f6c18e0ad8b..5a19035c277 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -9,7 +9,7 @@ import rmm -from cudf.core.buffer.buffer import Buffer +from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper T = TypeVar("T", bound="CopyOnWriteBuffer") @@ -42,10 +42,10 @@ class CopyOnWriteBuffer(Buffer): def _finalize_init(self): # the last step in initializing a `CopyOnWriteBuffer` # is to track it in `_instances`: - key = (self.ptr, self.size) + key = (self._ptr, self._size) self.__class__._instances[key].add(self) self._zero_copied = False - weakref.finalize(self, _keys_cleanup, self.ptr, self.size) + weakref.finalize(self, _keys_cleanup, self._ptr, self._size) @classmethod def _from_device_memory(cls: Type[T], data: Any) -> T: @@ -80,7 +80,30 @@ def _is_shared(self): """ Return `True` if `self`'s memory is shared with other columns. """ - return len(self.__class__._instances[(self.ptr, self.size)]) > 1 + return len(self.__class__._instances[(self._ptr, self._size)]) > 1 + + @property + def ptr(self) -> int: + """Device pointer to the start of the buffer.""" + self._unlink_shared_buffers(zero_copied=True) + return self._ptr + + @property + def mutable_ptr(self) -> int: + """Device pointer to the start of the buffer.""" + self._unlink_shared_buffers() + return self._ptr + + def _getitem(self, offset: int, size: int) -> Buffer: + """ + Sub-classes can overwrite this to implement __getitem__ + without having to handle non-slice inputs. 
+ """ + return self._from_device_memory( + cuda_array_interface_wrapper( + ptr=self._ptr + offset, size=size, owner=self.owner + ) + ) def copy(self, deep: bool = True): """ @@ -97,7 +120,7 @@ def copy(self, deep: bool = True): ------- Buffer """ - if not deep: + if not deep and not self._zero_copied: copied_buf = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) copied_buf._ptr = self._ptr copied_buf._size = self._size @@ -106,7 +129,7 @@ def copy(self, deep: bool = True): return copied_buf else: return self._from_device_memory( - rmm.DeviceBuffer(ptr=self.ptr, size=self.size) + rmm.DeviceBuffer(ptr=self._ptr, size=self.size) ) @property @@ -123,18 +146,32 @@ def __cuda_array_interface__(self) -> dict: self._unlink_shared_buffers(zero_copied=True) result = self._cuda_array_interface_readonly - result["data"] = (self.ptr, False) + result["data"] = (self._ptr, False) return result + @property + def _cuda_array_interface_readonly(self) -> dict: + """ + Internal Implementation for the CUDA Array Interface which is + read-only. + """ + return { + "data": (self._ptr, True), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } + def _unlink_shared_buffers(self, zero_copied=False): """ Unlinks a Buffer if it is shared with other buffers(i.e., weak references exist) by making a true deep-copy. """ - if not self._zero_copied and self._shallow_copied(): + if not self._zero_copied and self._is_shared: # make a deep copy of existing DeviceBuffer # and replace pointer to it. 
- current_buf = rmm.DeviceBuffer(ptr=self.ptr, size=self.size) + current_buf = rmm.DeviceBuffer(ptr=self._ptr, size=self._size) new_buf = current_buf.copy() self._ptr = new_buf.ptr self._size = new_buf.size diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index c21f980e599..b10c87fd0a2 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -254,6 +254,10 @@ def ptr(self) -> int: self._last_accessed = time.monotonic() return self._ptr + @property + def mutable_ptr(self) -> int: + return self.get_ptr(spill_lock=SpillLock()) + def spill_lock(self, spill_lock: SpillLock) -> None: """Spill lock the buffer diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3753df84c24..8c08ddc6a0a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -118,8 +118,6 @@ def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object """ - self._unlink_shared_buffers(zero_copied=True) - return cuda.as_cuda_array(self.data).view(self.dtype) @property @@ -155,8 +153,6 @@ def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array """ - self._unlink_shared_buffers(zero_copied=True) - return cuda.as_cuda_array(self.mask).view(mask_dtype) def __len__(self) -> int: @@ -422,8 +418,6 @@ def copy(self: T, deep: bool = True) -> T: return self.force_deep_copy() else: if cudf.get_option("copy_on_write"): - if self._is_externally_referenced: - return self.force_deep_copy() copied_col = cast( T, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d409dc79f4c..449691750d8 100644 --- 
a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -279,7 +279,6 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": @property def __cuda_array_interface__(self) -> Mapping[str, Any]: - self._unlink_shared_buffers(zero_copied=True) output = { "shape": (len(self),), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 41b6fd54ef1..37d1d7b6e61 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -164,7 +164,6 @@ def __setitem__(self, key: Any, value: Any): @property def __cuda_array_interface__(self) -> Mapping[str, Any]: - self._unlink_shared_buffers(zero_copied=True) output = { "shape": (len(self),), From ce32cf576b1c47345649c5450782c42e36fdd634 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 3 Jan 2023 10:52:32 -0800 Subject: [PATCH 090/124] cleanup --- python/cudf/cudf/_lib/column.pyi | 5 ----- python/cudf/cudf/_lib/utils.pyx | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 557a66c85c7..612f3cdf95a 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -68,11 +68,6 @@ class Column: @property def children(self) -> Tuple[ColumnBase, ...]: ... def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... - def _unlink_shared_buffers(self, zero_copied=False) -> None: ... - @property - def _is_internally_referenced(self) -> bool: ... - @property - def _is_externally_referenced(self) -> bool: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False ) -> Optional[ColumnBase]: ... diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 94a8bd83903..5f4d3e17fbc 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2022, NVIDIA CORPORATION. import numpy as np import pyarrow as pa @@ -322,6 +322,7 @@ cdef columns_from_table_view( in the table view is ``owners[i]``. For more about memory ownership, see ``Column.from_column_view``. """ + return [ Column.from_column_view( tv.column(i), owners[i] if isinstance(owners, list) else None From 25511fcd21777bcaf5e1113e2659421159137eb8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 4 Jan 2023 18:13:15 -0800 Subject: [PATCH 091/124] use getattr_static --- python/cudf/cudf/_lib/column.pyx | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 5c6a7d7abbb..56a87418f09 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. +import inspect from types import SimpleNamespace import cupy as cp @@ -206,16 +207,20 @@ cdef class Column: "The value for mask is smaller than expected, got {} bytes, " "expected " + str(required_num_bytes) + " bytes." ) + + # Check for Buffer instance, because + # hasattr will trigger invocation of + # `__cuda_array_interface__` which could + # be expensive in CopyOnWriteBuffer case. + value_cai = inspect.getattr_static( + value, + "__cuda_array_interface__", + None + ) + if value is None: mask = None - elif ( - isinstance(value, Buffer) or - # Check for Buffer instance, because - # hasattr will trigger invocation of - # `__cuda_array_interface__` which could - # be expensive. 
- hasattr(value, "__cuda_array_interface__") - ): + elif type(value_cai) is property: if isinstance(value, CopyOnWriteBuffer): value = SimpleNamespace( __cuda_array_interface__=( @@ -223,7 +228,7 @@ cdef class Column: ), owner=value ) - if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): + if value_cai.__get__(value)["typestr"] not in ("|i1", "|u1"): if isinstance(value, Column): value = value.data_array_view value = cp.asarray(value).view('|u1') From 22198a310eea77777e9b07e48ecb91d5c788a81f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 4 Jan 2023 18:15:53 -0800 Subject: [PATCH 092/124] update comment --- python/cudf/cudf/_lib/column.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 56a87418f09..d645a550c80 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -208,8 +208,7 @@ cdef class Column: "expected " + str(required_num_bytes) + " bytes." ) - # Check for Buffer instance, because - # hasattr will trigger invocation of + # Because hasattr will trigger invocation of # `__cuda_array_interface__` which could # be expensive in CopyOnWriteBuffer case. 
value_cai = inspect.getattr_static( From 80c48f1e8a73fbfda0c1462a56e41553e1f2c813 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 4 Jan 2023 18:34:15 -0800 Subject: [PATCH 093/124] use iter --- python/cudf/cudf/core/buffer/cow_buffer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 5a19035c277..1c83ddd0671 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -16,7 +16,10 @@ def _keys_cleanup(ptr, size): weak_set_values = CopyOnWriteBuffer._instances[(ptr, size)] - if len(weak_set_values) == 1 and list(weak_set_values.data)[0]() is None: + if ( + len(weak_set_values) == 1 + and next(iter(weak_set_values.data))() is None + ): # When the last remaining reference is being cleaned up we will still # have a dead weak-reference in `weak_set_values`, if that is the case # we are good to perform the key's cleanup From 0a21c83101738acdf0b7da025c009db857d29899 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 4 Jan 2023 18:49:10 -0800 Subject: [PATCH 094/124] separate setting _zero_copied from _unlink_shared_buffers --- python/cudf/cudf/core/buffer/cow_buffer.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 1c83ddd0671..9abf9b762c1 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -88,7 +88,8 @@ def _is_shared(self): @property def ptr(self) -> int: """Device pointer to the start of the buffer.""" - self._unlink_shared_buffers(zero_copied=True) + self._unlink_shared_buffers() + self._zero_copied = True return self._ptr @property @@ -138,7 +139,7 @@ def copy(self, deep: bool = True): @property def __cuda_array_interface__(self) -> dict: # Unlink if there are any weak-references. 
- + self._unlink_shared_buffers() # Mark the Buffer as ``zero_copied=True``, # which will prevent any copy-on-write # mechanism post this operation. @@ -146,8 +147,7 @@ def __cuda_array_interface__(self) -> dict: # control over knowing if a third-party library # has modified the data this Buffer is # pointing to. - self._unlink_shared_buffers(zero_copied=True) - + self._zero_copied = True result = self._cuda_array_interface_readonly result["data"] = (self._ptr, False) return result @@ -166,7 +166,7 @@ def _cuda_array_interface_readonly(self) -> dict: "version": 0, } - def _unlink_shared_buffers(self, zero_copied=False): + def _unlink_shared_buffers(self): """ Unlinks a Buffer if it is shared with other buffers(i.e., weak references exist) by making a true deep-copy. @@ -180,4 +180,3 @@ def _unlink_shared_buffers(self, zero_copied=False): self._size = new_buf.size self._owner = new_buf self._finalize_init() - self._zero_copied = zero_copied From 9d6c09e2dd077f24f519d75917542bcf6f5cafa5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 4 Jan 2023 20:50:41 -0600 Subject: [PATCH 095/124] Apply suggestions from code review Co-authored-by: Ashwin Srinath <3190405+shwina@users.noreply.github.com> --- docs/cudf/source/developer_guide/library_design.md | 2 +- python/cudf/cudf/core/buffer/cow_buffer.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 593aba3929a..b90cb5cd84d 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -324,7 +324,7 @@ However, for performance reasons they frequently access internal attributes and Copy on write is designed to reduce memory footprint on GPUs. With this feature, a copy(`.copy(deep=False)`) is only really made whenever there is a write operation on a column. 
-The core copy-on-write implementation relies in the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. +The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate weakreferences of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`, this is a defaultdict whose key-value pairs consist of `(ptr, size)` as key and `WeakSet` as value containing weakreferences to `CopyOnWriteBuffer`. This means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` & `.size` diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 5a19035c277..84381699bf6 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -40,8 +40,6 @@ class CopyOnWriteBuffer(Buffer): _zero_copied: bool def _finalize_init(self): - # the last step in initializing a `CopyOnWriteBuffer` - # is to track it in `_instances`: key = (self._ptr, self._size) self.__class__._instances[key].add(self) self._zero_copied = False @@ -165,8 +163,7 @@ def _cuda_array_interface_readonly(self) -> dict: def _unlink_shared_buffers(self, zero_copied=False): """ - Unlinks a Buffer if it is shared with other buffers(i.e., - weak references exist) by making a true deep-copy. + Unlinks a Buffer if it is shared with other buffers by making a true deep-copy. 
""" if not self._zero_copied and self._is_shared: # make a deep copy of existing DeviceBuffer From 141ca49ab911e10cf3a07f250faeccc12f410976 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 4 Jan 2023 19:43:15 -0800 Subject: [PATCH 096/124] drop extra args for as_column --- python/cudf/cudf/core/buffer/cow_buffer.py | 8 ++++++-- python/cudf/cudf/core/buffer/utils.py | 4 +++- python/cudf/cudf/core/column/column.py | 12 +----------- python/cudf/cudf/core/series.py | 4 ++-- 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 75c740f972d..443966a0082 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -49,7 +49,9 @@ def _finalize_init(self): weakref.finalize(self, _keys_cleanup, self._ptr, self._size) @classmethod - def _from_device_memory(cls: Type[T], data: Any) -> T: + def _from_device_memory( + cls: Type[T], data: Any, *, exposed: bool = False + ) -> T: """Create a Buffer from an object exposing `__cuda_array_interface__`. No data is being copied. @@ -68,6 +70,7 @@ def _from_device_memory(cls: Type[T], data: Any) -> T: # Bypass `__init__` and initialize attributes manually ret = super()._from_device_memory(data) ret._finalize_init() + ret._zero_copied = exposed return ret @classmethod @@ -166,7 +169,8 @@ def _cuda_array_interface_readonly(self) -> dict: def _unlink_shared_buffers(self): """ - Unlinks a Buffer if it is shared with other buffers by making a true deep-copy. + Unlinks a Buffer if it is shared with other buffers by + making a true deep-copy. 
""" if not self._zero_copied and self._is_shared: # make a deep copy of existing DeviceBuffer diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index e117f164963..fc67138de42 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -77,7 +77,9 @@ def as_buffer( if isinstance(data, Buffer) or hasattr( data, "__cuda_array_interface__" ): - return CopyOnWriteBuffer._from_device_memory(data) + return CopyOnWriteBuffer._from_device_memory(data, exposed=exposed) + if exposed: + raise ValueError("cannot created exposed host memory") return CopyOnWriteBuffer._from_host_memory(data) if get_global_manager() is not None: if hasattr(data, "__cuda_array_interface__"): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8c08ddc6a0a..e24fe438140 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,7 @@ ) from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer, CopyOnWriteBuffer, as_buffer +from cudf.core.buffer import Buffer, as_buffer from cudf.core.dtypes import ( CategoricalDtype, IntervalDtype, @@ -1815,8 +1815,6 @@ def as_column( nan_as_null: bool = None, dtype: Dtype = None, length: int = None, - copy: bool = False, - fastpath: bool = False, ): """Create a Column from an arbitrary object @@ -1901,14 +1899,6 @@ def as_column( data = as_buffer(arbitrary, exposed=True) col = build_column(data, dtype=current_dtype, mask=mask) - if copy: - col = col.copy(deep=True) - elif ( - fastpath - and cudf.get_option("copy_on_write") - and isinstance(col.base_data, CopyOnWriteBuffer) - ): - col.base_data._zero_copied = True if dtype is not None: col = col.astype(dtype) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5949d26a807..75aac6688f0 100644 --- a/python/cudf/cudf/core/series.py +++ 
b/python/cudf/cudf/core/series.py @@ -548,9 +548,9 @@ def __init__( data, nan_as_null=nan_as_null, dtype=dtype, - copy=copy, - fastpath=True, ) + if copy: + data = data.copy(deep=True) else: if dtype is not None: data = data.astype(dtype) From 6356765ba353b266ed5f452e28dde25fe38d1b6a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 6 Jan 2023 15:06:27 -0600 Subject: [PATCH 097/124] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- .../source/developer_guide/library_design.md | 20 +++++++++---------- docs/cudf/source/user_guide/copy-on-write.md | 14 ++++++------- python/cudf/cudf/core/column/column.py | 5 ++--- python/cudf/cudf/options.py | 2 +- 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index b90cb5cd84d..fd5f3a00ac9 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -325,14 +325,14 @@ Copy on write is designed to reduce memory footprint on GPUs. With this feature, there is a write operation on a column. The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. -With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate weakreferences of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`, -this is a defaultdict whose key-value pairs consist of `(ptr, size)` as key and `WeakSet` as value containing weakreferences to `CopyOnWriteBuffer`. This +With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate weakreferences of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. +This is a defaultdict whose key-value pairs consist of `(ptr, size)` as key and `WeakSet` as value containing weakreferences to `CopyOnWriteBuffer`. 
This means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` & `.size` i.e., if they are all pointing to the same device memory. When the cudf option ``copy_on_write`` is ``True``, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the -mechanism to enable copy-on-write for all Buffers. When a `CopyOnWriteBuffer` is created, it's weak-reference is generated and added to the `WeakSet` which is inturn stored in the `defaultdict`. This will later serve as an indication whether or not to make a copy when a -write operation is being performed on `Column`(more on that below). +mechanisms to enable copy-on-write for all Buffers. When a `CopyOnWriteBuffer` is created, its weakref is generated and added to the `WeakSet` which is in turn stored in the `defaultdict`. This will later serve as an indication of whether or not to make a copy when a +write operation is performed on `Column`(more on that below). There is a case when copy-on-write will be inactive and return true copies even though the cudf option `copy_on_write` is `True`: @@ -348,7 +348,7 @@ rather than a copy-on-write shallow copy with weak-references. Notes: 1. Weak-references are implemented only for fixed-width data types as these are only column -types that can be mutated in-place. +types that can be mutated in place. 2. Deep copies of variable width data types return shallow-copies of the Columns, because these types don't support real in-place mutations to the data. We just mimic in such a way that it looks like an in-place operation. @@ -356,9 +356,9 @@ like an in-place operation. ### Examples -When copy-on-write is enabled, take a shallow copy of a `Series` or a `DataFrame` will not -eagerly create a copy of the data. But instead, it produces a view which will be lazily -copied when a write operation is performed on any of it's copies. 
+When copy-on-write is enabled, taking a shallow copy of a `Series` or a `DataFrame` will not +eagerly create a copy of the data. Instead, it will produce a view that will be lazily +copied when a write operation is performed on any of its copies. Let's create a series: @@ -407,7 +407,7 @@ dtype: int64 139796315897856 ``` -Now, when we perform a write operation on one of them, say on `s2`. A new copy is created +Now, when we perform a write operation on one of them, say on `s2`, a new copy is created for `s2` on device and then modified: ```python @@ -468,7 +468,7 @@ dtype: int64 dtype: int64 ``` -If we inspect the memory address of the data, `s2` & `s3` addresses will remain un-touched, but `s1` memory address will change because of a copy operation performed during the writing: +If we inspect the memory address of the data, `s2` & `s3` addresses will remain untouched, but `s1` memory address will change because of a copy operation performed during the writing: ```python >>> s2.data.ptr diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 0eeef4cac60..5b128b9fb0e 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -1,22 +1,22 @@ # Copy on write -Copy on write enables ability to save on GPU memory usage when copies(`.copy(deep=False)`) of a column +Copy on write reduces GPU memory usage when copies(`.copy(deep=False)`) of a column are made. ## How to enable it -i. Either by using `set_option` in `cudf`: +i. Use `cudf.set_option`: ```python >>> import cudf >>> cudf.set_option("copy_on_write", True) ``` -ii. Or, by setting an environment variable ``CUDF_COPY_ON_WRITE`` to ``1`` prior to the -launch of the python interpreter: +ii. 
Set the environment variable ``CUDF_COPY_ON_WRITE`` to ``1`` prior to the +launch of the Python interpreter: ```bash -export CUDF_COPY_ON_WRITE="1" +export CUDF_COPY_ON_WRITE="1" python -c "import cudf" ``` @@ -28,7 +28,7 @@ There are no additional changes required in the code to make use of copy-on-writ >>> series = cudf.Series([1, 2, 3, 4]) ``` -Performing a shallow copy will create a new series object but pointing to the +Performing a shallow copy will create a new Series object pointing to the same underlying device memory: ```python @@ -51,7 +51,7 @@ dtype: int64 140102175031296 ``` -But, when there is a write-operation being performed on either ``series`` or +Then, when a write operation is performed on either ``series`` or ``copied_series``, a true physical copy of the data is created: ```python diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e24fe438140..4fb82472b61 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -402,9 +402,8 @@ def _nullmask(self) -> Buffer: def force_deep_copy(self: T) -> T: """ - A method to force create a deep-copy of - a Column irrespective of `copy-on-write` - is enable/disabled. + A method to create deep copy irrespective of whether + `copy-on-write` is enable. """ result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index d9c07155813..8d06a859f82 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -255,7 +255,7 @@ def _integer_and_none_validator(val): _register_option( "copy_on_write", - os.environ.get("CUDF_COPY_ON_WRITE", "0") == "1", + _env_get_bool("CUDF_COPY_ON_WRITE", False), textwrap.dedent( """ Default behavior of performing shallow copies. 
From 56eb8091cb675a1c86ddd9f718d27d7df9c24118 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 6 Jan 2023 13:06:30 -0800 Subject: [PATCH 098/124] rename copy on write for consistency --- docs/cudf/source/developer_guide/library_design.md | 4 ++-- docs/cudf/source/user_guide/copy-on-write.md | 6 +++--- python/cudf/cudf/options.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index b90cb5cd84d..87d17c1fac0 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -318,10 +318,10 @@ Internally, these objects typically interact with cuDF objects at the Frame laye However, for performance reasons they frequently access internal attributes and methods of `Frame` and its subclasses. -## Copy on write +## Copy-on-write -Copy on write is designed to reduce memory footprint on GPUs. With this feature, a copy(`.copy(deep=False)`) is only really made whenever +Copy-on-write(COW) is designed to reduce memory footprint on GPUs. With this feature, a copy(`.copy(deep=False)`) is only really made whenever there is a write operation on a column. The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 0eeef4cac60..115f6077dda 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -1,6 +1,6 @@ -# Copy on write +# Copy-on-write -Copy on write enables ability to save on GPU memory usage when copies(`.copy(deep=False)`) of a column +Copy-on-write(COW) enables ability to save on GPU memory usage when copies(`.copy(deep=False)`) of a column are made. ## How to enable it @@ -99,7 +99,7 @@ write operations on all of those copies. 
This will also increase the speed at wh ## How to disable it -Copy on write can be disable by setting ``copy_on_write`` cudf option to ``False``: +Copy-on-write can be disable by setting ``copy_on_write`` cudf option to ``False``: ```python >>> cudf.set_option("copy_on_write", False) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index d9c07155813..2d38ca2977e 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -154,7 +154,7 @@ def _make_cow_validator(valid_options): def _validator(val): if get_option("spill") and val: raise ValueError( - "Copy on write is not supported when spilling is enabled. " + "Copy-on-write is not supported when spilling is enabled. " "Please set `spill` to `False`" ) if val not in valid_options: @@ -171,7 +171,7 @@ def _validator(val): try: if get_option("copy_on_write") and val: raise ValueError( - "Spilling is not supported when copy on write is enabled. " + "Spilling is not supported when Copy-on-write is enabled. " "Please set `copy_on_write` to `False`" ) except KeyError: From da343c5eafda64137c4cfc2327ca9cbf1cc0ae73 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 08:59:53 -0800 Subject: [PATCH 099/124] address reviews --- python/cudf/cudf/_lib/column.pyx | 10 ++-------- python/cudf/cudf/core/buffer/buffer.py | 15 ++++++++------- python/cudf/cudf/core/buffer/cow_buffer.py | 15 ++++++++------- python/cudf/cudf/core/column/column.py | 22 +++++++++------------- python/cudf/cudf/core/column/lists.py | 9 +-------- python/cudf/cudf/core/column/string.py | 9 +-------- python/cudf/cudf/core/column/struct.py | 9 +-------- python/cudf/cudf/tests/test_multiindex.py | 4 ++-- 8 files changed, 32 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index d645a550c80..9fdbb031271 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,7 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import inspect -from types import SimpleNamespace import cupy as cp import numpy as np @@ -221,13 +220,8 @@ cdef class Column: mask = None elif type(value_cai) is property: if isinstance(value, CopyOnWriteBuffer): - value = SimpleNamespace( - __cuda_array_interface__=( - value._cuda_array_interface_readonly - ), - owner=value - ) - if value_cai.__get__(value)["typestr"] not in ("|i1", "|u1"): + value = value._cuda_array_interface_readonly + if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): if isinstance(value, Column): value = value.data_array_view value = cp.asarray(value).view('|u1') diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 7770bc3efe7..349d367dce7 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -237,13 +237,14 @@ def _cuda_array_interface_readonly(self) -> dict: Internal Implementation for the CUDA Array Interface which is read-only. """ - return { - "data": (self.ptr, True), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } + return cuda_array_interface_wrapper( + ptr=self.ptr, + size=self.size, + owner=self, + readonly=True, + typestr="|u1", + version=0, + ) def memoryview(self) -> memoryview: """Read-only access to the buffer through host memory.""" diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 443966a0082..212f42fa9c7 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -159,13 +159,14 @@ def _cuda_array_interface_readonly(self) -> dict: Internal Implementation for the CUDA Array Interface which is read-only. 
""" - return { - "data": (self._ptr, True), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } + return cuda_array_interface_wrapper( + ptr=self._ptr, + size=self.size, + owner=self, + readonly=True, + typestr="|u1", + version=0, + ) def _unlink_shared_buffers(self): """ diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 85e252172fc..e3f5c8ec7dd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -126,13 +126,11 @@ def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": Internal implementation for viewing the data as a device array object without triggering a deep-copy. """ - arr_obj = SimpleNamespace( - __cuda_array_interface__=self.data._cuda_array_interface_readonly + return cuda.as_cuda_array( + self.data._cuda_array_interface_readonly if self.data is not None - else None, - owner=self.data, - ) - return cuda.as_cuda_array(arr_obj).view(self.dtype) + else None + ).view(self.dtype) @property def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -140,13 +138,11 @@ def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": Internal implementation for viewing the mask as a device array object without triggering a deep-copy. 
""" - arr_obj = SimpleNamespace( - __cuda_array_interface__=self.mask._cuda_array_interface_readonly + return cuda.as_cuda_array( + self.mask._cuda_array_interface_readonly if self.mask is not None - else None, - owner=self.mask, - ) - return cuda.as_cuda_array(arr_obj).view(mask_dtype) + else None + ).view(mask_dtype) @property def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -431,7 +427,7 @@ def copy(self: T, deep: bool = True) -> T: size=self.size, offset=self.offset, children=tuple( - col.copy(deep=True) for col in self.base_children + col.copy(deep=False) for col in self.base_children ), ), ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6b68c463fd6..aa407e06433 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -204,14 +204,7 @@ def copy(self, deep: bool = True): and shallow copies share the underlying device data and mask. """ - return column.build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ) + return super().copy(deep=False) def leaves(self): if isinstance(self.elements, ListColumn): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7b8f85ab06a..ed97c7af6ae 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5258,14 +5258,7 @@ def copy(self, deep: bool = True): device data and mask. 
""" - return column.build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ) + return super().copy(deep=False) @property def start_offset(self) -> int: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 36bcc6f8aab..b1300af031c 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -101,14 +101,7 @@ def copy(self, deep=True): device data and mask. """ - result = cudf.core.column.build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ) + result = super().copy(deep=False) if deep: result = result._rename_fields(self.dtype.fields.keys()) return result diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index c53941a75de..ee7c10d607a 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -806,8 +806,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): lchildren = reduce(operator.add, lchildren) rchildren = reduce(operator.add, rchildren) - lptrs = [child.base_data.ptr for child in lchildren] - rptrs = [child.base_data.ptr for child in rchildren] + lptrs = [child.base_data._ptr for child in lchildren] + rptrs = [child.base_data._ptr for child in rchildren] assert all((x == y) for x, y in zip(lptrs, rptrs)) From 751294b9aca2da074cb5e313d9d2667bb2c4afde Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 10:03:01 -0800 Subject: [PATCH 100/124] fix categorical copy --- python/cudf/cudf/core/column/categorical.py | 10 +++++++++- python/cudf/cudf/testing/_utils.py | 5 +++++ python/cudf/cudf/tests/test_index.py | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 
646db531fb6..4b53c3ccd92 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -733,7 +733,7 @@ def categories(self) -> ColumnBase: @categories.setter def categories(self, value): - self.dtype = CategoricalDtype( + self._dtype = CategoricalDtype( categories=value, ordered=self.dtype.ordered ) @@ -1274,6 +1274,14 @@ def _get_decategorized_column(self) -> ColumnBase: out = out.set_mask(self.mask) return out + def copy(self, deep: bool = True) -> CategoricalColumn: + result_col = super().copy(deep=deep) + if deep: + result_col.categories = libcudf.copying.copy_column( + self.dtype._categories + ) + return result_col + @cached_property def memory_usage(self) -> int: return self.categories.memory_usage + self.codes.memory_usage diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index cbaf47a4c68..c27e1eaab36 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -346,6 +346,11 @@ def get_ptr(x) -> int: assert len(lhs.base_children) == len(rhs.base_children) for lhs_child, rhs_child in zip(lhs.base_children, rhs.base_children): assert_column_memory_eq(lhs_child, rhs_child) + if isinstance(lhs, cudf.core.column.CategoricalColumn) and isinstance( + rhs, cudf.core.column.CategoricalColumn + ): + assert_column_memory_eq(lhs.categories, rhs.categories) + assert_column_memory_eq(lhs.codes, rhs.codes) def assert_column_memory_ne( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 77913f3e459..32d83bb1c83 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -392,6 +392,7 @@ def test_index_copy_category(name, dtype, deep=True): with pytest.warns(FutureWarning): cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + assert_column_memory_ne(cidx._values, cidx_copy._values) assert_eq(pidx_copy, cidx_copy) From c0fe9554ef333f58efe0a2d1f7f44418e1a13c09 Mon Sep 17 
00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 10:27:42 -0800 Subject: [PATCH 101/124] refactor option validators --- python/cudf/cudf/options.py | 54 ++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index d543f28f63b..2e86e1d6001 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -150,39 +150,31 @@ def _validator(val): return _validator -def _make_cow_validator(valid_options): - def _validator(val): - if get_option("spill") and val: - raise ValueError( - "Copy-on-write is not supported when spilling is enabled. " - "Please set `spill` to `False`" - ) - if val not in valid_options: - raise ValueError( - f"{val} is not a valid option. " - f"Must be one of {set(valid_options)}." - ) - - return _validator +def _cow_validator(val): + if get_option("spill") and val: + raise ValueError( + "Copy-on-write is not supported when spilling is enabled. " + "Please set `spill` to `False`" + ) + if val not in {False, True}: + raise ValueError( + f"{val} is not a valid option. " f"Must be one of {{False, True}}." + ) -def _make_spill_validator(valid_options): - def _validator(val): - try: - if get_option("copy_on_write") and val: - raise ValueError( - "Spilling is not supported when Copy-on-write is enabled. " - "Please set `copy_on_write` to `False`" - ) - except KeyError: - pass - if val not in valid_options: +def _spill_validator(val): + try: + if get_option("copy_on_write") and val: raise ValueError( - f"{val} is not a valid option. " - f"Must be one of {set(valid_options)}." + "Spilling is not supported when Copy-on-write is enabled. " + "Please set `copy_on_write` to `False`" ) - - return _validator + except KeyError: + pass + if val not in {False, True}: + raise ValueError( + f"{val} is not a valid option. " f"Must be one of {{False, True}}." 
+ ) def _integer_validator(val): @@ -249,7 +241,7 @@ def _integer_and_none_validator(val): \tValid values are True or False. Default is False. """ ), - _make_spill_validator([False, True]), + _spill_validator, ) @@ -267,7 +259,7 @@ def _integer_and_none_validator(val): \tValid values are True or False. Default is False. """ ), - _make_cow_validator([False, True]), + _cow_validator, ) From 2237af2e97790218d1f1a82381bdd033625fee8c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 10:28:10 -0800 Subject: [PATCH 102/124] fix --- python/cudf/cudf/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 2e86e1d6001..d98b194722a 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -158,7 +158,7 @@ def _cow_validator(val): ) if val not in {False, True}: raise ValueError( - f"{val} is not a valid option. " f"Must be one of {{False, True}}." + f"{val} is not a valid option. Must be one of {{False, True}}." ) @@ -173,7 +173,7 @@ def _spill_validator(val): pass if val not in {False, True}: raise ValueError( - f"{val} is not a valid option. " f"Must be one of {{False, True}}." + f"{val} is not a valid option. Must be one of {{False, True}}." 
) From e2900d82dda9d61ddcfe6205e1fc520893237d0a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 10:47:05 -0800 Subject: [PATCH 103/124] update comments --- python/cudf/cudf/core/buffer/buffer.py | 4 +--- python/cudf/cudf/core/column/column.py | 3 +++ python/cudf/cudf/core/column/lists.py | 7 ++----- python/cudf/cudf/core/column/string.py | 8 ++------ python/cudf/cudf/core/column/struct.py | 8 ++------ 5 files changed, 10 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 349d367dce7..f86bd1be229 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -220,9 +220,7 @@ def owner(self) -> Any: @property def __cuda_array_interface__(self) -> dict: - """ - Implementation for the CUDA Array Interface. - """ + """Implementation for the CUDA Array Interface.""" return { "data": (self.ptr, False), "shape": (self.size,), diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e3f5c8ec7dd..449389aa5b4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -408,6 +408,9 @@ def copy(self: T, deep: bool = True) -> T: """Columns are immutable, so a deep copy produces a copy of the underlying data and mask and a shallow copy creates a new column and copies the references of the data and mask. + + Note : Only Fixed width columns are mutable i.e., which support + creation of a `mutable_view`. """ if deep: return self.force_deep_copy() diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index aa407e06433..92bb0d2e4c5 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -199,11 +199,8 @@ def _with_type_metadata( return self def copy(self, deep: bool = True): - """ - List columns are immutable, so both deep - and shallow copies share the underlying - device data and mask. 
- """ + # Since list columns are immutable, both deep and shallow copies share + # the underlying device data and mask. return super().copy(deep=False) def leaves(self): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ed97c7af6ae..da717edf0e2 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5252,12 +5252,8 @@ def __init__( self._end_offset = None def copy(self, deep: bool = True): - """ - String columns are immutable, so both deep - and shallow copies share the underlying - device data and mask. - """ - + # Since string columns are immutable, both deep + # and shallow copies share the underlying device data and mask. return super().copy(deep=False) @property diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index b1300af031c..6838d711641 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -95,12 +95,8 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def copy(self, deep=True): - """ - Struct columns are immutable, so both deep - and shallow copies share the underlying - device data and mask. - """ - + # Since struct columns are immutable, both deep and + # shallow copies share the underlying device data and mask. 
result = super().copy(deep=False) if deep: result = result._rename_fields(self.dtype.fields.keys()) From 5cacc1f504f849845e7d6f93b8a298ea03d8cb07 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 12:39:17 -0800 Subject: [PATCH 104/124] rename --- docs/cudf/source/developer_guide/library_design.md | 4 ++-- docs/cudf/source/user_guide/copy-on-write.md | 8 ++++---- python/cudf/cudf/core/buffer/cow_buffer.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index a7d15b810a3..fc944773028 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -343,11 +343,11 @@ someone accesses `__cuda_array_interface__` of `Column` or a `CopyOnWriteBuffer` `Column/Buffer._unlink_shared_buffers` which will ensure a true copy of underlying device data is made and unlinks itself from pointing to the original device memory. We also mark the `Column`/`CopyOnWriteBuffer` as `obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy -rather than a copy-on-write shallow copy with weak-references. +rather than a copy-on-write shallow copy with weak references. Notes: -1. Weak-references are implemented only for fixed-width data types as these are only column +1. Weak references are implemented only for fixed-width data types as these are only column types that can be mutated in place. 2. Deep copies of variable width data types return shallow-copies of the Columns, because these types don't support real in-place mutations to the data. 
We just mimic in such a way that it looks diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 4685961701e..c7001c835b3 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -45,9 +45,9 @@ dtype: int64 2 3 3 4 dtype: int64 ->>> series.data.ptr +>>> series.data._ptr 140102175031296 ->>> copied_series.data.ptr +>>> copied_series.data._ptr 140102175031296 ``` @@ -82,8 +82,8 @@ different device objects: ````{Warning} When ``copy_on_write`` is enabled, all of the shallow copies are constructed with -weak-references, and it is recommended to not hand-construct the contents of `__cuda_array_interface__`, instead please use the `series.__cuda_array_interface__` -or `series.data.__cuda_array_interface__` which will then take care of unlinking any existing weak-references that a column contains. +weak references, and it is recommended to not hand-construct the contents of `__cuda_array_interface__`, instead please use the `series.__cuda_array_interface__` +or `series.data.__cuda_array_interface__` which will then take care of unlinking any existing weak references that a column contains. ```` ## Notes diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 212f42fa9c7..d5f37675db3 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -139,7 +139,7 @@ def copy(self, deep: bool = True): @property def __cuda_array_interface__(self) -> dict: - # Unlink if there are any weak-references. + # Unlink if there are any weak references. 
self._unlink_shared_buffers() # Mark the Buffer as ``zero_copied=True``, # which will prevent any copy-on-write From 4387eacacf10dec97243891482b4c1c9da1439b4 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 12:42:28 -0800 Subject: [PATCH 105/124] add more clarification --- docs/cudf/source/developer_guide/library_design.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index fc944773028..8182f1d22c8 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -351,7 +351,7 @@ Notes: types that can be mutated in place. 2. Deep copies of variable width data types return shallow-copies of the Columns, because these types don't support real in-place mutations to the data. We just mimic in such a way that it looks -like an in-place operation. +like an in-place operation using `ColumnBase\Series\DataFrame._mimic_inplace`. ### Examples From 21f93fb2ca007c38936804e5e2baa7d741372871 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 12:59:38 -0800 Subject: [PATCH 106/124] add to advantages --- docs/cudf/source/user_guide/copy-on-write.md | 39 +++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index c7001c835b3..187f4bfa5a9 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -93,8 +93,45 @@ the original object it was viewing and thus a separate copy is created and then ## Advantages -With copy-on-write enabled and by requesting `.copy(deep=False)`, the GPU memory usage can be reduced drastically if you are not performing +1. 
With copy-on-write enabled and by requesting `.copy(deep=False)`, the GPU memory usage can be reduced drastically if you are not performing write operations on all of those copies. This will also increase the speed at which objects are created for execution of your ETL workflow. +2. With the concept of views going away, every object is a copy of it's original object. This will bring consistency across operations and cudf closer to parity with +pandas. Following is one of the inconsistency: + +```python + +>>> import pandas as pd +>>> s = pd.Series([1, 2, 3, 4, 5]) +>>> s_view = s[0:2] +>>> s_view[0] = 10 +>>> s_view +0 10 +1 2 +dtype: int64 +>>> s +0 10 +1 2 +2 3 +3 4 +4 5 +dtype: int64 + +>>> import cudf +>>> s = cudf.Series([1, 2, 3, 4, 5]) +>>> s_view = s[0:2] +>>> s_view[0] = 10 +>>> s_view +0 10 +1 2 +>>> s +0 1 +1 2 +2 3 +3 4 +4 5 +dtype: int64 +``` + ## How to disable it From 662756ae438cf8ee2efa9fbff67d65cc705414bc Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 13:07:52 -0800 Subject: [PATCH 107/124] add weakref url --- docs/cudf/source/developer_guide/library_design.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 8182f1d22c8..b16d46ef00b 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -325,7 +325,7 @@ Copy-on-write(COW) is designed to reduce memory footprint on GPUs. With this fea there is a write operation on a column. The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. -With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate weakreferences of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. 
+With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate [weakreferences](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. This is a defaultdict whose key-value pairs consist of `(ptr, size)` as key and `WeakSet` as value containing weakreferences to `CopyOnWriteBuffer`. This means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` & `.size` i.e., if they are all pointing to the same device memory. From e78871ccd0024d6c65916e6267df1e9a74343b57 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 13:51:51 -0800 Subject: [PATCH 108/124] Handle slice operation properly --- python/cudf/cudf/core/buffer/cow_buffer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index d5f37675db3..36d42abd7df 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -101,12 +101,19 @@ def mutable_ptr(self) -> int: def _getitem(self, offset: int, size: int) -> Buffer: """ - Sub-classes can overwrite this to implement __getitem__ - without having to handle non-slice inputs. + Helper for `__getitem__` + + Returns the same underlying memory pointer if offset is 0 + and size == self.size, else makes a copy to return the + slice. 
""" - return self._from_device_memory( + if offset != 0 or self.size != size: + buf = self.copy(deep=True) + else: + buf = self + return buf._from_device_memory( cuda_array_interface_wrapper( - ptr=self._ptr + offset, size=size, owner=self.owner + ptr=buf._ptr + offset, size=size, owner=buf.owner ) ) From d02a9365d6b476186d19f9c354d8fea01eca8b4e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 9 Jan 2023 14:05:46 -0800 Subject: [PATCH 109/124] add a table --- docs/cudf/source/developer_guide/library_design.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index b16d46ef00b..ce1fe88ab93 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -354,6 +354,12 @@ types don't support real in-place mutations to the data. We just mimic in such a like an in-place operation using `ColumnBase\Series\DataFrame._mimic_inplace`. +| | Copy-on-Write is `ON` | Copy-on-Write is `OFF` | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `.copy(deep=True)` | A true physical copies are made and changes don't propagate to the original object. | A true physical copies are made and changes don't propagate to the original object. | +| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy and then the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes done to one will propagate to the other object. 
| + + ### Examples When copy-on-write is enabled, taking a shallow copy of a `Series` or a `DataFrame` will not From 3f05a635976fb48c4df9bf617ea5a784fd460df1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 11 Jan 2023 15:08:30 -0600 Subject: [PATCH 110/124] Apply suggestions from code review Co-authored-by: Lawrence Mitchell Co-authored-by: Mads R. B. Kristensen --- .../source/developer_guide/library_design.md | 43 +++++++++---------- python/cudf/cudf/core/buffer/buffer.py | 6 +-- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index ce1fe88ab93..91f0d80e9c6 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -321,27 +321,26 @@ However, for performance reasons they frequently access internal attributes and ## Copy-on-write -Copy-on-write(COW) is designed to reduce memory footprint on GPUs. With this feature, a copy(`.copy(deep=False)`) is only really made whenever +Copy-on-write (COW) is designed to reduce memory footprint on GPUs. With this feature, a copy (`.copy(deep=False)`) is only really made whenever there is a write operation on a column. The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. -With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate [weakreferences](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. -This is a defaultdict whose key-value pairs consist of `(ptr, size)` as key and `WeakSet` as value containing weakreferences to `CopyOnWriteBuffer`. This +With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate [weak references](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. 
+This is a mapping from `(ptr, size)` keys to `WeakSet`s containing references to `CopyOnWriterBuffer` objects. This means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` & `.size` i.e., if they are all pointing to the same device memory. -When the cudf option ``copy_on_write`` is ``True``, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the -mechanisms to enable copy-on-write for all Buffers. When a `CopyOnWriteBuffer` is created, its weakref is generated and added to the `WeakSet` which is in turn stored in the `defaultdict`. This will later serve as an indication of whether or not to make a copy when a -write operation is performed on `Column`(more on that below). +When the cudf option `"copy_on_write"` is `True`, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the +mechanisms to enable copy-on-write for all buffers. When a `CopyOnWriteBuffer` is created, its weakref is generated and added to the `WeakSet` which is in turn stored in `CopyOnWriterBuffer._instances`. This will later serve as an indication of whether or not to make a copy when a +when write operation is performed on a `Column` (see below). -There is a case when copy-on-write will be inactive and return true copies even though the cudf option `copy_on_write` is `True`: +### Eager copies when exposing to third-party libraries -Whenever a `Column`/`CopyOnWriteBuffer` are zero-copied to a third-party library via `__cuda_array_interface__`, it -is technically not possible to know if the device data is modified without introspection. Hence whenever -someone accesses `__cuda_array_interface__` of `Column` or a `CopyOnWriteBuffer`, we trigger -`Column/Buffer._unlink_shared_buffers` which will ensure a true copy of underlying device data is made and -unlinks itself from pointing to the original device memory. 
We also mark the `Column`/`CopyOnWriteBuffer` as +If `Column`/`CopyOnWriteBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred without introspection. Hence whenever +someone accesses data through the `__cuda_array_interface__`, we eagerly trigger the copy by calling +`_unlink_shared_buffers` which ensures a true copy of underlying device data is made and +unlinks the buffer from any shared "weak" references. Any future shallow-copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object), to handle this we also mark the `Column`/`CopyOnWriteBuffer` as `obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy with weak references. @@ -351,19 +350,19 @@ Notes: types that can be mutated in place. 2. Deep copies of variable width data types return shallow-copies of the Columns, because these types don't support real in-place mutations to the data. We just mimic in such a way that it looks -like an in-place operation using `ColumnBase\Series\DataFrame._mimic_inplace`. +like an in-place operation using `_mimic_inplace`. -| | Copy-on-Write is `ON` | Copy-on-Write is `OFF` | +| | Copy-on-Write enabled | Copy-on-Write disabled (default) | |---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| -| `.copy(deep=True)` | A true physical copies are made and changes don't propagate to the original object. | A true physical copies are made and changes don't propagate to the original object. 
| -| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy and then the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes done to one will propagate to the other object. | +| `.copy(deep=True)` | A true copy is made and changes don't propagate to the original object. | A true copy is made and changes don't propagate to the original object. | +| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy before the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes performed on one will propagate to the other object. | ### Examples -When copy-on-write is enabled, taking a shallow copy of a `Series` or a `DataFrame` will not -eagerly create a copy of the data. Instead, it will produce a view that will be lazily +When copy-on-write is enabled, taking a shallow copy of a `Series` or a `DataFrame` does not +eagerly create a copy of the data. Instead, it produces a view that will be lazily copied when a write operation is performed on any of its copies. 
Let's create a series: @@ -384,7 +383,7 @@ Make another copy, but of `s2`: >>> s3 = s2.copy(deep=False) ``` -Viewing the data & memory addresses show that they all point to the same device memory: +Viewing the data and memory addresses show that they all point to the same device memory: ```python >>> s1 0 1 @@ -438,7 +437,7 @@ dtype: int64 dtype: int64 ``` -If we inspect the memory address of the data, `s1` & `s3` will still share the same address but `s2` will have a new one: +If we inspect the memory address of the data, `s1` and `s3` still share the same address but `s2` has a new one: ```python >>> s1.data.ptr @@ -450,7 +449,7 @@ If we inspect the memory address of the data, `s1` & `s3` will still share the s ``` Now, performing write operation on `s1` will trigger a new copy on device memory as there -is a weakreference being shared in `s3`: +is a weak reference being shared in `s3`: ```python >>> s1[0:2] = 11 @@ -474,7 +473,7 @@ dtype: int64 dtype: int64 ``` -If we inspect the memory address of the data, `s2` & `s3` addresses will remain untouched, but `s1` memory address will change because of a copy operation performed during the writing: +If we inspect the memory address of the data, the addresses of `s2` and `s3` remain unchanged, but `s1`'s memory address has changed because of a copy operation performed during the writing: ```python >>> s2.data.ptr diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index f86bd1be229..ac991ee979a 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -178,8 +178,8 @@ def copy(self, deep: bool = True): Parameters ---------- deep : bool, default True - If True, returns a deep-copy of the underlying Buffer data. - If False, returns a shallow-copy of the Buffer pointing to + If True, returns a deep copy of the underlying Buffer data. + If False, returns a shallow copy of the Buffer pointing to the same underlying data. 
Returns @@ -222,7 +222,7 @@ def owner(self) -> Any: def __cuda_array_interface__(self) -> dict: """Implementation for the CUDA Array Interface.""" return { - "data": (self.ptr, False), + "data": (self.ptr, True), "shape": (self.size,), "strides": None, "typestr": "|u1", From 59ac57ebb673a3b179fb00fd472555ead6b2d6a6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 11 Jan 2023 13:47:33 -0800 Subject: [PATCH 111/124] Add self._instances --- python/cudf/cudf/core/buffer/cow_buffer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 36d42abd7df..7368a0427aa 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -45,6 +45,7 @@ class CopyOnWriteBuffer(Buffer): def _finalize_init(self): key = (self._ptr, self._size) self.__class__._instances[key].add(self) + self._instances = self.__class__._instances[key] self._zero_copied = False weakref.finalize(self, _keys_cleanup, self._ptr, self._size) @@ -84,7 +85,7 @@ def _is_shared(self): """ Return `True` if `self`'s memory is shared with other columns. """ - return len(self.__class__._instances[(self._ptr, self._size)]) > 1 + return len(self._instances) > 1 @property def ptr(self) -> int: From 2a2876fc02bcc9253fdb7921bbd242954aa5c0da Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 11 Jan 2023 13:48:45 -0800 Subject: [PATCH 112/124] Add docstring --- python/cudf/cudf/core/buffer/cow_buffer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 7368a0427aa..e5f2465a526 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -61,6 +61,8 @@ def _from_device_memory( ---------- data : device-buffer-like An object implementing the CUDA Array Interface. + exposed : bool, optional + Mark the buffer as zero copied. 
Returns ------- From 2fcb1f0372501d56ff5dcb9e91caa9d4d1df75f8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 11 Jan 2023 14:41:37 -0800 Subject: [PATCH 113/124] Add _get_cuda_array_interface --- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/core/buffer/buffer.py | 7 +++++-- python/cudf/cudf/core/buffer/cow_buffer.py | 15 +++++++++++---- python/cudf/cudf/core/column/column.py | 4 ++-- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 9fdbb031271..636d06dfb81 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -220,7 +220,7 @@ cdef class Column: mask = None elif type(value_cai) is property: if isinstance(value, CopyOnWriteBuffer): - value = value._cuda_array_interface_readonly + value = value._get_readonly_proxy_obj if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): if isinstance(value, Column): value = value.data_array_view diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index ac991ee979a..e98b2bb6fbe 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -221,8 +221,11 @@ def owner(self) -> Any: @property def __cuda_array_interface__(self) -> dict: """Implementation for the CUDA Array Interface.""" + return self._get_cuda_array_interface(readonly=False) + + def _get_cuda_array_interface(self, readonly=False): return { - "data": (self.ptr, True), + "data": (self.ptr, readonly), "shape": (self.size,), "strides": None, "typestr": "|u1", @@ -230,7 +233,7 @@ def __cuda_array_interface__(self) -> dict: } @property - def _cuda_array_interface_readonly(self) -> dict: + def _get_readonly_proxy_obj(self) -> dict: """ Internal Implementation for the CUDA Array Interface which is read-only. 
diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index e5f2465a526..34cf953faa1 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -159,12 +159,19 @@ def __cuda_array_interface__(self) -> dict: # has modified the data this Buffer is # pointing to. self._zero_copied = True - result = self._cuda_array_interface_readonly - result["data"] = (self._ptr, False) - return result + return self._get_cuda_array_interface(readonly=False) + + def _get_cuda_array_interface(self, readonly=False): + return { + "data": (self._ptr, readonly), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } @property - def _cuda_array_interface_readonly(self) -> dict: + def _get_readonly_proxy_obj(self) -> dict: """ Internal Implementation for the CUDA Array Interface which is read-only. diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 449389aa5b4..c6be6522665 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -127,7 +127,7 @@ def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": without triggering a deep-copy. """ return cuda.as_cuda_array( - self.data._cuda_array_interface_readonly + self.data._get_readonly_proxy_obj if self.data is not None else None ).view(self.dtype) @@ -139,7 +139,7 @@ def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": without triggering a deep-copy. 
""" return cuda.as_cuda_array( - self.mask._cuda_array_interface_readonly + self.mask._get_readonly_proxy_obj if self.mask is not None else None ).view(mask_dtype) From 201d423984a1c3f87a0e1ac979e70716a95dcad2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 11 Jan 2023 16:03:46 -0800 Subject: [PATCH 114/124] skip copy for host objects --- python/cudf/cudf/core/series.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 659e4ab452b..74dfb69593e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -544,12 +544,20 @@ def __init__( data = {} if not isinstance(data, ColumnBase): + has_cai = ( + type( + inspect.getattr_static( + data, "__cuda_array_interface__", None + ) + ) + is property + ) data = column.as_column( data, nan_as_null=nan_as_null, dtype=dtype, ) - if copy: + if copy and has_cai: data = data.copy(deep=True) else: if dtype is not None: From f123ad813e4dc2e3c11303707c07b64916b62d8e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Jan 2023 10:39:45 -0800 Subject: [PATCH 115/124] simplify copy --- python/cudf/cudf/core/column/column.py | 51 ++++++++++---------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c6be6522665..228e902bac9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -415,38 +415,25 @@ def copy(self: T, deep: bool = True) -> T: if deep: return self.force_deep_copy() else: - if cudf.get_option("copy_on_write"): - - copied_col = cast( - T, - build_column( - data=self.base_data - if self.base_data is None - else self.base_data.copy(deep=deep), - dtype=self.dtype, - mask=self.base_mask - if self.base_mask is None - else self.base_mask.copy(deep=deep), - size=self.size, - offset=self.offset, - children=tuple( - col.copy(deep=False) for col in 
self.base_children - ), - ), - ) - return copied_col - else: - return cast( - T, - build_column( - self.base_data, - self.dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - children=self.base_children, - ), - ) + return cast( + T, + build_column( + data=self.base_data + if self.base_data is None + else self.base_data.copy(deep=False), + dtype=self.dtype, + mask=self.base_mask + if self.base_mask is None + else self.base_mask.copy(deep=False), + size=self.size, + offset=self.offset, + children=tuple( + col.copy(deep=False) for col in self.base_children + ) + if cudf.get_option("copy_on_write") + else self.base_children, + ), + ) def view(self, dtype: Dtype) -> ColumnBase: """ From 3f65c7a9de4e011040b175b4adf8323e0804b996 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Jan 2023 10:57:53 -0800 Subject: [PATCH 116/124] docstring update --- python/cudf/cudf/core/column/column.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 228e902bac9..1f0b675f536 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -405,12 +405,22 @@ def force_deep_copy(self: T) -> T: return cast(T, result._with_type_metadata(self.dtype)) def copy(self: T, deep: bool = True) -> T: - """Columns are immutable, so a deep copy produces a copy of the - underlying data and mask and a shallow copy creates a new column and - copies the references of the data and mask. + """ + Makes a copy of the Column. - Note : Only Fixed width columns are mutable i.e., which support - creation of a `mutable_view`. + Parameters + ---------- + deep : bool, default True + If True, a true physical copy of the column + is made. + If False and `copy_on_write` is False, the same + memory is shared between the buffers of the Column + and changes made to one Column will propagate to + its copy and vice-versa.
+ If False and `copy_on_write` is True, the same + memory is shared between the buffers of the Column + until there is a write operation being performed on + them. """ if deep: return self.force_deep_copy() From 9a16606b441248310c6830ffa677f6a9e508e3fa Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Jan 2023 11:09:58 -0800 Subject: [PATCH 117/124] add test --- python/cudf/cudf/tests/test_copying.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 4e974ce3017..14521678868 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -227,6 +227,13 @@ def test_series_zero_copy(copy_on_write): else: assert_eq(s3, cudf.Series([20, 10, 10, 4, 5])) + s4 = cudf.Series([10, 20, 30, 40, 50]) + s5 = cudf.Series(s4) + assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) + s5[0:2] = 1 + assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) + assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) + @pytest.mark.parametrize("copy_on_write", [True, False]) def test_series_str_copy(copy_on_write): From 71f3ff4bae9b3efcf2f7803953fade8ffe4947ff Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Jan 2023 11:17:11 -0800 Subject: [PATCH 118/124] flip if condition --- python/cudf/cudf/core/buffer/cow_buffer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 34cf953faa1..2eb7fe60021 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -135,17 +135,17 @@ def copy(self, deep: bool = True): ------- Buffer """ - if not deep and not self._zero_copied: + if deep or self._zero_copied: + return self._from_device_memory( + rmm.DeviceBuffer(ptr=self._ptr, size=self.size) + ) + else: copied_buf = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) copied_buf._ptr = self._ptr copied_buf._size = 
self._size copied_buf._owner = self._owner copied_buf._finalize_init() return copied_buf - else: - return self._from_device_memory( - rmm.DeviceBuffer(ptr=self._ptr, size=self.size) - ) @property def __cuda_array_interface__(self) -> dict: From 71c4473384b328c35bfe7cc09f42ce77a36b8db6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Jan 2023 12:22:36 -0800 Subject: [PATCH 119/124] add comment --- python/cudf/cudf/_lib/column.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 636d06dfb81..572fe8aad55 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -409,6 +409,10 @@ cdef class Column: spill_lock=get_spill_lock() ) else: + # Shouldn't access `.ptr`, because in case + # of `CopyOnWriteBuffer` that could trigger + # a copy, which isn't required to create a + # view that is read only. data = (col.base_data._ptr) cdef Column child_column From 863a7ae788a60d994367177d9e564643c874ec9c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Jan 2023 12:34:34 -0800 Subject: [PATCH 120/124] add more docstring --- python/cudf/cudf/core/buffer/cow_buffer.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 2eb7fe60021..6bc51576c70 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -91,14 +91,24 @@ def _is_shared(self): @property def ptr(self) -> int: - """Device pointer to the start of the buffer.""" + """Device pointer to the start of the buffer. + + This will trigger a deep copy if there are any weak references. + The Buffer would be marked as zero copied. 
+ """ self._unlink_shared_buffers() self._zero_copied = True return self._ptr @property def mutable_ptr(self) -> int: - """Device pointer to the start of the buffer.""" + """Device pointer to the start of the buffer. + + This will trigger a deep copy if there are any weak references. + """ + # Shouldn't need to mark the Buffer as zero copied, + # because this API is used by libcudf only to create + # mutable views. self._unlink_shared_buffers() return self._ptr From c230e94025d3d0da67ff9f336b7aec636d0a42fe Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 12 Jan 2023 14:34:37 -0600 Subject: [PATCH 121/124] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/buffer/cow_buffer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 34cf953faa1..81e5616d0f1 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -21,8 +21,8 @@ def _keys_cleanup(ptr, size): and next(iter(weak_set_values.data))() is None ): # When the last remaining reference is being cleaned up we will still - # have a dead weak-reference in `weak_set_values`, if that is the case - # we are good to perform the key's cleanup + # have a dead reference in `weak_set_values`. 
If that is the case, then + # we can safely clean up the key del CopyOnWriteBuffer._instances[(ptr, size)] From 6e51a5a06b4bbd71d098ee91333ccd01b46d91d5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Jan 2023 13:50:52 -0800 Subject: [PATCH 122/124] update and address docs reviews --- .../source/developer_guide/library_design.md | 40 +++++----- docs/cudf/source/user_guide/copy-on-write.md | 78 ++++++++++++------- 2 files changed, 72 insertions(+), 46 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 91f0d80e9c6..e27b96c55a2 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -322,12 +322,14 @@ However, for performance reasons they frequently access internal attributes and Copy-on-write (COW) is designed to reduce memory footprint on GPUs. With this feature, a copy (`.copy(deep=False)`) is only really made whenever -there is a write operation on a column. +there is a write operation on a column. It is first recommended to see +the public usage [here](copy-on-write-user-doc) of this functionality before reading through the internals +below. The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. -With the help of `CopyOnWriteBuffer.ptr` and `CopyOnWriteBuffer.size` we generate [weak references](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. -This is a mapping from `(ptr, size)` keys to `WeakSet`s containing references to `CopyOnWriterBuffer` objects. 
This -means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` & `.size` +With the help of `CopyOnWriteBuffer.ptr` we generate [weak references](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. +This is a mapping from `ptr` keys to `WeakSet`s containing references to `CopyOnWriteBuffer` objects. This +means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have the same `.ptr` i.e., if they are all pointing to the same device memory. When the cudf option `"copy_on_write"` is `True`, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the @@ -338,7 +340,7 @@ when write operation is performed on a `Column` (see below). ### Eager copies when exposing to third-party libraries If `Column`/`CopyOnWriteBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred without introspection. Hence whenever -someone accesses data through the `__cuda_array_interface__`, we eagerly trigger the copy by calling +someone accesses data through the `__cuda_array_interface__`, we eagerly trigger the copy by calling `_unlink_shared_buffers` which ensures a true copy of underlying device data is made and unlinks the buffer from any shared "weak" references. Any future shallow-copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object), to handle this we also mark the `Column`/`CopyOnWriteBuffer` as `obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy @@ -353,12 +355,6 @@ types don't support real in-place mutations to the data. We just mimic in such a like an in-place operation using `_mimic_inplace`.
-| | Copy-on-Write enabled | Copy-on-Write disabled (default) | -|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| -| `.copy(deep=True)` | A true copy is made and changes don't propagate to the original object. | A true copy is made and changes don't propagate to the original object. | -| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy before the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes performed on one will propagate to the other object. | - - ### Examples When copy-on-write is enabled, taking a shallow copy of a `Series` or a `DataFrame` does not @@ -404,11 +400,11 @@ dtype: int64 3 4 dtype: int64 ->>> s1.data.ptr +>>> s1.data._ptr 139796315897856 ->>> s2.data.ptr +>>> s2.data._ptr 139796315897856 ->>> s3.data.ptr +>>> s3.data._ptr 139796315897856 ``` @@ -440,11 +436,11 @@ dtype: int64 If we inspect the memory address of the data, `s1` and `s3` still share the same address but `s2` has a new one: ```python ->>> s1.data.ptr +>>> s1.data._ptr 139796315897856 ->>> s3.data.ptr +>>> s3.data._ptr 139796315897856 ->>> s2.data.ptr +>>> s2.data._ptr 139796315899392 ``` @@ -476,10 +472,14 @@ dtype: int64 If we inspect the memory address of the data, the addresses of `s2` and `s3` remain unchanged, but `s1`'s memory address has changed because of a copy operation performed during the writing: ```python ->>> s2.data.ptr +>>> s2.data._ptr 139796315899392 ->>> s3.data.ptr +>>> s3.data._ptr 139796315897856 ->>> s1.data.ptr +>>> s1.data._ptr 139796315879723 ``` + +cudf Copy-on-write implementation is motivated by pandas 
Copy-on-write proposal here: +1. [Google doc](https://docs.google.com/document/d/1ZCQ9mx3LBMy-nhwRl33_jgcvWo9IWdEfxDNQ2thyTb0/edit#heading=h.iexejdstiz8u) +2. [Github issue](https://github.com/pandas-dev/pandas/issues/36195) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 187f4bfa5a9..14ca3656250 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -1,8 +1,15 @@ +(copy-on-write-user-doc)= + # Copy-on-write Copy-on-write reduces GPU memory usage when copies(`.copy(deep=False)`) of a column are made. +| | Copy-on-Write enabled | Copy-on-Write disabled (default) | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `.copy(deep=True)` | A true copy is made and changes don't propagate to the original object. | A true copy is made and changes don't propagate to the original object. | +| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy before the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes performed on one will propagate to the other object. | + ## How to enable it i. 
Use `cudf.set_option`: @@ -45,13 +52,9 @@ dtype: int64 2 3 3 4 dtype: int64 ->>> series.data._ptr -140102175031296 ->>> copied_series.data._ptr -140102175031296 ``` -Then, when a write operation is performed on either ``series`` or +When a write operation is performed on either ``series`` or ``copied_series``, a true physical copy of the data is created: ```python @@ -70,21 +73,6 @@ dtype: int64 dtype: int64 ``` -Notice the underlying data these both series objects now point to completely -different device objects: - -```python ->>> series.data.ptr -140102175032832 ->>> copied_series.data.ptr -140102175031296 -``` - -````{Warning} -When ``copy_on_write`` is enabled, all of the shallow copies are constructed with -weak references, and it is recommended to not hand-construct the contents of `__cuda_array_interface__`, instead please use the `series.__cuda_array_interface__` -or `series.data.__cuda_array_interface__` which will then take care of unlinking any existing weak references that a column contains. -```` ## Notes @@ -102,9 +90,9 @@ pandas. 
Following is one of the inconsistency: >>> import pandas as pd >>> s = pd.Series([1, 2, 3, 4, 5]) ->>> s_view = s[0:2] ->>> s_view[0] = 10 ->>> s_view +>>> s1 = s[0:2] +>>> s1[0] = 10 +>>> s1 0 10 1 2 dtype: int64 @@ -118,9 +106,9 @@ dtype: int64 >>> import cudf >>> s = cudf.Series([1, 2, 3, 4, 5]) ->>> s_view = s[0:2] ->>> s_view[0] = 10 ->>> s_view +>>> s1 = s[0:2] +>>> s1[0] = 10 +>>> s1 0 10 1 2 >>> s @@ -132,6 +120,44 @@ dtype: int64 dtype: int64 ``` +The above inconsistency is solved when Copy-on-write is enabled: + +```python +>>> import pandas as pd +>>> pd.set_option("mode.copy_on_write", True) +>>> s = pd.Series([1, 2, 3, 4, 5]) +>>> s1 = s[0:2] +>>> s1[0] = 10 +>>> s1 +0 10 +1 2 +dtype: int64 +>>> s +0 1 +1 2 +2 3 +3 4 +4 5 +dtype: int64 + + +>>> import cudf +>>> cudf.set_option("copy_on_write", True) +>>> s = cudf.Series([1, 2, 3, 4, 5]) +>>> s1 = s[0:2] +>>> s1[0] = 10 +>>> s1 +0 10 +1 2 +dtype: int64 +>>> s +0 1 +1 2 +2 3 +3 4 +4 5 +dtype: int64 +``` ## How to disable it From ab44c9e084ff390652131043c62788ee4dd492ac Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 12 Jan 2023 14:18:56 -0800 Subject: [PATCH 123/124] Address doc reviews --- .../source/developer_guide/library_design.md | 10 ++++++++++ python/cudf/cudf/core/buffer/buffer.py | 3 +-- python/cudf/cudf/core/buffer/cow_buffer.py | 17 +++++++++++++---- .../cudf/cudf/core/buffer/spillable_buffer.py | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index e27b96c55a2..f9a51f005cb 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -346,6 +346,16 @@ unlinks the buffer from any shared "weak" references. Any future shallow-copy re `obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy with weak references. 
+### How to obtain a read-only object? + +A read-only object can be quite useful for operations that will not +mutate the data. This can be achieved by using the `._get_readonly_proxy_obj` +API. This API returns a proxy object that has `__cuda_array_interface__` +implemented and will not trigger a deep copy even if the `CopyOnWriteBuffer` +has weak references. It is only recommended to use this API as long as +the objects/arrays created with this proxy object get cleaned up during +the developer code execution. We currently use this API for device-to-host +copies like in `ColumnBase._data_array_view`, which is used for `Column.values_host`. Notes: 1. Weak references are implemented only for fixed-width data types as these are only column diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 75c0d79f812..5479dc1fd50 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -256,8 +256,7 @@ def _get_cuda_array_interface(self, readonly=False): @property def _get_readonly_proxy_obj(self) -> dict: """ - Internal Implementation for the CUDA Array Interface which is - read-only. + Returns a proxy object with a read-only CUDA Array Interface. """ return cuda_array_interface_wrapper( ptr=self.ptr, diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 6bc51576c70..14aa9307940 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -27,9 +27,16 @@ def _keys_cleanup(ptr, size): class CopyOnWriteBuffer(Buffer): - """A Buffer represents device memory. + """A Copy-on-write buffer that implements Buffer. - Use the factory function `as_buffer` to create a Buffer instance. + This buffer enables making copies of data only when there + is a write operation being performed. + + See the `Copy-on-write` section in `library_design.md` for + detailed information on `CopyOnWriteBuffer`. 
+ + Use the factory function `as_buffer` to create a CopyOnWriteBuffer + instance. """ # This dict keeps track of all instances that have the same `ptr` @@ -183,8 +190,10 @@ def _get_cuda_array_interface(self, readonly=False): @property def _get_readonly_proxy_obj(self) -> dict: """ - Internal Implementation for the CUDA Array Interface which is - read-only. + Returns a proxy object with a read-only CUDA Array Interface. + + See the `Copy-on-write` section in `library_design.md` for + more information on this API. """ return cuda_array_interface_wrapper( ptr=self._ptr, diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 5f6ca356177..d22fb6fdc20 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -62,7 +62,7 @@ def __getitem__(self, i): class SpillableBuffer(Buffer): - """A spillable buffer that implements DeviceBufferLike. + """A spillable buffer that implements Buffer. This buffer supports spilling the represented data to host memory. 
Spilling can be done manually by calling `.spill(target="cpu")` but From f4c9114a95bb33f226a32fc1072d96e1f46ed4a4 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 13 Jan 2023 14:19:01 -0600 Subject: [PATCH 124/124] Use only `ptr` as key (#5) * Use only ptr as key * assert ptrs * use self._ptr * Update python/cudf/cudf/_lib/column.pyx --- python/cudf/cudf/_lib/column.pyx | 10 ++++++++- python/cudf/cudf/core/buffer/cow_buffer.py | 25 +++++++--------------- python/cudf/cudf/tests/test_copying.py | 9 ++++++++ 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 572fe8aad55..8f09a0f41f4 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -543,6 +543,12 @@ cdef class Column: rmm.DeviceBuffer(ptr=data_ptr, size=(size+offset) * dtype_itemsize) ) + elif column_owner and isinstance(data_owner, CopyOnWriteBuffer): + # TODO: In future, see if we can just pass on the + # CopyOnWriteBuffer reference to another column + # and still create a weak reference. + # With the current design that's not possible. + data = data_owner.copy(deep=False) elif ( # This is an optimization of the most common case where # from_column_view creates a "view" that is identical to @@ -569,7 +575,9 @@ cdef class Column: owner=data_owner, exposed=True, ) - if isinstance(data_owner, SpillableBuffer): + if isinstance(data_owner, CopyOnWriteBuffer): + data_owner.ptr # accessing the pointer marks it exposed. 
+ elif isinstance(data_owner, SpillableBuffer): if data_owner.is_spilled: raise ValueError( f"{data_owner} is spilled, which invalidates " diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index ac75ff5cd47..e322912ed4f 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -14,8 +14,8 @@ T = TypeVar("T", bound="CopyOnWriteBuffer") -def _keys_cleanup(ptr, size): - weak_set_values = CopyOnWriteBuffer._instances[(ptr, size)] +def _keys_cleanup(ptr): + weak_set_values = CopyOnWriteBuffer._instances[ptr] if ( len(weak_set_values) == 1 and next(iter(weak_set_values.data))() is None @@ -23,7 +23,7 @@ def _keys_cleanup(ptr, size): # When the last remaining reference is being cleaned up we will still # have a dead reference in `weak_set_values`. If that is the case, then # we can safely clean up the key - del CopyOnWriteBuffer._instances[(ptr, size)] + del CopyOnWriteBuffer._instances[ptr] class CopyOnWriteBuffer(Buffer): @@ -50,11 +50,10 @@ class CopyOnWriteBuffer(Buffer): _zero_copied: bool def _finalize_init(self): - key = (self._ptr, self._size) - self.__class__._instances[key].add(self) - self._instances = self.__class__._instances[key] + self.__class__._instances[self._ptr].add(self) + self._instances = self.__class__._instances[self._ptr] self._zero_copied = False - weakref.finalize(self, _keys_cleanup, self._ptr, self._size) + weakref.finalize(self, _keys_cleanup, self._ptr) @classmethod def _from_device_memory( @@ -122,18 +121,10 @@ def mutable_ptr(self) -> int: def _getitem(self, offset: int, size: int) -> Buffer: """ Helper for `__getitem__` - - Returns the same underlying memory pointer if offset is 0 - and size == self.size, else makes a copy to return the - slice. 
""" - if offset != 0 or self.size != size: - buf = self.copy(deep=True) - else: - buf = self - return buf._from_device_memory( + return self._from_device_memory( cuda_array_interface_wrapper( - ptr=buf._ptr + offset, size=size, owner=buf.owner + ptr=self._ptr + offset, size=size, owner=self.owner ) ) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 14521678868..1dd73a69384 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -99,6 +99,15 @@ def test_series_setitem_cow(copy_on_write): assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + new_slice = actual[2:] + assert new_slice._column.base_data._ptr == actual._column.base_data._ptr + new_slice[0:2] = 10 + assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) + if copy_on_write: + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) + def test_multiple_series_cow(): cudf.set_option("copy_on_write", True)