diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index d7416625248..153b116cd33 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -99,10 +99,12 @@ cdef class GroupBy: c_grouped_values = move(c_groups.values) c_group_offsets = c_groups.offsets - grouped_keys = cudf.Index._from_data(*data_from_unique_ptr( - move(c_grouped_keys), - column_names=range(c_grouped_keys.get()[0].num_columns()) - )) + grouped_keys = cudf.core.index._index_from_data( + *data_from_unique_ptr( + move(c_grouped_keys), + column_names=range(c_grouped_keys.get()[0].num_columns()) + ) + ) grouped_values = data_from_unique_ptr( move(c_grouped_values), index_names=values._index_names, @@ -186,7 +188,8 @@ cdef class GroupBy: Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - return result_data, cudf.Index._from_data(grouped_keys) + return result_data, cudf.core.index._index_from_data( + grouped_keys) def scan_internal(self, Table values, aggregations): from cudf.core.column_accessor import ColumnAccessor @@ -264,7 +267,8 @@ cdef class GroupBy: Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - return result_data, cudf.Index._from_data(grouped_keys) + return result_data, cudf.core.index._index_from_data( + grouped_keys) def aggregate(self, Table values, aggregations): """ @@ -311,10 +315,12 @@ cdef class GroupBy: self.c_obj.get()[0].shift(view, offsets, c_fill_values) ) - grouped_keys = cudf.Index._from_data(*data_from_unique_ptr( - move(c_result.first), - column_names=self.keys._column_names - )) + grouped_keys = cudf.core.index._index_from_data( + *data_from_unique_ptr( + move(c_result.first), + column_names=self.keys._column_names + ) + ) shifted, _ = data_from_unique_ptr( move(c_result.second), column_names=values._column_names diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cd258102228..2456aa334e9 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ 
b/python/cudf/cudf/_lib/utils.pyx @@ -251,7 +251,7 @@ cdef data_from_unique_ptr( # Frame factories we may want to look for a less dissonant approach # that does not impose performance penalties. The same applies to # data_from_table_view below. - cudf.Index._from_data( + cudf.core.index._index_from_data( { name: columns[i] for i, name in enumerate(index_names) @@ -301,7 +301,8 @@ cdef data_from_table_view( ) ) column_idx += 1 - index = cudf.Index._from_data(dict(zip(index_names, index_columns))) + index = cudf.core.index._index_from_data( + dict(zip(index_names, index_columns))) # Construct the data dict cdef size_type source_column_idx = 0 diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 7eb0c7bdce4..793a5d1d9e8 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -29,3 +29,6 @@ DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"] +SeriesOrSingleColumnIndex = Union[ + "cudf.Series", "cudf.core.index.GenericIndex" +] diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index bf296e11178..10bbb620715 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -194,7 +194,7 @@ def wrapped_func(obj): def _union_categoricals( - to_union: List[Union[cudf.Series, cudf.Index]], + to_union: List[Union[cudf.Series, cudf.CategoricalIndex]], sort_categories: bool = False, ignore_order: bool = False, ): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py new file mode 100644 index 00000000000..5f12cbaf21f --- /dev/null +++ b/python/cudf/cudf/core/_base_index.py @@ -0,0 +1,964 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from __future__ import annotations, division, print_function + +import pickle +from typing import Any, Set + +import cupy +import pandas as pd + +import cudf +from cudf._typing import DtypeObj +from cudf.api.types import is_dtype_equal, is_integer +from cudf.core.abc import Serializable +from cudf.core.column import ColumnBase, column +from cudf.core.column_accessor import ColumnAccessor +from cudf.utils import ioutils +from cudf.utils.dtypes import ( + is_list_like, + is_mixed_with_object_dtype, + is_scalar, + numeric_normalize_types, +) +from cudf.utils.utils import cached_property + + +class BaseIndex(Serializable): + """Base class for all cudf Index types.""" + + dtype: DtypeObj + _accessors: Set[Any] = set() + _data: ColumnAccessor + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + + if method == "__call__" and hasattr(cudf, ufunc.__name__): + func = getattr(cudf, ufunc.__name__) + return func(*inputs) + else: + return NotImplemented + + @cached_property + def _values(self) -> ColumnBase: + raise NotImplementedError + + def copy(self, deep: bool = True) -> BaseIndex: + raise NotImplementedError + + @property + def values(self): + return self._values.values + + def get_loc(self, key, method=None, tolerance=None): + raise NotImplementedError + + def __getitem__(self, key): + raise NotImplementedError() + + def serialize(self): + header = {} + header["index_column"] = {} + # store metadata values of index separately + # Indexes: Numerical/DateTime/String are often GPU backed + header["index_column"], frames = self._values.serialize() + + header["name"] = pickle.dumps(self.name) + header["dtype"] = pickle.dumps(self.dtype) + header["type-serialized"] = pickle.dumps(type(self)) + header["frame_count"] = len(frames) + return header, frames + + def __contains__(self, item): + return item in self._values + + def get_level_values(self, level): + """ + Return an Index of values for requested level. 
+ + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatibility. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + Index + Calling object, as there is only one level in the Index. + + See Also + -------- + cudf.core.multiindex.MultiIndex.get_level_values : Get values for + a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(["a", "b", "c"]) + >>> idx.get_level_values(0) + StringIndex(['a' 'b' 'c'], dtype='object') + """ + + if level == self.name: + return self + elif is_integer(level): + if level != 0: + raise IndexError( + f"Cannot get level: {level} " f"for index with 1 level" + ) + return self + else: + raise KeyError(f"Requested level with name {level} " "not found") + + @classmethod + def deserialize(cls, header, frames): + h = header["index_column"] + idx_typ = pickle.loads(header["type-serialized"]) + name = pickle.loads(header["name"]) + + col_typ = pickle.loads(h["type-serialized"]) + index = col_typ.deserialize(h, frames[: header["frame_count"]]) + return idx_typ(index, name=name) + + @property + def names(self): + """ + Returns a tuple containing the name of the Index. + """ + return (self.name,) + + @names.setter + def names(self, values): + if not is_list_like(values): + raise ValueError("Names must be a list-like") + + num_values = len(values) + if num_values > 1: + raise ValueError( + "Length of new names must be 1, got %d" % num_values + ) + + self.name = values[0] + + def _clean_nulls_from_index(self): + """ + Convert all na values(if any) in Index object + to `<NA>` as a preprocessing step to `__repr__` methods. 
+ + This will involve changing type of Index object + to StringIndex but it is the responsibility of the `__repr__` + methods using this method to replace or handle representation + of the actual types correctly. + """ + if self._values.has_nulls: + return cudf.Index( + self._values.astype("str").fillna(cudf._NA_REP), name=self.name + ) + else: + return self + + @property + def nlevels(self): + """ + Number of levels. + """ + return 1 + + def _set_names(self, names, inplace=False): + if inplace: + idx = self + else: + idx = self.copy(deep=False) + + idx.names = names + if not inplace: + return idx + + def set_names(self, names, level=None, inplace=False): + """ + Set Index or MultiIndex name. + Able to set new names partially and by level. + + Parameters + ---------- + names : label or list of label + Name(s) to set. + level : int, label or list of int or label, optional + If the index is a MultiIndex, level(s) to set (None for all + levels). Otherwise level must be None. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. + + Returns + ------- + Index + The same type as the caller or None if inplace is True. + + See Also + -------- + cudf.Index.rename : Able to set new names without level. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1, 2, 3, 4]) + >>> idx + Int64Index([1, 2, 3, 4], dtype='int64') + >>> idx.set_names('quarter') + Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'], + ... 
[2018, 2019]]) + >>> idx + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + ) + >>> idx.names + FrozenList([None, None]) + >>> idx.set_names(['kind', 'year'], inplace=True) + >>> idx.names + FrozenList(['kind', 'year']) + >>> idx.set_names('species', level=0, inplace=True) + >>> idx.names + FrozenList(['species', 'year']) + """ + if level is not None: + raise ValueError("Level must be None for non-MultiIndex") + + if not is_list_like(names): + names = [names] + + return self._set_names(names=names, inplace=inplace) + + def fillna(self, value, downcast=None): + """ + Fill null values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill nulls. This value cannot be a + list-likes. + + downcast : dict, default is None + This Parameter is currently NON-FUNCTIONAL. + + Returns + ------- + filled : Index + + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, None, 4]) + >>> index + Int64Index([1, 2, null, 4], dtype='int64') + >>> index.fillna(3) + Int64Index([1, 2, 3, 4], dtype='int64') + """ + if downcast is not None: + raise NotImplementedError( + "`downcast` parameter is not yet supported" + ) + + return super().fillna(value=value) + + def take(self, indices): + """Gather only the specific subset of indices + + Parameters + ---------- + indices: An array-like that maps to values contained in this Index. + """ + return self[indices] + + def argsort(self, ascending=True, **kwargs): + """ + Return the integer indices that would sort the index. + + Parameters + ---------- + ascending : bool, default True + If True, returns the indices for ascending order. + If False, returns the indices for descending order. + + Returns + ------- + array : A cupy array containing Integer indices that + would sort the index if used as an indexer. 
+ + Examples + -------- + >>> import cudf + >>> index = cudf.Index([10, 100, 1, 1000]) + >>> index + Int64Index([10, 100, 1, 1000], dtype='int64') + >>> index.argsort() + array([2, 0, 1, 3], dtype=int32) + + The order of argsort can be reversed using + ``ascending`` parameter, by setting it to ``False``. + >>> index.argsort(ascending=False) + array([3, 1, 0, 2], dtype=int32) + + ``argsort`` on a MultiIndex: + + >>> index = cudf.MultiIndex( + ... levels=[[1, 3, 4, -10], [1, 11, 5]], + ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + ... names=["x", "y"], + ... ) + >>> index + MultiIndex([( 1, 1), + ( 1, 5), + ( 3, 11), + ( 4, 11), + (-10, 1)], + names=['x', 'y']) + >>> index.argsort() + array([4, 0, 1, 2, 3], dtype=int32) + >>> index.argsort(ascending=False) + array([3, 2, 1, 0, 4], dtype=int32) + """ + indices = self._values.argsort(ascending=ascending, **kwargs) + return cupy.asarray(indices) + + def to_frame(self, index=True, name=None): + """Create a DataFrame with a column containing this Index + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index + name : str, default None + Name to be used for the column + + Returns + ------- + DataFrame + cudf DataFrame + """ + + if name is not None: + col_name = name + elif self.name is None: + col_name = 0 + else: + col_name = self.name + return cudf.DataFrame( + {col_name: self._values}, index=self if index else None + ) + + def any(self): + """ + Return whether any elements is True in Index. + """ + return self._values.any() + + def to_pandas(self): + """ + Convert to a Pandas Index. 
+ + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([-3, 10, 15, 20]) + >>> idx + Int64Index([-3, 10, 15, 20], dtype='int64') + >>> idx.to_pandas() + Int64Index([-3, 10, 15, 20], dtype='int64') + >>> type(idx.to_pandas()) + <class 'pandas.core.indexes.numeric.Int64Index'> + >>> type(idx) + <class 'cudf.core.index.Int64Index'> + """ + return pd.Index(self._values.to_pandas(), name=self.name) + + @ioutils.doc_to_dlpack() + def to_dlpack(self): + """{docstring}""" + + return cudf.io.dlpack.to_dlpack(self) + + @property + def gpu_values(self): + """ + View the data as a numba device array object + """ + return self._values.data_array_view + + def append(self, other): + """ + Append a collection of Index options together. + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1, 2, 10, 100]) + >>> idx + Int64Index([1, 2, 10, 100], dtype='int64') + >>> other = cudf.Index([200, 400, 50]) + >>> other + Int64Index([200, 400, 50], dtype='int64') + >>> idx.append(other) + Int64Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') + + append accepts list of Index objects + + >>> idx.append([other, other]) + Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') + """ + + if is_list_like(other): + to_concat = [self] + to_concat.extend(other) + else: + this = self + if len(other) == 0: + # short-circuit and return a copy + to_concat = [self] + + other = cudf.Index(other) + + if len(self) == 0: + to_concat = [other] + + if len(self) and len(other): + if is_mixed_with_object_dtype(this, other): + got_dtype = ( + other.dtype + if this.dtype == cudf.dtype("object") + else this.dtype + ) + raise TypeError( + f"cudf does not support appending an Index of " + f"dtype `{cudf.dtype('object')}` with an Index " + f"of dtype `{got_dtype}`, please type-cast " + f"either one of them to same dtypes." 
+ ) + + if isinstance(self._values, cudf.core.column.NumericalColumn): + if self.dtype != other.dtype: + this, other = numeric_normalize_types(self, other) + to_concat = [this, other] + + for obj in to_concat: + if not isinstance(obj, BaseIndex): + raise TypeError("all inputs must be Index") + + return self._concat(to_concat) + + def difference(self, other, sort=None): + """ + Return a new Index with elements from the index that are not in + `other`. + + This is the set difference of two Index objects. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by cudf. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + + Returns + ------- + difference : Index + + Examples + -------- + >>> import cudf + >>> idx1 = cudf.Index([2, 1, 3, 4]) + >>> idx1 + Int64Index([2, 1, 3, 4], dtype='int64') + >>> idx2 = cudf.Index([3, 4, 5, 6]) + >>> idx2 + Int64Index([3, 4, 5, 6], dtype='int64') + >>> idx1.difference(idx2) + Int64Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Int64Index([2, 1], dtype='int64') + """ + if sort not in {None, False}: + raise ValueError( + f"The 'sort' keyword only takes the values " + f"of None or False; {sort} was passed." + ) + + other = cudf.Index(other) + + if is_mixed_with_object_dtype(self, other): + difference = self.copy() + else: + difference = self.join(other, how="leftanti") + if self.dtype != other.dtype: + difference = difference.astype(self.dtype) + + if sort is None: + return difference.sort_values() + + return difference + + def sort_values(self, return_indexer=False, ascending=True, key=None): + """ + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. 
+ + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. + key : None, optional + This parameter is NON-FUNCTIONAL. + + Returns + ------- + sorted_index : Index + Sorted copy of the index. + indexer : cupy.ndarray, optional + The indices that the index itself was sorted by. + + See Also + -------- + cudf.Series.sort_values : Sort values of a Series. + cudf.DataFrame.sort_values : Sort values in a DataFrame. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([10, 100, 1, 1000]) + >>> idx + Int64Index([10, 100, 1, 1000], dtype='int64') + + Sort values in ascending order (default behavior). + + >>> idx.sort_values() + Int64Index([1, 10, 100, 1000], dtype='int64') + + Sort values in descending order, and also get the indices `idx` was + sorted by. + + >>> idx.sort_values(ascending=False, return_indexer=True) + (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], + dtype=int32)) + + Sorting values in a MultiIndex: + + >>> midx = cudf.MultiIndex( + ... levels=[[1, 3, 4, -10], [1, 11, 5]], + ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + ... names=["x", "y"], + ... ) + >>> midx + MultiIndex([( 1, 1), + ( 1, 5), + ( 3, 11), + ( 4, 11), + (-10, 1)], + names=['x', 'y']) + >>> midx.sort_values() + MultiIndex([(-10, 1), + ( 1, 1), + ( 1, 5), + ( 3, 11), + ( 4, 11)], + names=['x', 'y']) + >>> midx.sort_values(ascending=False) + MultiIndex([( 4, 11), + ( 3, 11), + ( 1, 5), + ( 1, 1), + (-10, 1)], + names=['x', 'y']) + """ + if key is not None: + raise NotImplementedError("key parameter is not yet implemented.") + + indices = self._values.argsort(ascending=ascending) + index_sorted = cudf.Index(self.take(indices), name=self.name) + + if return_indexer: + return index_sorted, cupy.asarray(indices) + else: + return index_sorted + + def unique(self): + """ + Return unique values in the index. 
+ + Returns + ------- + Index without duplicates + """ + return cudf.Index(self._values.unique(), name=self.name) + + def join( + self, other, how="left", level=None, return_indexers=False, sort=False + ): + """ + Compute join_index and indexers to conform data structures + to the new index. + + Parameters + ---------- + other : Index. + how : {'left', 'right', 'inner', 'outer'} + return_indexers : bool, default False + sort : bool, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword). + + Returns: index + + Examples + -------- + >>> import cudf + >>> lhs = cudf.DataFrame( + ... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b'] + ... ).index + >>> lhs + MultiIndex([(2, 3), + (3, 4), + (1, 2)], + names=['a', 'b']) + >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index + >>> rhs + Int64Index([1, 4, 3], dtype='int64', name='a') + >>> lhs.join(rhs, how='inner') + MultiIndex([(3, 4), + (1, 2)], + names=['a', 'b']) + """ + + if isinstance(self, cudf.MultiIndex) and isinstance( + other, cudf.MultiIndex + ): + raise TypeError( + "Join on level between two MultiIndex objects is ambiguous" + ) + + if level is not None and not is_scalar(level): + raise ValueError("level should be an int or a label only") + + if isinstance(other, cudf.MultiIndex): + if how == "left": + how = "right" + elif how == "right": + how = "left" + rhs = self.copy(deep=False) + lhs = other.copy(deep=False) + else: + lhs = self.copy(deep=False) + rhs = other.copy(deep=False) + + on = level + # In case of MultiIndex, it will be None as + # we don't need to update name + left_names = lhs.names + right_names = rhs.names + # There should be no `None` values in Joined indices, + # so essentially it would be `left/right` or 'inner' + # in case of MultiIndex + if isinstance(lhs, cudf.MultiIndex): + if level is not None and isinstance(level, int): + on = lhs._data.select_by_index(level).names[0] + 
right_names = (on,) or right_names + on = right_names[0] + if how == "outer": + how = "left" + elif how == "right": + how = "inner" + else: + # Both are normal indices + right_names = left_names + on = right_names[0] + + lhs.names = left_names + rhs.names = right_names + + output = lhs._merge(rhs, how=how, on=on, sort=sort) + + return output + + def rename(self, name, inplace=False): + """ + Alter Index name. + + Defaults to returning new index. + + Parameters + ---------- + name : label + Name(s) to set. + + Returns + ------- + Index + + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, 3], name='one') + >>> index + Int64Index([1, 2, 3], dtype='int64', name='one') + >>> index.name + 'one' + >>> renamed_index = index.rename('two') + >>> renamed_index + Int64Index([1, 2, 3], dtype='int64', name='two') + >>> renamed_index.name + 'two' + """ + if inplace is True: + self.name = name + return None + else: + out = self.copy(deep=False) + out.name = name + return out.copy(deep=True) + + def astype(self, dtype, copy=False): + """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. + + Parameters + ---------- + dtype : numpy dtype + Use a numpy.dtype to cast entire Index object to. + copy : bool, default False + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + Returns + ------- + Index + Index with values cast to specified dtype. 
+ + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, 3]) + >>> index + Int64Index([1, 2, 3], dtype='int64') + >>> index.astype('float64') + Float64Index([1.0, 2.0, 3.0], dtype='float64') + """ + if is_dtype_equal(dtype, self.dtype): + return self.copy(deep=copy) + + return cudf.Index( + self.copy(deep=copy)._values.astype(dtype), name=self.name + ) + + def to_array(self, fillna=None): + """Get a dense numpy array for the data. + + Parameters + ---------- + fillna : str or None + Defaults to None, which will skip null values. + If it equals "pandas", null values are filled with NaNs. + Non integral dtype is promoted to np.float64. + + Notes + ----- + + if ``fillna`` is ``None``, null values are skipped. Therefore, the + output size could be smaller. + """ + return self._values.to_array(fillna=fillna) + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys. + Useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. + + Returns + ------- + Series + The dtype will be based on the type of the Index values. + """ + return cudf.Series( + self._values, + index=self.copy(deep=False) if index is None else index, + name=self.name if name is None else name, + ) + + def get_slice_bound(self, label, side, kind): + """ + Calculate slice bound that corresponds to given label. + Returns leftmost (one-past-the-rightmost if ``side=='right'``) position + of given label. + + Parameters + ---------- + label : object + side : {'left', 'right'} + kind : {'ix', 'loc', 'getitem'} + + Returns + ------- + int + Index of label. 
+ """ + raise (NotImplementedError) + + def __array_function__(self, func, types, args, kwargs): + + # check if the function is implemented for the current type + cudf_index_module = type(self) + for submodule in func.__module__.split(".")[1:]: + # point cudf_index_module to the correct submodule + if hasattr(cudf_index_module, submodule): + cudf_index_module = getattr(cudf_index_module, submodule) + else: + return NotImplemented + + fname = func.__name__ + + handled_types = [BaseIndex, cudf.Series] + + # check if we don't handle any of the types (including sub-class) + for t in types: + if not any( + issubclass(t, handled_type) for handled_type in handled_types + ): + return NotImplemented + + if hasattr(cudf_index_module, fname): + cudf_func = getattr(cudf_index_module, fname) + # Handle case if cudf_func is same as numpy function + if cudf_func is func: + return NotImplemented + else: + return cudf_func(*args, **kwargs) + + else: + return NotImplemented + + def isin(self, values): + """Return a boolean array where the index values are in values. + + Compute boolean array of whether each index value is found in + the passed set of values. The length of the returned boolean + array matches the length of the index. + + Parameters + ---------- + values : set, list-like, Index + Sought values. + + Returns + ------- + is_contained : cupy array + CuPy array of boolean values. + + Examples + -------- + >>> idx = cudf.Index([1,2,3]) + >>> idx + Int64Index([1, 2, 3], dtype='int64') + + Check whether each index value in a list of values. + + >>> idx.isin([1, 4]) + array([ True, False, False]) + """ + + return self._values.isin(values).values + + def memory_usage(self, deep=False): + """ + Memory usage of the values. + + Parameters + ---------- + deep : bool + Introspect the data deeply, + interrogate `object` dtypes for system-level + memory consumption. 
+ + Returns + ------- + bytes used + """ + return self._values._memory_usage(deep=deep) + + @classmethod + def from_pandas(cls, index, nan_as_null=None): + """ + Convert from a Pandas Index. + + Parameters + ---------- + index : Pandas Index object + A Pandas Index object which has to be converted + to cuDF Index. + nan_as_null : bool, Default None + If ``None``/``True``, converts ``np.nan`` values + to ``null`` values. + If ``False``, leaves ``np.nan`` values as is. + + Raises + ------ + TypeError for invalid input type. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> import numpy as np + >>> data = [10, 20, 30, np.nan] + >>> pdi = pd.Index(data) + >>> cudf.Index.from_pandas(pdi) + Float64Index([10.0, 20.0, 30.0, <NA>], dtype='float64') + >>> cudf.Index.from_pandas(pdi, nan_as_null=False) + Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') + """ + if not isinstance(index, pd.Index): + raise TypeError("not a pandas.Index") + + ind = cudf.Index(column.as_column(index, nan_as_null=nan_as_null)) + ind.name = index.name + return ind + + @property + def _constructor_expanddim(self): + return cudf.MultiIndex diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 0688283bc43..ea3c7bfb91f 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -10,7 +10,7 @@ from cudf._typing import ColumnLike, ScalarLike from cudf.core.column import ColumnBase from cudf.core.dataframe import DataFrame -from cudf.core.frame import Frame +from cudf.core.frame import Frame, SingleColumnFrame from cudf.core.index import Index from cudf.core.series import Series @@ -94,9 +94,9 @@ def _check_and_cast_columns_with_other( def _normalize_columns_and_scalars_type( - frame: Union[Series, Index, DataFrame], other: Any, inplace: bool = False, + frame: Frame, other: Any, inplace: bool = False, ) -> Tuple[ - Union[Series, Index, DataFrame, ColumnLike], Any, + Union[Frame, 
ColumnLike], Any, ]: """ Try to normalize the other's dtypes as per frame. @@ -177,10 +177,7 @@ def _normalize_columns_and_scalars_type( def where( - frame: Union[Series, Index, DataFrame], - cond: Any, - other: Any = None, - inplace: bool = False, + frame: Frame, cond: Any, other: Any = None, inplace: bool = False, ) -> Optional[Union[Frame]]: """ Replace values where the condition is False. @@ -332,6 +329,7 @@ def where( return frame._mimic_inplace(out_df, inplace=inplace) else: + frame = cast(SingleColumnFrame, frame) if isinstance(other, DataFrame): raise NotImplementedError( "cannot align with a higher dimensional Frame" diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 50ad592b54f..fa6c49284f0 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -6,8 +6,8 @@ from cudf.core.column import as_column from cudf.core.frame import Frame -from cudf.core.index import RangeIndex -from cudf.core.series import Index, Series +from cudf.core.index import Index, RangeIndex +from cudf.core.series import Series def factorize(values, sort=False, na_sentinel=-1, size_hint=None): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7333ae119cd..76dd0683a5a 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -37,7 +37,7 @@ ) if TYPE_CHECKING: - from cudf._typing import SeriesOrIndex + from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex from cudf.core.column import ( ColumnBase, DatetimeColumn, @@ -104,7 +104,7 @@ class CategoricalAccessor(ColumnMethods): _column: CategoricalColumn - def __init__(self, parent: SeriesOrIndex): + def __init__(self, parent: SeriesOrSingleColumnIndex): if not is_categorical_dtype(parent.dtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" diff --git a/python/cudf/cudf/core/column/methods.py 
b/python/cudf/cudf/core/column/methods.py index a587c58a49d..9bea94cfecb 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -8,7 +8,7 @@ import cudf -ParentType = Union["cudf.Series", "cudf.BaseIndex"] +ParentType = Union["cudf.Series", "cudf.core.index.GenericIndex"] class ColumnMethods: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a739eba71f3..aac0b027c0b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2,6 +2,7 @@ from __future__ import annotations, division +import functools import inspect import itertools import numbers @@ -10,7 +11,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, MutableMapping, Optional, TypeVar +from typing import Any, MutableMapping, Optional, Set, TypeVar import cupy import numpy as np @@ -25,10 +26,15 @@ import cudf import cudf.core.common from cudf import _lib as libcudf -from cudf.api.types import is_bool_dtype, is_dict_like +from cudf.api.types import is_bool_dtype, is_dict_like, is_dtype_equal from cudf.core import column, reshape from cudf.core.abc import Serializable -from cudf.core.column import as_column, column_empty +from cudf.core.column import ( + as_column, + build_categorical_column, + column_empty, + concat_columns, +) from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import DataFrameGroupBy @@ -47,9 +53,11 @@ is_datetime_dtype, is_list_dtype, is_list_like, + is_numerical_dtype, is_scalar, is_string_dtype, is_struct_dtype, + min_scalar_type, numeric_normalize_types, ) from cudf.utils.utils import GetAttrGetItemMixin @@ -160,7 +168,8 @@ class DataFrame(Frame, Serializable, GetAttrGetItemMixin): 3 3 0.3 """ - _PROTECTED_KEYS = frozenset(("_data", "_index")) + _PROTECTED_KEYS = frozenset(("_column_accessor", "_data", 
"_index")) + _accessors: Set[Any] = set() @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") def __init__(self, data=None, index=None, columns=None, dtype=None): @@ -1029,6 +1038,209 @@ def assign(self, **kwargs): new[k] = v return new + @classmethod + @annotate("CONCAT", color="orange", domain="cudf_python") + def _concat( + cls, objs, axis=0, join="outer", ignore_index=False, sort=False + ): + # flag to indicate at least one empty input frame also has an index + empty_has_index = False + # length of output frame's RangeIndex if all input frames are empty, + # and at least one has an index + result_index_length = 0 + # the number of empty input frames + num_empty_input_frames = 0 + + for i, obj in enumerate(objs): + # shallow-copy the input DFs in case the same DF instance + # is concatenated with itself + objs[i] = obj.copy(deep=False) + + # If ignore_index is true, determine if + # all or some objs are empty(and have index). + # 1. If all objects are empty(and have index), we + # should set the index separately using RangeIndex. + # 2. If some objects are empty(and have index), we + # create empty columns later while populating `columns` + # variable. Detailed explanation of second case before + # allocation of `columns` variable below. + if ignore_index and obj.empty: + num_empty_input_frames += 1 + result_index_length += len(obj) + empty_has_index = empty_has_index or len(obj) > 0 + + if join == "inner": + sets_of_column_names = [set(obj._column_names) for obj in objs] + + intersecting_columns = functools.reduce( + set.intersection, sets_of_column_names + ) + union_of_columns = functools.reduce( + set.union, sets_of_column_names + ) + non_intersecting_columns = union_of_columns.symmetric_difference( + intersecting_columns + ) + + # Get an ordered list of the intersecting columns to preserve input + # order, which is promised by pandas for inner joins. 
+ ordered_intersecting_columns = [ + name + for obj in objs + for name in obj._column_names + if name in intersecting_columns + ] + + names = dict.fromkeys(ordered_intersecting_columns).keys() + + if axis == 0: + if ignore_index and ( + num_empty_input_frames > 0 + or len(intersecting_columns) == 0 + ): + # When ignore_index is True and if there is + # at least 1 empty dataframe and no + # intersecting columns are present, an empty dataframe + # needs to be returned just with an Index. + empty_has_index = True + num_empty_input_frames = len(objs) + result_index_length = sum(len(obj) for obj in objs) + + # remove columns not present in all objs + for obj in objs: + obj.drop( + columns=non_intersecting_columns, + inplace=True, + errors="ignore", + ) + elif join == "outer": + # Get a list of the unique table column names + names = [name for f in objs for name in f._column_names] + names = dict.fromkeys(names).keys() + + else: + raise ValueError( + "Only can inner (intersect) or outer (union) when joining" + "the other axis" + ) + + if sort: + try: + # Sorted always returns a list, but will fail to sort if names + # include different types that are not comparable. + names = sorted(names) + except TypeError: + names = list(names) + else: + names = list(names) + + # Combine the index and table columns for each Frame into a list of + # [...index_cols, ...table_cols]. + # + # If any of the input frames have a non-empty index, include these + # columns in the list of columns to concatenate, even if the input + # frames are empty and `ignore_index=True`. 
+ columns = [ + ( + [] + if (ignore_index and not empty_has_index) + else list(f._index._data.columns) + ) + + [f._data[name] if name in f._data else None for name in names] + for f in objs + ] + + # Get a list of the combined index and table column indices + indices = list(range(functools.reduce(max, map(len, columns)))) + # The position of the first table colum in each + # combined index + table columns list + first_data_column_position = len(indices) - len(names) + + # Get the non-null columns and their dtypes + non_null_cols, dtypes = _get_non_null_cols_and_dtypes(indices, columns) + + # Infer common dtypes between numeric columns + # and combine CategoricalColumn categories + categories = _find_common_dtypes_and_categories(non_null_cols, dtypes) + + # Cast all columns to a common dtype, assign combined categories, + # and back-fill missing columns with all-null columns + _cast_cols_to_common_dtypes(indices, columns, dtypes, categories) + + # Construct input tables with the index and data columns in the same + # order. This strips the given index/column names and replaces the + # names with their integer positions in the `cols` list + tables = [] + for cols in columns: + table_index = None + if 1 == first_data_column_position: + table_index = cudf.core.index.as_index(cols[0]) + elif first_data_column_position > 1: + table_index = libcudf.table.Table( + data=dict( + zip( + indices[:first_data_column_position], + cols[:first_data_column_position], + ) + ) + ) + tables.append( + libcudf.table.Table( + data=dict( + zip( + indices[first_data_column_position:], + cols[first_data_column_position:], + ) + ), + index=table_index, + ) + ) + + # Concatenate the Tables + out = cls._from_data( + *libcudf.concat.concat_tables(tables, ignore_index) + ) + + # If ignore_index is True, all input frames are empty, and at + # least one input frame has an index, assign a new RangeIndex + # to the result frame. 
+ if empty_has_index and num_empty_input_frames == len(objs): + out._index = cudf.RangeIndex(result_index_length) + # Reassign the categories for any categorical table cols + _reassign_categories( + categories, out._data, indices[first_data_column_position:] + ) + + # Reassign the categories for any categorical index cols + if not isinstance(out._index, cudf.RangeIndex): + _reassign_categories( + categories, + out._index._data, + indices[:first_data_column_position], + ) + if not isinstance( + out._index, cudf.MultiIndex + ) and is_categorical_dtype(out._index._values.dtype): + out = out.set_index( + cudf.core.index.as_index(out.index._values) + ) + + # Reassign precision for any decimal cols + for name, col in out._data.items(): + if isinstance(col, cudf.core.column.Decimal64Column): + col = col._with_type_metadata(tables[0]._data[name].dtype) + + # Reassign index and column names + if isinstance(objs[0].columns, pd.MultiIndex): + out.columns = objs[0].columns + else: + out.columns = names + if not ignore_index: + out._index.name = objs[0]._index.name + out._index.names = objs[0]._index.names + + return out + def astype(self, dtype, copy=False, errors="raise", **kwargs): """ Cast the DataFrame to the given dtype @@ -7295,7 +7507,7 @@ def _get_union_of_indices(indexes): if len(indexes) == 1: return indexes[0] else: - merged_index = cudf.Index._concat(indexes) + merged_index = cudf.core.index.GenericIndex._concat(indexes) merged_index = merged_index.drop_duplicates() _, inds = merged_index._values.sort_by_values() return merged_index.take(inds) @@ -7336,3 +7548,95 @@ def _drop_columns(df: DataFrame, columns: Iterable, errors: str): pass else: raise e + + +# Create a dictionary of the common, non-null columns +def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): + # A mapping of {idx: np.dtype} + dtypes = dict() + # A mapping of {idx: [...columns]}, where `[...columns]` + # is a list of columns with at least one valid value for each + # column name across 
all input frames + non_null_columns = dict() + for idx in col_idxs: + for cols in list_of_columns: + # Skip columns not in this frame + if idx >= len(cols) or cols[idx] is None: + continue + # Store the first dtype we find for a column, even if it's + # all-null. This ensures we always have at least one dtype + # for each name. This dtype will be overwritten later if a + # non-null Column with the same name is found. + if idx not in dtypes: + dtypes[idx] = cols[idx].dtype + if cols[idx].valid_count > 0: + if idx not in non_null_columns: + non_null_columns[idx] = [cols[idx]] + else: + non_null_columns[idx].append(cols[idx]) + return non_null_columns, dtypes + + +def _find_common_dtypes_and_categories(non_null_columns, dtypes): + # A mapping of {idx: categories}, where `categories` is a + # column of all the unique categorical values from each + # categorical column across all input frames + categories = dict() + for idx, cols in non_null_columns.items(): + # default to the first non-null dtype + dtypes[idx] = cols[0].dtype + # If all the non-null dtypes are int/float, find a common dtype + if all(is_numerical_dtype(col.dtype) for col in cols): + dtypes[idx] = find_common_type([col.dtype for col in cols]) + # If all categorical dtypes, combine the categories + elif all( + isinstance(col, cudf.core.column.CategoricalColumn) for col in cols + ): + # Combine and de-dupe the categories + categories[idx] = ( + cudf.Series(concat_columns([col.categories for col in cols])) + .drop_duplicates(ignore_index=True) + ._column + ) + # Set the column dtype to the codes' dtype. 
The categories + # will be re-assigned at the end + dtypes[idx] = min_scalar_type(len(categories[idx])) + # Otherwise raise an error if columns have different dtypes + elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): + raise ValueError("All columns must be the same type") + return categories + + +def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): + # Cast all columns to a common dtype, assign combined categories, + # and back-fill missing columns with all-null columns + for idx in col_idxs: + dtype = dtypes[idx] + for cols in list_of_columns: + # If column not in this df, fill with an all-null column + if idx >= len(cols) or cols[idx] is None: + n = len(next(x for x in cols if x is not None)) + cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True) + else: + # If column is categorical, rebase the codes with the + # combined categories, and cast the new codes to the + # min-scalar-sized dtype + if idx in categories: + cols[idx] = ( + cols[idx] + ._set_categories(categories[idx], is_unique=True,) + .codes + ) + cols[idx] = cols[idx].astype(dtype) + + +def _reassign_categories(categories, cols, col_idxs): + for name, idx in zip(cols, col_idxs): + if idx in categories: + cols[name] = build_categorical_column( + categories=categories[idx], + codes=as_column(cols[name].base_data, dtype=cols[name].dtype), + mask=cols[name].base_mask, + offset=cols[name].offset, + size=cols[name].size, + ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b6eb3108550..33be14462d4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,10 +3,18 @@ from __future__ import annotations import copy -import functools import warnings from collections import abc -from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union +from typing import ( + Any, + Dict, + MutableMapping, + Optional, + Tuple, + TypeVar, + Union, + cast, +) import cupy import numpy as np @@ 
-17,13 +25,12 @@ import cudf from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries -from cudf.api.types import is_dict_like, is_dtype_equal, issubdtype +from cudf.api.types import is_dict_like, issubdtype from cudf.core.column import ( ColumnBase, as_column, build_categorical_column, column_empty, - concat_columns, ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge @@ -33,14 +40,10 @@ from cudf.utils.dtypes import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, - find_common_type, - is_categorical_dtype, is_column_like, is_decimal_dtype, is_integer_dtype, - is_numerical_dtype, is_scalar, - min_scalar_type, ) T = TypeVar("T", bound="Frame") @@ -60,12 +63,6 @@ class Frame(libcudf.table.Table): _data: "ColumnAccessor" - @classmethod - def __init_subclass__(cls): - # All subclasses contain a set _accessors that is used to hold custom - # accessors defined by user APIs (see cudf/api/extensions/accessor.py). - cls._accessors = set() - @classmethod def _from_data( cls, @@ -326,209 +323,6 @@ def copy(self: T, deep: bool = True) -> T: return new_frame - @classmethod - @annotate("CONCAT", color="orange", domain="cudf_python") - def _concat( - cls, objs, axis=0, join="outer", ignore_index=False, sort=False - ): - # flag to indicate at least one empty input frame also has an index - empty_has_index = False - # length of output frame's RangeIndex if all input frames are empty, - # and at least one has an index - result_index_length = 0 - # the number of empty input frames - num_empty_input_frames = 0 - - for i, obj in enumerate(objs): - # shallow-copy the input DFs in case the same DF instance - # is concatenated with itself - objs[i] = obj.copy(deep=False) - - # If ignore_index is true, determine if - # all or some objs are empty(and have index). - # 1. If all objects are empty(and have index), we - # should set the index separately using RangeIndex. - # 2. 
If some objects are empty(and have index), we - # create empty columns later while populating `columns` - # variable. Detailed explanation of second case before - # allocation of `columns` variable below. - if ignore_index and obj.empty: - num_empty_input_frames += 1 - result_index_length += len(obj) - empty_has_index = empty_has_index or len(obj) > 0 - - if join == "inner": - sets_of_column_names = [set(obj._column_names) for obj in objs] - - intersecting_columns = functools.reduce( - set.intersection, sets_of_column_names - ) - union_of_columns = functools.reduce( - set.union, sets_of_column_names - ) - non_intersecting_columns = union_of_columns.symmetric_difference( - intersecting_columns - ) - - # Get an ordered list of the intersecting columns to preserve input - # order, which is promised by pandas for inner joins. - ordered_intersecting_columns = [ - name - for obj in objs - for name in obj._column_names - if name in intersecting_columns - ] - - names = dict.fromkeys(ordered_intersecting_columns).keys() - - if axis == 0: - if ignore_index and ( - num_empty_input_frames > 0 - or len(intersecting_columns) == 0 - ): - # When ignore_index is True and if there is - # at least 1 empty dataframe and no - # intersecting columns are present, an empty dataframe - # needs to be returned just with an Index. 
- empty_has_index = True - num_empty_input_frames = len(objs) - result_index_length = sum(len(obj) for obj in objs) - - # remove columns not present in all objs - for obj in objs: - obj.drop( - columns=non_intersecting_columns, - inplace=True, - errors="ignore", - ) - elif join == "outer": - # Get a list of the unique table column names - names = [name for f in objs for name in f._column_names] - names = dict.fromkeys(names).keys() - - else: - raise ValueError( - "Only can inner (intersect) or outer (union) when joining" - "the other axis" - ) - - if sort: - try: - # Sorted always returns a list, but will fail to sort if names - # include different types that are not comparable. - names = sorted(names) - except TypeError: - names = list(names) - else: - names = list(names) - - # Combine the index and table columns for each Frame into a list of - # [...index_cols, ...table_cols]. - # - # If any of the input frames have a non-empty index, include these - # columns in the list of columns to concatenate, even if the input - # frames are empty and `ignore_index=True`. 
- columns = [ - ( - [] - if (ignore_index and not empty_has_index) - else list(f._index._data.columns) - ) - + [f._data[name] if name in f._data else None for name in names] - for f in objs - ] - - # Get a list of the combined index and table column indices - indices = list(range(functools.reduce(max, map(len, columns)))) - # The position of the first table colum in each - # combined index + table columns list - first_data_column_position = len(indices) - len(names) - - # Get the non-null columns and their dtypes - non_null_cols, dtypes = _get_non_null_cols_and_dtypes(indices, columns) - - # Infer common dtypes between numeric columns - # and combine CategoricalColumn categories - categories = _find_common_dtypes_and_categories(non_null_cols, dtypes) - - # Cast all columns to a common dtype, assign combined categories, - # and back-fill missing columns with all-null columns - _cast_cols_to_common_dtypes(indices, columns, dtypes, categories) - - # Construct input tables with the index and data columns in the same - # order. This strips the given index/column names and replaces the - # names with their integer positions in the `cols` list - tables = [] - for cols in columns: - table_index = None - if 1 == first_data_column_position: - table_index = cudf.core.index.as_index(cols[0]) - elif first_data_column_position > 1: - table_index = libcudf.table.Table( - data=dict( - zip( - indices[:first_data_column_position], - cols[:first_data_column_position], - ) - ) - ) - tables.append( - libcudf.table.Table( - data=dict( - zip( - indices[first_data_column_position:], - cols[first_data_column_position:], - ) - ), - index=table_index, - ) - ) - - # Concatenate the Tables - out = cls._from_data( - *libcudf.concat.concat_tables(tables, ignore_index) - ) - - # If ignore_index is True, all input frames are empty, and at - # least one input frame has an index, assign a new RangeIndex - # to the result frame. 
- if empty_has_index and num_empty_input_frames == len(objs): - out._index = cudf.RangeIndex(result_index_length) - # Reassign the categories for any categorical table cols - _reassign_categories( - categories, out._data, indices[first_data_column_position:] - ) - - # Reassign the categories for any categorical index cols - if not isinstance(out._index, cudf.RangeIndex): - _reassign_categories( - categories, - out._index._data, - indices[:first_data_column_position], - ) - if not isinstance( - out._index, cudf.MultiIndex - ) and is_categorical_dtype(out._index._values.dtype): - out = out.set_index( - cudf.core.index.as_index(out.index._values) - ) - - # Reassign precision for any decimal cols - for name, col in out._data.items(): - if isinstance(col, cudf.core.column.Decimal64Column): - col = col._with_type_metadata(tables[0]._data[name].dtype) - - # Reassign index and column names - if isinstance(objs[0].columns, pd.MultiIndex): - out.columns = objs[0].columns - else: - out.columns = names - if not ignore_index: - out._index.name = objs[0]._index.name - out._index.names = objs[0]._index.names - - return out - def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. 
@@ -2336,7 +2130,7 @@ def _copy_type_metadata( if include_index: if self._index is not None and other._index is not None: - self._index._copy_type_metadata(other._index) + self._index._copy_type_metadata(other._index) # type: ignore # When other._index is a CategoricalIndex, the current index # will be a NumericalIndex with an underlying CategoricalColumn # (the above _copy_type_metadata call will have converted the @@ -2347,7 +2141,9 @@ def _copy_type_metadata( ) and not isinstance( self._index, cudf.core.index.CategoricalIndex ): - self._index = cudf.Index(self._index._column) + self._index = cudf.Index( + cast(cudf.core.index.NumericIndex, self._index)._column + ) return self @@ -3429,6 +3225,26 @@ def _binaryop( *args, **kwargs, ) -> Frame: + """Perform a binary operation between two frames. + + Parameters + ---------- + other : Frame + The second operand. + fn : str + The operation to perform. + fill_value : Any, default None + The value to replace null values with. If ``None``, nulls are not + filled before the operation. + reflect : bool, default False + If ``True`` the operation is reflected (i.e whether to swap the + left and right operands). + + Returns + ------- + Frame + A new instance containing the result of the operation. + """ raise NotImplementedError @classmethod @@ -3455,8 +3271,8 @@ def _colwise_binop( Returns ------- - Frame - A subclass of Frame constructed from the result of performing the + Dict[ColumnBase] + A dict of columns constructed from the result of performing the requested operation on the operands. """ @@ -5089,39 +4905,32 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) - def _binaryop( + def _make_operands_for_binop( self, other: T, - fn: str, fill_value: Any = None, reflect: bool = False, *args, **kwargs, - ) -> SingleColumnFrame: - """Perform a binary operation between two single column frames. 
+ ) -> Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]]: + """Generate the dictionary of operands used for a binary operation. Parameters ---------- other : SingleColumnFrame The second operand. - fn : str - The operation fill_value : Any, default None The value to replace null values with. If ``None``, nulls are not filled before the operation. reflect : bool, default False If ``True`` the operation is reflected (i.e whether to swap the left and right operands). - lhs : SingleColumnFrame, default None - The left hand operand. If ``None``, self is used. This parameter - allows child classes to preprocess the inputs if necessary. Returns ------- - SingleColumnFrame - A new instance containing the result of the operation. + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]] + The operands to be passed to _colwise_binop. """ - # Get the appropriate name for output operations involving two objects # that are Series-like objects. The output shares the lhs's name unless # the rhs is a _differently_ named Series-like object. 
@@ -5143,15 +4952,7 @@ def _binaryop( except Exception: return NotImplemented - operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]] = { - result_name: (self._column, other, reflect, fill_value) - } - - return self._from_data( - data=type(self)._colwise_binop(operands, fn), - index=self._index, - name=result_name, - ) + return {result_name: (self._column, other, reflect, fill_value)} def _get_replacement_values_for_columns( @@ -5310,98 +5111,6 @@ def _get_replacement_values_for_columns( return all_na_columns, to_replace_columns, values_columns -# Create a dictionary of the common, non-null columns -def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): - # A mapping of {idx: np.dtype} - dtypes = dict() - # A mapping of {idx: [...columns]}, where `[...columns]` - # is a list of columns with at least one valid value for each - # column name across all input frames - non_null_columns = dict() - for idx in col_idxs: - for cols in list_of_columns: - # Skip columns not in this frame - if idx >= len(cols) or cols[idx] is None: - continue - # Store the first dtype we find for a column, even if it's - # all-null. This ensures we always have at least one dtype - # for each name. This dtype will be overwritten later if a - # non-null Column with the same name is found. 
- if idx not in dtypes: - dtypes[idx] = cols[idx].dtype - if cols[idx].valid_count > 0: - if idx not in non_null_columns: - non_null_columns[idx] = [cols[idx]] - else: - non_null_columns[idx].append(cols[idx]) - return non_null_columns, dtypes - - -def _find_common_dtypes_and_categories(non_null_columns, dtypes): - # A mapping of {idx: categories}, where `categories` is a - # column of all the unique categorical values from each - # categorical column across all input frames - categories = dict() - for idx, cols in non_null_columns.items(): - # default to the first non-null dtype - dtypes[idx] = cols[0].dtype - # If all the non-null dtypes are int/float, find a common dtype - if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = find_common_type([col.dtype for col in cols]) - # If all categorical dtypes, combine the categories - elif all( - isinstance(col, cudf.core.column.CategoricalColumn) for col in cols - ): - # Combine and de-dupe the categories - categories[idx] = ( - cudf.Series(concat_columns([col.categories for col in cols])) - .drop_duplicates(ignore_index=True) - ._column - ) - # Set the column dtype to the codes' dtype. 
The categories - # will be re-assigned at the end - dtypes[idx] = min_scalar_type(len(categories[idx])) - # Otherwise raise an error if columns have different dtypes - elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): - raise ValueError("All columns must be the same type") - return categories - - -def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): - # Cast all columns to a common dtype, assign combined categories, - # and back-fill missing columns with all-null columns - for idx in col_idxs: - dtype = dtypes[idx] - for cols in list_of_columns: - # If column not in this df, fill with an all-null column - if idx >= len(cols) or cols[idx] is None: - n = len(next(x for x in cols if x is not None)) - cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True) - else: - # If column is categorical, rebase the codes with the - # combined categories, and cast the new codes to the - # min-scalar-sized dtype - if idx in categories: - cols[idx] = ( - cols[idx] - ._set_categories(categories[idx], is_unique=True,) - .codes - ) - cols[idx] = cols[idx].astype(dtype) - - -def _reassign_categories(categories, cols, col_idxs): - for name, idx in zip(cols, col_idxs): - if idx in categories: - cols[name] = build_categorical_column( - categories=categories[idx], - codes=as_column(cols[name].base_data, dtype=cols[name].dtype), - mask=cols[name].base_mask, - offset=cols[name].offset, - size=cols[name].size, - ) - - def _is_series(obj): """ Checks if the `obj` is of type `cudf.Series` diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6be21ce74d2..6b4b77fabc5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2,6 +2,7 @@ from __future__ import annotations, division, print_function +import math import pickle from numbers import Number from typing import ( @@ -12,6 +13,7 @@ Optional, Tuple, Type, + TypeVar, Union, ) @@ -26,14 +28,8 @@ from cudf._lib.filling import sequence from 
cudf._lib.search import search_sorted from cudf._lib.table import Table -from cudf._typing import DtypeObj -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_dtype_equal, - is_integer, - is_string_dtype, -) -from cudf.core.abc import Serializable +from cudf.api.types import _is_scalar_or_zero_d_array, is_string_dtype +from cudf.core._base_index import BaseIndex from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -41,1282 +37,78 @@ IntervalColumn, NumericalColumn, StringColumn, - TimeDeltaColumn, - arange, - column, -) -from cudf.core.column.column import as_column, concat_columns -from cudf.core.column.string import StringMethods as StringMethods -from cudf.core.dtypes import IntervalDtype -from cudf.core.frame import SingleColumnFrame -from cudf.utils import ioutils -from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import ( - _is_non_decimal_numeric_dtype, - find_common_type, - is_categorical_dtype, - is_interval_dtype, - is_list_like, - is_mixed_with_object_dtype, - is_scalar, - numeric_normalize_types, -) -from cudf.utils.utils import cached_property, search_range - - -class BaseIndex(SingleColumnFrame, Serializable): - """Base class for all cudf Index types.""" - - dtype: DtypeObj - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(*inputs) - else: - return NotImplemented - - @cached_property - def _values(self) -> ColumnBase: - raise NotImplementedError - - def __getitem__(self, key): - raise NotImplementedError() - - def drop_duplicates(self, keep="first"): - """ - Return Index with duplicate values removed - - Parameters - ---------- - keep : {‘first’, ‘last’, False}, default ‘first’ - * ‘first’ : Drop duplicates except for the - first occurrence. - * ‘last’ : Drop duplicates except for the - last occurrence. - * False : Drop all duplicates. 
- - Returns - ------- - deduplicated : Index - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - >>> idx - StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object') - >>> idx.drop_duplicates() - StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object') - """ # noqa: E501 - return super().drop_duplicates(keep=keep) - - def serialize(self): - header = {} - header["index_column"] = {} - # store metadata values of index separately - # Indexes: Numerical/DateTime/String are often GPU backed - header["index_column"], frames = self._values.serialize() - - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) - header["frame_count"] = len(frames) - return header, frames - - def __contains__(self, item): - return item in self._values - - @annotate("INDEX_EQUALS", color="green", domain="cudf_python") - def equals(self, other, **kwargs): - """ - Determine if two Index objects contain the same elements. - - Returns - ------- - out: bool - True if “other” is an Index and it has the same elements - as calling index; False otherwise. - """ - if not isinstance(other, BaseIndex): - return False - - check_types = False - - self_is_categorical = isinstance(self, CategoricalIndex) - other_is_categorical = isinstance(other, CategoricalIndex) - if self_is_categorical and not other_is_categorical: - other = other.astype(self.dtype) - check_types = True - elif other_is_categorical and not self_is_categorical: - self = self.astype(other.dtype) - check_types = True - - try: - return super().equals(other, check_types=check_types) - except TypeError: - return False - - def get_level_values(self, level): - """ - Return an Index of values for requested level. - - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatibility. 
- - Parameters - ---------- - level : int or str - It is either the integer position or the name of the level. - - Returns - ------- - Index - Calling object, as there is only one level in the Index. - - See Also - -------- - cudf.core.multiindex.MultiIndex.get_level_values : Get values for - a level of a MultiIndex. - - Notes - ----- - For Index, level should be 0, since there are no multiple levels. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(["a", "b", "c"]) - >>> idx.get_level_values(0) - StringIndex(['a' 'b' 'c'], dtype='object') - """ - - if level == self.name: - return self - elif is_integer(level): - if level != 0: - raise IndexError( - f"Cannot get level: {level} " f"for index with 1 level" - ) - return self - else: - raise KeyError(f"Requested level with name {level} " "not found") - - @classmethod - def deserialize(cls, header, frames): - h = header["index_column"] - idx_typ = pickle.loads(header["type-serialized"]) - name = pickle.loads(header["name"]) - - col_typ = pickle.loads(h["type-serialized"]) - index = col_typ.deserialize(h, frames[: header["frame_count"]]) - return idx_typ(index, name=name) - - @property - def names(self): - """ - Returns a tuple containing the name of the Index. - """ - return (self.name,) - - @names.setter - def names(self, values): - if not is_list_like(values): - raise ValueError("Names must be a list-like") - - num_values = len(values) - if num_values > 1: - raise ValueError( - "Length of new names must be 1, got %d" % num_values - ) - - self.name = values[0] - - def dropna(self, how="any"): - """ - Return an Index with null values removed. - - Parameters - ---------- - how : {‘any’, ‘all’}, default ‘any’ - If the Index is a MultiIndex, drop the value when any or - all levels are NaN. 
- - Returns - ------- - valid : Index - - Examples - -------- - >>> import cudf - >>> index = cudf.Index(['a', None, 'b', 'c']) - >>> index - StringIndex(['a' None 'b' 'c'], dtype='object') - >>> index.dropna() - StringIndex(['a' 'b' 'c'], dtype='object') - - Using `dropna` on a `MultiIndex`: - - >>> midx = cudf.MultiIndex( - ... levels=[[1, None, 4, None], [1, 2, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> midx - MultiIndex([( 1, 1), - ( 1, 5), - (, 2), - ( 4, 2), - (, 1)], - names=['x', 'y']) - >>> midx.dropna() - MultiIndex([(1, 1), - (1, 5), - (4, 2)], - names=['x', 'y']) - """ - return super().dropna(how=how) - - def _clean_nulls_from_index(self): - """ - Convert all na values(if any) in Index object - to `` as a preprocessing step to `__repr__` methods. - - This will involve changing type of Index object - to StringIndex but it is the responsibility of the `__repr__` - methods using this method to replace or handle representation - of the actual types correctly. - """ - if self._values.has_nulls: - return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP), name=self.name - ) - else: - return self - - @property - def nlevels(self): - """ - Number of levels. - """ - return 1 - - def _set_names(self, names, inplace=False): - if inplace: - idx = self - else: - idx = self.copy(deep=False) - - idx.names = names - if not inplace: - return idx - - def set_names(self, names, level=None, inplace=False): - """ - Set Index or MultiIndex name. - Able to set new names partially and by level. - - Parameters - ---------- - names : label or list of label - Name(s) to set. - level : int, label or list of int or label, optional - If the index is a MultiIndex, level(s) to set (None for all - levels). Otherwise level must be None. - inplace : bool, default False - Modifies the object directly, instead of creating a new Index or - MultiIndex. 
- - Returns - ------- - Index - The same type as the caller or None if inplace is True. - - See Also - -------- - cudf.Index.rename : Able to set new names without level. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1, 2, 3, 4]) - >>> idx - Int64Index([1, 2, 3, 4], dtype='int64') - >>> idx.set_names('quarter') - Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') - >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]]) - >>> idx - MultiIndex([('python', 2018), - ('python', 2019), - ( 'cobra', 2018), - ( 'cobra', 2019)], - ) - >>> idx.names - FrozenList([None, None]) - >>> idx.set_names(['kind', 'year'], inplace=True) - >>> idx.names - FrozenList(['kind', 'year']) - >>> idx.set_names('species', level=0, inplace=True) - >>> idx.names - FrozenList(['species', 'year']) - """ - if level is not None: - raise ValueError("Level must be None for non-MultiIndex") - - if not is_list_like(names): - names = [names] - - return self._set_names(names=names, inplace=inplace) - - def fillna(self, value, downcast=None): - """ - Fill null values with the specified value. - - Parameters - ---------- - value : scalar - Scalar value to use to fill nulls. This value cannot be a - list-likes. - - downcast : dict, default is None - This Parameter is currently NON-FUNCTIONAL. - - Returns - ------- - filled : Index - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, None, 4]) - >>> index - Int64Index([1, 2, null, 4], dtype='int64') - >>> index.fillna(3) - Int64Index([1, 2, 3, 4], dtype='int64') - """ - if downcast is not None: - raise NotImplementedError( - "`downcast` parameter is not yet supported" - ) - - return super().fillna(value=value) - - def take(self, indices): - """Gather only the specific subset of indices - - Parameters - ---------- - indices: An array-like that maps to values contained in this Index. 
- """ - return self[indices] - - def argsort(self, ascending=True, **kwargs): - """ - Return the integer indices that would sort the index. - - Parameters - ---------- - ascending : bool, default True - If True, returns the indices for ascending order. - If False, returns the indices for descending order. - - Returns - ------- - array : A cupy array containing Integer indices that - would sort the index if used as an indexer. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([10, 100, 1, 1000]) - >>> index - Int64Index([10, 100, 1, 1000], dtype='int64') - >>> index.argsort() - array([2, 0, 1, 3], dtype=int32) - - The order of argsort can be reversed using - ``ascending`` parameter, by setting it to ``False``. - >>> index.argsort(ascending=False) - array([3, 1, 0, 2], dtype=int32) - - ``argsort`` on a MultiIndex: - - >>> index = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> index - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> index.argsort() - array([4, 0, 1, 2, 3], dtype=int32) - >>> index.argsort(ascending=False) - array([3, 2, 1, 0, 4], dtype=int32) - """ - indices = self._values.argsort(ascending=ascending, **kwargs) - return cupy.asarray(indices) - - def to_frame(self, index=True, name=None): - """Create a DataFrame with a column containing this Index - - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index - name : str, default None - Name to be used for the column - - Returns - ------- - DataFrame - cudf DataFrame - """ - - if name is not None: - col_name = name - elif self.name is None: - col_name = 0 - else: - col_name = self.name - return cudf.DataFrame( - {col_name: self._values}, index=self if index else None - ) - - def any(self): - """ - Return whether any elements is True in Index. 
- """ - return self._values.any() - - def to_pandas(self): - """ - Convert to a Pandas Index. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([-3, 10, 15, 20]) - >>> idx - Int64Index([-3, 10, 15, 20], dtype='int64') - >>> idx.to_pandas() - Int64Index([-3, 10, 15, 20], dtype='int64') - >>> type(idx.to_pandas()) - - >>> type(idx) - - """ - return pd.Index(self._values.to_pandas(), name=self.name) - - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - - return cudf.io.dlpack.to_dlpack(self) - - @property - def gpu_values(self): - """ - View the data as a numba device array object - """ - return self._values.data_array_view - - @classmethod - def _concat(cls, objs): - if all(isinstance(obj, RangeIndex) for obj in objs): - result = _concat_range_index(objs) - else: - data = concat_columns([o._values for o in objs]) - result = as_index(data) - - names = {obj.name for obj in objs} - if len(names) == 1: - [name] = names - else: - name = None - - result.name = name - return result - - def append(self, other): - """ - Append a collection of Index options together. 
- - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1, 2, 10, 100]) - >>> idx - Int64Index([1, 2, 10, 100], dtype='int64') - >>> other = cudf.Index([200, 400, 50]) - >>> other - Int64Index([200, 400, 50], dtype='int64') - >>> idx.append(other) - Int64Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') - - append accepts list of Index objects - - >>> idx.append([other, other]) - Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') - """ - - if is_list_like(other): - to_concat = [self] - to_concat.extend(other) - else: - this = self - if len(other) == 0: - # short-circuit and return a copy - to_concat = [self] - - other = as_index(other) - - if len(self) == 0: - to_concat = [other] - - if len(self) and len(other): - if is_mixed_with_object_dtype(this, other): - got_dtype = ( - other.dtype - if this.dtype == cudf.dtype("object") - else this.dtype - ) - raise TypeError( - f"cudf does not support appending an Index of " - f"dtype `{cudf.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ) - - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) - to_concat = [this, other] - - for obj in to_concat: - if not isinstance(obj, BaseIndex): - raise TypeError("all inputs must be Index") - - return self._concat(to_concat) - - def difference(self, other, sort=None): - """ - Return a new Index with elements from the index that are not in - `other`. - - This is the set difference of two Index objects. - - Parameters - ---------- - other : Index or array-like - sort : False or None, default None - Whether to sort the resulting index. By default, the - values are attempted to be sorted, but any TypeError from - incomparable elements is caught by cudf. 
- - * None : Attempt to sort the result, but catch any TypeErrors - from comparing incomparable elements. - * False : Do not sort the result. - - Returns - ------- - difference : Index - - Examples - -------- - >>> import cudf - >>> idx1 = cudf.Index([2, 1, 3, 4]) - >>> idx1 - Int64Index([2, 1, 3, 4], dtype='int64') - >>> idx2 = cudf.Index([3, 4, 5, 6]) - >>> idx2 - Int64Index([3, 4, 5, 6], dtype='int64') - >>> idx1.difference(idx2) - Int64Index([1, 2], dtype='int64') - >>> idx1.difference(idx2, sort=False) - Int64Index([2, 1], dtype='int64') - """ - if sort not in {None, False}: - raise ValueError( - f"The 'sort' keyword only takes the values " - f"of None or False; {sort} was passed." - ) - - other = as_index(other) - - if is_mixed_with_object_dtype(self, other): - difference = self.copy() - else: - difference = self.join(other, how="leftanti") - if self.dtype != other.dtype: - difference = difference.astype(self.dtype) - - if sort is None: - return difference.sort_values() - - return difference - - def sort_values(self, return_indexer=False, ascending=True, key=None): - """ - Return a sorted copy of the index, and optionally return the indices - that sorted the index itself. - - Parameters - ---------- - return_indexer : bool, default False - Should the indices that would sort the index be returned. - ascending : bool, default True - Should the index values be sorted in an ascending order. - key : None, optional - This parameter is NON-FUNCTIONAL. - - Returns - ------- - sorted_index : Index - Sorted copy of the index. - indexer : cupy.ndarray, optional - The indices that the index itself was sorted by. - - See Also - -------- - cudf.Series.min : Sort values of a Series. - cudf.DataFrame.sort_values : Sort values in a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([10, 100, 1, 1000]) - >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') - - Sort values in ascending order (default behavior). 
- - >>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') - - Sort values in descending order, and also get the indices `idx` was - sorted by. - - >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], - dtype=int32)) - - Sorting values in a MultiIndex: - - >>> midx = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> midx - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> midx.sort_values() - MultiIndex([(-10, 1), - ( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11)], - names=['x', 'y']) - >>> midx.sort_values(ascending=False) - MultiIndex([( 4, 11), - ( 3, 11), - ( 1, 5), - ( 1, 1), - (-10, 1)], - names=['x', 'y']) - """ - if key is not None: - raise NotImplementedError("key parameter is not yet implemented.") - - indices = self._values.argsort(ascending=ascending) - index_sorted = as_index(self.take(indices), name=self.name) - - if return_indexer: - return index_sorted, cupy.asarray(indices) - else: - return index_sorted - - def unique(self): - """ - Return unique values in the index. - - Returns - ------- - Index without duplicates - """ - return as_index(self._values.unique(), name=self.name) - - def join( - self, other, how="left", level=None, return_indexers=False, sort=False - ): - """ - Compute join_index and indexers to conform data structures - to the new index. - - Parameters - ---------- - other : Index. - how : {'left', 'right', 'inner', 'outer'} - return_indexers : bool, default False - sort : bool, default False - Sort the join keys lexicographically in the result Index. If False, - the order of the join keys depends on the join type (how keyword). - - Returns: index - - Examples - -------- - >>> import cudf - >>> lhs = cudf.DataFrame( - ... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b'] - ... 
).index - >>> lhs - MultiIndex([(2, 3), - (3, 4), - (1, 2)], - names=['a', 'b']) - >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index - >>> rhs - Int64Index([1, 4, 3], dtype='int64', name='a') - >>> lhs.join(rhs, how='inner') - MultiIndex([(3, 4), - (1, 2)], - names=['a', 'b']) - """ - - if isinstance(self, cudf.MultiIndex) and isinstance( - other, cudf.MultiIndex - ): - raise TypeError( - "Join on level between two MultiIndex objects is ambiguous" - ) - - if level is not None and not is_scalar(level): - raise ValueError("level should be an int or a label only") - - if isinstance(other, cudf.MultiIndex): - if how == "left": - how = "right" - elif how == "right": - how = "left" - rhs = self.copy(deep=False) - lhs = other.copy(deep=False) - else: - lhs = self.copy(deep=False) - rhs = other.copy(deep=False) - - on = level - # In case of MultiIndex, it will be None as - # we don't need to update name - left_names = lhs.names - right_names = rhs.names - # There should be no `None` values in Joined indices, - # so essentially it would be `left/right` or 'inner' - # in case of MultiIndex - if isinstance(lhs, cudf.MultiIndex): - if level is not None and isinstance(level, int): - on = lhs._data.select_by_index(level).names[0] - right_names = (on,) or right_names - on = right_names[0] - if how == "outer": - how = "left" - elif how == "right": - how = "inner" - else: - # Both are nomal indices - right_names = left_names - on = right_names[0] - - lhs.names = left_names - rhs.names = right_names - - output = lhs._merge(rhs, how=how, on=on, sort=sort) - - return output - - def rename(self, name, inplace=False): - """ - Alter Index name. - - Defaults to returning new index. - - Parameters - ---------- - name : label - Name(s) to set. 
- - Returns - ------- - Index - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, 3], name='one') - >>> index - Int64Index([1, 2, 3], dtype='int64', name='one') - >>> index.name - 'one' - >>> renamed_index = index.rename('two') - >>> renamed_index - Int64Index([1, 2, 3], dtype='int64', name='two') - >>> renamed_index.name - 'two' - """ - if inplace is True: - self.name = name - return None - else: - out = self.copy(deep=False) - out.name = name - return out.copy(deep=True) - - def astype(self, dtype, copy=False): - """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. - - Parameters - ---------- - dtype : numpy dtype - Use a numpy.dtype to cast entire Index object to. - copy : bool, default False - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - - Returns - ------- - Index - Index with values cast to specified dtype. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, 3]) - >>> index - Int64Index([1, 2, 3], dtype='int64') - >>> index.astype('float64') - Float64Index([1.0, 2.0, 3.0], dtype='float64') - """ - if is_dtype_equal(dtype, self.dtype): - return self.copy(deep=copy) - - return as_index( - self.copy(deep=copy)._values.astype(dtype), name=self.name - ) - - def to_array(self, fillna=None): - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : str or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. 
- """ - return self._values.to_array(fillna=fillna) - - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys. - Useful with map for returning an indexer based on an index. - - Parameters - ---------- - index : Index, optional - Index of resulting Series. If None, defaults to original index. - name : str, optional - Dame of resulting Series. If None, defaults to name of original - index. - - Returns - ------- - Series - The dtype will be based on the type of the Index values. - """ - return cudf.Series( - self._values, - index=self.copy(deep=False) if index is None else index, - name=self.name if name is None else name, - ) - - def get_slice_bound(self, label, side, kind): - """ - Calculate slice bound that corresponds to given label. - Returns leftmost (one-past-the-rightmost if ``side=='right'``) position - of given label. - - Parameters - ---------- - label : object - side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} - - Returns - ------- - int - Index of label. 
- """ - raise (NotImplementedError) - - def __array_function__(self, func, types, args, kwargs): - - # check if the function is implemented for the current type - cudf_index_module = type(self) - for submodule in func.__module__.split(".")[1:]: - # point cudf_index_module to the correct submodule - if hasattr(cudf_index_module, submodule): - cudf_index_module = getattr(cudf_index_module, submodule) - else: - return NotImplemented - - fname = func.__name__ - - handled_types = [Index, cudf.Series] - - # check if we don't handle any of the types (including sub-class) - for t in types: - if not any( - issubclass(t, handled_type) for handled_type in handled_types - ): - return NotImplemented - - if hasattr(cudf_index_module, fname): - cudf_func = getattr(cudf_index_module, fname) - # Handle case if cudf_func is same as numpy function - if cudf_func is func: - return NotImplemented - else: - return cudf_func(*args, **kwargs) - - else: - return NotImplemented - - def isin(self, values): - """Return a boolean array where the index values are in values. - - Compute boolean array of whether each index value is found in - the passed set of values. The length of the returned boolean - array matches the length of the index. - - Parameters - ---------- - values : set, list-like, Index - Sought values. - - Returns - ------- - is_contained : cupy array - CuPy array of boolean values. - - Examples - -------- - >>> idx = cudf.Index([1,2,3]) - >>> idx - Int64Index([1, 2, 3], dtype='int64') - - Check whether each index value in a list of values. - - >>> idx.isin([1, 4]) - array([ True, False, False]) - """ - - return self._values.isin(values).values - - def where(self, cond, other=None): - """ - Replace values where the condition is False. - - Parameters - ---------- - cond : bool array-like with the same length as self - Where cond is True, keep the original value. - Where False, replace with corresponding value from other. - Callables are not supported. 
- other: scalar, or array-like - Entries where cond is False are replaced with - corresponding value from other. Callables are not - supported. Default is None. - - Returns - ------- - Same type as caller - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([4, 3, 2, 1, 0]) - >>> index - Int64Index([4, 3, 2, 1, 0], dtype='int64') - >>> index.where(index > 2, 15) - Int64Index([4, 3, 15, 15, 15], dtype='int64') - """ - return super().where(cond=cond, other=other) - - def memory_usage(self, deep=False): - """ - Memory usage of the values. - - Parameters - ---------- - deep : bool - Introspect the data deeply, - interrogate `object` dtypes for system-level - memory consumption. - - Returns - ------- - bytes used - """ - return self._values._memory_usage(deep=deep) - - def get_loc(self, key, method=None, tolerance=None): - """Get integer location, slice or boolean mask for requested label. - - Parameters - ---------- - key : label - method : {None, 'pad'/'fill', 'backfill'/'bfill', 'nearest'}, optional - - default: exact matches only. - - pad / ffill: find the PREVIOUS index value if no exact match. - - backfill / bfill: use NEXT index value if no exact match. - - nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index - value. - tolerance : int or float, optional - Maximum distance from index value for inexact matches. The value - of the index at the matching location must satisfy the equation - ``abs(index[loc] - key) <= tolerance``. 
- - Returns - ------- - int or slice or boolean mask - - If result is unique, return integer index - - If index is monotonic, loc is returned as a slice object - - Otherwise, a boolean mask is returned - - Examples - -------- - >>> unique_index = cudf.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 - >>> monotonic_index = cudf.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) - >>> non_monotonic_index = cudf.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True]) - >>> numeric_unique_index = cudf.Index([1, 2, 3]) - >>> numeric_unique_index.get_loc(3) - 2 - """ - if tolerance is not None: - raise NotImplementedError( - "Parameter tolerance is unsupported yet." - ) - if method not in { - None, - "ffill", - "bfill", - "pad", - "backfill", - "nearest", - }: - raise ValueError( - f"Invalid fill method. Expecting pad (ffill), backfill (bfill)" - f" or nearest. Got {method}" - ) - - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) - - if not is_sorted and method is not None: - raise ValueError( - "index must be monotonic increasing or decreasing if `method`" - "is specified." 
- ) - - key_as_table = Table({"None": as_column(key, length=1)}) - lower_bound, upper_bound, sort_inds = self._lexsorted_equal_range( - key_as_table, is_sorted - ) - - if lower_bound == upper_bound: - # Key not found, apply method - if method in ("pad", "ffill"): - if lower_bound == 0: - raise KeyError(key) - return lower_bound - 1 - elif method in ("backfill", "bfill"): - if lower_bound == self._data.nrows: - raise KeyError(key) - return lower_bound - elif method == "nearest": - if lower_bound == self._data.nrows: - return lower_bound - 1 - elif lower_bound == 0: - return 0 - lower_val = self._column.element_indexing(lower_bound - 1) - upper_val = self._column.element_indexing(lower_bound) - return ( - lower_bound - 1 - if abs(lower_val - key) < abs(upper_val - key) - else lower_bound - ) - else: - raise KeyError(key) - - if lower_bound + 1 == upper_bound: - # Search result is unique, return int. - return ( - lower_bound - if is_sorted - else sort_inds.element_indexing(lower_bound) - ) - - if is_sorted: - # In monotonic index, lex search result is continuous. A slice for - # the range is returned. - return slice(lower_bound, upper_bound) - - # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).to_gpu_array() - mask[cupy.array(true_inds)] = True - return mask - - def _lexsorted_equal_range( - self, key_as_table: Table, is_sorted: bool - ) -> Tuple[int, int, Optional[ColumnBase]]: - """Get equal range for key in lexicographically sorted index. If index - is not sorted when called, a sort will take place and `sort_inds` is - returned. Otherwise `None` is returned in that position. 
- """ - if not is_sorted: - sort_inds = self._get_sorted_inds() - sort_vals = self._gather(sort_inds) - else: - sort_inds = None - sort_vals = self - lower_bound = search_sorted( - sort_vals, key_as_table, side="left" - ).element_indexing(0) - upper_bound = search_sorted( - sort_vals, key_as_table, side="right" - ).element_indexing(0) - - return lower_bound, upper_bound, sort_inds - - @classmethod - def from_pandas(cls, index, nan_as_null=None): - """ - Convert from a Pandas Index. - - Parameters - ---------- - index : Pandas Index object - A Pandas Index object which has to be converted - to cuDF Index. - nan_as_null : bool, Default None - If ``None``/``True``, converts ``np.nan`` values - to ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Raises - ------ - TypeError for invalid input type. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> import numpy as np - >>> data = [10, 20, 30, np.nan] - >>> pdi = pd.Index(data) - >>> cudf.Index.from_pandas(pdi) - Float64Index([10.0, 20.0, 30.0, ], dtype='float64') - >>> cudf.Index.from_pandas(pdi, nan_as_null=False) - Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') - """ - if not isinstance(index, pd.Index): - raise TypeError("not a pandas.Index") + TimeDeltaColumn, + arange, + column, +) +from cudf.core.column.column import as_column, concat_columns +from cudf.core.column.string import StringMethods as StringMethods +from cudf.core.dtypes import IntervalDtype +from cudf.core.frame import Frame, SingleColumnFrame +from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import ( + _is_non_decimal_numeric_dtype, + find_common_type, + is_categorical_dtype, + is_interval_dtype, +) +from cudf.utils.utils import cached_property, search_range - ind = as_index(column.as_column(index, nan_as_null=nan_as_null)) - ind.name = index.name - return ind +T = TypeVar("T", bound="Frame") - @classmethod - def _from_data( - cls, - data: MutableMapping, - index: 
Optional[BaseIndex] = None, - name: Any = None, - ) -> BaseIndex: - assert index is None - if not isinstance(data, cudf.core.column_accessor.ColumnAccessor): - data = cudf.core.column_accessor.ColumnAccessor(data) - if len(data) == 0: - raise ValueError("Cannot construct Index from any empty Table") - if len(data) == 1: - values = next(iter(data.values())) - - if isinstance(values, NumericalColumn): - try: - index_class_type = _dtype_to_index[values.dtype.type] - except KeyError: - index_class_type = GenericIndex - out = super(BaseIndex, index_class_type).__new__( - index_class_type - ) - elif isinstance(values, DatetimeColumn): - out = super(BaseIndex, DatetimeIndex).__new__(DatetimeIndex) - elif isinstance(values, TimeDeltaColumn): - out = super(BaseIndex, TimedeltaIndex).__new__(TimedeltaIndex) - elif isinstance(values, StringColumn): - out = super(BaseIndex, StringIndex).__new__(StringIndex) - elif isinstance(values, CategoricalColumn): - out = super(BaseIndex, CategoricalIndex).__new__( - CategoricalIndex - ) - out._data = data - out._index = None - return out - else: - return cudf.MultiIndex._from_data(data) - @property - def _constructor_expanddim(self): - return cudf.MultiIndex +def _lexsorted_equal_range( + idx: Union[GenericIndex, cudf.MultiIndex], + key_as_table: Table, + is_sorted: bool, +) -> Tuple[int, int, Optional[ColumnBase]]: + """Get equal range for key in lexicographically sorted index. If index + is not sorted when called, a sort will take place and `sort_inds` is + returned. Otherwise `None` is returned in that position. 
+    """
+    if not is_sorted:
+        sort_inds = idx._get_sorted_inds()
+        sort_vals = idx._gather(sort_inds)
+    else:
+        sort_inds = None
+        sort_vals = idx
+    lower_bound = search_sorted(
+        sort_vals, key_as_table, side="left"
+    ).element_indexing(0)
+    upper_bound = search_sorted(
+        sort_vals, key_as_table, side="right"
+    ).element_indexing(0)
+
+    return lower_bound, upper_bound, sort_inds
+
+
+def _index_from_data(data: MutableMapping, name: Any = None):
+    """Construct an index of the appropriate type from some data.
+
+    Parameters
+    ----------
+    data : MutableMapping
+        Mapping of column labels to columns. A single entry is dispatched
+        on its column type to the matching single-column index class; more
+        than one entry selects ``cudf.MultiIndex``.
+    name : Any, default None
+        Passed through to the selected class's ``_from_data``.
+
+    Returns
+    -------
+    The object returned by the selected index class's ``_from_data``.
+
+    Raises
+    ------
+    ValueError
+        If ``data`` is empty.
+    NotImplementedError
+        If ``data`` holds a single column whose type has no index class.
+    """
+    if len(data) == 0:
+        raise ValueError("Cannot construct Index from any empty Table")
+    if len(data) == 1:
+        values = next(iter(data.values()))
+
+        if isinstance(values, NumericalColumn):
+            try:
+                index_class_type: Type[
+                    Union[GenericIndex, cudf.MultiIndex]
+                ] = _dtype_to_index[values.dtype.type]
+            except KeyError:
+                index_class_type = GenericIndex
+        elif isinstance(values, DatetimeColumn):
+            index_class_type = DatetimeIndex
+        elif isinstance(values, TimeDeltaColumn):
+            index_class_type = TimedeltaIndex
+        elif isinstance(values, StringColumn):
+            index_class_type = StringIndex
+        elif isinstance(values, CategoricalColumn):
+            index_class_type = CategoricalIndex
+        elif isinstance(values, IntervalColumn):
+            index_class_type = IntervalIndex
+        else:
+            # Without this branch `index_class_type` stays unbound and the
+            # return below fails with an unrelated UnboundLocalError.
+            raise NotImplementedError(
+                "Unsupported column type passed to "
+                f"create an Index: {type(values)}"
+            )
+    else:
+        index_class_type = cudf.MultiIndex
+    return index_class_type._from_data(data, None, name)
 
 
 class RangeIndex(BaseIndex):
@@ -1371,6 +163,13 @@ def __init__(
         self._index = None
         self._name = name
 
+    def _copy_type_metadata(
+        self, other: Frame, include_index: bool = True
+    ) -> RangeIndex:
+        # There is no metadata to be copied for RangeIndex since it does not
+        # have an underlying column.
+ return self + @property def name(self): """ @@ -1461,6 +260,9 @@ def copy(self, name=None, deep=False, dtype=None, names=None): start=self._start, stop=self._stop, step=self._step, name=name ) + def drop_duplicates(self, keep="first"): + return self + def __repr__(self): return ( f"{self.__class__.__name__}(start={self._start}, stop={self._stop}" @@ -1509,7 +311,7 @@ def equals(self, other): other._step, ): return True - return super().equals(other) + return cudf.Int64Index._from_data(self._data).equals(other) def serialize(self): header = {} @@ -1683,10 +485,102 @@ def __mul__(self, other): return RangeIndex( self.start * other, self.stop * other, self.step * other ) - return super().__mul__(other) + return self._as_int64().__mul__(other) + + def __rmul__(self, other): + # Multiplication is commutative. + return self.__mul__(other) + + def _as_int64(self): + # Convert self to an Int64Index. This method is used to perform ops + # that are not defined directly on RangeIndex. + return cudf.Int64Index._from_data(self._data) + + def __getattr__(self, key): + # For methods that are not defined for RangeIndex we attempt to operate + # on the corresponding integer index if possible. 
+ try: + return getattr(self._as_int64(), key) + except AttributeError: + raise AttributeError( + f"'{type(self)}' object has no attribute {key}" + ) + + def get_loc(self, key, method=None, tolerance=None): + # Given an actual integer, + idx = (key - self._start) / self._step + idx_int_upper_bound = (self._stop - self._start) // self._step + if method is None: + if tolerance is not None: + raise ValueError( + "tolerance argument only valid if using pad, " + "backfill or nearest lookups" + ) + + if idx > idx_int_upper_bound or idx < 0: + raise KeyError(key) + + idx_int = (key - self._start) // self._step + if idx_int != idx: + raise KeyError(key) + return idx_int + + if (method == "ffill" and idx < 0) or ( + method == "bfill" and idx > idx_int_upper_bound + ): + raise KeyError(key) + + round_method = { + "ffill": math.floor, + "bfill": math.ceil, + "nearest": round, + }[method] + if tolerance is not None and (abs(idx) * self._step > tolerance): + raise KeyError(key) + return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + + +# Patch in all binops and unary ops, which bypass __getattr__ on the instance +# and prevent the above overload from working. 
+for binop in (
+    "__add__",
+    "__radd__",
+    "__sub__",
+    "__rsub__",
+    "__mod__",
+    "__rmod__",
+    "__pow__",
+    "__rpow__",
+    "__floordiv__",
+    "__rfloordiv__",
+    "__truediv__",
+    "__rtruediv__",
+    "__and__",
+    "__or__",
+    "__xor__",
+    "__eq__",
+    "__ne__",
+    "__lt__",
+    "__le__",
+    "__gt__",
+    "__ge__",
+):
+    # Bind the name through a default argument: a plain closure would look
+    # up `binop` at call time and see only the final loop value.
+    setattr(
+        RangeIndex,
+        binop,
+        lambda self, other, op=binop: getattr(self._as_int64(), op)(other),
+    )
+
+
+# Unary operators take no second operand, so they get a one-argument wrapper.
+for unaop in ("__neg__", "__pos__", "__abs__"):
+    setattr(
+        RangeIndex,
+        unaop,
+        lambda self, op=unaop: getattr(self._as_int64(), op)(),
+    )
 
 
-class GenericIndex(BaseIndex):
+class GenericIndex(SingleColumnFrame, BaseIndex):
     """
     An array of orderable values that represent the indices of another Column
 
@@ -1725,10 +619,118 @@ def __init__(self, data, **kwargs):
         name = kwargs.get("name")
         super().__init__({name: data})
 
+    def drop_duplicates(self, keep="first"):
+        """
+        Return Index with duplicate values removed
+
+        Parameters
+        ----------
+        keep : {‘first’, ‘last’, False}, default ‘first’
+            * ‘first’ : Drop duplicates except for the
+              first occurrence.
+            * ‘last’ : Drop duplicates except for the
+              last occurrence.
+            * False : Drop all duplicates.
+
+        Returns
+        -------
+        Index
+
+        Examples
+        --------
+        >>> import cudf
+        >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])
+        >>> idx
+        StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object')
+        >>> idx.drop_duplicates()
+        StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object')
+        """  # noqa: E501
+        return super().drop_duplicates(keep=keep)
+
+    def _binaryop(
+        self,
+        other: T,
+        fn: str,
+        fill_value: Any = None,
+        reflect: bool = False,
+        *args,
+        **kwargs,
+    ) -> SingleColumnFrame:
+        # Specialize binops to generate the appropriate output index type.
+ operands = self._make_operands_for_binop(other, fill_value, reflect) + return ( + _index_from_data(data=self._colwise_binop(operands, fn),) + if operands is not NotImplemented + else NotImplemented + ) + + def _copy_type_metadata( + self, other: Frame, include_index: bool = True + ) -> GenericIndex: + """ + Copy type metadata from each column of `other` to the corresponding + column of `self`. + See `ColumnBase._with_type_metadata` for more information. + """ + for name, col, other_col in zip( + self._data.keys(), self._data.values(), other._data.values() + ): + self._data.set_by_label( + name, col._with_type_metadata(other_col.dtype), validate=False + ) + return self + @property def _values(self): return self._column + @classmethod + def _concat(cls, objs): + if all(isinstance(obj, RangeIndex) for obj in objs): + result = _concat_range_index(objs) + else: + data = concat_columns([o._values for o in objs]) + result = as_index(data) + + names = {obj.name for obj in objs} + if len(names) == 1: + [name] = names + else: + name = None + + result.name = name + return result + + @annotate("INDEX_EQUALS", color="green", domain="cudf_python") + def equals(self, other, **kwargs): + """ + Determine if two Index objects contain the same elements. + + Returns + ------- + out: bool + True if “other” is an Index and it has the same elements + as calling index; False otherwise. 
+ """ + if not isinstance(other, BaseIndex): + return False + + check_types = False + + self_is_categorical = isinstance(self, CategoricalIndex) + other_is_categorical = isinstance(other, CategoricalIndex) + if self_is_categorical and not other_is_categorical: + other = other.astype(self.dtype) + check_types = True + elif other_is_categorical and not self_is_categorical: + self = self.astype(other.dtype) + check_types = True + + try: + return super().equals(other, check_types=check_types) + except TypeError: + return False + def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. @@ -1753,7 +755,124 @@ def copy(self, name=None, deep=False, dtype=None, names=None): dtype = self.dtype if dtype is None else dtype name = self.name if name is None else name - return as_index(self._values.astype(dtype), name=name, copy=deep) + col = self._values.astype(dtype) + return _index_from_data({name: col.copy(True) if deep else col}) + + def get_loc(self, key, method=None, tolerance=None): + """Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + method : {None, 'pad'/'fill', 'backfill'/'bfill', 'nearest'}, optional + - default: exact matches only. + - pad / ffill: find the PREVIOUS index value if no exact match. + - backfill / bfill: use NEXT index value if no exact match. + - nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index + value. + tolerance : int or float, optional + Maximum distance from index value for inexact matches. The value + of the index at the matching location must satisfy the equation + ``abs(index[loc] - key) <= tolerance``. 
+ + Returns + ------- + int or slice or boolean mask + - If result is unique, return integer index + - If index is monotonic, loc is returned as a slice object + - Otherwise, a boolean mask is returned + + Examples + -------- + >>> unique_index = cudf.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 + >>> monotonic_index = cudf.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) + >>> non_monotonic_index = cudf.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True]) + >>> numeric_unique_index = cudf.Index([1, 2, 3]) + >>> numeric_unique_index.get_loc(3) + 2 + """ + if tolerance is not None: + raise NotImplementedError( + "Parameter tolerance is unsupported yet." + ) + if method not in { + None, + "ffill", + "bfill", + "pad", + "backfill", + "nearest", + }: + raise ValueError( + f"Invalid fill method. Expecting pad (ffill), backfill (bfill)" + f" or nearest. Got {method}" + ) + + is_sorted = ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ) + + if not is_sorted and method is not None: + raise ValueError( + "index must be monotonic increasing or decreasing if `method`" + "is specified." 
+ ) + + key_as_table = Table({"None": as_column(key, length=1)}) + lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( + self, key_as_table, is_sorted + ) + + if lower_bound == upper_bound: + # Key not found, apply method + if method in ("pad", "ffill"): + if lower_bound == 0: + raise KeyError(key) + return lower_bound - 1 + elif method in ("backfill", "bfill"): + if lower_bound == self._data.nrows: + raise KeyError(key) + return lower_bound + elif method == "nearest": + if lower_bound == self._data.nrows: + return lower_bound - 1 + elif lower_bound == 0: + return 0 + lower_val = self._column.element_indexing(lower_bound - 1) + upper_val = self._column.element_indexing(lower_bound) + return ( + lower_bound - 1 + if abs(lower_val - key) < abs(upper_val - key) + else lower_bound + ) + else: + raise KeyError(key) + + if lower_bound + 1 == upper_bound: + # Search result is unique, return int. + return ( + lower_bound + if is_sorted + else sort_inds.element_indexing(lower_bound) + ) + + if is_sorted: + # In monotonic index, lex search result is continuous. A slice for + # the range is returned. + return slice(lower_bound, upper_bound) + + # Not sorted and not unique. 
Return a boolean mask + mask = cupy.full(self._data.nrows, False) + true_inds = sort_inds.slice(lower_bound, upper_bound).to_gpu_array() + mask[cupy.array(true_inds)] = True + return mask def __sizeof__(self): return self._values.__sizeof__() @@ -3070,37 +2189,27 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: idx = arbitrary.copy(deep=False) idx.rename(kwargs["name"], inplace=True) return idx - elif isinstance(arbitrary, NumericalColumn): - try: - return _dtype_to_index[arbitrary.dtype.type](arbitrary, **kwargs) - except KeyError: - return GenericIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, StringColumn): - return StringIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, DatetimeColumn): - return DatetimeIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, TimeDeltaColumn): - return TimedeltaIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, CategoricalColumn): - return CategoricalIndex(arbitrary, **kwargs) - elif isinstance(arbitrary, IntervalColumn): - return IntervalIndex(arbitrary, **kwargs) + elif isinstance(arbitrary, ColumnBase): + return _index_from_data({kwargs.get("name", None): arbitrary}) elif isinstance(arbitrary, cudf.Series): return as_index(arbitrary._column, **kwargs) - elif isinstance(arbitrary, pd.RangeIndex): - return RangeIndex(start=arbitrary.start, stop=arbitrary.stop, **kwargs) + elif isinstance(arbitrary, (pd.RangeIndex, range)): + return RangeIndex( + start=arbitrary.start, + stop=arbitrary.stop, + step=arbitrary.step, + **kwargs, + ) elif isinstance(arbitrary, pd.MultiIndex): return cudf.MultiIndex.from_pandas(arbitrary) elif isinstance(arbitrary, cudf.DataFrame): return cudf.MultiIndex(source_data=arbitrary) - elif isinstance(arbitrary, range): - return RangeIndex(arbitrary, **kwargs) return as_index( column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs ) -_dtype_to_index: Dict[Any, Type[BaseIndex]] = { +_dtype_to_index: Dict[Any, Type[NumericIndex]] = { np.int8: Int8Index, np.int16: 
Int16Index, np.int32: Int32Index, @@ -3180,9 +2289,9 @@ def __new__( tupleize_cols=True, **kwargs, ): - assert cls is Index, ( - "Index cannot be subclassed, extend BaseIndex " "instead." - ) + assert ( + cls is Index + ), "Index cannot be subclassed, extend BaseIndex instead." if tupleize_cols is not True: raise NotImplementedError( "tupleize_cols != True is not yet supported" @@ -3190,6 +2299,14 @@ def __new__( return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + @classmethod + def from_arrow(cls, obj): + try: + return cls(ColumnBase.from_arrow(obj)) + except TypeError: + # Try interpreting object as a MultiIndex before failing. + return cudf.MultiIndex.from_arrow(obj) + def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: """ diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 78fc7a863d6..1d1f661779f 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -70,7 +70,9 @@ def _frame_select_by_indexers( else: data.set_by_label(idx.name, idx.get(frame), validate=False) - result_index = cudf.Index._from_data(index_data) if index_data else None + result_index = ( + cudf.core.index._index_from_data(index_data) if index_data else None + ) result = cudf.core.frame.Frame(data=data, index=result_index) return result diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 418d24f41df..3b364a3fa86 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -6,7 +6,7 @@ import pickle import warnings from collections.abc import Sequence -from typing import Any, List, Mapping, Tuple, Union +from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy import numpy as np @@ -18,12 +18,12 @@ from cudf._typing import DataFrameOrSeries from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import as_column, column -from cudf.core.frame import 
SingleColumnFrame -from cudf.core.index import BaseIndex, as_index +from cudf.core.frame import Frame +from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index from cudf.utils.utils import _maybe_indices_to_slice -class MultiIndex(BaseIndex): +class MultiIndex(Frame, BaseIndex): """A multi-level or hierarchical index. Provides N-Dimensional indexing into Series and DataFrame objects. @@ -191,11 +191,6 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) - @property - def _num_columns(self): - # MultiIndex is not a single-columned frame. - return super(SingleColumnFrame, self)._num_columns - def rename(self, names, inplace=False): """ Alter MultiIndex level names @@ -283,14 +278,18 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) - # TODO: This type ignore is indicating a real problem, which is that - # MultiIndex should not be inheriting from SingleColumnFrame, but fixing - # that will have to wait until we reshuffle the Index hierarchy. @classmethod - def _from_data( # type: ignore - cls, data: Mapping, index=None + def _from_data( + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, + name: Any = None, ) -> MultiIndex: - return cls.from_frame(cudf.DataFrame._from_data(data)) + assert index is None + obj = cls.from_frame(cudf.DataFrame._from_data(data)) + if name is not None: + obj.name = name + return obj @property def shape(self): @@ -434,6 +433,15 @@ def deepcopy(self): def __copy__(self): return self.copy(deep=True) + def __iter__(self): + """ + Iterating over a GPU object is not effecient and hence not supported. + + Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` + if you wish to iterate over the values. + """ + cudf.utils.utils.raise_iteration_error(obj=self) + def _popn(self, n): """ Returns a copy of this index without the left-most n values. 
@@ -535,68 +543,6 @@ def __repr__(self): data_output = "\n".join(lines) return output_prefix + data_output - @classmethod - def from_arrow(cls, table): - """ - Convert PyArrow Table to MultiIndex - - Parameters - ---------- - table : PyArrow Table - PyArrow Object which has to be converted to MultiIndex - - Returns - ------- - cudf MultiIndex - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> tbl = pa.table({"a":[1, 2, 3], "b":["a", "b", "c"]}) - >>> cudf.MultiIndex.from_arrow(tbl) - MultiIndex([(1, 'a'), - (2, 'b'), - (3, 'c')], - names=['a', 'b']) - """ - - return super(SingleColumnFrame, cls).from_arrow(table) - - def to_arrow(self): - """Convert MultiIndex to PyArrow Table - - Returns - ------- - PyArrow Table - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"a":[1, 2, 3], "b":[2, 3, 4]}) - >>> mindex = cudf.Index(df) - >>> mindex - MultiIndex([(1, 2), - (2, 3), - (3, 4)], - names=['a', 'b']) - >>> mindex.to_arrow() - pyarrow.Table - a: int64 - b: int64 - >>> mindex.to_arrow()['a'] - - [ - [ - 1, - 2, - 3 - ] - ] - """ - - return super(SingleColumnFrame, self).to_arrow() - @property def codes(self): """ @@ -1401,7 +1347,7 @@ def _poplevels(self, level): popped_data[n] = self._data.pop(n) # construct the popped result - popped = cudf.Index._from_data(popped_data) + popped = cudf.core.index._index_from_data(popped_data) popped.names = popped_names # update self @@ -1548,6 +1494,18 @@ def is_unique(self): ) return self._is_unique + @property + def is_monotonic(self): + """Return boolean if values in the object are monotonic_increasing. + + This property is an alias for :attr:`is_monotonic_increasing`. 
+ + Returns + ------- + bool + """ + return self.is_monotonic_increasing + @property def is_monotonic_increasing(self): """ @@ -1853,11 +1811,9 @@ def get_loc(self, key, method=None, tolerance=None): partial_index = self.__class__._from_data( data=self._data.select_by_index(slice(key_as_table._num_columns)) ) - ( - lower_bound, - upper_bound, - sort_inds, - ) = partial_index._lexsorted_equal_range(key_as_table, is_sorted) + (lower_bound, upper_bound, sort_inds,) = _lexsorted_equal_range( + partial_index, key_as_table, is_sorted + ) if lower_bound == upper_bound: raise KeyError(key) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1b8405af1a4..392a251dfc4 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -386,7 +386,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): - return cudf.Index._concat(objs) + return cudf.core.index.GenericIndex._concat(objs) else: raise TypeError(f"cannot concatenate object of type {typ}") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7943d033cf8..4fe5712f240 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, MutableMapping, Optional +from typing import Any, MutableMapping, Optional, Set from uuid import uuid4 import cupy @@ -39,7 +39,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame, SingleColumnFrame, _drop_rows_by_labels from cudf.core.groupby.groupby import SeriesGroupBy -from cudf.core.index import BaseIndex, Index, RangeIndex, as_index +from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer from cudf.utils import 
cudautils, docutils from cudf.utils.docutils import copy_docstring @@ -105,6 +105,8 @@ class Series(SingleColumnFrame, Serializable): If ``False``, leaves ``np.nan`` values as is. """ + _accessors: Set[Any] = set() + # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property def _constructor(self): @@ -1216,6 +1218,7 @@ def _binaryop( *args, **kwargs, ): + # Specialize binops to align indices. if isinstance(other, SingleColumnFrame): if ( # TODO: The can_reindex logic also needs to be applied for @@ -1238,8 +1241,14 @@ def _binaryop( else: lhs = self - # Note that we call the super on lhs, not self. - return super(Series, lhs)._binaryop(other, fn, fill_value, reflect) + operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return ( + lhs._from_data( + data=lhs._colwise_binop(operands, fn), index=lhs._index, + ) + if operands is not NotImplemented + else NotImplemented + ) def add(self, other, fill_value=None, axis=0): """ @@ -2246,7 +2255,9 @@ def _concat(cls, objs, axis=0, index=True): if isinstance(objs[0].index, cudf.MultiIndex): index = cudf.MultiIndex._concat([o.index for o in objs]) else: - index = Index._concat([o.index for o in objs]) + index = cudf.core.index.GenericIndex._concat( + [o.index for o in objs] + ) names = {obj.name for obj in objs} if len(names) == 1: diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f80bdec0ab5..29b39fbd195 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -672,11 +672,11 @@ def test_index_where(data, condition, other, error): else: assert_eq( ps.where(ps_condition, other=ps_other) - .fillna(gs._columns[0].default_na_value()) + .fillna(gs._values.default_na_value()) .values, gs.where(gs_condition, other=gs_other) .to_pandas() - .fillna(gs._columns[0].default_na_value()) + .fillna(gs._values.default_na_value()) .values, ) else: @@ -2098,6 +2098,35 @@ def test_get_loc_single_unique_numeric(idx, key, 
method): assert_eq(expected, got) +@pytest.mark.parametrize( + "idx", [pd.RangeIndex(3, 100, 4)], +) +@pytest.mark.parametrize("key", list(range(1, 110, 3))) +@pytest.mark.parametrize("method", [None, "ffill"]) +def test_get_loc_rangeindex(idx, key, method): + pi = idx + gi = cudf.from_pandas(pi) + + if ( + (key not in pi and method is None) + # Get key before the first element is KeyError + or (key < pi.start and method in "ffill") + # Get key after the last element is KeyError + or (key >= pi.stop and method in "bfill") + ): + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key, "method": method}), + rfunc_args_and_kwargs=([], {"key": key, "method": method}), + ) + else: + expected = pi.get_loc(key, method=method) + got = gi.get_loc(key, method=method) + + assert_eq(expected, got) + + @pytest.mark.parametrize( "idx", [