diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index 52ddbd8b8fb..5b644fda2f8 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -12,6 +12,7 @@ from cudf._lib.cpp.io.types cimport table_with_metadata from cudf._lib.cpp.types cimport size_type from cudf._lib.io.utils cimport make_source_info from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): @@ -52,4 +53,4 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): names = [name.decode() for name in c_result.metadata.column_names] - return Table.from_unique_ptr(move(c_result.tbl), column_names=names) + return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index 86778e0a9e1..5266d0ac773 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -15,6 +15,7 @@ from cudf._lib.cpp.concatenate cimport ( from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.table cimport Table from cudf._lib.utils cimport ( + data_from_unique_ptr, make_column_views, make_table_data_views, make_table_views, @@ -52,7 +53,8 @@ cpdef concat_tables(object tables, bool ignore_index=False): c_views = make_table_data_views(tables) with nogil: c_result = move(libcudf_concatenate_tables(c_views)) - return Table.from_unique_ptr( + + return data_from_unique_ptr( move(c_result), column_names=tables[0]._column_names, index_names=None if ignore_index else tables[0]._index_names diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 87a8aeaaa26..c2b7fb32546 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -11,6 +11,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from rmm._lib.device_buffer cimport DeviceBuffer + from cudf.core.buffer import Buffer from cudf._lib.column cimport Column @@ -35,6 +36,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type +from cudf._lib.utils cimport data_from_table_view, data_from_unique_ptr # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -178,7 +180,7 @@ def gather( ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -210,19 +212,17 @@ def _scatter_table(Table source_table, Column scatter_map, ) ) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=None ) - out_table._index = ( + return data, ( None if target_table._index is None else target_table._index.copy( deep=False) ) - return out_table - def _scatter_scalar(scalars, Column scatter_map, Table target_table, bool bounds_check=True): @@ -250,19 +250,17 @@ def _scatter_scalar(scalars, Column scatter_map, ) ) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=None ) - out_table._index = ( + return data, ( None if target_table._index is None else target_table._index.copy( deep=False) ) - return out_table - def scatter(object input, object scatter_map, Table target, bool bounds_check=True): @@ -306,7 +304,7 @@ def _reverse_table(Table 
source_table): reverse_table_view )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=source_table._index_names @@ -371,7 +369,7 @@ def table_empty_like(Table input_table, bool keep_index=True): with nogil: c_result = move(cpp_copying.empty_like(input_table_view)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=input_table._column_names, index_names=( @@ -434,8 +432,8 @@ def table_slice(Table input_table, object indices, bool keep_index=True): ) num_of_result_cols = c_result.size() - result =[ - Table.from_table_view( + return [ + data_from_table_view( c_result[i], input_table, column_names=input_table._column_names, @@ -446,8 +444,6 @@ def table_slice(Table input_table, object indices, bool keep_index=True): ) ) for i in range(num_of_result_cols)] - return result - def column_split(Column input_column, object splits): @@ -505,8 +501,8 @@ def table_split(Table input_table, object splits, bool keep_index=True): ) num_of_result_cols = c_result.size() - result = [ - Table.from_table_view( + return [ + data_from_table_view( c_result[i], input_table, column_names=input_table._column_names, @@ -515,8 +511,6 @@ def table_split(Table input_table, object splits, bool keep_index=True): else None ) for i in range(num_of_result_cols)] - return result - def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask): @@ -642,7 +636,7 @@ def _boolean_mask_scatter_table(Table input_table, Table target_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=target_table._index._column_names @@ -672,13 +666,15 @@ def _boolean_mask_scatter_scalar(list input_scalars, Table target_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=target_table._index._column_names ) +# TODO: This function is currently unused but should be used in +# ColumnBase.__setitem__, see https://github.com/rapidsai/cudf/issues/8667. 
def boolean_mask_scatter(object input, Table target_table, Column boolean_mask): @@ -755,7 +751,7 @@ def sample(Table input, size_type n, cpp_copying.sample(tbl_view, n, replacement, seed) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_output), column_names=input._column_names, index_names=( @@ -884,12 +880,12 @@ cdef class _CPackedColumns: return p def unpack(self): - output_table = Table.from_table_view( + output_table = Table(*data_from_table_view( cpp_copying.unpack(self.c_obj), self, self.column_names, self.index_names - ) + )) for name, dtype in self.column_dtypes.items(): output_table._data[name] = ( diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 773e81a0a7b..2dfa61ee900 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -40,6 +40,7 @@ from cudf._lib.cpp.io.types cimport ( from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.io.utils cimport make_sink_info, make_source_info from cudf._lib.table cimport Table, make_table_view +from cudf._lib.utils cimport data_from_unique_ptr ctypedef int32_t underlying_type_t_compression @@ -393,7 +394,7 @@ def read_csv( c_result = move(cpp_read_csv(read_csv_options_c)) meta_names = [name.decode() for name in c_result.metadata.column_names] - df = cudf.DataFrame._from_table(Table.from_unique_ptr( + df = cudf.DataFrame._from_data(*data_from_unique_ptr( move(c_result.tbl), column_names=meta_names )) diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx index d9fdf72415c..99a3957006b 100644 --- a/python/cudf/cudf/_lib/filling.pyx +++ b/python/cudf/cudf/_lib/filling.pyx @@ -16,6 +16,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def fill_in_place(Column destination, int begin, int end, DeviceScalar value): @@ -70,7 +71,7 @@ def _repeat_via_column(Table inp, Column count, bool check_count): c_check_count )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=inp._column_names, index_names=inp._index_names @@ -87,7 +88,7 @@ def _repeat_via_size_type(Table inp, size_type count): count )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=inp._column_names, index_names=inp._index_names diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 12e3f65a8a2..ed9820300d8 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -22,6 +22,8 @@ from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector +import cudf + from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table @@ -39,6 +41,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.utilities.host_span cimport host_span +from cudf._lib.utils cimport data_from_unique_ptr # The sets below define the possible aggregations that can be performed on # different dtypes. These strings must be elements of the AggregationKind enum. 
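Most of the call-site changes above and below follow the same calling-convention shift: `data_from_unique_ptr` returns a `(data, index)` pair — a dict mapping column names to `Column` objects plus an optional index — instead of a constructed `Table`. A condensed sketch of the two ways the pair is consumed in this patch, with `c_result`, `names`, and `source_table` standing in for whatever the enclosing Cython function already holds:

    # Before: the Cython layer built a Table object itself.
    # tbl = Table.from_unique_ptr(move(c_result), column_names=names)

    # After: return plain columns and let the Python layer pick the container.
    data, index = data_from_unique_ptr(
        move(c_result),
        column_names=source_table._column_names,
        index_names=source_table._index_names,
    )
    # ...either unpacked as above, or splatted into a _from_data constructor:
    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
        move(c_result), column_names=names
    ))

Functions that need only the columns (for example `replace_nulls` in groupby.pyx below) take element `[0]` of the returned pair and discard the index.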
@@ -91,11 +94,11 @@ cdef class GroupBy: c_grouped_values = move(c_groups.values) c_group_offsets = c_groups.offsets - grouped_keys = Table.from_unique_ptr( + grouped_keys = cudf.Index._from_data(*data_from_unique_ptr( move(c_grouped_keys), column_names=range(c_grouped_keys.get()[0].num_columns()) - ) - grouped_values = Table.from_unique_ptr( + )) + grouped_values = data_from_unique_ptr( move(c_grouped_values), index_names=values._index_names, column_names=values._column_names @@ -197,7 +200,7 @@ cdef class GroupBy: else: raise - grouped_keys = Table.from_unique_ptr( + grouped_keys, _ = data_from_unique_ptr( move(c_result.first), column_names=self.keys._column_names ) @@ -213,7 +216,7 @@ cdef class GroupBy: Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - return Table(data=result_data, index=grouped_keys) + return result_data, cudf.Index._from_data(grouped_keys) def shift(self, Table values, int periods, list fill_values): cdef table_view view = values.view() @@ -238,16 +241,16 @@ cdef class GroupBy: self.c_obj.get()[0].shift(view, offsets, c_fill_values) ) - grouped_keys = Table.from_unique_ptr( + grouped_keys = cudf.Index._from_data(*data_from_unique_ptr( move(c_result.first), column_names=self.keys._column_names - ) + )) - shifted = Table.from_unique_ptr( + shifted, _ = data_from_unique_ptr( move(c_result.second), column_names=values._column_names ) - return Table(data=shifted._data, index=grouped_keys) + return shifted, grouped_keys def replace_nulls(self, Table values, object method): cdef table_view val_view = values.view() @@ -265,12 +268,10 @@ cdef class GroupBy: self.c_obj.get()[0].replace_nulls(val_view, policies) ) - grouped_result = Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result.second), column_names=values._column_names - ) + )[0] - result = Table(data=grouped_result._data) - return result _GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"} diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 198e7a748c9..137b19ef69c 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -15,6 +15,7 @@ from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def hash_partition(Table source_table, object columns_to_hash, @@ -41,12 +42,14 @@ def hash_partition(Table source_table, object columns_to_hash, # the original table (`source_table`) is empty. We need to # return a list of zeros in this case. 
return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=source_table._column_names, - index_names=source_table._index_names if( - keep_index is True) - else None + index_names=( + source_table._index_names + if keep_index is True + else None + ) ), list(c_result.second) if c_result.second.size() diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 08ea58e4587..234513733d1 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -21,6 +21,7 @@ from cudf._lib.cpp.interop cimport ( from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def from_dlpack(dlpack_capsule): @@ -40,7 +41,7 @@ def from_dlpack(dlpack_capsule): cpp_from_dlpack(dlpack_tensor) ) - res = Table.from_unique_ptr( + res = data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -164,10 +165,8 @@ def from_arrow( with nogil: c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) - out_table = Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=column_names, index_names=index_names ) - - return out_table diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 82ad9d67f78..66d93ffc531 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -3,6 +3,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from cudf._lib.column cimport Column from cudf._lib.cpp.io.types cimport ( column_name_info, data_sink, @@ -17,3 +18,7 @@ cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* cdef update_struct_field_names( Table table, vector[column_name_info]& schema_info) +cdef Column update_column_struct_field_names( + Column col, + column_name_info& info +) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 72ab64f6249..d26cf19deaf 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -127,12 +127,12 @@ cdef update_struct_field_names( vector[column_name_info]& schema_info ): for i, (name, col) in enumerate(table._data.items()): - table._data[name] = _update_column_struct_field_names( + table._data[name] = update_column_struct_field_names( col, schema_info[i] ) -cdef Column _update_column_struct_field_names( +cdef Column update_column_struct_field_names( Column col, column_name_info& info ): @@ -149,7 +149,7 @@ cdef Column _update_column_struct_field_names( if col.children: children = list(col.children) for i, child in enumerate(children): - children[i] = _update_column_struct_field_names( + children[i] = update_column_struct_field_names( child, info.children[i] ) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 4a15edf8a19..db196528e97 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -22,6 +22,7 @@ from cudf._lib.cpp.io.json cimport ( from cudf._lib.cpp.types cimport size_type from cudf._lib.io.utils cimport make_source_info from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr cpdef read_json(object filepaths_or_buffers, @@ -112,5 +113,5 @@ cpdef read_json(object filepaths_or_buffers, c_out_table = move(libcudf_read_json(opts)) column_names = [x.decode() for x in c_out_table.metadata.column_names] - return 
Table.from_unique_ptr(move(c_out_table.tbl), - column_names=column_names) + return data_from_unique_ptr(move(c_out_table.tbl), + column_names=column_names) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 8ada3376fdb..59c3a4b89dc 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -43,6 +43,7 @@ from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.contains cimport contains from cudf._lib.cpp.lists.extract cimport extract_list_element +from cudf._lib.utils cimport data_from_unique_ptr def count_elements(Column col): @@ -72,7 +73,7 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): with nogil: c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=tbl._column_names, index_names=None if ignore_index else tbl._index_names diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index cc2d405c207..83f088f4419 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -11,6 +11,7 @@ from cudf._lib.cpp.merge cimport merge as cpp_merge from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def merge_sorted( @@ -102,7 +103,7 @@ def merge_sorted( ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=index_names, diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 2470c15f541..e15b569ed85 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -40,6 +40,7 @@ from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.io.utils cimport ( make_sink_info, make_source_info, + update_column_struct_field_names, update_struct_field_names, ) from cudf._lib.table cimport Table @@ -50,7 +51,7 @@ from cudf._lib.types cimport underlying_type_t_type_id import numpy as np -from cudf._lib.utils cimport get_column_names +from cudf._lib.utils cimport data_from_unique_ptr, get_column_names from cudf._lib.utils import _index_level_name, generate_pandas_metadata @@ -111,11 +112,16 @@ cpdef read_orc(object filepaths_or_buffers, names = [name.decode() for name in c_result.metadata.column_names] - tbl = Table.from_unique_ptr(move(c_result.tbl), names) + data, index = data_from_unique_ptr(move(c_result.tbl), names) - update_struct_field_names(tbl, c_result.metadata.schema_info) + data = { + name: update_column_struct_field_names( + col, c_result.metadata.schema_info[i] + ) + for i, (name, col) in enumerate(data.items()) + } - return tbl + return data, index cdef compression_type _get_comp_type(object compression): diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 52f3aada00b..471aa3107d9 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -26,7 +26,7 @@ from cudf.utils.dtypes import ( np_to_pa_dtype, ) -from cudf._lib.utils cimport get_column_names +from cudf._lib.utils cimport data_from_unique_ptr, get_column_names from cudf._lib.utils import _index_level_name, generate_pandas_metadata @@ -178,12 +178,10 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for c in meta['columns']: if c['field_name'] == idx_col: index_col_names[idx_col] = c['name'] - df = cudf.DataFrame._from_table( 
- Table.from_unique_ptr( - move(c_out_table.tbl), - column_names=column_names - ) - ) + df = cudf.DataFrame._from_data(*data_from_unique_ptr( + move(c_out_table.tbl), + column_names=column_names + )) update_struct_field_names(df, c_out_table.metadata.schema_info) diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index 865138bec84..90aa6bb0344 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -16,6 +16,7 @@ from cudf._lib.table cimport Table from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.utils cimport data_from_unique_ptr def partition(Table source_table, Column partition_map, @@ -44,7 +45,7 @@ def partition(Table source_table, Column partition_map, ) return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=source_table._column_names, index_names=source_table._index_names if( diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 45a4ff7c92c..76bf587237c 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -32,6 +32,7 @@ from cudf._lib.cpp.types cimport ( order_info, sorted, ) +from cudf._lib.utils cimport data_from_unique_ptr def quantile( @@ -118,7 +119,7 @@ def quantiles(Table source_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names ) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index fbed410de86..acca2694d10 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -13,6 +13,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def interleave_columns(Table source_table): @@ -35,7 +36,7 @@ def tile(Table source_table, size_type count): with nogil: c_result = move(cpp_tile(c_view, c_count)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=source_table._index_names diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 8b0a34b134e..cf1d577bd8f 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -30,7 +30,7 @@ from cudf.core.dtypes import ListDtype, StructDtype from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.table cimport Table +from cudf._lib.table cimport Table, make_table_view from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf._lib.interop import from_arrow, to_arrow @@ -58,6 +58,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) +from cudf._lib.utils cimport data_from_table_view from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype, is_struct_dtype @@ -338,8 +339,8 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, names=columns ) - cdef Table table = from_arrow(pyarrow_table, column_names=columns) - cdef table_view struct_view = table.view() + data, _ = from_arrow(pyarrow_table, column_names=columns) + cdef table_view struct_view = make_table_view(data.values()) s.reset( new struct_scalar(struct_view, valid) @@ -352,11 
+353,14 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s): cdef table_view struct_table_view = (s.get()).view() columns = [str(i) for i in range(struct_table_view.num_columns())] - cdef Table to_arrow_table = Table.from_table_view( + data, _ = data_from_table_view( struct_table_view, None, column_names=columns ) + cdef Table to_arrow_table = Table( + cudf.core.column_accessor.ColumnAccessor(data) + ) python_dict = to_arrow(to_arrow_table, columns).to_pydict() diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 1d15052e41a..a07017ef796 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -24,6 +24,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order from cudf._lib.sort cimport underlying_type_t_rank_method from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def is_sorted( @@ -276,9 +277,9 @@ def rank_columns(Table source_table, object method, str na_option, cdef unique_ptr[table] c_result c_result.reset(new table(move(c_results))) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), - column_names=source_table._column_names + column_names=source_table._column_names, + index_names=None ) - out_table._index = source_table._index - return out_table + return data, source_table._index diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index a7326efcc03..f1eca64bb87 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -25,6 +25,7 @@ from cudf._lib.cpp.types cimport ( size_type, ) from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def drop_nulls(Table source_table, how="any", keys=None, thresh=None): @@ -78,7 +79,7 @@ def drop_nulls(Table source_table, how="any", keys=None, thresh=None): ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -115,7 +116,7 @@ def apply_boolean_mask(Table source_table, Column boolean_mask): ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -192,7 +193,7 @@ def drop_duplicates(Table source_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 58558fade24..74d8e548ad1 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -11,6 +11,7 @@ from cudf._lib.cpp.strings.extract cimport extract as cpp_extract from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def extract(Column source_strings, object pattern): @@ -31,7 +32,7 @@ def extract(Column source_strings, object pattern): pattern_string )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index cc5730c467d..702b0fc8053 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -12,6 +12,7 @@ from 
cudf._lib.cpp.strings.findall cimport findall_re as cpp_findall_re from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def findall(Column source_strings, pattern): @@ -30,7 +31,7 @@ def findall(Column source_strings, pattern): pattern_string )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index 590de5bf526..0e62ab69298 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -17,6 +17,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def partition(Column source_strings, @@ -40,7 +41,7 @@ def partition(Column source_strings, scalar_str[0] )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -67,7 +68,7 @@ def rpartition(Column source_strings, scalar_str[0] )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index 599f7602b51..a2ce237ced6 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -19,6 +19,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def split(Column source_strings, @@ -45,7 +46,7 @@ def split(Column source_strings, maxsplit )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -104,7 +105,7 @@ def rsplit(Column source_strings, maxsplit )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/table.pxd b/python/cudf/cudf/_lib/table.pxd index e1bffbc3864..0730199c8a9 100644 --- a/python/cudf/cudf/_lib/table.pxd +++ b/python/cudf/cudf/_lib/table.pxd @@ -16,21 +16,6 @@ cdef class Table: cdef table_view index_view(self) except * cdef mutable_table_view mutable_index_view(self) except * - @staticmethod - cdef Table from_unique_ptr( - unique_ptr[table] c_tbl, - column_names, - index_names=* - ) - - @staticmethod - cdef Table from_table_view( - table_view, - owner, - column_names, - index_names=* - ) - cdef table_view make_table_view(columns) except * cdef mutable_table_view make_mutable_table_view(columns) except * cdef columns_from_ptr(unique_ptr[table] c_tbl) diff --git a/python/cudf/cudf/_lib/table.pyi b/python/cudf/cudf/_lib/table.pyi index 2a5dfb2a4dd..ccf0eab99dc 100644 --- a/python/cudf/cudf/_lib/table.pyi +++ b/python/cudf/cudf/_lib/table.pyi @@ -6,7 +6,7 @@ import cudf class Table(object): _data: cudf.core.column_accessor.ColumnAccessor - _index: Optional[cudf.core.index.Index] + _index: Optional[cudf.core.index.BaseIndex] def __init__(self, data: object = None, index: object = None) -> None: 
... diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index 07d7a0fcf02..09cb05a076d 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -71,106 +71,6 @@ cdef class Table: """ return self._data.columns - @staticmethod - cdef Table from_unique_ptr( - unique_ptr[table] c_tbl, - object column_names, - object index_names=None - ): - """ - Construct a Table from a unique_ptr to a cudf::table. - - Parameters - ---------- - c_tbl : unique_ptr[cudf::table] - index_names : iterable - column_names : iterable - """ - cdef vector[unique_ptr[column]] columns - columns = move(c_tbl.get()[0].release()) - - cdef vector[unique_ptr[column]].iterator it = columns.begin() - - # First construct the index, if any - cdef int i - - index = None - if index_names is not None: - index_data = ColumnAccessor._create_unsafe( - { - name: Column.from_unique_ptr( - move(dereference(it + i)) - ) - for i, name in enumerate(index_names) - } - ) - index = Table(data=index_data) - - # Construct the data dict - cdef int n_index_columns = len(index_names) if index_names else 0 - data = ColumnAccessor._create_unsafe( - { - name: Column.from_unique_ptr( - move(dereference(it + i + n_index_columns)) - ) - for i, name in enumerate(column_names) - } - ) - - return Table(data=data, index=index) - - @staticmethod - cdef Table from_table_view( - table_view tv, - object owner, - object column_names, - object index_names=None - ): - """ - Given a ``cudf::table_view``, constructs a ``cudf.Table`` from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. If ``owner`` is a ``cudf.Table``, we reach inside of it and - reach inside of each ``cudf.Column`` to make the owner of each newly - created ``Buffer`` underneath the ``cudf.Column`` objects of the - created ``cudf.Table`` the respective ``Buffer`` from the relevant - ``cudf.Column`` of the ``owner`` ``cudf.Table``. 
- """ - cdef size_type column_idx = 0 - table_owner = isinstance(owner, Table) - - # First construct the index, if any - index = None - if index_names is not None: - index_columns = [] - for _ in index_names: - column_owner = owner - if table_owner: - column_owner = owner._index._columns[column_idx] - index_columns.append( - Column.from_column_view( - tv.column(column_idx), - column_owner - ) - ) - column_idx += 1 - index = Table(dict(zip(index_names, index_columns))) - - # Construct the data dict - cdef size_type source_column_idx = 0 - data_columns = [] - for _ in column_names: - column_owner = owner - if table_owner: - column_owner = owner._columns[source_column_idx] - data_columns.append( - Column.from_column_view(tv.column(column_idx), column_owner) - ) - column_idx += 1 - source_column_idx += 1 - data = dict(zip(column_names, data_columns)) - - return Table(data=data, index=index) - cdef table_view view(self) except *: """ Return a cudf::table_view of all columns (including index columns) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 9a0c06a6fa1..63abdb8314c 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -27,6 +27,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.utils cimport data_from_unique_ptr from numba.np import numpy_support @@ -151,7 +152,7 @@ def table_encode(Table input): c_result = move(libcudf_transform.encode(c_input)) return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=input._column_names, ), diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 7e4423419c9..0f8f0b6ea14 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -14,6 +14,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.transpose cimport transpose as cpp_transpose from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_table_view def transpose(Table source): @@ -51,14 +52,14 @@ def transpose(Table source): c_result = move(cpp_transpose(c_input)) result_owner = Column.from_unique_ptr(move(c_result.first)) - result = Table.from_table_view( + data, _ = data_from_table_view( c_result.second, owner=result_owner, column_names=range(source._num_rows) ) if cats is not None: - result = Table(index=result._index, data=[ + data= [ (name, cudf.core.column.column.build_categorical_column( codes=cudf.core.column.column.as_column( col.base_data, dtype=col.dtype), @@ -67,7 +68,7 @@ def transpose(Table source): categories=cats, offset=col.offset, )) - for name, col in result._data.items() - ]) + for name, col in data.items() + ] - return result + return data diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index e8ac858d8b2..f9b225a0b89 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -1,10 +1,11 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column_view -from cudf._lib.cpp.table.table cimport table_view +from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.table cimport Table @@ -12,3 +13,7 @@ cdef vector[column_view] make_column_views(object columns) except* cdef vector[table_view] make_table_views(object tables) except* cdef vector[table_view] make_table_data_views(object tables) except* cdef vector[string] get_column_names(Table table, object index) except* +cdef data_from_unique_ptr( + unique_ptr[table] c_tbl, column_names, index_names=*) +cdef data_from_table_view( + table_view tv, object owner, object column_names, object index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index d42e15df9f3..81b62159b59 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -4,13 +4,17 @@ import pyarrow as pa import cudf +from cython.operator cimport dereference from libc.stdint cimport uint8_t +from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column_view +from cudf._lib.cpp.column.column cimport column, column_view from cudf._lib.cpp.table.table cimport table_view +from cudf._lib.cpp.types cimport size_type from cudf._lib.table cimport Table try: @@ -192,3 +196,124 @@ def _index_level_name(index_name, level, column_names): return index_name else: return f"__index_level_{level}__" + + +cdef data_from_unique_ptr( + unique_ptr[table] c_tbl, column_names, index_names=None +): + """Convert a libcudf table into a dict with an index. + + This method is intended to provide the bridge between the columns returned + from calls to libcudf APIs and the cuDF Python Table objects, which require + named columns and a separate index. + + Since cuDF Python has an independent representation of a table as a + collection of columns, this function simply returns a dict of columns + suitable for conversion into data to be passed to cuDF constructors. + This method returns the columns of the table in the order they are + stored in libcudf, but calling code is responsible for partitioning and + labeling them as needed. + + Parameters + ---------- + c_tbl : unique_ptr[cudf::table] + The libcudf table whose columns will be extracted + column_names : iterable + The keys associated with the columns in the output data. + index_names : iterable, optional + If provided, an iterable of strings that will be used to label the + corresponding first set of columns into a (Multi)Index. If this + argument is omitted, all columns are assumed to be part of the output + table and no index is constructed. + + + Returns + ------- + tuple(Dict[str, Column], Optional[Index]) + A dict of the columns in the output table. + """ + cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) + cdef vector[unique_ptr[column]].iterator it = c_columns.begin() + + cdef int i + + columns = [Column.from_unique_ptr(move(dereference(it+i))) + for i in range(c_columns.size())] + + # First construct the index, if any + index = ( + # TODO: For performance, the _from_data methods of Frame types assume + # that the passed index object is already an Index because cudf.Index + # and cudf.as_index are expensive. 
As a result, this function is + # currently somewhat inconsistent in returning a dict of columns for + # the data while actually constructing the Index object here (instead + # of just returning a dict for that as well). As we clean up the + # Frame factories we may want to look for a less dissonant approach + # that does not impose performance penalties. The same applies to + # data_from_table_view below. + cudf.Index._from_data( + { + name: columns[i] + for i, name in enumerate(index_names) + } + ) + if index_names is not None + else None + ) + n_index_columns = len(index_names) if index_names is not None else 0 + data = { + name: columns[i + n_index_columns] + for i, name in enumerate(column_names) + } + return data, index + + +cdef data_from_table_view( + table_view tv, + object owner, + object column_names, + object index_names=None +): + """ + Given a ``cudf::table_view``, constructs a ``cudf.Table`` from it, + along with referencing an ``owner`` Python object that owns the memory + lifetime. If ``owner`` is a ``cudf.Table``, we reach inside of it and + reach inside of each ``cudf.Column`` to make the owner of each newly + created ``Buffer`` underneath the ``cudf.Column`` objects of the + created ``cudf.Table`` the respective ``Buffer`` from the relevant + ``cudf.Column`` of the ``owner`` ``cudf.Table``. + """ + cdef size_type column_idx = 0 + table_owner = isinstance(owner, Table) + + # First construct the index, if any + index = None + if index_names is not None: + index_columns = [] + for _ in index_names: + column_owner = owner + if table_owner: + column_owner = owner._index._columns[column_idx] + index_columns.append( + Column.from_column_view( + tv.column(column_idx), + column_owner + ) + ) + column_idx += 1 + index = cudf.Index._from_data(dict(zip(index_names, index_columns))) + + # Construct the data dict + cdef size_type source_column_idx = 0 + data_columns = [] + for _ in column_names: + column_owner = owner + if table_owner: + column_owner = owner._columns[source_column_idx] + data_columns.append( + Column.from_column_view(tv.column(column_idx), column_owner) + ) + column_idx += 1 + source_column_idx += 1 + + return dict(zip(column_names, data_columns)), index diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d449d52927e..56d89fc4d14 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -81,7 +81,7 @@ def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame """ - return cudf.core.frame.Frame({None: self.copy(deep=False)}) + return cudf.core.frame.SingleColumnFrame({None: self.copy(deep=False)}) @property def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -261,10 +261,10 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: codes = libcudf.interop.from_arrow( indices_table, indices_table.column_names - )._data["None"] + )[0]["None"] categories = libcudf.interop.from_arrow( dictionaries_table, dictionaries_table.column_names - )._data["None"] + )[0]["None"] return build_categorical_column( categories=categories, @@ -282,9 +282,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: elif isinstance(array.type, pa.Decimal128Type): return cudf.core.column.Decimal64Column.from_arrow(array) - result = libcudf.interop.from_arrow(data, data.column_names)._data[ - "None" - ] + result = libcudf.interop.from_arrow(data, data.column_names)[0]["None"] result = result._with_type_metadata( cudf_dtype_from_pa_type(array.type) @@ -1230,57 +1228,6 @@ def 
_process_for_reduction( ) return result_col - def scatter_to_table( - self, - row_indices: ColumnBase, - column_indices: ColumnBase, - names: List[Any], - nrows: int = None, - ncols: int = None, - ) -> "cudf.core.frame.Frame": - """ - Scatters values from the column into a table. - - Parameters - ---------- - row_indices - A column of the same size as `self` specifying the - row index to scatter each value to - column_indices - A column of the same size as `self` specifying the - column index to scatter each value to - names - The column names of the resulting table - - Returns - ------- - """ - if nrows is None: - nrows = 0 - if len(row_indices) > 0: - nrows = int(row_indices.max() + 1) - - if ncols is None: - ncols = 0 - if len(column_indices) > 0: - ncols = int(column_indices.max() + 1) - - if nrows * ncols == 0: - return cudf.core.frame.Frame({}) - - scatter_map = (column_indices * np.int32(nrows)) + row_indices - target = cudf.core.frame.Frame( - {None: column_empty_like(self, masked=True, newsize=nrows * ncols)} - ) - target._data[None][scatter_map] = self - result_frames = target._split(range(nrows, nrows * ncols, nrows)) - return cudf.core.frame.Frame( - { - name: next(iter(f._columns)) - for name, f in zip(names, result_frames) - } - ) - def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: """ Copies type metadata from self onto other, returning a new column. diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 27dc4fe0c0d..a587c58a49d 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -63,8 +63,8 @@ def _return_or_inplace( """ if inplace: self._parent._mimic_inplace( - self._parent.__class__._from_table( - cudf._lib.table.Table({self._parent.name: new_col}) + self._parent.__class__._from_data( + {self._parent.name: new_col} ), inplace=True, ) @@ -78,8 +78,8 @@ def _return_or_inplace( table = new_col if isinstance(self._parent, cudf.BaseIndex): - idx = self._parent._constructor_expanddim._from_table( - table=table + idx = self._parent._constructor_expanddim._from_data( + table._data, table._index ) idx.names = None return idx diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 92c57477465..fe231e1def9 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -607,11 +607,12 @@ def extract( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - out = libstrings.extract(self._column, pat) - if out._num_columns == 1 and expand is False: - return self._return_or_inplace(out._columns[0], expand=expand) + data, index = libstrings.extract(self._column, pat) + if len(data) == 1 and expand is False: + data = next(iter(data.values())) else: - return self._return_or_inplace(out, expand=expand) + data = cudf.core.frame.Frame(data, index) + return self._return_or_inplace(data, expand=expand) def contains( self, @@ -2274,12 +2275,13 @@ def split( if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - result_table = libstrings.split( + data, index = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) - if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._column): - result_table = cudf.core.frame.Frame({}) + if len(data) == 1 and data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) + else: + result_table = 
cudf.core.frame.Frame(data, index) else: result_table = libstrings.split_record( self._column, cudf.Scalar(pat, "str"), n @@ -2429,12 +2431,13 @@ def rsplit( if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - result_table = libstrings.rsplit( - self._column, cudf.Scalar(pat), n + data, index = libstrings.rsplit( + self._column, cudf.Scalar(pat, "str"), n ) - if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._column): - result_table = cudf.core.frame.Frame({}) + if len(data) == 1 and data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) + else: + result_table = cudf.core.frame.Frame(data, index) else: result_table = libstrings.rsplit_record( self._column, cudf.Scalar(pat), n @@ -2519,7 +2522,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep)), + cudf.core.frame.Frame( + *libstrings.partition(self._column, cudf.Scalar(sep)) + ), expand=expand, ) @@ -2584,7 +2589,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep)), + cudf.core.frame.Frame( + *libstrings.rpartition(self._column, cudf.Scalar(sep)) + ), expand=expand, ) @@ -3309,8 +3316,9 @@ def findall( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") + data, index = libstrings.findall(self._column, pat) return self._return_or_inplace( - libstrings.findall(self._column, pat), expand=expand + cudf.core.frame.Frame(data, index), expand=expand ) def isempty(self) -> SeriesOrIndex: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e9e3cd71ddb..9bd62e3bf2c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, Optional, TypeVar +from typing import Any, Mapping, Optional, TypeVar import cupy import numpy as np @@ -31,7 +31,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import DataFrameGroupBy -from cudf.core.index import BaseIndex, Index, RangeIndex, as_index +from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer from cudf.core.series import Series from cudf.core.window import Rolling @@ -455,30 +455,16 @@ def _init_from_dict_like(self, data, index=None, columns=None): if columns is not None: self.columns = columns - @classmethod - def _from_table(cls, table, index=None): - if index is None: - if table._index is not None: - index = Index._from_table(table._index) - else: - index = RangeIndex(table._num_rows) - out = cls.__new__(cls) - out._data = table._data - out._index = index - return out - @classmethod def _from_data( cls, - data: ColumnAccessor, - index: Optional[Index] = None, + data: Mapping, + index: Optional[BaseIndex] = None, columns: Any = None, ) -> DataFrame: - out = cls.__new__(cls) - out._data = data + out = super()._from_data(data, index) if index is None: - index = cudf.Index(range(data.nrows)) - out._index = index + out.index = RangeIndex(out._data.nrows) if columns is not None: out.columns = columns return out @@ -864,8 +850,8 @@ def 
_slice(self: T, arg: slice) -> T: ) ) else: - result = self._from_table( - libcudf.copying.table_slice( + result = self._from_data( + *libcudf.copying.table_slice( self, [start, stop], keep_index )[0] ) @@ -4214,10 +4200,12 @@ def transpose(self): index = self.columns.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) - # Cython renames the columns to the range [0...ncols] - result = self.__class__._from_table(libcudf.transpose.transpose(self)) # Set the old column names as the new index - result._index = as_index(index) + result = self.__class__._from_data( + # Cython renames the columns to the range [0...ncols] + libcudf.transpose.transpose(self), + as_index(index), + ) # Set the old index as the new column names result.columns = columns return result diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6ecb0bcc139..14b8ebe801f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import functools import warnings from collections import abc -from typing import Any, Dict, Optional, Tuple, TypeVar, Union +from typing import Any, Dict, Mapping, Optional, Tuple, TypeVar, Union import cupy import numpy as np @@ -64,8 +64,12 @@ def __init_subclass__(cls): cls._accessors = set() @classmethod - def _from_table(cls, table: Frame): - return cls(table._data, index=table._index) + def _from_data( + cls, data: Mapping, index: Optional[cudf.core.index.BaseIndex] = None, + ): + obj = cls.__new__(cls) + libcudf.table.Table.__init__(obj, data, index) + return obj def _mimic_inplace( self: T, result: Frame, inplace: bool = False @@ -476,8 +480,8 @@ def _concat( ) # Concatenate the Tables - out = cls._from_table( - libcudf.concat.concat_tables(tables, ignore_index=ignore_index) + out = cls._from_data( + *libcudf.concat.concat_tables(tables, ignore_index) ) # If ignore_index is True, all input frames are empty, and at @@ -612,10 +616,11 @@ def _explode(self, explode_column: Any, ignore_index: bool): if not ignore_index and self._index is not None: explode_column_num += self._index.nlevels - res_tbl = libcudf.lists.explode_outer( - self, explode_column_num, ignore_index + res = self.__class__._from_data( # type: ignore + *libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) ) - res = self.__class__._from_table(res_tbl) res._data.multiindex = self._data.multiindex res._data._level_names = self._data._level_names @@ -644,14 +649,15 @@ def _get_columns_by_index(self, indices): def _gather(self, gather_map, keep_index=True, nullify=False): if not is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") - result = self.__class__._from_table( - libcudf.copying.gather( + result = self.__class__._from_data( + *libcudf.copying.gather( self, as_column(gather_map), keep_index=keep_index, nullify=nullify, ) ) + result._copy_type_metadata(self, include_index=keep_index) if keep_index and self._index is not None: result._index.names = self._index.names @@ -663,10 +669,10 @@ def _hash(self, initial_hash_values=None): def _hash_partition( self, columns_to_hash, num_partitions, keep_index=True ): - output, offsets = libcudf.hash.hash_partition( + output_data, output_index, offsets = libcudf.hash.hash_partition( self, columns_to_hash, num_partitions, keep_index ) - output = self.__class__._from_table(output) + output = self.__class__._from_data(output_data, output_index) output._copy_type_metadata(self, include_index=keep_index) return output, offsets 
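Several of the frame.py rewrites below repeat the two-step shape visible in `_hash_partition` above: rebuild the result through `_from_data`, then restore cuDF-specific type metadata that the raw libcudf columns do not carry. A compressed sketch of that recurring pattern, with `libcudf.some_module.some_op` as a placeholder for whichever binding a given method wraps:

    def _some_op(self, *args, keep_index=True):
        # The binding now returns (data, index) rather than a Table, so the
        # tuple is splatted straight into the class constructor.
        result = self.__class__._from_data(
            *libcudf.some_module.some_op(self, *args)
        )
        # Reapply dtype information (e.g. categorical categories) that is lost
        # on the round trip through libcudf.
        result._copy_type_metadata(self, include_index=keep_index)
        return result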
@@ -684,14 +690,16 @@ def _as_column(self): return self._data[None].copy(deep=False) def _scatter(self, key, value): - result = self._from_table(libcudf.copying.scatter(value, key, self)) + result = self.__class__._from_data( + *libcudf.copying.scatter(value, key, self) + ) result._copy_type_metadata(self) return result def _empty_like(self, keep_index=True): - result = self._from_table( - libcudf.copying.table_empty_like(self, keep_index) + result = self.__class__._from_data( + *libcudf.copying.table_empty_like(self, keep_index) ) result._copy_type_metadata(self, include_index=keep_index) @@ -944,10 +952,10 @@ def mask(self, cond, other=None, inplace=False): def _partition(self, scatter_map, npartitions, keep_index=True): - output_table, output_offsets = libcudf.partitioning.partition( + data, index, output_offsets = libcudf.partitioning.partition( self, scatter_map, npartitions, keep_index ) - partitioned = self.__class__._from_table(output_table) + partitioned = self.__class__._from_data(data, index) # due to the split limitation mentioned # here: https://github.com/rapidsai/cudf/issues/4607 @@ -1332,7 +1340,7 @@ def fillna( ) or method is not None if should_fill: copy_data[name] = copy_data[name].fillna(value[name], method) - result = self._from_table(Frame(copy_data, self._index)) + result = self._from_data(copy_data, self._index) return self._mimic_inplace(result, inplace=inplace) @@ -1381,8 +1389,8 @@ def _drop_na_rows( else: frame._data[name] = col - result = frame.__class__._from_table( - libcudf.stream_compaction.drop_nulls( + result = self.__class__._from_data( + *libcudf.stream_compaction.drop_nulls( frame, how=how, keys=subset, thresh=thresh ) ) @@ -1427,8 +1435,8 @@ def _apply_boolean_mask(self, boolean_mask): """ boolean_mask = as_column(boolean_mask) - result = self.__class__._from_table( - libcudf.stream_compaction.apply_boolean_mask( + result = self.__class__._from_data( + *libcudf.stream_compaction.apply_boolean_mask( self, as_column(boolean_mask) ) ) @@ -1453,8 +1461,8 @@ def _quantiles( libcudf.types.NullOrder[key] for key in null_precedence ] - result = self.__class__._from_table( - libcudf.quantiles.quantiles( + result = self.__class__._from_data( + *libcudf.quantiles.quantiles( self, q, interpolation, @@ -1548,11 +1556,11 @@ def rank( if source.empty: return source.astype("float64") - out_rank_table = libcudf.sort.rank_columns( + data, index = libcudf.sort.rank_columns( source, method_enum, na_option, ascending, pct ) - return self._from_table(out_rank_table).astype(np.float64) + return self._from_data(data, index).astype(np.float64) def repeat(self, repeats, axis=None): """Repeats elements consecutively. 
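Several of the later frame.py hunks (`_fill`, `_shift`, `isnull`, `notnull`, `_unaryop`) drop the throwaway `Frame(data, self._index)` wrapper and pass the `zip(self._column_names, data_columns)` pairs directly to `_from_data`, which forwards them to `libcudf.table.Table.__init__` as defined in the `_from_data` classmethod above. A compressed illustration, with `some_unary_op` standing in for whatever per-column operation the method applies:

    def _unary_like_op(self):
        data_columns = (col.some_unary_op() for col in self._columns)
        # No intermediate Frame is built; the (name, column) pairs go straight
        # into the new instance of the calling class.
        return self.__class__._from_data(
            zip(self._column_names, data_columns), self._index
        )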
@@ -1639,24 +1647,24 @@ def _repeat(self, count):
         if not is_scalar(count):
             count = as_column(count)
 
-        result = self.__class__._from_table(
-            libcudf.filling.repeat(self, count)
+        result = self.__class__._from_data(
+            *libcudf.filling.repeat(self, count)
         )
         result._copy_type_metadata(self)
         return result
 
     def _reverse(self):
-        result = self.__class__._from_table(libcudf.copying.reverse(self))
-        return result
+        return self.__class__._from_data(*libcudf.copying.reverse(self))
 
     def _fill(self, fill_values, begin, end, inplace):
         col_and_fill = zip(self._columns, fill_values)
 
         if not inplace:
             data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill)
-            data = zip(self._column_names, data_columns)
-            return self.__class__._from_table(Frame(data, self._index))
+            return self.__class__._from_data(
+                zip(self._column_names, data_columns), self._index
+            )
 
         for (c, v) in col_and_fill:
             c.fill(v, begin, end, inplace=True)
@@ -1671,8 +1679,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
 
     def _shift(self, offset, fill_value=None):
         data_columns = (col.shift(offset, fill_value) for col in self._columns)
-        data = zip(self._column_names, data_columns)
-        return self.__class__._from_table(Frame(data, self._index))
+        return self.__class__._from_data(
+            zip(self._column_names, data_columns), self._index
+        )
 
     def __array__(self, dtype=None):
         raise TypeError(
@@ -1792,13 +1801,11 @@ def round(self, decimals=0, how="half_even"):
                 "decimals must be an integer, a dict-like or a Series"
             )
 
-        return self.__class__._from_table(
-            Frame(
-                data=cudf.core.column_accessor.ColumnAccessor(
-                    cols,
-                    multiindex=self._data.multiindex,
-                    level_names=self._data.level_names,
-                )
+        return self.__class__._from_data(
+            data=cudf.core.column_accessor.ColumnAccessor(
+                cols,
+                multiindex=self._data.multiindex,
+                level_names=self._data.level_names,
             ),
             index=self._index,
         )
@@ -1923,8 +1930,8 @@ def sample(
         else:
             seed = np.int64(random_state)
 
-        result = self._from_table(
-            libcudf.copying.sample(
+        result = self.__class__._from_data(
+            *libcudf.copying.sample(
                 self,
                 n=n,
                 replace=replace,
@@ -2064,12 +2071,12 @@ def from_arrow(cls, data):
             )
 
         # Handle dict arrays
-        cudf_category_frame = libcudf.table.Table()
+        cudf_category_frame = {}
         if len(dict_indices):
 
            dict_indices_table = pa.table(dict_indices)
            data = data.drop(dict_indices_table.column_names)
 
-            cudf_indices_frame = libcudf.interop.from_arrow(
+            cudf_indices_frame, _ = libcudf.interop.from_arrow(
                 dict_indices_table, dict_indices_table.column_names
             )
             # as dictionary size can vary, it can't be a single table
@@ -2078,9 +2085,8 @@
                 for name in dict_dictionaries.keys()
             }
 
-            for name in cudf_indices_frame._data.names:
-                codes = cudf_indices_frame._data[name]
-                cudf_category_frame._data[name] = build_categorical_column(
+            for name, codes in cudf_indices_frame.items():
+                cudf_category_frame[name] = build_categorical_column(
                     cudf_dictionaries_columns[name],
                     codes,
                     mask=codes.base_mask,
@@ -2090,30 +2096,20 @@
 
         # Handle non-dict arrays
         cudf_non_category_frame = (
-            libcudf.table.Table()
+            {}
             if data.num_columns == 0
-            else libcudf.interop.from_arrow(data, data.column_names)
+            else libcudf.interop.from_arrow(data, data.column_names)[0]
         )
 
-        if (
-            cudf_non_category_frame._num_columns > 0
-            and cudf_category_frame._num_columns > 0
-        ):
-            result = cudf_non_category_frame
-            for name in cudf_category_frame._data.names:
-                result._data[name] = cudf_category_frame._data[name]
-        elif cudf_non_category_frame._num_columns > 0:
-            result = cudf_non_category_frame
-        else:
-            result = cudf_category_frame
+        result = {**cudf_non_category_frame, **cudf_category_frame}
 
         # There are some special cases that need to be handled
         # based on metadata.
         if pandas_dtypes:
-            for name in result._data.names:
+            for name in result:
                 dtype = None
                 if (
-                    len(result._data[name]) == 0
+                    len(result[name]) == 0
                     and pandas_dtypes[name] == "categorical"
                 ):
                     # When pandas_dtype is a categorical column and the size
@@ -2139,18 +2135,14 @@ def from_arrow(cls, data):
                     # struct fields, hence renaming the struct fields is
                     # necessary by extracting the field names from arrow
                     # struct types.
-                    result._data[name] = result._data[name]._rename_fields(
+                    result[name] = result[name]._rename_fields(
                         [field.name for field in data[name].type]
                     )
 
                 if dtype is not None:
-                    result._data[name] = result._data[name].astype(dtype)
+                    result[name] = result[name].astype(dtype)
 
-        result = libcudf.table.Table(
-            result._data.select_by_label(column_names)
-        )
-
-        return cls._from_table(result)
+        return cls._from_data({name: result[name] for name in column_names})
 
     @annotate("TO_ARROW", color="orange", domain="cudf_python")
     def to_arrow(self):
@@ -2209,8 +2201,8 @@ def drop_duplicates(
         if len(subset_cols) == 0:
             return self.copy(deep=True)
 
-        result = self._from_table(
-            libcudf.stream_compaction.drop_duplicates(
+        result = self.__class__._from_data(
+            *libcudf.stream_compaction.drop_duplicates(
                 self,
                 keys=subset,
                 keep=keep,
@@ -2256,7 +2248,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame:
         else:
             copy_data = self._data.copy(deep=True)
 
-        result = self._from_table(Frame(copy_data, self._index))
+        result = self._from_data(copy_data, self._index)
 
         return result
@@ -2278,15 +2270,17 @@ def _copy_type_metadata(
         if include_index:
             if self._index is not None and other._index is not None:
                 self._index._copy_type_metadata(other._index)
-                # When other._index is a CategoricalIndex, there is
+                # When other._index is a CategoricalIndex, the current index
+                # will be a NumericalIndex with an underlying CategoricalColumn
+                # (the above _copy_type_metadata call will have converted the
+                # column). Calling cudf.Index on that column generates the
+                # appropriate index.
                 if isinstance(
                     other._index, cudf.core.index.CategoricalIndex
                 ) and not isinstance(
                     self._index, cudf.core.index.CategoricalIndex
                 ):
-                    self._index = cudf.core.index.Index._from_table(
-                        self._index
-                    )
+                    self._index = cudf.Index(self._index._column)
 
         return self
@@ -2376,8 +2370,9 @@ def isnull(self):
         GenericIndex([False, False, True, True, False, False], dtype='bool')
         """
         data_columns = (col.isnull() for col in self._columns)
-        data = zip(self._column_names, data_columns)
-        return self.__class__._from_table(Frame(data, self._index))
+        return self.__class__._from_data(
+            zip(self._column_names, data_columns), self._index
+        )
 
     # Alias for isnull
     isna = isnull
@@ -2456,8 +2451,9 @@ def notnull(self):
         GenericIndex([True, True, False, False, True, True], dtype='bool')
         """
         data_columns = (col.notnull() for col in self._columns)
-        data = zip(self._column_names, data_columns)
-        return self.__class__._from_table(Frame(data, self._index))
+        return self.__class__._from_data(
+            zip(self._column_names, data_columns), self._index
+        )
 
     # Alias for notnull
     notna = notnull
@@ -2526,7 +2522,7 @@ def tile(self, count):
         -------
         The table containing the tiled "rows".
         """
-        result = self.__class__._from_table(libcudf.reshape.tile(self, count))
+        result = self.__class__._from_data(*libcudf.reshape.tile(self, count))
         result._copy_type_metadata(self)
         return result
@@ -3264,20 +3260,16 @@ def _is_sorted(self, ascending=None, null_position=None):
         )
 
     def _split(self, splits, keep_index=True):
-        result = libcudf.copying.table_split(
+        results = libcudf.copying.table_split(
             self, splits, keep_index=keep_index
         )
-        result = [self.__class__._from_table(tbl) for tbl in result]
-        return result
+        return [self.__class__._from_data(*result) for result in results]
 
     def _encode(self):
-        keys, indices = libcudf.transform.table_encode(self)
-        keys = self.__class__._from_table(keys)
-        for col in keys._data:
-            keys._data[col] = keys._data[col]._with_type_metadata(
-                self._data[col].dtype
-            )
-
+        data, index, indices = libcudf.transform.table_encode(self)
+        for name, col in data.items():
+            data[name] = col._with_type_metadata(self._data[name].dtype)
+        keys = self.__class__._from_data(data, index)
         return keys, indices
 
     def _reindex(
@@ -3344,13 +3336,11 @@ def _reindex(
             for name in names
         }
 
-        result = self.__class__._from_table(
-            Frame(
-                data=cudf.core.column_accessor.ColumnAccessor(
-                    cols,
-                    multiindex=self._data.multiindex,
-                    level_names=self._data.level_names,
-                )
+        result = self.__class__._from_data(
+            data=cudf.core.column_accessor.ColumnAccessor(
+                cols,
+                multiindex=self._data.multiindex,
+                level_names=self._data.level_names,
             ),
             index=index,
         )
@@ -3359,8 +3349,9 @@ def _unaryop(self, op):
         data_columns = (col.unary_operator(op) for col in self._columns)
-        data = zip(self._column_names, data_columns)
-        return self.__class__._from_table(Frame(data, self._index))
+        return self.__class__._from_data(
+            zip(self._column_names, data_columns), self._index
+        )
 
     def _binaryop(
         self,
@@ -3621,6 +3612,19 @@ class SingleColumnFrame(Frame):
     this class.
     """
 
+    @classmethod
+    def _from_data(
+        cls,
+        data: Mapping,
+        index: Optional[cudf.core.index.BaseIndex] = None,
+        name: Any = None,
+    ):
+
+        out = super()._from_data(data, index)
+        if name is not None:
+            out.name = name
+        return out
+
     @property
     def name(self):
         """The name of this object."""
""" sorted_order_column = arange(0, result._data.nrows) - _, order, _ = self._groupby.groups( + _, (order, _), _ = self._groupby.groups( Table({"sorted_order_column": sorted_order_column}) ) - order = order._data["sorted_order_column"] - gather_map = order.argsort() + gather_map = order["sorted_order_column"].argsort() result = result.take(gather_map) result.index = self.obj.index return result diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 691b6ab2e29..97ee0948209 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1362,53 +1362,42 @@ def from_pandas(cls, index, nan_as_null=None): ind.name = index.name return ind - @classmethod - def _from_table(cls, table): - if not isinstance(table, RangeIndex): - if table._num_columns == 0: - raise ValueError("Cannot construct Index from any empty Table") - if table._num_columns == 1: - values = next(iter(table._data.values())) - - if isinstance(values, NumericalColumn): - try: - index_class_type = _dtype_to_index[values.dtype.type] - except KeyError: - index_class_type = GenericIndex - out = super(BaseIndex, index_class_type).__new__( - index_class_type - ) - elif isinstance(values, DatetimeColumn): - out = super(BaseIndex, DatetimeIndex).__new__( - DatetimeIndex - ) - elif isinstance(values, TimeDeltaColumn): - out = super(BaseIndex, TimedeltaIndex).__new__( - TimedeltaIndex - ) - elif isinstance(values, StringColumn): - out = super(BaseIndex, StringIndex).__new__(StringIndex) - elif isinstance(values, CategoricalColumn): - out = super(BaseIndex, CategoricalIndex).__new__( - CategoricalIndex - ) - out._data = table._data - out._index = None - return out - else: - return cudf.MultiIndex._from_table( - table, names=table._data.names - ) - else: - return as_index(table) - @property def _copy_construct_defaults(self): return {"data": self._column, "name": self.name} @classmethod def _from_data(cls, data, index=None): - return cls._from_table(SingleColumnFrame(data=data)) + if not isinstance(data, cudf.core.column_accessor.ColumnAccessor): + data = cudf.core.column_accessor.ColumnAccessor(data) + if len(data) == 0: + raise ValueError("Cannot construct Index from any empty Table") + if len(data) == 1: + values = next(iter(data.values())) + + if isinstance(values, NumericalColumn): + try: + index_class_type = _dtype_to_index[values.dtype.type] + except KeyError: + index_class_type = GenericIndex + out = super(BaseIndex, index_class_type).__new__( + index_class_type + ) + elif isinstance(values, DatetimeColumn): + out = super(BaseIndex, DatetimeIndex).__new__(DatetimeIndex) + elif isinstance(values, TimeDeltaColumn): + out = super(BaseIndex, TimedeltaIndex).__new__(TimedeltaIndex) + elif isinstance(values, StringColumn): + out = super(BaseIndex, StringIndex).__new__(StringIndex) + elif isinstance(values, CategoricalColumn): + out = super(BaseIndex, CategoricalIndex).__new__( + CategoricalIndex + ) + out._data = data + out._index = None + return out + else: + return cudf.MultiIndex._from_data(data) @property def _constructor_expanddim(self): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 51423d604c2..26a893a4676 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -6,7 +6,7 @@ import pickle import warnings from collections.abc import Sequence -from typing import Any, List, Tuple, Union +from typing import Any, List, Mapping, Tuple, Union import cupy import numpy as np @@ -18,7 +18,6 @@ from cudf._typing 
import DataFrameOrSeries from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import as_column, column -from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import SingleColumnFrame from cudf.core.index import BaseIndex, as_index from cudf.utils.utils import _maybe_indices_to_slice @@ -174,7 +173,7 @@ def __init__( source_data[name] = libcudf.copying.gather( level, codes._data.columns[0] - )._data[name] + )[0][name] self._data = source_data._data self.names = names @@ -294,17 +293,15 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) + # TODO: This type ignore is indicating a real problem, which is that + # MultiIndex should not be inheriting from SingleColumnFrame, but fixing + # that will have to wait until we reshuffle the Index hierarchy. @classmethod - def _from_data(cls, data: ColumnAccessor, index=None) -> MultiIndex: + def _from_data( # type: ignore + cls, data: Mapping, index=None + ) -> MultiIndex: return cls.from_frame(cudf.DataFrame._from_data(data)) - @classmethod - def _from_table(cls, table, names=None): - df = cudf.DataFrame(table._data) - if names is None: - names = df.columns - return MultiIndex.from_frame(df, names=names) - @property def shape(self): return (self._data.nrows, len(self._data.names)) @@ -1241,9 +1238,7 @@ def _poplevels(self, level): popped_data[n] = self._data.pop(n) # construct the popped result - popped = cudf.core.index.Index._from_table( - cudf.core.frame.Frame(popped_data) - ) + popped = cudf.Index._from_data(popped_data) popped.names = popped_names # update self diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 9d449d16401..54571ebb31d 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -781,8 +781,8 @@ def merge_sorted( if by_index and ignore_index: raise ValueError("`by_index` and `ignore_index` cannot both be True") - result = objs[0].__class__._from_table( - cudf._lib.merge.merge_sorted( + result = objs[0].__class__._from_data( + *cudf._lib.merge.merge_sorted( objs, keys=keys, by_index=by_index, @@ -822,22 +822,31 @@ def as_tuple(x): for v in df: names = [as_tuple(v) + as_tuple(name) for name in column_labels] - col = df._data[v] - result.update( - cudf.DataFrame._from_table( - col.scatter_to_table( - index_idx, - columns_idx, - names, - nrows=len(index_labels), - ncols=len(names), - ) - )._data - ) - out = cudf.DataFrame._from_data( + nrows = len(index_labels) + ncols = len(names) + num_elements = nrows * ncols + if num_elements > 0: + col = df._data[v] + scatter_map = (columns_idx * np.int32(nrows)) + index_idx + target = cudf.core.frame.Frame( + { + None: cudf.core.column.column_empty_like( + col, masked=True, newsize=nrows * ncols + ) + } + ) + target._data[None][scatter_map] = col + result_frames = target._split(range(nrows, nrows * ncols, nrows)) + result.update( + { + name: next(iter(f._columns)) + for name, f in zip(names, result_frames) + } + ) + + return cudf.DataFrame._from_data( result, index=cudf.Index(index_labels, name=index.name) ) - return out def pivot(data, index=None, columns=None, values=None): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ac1861e2cfc..107e33bc5ff 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, Optional +from typing import Any, 
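The reworked pivot path above replaces Column.scatter_to_table with an explicit scatter map: treating the eventual nrows-by-ncols result as one flat, masked column laid out column by column, a value headed for row i of output column j lands at offset j * nrows + i, and the flat column is then split back into ncols pieces of nrows rows each. A small sketch of just that arithmetic (the sample positions are made up):

    import numpy as np

    # Illustrative check of the offsets used above; positions are made up.
    nrows, ncols = 3, 2
    index_idx = np.array([0, 1, 2, 0, 2])    # target row within each column
    columns_idx = np.array([0, 0, 0, 1, 1])  # target output column
    scatter_map = (columns_idx * np.int32(nrows)) + index_idx
    print(scatter_map)  # [0 1 2 3 5]; offset 4 (row 1 of column 1) stays null
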
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index ac1861e2cfc..107e33bc5ff 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -7,7 +7,7 @@
 from collections import abc as abc
 from numbers import Number
 from shutil import get_terminal_size
-from typing import Any, Optional
+from typing import Any, Mapping, Optional
 from uuid import uuid4
 
 import cupy
@@ -266,29 +266,19 @@ def __init__(
             super().__init__({name: data})
         self._index = RangeIndex(len(data)) if index is None else index
 
-    @classmethod
-    def _from_table(cls, table, index=None):
-        name, data = next(iter(table._data.items()))
-        if index is None:
-            if table._index is not None:
-                index = Index._from_table(table._index)
-        return cls(data=data, index=index, name=name)
-
     @classmethod
     def _from_data(
         cls,
-        data: ColumnAccessor,
-        index: Optional[Index] = None,
+        data: Mapping,
+        index: Optional[BaseIndex] = None,
         name: Any = None,
     ) -> Series:
         """
         Construct the Series from a ColumnAccessor
         """
-        out = cls.__new__(cls)
-        out._data = data
-        out._index = index if index is not None else RangeIndex(data.nrows)
-        if name is not None:
-            out.name = name
+        out: Series = super()._from_data(data, index, name)
+        if index is None:
+            out._index = RangeIndex(out._data.nrows)
         return out
 
     def __contains__(self, item):
diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py
index a6713e85e76..9e38b6e896d 100644
--- a/python/cudf/cudf/io/avro.py
+++ b/python/cudf/cudf/io/avro.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2019, NVIDIA CORPORATION.
+import cudf
 from cudf import _lib as libcudf
 from cudf.utils import ioutils
 
@@ -14,8 +15,6 @@ def read_avro(
 ):
     """{docstring}"""
 
-    from cudf import DataFrame
-
     is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
         path_or_data=filepath_or_buffer, **kwargs,
     )
@@ -31,8 +30,8 @@ def read_avro(
         ValueError("URL content-encoding decompression is not supported")
 
     if engine == "cudf":
-        return DataFrame._from_table(
-            libcudf.avro.read_avro(
+        return cudf.DataFrame._from_data(
+            *libcudf.avro.read_avro(
                 filepath_or_buffer, columns, skiprows, num_rows
             )
         )
diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
index b8a76890913..9d97bee0396 100644
--- a/python/cudf/cudf/io/dlpack.py
+++ b/python/cudf/cudf/io/dlpack.py
@@ -35,12 +35,12 @@ def from_dlpack(pycapsule_obj):
     tensor is row-major, transpose it before passing it to this function.
""" - res = libdlpack.from_dlpack(pycapsule_obj) + data, _ = libdlpack.from_dlpack(pycapsule_obj) - if res._num_columns == 1: - return Series(res._data[0]) + if len(data) == 1: + return Series._from_data(data) else: - return DataFrame(data=res._data) + return DataFrame._from_data(data) @ioutils.doc_to_dlpack() diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index b605bf90ff4..8a00d9c73a0 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -53,8 +53,8 @@ def read_json( else: filepaths_or_buffers.append(tmp_source) - return cudf.DataFrame._from_table( - libjson.read_json( + return cudf.DataFrame._from_data( + *libjson.read_json( filepaths_or_buffers, dtype, lines, compression, byte_range ) ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index a99f82fde7a..8f6002bb577 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -290,8 +290,8 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - df = DataFrame._from_table( - liborc.read_orc( + return DataFrame._from_data( + *liborc.read_orc( filepaths_or_buffers, columns, stripes, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 4214ca46b3d..d358b2a3de1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5347,14 +5347,6 @@ def test_change_column_dtype_in_empty(): assert_eq(pdf, gdf) -def test_dataframe_from_table_empty_index(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - odict = df._data - tbl = cudf._lib.table.Table(odict) - - result = cudf.DataFrame._from_table(tbl) # noqa: F841 - - @pytest.mark.parametrize("dtype", ["int64", "str"]) def test_dataframe_from_dictionary_series_same_name_index(dtype): pd_idx1 = pd.Index([1, 2, 0], name="test_index").astype(dtype) diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index dab74050437..c735a71d5e1 100644 --- a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -61,7 +61,8 @@ def assert_packed_frame_equality(df): packed = pack(df) del df - unpacked = DataFrame._from_table(unpack(packed)) + tbl = unpack(packed) + unpacked = DataFrame(tbl._data, tbl._index) assert_eq(unpacked, pdf) @@ -196,15 +197,15 @@ def check_packed_pickled_equality(df): ) for b in buffers: assert isinstance(b, pickle.PickleBuffer) - loaded = DataFrame._from_table( - unpack(pickle.loads(serialbytes, buffers=buffers)) - ) + tbl = unpack(pickle.loads(serialbytes, buffers=buffers)) + loaded = DataFrame(tbl._data, tbl._index) assert_eq(loaded, df) def assert_packed_frame_picklable(df): serialbytes = pickle.dumps(pack(df)) - loaded = DataFrame._from_table(unpack(pickle.loads(serialbytes))) + tbl = unpack(pickle.loads(serialbytes)) + loaded = DataFrame(tbl._data, tbl._index) assert_eq(loaded, df) @@ -269,7 +270,8 @@ def check_packed_serialized_equality(df): def assert_packed_frame_serializable(df): packed = pack(df) header, frames = packed.serialize() - loaded = DataFrame._from_table(unpack(packed.deserialize(header, frames))) + tbl = unpack(packed.deserialize(header, frames)) + loaded = DataFrame(tbl._data, tbl._index) assert_eq(loaded, df)