diff --git a/.all-contributorsrc b/.all-contributorsrc index ed10f7894d..bb16ac769a 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -320,6 +320,15 @@ "contributions": [ "code" ] + }, + { + "login": "Saransh-cpp", + "name": "Saransh", + "avatar_url": "https://avatars.githubusercontent.com/u/74055102?v=4", + "profile": "https://saransh-cpp.github.io/", + "contributions": [ + "code" + ] } ], "contributorsPerLine": 7, diff --git a/README.md b/README.md index 146186d2c3..591302ffc5 100644 --- a/README.md +++ b/README.md @@ -256,6 +256,7 @@ Thanks especially to the gracious help of Awkward Array contributors (including
Ahmad-AlSubaie

💻
Manasvi Goyal

💻
Aryan Roy

💻 +
Saransh

💻 diff --git a/src/awkward/_v2/_connect/rdataframe/from_rdataframe.py b/src/awkward/_v2/_connect/rdataframe/from_rdataframe.py index abad41db2f..4bd8a91067 100644 --- a/src/awkward/_v2/_connect/rdataframe/from_rdataframe.py +++ b/src/awkward/_v2/_connect/rdataframe/from_rdataframe.py @@ -12,10 +12,21 @@ from awkward._v2.types.numpytype import primitive_to_dtype cpp_type_of = { - "float64": "double", + "bool": "bool", + "int8": "int8_t", + "uint8": "uint8_t", + "int16": "int16_t", + "uint16": "uint16_t", + "int32": "int32_t", + "uint32": "uint32_t", "int64": "int64_t", + "uint64": "uint64_t", + "float32": "float", + "float64": "double", + "complex64": "std::complex", "complex128": "std::complex", - "uint8": "uint8_t", + "datetime64": "std::time_t", + "timedelta64": "std::difftime", } np = ak.nplike.NumpyMetadata.instance() @@ -43,140 +54,193 @@ assert done is True -def from_rdataframe(data_frame, column): - def _wrap_as_record_array(array): - layout = array.layout if isinstance(array, ak._v2.highlevel.Array) else array - return ak._v2._util.wrap( - ak._v2.contents.RecordArray( - fields=[column], - contents=[layout], - ), - highlevel=True, - ) +def from_rdataframe(data_frame, columns): + def form_dtype(form): + if isinstance(form, ak._v2.forms.NumpyForm) and form.inner_shape == (): + return primitive_to_dtype(form.primitive) + elif isinstance(form, ak._v2.forms.ListOffsetForm): + return form_dtype(form.content) + + def empty_buffers(cpp_buffers_self, names_nbytes): + buffers = {} + for item in names_nbytes: + buffers[item.first] = ak.nplike.numpy.empty(item.second) + cpp_buffers_self.append( + item.first, + buffers[item.first].ctypes.data_as(ctypes.POINTER(ctypes.c_ubyte)), + ) + return buffers - # Cast input node to base RNode type - data_frame_rnode = cppyy.gbl.ROOT.RDF.AsRNode(data_frame) + def cpp_builder_type(depth, data_type): + if depth == 1: + return f"awkward::LayoutBuilder::Numpy<{data_type}>>" + else: + return ( + "awkward::LayoutBuilder::ListOffset" + ) - 
column_type = data_frame_rnode.GetColumnType(column) - form_str = ROOT.awkward.type_to_form[column_type](0) + def cpp_fill_offsets_and_flatten(depth): + if depth == 1: + return "\nfor (auto it : vec1) {\n" + " builder1.append(it);\n" + "}\n" + else: + return ( + f"for (auto const& vec{depth - 1} : vec{depth}) " + + "{\n" + + f" auto& builder{depth - 1} = builder{depth}.begin_list();\n" + + " " + + cpp_fill_offsets_and_flatten(depth - 1) + + "\n" + + f" builder{depth}.end_list();\n" + + "}\n" + ) - # 'Take' is a lazy action: - result_ptrs = data_frame_rnode.Take[column_type](column) - - if form_str.startswith("{"): - form = ak._v2.forms.from_json(form_str) - list_depth = form.purelist_depth - if list_depth > 4: - raise ak._v2._util.error( - NotImplementedError( - "Retrieving arbitrary depth nested containers is not implemented yet." - ) + def cpp_fill_function(depth): + if depth == 1: + return ( + "template\n" + + "void\n" + + "fill_from(BUILDER& builder, ROOT::RDF::RResultPtr>& result) {" + + " for (auto it : result) {\n" + + " builder.append(it);\n" + + " }\n" + + "}\n" + ) + else: + return ( + "template\n" + + "void\n" + + f"fill_offsets_and_flatten{depth}(BUILDER& builder{depth}, ROOT::RDF::RResultPtr>& result) " + + "{\n" + + f" for (auto const& vec{depth - 1} : result) " + + "{\n" + + f" auto& builder{depth - 1} = builder{depth}.begin_list();\n" + + " " + + cpp_fill_offsets_and_flatten(depth - 1) + + "\n" + + f" builder{depth}.end_list();\n" + + "}\n" + + "}\n" ) - def supported(form): - if form.purelist_depth == 1: - # special case for a list of strings form - return isinstance( - form, (ak._v2.forms.ListOffsetForm, ak._v2.forms.NumpyForm) - ) - else: - return isinstance(form, ak._v2.forms.ListOffsetForm) and supported( - form.content - ) + is_indexed = True if "awkward_index_" in data_frame.GetColumnNames() else False - if not supported(form): - raise ak._v2._util.error(NotImplementedError) + # Register Take action for each column + # 'Take' is a lazy 
action: + result_ptrs = {} + column_types = {} + contents_index = None + columns = ( + columns + ("awkward_index_",) + if (is_indexed and "awkward_index_" not in columns) + else columns + ) + for col in columns: + column_types[col] = data_frame.GetColumnType(col) + result_ptrs[col] = data_frame.Take[column_types[col]](col) + + contents = {} + awkward_contents = {} + contents_index = {} + for col in columns: + col_type = column_types[col] + if ROOT.awkward.is_awkward_type[col_type](): # Retrieve Awkward arrays + + # ROOT::RDF::RResultPtr::begin Returns an iterator to the beginning of + # the contained object if this makes sense, throw a compilation error otherwise. + # + # Does not trigger event loop and execution of all actions booked in + # the associated RLoopManager. + lookup = result_ptrs[col].begin().lookup() + generator = lookup[col].generator + layout = generator.tolayout(lookup[col], 0, ()) + awkward_contents[col] = layout + + else: # Convert the C++ vectors to Awkward arrays + form_str = ROOT.awkward.type_to_form[col_type](0) + form = ak._v2.forms.from_json(form_str) + + list_depth = form.purelist_depth + form_dtype_name = form_dtype(form).name + data_type = cpp_type_of[form_dtype_name] + + # pull in the CppBuffers (after which we can import from it) + CppBuffers = cppyy.gbl.awkward.CppBuffers[col_type] + cpp_buffers_self = CppBuffers(result_ptrs[col]) + + if isinstance(form, ak._v2.forms.NumpyForm): + + NumpyBuilder = cppyy.gbl.awkward.LayoutBuilder.Numpy[data_type] + builder = NumpyBuilder() + builder_type = type(builder).__cpp_name__ + + cpp_buffers_self.fill_from[builder_type, col_type]( + builder, result_ptrs[col] + ) - def form_dtype(form): - if form.purelist_depth == 1: - # special case for a list of strings form - return ( - primitive_to_dtype(form.content.primitive) - if isinstance(form, ak._v2.forms.ListOffsetForm) - else primitive_to_dtype(form.primitive) + names_nbytes = cpp_buffers_self.names_nbytes[builder_type](builder) + buffers = 
empty_buffers(cpp_buffers_self, names_nbytes) + cpp_buffers_self.to_char_buffers[builder_type](builder) + + elif isinstance(form, ak._v2.forms.ListOffsetForm): + if isinstance(form.content, ak._v2.forms.NumpyForm): + # NOTE: list_depth == 2 or 1 if its the list of strings + list_depth = 2 + + ListOffsetBuilder = cppyy.gbl.awkward.LayoutBuilder.ListOffset[ + "int64_t", + cpp_builder_type(list_depth - 1, data_type), + ] + builder = ListOffsetBuilder() + builder_type = type(builder).__cpp_name__ + + if not hasattr( + cppyy.gbl.awkward, f"fill_offsets_and_flatten{list_depth}" + ): + done = cppyy.cppdef( + "namespace awkward {" + cpp_fill_function(list_depth) + "}" + ) + assert done is True + + fill_from_func = getattr( + cppyy.gbl.awkward, f"fill_offsets_and_flatten{list_depth}" ) + fill_from_func[builder_type, col_type](builder, result_ptrs[col]) else: - return form_dtype(form.content) - - def empty_buffers(cpp_buffers_self, names_nbytes): - buffers = {} - for item in names_nbytes: - buffers[item.first] = ak.nplike.numpy.empty(item.second) - cpp_buffers_self.append( - item.first, - buffers[item.first].ctypes.data_as(ctypes.POINTER(ctypes.c_ubyte)), + raise ak._v2._util.error( + AssertionError(f"unrecognized Form: {type(form)}") ) - return buffers - - data_type = cpp_type_of[form_dtype(form).name] - - # pull in the CppBuffers (after which we can import from it) - CppBuffers = cppyy.gbl.awkward.CppBuffers[column_type] - cpp_buffers_self = CppBuffers(result_ptrs) - - if isinstance(form, ak._v2.forms.NumpyForm): - - NumpyBuilder = cppyy.gbl.awkward.LayoutBuilder.Numpy[data_type] - builder = NumpyBuilder() - builder_type = type(builder).__cpp_name__ - - cpp_buffers_self.fill_from[builder_type](builder) - - elif isinstance(form, ak._v2.forms.ListOffsetForm) and isinstance( - form.content, ak._v2.forms.NumpyForm - ): - # NOTE: list_depth == 2 or 1 if its the list of strings - ListOffsetBuilder = cppyy.gbl.awkward.LayoutBuilder.ListOffset[ - "int64_t", - 
f"awkward::LayoutBuilder::Numpy<{data_type}", - ] - builder = ListOffsetBuilder() - builder_type = type(builder).__cpp_name__ - cpp_buffers_self.fill_offsets_and_flatten_2[builder_type](builder) + names_nbytes = cpp_buffers_self.names_nbytes[builder_type](builder) + buffers = empty_buffers(cpp_buffers_self, names_nbytes) + cpp_buffers_self.to_char_buffers[builder_type](builder) - elif list_depth == 3: - ListOffsetBuilder = cppyy.gbl.awkward.LayoutBuilder.ListOffset[ - "int64_t", - f"awkward::LayoutBuilder::ListOffset", - ] - builder = ListOffsetBuilder() - builder_type = type(builder).__cpp_name__ - - cpp_buffers_self.fill_offsets_and_flatten_3[builder_type](builder) + array = ak._v2.from_buffers( + form, + builder.length(), + buffers, + ) + if col == "awkward_index_": + contents_index = ak._v2.index.Index64( + array.layout.to_numpy(allow_missing=True) + ) + else: + contents[col] = array.layout + + for col, content in awkward_contents.items(): + # wrap Awkward array in IndexedArray only if needed + if contents_index is not None and len(contents_index) < len(content): + array = ak._v2._util.wrap( + ak._v2.contents.IndexedArray(contents_index, content), + highlevel=True, + ) + contents[col] = array.layout else: - ListOffsetBuilder = cppyy.gbl.awkward.LayoutBuilder.ListOffset[ - "int64_t", - f"awkward::LayoutBuilder::ListOffset>", - ] - builder = ListOffsetBuilder() - builder_type = type(builder).__cpp_name__ - - cpp_buffers_self.fill_offsets_and_flatten_4[builder_type](builder) - - names_nbytes = cpp_buffers_self.names_nbytes[builder_type](builder) - buffers = empty_buffers(cpp_buffers_self, names_nbytes) - cpp_buffers_self.to_char_buffers[builder_type, data_type](builder) - - array = ak._v2.from_buffers( - form, - builder.length(), - buffers, - ) - return _wrap_as_record_array(array) - - elif form_str == "awkward type": - - # ROOT::RDF::RResultPtr::begin Returns an iterator to the beginning of - # the contained object if this makes sense, throw a compilation error 
otherwise. - # - # Does not trigger event loop and execution of all actions booked in - # the associated RLoopManager. - lookup = result_ptrs.begin().lookup() - generator = lookup[column].generator - layout = generator.tolayout(lookup[column], 0, ()) - - return _wrap_as_record_array(layout) - else: - raise ak._v2._util.error(NotImplementedError) + contents[col] = content + + return ak._v2._util.wrap( + ak._v2.contents.RecordArray(list(contents.values()), list(contents.keys())), + highlevel=True, + ) diff --git a/src/awkward/_v2/_connect/rdataframe/to_rdataframe.py b/src/awkward/_v2/_connect/rdataframe/to_rdataframe.py index 2222e2c015..f984ecbb1c 100644 --- a/src/awkward/_v2/_connect/rdataframe/to_rdataframe.py +++ b/src/awkward/_v2/_connect/rdataframe/to_rdataframe.py @@ -314,4 +314,6 @@ class {array_data_source} final (self.data_ptrs_list), ) + rdf = rdf.Define("awkward_index_", "(int64_t)rdfentry_") + return rdf diff --git a/src/awkward/_v2/_util.py b/src/awkward/_v2/_util.py index b69f96204a..e3c8fcb320 100644 --- a/src/awkward/_v2/_util.py +++ b/src/awkward/_v2/_util.py @@ -483,11 +483,6 @@ def arrayclass(layout, behavior): cls = behavior[arr] if isinstance(cls, type) and issubclass(cls, ak._v2.highlevel.Array): return cls - rec = layout.parameter("__record__") - if isstr(rec): - cls = behavior[".", rec] - if isinstance(cls, type) and issubclass(cls, ak._v2.highlevel.Array): - return cls deeprec = layout.purelist_parameter("__record__") if isstr(deeprec): cls = behavior["*", deeprec] @@ -554,11 +549,6 @@ def numba_array_typer(layouttype, behavior): typer = behavior["__numba_typer__", arr] if callable(typer): return typer - rec = layouttype.parameters.get("__record__") - if isstr(rec): - typer = behavior["__numba_typer__", ".", rec] - if callable(typer): - return typer deeprec = layouttype.parameters.get("__record__") if isstr(deeprec): typer = behavior["__numba_typer__", "*", deeprec] @@ -574,11 +564,6 @@ def numba_array_lower(layouttype, behavior): lower = 
behavior["__numba_lower__", arr] if callable(lower): return lower - rec = layouttype.parameters.get("__record__") - if isstr(rec): - lower = behavior["__numba_lower__", ".", rec] - if callable(lower): - return lower deeprec = layouttype.parameters.get("__record__") if isstr(deeprec): lower = behavior["__numba_lower__", "*", deeprec] diff --git a/src/awkward/_v2/contents/content.py b/src/awkward/_v2/contents/content.py index 657783e5e7..c93f5c0391 100644 --- a/src/awkward/_v2/contents/content.py +++ b/src/awkward/_v2/contents/content.py @@ -1378,12 +1378,14 @@ def to_json( complex_real_string = None complex_imag_string = None elif ( - isinstance(complex_record_fields, tuple) + isinstance(complex_record_fields, (tuple, list)) and len(complex_record_fields) == 2 and isinstance(complex_record_fields[0], str) and isinstance(complex_record_fields[1], str) ): complex_real_string, complex_imag_string = complex_record_fields + else: + complex_real_string, complex_imag_string = None, None return self.packed()._to_list( behavior, @@ -1411,6 +1413,11 @@ def _to_list_custom(self, behavior, json_conversions): for i in range(self.length): out[i] = array[i] + # These json_conversions are applied in NumpyArray (for numbers) + # and ListArray/ListOffsetArray/RegularArray (for bytestrings), + # but they're also applied here because __getitem__ might return + # something convertible (the overloaded __getitem__ might be + # trivial, as it is in Vector). 
if json_conversions is not None: convert_bytes = json_conversions["convert_bytes"] if convert_bytes is not None: diff --git a/src/awkward/_v2/cpp-headers/awkward/utils.h b/src/awkward/_v2/cpp-headers/awkward/utils.h index 7429497a54..6795b0d979 100644 --- a/src/awkward/_v2/cpp-headers/awkward/utils.h +++ b/src/awkward/_v2/cpp-headers/awkward/utils.h @@ -1,7 +1,7 @@ // BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE -#ifndef AWKWARD_UTILS_H_ -#define AWKWARD_UTILS_H_ +#ifndef AWKWARD_CPP_HEADERS_UTILS_H_ +#define AWKWARD_CPP_HEADERS_UTILS_H_ #include #include @@ -17,6 +17,7 @@ namespace awkward { template const std::string type_to_name() { + std::cout << "Type " << typeid(T).name() << " is not recognized." << std::endl; return typeid(T).name(); } @@ -249,6 +250,13 @@ namespace awkward { return "unsupported type"; } + /// @brief Check if an RDataFrame column is an Awkward Array. + template + bool + is_awkward_type() { + return (std::string(typeid(T).name()).find("awkward") != std::string::npos); + } + /// @class visit_impl /// /// @brief Class to index tuple at runtime. 
@@ -299,4 +307,4 @@ namespace awkward { } // namespace awkward -#endif // AWKWARD_UTILS_H_ +#endif // AWKWARD_CPP_HEADERS_UTILS_H_ diff --git a/src/awkward/_v2/cpp-headers/rdataframe/jagged_builders.h b/src/awkward/_v2/cpp-headers/rdataframe/jagged_builders.h index 914f567ab5..59ef4a6d50 100644 --- a/src/awkward/_v2/cpp-headers/rdataframe/jagged_builders.h +++ b/src/awkward/_v2/cpp-headers/rdataframe/jagged_builders.h @@ -44,85 +44,20 @@ namespace awkward { std::cout << std::endl; } - template + template void - fill_from(BUILDER& builder) const { - for (auto it : result_) { + fill_from(BUILDER& builder, ROOT::RDF::RResultPtr>& result) const { + for (auto it : result) { builder.append(it); } } - template + template void to_char_buffers(BUILDER& builder) { builder.to_char_buffers(buffers_uint8_ptr_); } - template - void - fill_offsets_and_flatten_2(BUILDER& builder) const { - for (auto const& vec : result_) { - auto& subbuilder = builder.begin_list(); - for (auto it : vec) { - subbuilder.append(it); - } - builder.end_list(); - } - } - - template - void - fill_offsets_and_flatten_3(BUILDER& builder) const { - for (auto const& vec_of_vecs : result_) { - auto& builder1 = builder.begin_list(); - for (auto const& vec : vec_of_vecs) { - auto& builder2 = builder1.begin_list(); - for (auto it : vec) { - builder2.append(it); - } - builder1.end_list(); - } - builder.end_list(); - } - } - - template - void - fill_offsets_and_flatten_4(BUILDER& builder) const { - for (auto const& vec_of_vecs_of_vecs : result_) { - auto& builder1 = builder.begin_list(); - for (auto const& vec_of_vecs : vec_of_vecs_of_vecs) { - auto& builder2 = builder1.begin_list(); - for (auto const& vec : vec_of_vecs) { - auto& builder3 = builder2.begin_list(); - for (auto it : vec) { - builder3.append(it); - } - builder2.end_list(); - } - builder1.end_list(); - } - builder.end_list(); - } - } - - template - void - recurse_fill_from(int64_t level, BUILDER& builder, ITERABLE& result) const { - if (level == 0) { 
- for (auto it : result) { - builder.append(it); - } - } - else { - auto& next_builder = builder.begin_list(); - for (auto& it : result) { - recurse_fill_from(level - 1, next_builder, it); - } - next_builder.end_list(); - } - } - private: ROOT::RDF::RResultPtr>& result_; std::map map_names_nbytes_; diff --git a/src/awkward/_v2/operations/ak_fields.py b/src/awkward/_v2/operations/ak_fields.py index 9c6571735a..2faa8a48a6 100644 --- a/src/awkward/_v2/operations/ak_fields.py +++ b/src/awkward/_v2/operations/ak_fields.py @@ -27,4 +27,4 @@ def fields(array): def _impl(array): layout = ak._v2.operations.to_layout(array, allow_record=True, allow_other=False) - return layout.fields + return layout.fields.copy() diff --git a/src/awkward/_v2/operations/ak_from_rdataframe.py b/src/awkward/_v2/operations/ak_from_rdataframe.py index c37f6f0ba9..20bf2da6b5 100644 --- a/src/awkward/_v2/operations/ak_from_rdataframe.py +++ b/src/awkward/_v2/operations/ak_from_rdataframe.py @@ -3,12 +3,12 @@ import awkward as ak -def from_rdataframe(data_frame, column): +def from_rdataframe(data_frame, columns): """ Args: data_frame (`ROOT.RDataFrame`): ROOT RDataFrame to convert into an Awkward Array. - column (str): A column to be converted to Awkward Array. + columns (str or tuple of str): A column or multiple columns to be converted to Awkward Array. Converts ROOT Data Frame columns into an Awkward Array. 
@@ -18,22 +18,22 @@ def from_rdataframe(data_frame, column): "ak._v2.from_rdataframe", dict( data_frame=data_frame, - column=column, + columns=columns, ), ): return _impl( data_frame, - column, + columns, ) def _impl( data_frame, - column, + columns, ): import awkward._v2._connect.rdataframe.from_rdataframe # noqa: F401 return ak._v2._connect.rdataframe.from_rdataframe.from_rdataframe( data_frame, - column, + columns, ) diff --git a/src/awkward/_v2/operations/ak_parameters.py b/src/awkward/_v2/operations/ak_parameters.py index efa034d746..1b90916cbc 100644 --- a/src/awkward/_v2/operations/ak_parameters.py +++ b/src/awkward/_v2/operations/ak_parameters.py @@ -1,5 +1,8 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +import copy +import numbers + import awkward as ak np = ak.nplike.NumpyMetadata.instance() @@ -27,13 +30,13 @@ def parameters(array): def _impl(array): if isinstance(array, (ak._v2.highlevel.Array, ak._v2.highlevel.Record)): - return array.layout.parameters + return _copy(array.layout.parameters) elif isinstance( array, (ak._v2.contents.Content, ak._v2.record.Record), ): - return array.parameters + return _copy(array.parameters) elif isinstance(array, ak._v2.highlevel.ArrayBuilder): return array.snapshot().layout.parameters @@ -43,3 +46,10 @@ def _impl(array): else: return {} + + +def _copy(what): + if all(isinstance(x, (str, numbers.Real)) for x in what.values()): + return what.copy() + else: + return copy.deepcopy(what) diff --git a/tests/v2/test_1374-to-rdataframe.py b/tests/v2/test_1374-to-rdataframe.py index de31930976..f6d917290c 100644 --- a/tests/v2/test_1374-to-rdataframe.py +++ b/tests/v2/test_1374-to-rdataframe.py @@ -26,7 +26,7 @@ def test_two_columns(): data_frame = ak._v2.to_rdataframe( {"x": ak_array_1, "y": ak_array_2}, flatlist_as_rvec=True ) - assert set(data_frame.GetColumnNames()) == {"x", "y"} + assert set(data_frame.GetColumnNames()) == {"x", "y", "awkward_index_"} assert 
data_frame.GetColumnType("x") == "ROOT::VecOps::RVec" assert data_frame.GetColumnType("y").startswith("awkward::ListArray_") @@ -38,7 +38,7 @@ def test_two_columns_as_rvecs(): ) data_frame = ak._v2.to_rdataframe({"x": ak_array_1, "y": ak_array_2}) - assert set(data_frame.GetColumnNames()) == {"x", "y"} + assert set(data_frame.GetColumnNames()) == {"x", "y", "awkward_index_"} assert data_frame.GetColumnType("x") == "double" assert data_frame.GetColumnType("y").startswith("awkward::Record_") @@ -120,7 +120,7 @@ def test_two_columns_as_vecs(): data_frame = ak._v2.operations.to_rdataframe( {"x": ak_array_1, "y": ak_array_2}, flatlist_as_rvec=False ) - assert set(data_frame.GetColumnNames()) == {"x", "y"} + assert set(data_frame.GetColumnNames()) == {"x", "y", "awkward_index_"} assert data_frame.GetColumnType("x") == "double" assert data_frame.GetColumnType("y").startswith("awkward::Record_") @@ -166,7 +166,7 @@ def test_two_columns_transform_filter(): ) data_frame = ak._v2.to_rdataframe({"one": example1, "two": example2}) - assert set(data_frame.GetColumnNames()) == {"one", "two"} + assert set(data_frame.GetColumnNames()) == {"one", "two", "awkward_index_"} compiler( """ @@ -181,7 +181,12 @@ def test_two_columns_transform_filter(): data_frame_transformed = ROOT.MyTransformation[data_frame.GetColumnType("one")]( ROOT.RDF.AsRNode(data_frame) ) - assert set(data_frame_transformed.GetColumnNames()) == {"neg_one", "one", "two"} + assert set(data_frame_transformed.GetColumnNames()) == { + "neg_one", + "one", + "two", + "awkward_index_", + } assert data_frame_transformed.Count().GetValue() == 5 data_frame2 = data_frame.Filter("one > 2.5") @@ -194,9 +199,9 @@ def test_two_columns_transform_filter(): def test_jims_example1(): array = ak._v2.Array([{"x": 1.1}, {"x": 2.2}, {"x": 3.3}, {"x": 4.4}, {"x": 5.5}]) data_frame = ak._v2.to_rdataframe({"some_array": array}) - assert set(data_frame.GetColumnNames()) == {"some_array"} + assert set(data_frame.GetColumnNames()) == 
{"some_array", "awkward_index_"} data_frame_y = data_frame.Define("y", "some_array.x()") - assert set(data_frame_y.GetColumnNames()) == {"some_array", "y"} + assert set(data_frame_y.GetColumnNames()) == {"some_array", "y", "awkward_index_"} cpp_list = ", ".join(str(e) for e in array.x.to_list()) diff --git a/tests/v2/test_1449-v2-to_json-from_json-functions.py b/tests/v2/test_1449-v2-to_json-from_json-functions.py index e6ab379e10..9f39a502dc 100644 --- a/tests/v2/test_1449-v2-to_json-from_json-functions.py +++ b/tests/v2/test_1449-v2-to_json-from_json-functions.py @@ -8,6 +8,58 @@ import awkward as ak # noqa: F401 +def test_without_control(): + array = ak._v2.Array( + [ + {"ok": 1, "x": 1.1, "y": 1 + 1j, "z": b"one"}, + {"ok": 2, "x": 2.2, "y": 2 + 2j, "z": b"two"}, + {"ok": 3, "x": 3.3, "y": 3 + 3j, "z": b"three"}, + {"ok": 4, "x": float("nan"), "y": float("nan"), "z": b"four"}, + {"ok": 5, "x": float("inf"), "y": float("inf") + 5j, "z": b"five"}, + {"ok": 6, "x": float("-inf"), "y": 6 + float("-inf") * 1j, "z": b"six"}, + {"ok": 7, "x": 7.7, "y": 7 + 7j, "z": b"seven"}, + {"ok": 8, "x": None, "y": 8 + 8j, "z": b"eight"}, + {"ok": 9, "x": 9.9, "y": 9 + 9j, "z": b"nine"}, + ] + ) + + assert ak._v2.to_json(array.ok) == "[1,2,3,4,5,6,7,8,9]" + + with pytest.raises(ValueError): + ak._v2.to_json(array.x) + + assert ak._v2.to_json(array.x[:3]) == "[1.1,2.2,3.3]" + + with pytest.raises(ValueError): + ak._v2.to_json(array.x, nan_string="NAN") + + with pytest.raises(ValueError): + ak._v2.to_json(array.x, nan_string="NAN", posinf_string="INF") + + assert ( + ak._v2.to_json( + array.x, nan_string="NAN", posinf_string="INF", neginf_string="-INF" + ) + == '[1.1,2.2,3.3,"NAN","INF","-INF",7.7,null,9.9]' + ) + + with pytest.raises(TypeError): + ak._v2.to_json(array.y[:3]) + + assert ( + ak._v2.to_json(array.y[:3], complex_record_fields=["R", "I"]) + == '[{"R":1.0,"I":1.0},{"R":2.0,"I":2.0},{"R":3.0,"I":3.0}]' + ) + + with pytest.raises(TypeError): + ak._v2.to_json(array.z) + + 
assert ( + ak._v2.to_json(array.z, convert_bytes=lambda x: x.decode()) + == '["one","two","three","four","five","six","seven","eight","nine"]' + ) + + def test_to_json_options(tmp_path): filename = os.path.join(tmp_path, "whatever.json") diff --git a/tests/v2/test_1473-from-rdataframe.py b/tests/v2/test_1473-from-rdataframe.py index 9df4bc5638..cb98d70aaa 100644 --- a/tests/v2/test_1473-from-rdataframe.py +++ b/tests/v2/test_1473-from-rdataframe.py @@ -41,7 +41,7 @@ def test_to_from_data_frame_large(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert len(ak_array_in) == len(ak_array_out) @@ -56,7 +56,7 @@ def test_data_frame_boolean(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out.to_list() @@ -70,7 +70,7 @@ def test_data_frame_integers(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -84,7 +84,7 @@ def test_data_frame_real(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -100,7 +100,7 @@ def test_data_frame_complex(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -114,7 +114,7 @@ def test_data_frame_strings(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -128,7 +128,7 @@ def test_data_frame_vec_of_integers(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -142,7 +142,7 @@ def test_data_frame_vec_of_real(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == 
ak_array_out["x"].to_list() @@ -158,7 +158,7 @@ def test_data_frame_vec_of_complex(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -172,7 +172,7 @@ def test_data_frame_vec_of_strings(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -186,7 +186,7 @@ def test_data_frame_vec_of_vec_of_integers(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -200,7 +200,7 @@ def test_data_frame_vec_of_vec_of_real(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -216,7 +216,7 @@ def test_data_frame_vec_of_vec_of_complex(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -227,13 +227,13 @@ def test_rdata_frame_vecs_as_records(): ak_array_x = ak._v2.from_rdataframe( data_frame_xy, - column="x", + columns=("x",), ) assert ak_array_x["x"].layout.form == ak._v2.forms.NumpyForm("float64") ak_record_array_x = ak._v2.from_rdataframe( data_frame_xy, - column="x", + columns=("x",), ) assert ak_record_array_x.layout.form == ak._v2.forms.RecordForm( [ak._v2.forms.NumpyForm("float64")], "x" @@ -241,7 +241,7 @@ def test_rdata_frame_vecs_as_records(): ak_record_array_y = ak._v2.from_rdataframe( data_frame_xy, - column="y", + columns=("y",), ) ak_array = ak._v2.zip([ak_record_array_x, ak_record_array_y]) assert ak_array.layout.form == ak._v2.forms.RecordForm( @@ -259,7 +259,7 @@ def test_rdata_frame_vecs_of_complex(): ak_array_y = ak._v2.from_rdataframe( data_frame_xy, - column="y", + columns=("y",), ) assert ak_array_y["y"].layout.form == ak._v2.forms.NumpyForm("complex128") @@ -285,7 +285,7 @@ def 
test_rdata_frame_rvecs_as_records(): array = ak._v2.from_rdataframe( data_frame_x_y_r, - column="r", + columns=("r",), ) assert array.layout.form == ak._v2.forms.RecordForm( @@ -303,7 +303,7 @@ def test_to_from_data_frame(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_out["x"].layout.content.is_contiguous is True @@ -318,7 +318,7 @@ def test_to_from_data_frame_rvec_of_rvec(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -333,7 +333,7 @@ def test_to_from_data_frame_rvec_of_rvec_of_rvec(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() diff --git a/tests/v2/test_1508-awkward-from-rdataframe.py b/tests/v2/test_1508-awkward-from-rdataframe.py index 1c3336aa82..602ddc73eb 100644 --- a/tests/v2/test_1508-awkward-from-rdataframe.py +++ b/tests/v2/test_1508-awkward-from-rdataframe.py @@ -37,7 +37,7 @@ def test_refcount(): array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert array.to_list() == array_out["x"].to_list() @@ -122,7 +122,7 @@ def test_data_frame_vec_of_vec_of_integers(): ak_array_out = ak._v2.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() diff --git a/tests/v2/test_1613-generator-tolayout-records.py b/tests/v2/test_1613-generator-tolayout-records.py index 3125620832..bdbe628f49 100644 --- a/tests/v2/test_1613-generator-tolayout-records.py +++ b/tests/v2/test_1613-generator-tolayout-records.py @@ -334,6 +334,6 @@ def test_data_frame_from_json(): data_frame = ak._v2.to_rdataframe({"variants": array}) out = ak._v2.from_rdataframe( data_frame, - column="variants", + columns=("variants",), ) assert array.to_list() == out["variants"].to_list() diff --git a/tests/v2/test_1620-layout-builders.py 
b/tests/v2/test_1620-layout-builders.py index 52ef187d9c..4428c58f1a 100644 --- a/tests/v2/test_1620-layout-builders.py +++ b/tests/v2/test_1620-layout-builders.py @@ -20,7 +20,7 @@ def test_data_frame_integers(): ak_array_out = ak.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -34,7 +34,7 @@ def test_data_frame_double(): ak_array_out = ak.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -48,7 +48,7 @@ def test_data_frame_char(): ak_array_out = ak.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -62,7 +62,7 @@ def test_data_frame_complex(): ak_array_out = ak.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -76,7 +76,7 @@ def test_data_frame_listoffset_integers(): ak_array_out = ak.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -97,7 +97,7 @@ def test_data_frame_listoffset_listoffset_double(): ak_array_out = ak.from_rdataframe( data_frame, - column="x", + columns=("x",), ) assert ak_array_in.to_list() == ak_array_out["x"].to_list() @@ -141,7 +141,7 @@ def test_data_frame_vec_of_vec(): assert rdf3.GetColumnType("output") == "vector >" out = ak.from_rdataframe( rdf3, - column="output", + columns=("output",), ) assert out["output"].to_list() == (array["y"] * array["y"] * 1.0).to_list() @@ -172,7 +172,7 @@ def test_data_frame_vec_of_vec(): assert rdf3.GetColumnType("output2") == "vector > >" out = ak.from_rdataframe( # noqa: F841 rdf3, - column="output2", + columns=("output2",), ) result = ak.Array( [ diff --git a/tests/v2/test_1625-multiple-columns-from-rdataframe.py b/tests/v2/test_1625-multiple-columns-from-rdataframe.py new file mode 100644 index 0000000000..1d4d864205 --- /dev/null 
# File: tests/v2/test_1625-multiple-columns-from-rdataframe.py
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

"""Tests for ``ak.from_rdataframe`` called with a tuple of column names."""

import pytest  # noqa: F401
import numpy as np  # noqa: F401
import awkward._v2 as ak  # noqa: F401


ROOT = pytest.importorskip("ROOT")


compiler = ROOT.gInterpreter.Declare


def test_data_frame_integers():
    """Read an integer column and a floating-point column back together."""
    ak_array_x = ak.Array([1, 2, 3, 4, 5])
    ak_array_y = ak.Array([1.1, 2.2, 3.3, 4.4, 5.5])

    data_frame = ak.to_rdataframe({"x": ak_array_x, "y": ak_array_y})

    assert data_frame.GetColumnType("x") == "int64_t"
    assert data_frame.GetColumnType("y") == "double"

    ak_array_out = ak.from_rdataframe(
        data_frame,
        columns=("x", "y"),
    )
    assert ak_array_x.to_list() == ak_array_out["x"].to_list()
    assert ak_array_y.to_list() == ak_array_out["y"].to_list()


def test_data_frame_vec_of_vec_of_real():
    """Read back a doubly nested list-of-floats column."""
    ak_array_in = ak.Array([[[1.1], [2.2]], [[3.3], [4.4, 5.5]]])

    data_frame = ak.to_rdataframe({"x": ak_array_in})

    assert data_frame.GetColumnType("x").startswith("awkward::ListArray_")

    ak_array_out = ak.from_rdataframe(
        data_frame,
        columns=("x",),
    )
    assert ak_array_in.to_list() == ak_array_out["x"].to_list()


def test_data_frame_filter():
    """Read two flat columns from a filtered data frame."""
    ak_array_x = ak.Array([1, 2, 3, 4, 5])
    ak_array_y = ak.Array([1.1, 2.2, 3.3, 4.4, 5.5])

    data_frame = ak.to_rdataframe({"x": ak_array_x, "y": ak_array_y})
    rdf3 = data_frame.Filter("x > 3")

    assert data_frame.GetColumnType("x") == "int64_t"
    assert data_frame.GetColumnType("y") == "double"

    ak_array_out = ak.from_rdataframe(
        rdf3,
        columns=(
            "x",
            "y",
        ),
    )
    # "x > 3" keeps only the entries where x is 4 or 5, i.e. the last two.
    assert ak_array_x[3:].to_list() == ak_array_out["x"].to_list()
    assert ak_array_y[3:].to_list() == ak_array_out["y"].to_list()


def test_data_frame_rvec_filter():
    """Read two list-typed columns from data frames filtered on list size."""
    ak_array_x = ak.Array([[1, 2], [3], [4, 5]])
    ak_array_y = ak.Array([[1.0, 1.1], [2.2, 3.3, 4.4], [5.5]])

    data_frame = ak.to_rdataframe({"x": ak_array_x, "y": ak_array_y})
    rdf3 = data_frame.Filter("x.size() >= 2")

    assert data_frame.GetColumnType("x") == "ROOT::VecOps::RVec<int64_t>"
    assert data_frame.GetColumnType("y") == "ROOT::VecOps::RVec<double>"

    ak_array_out = ak.from_rdataframe(
        rdf3,
        columns=(
            "x",
            "y",
        ),
    )
    assert ak_array_out["x"].to_list() == [[1, 2], [4, 5]]
    assert ak_array_out["y"].to_list() == [[1.0, 1.1], [5.5]]

    rdf4 = data_frame.Filter("y.size() == 2")
    ak_array_out = ak.from_rdataframe(
        rdf4,
        columns=(
            "x",
            "y",
        ),
    )
    assert ak_array_out["x"].to_list() == [[1, 2]]
    assert ak_array_out["y"].to_list() == [[1.0, 1.1]]


def test_data_frame_double():
    """Read back a single flat column of doubles."""
    ak_array_in = ak.Array([1.1, 2.2, 3.3, 4.4, 5.5])

    data_frame = ak.to_rdataframe({"x": ak_array_in})

    assert data_frame.GetColumnType("x") == "double"

    ak_array_out = ak.from_rdataframe(data_frame, columns=("x",))
    assert ak_array_in.to_list() == ak_array_out["x"].to_list()


def test_data_frame_vec_of_vec():
    """Read two ``std::vector``-based columns defined by JIT-compiled C++."""
    array = ak.Array(
        [
            [
                {"x": 1.1, "y": [1]},
                {"x": None, "y": [1, 2]},
                {"x": 3.3, "y": [1, 2, 3]},
            ],
            [],
            [{"x": None, "y": [1, 2, 3, 4]}, {"x": 5.5, "y": [1, 2, 3, 4, 5]}],
        ]
    )

    rdf2 = ak.to_rdataframe({"array": array})
    # We create a matrix RxC here
    # Note when dimensions R and C are large, the following code suffers
    # from potential performance penalties caused by frequent reallocation
    # of memory by the push_back() function. This should be used only when
    # vector dimensions are not known in advance.
    rdf3 = rdf2.Define(
        "output",
        """
    std::vector<std::vector<double>> tmp1;

    for (auto record : array) {
        std::vector<double> tmp2;
        for (auto number : record.y()) {
            tmp2.push_back(number * number);
        }
        tmp1.push_back(tmp2);
    }
    return tmp1;
    """,
    )

    assert rdf3.GetColumnType("output") == "vector<vector<double> >"

    rdf4 = rdf3.Define(
        "output2",
        """
    std::vector<std::vector<std::vector<double>>> tmp1;

    for (auto record : array) {
        std::vector<std::vector<double>> tmp2;
        // we can check if it's None:
        // if (record.x().has_value())
        // or set it to 1 so that we do not scale:
        double x_number = record.x().value_or(1);
        for (auto number : record.y()) {
            std::vector<double> tmp3;
            for (int64_t i = 0; i < std::rint(x_number); i++) {
                double value = x_number * number;
                tmp3.push_back(value);
            }
            tmp2.push_back(tmp3);
        }
        tmp1.push_back(tmp2);
    }
    return tmp1;
    """,
    )
    assert rdf4.GetColumnType("output2") == "vector<vector<vector<double> > >"

    out = ak.from_rdataframe(
        rdf4,
        columns=(
            "output",
            "output2",
        ),
    )

    assert out["output"].to_list() == (array["y"] * array["y"] * 1.0).to_list()
    # Each "y" value is multiplied by "x" (or by 1 when "x" is None) and
    # repeated std::rint(x) times in its own innermost list.
    result = ak.Array(
        [
            [
                [[1.1]],  # "x" is 1.1 - "y" values are scaled by 1.1, repeated once
                [
                    [1.0],
                    [2.0],
                ],  # "x" is None - "y" values are unchanged, repeated once
                [
                    [3.3, 3.3, 3.3],
                    [6.6, 6.6, 6.6],
                    [9.899999999999999, 9.899999999999999, 9.899999999999999],
                ],  # "x" is 3.3 - "y" values are scaled by 3.3, repeated 3 times
            ],
            [],
            [
                [
                    [1.0],
                    [2.0],
                    [3.0],
                    [4.0],
                ],  # "x" is None - "y" values are unchanged, repeated once
                [
                    [5.5, 5.5, 5.5, 5.5, 5.5, 5.5],
                    [11.0, 11.0, 11.0, 11.0, 11.0, 11.0],
                    [16.5, 16.5, 16.5, 16.5, 16.5, 16.5],
                    [22.0, 22.0, 22.0, 22.0, 22.0, 22.0],
                    [27.5, 27.5, 27.5, 27.5, 27.5, 27.5],
                ],  # "x" is 5.5 - "y" values are scaled by 5.5, repeated 6 times
            ],
        ]
    )
    assert out["output2"].to_list() == result.to_list()


def test_rdata_frame_rvecs_as_records():
    """Read three ``RVec`` columns and check the resulting record layout."""

    def list_of_float64():
        # Fresh form instance per use: variable-length lists of float64.
        return ak.forms.ListOffsetForm("i64", ak.forms.NumpyForm("float64"))

    data_frame = ROOT.RDataFrame(1024)
    coordDefineCode = """ROOT::VecOps::RVec<double> {0}(len);
    std::transform({0}.begin(), {0}.end(), {0}.begin(), [](double){{return gRandom->Uniform(-1.0, 1.0);}});
    return {0};"""

    data_frame_x_y = (
        data_frame.Define("len", "gRandom->Uniform(0, 16)")
        .Define("x", coordDefineCode.format("x"))
        .Define("y", coordDefineCode.format("y"))
    )

    # We now have data_frame_x_y, an RDataFrame with two columns, x and y,
    # which hold collections of coordinates. The sizes of these collections
    # vary. Let's now define radii out of x and y. We'll do it treating the
    # collections stored in the columns without looping on the individual
    # elements.
    data_frame_x_y_r = data_frame_x_y.Define("r", "sqrt(x*x + y*y)")
    assert data_frame_x_y_r.GetColumnType("r") == "ROOT::VecOps::RVec<double>"

    array = ak.from_rdataframe(
        data_frame_x_y_r,
        columns=(
            "x",
            "y",
            "r",
        ),
    )

    assert array["x"].layout.form == list_of_float64()
    assert array["y"].layout.form == list_of_float64()
    assert array["r"].layout.form == list_of_float64()

    assert array.layout.form == ak.forms.RecordForm(
        [
            list_of_float64(),
            list_of_float64(),
            list_of_float64(),
        ],
        [
            "x",
            "y",
            "r",
        ],
    )