From f5e40dc45ba3465dfaf1c32af020b63a294b6982 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Thu, 5 Oct 2023 12:25:56 -0400 Subject: [PATCH] GH-37978: [C++] Add support for specifying custom Array element delimiter to `arrow::PrettyPrintOptions` (#37981) ### Rationale for this change In order to make the [`arrow::PrettyPrint`](https://github.com/apache/arrow/blob/7667b81bffcb5b361fab6d61c42ce396d98cc6e1/cpp/src/arrow/pretty_print.h#L101) functionality for `arrow::Array`/`arrow::ChunkedArray` more flexible, it would be useful to be able to specify a custom element delimiter other than `","`. For example, the MATLAB interface wraps the Arrow C++ libraries, and being able to specify a custom `Array` element delimiter would make it possible to make the display of MATLAB `arrow.array.Array` objects more MATLAB-like. For the MATLAB interface, we would like to enable display that looks something like the following (note the ` | ` between individual `Array` elements): ```matlab % Make a MATLAB array. >> A = 1:5 A = 1 2 3 4 5 % Make an Arrow array from the MATLAB array. >> B = arrow.array(A) B = [ 1 | 2 | 3 | 4 | 5 ] ``` In order to support custom `Array` element delimiters, this pull request adds a new `struct` type `PrettyPrintDelimiters`. The `PrettyPrintDelimiters` type has one property `element` (of type `std::string`), which allows client code to control the delimiter used to distinguish between individual elements of an `arrow::Array` / `arrow::ChunkedArray`. In a future pull request, we plan to add more properties like `open` and `close` to allow client code to specify the opening and closing delimiters to use when printing an `arrow::Array` / `arrow::ChunkedArray` (e.g. `"<"` rather than `"["` and `">"` rather than `"]"`). ### What changes are included in this PR? 1. Added a new `struct` type `PrettyPrintDelimiters` with one property `element` (of type `std::string`). 
The `element` property allows client code to specify any string value as the delimiter to distinguish between individual elements of an `arrow::Array` or `arrow::ChunkedArray` when printing using `arrow::PrettyPrint`. 2. Added two new properties to `arrow::PrettyPrintOptions` - (1) `array_delimiters` (of type `arrow::PrettyPrintDelimiters`) and (2) `chunked_array_delimiters` (of type `arrow::PrettyPrintDelimiters`). These properties can be modified to customize how `arrow::Array`/`arrow::ChunkedArray` are printed when using `arrow::PrettyPrint`. ### Are these changes tested? Yes. 1. Added new tests `ArrayCustomElementDelimiter` and `ChunkedArrayCustomElementDelimiter` to `pretty_print_test.cc`. 2. All existing `PrettyPrint`-related C++ tests pass. ### Are there any user-facing changes? Yes. 1. Users can now specify a custom element delimiter to use when printing `arrow::Array`s / `arrow::ChunkedArray`s using [`arrow::PrettyPrint`](https://github.com/apache/arrow/blob/7667b81bffcb5b361fab6d61c42ce396d98cc6e1/cpp/src/arrow/pretty_print.h#L101) by modifying the `array_delimiters` or `chunked_array_delimiters` properties of `arrow::PrettyPrintOptions`. **Example**: ```cpp auto array = ...; auto stream = ...; arrow::PrettyPrintOptions options = arrow::PrettyPrintOptions::Defaults(); // Use " | " as the element-wise (element = scalar value) delimiter for arrow::Array. options.array_delimiters.element = " | "; // Use ";" as the element-wise (element = chunk) delimiter for arrow::ChunkedArray. options.chunked_array_delimiters.element = ";"; arrow::PrettyPrint(array, options, stream); ``` ### Future Directions 1. To keep this pull request small and focused, I intentionally chose not to include changes related to specifying custom opening and closing `Array` delimiters (e.g. use `<` and `>` instead of `[` and `]`). I've captured the idea of supporting custom opening and closing `Array` delimiters in #37979. I will follow up with a future PR to address this. ### Notes 1. 
This pull request was motivated by our desire to improve the display of Arrow related classes in the MATLAB interface, but it is hopefully a generic enough change that it may benefit other use cases too. 2. @ rok helpfully pointed out in https://github.com/apache/arrow/issues/37978#issuecomment-1743715458 that a similar attempt to modify the default `Array` element delimiter to be `", "` (note the space after the comma) was taken in #30951. However, this issue appears to have gone stale and the PR (https://github.com/apache/arrow/pull/12420) that was opened also seems to have gone stale. If these changes get merged, it may make sense to close out this issue since this one seems to at least partially address it (although it isn't exactly the same, since it doesn't change the default delimiter to be `", "`; however, for `PyArrow`, `array_delimiters.element` and `chunked_array_delimiters.element` could just be set to `", "` after merging these changes to change the default display if that is still desirable). 
* Closes: #37978 Authored-by: Kevin Gurney Signed-off-by: Kevin Gurney --- cpp/src/arrow/pretty_print.cc | 19 ++--- cpp/src/arrow/pretty_print.h | 23 +++++- cpp/src/arrow/pretty_print_test.cc | 111 +++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 03e2051c2fb88..a4a1fa90c2878 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -151,14 +151,14 @@ class ArrayPrinter : public PrettyPrinter { IndentAfterNewline(); (*sink_) << "..."; if (!is_last && options_.skip_new_lines) { - (*sink_) << ","; + (*sink_) << options_.array_delimiters.element; } i = array.length() - window - 1; } else if (array.IsNull(i)) { IndentAfterNewline(); (*sink_) << options_.null_rep; if (!is_last) { - (*sink_) << ","; + (*sink_) << options_.array_delimiters.element; } } else { if (indent_non_null_values) { @@ -166,7 +166,7 @@ class ArrayPrinter : public PrettyPrinter { } RETURN_NOT_OK(func(i)); if (!is_last) { - (*sink_) << ","; + (*sink_) << options_.array_delimiters.element; } } Newline(); @@ -453,12 +453,12 @@ Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& op if (!skip_new_lines) { *sink << "\n"; } - bool skip_comma = true; + bool skip_element_delimiter = true; for (int i = 0; i < num_chunks; ++i) { - if (skip_comma) { - skip_comma = false; + if (skip_element_delimiter) { + skip_element_delimiter = false; } else { - (*sink) << ","; + (*sink) << options.chunked_array_delimiters.element; if (!skip_new_lines) { *sink << "\n"; } @@ -467,12 +467,13 @@ Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& op for (int i = 0; i < indent; ++i) { (*sink) << " "; } - (*sink) << "...,"; + (*sink) << "..."; + (*sink) << options.chunked_array_delimiters.element; if (!skip_new_lines) { *sink << "\n"; } i = num_chunks - window - 1; - skip_comma = true; + skip_element_delimiter = true; } else { PrettyPrintOptions 
chunk_options = options; chunk_options.indent += options.indent_size; diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 5d22fd5c51ab8..96a214c68b8a6 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -32,7 +32,21 @@ class Schema; class Status; class Table; -struct PrettyPrintOptions { +/// \class PrettyPrintDelimiters +/// \brief Options for controlling which delimiters to use when printing +/// an Array or ChunkedArray. +struct ARROW_EXPORT PrettyPrintDelimiters { + /// Delimiter for separating individual elements of an Array (e.g. ","), + /// or individual chunks of a ChunkedArray + std::string element = ","; + + /// Create a PrettyPrintDelimiters instance with default values + static PrettyPrintDelimiters Defaults() { return PrettyPrintDelimiters(); } +}; + +/// \class PrettyPrintOptions +/// \brief Options for controlling how various Arrow types should be printed. +struct ARROW_EXPORT PrettyPrintOptions { PrettyPrintOptions() = default; PrettyPrintOptions(int indent, // NOLINT runtime/explicit @@ -47,6 +61,7 @@ struct PrettyPrintOptions { skip_new_lines(skip_new_lines), truncate_metadata(truncate_metadata) {} + /// Create a PrettyPrintOptions instance with default values static PrettyPrintOptions Defaults() { return PrettyPrintOptions(); } /// Number of spaces to shift entire formatted object to the right @@ -77,6 +92,12 @@ struct PrettyPrintOptions { /// If true, display schema metadata when pretty-printing a Schema bool show_schema_metadata = true; + + /// Delimiters to use when printing an Array + PrettyPrintDelimiters array_delimiters = PrettyPrintDelimiters::Defaults(); + + /// Delimiters to use when printing a ChunkedArray + PrettyPrintDelimiters chunked_array_delimiters = PrettyPrintDelimiters::Defaults(); }; /// \brief Print human-readable representation of RecordBatch diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 9a6e347c0bdb2..45bb4ecffe054 100644 
--- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -200,6 +200,65 @@ TEST_F(TestPrettyPrint, PrimitiveTypeNoNewlines) { CheckPrimitive(options, is_valid, values, expected, false); } +TEST_F(TestPrettyPrint, ArrayCustomElementDelimiter) { + PrettyPrintOptions options{}; + // Use a custom array element delimiter of " | ", + // rather than the default delimiter (i.e. ","). + options.array_delimiters.element = " | "; + + // Short array without ellipsis + { + std::vector is_valid = {true, true, false, true, false}; + std::vector values = {1, 2, 3, 4, 5}; + static const char* expected = R"expected([ + 1 | + 2 | + null | + 4 | + null +])expected"; + CheckPrimitive(options, is_valid, values, expected, false); + } + + // Longer array with ellipsis + { + std::vector is_valid = {true, false, true}; + std::vector values = {1, 2, 3}; + // Append 20 copies of the value "10" to the end of the values vector. + values.insert(values.end(), 20, 10); + // Append 20 copies of the value "true" to the end of the validity bitmap vector. + is_valid.insert(is_valid.end(), 20, true); + // Append the values 4, 5, and 6 to the end of the values vector. + values.insert(values.end(), {4, 5, 6}); + // Append the values true, false, and true to the end of the validity bitmap vector. + is_valid.insert(is_valid.end(), {true, false, true}); + static const char* expected = R"expected([ + 1 | + null | + 3 | + 10 | + 10 | + 10 | + 10 | + 10 | + 10 | + 10 | + ... 
+ 10 | + 10 | + 10 | + 10 | + 10 | + 10 | + 10 | + 4 | + null | + 6 +])expected"; + CheckPrimitive(options, is_valid, values, expected, false); + } +} + TEST_F(TestPrettyPrint, Int8) { static const char* expected = R"expected([ 0, @@ -1020,6 +1079,58 @@ TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { CheckStream(chunked_array_2, {0}, expected_2); } +TEST_F(TestPrettyPrint, ChunkedArrayCustomElementDelimiter) { + PrettyPrintOptions options{}; + // Use a custom ChunkedArray element delimiter of ";", + // rather than the default delimiter (i.e. ","). + options.chunked_array_delimiters.element = ";"; + // Use a custom Array element delimiter of " | ", + // rather than the default delimiter (i.e. ","). + options.array_delimiters.element = " | "; + + const auto chunk = ArrayFromJSON(int32(), "[1, 2, null, 4, null]"); + + // ChunkedArray with 1 chunk + { + const ChunkedArray chunked_array(chunk); + + static const char* expected = R"expected([ + [ + 1 | + 2 | + null | + 4 | + null + ] +])expected"; + CheckStream(chunked_array, options, expected); + } + + // ChunkedArray with 2 chunks + { + const ChunkedArray chunked_array({chunk, chunk}); + + static const char* expected = R"expected([ + [ + 1 | + 2 | + null | + 4 | + null + ]; + [ + 1 | + 2 | + null | + 4 | + null + ] +])expected"; + + CheckStream(chunked_array, options, expected); + } +} + TEST_F(TestPrettyPrint, TablePrimitive) { std::shared_ptr int_field = field("column", int32()); auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]");