Skip to content

Commit

Permalink
apacheGH-44101: [C++][Parquet] Tools: Debug Print for JSON should be a valid JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Oct 25, 2024
1 parent 802a846 commit 2fc8e96
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 27 deletions.
56 changes: 30 additions & 26 deletions cpp/src/parquet/printer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
bool hasRow;
do {
hasRow = false;
for (auto scanner : scanners) {
for (const auto& scanner : scanners) {
if (scanner->HasNext()) {
hasRow = true;
scanner->PrintNext(stream, COL_WIDTH);
Expand All @@ -246,7 +246,7 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
<< file_metadata->schema()->group_node()->field_count() << "\",\n";
stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";

if (selected_columns.size() == 0) {
if (selected_columns.empty()) {
for (int i = 0; i < file_metadata->num_columns(); i++) {
selected_columns.push_back(i);
}
Expand Down Expand Up @@ -299,70 +299,74 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
<< column_chunk->num_values() << "\", "
<< "\"StatsSet\": ";
if (column_chunk->is_stats_set()) {
stream << "\"True\", \"Stats\": {";
stream << R"("True", "Stats": {)";
if (stats->HasNullCount()) {
stream << "\"NumNulls\": \"" << stats->null_count();
stream << R"("NumNulls": ")" << stats->null_count() << "\"";
}
if (stats->HasDistinctCount()) {
stream << "\", "
<< "\"DistinctValues\": \"" << stats->distinct_count();
stream << ", "
<< R"("DistinctValues": ")" << stats->distinct_count() << "\"";
}
if (stats->HasMinMax()) {
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << "\", "
<< "\"Max\": \"" << FormatStatValue(descr->physical_type(), max)
stream << ", "
<< R"("Max": ")" << FormatStatValue(descr->physical_type(), max)
<< "\", "
<< "\"Min\": \"" << FormatStatValue(descr->physical_type(), min);
<< R"("Min": ")" << FormatStatValue(descr->physical_type(), min) << "\"";
}
stream << "\" },";
stream << " },";
} else {
stream << "\"False\",";
}
stream << "\n \"Compression\": \""
<< ::arrow::internal::AsciiToUpper(
Codec::GetCodecAsString(column_chunk->compression()))
<< "\", \"Encodings\": \"";
<< R"(", "Encodings": )";
stream << "\"";
if (column_chunk->encoding_stats().empty()) {
for (auto encoding : column_chunk->encodings()) {
stream << EncodingToString(encoding) << " ";
}
} else {
PrintPageEncodingStats(stream, column_chunk->encoding_stats());
}
stream << "\", "
<< "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
<< "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
stream << "\"";
stream << ", "
<< R"("UncompressedSize": ")" << column_chunk->total_uncompressed_size()
<< R"(", "CompressedSize": ")" << column_chunk->total_compressed_size()
<< "\"";

if (column_chunk->bloom_filter_offset()) {
// Output BloomFilter {offset, length}
stream << "\", BloomFilter {"
<< "\"offset\": \"" << column_chunk->bloom_filter_offset().value();
stream << ", \"BloomFilter\": {"
<< R"("offset": ")" << column_chunk->bloom_filter_offset().value() << "\"";
if (column_chunk->bloom_filter_length()) {
stream << "\", \"length\": \"" << column_chunk->bloom_filter_length().value();
stream << R"(, "length": ")" << column_chunk->bloom_filter_length().value()
<< "\"";
}
stream << "\"}";
stream << "}";
}

if (column_chunk->GetColumnIndexLocation()) {
auto location = column_chunk->GetColumnIndexLocation().value();
// Output ColumnIndex {offset, length}
stream << "\", ColumnIndex {"
<< "\"offset\": \"" << location.offset;
stream << "\", \"length\": \"" << location.length;
stream << ", \"ColumnIndex\": {"
<< R"("offset": ")" << location.offset;
stream << R"(", "length": ")" << location.length;
stream << "\"}";
}

if (column_chunk->GetOffsetIndexLocation()) {
auto location = column_chunk->GetOffsetIndexLocation().value();
// Output OffsetIndex {offset, length}
stream << "\", OffsetIndex {"
<< "\"offset\": \"" << location.offset;
stream << "\", \"length\": \"" << location.length;
stream << "\"}";
stream << ", \"OffsetIndex\": {"
<< R"("offset": ")" << location.offset << "\"";
stream << R"(, "length": ")" << location.length << "\"";
stream << "}";
}

// end of a ColumnChunk
stream << "\" }";
stream << " }";
c1++;
if (c1 != static_cast<int>(selected_columns.size())) {
stream << ",\n";
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/printer.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class PARQUET_EXPORT ParquetFilePrinter {

public:
explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
~ParquetFilePrinter() {}
~ParquetFilePrinter() = default;

void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
bool print_values = false, bool format_dump = false,
Expand Down
32 changes: 32 additions & 0 deletions cpp/src/parquet/reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep

#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <rapidjson/stringbuffer.h>

#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/io/file.h"
Expand All @@ -47,6 +53,8 @@
#include "parquet/printer.h"
#include "parquet/test_util.h"

namespace rj = arrow::rapidjson;

using arrow::internal::checked_pointer_cast;
using arrow::internal::Zip;

Expand Down Expand Up @@ -1172,6 +1180,30 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) {
EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
}

// GH-44101: Test that JSON output is valid JSON
TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
  // Parse the printer output with rapidjson; report a parse failure (with
  // offset and rapidjson's error text) as an Invalid status.
  auto validate_json = [](std::string_view text) -> ::arrow::Status {
    constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
    rj::Document document;
    document.Parse<kParseFlags>(text.data(), text.length());
    if (!document.HasParseError()) {
      return ::arrow::Status::OK();
    }
    return ::arrow::Status::Invalid("JSON parse error at offset ",
                                    document.GetErrorOffset(), ": ",
                                    rj::GetParseError_En(document.GetParseError()));
  };
  // Local test files whose JSONPrint output must parse cleanly.
  const std::vector<std::string_view> files_to_check = {
      "data_index_bloom_encoding_with_length.parquet",
      "data_index_bloom_encoding_stats.parquet", "alltypes_tiny_pages_plain.parquet",
      "concatenated_gzip_members.parquet", "nulls.snappy.parquet"};
  for (const auto& file : files_to_check) {
    const std::string json_content = ReadFromLocalFile(file);
    ASSERT_OK(validate_json(json_content))
        << "Invalid JSON output for file: " << file << ", content:" << json_content;
  }
}

TEST(TestFileReader, BufferedReadsWithDictionary) {
const int num_rows = 1000;

Expand Down

0 comments on commit 2fc8e96

Please sign in to comment.