Skip to content

Commit

Permalink
ARROW-6760: [C++] More informative error messages for JSON parsing er…
Browse files Browse the repository at this point in the history
…rors

Parse errors will now include the path to whichever field caused the parse error and the row number where the parse error occurred.

Closes apache#5571 from bkietz/6760-JSON-improve-error-messag and squashes the following commits:

5b46b45 <Benjamin Kietzman> typo fix
175de1e <Benjamin Kietzman> amend message assertion for extra error context
0380a85 <Benjamin Kietzman> fix field_index logic
3386db0 <Benjamin Kietzman> change field names in FailOnNestedInconvertible test
7ad6241 <Benjamin Kietzman> -Wsign-compare
079d08c <Benjamin Kietzman> ARROW-6760:  More informative error messages for JSON parsing errors

Authored-by: Benjamin Kietzman <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
  • Loading branch information
bkietz committed Oct 4, 2019
1 parent 368562b commit 86eaa6b
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 18 deletions.
65 changes: 52 additions & 13 deletions cpp/src/arrow/json/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,6 @@ static Status ParseError(T&&... t) {
return Status::Invalid("JSON parse error: ", std::forward<T>(t)...);
}

static Status KindChangeError(Kind::type from, Kind::type to) {
return ParseError("A column changed from ", Kind::Name(from), " to ", Kind::Name(to));
}

const std::string& Kind::Name(Kind::type kind) {
static const std::string names[] = {"null", "boolean", "number",
"string", "array", "object"};
Expand Down Expand Up @@ -327,6 +323,15 @@ class RawArrayBuilder<Kind::kObject> {

Status AppendNull(int64_t count) { return null_bitmap_builder_.Append(count, false); }

std::string FieldName(int i) const {
for (const auto& name_index : name_to_index_) {
if (name_index.second == i) {
return name_index.first;
}
}
return "";
}

int GetFieldIndex(const std::string& name) const {
auto it = name_to_index_.find(name);
if (it == name_to_index_.end()) {
Expand Down Expand Up @@ -649,6 +654,25 @@ class HandlerBase : public BlockParser,
return builder_set_.Finish(scalar_values, builder_, parsed);
}

/// \brief Emit path of current field for debugging purposes
std::string Path() {
std::string path;
for (size_t i = 0; i < builder_stack_.size(); ++i) {
auto builder = builder_stack_[i];
if (builder.kind == Kind::kArray) {
path += "/[]";
} else {
auto struct_builder = Cast<Kind::kObject>(builder);
auto field_index = field_index_;
if (i + 1 < field_index_stack_.size()) {
field_index = field_index_stack_[i + 1];
}
path += "/" + struct_builder->FieldName(field_index);
}
}
return path;
}

protected:
template <typename Handler, typename Stream>
Status DoParse(Handler& handler, Stream&& json) {
Expand All @@ -672,9 +696,7 @@ class HandlerBase : public BlockParser,
return handler.Error();
default:
// rj emitted an error
// FIXME(bkietz) report more error data (at least the byte range of the current
// block, and maybe the path to the most recently parsed value?)
return ParseError(rj::GetParseError_En(ok.Code()));
return ParseError(rj::GetParseError_En(ok.Code()), " in row ", num_rows_);
}
}
return Status::Invalid("Exceeded maximum rows");
Expand Down Expand Up @@ -722,12 +744,17 @@ class HandlerBase : public BlockParser,
///
/// sets the field builder with name key, or returns false if
/// there is no field with that name
bool SetFieldBuilder(string_view key) {
bool SetFieldBuilder(string_view key, bool* duplicate_keys) {
auto parent = Cast<Kind::kObject>(builder_stack_.back());
field_index_ = parent->GetFieldIndex(std::string(key));
if (ARROW_PREDICT_FALSE(field_index_ == -1)) {
return false;
}
*duplicate_keys = !absent_fields_stack_[field_index_];
if (*duplicate_keys) {
status_ = ParseError("Column(", Path(), ") was specified twice in row ", num_rows_);
return false;
}
builder_ = parent->field_builder(field_index_);
absent_fields_stack_[field_index_] = false;
return true;
Expand Down Expand Up @@ -790,7 +817,8 @@ class HandlerBase : public BlockParser,
}

Status IllegallyChangedTo(Kind::type illegally_changed_to) {
return KindChangeError(builder_.kind, illegally_changed_to);
return ParseError("Column(", Path(), ") changed from ", Kind::Name(builder_.kind),
" to ", Kind::Name(illegally_changed_to), " in row ", num_rows_);
}

/// Reserve storage for scalars, these can occupy almost all of the JSON buffer
Expand Down Expand Up @@ -834,10 +862,13 @@ class Handler<UnexpectedFieldBehavior::Error> : public HandlerBase {
///
/// if an unexpected field is encountered, emit a parse error and bail
bool Key(const char* key, rj::SizeType len, ...) {
if (ARROW_PREDICT_TRUE(SetFieldBuilder(string_view(key, len)))) {
bool duplicate_keys = false;
if (ARROW_PREDICT_FALSE(SetFieldBuilder(string_view(key, len), &duplicate_keys))) {
return true;
}
status_ = ParseError("unexpected field");
if (!duplicate_keys) {
status_ = ParseError("unexpected field");
}
return false;
}
};
Expand Down Expand Up @@ -895,9 +926,13 @@ class Handler<UnexpectedFieldBehavior::Ignore> : public HandlerBase {
if (Skipping()) {
return true;
}
if (ARROW_PREDICT_TRUE(SetFieldBuilder(string_view(key, len)))) {
bool duplicate_keys = false;
if (ARROW_PREDICT_TRUE(SetFieldBuilder(string_view(key, len), &duplicate_keys))) {
return true;
}
if (ARROW_PREDICT_FALSE(duplicate_keys)) {
return false;
}
skip_depth_ = depth_;
return true;
}
Expand Down Expand Up @@ -982,9 +1017,13 @@ class Handler<UnexpectedFieldBehavior::InferType> : public HandlerBase {
/// (parent.length - 1) leading nulls. The next value parsed
/// will probably trigger promotion of this field from null
bool Key(const char* key, rj::SizeType len, ...) {
if (ARROW_PREDICT_TRUE(SetFieldBuilder(string_view(key, len)))) {
bool duplicate_keys = false;
if (ARROW_PREDICT_TRUE(SetFieldBuilder(string_view(key, len), &duplicate_keys))) {
return true;
}
if (ARROW_PREDICT_FALSE(duplicate_keys)) {
return false;
}
auto struct_builder = Cast<Kind::kObject>(builder_stack_.back());
auto leading_nulls = static_cast<uint32_t>(struct_builder->length() - 1);
builder_ = BuilderPtr(Kind::kNull, leading_nulls, true);
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/json/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ class ARROW_EXPORT BlockParser {
/// \brief Return the number of parsed rows
int32_t num_rows() const { return num_rows_; }

/// \brief Construct a BlockParser
///
/// \param[in] pool MemoryPool to use when constructing parsed array
/// \param[in] options ParseOptions to use when parsing JSON
/// \param[out] out constructed BlockParser
static Status Make(MemoryPool* pool, const ParseOptions& options,
std::unique_ptr<BlockParser>* out);

Expand Down
51 changes: 46 additions & 5 deletions cpp/src/arrow/json/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <utility>
#include <vector>

#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include "arrow/json/options.h"
Expand Down Expand Up @@ -144,14 +145,54 @@ TEST(BlockParserWithSchema, SkipFieldsOutsideSchema) {
"[\"thing\", null, \"\xe5\xbf\x8d\", null]"});
}

TEST(BlockParserWithSchema, FailOnInconvertible) {
auto options = ParseOptions::Defaults();
options.explicit_schema = schema({field("a", int32())});
options.unexpected_field_behavior = UnexpectedFieldBehavior::Ignore;
class BlockParserTypeError : public ::testing::TestWithParam<UnexpectedFieldBehavior> {
public:
ParseOptions Options(std::shared_ptr<Schema> explicit_schema) {
auto options = ParseOptions::Defaults();
options.explicit_schema = std::move(explicit_schema);
options.unexpected_field_behavior = GetParam();
return options;
}
};

TEST_P(BlockParserTypeError, FailOnInconvertible) {
auto options = Options(schema({field("a", int32())}));
std::shared_ptr<Array> parsed;
Status error = ParseFromString(options, "{\"a\":0}\n{\"a\":true}", &parsed);
ASSERT_RAISES(Invalid, error);
EXPECT_THAT(
error.message(),
testing::StartsWith(
"JSON parse error: Column(/a) changed from number to boolean in row 1"));
}

TEST_P(BlockParserTypeError, FailOnNestedInconvertible) {
auto options = Options(schema({field("a", list(struct_({field("b", int32())})))}));
std::shared_ptr<Array> parsed;
Status error =
ParseFromString(options, "{\"a\":[{\"b\":0}]}\n{\"a\":[{\"b\":true}]}", &parsed);
ASSERT_RAISES(Invalid, error);
EXPECT_THAT(
error.message(),
testing::StartsWith(
"JSON parse error: Column(/a/[]/b) changed from number to boolean in row 1"));
}

TEST_P(BlockParserTypeError, FailOnDuplicateKeys) {
std::shared_ptr<Array> parsed;
ASSERT_RAISES(Invalid, ParseFromString(options, "{\"a\":0}\n{\"a\":true}", &parsed));
Status error = ParseFromString(Options(schema({field("a", int32())})),
"{\"a\":0, \"a\":1}\n", &parsed);
ASSERT_RAISES(Invalid, error);
EXPECT_THAT(
error.message(),
testing::StartsWith("JSON parse error: Column(/a) was specified twice in row 0"));
}

INSTANTIATE_TEST_CASE_P(BlockParserTypeError, BlockParserTypeError,
::testing::Values(UnexpectedFieldBehavior::Ignore,
UnexpectedFieldBehavior::Error,
UnexpectedFieldBehavior::InferType));

TEST(BlockParserWithSchema, Nested) {
auto options = ParseOptions::Defaults();
options.explicit_schema = schema({field("yo", utf8()), field("arr", list(int32())),
Expand Down

0 comments on commit 86eaa6b

Please sign in to comment.