Skip to content

Commit

Permalink
ARROW-17995: [C++] Fix json decimals not being rescaled based on the …
Browse files Browse the repository at this point in the history
…explicit schema (#14380)

arrow::json::DecimalConverter::Convert() currently read the decimal values using the parsed precision and scale. This produces wrong results if the parsed scale doesn't match the output scale (specified by explicit schema).

More details on how to reproduce the issue are in the JIRA description. This patch fixes json::DecimalConverter::Convert() to rescale the values based on the output scale. Unit tests are added as well.

Lead-authored-by: stiga-huang <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
stiga-huang and pitrou authored Oct 15, 2022
1 parent aeba616 commit 289e0c9
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 7 deletions.
24 changes: 21 additions & 3 deletions cpp/src/arrow/json/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,28 @@ class DecimalConverter : public PrimitiveConverter {
using Builder = typename TypeTraits<T>::BuilderType;
Builder builder(out_type_, pool_);
RETURN_NOT_OK(builder.Resize(dict_array.indices()->length()));
const auto& decimal_type(checked_cast<const DecimalType&>(*out_type_));
int32_t out_precision = decimal_type.precision();
int32_t out_scale = decimal_type.scale();

auto visit_valid = [&builder](string_view repr) {
ARROW_ASSIGN_OR_RAISE(value_type value,
TypeTraits<T>::BuilderType::ValueType::FromString(repr));
auto visit_valid = [&](string_view repr) {
value_type value;
int32_t precision, scale;
RETURN_NOT_OK(TypeTraits<T>::BuilderType::ValueType::FromString(
repr, &value, &precision, &scale));
if (precision > out_precision) {
return GenericConversionError(*out_type_, ": ", repr, " requires precision ",
precision);
}
if (scale != out_scale) {
auto result = value.Rescale(scale, out_scale);
if (ARROW_PREDICT_FALSE(!result.ok())) {
return GenericConversionError(*out_type_, ": ", repr, " requires scale ",
scale);
} else {
value = result.MoveValueUnsafe();
}
}
builder.UnsafeAppend(value);
return Status::OK();
};
Expand Down
62 changes: 58 additions & 4 deletions cpp/src/arrow/json/converter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "arrow/json/converter.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <string>
Expand Down Expand Up @@ -190,9 +191,14 @@ TEST(ConverterTest, Decimal128And256) {
options.explicit_schema = schema({field("", decimal_type)});

std::string json_source = R"(
{"" : "02.0000000000"}
{"" : "30.0000000000"}
)";
{"" : "02.0000000000"}
{"" : "30.0000000000"}
{"" : "30.0123456789"}
{"" : "30.012345678900"}
{"" : "30.0123"}
{"" : "0.012345678"}
{"" : "1234567890123456789012345678.0123456789"}
)";

std::shared_ptr<StructArray> parse_array;
ASSERT_OK(ParseFromString(options, json_source, &parse_array));
Expand All @@ -204,11 +210,59 @@ TEST(ConverterTest, Decimal128And256) {
// assert equality
auto expected = ArrayFromJSON(decimal_type, R"([
"02.0000000000",
"30.0000000000"])");
"30.0000000000",
"30.0123456789",
"30.0123456789",
"30.0123000000",
"0.0123456780",
"1234567890123456789012345678.0123456789"
])");

AssertArraysEqual(*expected, *converted);
}
}

TEST(ConverterTest, Decimal128And256ScaleError) {
for (auto decimal_type : {decimal128(38, 10), decimal256(38, 10)}) {
ParseOptions options;
options.explicit_schema = schema({field("", decimal_type)});

std::string json_source = R"(
{"" : "30.0123456789001"}
)";

std::shared_ptr<StructArray> parse_array;
ASSERT_OK(ParseFromString(options, json_source, &parse_array));

std::string error_msg = "Failed of conversion of JSON to " +
decimal_type->ToString() +
": 30.0123456789001 requires scale 13";
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr(error_msg),
Convert(decimal_type, parse_array->GetFieldByName("")));
}
}

TEST(ConverterTest, Decimal128And256PrecisionError) {
for (auto decimal_type : {decimal128(38, 10), decimal256(38, 10)}) {
ParseOptions options;
options.explicit_schema = schema({field("", decimal_type)});

std::string json_source = R"(
{"" : "123456789012345678901234567890.0123456789"}
)";

std::shared_ptr<StructArray> parse_array;
ASSERT_OK(ParseFromString(options, json_source, &parse_array));

std::string error_msg =
"Invalid: Failed of conversion of JSON to " + decimal_type->ToString() +
": 123456789012345678901234567890.0123456789 requires precision 40";
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr(error_msg),
Convert(decimal_type, parse_array->GetFieldByName("")));
}
}

} // namespace json
} // namespace arrow
16 changes: 16 additions & 0 deletions python/pyarrow/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

from collections import OrderedDict
from decimal import Decimal
import io
import itertools
import json
Expand Down Expand Up @@ -243,6 +244,21 @@ def test_reconcile_accross_blocks(self):
# Check that the issue was exercised
assert table.column("a").num_chunks > 1

def test_explicit_schema_decimal(self):
rows = (b'{"a": 1}\n'
b'{"a": 1.45}\n'
b'{"a": -23.456}\n'
b'{}\n')
expected = {
'a': [Decimal("1"), Decimal("1.45"), Decimal("-23.456"), None],
}
for type_factory in (pa.decimal128, pa.decimal256):
schema = pa.schema([('a', type_factory(9, 4))])
opts = ParseOptions(explicit_schema=schema)
table = self.read_bytes(rows, parse_options=opts)
assert table.schema == schema
assert table.to_pydict() == expected

def test_explicit_schema_with_unexpected_behaviour(self):
# infer by default
rows = (b'{"foo": "bar", "num": 0}\n'
Expand Down

0 comments on commit 289e0c9

Please sign in to comment.