Skip to content

Commit

Permalink
ARROW-13242: [C++] Improve random generation of decimal arrays
Browse files Browse the repository at this point in the history
- Allow precisions larger than a single uint64
- Implement decimal256 generation
- Add validity tests

Closes #10643 from pitrou/ARROW-13242-dec-random-gen

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
pitrou committed Jul 21, 2021
1 parent 55891ed commit 60f49f1
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 54 deletions.
2 changes: 2 additions & 0 deletions cpp/src/arrow/array/builder_decimal.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ namespace arrow {
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal128Type;
using ValueType = Decimal128;

explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
Expand Down Expand Up @@ -61,6 +62,7 @@ class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal256Type;
using ValueType = Decimal256;

explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
Expand Down
119 changes: 95 additions & 24 deletions cpp/src/arrow/testing/random.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <gtest/gtest.h>

#include <algorithm>
#include <array>
#include <cmath>
#include <limits>
#include <memory>
Expand Down Expand Up @@ -229,34 +230,95 @@ std::shared_ptr<Array> RandomArrayGenerator::Float64(int64_t size, double min, d
#undef PRIMITIVE_RAND_INTEGER_IMPL
#undef PRIMITIVE_RAND_IMPL

std::shared_ptr<Array> RandomArrayGenerator::Decimal128(std::shared_ptr<DataType> type,
int64_t size,
double null_probability) {
const auto& decimal_type = checked_cast<const Decimal128Type&>(*type);
const auto digits = decimal_type.precision();
if (digits > 18) {
// More than 18 digits + sign don't fit in a int64_t
ABORT_NOT_OK(
Status::NotImplemented("random decimal128 generation with precision > 18"));
}
namespace {

// Generate logical values as integers, then convert them
const auto max = static_cast<int64_t>(std::llround(std::pow(10.0, digits)) - 1);
const auto int_array =
checked_pointer_cast<Int64Array>(Int64(size, -max, max, null_probability));
// A generic generator for random decimal arrays
template <typename DecimalType>
struct DecimalGenerator {
using DecimalBuilderType = typename TypeTraits<DecimalType>::BuilderType;
using DecimalValue = typename DecimalBuilderType::ValueType;

std::shared_ptr<DataType> type_;
RandomArrayGenerator* rng_;

// Return the largest unsigned integer that has at most `digits` decimal
// digits, i.e. 10**digits - 1.
//
// The power of ten is accumulated with exact integer multiplications rather
// than through std::pow(), so the result does not depend on how accurately
// the platform's floating-point pow() is rounded.  10**19 is the largest
// power of ten that fits in a uint64_t, so `digits` is expected to be in
// [0, 19]; a non-positive `digits` yields 0.
static uint64_t MaxDecimalInteger(int32_t digits) {
  uint64_t power_of_ten = 1;
  for (int32_t i = 0; i < digits; ++i) {
    power_of_ten *= 10;
  }
  return power_of_ten - 1;
}

// Build a random decimal array of `size` elements of type `type_`.
//
// A slot is null wherever the randomly generated sign array is null (that
// array is generated with `null_probability`).  Otherwise the slot's
// magnitude is assembled from one random uint64 component per 8 bytes of
// decimal storage, and the value is negated when the sign value is true.
// Builder failures are passed to ABORT_NOT_OK.
std::shared_ptr<Array> MakeRandomArray(int64_t size, double null_probability) {
// 10**19 fits in a 64-bit unsigned integer
static constexpr int32_t kMaxDigitsInInteger = 19;
// One uint64 component per 8 bytes of decimal storage.
static constexpr int kNumIntegers = DecimalType::kByteWidth / 8;

// Compile-time check that kNumIntegers equals the number of
// kMaxDigitsInInteger-digit components needed to hold kMaxPrecision digits
// (i.e. ceil(kMaxPrecision / kMaxDigitsInInteger)).
static_assert(
kNumIntegers ==
(DecimalType::kMaxPrecision + kMaxDigitsInInteger - 1) / kMaxDigitsInInteger,
"inconsistent decimal metadata: kMaxPrecision doesn't match kByteWidth");

// First generate separate random values for individual components:
// boolean sign (including null-ness), and uint64 "digits" in big endian order.
const auto& decimal_type = checked_cast<const DecimalType&>(*type_);

const auto sign_array = checked_pointer_cast<BooleanArray>(
rng_->Boolean(size, /*true_probability=*/0.5, null_probability));
std::array<std::shared_ptr<UInt64Array>, kNumIntegers> digit_arrays;

// Hand out the precision's digits starting from the last (least significant)
// component, at most kMaxDigitsInInteger per component.  Once the digits are
// exhausted, the remaining leading components are generated with a maximum
// of MaxDecimalInteger(0).
auto remaining_digits = decimal_type.precision();
for (int i = kNumIntegers - 1; i >= 0; --i) {
const auto digits = std::min(kMaxDigitsInInteger, remaining_digits);
digit_arrays[i] = checked_pointer_cast<UInt64Array>(
rng_->UInt64(size, 0, MaxDecimalInteger(digits)));
DCHECK_EQ(digit_arrays[i]->null_count(), 0);
remaining_digits -= digits;
}

Decimal128Builder builder(type);
ABORT_NOT_OK(builder.Reserve(size));
for (int64_t i = 0; i < size; ++i) {
if (int_array->IsValid(i)) {
builder.UnsafeAppend(::arrow::Decimal128(int_array->Value(i)));
} else {
builder.UnsafeAppendNull();
// Second compute decimal values from the individual components,
// building up a decimal array.
DecimalBuilderType builder(type_);
ABORT_NOT_OK(builder.Reserve(size));

// Scale multiplier for kMaxDigitsInInteger digits (presumably 10**19 --
// confirm against GetScaleMultiplier); multiplying by it makes room for the
// next component.
const DecimalValue kDigitsMultiplier =
DecimalValue::GetScaleMultiplier(kMaxDigitsInInteger);

for (int64_t i = 0; i < size; ++i) {
if (sign_array->IsValid(i)) {
// Accumulate components from index 0 (most significant) to the last one.
DecimalValue dec_value{0};
for (int j = 0; j < kNumIntegers; ++j) {
dec_value =
dec_value * kDigitsMultiplier + DecimalValue(digit_arrays[j]->Value(i));
}
// A true sign value selects the negated magnitude.
if (sign_array->Value(i)) {
builder.UnsafeAppend(dec_value.Negate());
} else {
builder.UnsafeAppend(dec_value);
}
} else {
// Null-ness of the result follows null-ness of the sign array.
builder.UnsafeAppendNull();
}
}
std::shared_ptr<Array> array;
ABORT_NOT_OK(builder.Finish(&array));
return array;
}
std::shared_ptr<Array> array;
ABORT_NOT_OK(builder.Finish(&array));
return array;
};

} // namespace

std::shared_ptr<Array> RandomArrayGenerator::Decimal128(std::shared_ptr<DataType> type,
int64_t size,
double null_probability) {
DecimalGenerator<Decimal128Type> gen{type, this};
return gen.MakeRandomArray(size, null_probability);
}

std::shared_ptr<Array> RandomArrayGenerator::Decimal256(std::shared_ptr<DataType> type,
int64_t size,
double null_probability) {
DecimalGenerator<Decimal256Type> gen{type, this};
return gen.MakeRandomArray(size, null_probability);
}

template <typename TypeClass>
Expand Down Expand Up @@ -623,6 +685,11 @@ struct RandomArrayGeneratorOfImpl {
return Status::OK();
}

// Decimal256 fields: produce the array with the dedicated Decimal256
// generator and store it as the visitor's output.
Status Visit(const Decimal256Type&) {
  const auto generated = rag_->Decimal256(type_, size_, null_probability_);
  out_ = generated;
  return Status::OK();
}

Status Visit(const Decimal128Type&) {
out_ = rag_->Decimal128(type_, size_, null_probability_);
return Status::OK();
Expand Down Expand Up @@ -779,7 +846,11 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t
}

case Type::type::DECIMAL128:
return Decimal128(field.type(), length, null_probability);

case Type::type::DECIMAL256:
return Decimal256(field.type(), length, null_probability);

case Type::type::FIXED_SIZE_BINARY: {
auto byte_width =
internal::checked_pointer_cast<FixedSizeBinaryType>(field.type())->byte_width();
Expand Down
49 changes: 30 additions & 19 deletions cpp/src/arrow/testing/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
///
/// \param[in] size the size of the array to generate
/// \param[in] true_probability the probability of a value being 1 / bit-set
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> Boolean(int64_t size, double true_probability,
Expand All @@ -66,7 +66,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> UInt8(int64_t size, uint8_t min, uint8_t max,
Expand All @@ -77,7 +77,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> Int8(int64_t size, int8_t min, int8_t max,
Expand All @@ -88,7 +88,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> UInt16(int64_t size, uint16_t min, uint16_t max,
Expand All @@ -99,7 +99,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> Int16(int64_t size, int16_t min, int16_t max,
Expand All @@ -110,7 +110,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> UInt32(int64_t size, uint32_t min, uint32_t max,
Expand All @@ -121,7 +121,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> Int32(int64_t size, int32_t min, int32_t max,
Expand All @@ -132,7 +132,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> UInt64(int64_t size, uint64_t min, uint64_t max,
Expand All @@ -143,7 +143,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> Int64(int64_t size, int64_t min, int64_t max,
Expand All @@ -154,7 +154,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the distribution
/// \param[in] max the upper bound of the distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> Float16(int64_t size, int16_t min, int16_t max,
Expand All @@ -165,8 +165,8 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] nan_probability the probability of a row being NaN
/// \param[in] null_probability the probability of a value being null
/// \param[in] nan_probability the probability of a value being NaN
///
/// \return a generated Array
std::shared_ptr<Array> Float32(int64_t size, float min, float max,
Expand All @@ -177,8 +177,8 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] size the size of the array to generate
/// \param[in] min the lower bound of the uniform distribution
/// \param[in] max the upper bound of the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] nan_probability the probability of a row being NaN
/// \param[in] null_probability the probability of a value being null
/// \param[in] nan_probability the probability of a value being NaN
///
/// \return a generated Array
std::shared_ptr<Array> Float64(int64_t size, double min, double max,
Expand Down Expand Up @@ -231,12 +231,23 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] type the type of the array to generate
/// (must be an instance of Decimal128Type)
/// \param[in] size the size of the array to generate
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> Decimal128(std::shared_ptr<DataType> type, int64_t size,
double null_probability = 0);

/// \brief Generate a random Decimal256Array
///
/// Non-null values have at most as many decimal digits as the precision of
/// `type`, and are negated with probability 0.5.
///
/// \param[in] type the type of the array to generate
/// (must be an instance of Decimal256Type)
/// \param[in] size the size of the array to generate
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> Decimal256(std::shared_ptr<DataType> type, int64_t size,
double null_probability = 0);

/// \brief Generate an array of offsets (for use in e.g. ListArray::FromArrays)
///
/// \param[in] size the size of the array to generate
Expand All @@ -261,7 +272,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// determined by the uniform distribution
/// \param[in] max_length the upper bound of the string length
/// determined by the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> String(int64_t size, int32_t min_length, int32_t max_length,
Expand All @@ -274,7 +285,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// determined by the uniform distribution
/// \param[in] max_length the upper bound of the string length
/// determined by the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> LargeString(int64_t size, int32_t min_length, int32_t max_length,
Expand All @@ -289,7 +300,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// determined by the uniform distribution
/// \param[in] max_length the upper bound of the string length
/// determined by the uniform distribution
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> StringWithRepeats(int64_t size, int64_t unique,
Expand All @@ -305,7 +316,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
///
/// \param[in] size the size of the array to generate
/// \param[in] byte_width the byte width of fixed-size binary items
/// \param[in] null_probability the probability of a row being null
/// \param[in] null_probability the probability of a value being null
///
/// \return a generated Array
std::shared_ptr<Array> FixedSizeBinary(int64_t size, int32_t byte_width,
Expand Down
Loading

0 comments on commit 60f49f1

Please sign in to comment.