Skip to content

Commit

Permalink
Use default value for decimal precision in parquet writer when not sp…
Browse files Browse the repository at this point in the history
…ecified (#9963)

Fixes #9962

Authors:
  - Devavret Makkar (https://github.com/devavret)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)

URL: #9963
  • Loading branch information
devavret authored Jan 7, 2022
1 parent 42a0e55 commit 7656277
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 23 deletions.
20 changes: 12 additions & 8 deletions cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,14 @@ void BM_parq_read_varying_options(benchmark::State& state)
auto const use_pandas_metadata = (flags & 2) != 0;
auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))};

auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
col_sel);
auto const data_types = dtypes_for_column_selection(
get_type_or_group({static_cast<int32_t>(type_group_id::INTEGRAL),
static_cast<int32_t>(type_group_id::FLOATING_POINT),
static_cast<int32_t>(type_group_id::FIXED_POINT),
static_cast<int32_t>(type_group_id::TIMESTAMP),
static_cast<int32_t>(cudf::type_id::STRING),
static_cast<int32_t>(cudf::type_id::LIST)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();

Expand Down Expand Up @@ -181,6 +181,9 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection)
->Unit(benchmark::kMillisecond)
->UseManualTime();

// Disabled until we add an API to read metadata from a parquet file and determine num row groups.
// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
/*
BENCHMARK_DEFINE_F(ParquetRead, row_selection)
(::benchmark::State& state) { BM_parq_read_varying_options(state); }
BENCHMARK_REGISTER_F(ParquetRead, row_selection)
Expand All @@ -191,6 +194,7 @@ BENCHMARK_REGISTER_F(ParquetRead, row_selection)
{int32_t(cudf::type_id::EMPTY)}})
->Unit(benchmark::kMillisecond)
->UseManualTime();
*/

BENCHMARK_DEFINE_F(ParquetRead, misc_options)
(::benchmark::State& state) { BM_parq_read_varying_options(state); }
Expand Down
27 changes: 15 additions & 12 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -447,25 +447,28 @@ struct leaf_schema_fn {
std::enable_if_t<cudf::is_fixed_point<T>(), void> operator()()
{
if (std::is_same_v<T, numeric::decimal32>) {
col_schema.type = Type::INT32;
col_schema.stats_dtype = statistics_dtype::dtype_int32;
col_schema.type = Type::INT32;
col_schema.stats_dtype = statistics_dtype::dtype_int32;
col_schema.decimal_precision = 9;
} else if (std::is_same_v<T, numeric::decimal64>) {
col_schema.type = Type::INT64;
col_schema.stats_dtype = statistics_dtype::dtype_decimal64;
col_schema.type = Type::INT64;
col_schema.stats_dtype = statistics_dtype::dtype_decimal64;
col_schema.decimal_precision = 18;
} else if (std::is_same_v<T, numeric::decimal128>) {
col_schema.type = Type::FIXED_LEN_BYTE_ARRAY;
col_schema.type_length = sizeof(__int128_t);
col_schema.stats_dtype = statistics_dtype::dtype_decimal128;
col_schema.type = Type::FIXED_LEN_BYTE_ARRAY;
col_schema.type_length = sizeof(__int128_t);
col_schema.stats_dtype = statistics_dtype::dtype_decimal128;
col_schema.decimal_precision = 38;
} else {
CUDF_FAIL("Unsupported fixed point type for parquet writer");
}
col_schema.converted_type = ConvertedType::DECIMAL;
col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs
CUDF_EXPECTS(col_meta.is_decimal_precision_set(),
"Precision must be specified for decimal columns");
CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale,
"Precision must be equal to or greater than scale!");
col_schema.decimal_precision = col_meta.get_decimal_precision();
if (col_meta.is_decimal_precision_set()) {
CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale,
"Precision must be equal to or greater than scale!");
col_schema.decimal_precision = col_meta.get_decimal_precision();
}
}

template <typename T>
Expand Down
3 changes: 0 additions & 3 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2021,9 +2021,6 @@ TEST_F(ParquetWriterTest, DecimalWrite)
cudf_io::parquet_writer_options args =
cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, table);

// verify failure if no decimal precision given
EXPECT_THROW(cudf_io::write_parquet(args), cudf::logic_error);

cudf_io::table_input_metadata expected_metadata(table);

// verify failure if too small a precision is given
Expand Down

0 comments on commit 7656277

Please sign in to comment.