rapidsai · rapids-bot · Dec 8, 2021 · Nov 24, 2021 · Nov 24, 2021 · Dec 2, 2021
@@ -72,6 +72,7 @@ class orc_reader_options {
 
   // Columns that should be read as Decimal128
   std::vector<std::string> _decimal128_columns;
+  bool _enable_decimal128 = true;
 
   friend orc_reader_options_builder;
 
@@ -151,6 +152,11 @@ class orc_reader_options {
    */
   std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; }
 
+  /**
+   * @brief Whether to use row index to speed-up reading.
+   */
+  bool is_enabled_decimal128() const { return _enable_decimal128; }
+
   // Setters
 
   /**
@@ -225,6 +231,13 @@ class orc_reader_options {
     _decimal_cols_as_float = std::move(val);
   }
 
+  /**
+   * @brief Enable/Disable the use of decimal128 type
+   *
+   * @param use Boolean value to enable/disable.
+   */
+  void enable_decimal128(bool use) { _enable_decimal128 = use; }
+
   /**
    * @brief Set columns that should be read as 128-bit Decimal
    *
@@ -362,6 +375,17 @@ class orc_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Enable/Disable use of decimal128 type
+   *
+   * @param use Boolean value to enable/disable.
+   */
+  orc_reader_options_builder& decimal128(bool use)
+  {
+    options.enable_decimal128(use);
+    return *this;
+  }
+
   /**
    * @brief move orc_reader_options member once it's built.
    */

@@ -230,24 +230,35 @@ size_t gather_stream_info(const size_t stripe_index,
 /**
  * @brief Determines cuDF type of an ORC Decimal column.
  */
-auto decimal_column_type(const std::vector<std::string>& float64_columns,
-                         const std::vector<std::string>& decimal128_columns,
-                         cudf::io::orc::metadata& metadata,
+auto decimal_column_type(std::vector<std::string> const& float64_columns,
+                         std::vector<std::string> const& decimal128_columns,
+                         bool is_decimal128_enabled,
+                         cudf::io::orc::detail::aggregate_orc_metadata const& metadata,
                          int column_index)
 {
-  auto const& column_path = metadata.column_path(column_index);
+  if (metadata.get_col_type(column_index).kind != DECIMAL) return type_id::EMPTY;
+
+  auto const& column_path = metadata.column_path(0, column_index);
   auto is_column_in       = [&](const std::vector<std::string>& cols) {
     return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end();
   };
 
   auto const user_selected_float64    = is_column_in(float64_columns);
-  auto const user_selected_decimal128 = is_column_in(decimal128_columns);
+  auto const user_selected_decimal128 = is_decimal128_enabled and is_column_in(decimal128_columns);
   CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128,
                "Both decimal128 and float64 types selected for column " + column_path);
 
   if (user_selected_float64) return type_id::FLOAT64;
   if (user_selected_decimal128) return type_id::DECIMAL128;
-  return type_id::DECIMAL64;
+
+  auto const precision = metadata.get_col_type(column_index)
+                           .precision.value_or(cuda::std::numeric_limits<int64_t>::digits10);
+  if (precision <= cuda::std::numeric_limits<int32_t>::digits10) return type_id::DECIMAL32;
+  if (precision <= cuda::std::numeric_limits<int64_t>::digits10) return type_id::DECIMAL64;
+  CUDF_EXPECTS(is_decimal128_enabled,
+               "Decimal precision too high for decimal64, use `decimal_cols_as_float` or enable "
+               "decimal128 use");
+  return type_id::DECIMAL128;
 }
 
 }  // namespace
@@ -744,7 +755,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const size_type orc_co
     _use_np_dtypes,
     _timestamp_type.id(),
     decimal_column_type(
-      _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], orc_col_id));
+      _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, orc_col_id));
   int32_t scale = 0;
   std::vector<std::unique_ptr<column>> child_columns;
   std::unique_ptr<column> out_col = nullptr;
@@ -795,7 +806,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const size_type orc_co
       break;
 
     case orc::DECIMAL:
-      if (type == type_id::DECIMAL64 or type == type_id::DECIMAL128) {
+      if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) {
         scale = -static_cast<int32_t>(_metadata.get_types()[orc_col_id].scale.value_or(0));
       }
       out_col = make_empty_column(data_type(type, scale));
@@ -889,6 +900,7 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
   // Control decimals conversion
   _decimal_cols_as_float = options.get_decimal_cols_as_float();
   decimal128_columns     = options.get_decimal128_columns();
+  is_decimal128_enabled  = options.is_enabled_decimal128();
 }
 
 timezone_table reader::impl::compute_timezone_table(
@@ -953,13 +965,10 @@ table_with_metadata reader::impl::read(size_type skip_rows,
         _use_np_dtypes,
         _timestamp_type.id(),
         decimal_column_type(
-          _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], col.id));
+          _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, col.id));
       CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-      CUDF_EXPECTS(
-        (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18),
-        "Precision of column " + std::string{_metadata.column_name(0, col.id)} +
-          " is over 18, use 128-bit Decimal.");
-      if (col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) {
+      if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or
+          col_type == type_id::DECIMAL128) {
         // sign of the scale is changed since cuDF follows c++ libraries like CNL
         // which uses negative scaling, but liborc and other libraries
         // follow positive scaling.

@@ -219,12 +219,13 @@ class reader::impl {
   cudf::io::orc::detail::aggregate_orc_metadata _metadata;
   cudf::io::orc::detail::column_hierarchy selected_columns;
 
-  bool _use_index     = true;
-  bool _use_np_dtypes = true;
+  bool _use_index{true};
+  bool _use_np_dtypes{true};
   std::vector<std::string> _decimal_cols_as_float;
   std::vector<std::string> decimal128_columns;
+  bool is_decimal128_enabled{true};
   data_type _timestamp_type{type_id::EMPTY};
-  reader_column_meta _col_meta;
+  reader_column_meta _col_meta{};
 };
 
 }  // namespace orc

@@ -1066,12 +1066,12 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs,
             return (v / kPow5i[scale]) >> scale;
           }
         }();
-        if (dtype_id == type_id::DECIMAL64) {
+        if (dtype_id == type_id::DECIMAL32) {
+          vals.i32[t] = scaled_value;
+        } else if (dtype_id == type_id::DECIMAL64) {
           vals.i64[t] = scaled_value;
         } else {
-          {
-            vals.i128[t] = scaled_value;
-          }
+          vals.i128[t] = scaled_value;
         }
       }
     }
@@ -1708,8 +1708,10 @@ __global__ void __launch_bounds__(block_size)
             case DOUBLE:
             case LONG: static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped]; break;
             case DECIMAL:
-              if (s->chunk.dtype_id == type_id::FLOAT64 or
-                  s->chunk.dtype_id == type_id::DECIMAL64) {
+              if (s->chunk.dtype_id == type_id::DECIMAL32) {
+                static_cast<uint32_t*>(data_out)[row] = s->vals.u32[t + vals_skipped];
+              } else if (s->chunk.dtype_id == type_id::FLOAT64 or
+                         s->chunk.dtype_id == type_id::DECIMAL64) {
                 static_cast<uint64_t*>(data_out)[row] = s->vals.u64[t + vals_skipped];
               } else {
                 // decimal128

@@ -387,9 +387,7 @@ TEST_F(OrcWriterTest, MultiColumn)
   cudf_io::write_orc(out_opts);
 
   cudf_io::orc_reader_options in_opts =
-    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
-      .use_index(false)
-      .decimal128_columns({"decimal_pos_scale", "decimal_neg_scale"});
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false);
   auto result = cudf_io::read_orc(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
@@ -1178,9 +1176,9 @@ TEST_F(OrcWriterTest, Decimal32)
   auto data       = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) {
     return numeric::decimal32{vals[i], numeric::scale_type{2}};
   });
-  auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13 == 0; });
+  auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; });
   column_wrapper<numeric::decimal32> col{data, data + num_rows, mask};
-  cudf::table_view expected({static_cast<cudf::column_view>(col)});
+  cudf::table_view expected({col});
 
   auto filepath = temp_env->get_temp_filepath("Decimal32.orc");
   cudf_io::orc_writer_options out_opts =
@@ -1192,12 +1190,7 @@ TEST_F(OrcWriterTest, Decimal32)
     cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath});
   auto result = cudf_io::read_orc(in_opts);
 
-  auto data64 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) {
-    return numeric::decimal64{vals[i], numeric::scale_type{2}};
-  });
-  column_wrapper<numeric::decimal64> col64{data64, data64 + num_rows, mask};
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col64, result.tbl->view().column(0));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, result.tbl->view().column(0));
 }
 
 TEST_F(OrcStatisticsTest, Overflow)
@@ -1438,7 +1431,7 @@ TEST_F(OrcReaderTest, DecimalOptions)
     cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
       .decimal128_columns({"dec", "fake_name"})
       .decimal_cols_as_float({"decc", "fake_name"});
-  // Should not throw
+  // Should not throw, even with "fake name" in both options
   EXPECT_NO_THROW(cudf_io::read_orc(valid_opts));
 
   cudf_io::orc_reader_options invalid_opts =
@@ -1493,10 +1486,12 @@ TEST_F(OrcWriterTest, DecimalOptionsNested)
   cudf_io::orc_reader_options in_opts =
     cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
       .use_index(false)
-      .decimal128_columns({"lists.1.dec128"});
+      .decimal128_columns({"lists.1.dec64"});
   auto result = cudf_io::read_orc(in_opts);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+  // Both columns should be read as decimal128
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result.tbl->view().column(0).child(1).child(0),
+                                      result.tbl->view().column(0).child(1).child(1));
 }
 
 CUDF_TEST_PROGRAM_MAIN()
@@ -36,7 +36,7 @@ cdef extern from "cudf/io/orc.hpp" \
         void enable_use_np_dtypes(bool val) except+
         void set_timestamp_type(data_type type) except+
         void set_decimal_cols_as_float(vector[string] val) except+
-        void set_decimal128_columns(vector[string] val) except+
+        void enable_decimal128(bool val) except+
 
         @staticmethod
         orc_reader_options_builder builder(
@@ -58,9 +58,7 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options_builder& decimal_cols_as_float(
             vector[string] val
         ) except+
-        orc_reader_options_builder& decimal128_columns(
-            vector[string] val
-        ) except+
+        orc_reader_options_builder& decimal128(bool val) except+
 
         orc_reader_options build() except+
 

@@ -248,6 +248,7 @@ cdef orc_reader_options make_orc_reader_options(
         .timestamp_type(data_type(timestamp_type))
         .use_index(use_index)
         .decimal_cols_as_float(c_decimal_cols_as_float)
+        .decimal128(False)
         .build()
     )