From 2f6440a2e93f45c020dae9dffc65653244147a65 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Jun 2019 15:21:01 -0500 Subject: [PATCH] Remove not-currently-needed enum, add comment about an example partition structure --- cpp/src/arrow/dataset/dataset.h | 9 ++------- cpp/src/arrow/dataset/partition.h | 30 ++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index ba49b28466dcb..4aba8945b270e 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include "arrow/dataset/type_fwd.h" @@ -54,15 +55,9 @@ struct DataSelector { /// DataFragments class ARROW_DS_EXPORT DataSource { public: - enum Type { - SIMPLE, // Flat collection - PARTITIONED, // Partitioned collection - GENERIC // All others - }; - virtual ~DataSource() = default; - virtual Type type() const = 0; + virtual std::string type() const = 0; virtual std::unique_ptr GetFragments( const DataSelector& selector) = 0; diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 628480f4457a3..28c55adcc108b 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -117,9 +117,35 @@ class ARROW_DS_EXPORT HivePartitionScheme : public PartitionScheme { // ---------------------------------------------------------------------- // +// Partitioned datasets come in different forms. 
Here is an example of +// a Hive-style partitioned dataset: +// +// dataset_root/ +// key1=$k1_v1/ +// key2=$k2_v1/ +// 0.parquet +// 1.parquet +// 2.parquet +// 3.parquet +// key2=$k2_v2/ +// 0.parquet +// 1.parquet +// key1=$k1_v2/ +// key2=$k2_v1/ +// 0.parquet +// 1.parquet +// key2=$k2_v2/ +// 0.parquet +// 1.parquet +// 2.parquet +// +// In this case, the dataset has 11 fragments (11 files) to be +// scanned, or potentially more if it is configured to split Parquet +// files at the row group level. + class ARROW_DS_EXPORT Partition : public DataSource { public: - DataSource::Type type() const override; + std::string type() const override; /// \brief The key for this partition source, may be nullptr, /// e.g. for the top-level partitioned source container @@ -129,7 +155,7 @@ class ARROW_DS_EXPORT Partition : public DataSource { const Selector& selector) = 0; }; -/// \brief Container for a dataset partition, which consists of a +/// \brief Simple implementation of Partition, which consists of a /// partition identifier, subpartitions, and some data fragments class ARROW_DS_EXPORT SimplePartition : public Partition { public: