ARROW-6630: [Doc] Document C++ file formats
pitrou committed Sep 25, 2019
1 parent 511c089 commit aa5c57d
Showing 13 changed files with 547 additions and 75 deletions.
1 change: 1 addition & 0 deletions cpp/apidoc/Doxyfile
@@ -2074,6 +2074,7 @@ INCLUDE_FILE_PATTERNS =

PREDEFINED = __attribute__(x)= \
__declspec(x)= \
PARQUET_EXPORT= \
ARROW_EXPORT= \
ARROW_FLIGHT_EXPORT= \
ARROW_EXTERN_TEMPLATE= \
77 changes: 43 additions & 34 deletions cpp/src/arrow/csv/options.h
@@ -32,82 +32,91 @@ class DataType;

namespace csv {

// Silly workaround for https://github.com/michaeljones/breathe/issues/453
constexpr char kDefaultEscapeChar = '\\';

struct ARROW_EXPORT ParseOptions {
// Parsing options

// Field delimiter
/// Field delimiter
char delimiter = ',';
// Whether quoting is used
/// Whether quoting is used
bool quoting = true;
// Quoting character (if `quoting` is true)
/// Quoting character (if `quoting` is true)
char quote_char = '"';
// Whether a quote inside a value is double-quoted
/// Whether a quote inside a value is double-quoted
bool double_quote = true;
// Whether escaping is used
/// Whether escaping is used
bool escaping = false;
// Escaping character (if `escaping` is true)
char escape_char = '\\';
// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
/// Escaping character (if `escaping` is true)
char escape_char = kDefaultEscapeChar;
/// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
bool newlines_in_values = false;
// Whether empty lines are ignored. If false, an empty line represents
// a single empty value (assuming a one-column CSV file).
/// Whether empty lines are ignored. If false, an empty line represents
/// a single empty value (assuming a one-column CSV file).
bool ignore_empty_lines = true;

/// Create parsing options with default values
static ParseOptions Defaults();
};

struct ARROW_EXPORT ConvertOptions {
// Conversion options

// Whether to check UTF8 validity of string columns
/// Whether to check UTF8 validity of string columns
bool check_utf8 = true;
// Optional per-column types (disabling type inference on those columns)
/// Optional per-column types (disabling type inference on those columns)
std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
// Recognized spellings for null values
/// Recognized spellings for null values
std::vector<std::string> null_values;
// Recognized spellings for boolean values
/// Recognized spellings for boolean true values
std::vector<std::string> true_values;
/// Recognized spellings for boolean false values
std::vector<std::string> false_values;
// Whether string / binary columns can have null values.
// If true, then strings in "null_values" are considered null for string columns.
// If false, then all strings are valid string values.
/// Whether string / binary columns can have null values.
///
/// If true, then strings in "null_values" are considered null for string columns.
/// If false, then all strings are valid string values.
bool strings_can_be_null = false;

// XXX Should we have a separate FilterOptions?

// If non-empty, indicates the names of columns from the CSV file that should
// be actually read and converted (in the vector's order).
// Columns not in this vector will be ignored.
/// If non-empty, indicates the names of columns from the CSV file that
/// should actually be read and converted (in the vector's order).
/// Columns not in this vector will be ignored.
std::vector<std::string> include_columns;
// If false, columns in `include_columns` but not in the CSV file will error out.
// If true, columns in `include_columns` but not in the CSV file will produce
// a column of nulls (whose type is selected using `column_types`,
// or null by default)
// This option is ignored if `include_columns` is empty.
/// If false, columns in `include_columns` but not in the CSV file will error out.
/// If true, columns in `include_columns` but not in the CSV file will produce
/// a column of nulls (whose type is selected using `column_types`,
/// or null by default).
/// This option is ignored if `include_columns` is empty.
bool include_missing_columns = false;

/// Create conversion options with default values, including conventional
/// values for `null_values`, `true_values` and `false_values`
static ConvertOptions Defaults();
};

struct ARROW_EXPORT ReadOptions {
// Reader options

// Whether to use the global CPU thread pool
/// Whether to use the global CPU thread pool
bool use_threads = true;
// Block size we request from the IO layer; also determines the size of
// chunks when use_threads is true
/// Block size we request from the IO layer; also determines the size of
/// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB

// Number of header rows to skip (not including the row of column names, if any)
/// Number of header rows to skip (not including the row of column names, if any)
int32_t skip_rows = 0;
// Column names for the target table.
// If empty, fall back on autogenerate_column_names.
/// Column names for the target table.
/// If empty, fall back on autogenerate_column_names.
std::vector<std::string> column_names;
// Whether to autogenerate column names if `column_names` is empty.
// If true, column names will be of the form "f0", "f1"...
// If false, column names will be read from the first CSV row after `skip_rows`.
/// Whether to autogenerate column names if `column_names` is empty.
/// If true, column names will be of the form "f0", "f1"...
/// If false, column names will be read from the first CSV row after `skip_rows`.
bool autogenerate_column_names = false;

/// Create read options with default values
static ReadOptions Defaults();
};

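As an illustration of the options above, a minimal sketch of customizing them after starting from `Defaults()` (the field values are illustrative, not the defaults; this sketch is not part of the commit):

#include "arrow/csv/options.h"

// Start from the documented defaults, then override individual fields.
arrow::csv::ParseOptions MakeParseOptions() {
  auto parse_options = arrow::csv::ParseOptions::Defaults();
  parse_options.delimiter = ';';            // semicolon-separated input
  parse_options.newlines_in_values = true;  // allow embedded CR/LF in values
  return parse_options;
}

arrow::csv::ConvertOptions MakeConvertOptions() {
  auto convert_options = arrow::csv::ConvertOptions::Defaults();
  // Only materialize these two columns; a column missing from the file
  // comes back as all-null thanks to include_missing_columns.
  convert_options.include_columns = {"id", "score"};
  convert_options.include_missing_columns = true;
  return convert_options;
}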
3 changes: 3 additions & 0 deletions cpp/src/arrow/csv/reader.h
@@ -35,12 +35,15 @@ class InputStream;

namespace csv {

/// A class that reads an entire CSV file into an Arrow Table
class ARROW_EXPORT TableReader {
public:
virtual ~TableReader() = default;

/// Read the entire CSV file and convert it to an Arrow Table
virtual Status Read(std::shared_ptr<Table>* out) = 0;

/// Create a TableReader instance
static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
const ReadOptions&, const ParseOptions&, const ConvertOptions&,
std::shared_ptr<TableReader>* out);
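Putting the pieces together, an illustrative sketch of reading a whole CSV file with default options (it assumes the Status-returning `arrow::io::ReadableFile::Open` of this Arrow version; error handling beyond status propagation is elided):

#include <memory>
#include <string>

#include "arrow/csv/options.h"
#include "arrow/csv/reader.h"
#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"

arrow::Status ReadCsv(const std::string& path, std::shared_ptr<arrow::Table>* out) {
  std::shared_ptr<arrow::io::ReadableFile> input;
  ARROW_RETURN_NOT_OK(arrow::io::ReadableFile::Open(path, &input));

  std::shared_ptr<arrow::csv::TableReader> reader;
  ARROW_RETURN_NOT_OK(arrow::csv::TableReader::Make(
      arrow::default_memory_pool(), input, arrow::csv::ReadOptions::Defaults(),
      arrow::csv::ParseOptions::Defaults(), arrow::csv::ConvertOptions::Defaults(),
      &reader));
  return reader->Read(out);  // reads and converts the entire file
}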
27 changes: 19 additions & 8 deletions cpp/src/arrow/json/options.h
@@ -29,33 +29,44 @@ class Schema;

namespace json {

enum class UnexpectedFieldBehavior : char { Ignore, Error, InferType };
enum class UnexpectedFieldBehavior : char {
/// Unexpected JSON fields are ignored
Ignore,
/// Unexpected JSON fields error out
Error,
/// Unexpected JSON fields are type-inferred and included in the output
InferType
};

struct ARROW_EXPORT ParseOptions {
// Parsing options

// Optional explicit schema (no type inference, ignores other fields)
/// Optional explicit schema (disables type inference on those fields)
std::shared_ptr<Schema> explicit_schema;

// Whether objects may be printed across multiple lines (for example pretty printed)
// NB: if false, input must end with an empty line
/// Whether objects may be printed across multiple lines (for example pretty-printed)
///
/// If true, parsing may be slower.
/// If false, the input must end with an empty line.
bool newlines_in_values = false;

// How should parse handle fields outside the explicit_schema?
/// How JSON fields outside of explicit_schema (if given) are treated
UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType;

/// Create parsing options with default values
static ParseOptions Defaults();
};

struct ARROW_EXPORT ReadOptions {
// Reader options

// Whether to use the global CPU thread pool
/// Whether to use the global CPU thread pool
bool use_threads = true;
// Block size we request from the IO layer; also determines the size of
// chunks when use_threads is true
/// Block size we request from the IO layer; also determines the size of
/// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB

/// Create read options with default values
static ReadOptions Defaults();
};

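For instance, a hypothetical sketch of pinning one field's type up front while ignoring unexpected fields (the field name "id" is illustrative):

#include "arrow/json/options.h"
#include "arrow/type.h"

// Fix the type of "id" in advance; any field outside the schema is dropped.
arrow::json::ParseOptions MakeJsonParseOptions() {
  auto parse_options = arrow::json::ParseOptions::Defaults();
  parse_options.explicit_schema =
      arrow::schema({arrow::field("id", arrow::int64())});
  parse_options.unexpected_field_behavior =
      arrow::json::UnexpectedFieldBehavior::Ignore;
  return parse_options;
}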
5 changes: 5 additions & 0 deletions cpp/src/arrow/json/reader.h
@@ -39,12 +39,17 @@ class InputStream;

namespace json {

/// A class that reads an entire JSON file into an Arrow Table
///
/// The file is expected to consist of individual line-separated JSON objects
class ARROW_EXPORT TableReader {
public:
virtual ~TableReader() = default;

/// Read the entire JSON file and convert it to an Arrow Table
virtual Status Read(std::shared_ptr<Table>* out) = 0;

/// Create a TableReader instance
static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
const ReadOptions&, const ParseOptions&,
std::shared_ptr<TableReader>* out);
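A sketch mirroring the CSV example, for line-delimited JSON (same Status-based I/O assumption; not from the commit itself):

#include <memory>
#include <string>

#include "arrow/io/file.h"
#include "arrow/json/options.h"
#include "arrow/json/reader.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"

arrow::Status ReadJson(const std::string& path, std::shared_ptr<arrow::Table>* out) {
  std::shared_ptr<arrow::io::ReadableFile> input;
  ARROW_RETURN_NOT_OK(arrow::io::ReadableFile::Open(path, &input));

  std::shared_ptr<arrow::json::TableReader> reader;
  ARROW_RETURN_NOT_OK(arrow::json::TableReader::Make(
      arrow::default_memory_pool(), input, arrow::json::ReadOptions::Defaults(),
      arrow::json::ParseOptions::Defaults(), &reader));
  return reader->Read(out);  // one JSON object per line -> one Table
}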
60 changes: 39 additions & 21 deletions cpp/src/parquet/arrow/reader.h
@@ -48,23 +48,22 @@ class ColumnChunkReader;
class ColumnReader;
class RowGroupReader;

// Arrow read adapter class for deserializing Parquet files as Arrow row
// batches.
//
// This interfaces caters for different use cases and thus provides different
// interfaces. In its most simplistic form, we cater for a user that wants to
// read the whole Parquet at once with the FileReader::ReadTable method.
//
// More advanced users that also want to implement parallelism on top of each
// single Parquet files should do this on the RowGroup level. For this, they can
// call FileReader::RowGroup(i)->ReadTable to receive only the specified
// RowGroup as a table.
//
// In the most advanced situation, where a consumer wants to independently read
// RowGroups in parallel and consume each column individually, they can call
// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column
// instance.
//
/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
///
/// This interface caters to different use cases and thus provides several
/// entry points. In its simplest form, a user who wants to read the whole
/// Parquet file at once can use the `FileReader::ReadTable` method.
///
/// More advanced users that also want to implement parallelism on top of each
/// single Parquet file should do this on the RowGroup level. For this, they can
/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
/// RowGroup as a table.
///
/// In the most advanced situation, where a consumer wants to independently read
/// RowGroups in parallel and consume each column individually, they can call
/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
/// instance.
///
// TODO(wesm): nested data does not always make sense with this user
// interface unless you are only reading a single leaf node from a branch of
// a table. For example:
@@ -106,11 +105,13 @@ class RowGroupReader;
// arrays
class PARQUET_EXPORT FileReader {
public:
/// Factory function to create a FileReader from a ParquetFileReader and properties
static ::arrow::Status Make(::arrow::MemoryPool* pool,
std::unique_ptr<ParquetFileReader> reader,
const ArrowReaderProperties& properties,
std::unique_ptr<FileReader>* out);

/// Factory function to create a FileReader from a ParquetFileReader
static ::arrow::Status Make(::arrow::MemoryPool* pool,
std::unique_ptr<ParquetFileReader> reader,
std::unique_ptr<FileReader>* out);
@@ -127,7 +128,9 @@ class PARQUET_EXPORT FileReader {
/// \brief Return arrow schema for all the columns.
virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;

// Read column as a whole into an Array.
/// \brief Read column as a whole into a chunked array.
///
/// The indicated column index is relative to the schema
virtual ::arrow::Status ReadColumn(int i,
std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

@@ -183,11 +186,12 @@ class PARQUET_EXPORT FileReader {
return ::arrow::Status::OK();
}

// Read a table of columns into a Table
/// Read all columns into a Table
virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

// Read a table of columns into a Table. Read only the indicated column
// indices (relative to the schema)
/// \brief Read the given columns into a Table
///
/// The indicated column indices are relative to the schema
virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
std::shared_ptr<::arrow::Table>* out) = 0;

@@ -212,6 +216,7 @@
/// FileReader.
virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;

/// \brief The number of row groups in the file
virtual int num_row_groups() const = 0;

virtual ParquetFileReader* parquet_reader() const = 0;
@@ -270,14 +275,18 @@ class PARQUET_EXPORT FileReaderBuilder {
public:
FileReaderBuilder();

/// Create FileReaderBuilder from Arrow file and optional properties / metadata
::arrow::Status Open(const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
const ReaderProperties& properties = default_reader_properties(),
const std::shared_ptr<FileMetaData>& metadata = NULLPTR);

ParquetFileReader* raw_reader() { return raw_reader_.get(); }

/// Set Arrow MemoryPool for memory allocation
FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
/// Set Arrow reader properties
FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
/// Build FileReader instance
::arrow::Status Build(std::unique_ptr<FileReader>* out);

private:
@@ -286,6 +295,13 @@
std::unique_ptr<ParquetFileReader> raw_reader_;
};

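A usage sketch of the builder path documented above; the chaining relies on `memory_pool()` returning the builder pointer, as declared above (the sketch itself is not part of the commit):

#include <memory>

#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "parquet/arrow/reader.h"

// Build a FileReader from an already-opened file, overriding the pool.
::arrow::Status MakeReader(
    const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
    std::unique_ptr<parquet::arrow::FileReader>* out) {
  parquet::arrow::FileReaderBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Open(file));
  // memory_pool() returns the builder, so the call can chain into Build().
  return builder.memory_pool(::arrow::default_memory_pool())->Build(out);
}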
/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
///
/// @{

/// \brief Build FileReader from Arrow file and MemoryPool
///
/// Advanced settings are supported through the FileReaderBuilder class.
PARQUET_EXPORT
::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
::arrow::MemoryPool* allocator,
@@ -306,6 +322,8 @@ ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& f
const ArrowReaderProperties& properties,
std::unique_ptr<FileReader>* reader);

/// @}

PARQUET_EXPORT
::arrow::Status FromParquetSchema(
const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
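To make the three access levels from the FileReader class comment concrete, an illustrative sketch (the `RowGroup(i)->ReadTable` form follows that comment; ChunkedArray is assumed to come via arrow/table.h in this version; error handling is collapsed into status propagation):

#include <memory>

#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"

::arrow::Status ReadThreeWays(
    const std::shared_ptr<::arrow::io::RandomAccessFile>& file) {
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(
      parquet::arrow::OpenFile(file, ::arrow::default_memory_pool(), &reader));

  // 1. Simplest form: the whole file as one Table.
  std::shared_ptr<::arrow::Table> table;
  ARROW_RETURN_NOT_OK(reader->ReadTable(&table));

  // 2. One row group as a Table, e.g. for per-row-group parallelism.
  std::shared_ptr<::arrow::Table> row_group_table;
  ARROW_RETURN_NOT_OK(reader->RowGroup(0)->ReadTable(&row_group_table));

  // 3. One column of the whole file as a chunked array.
  std::shared_ptr<::arrow::ChunkedArray> column;
  return reader->ReadColumn(0, &column);
}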
17 changes: 7 additions & 10 deletions cpp/src/parquet/arrow/writer.h
@@ -42,11 +42,10 @@ class ParquetFileWriter;

namespace arrow {

/**
* Iterative API:
* Start a new RowGroup/Chunk with NewRowGroup
* Write column-by-column the whole column chunk
*/
/// \brief Iterative FileWriter class
///
/// Start a new RowGroup or Chunk with NewRowGroup.
/// Write column-by-column the whole column chunk.
class PARQUET_EXPORT FileWriter {
public:
static ::arrow::Status Make(
@@ -99,11 +98,9 @@ PARQUET_EXPORT
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);

/**
* Write a Table to Parquet.
*
* The table shall only consist of columns of primitive type or of primitive lists.
*/
/// \brief Write a Table to Parquet.
///
/// The table shall only consist of columns of primitive type or of primitive lists.
::arrow::Status PARQUET_EXPORT WriteTable(
const ::arrow::Table& table, MemoryPool* pool,
const std::shared_ptr<::arrow::io::OutputStream>& sink, int64_t chunk_size,
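A sketch of the `WriteTable` entry point above (it assumes the Status-returning `FileOutputStream::Open` of this Arrow version and relies on the defaulted trailing writer-properties arguments not shown in the diff):

#include <memory>
#include <string>

#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/writer.h"

// Write a table to a local Parquet file in row groups of 64K rows.
::arrow::Status WriteParquet(const ::arrow::Table& table, const std::string& path) {
  std::shared_ptr<::arrow::io::FileOutputStream> sink;
  ARROW_RETURN_NOT_OK(::arrow::io::FileOutputStream::Open(path, &sink));
  return parquet::arrow::WriteTable(table, ::arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/64 * 1024);
}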
1 change: 1 addition & 0 deletions docs/source/cpp/api.rst
@@ -31,6 +31,7 @@ API Reference
api/tensor
api/utilities
api/io
api/formats
api/cuda
api/flight
api/filesystem
