ARROW-6630: [Doc] Document C++ file formats
pitrou committed Sep 25, 2019
1 parent 511c089 commit aa5c57d
Showing 13 changed files with 547 additions and 75 deletions.
1 change: 1 addition & 0 deletions cpp/apidoc/Doxyfile
@@ -2074,6 +2074,7 @@ INCLUDE_FILE_PATTERNS =

PREDEFINED = __attribute__(x)= \
__declspec(x)= \
PARQUET_EXPORT= \
ARROW_EXPORT= \
ARROW_FLIGHT_EXPORT= \
ARROW_EXTERN_TEMPLATE= \
77 changes: 43 additions & 34 deletions cpp/src/arrow/csv/options.h
@@ -32,82 +32,91 @@ class DataType;

namespace csv {

// Silly workaround for https://github.com/michaeljones/breathe/issues/453
constexpr char kDefaultEscapeChar = '\\';

struct ARROW_EXPORT ParseOptions {
// Parsing options

// Field delimiter
/// Field delimiter
char delimiter = ',';
// Whether quoting is used
/// Whether quoting is used
bool quoting = true;
// Quoting character (if `quoting` is true)
/// Quoting character (if `quoting` is true)
char quote_char = '"';
// Whether a quote inside a value is double-quoted
/// Whether a quote inside a value is double-quoted
bool double_quote = true;
// Whether escaping is used
/// Whether escaping is used
bool escaping = false;
// Escaping character (if `escaping` is true)
char escape_char = '\\';
// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
/// Escaping character (if `escaping` is true)
char escape_char = kDefaultEscapeChar;
/// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
bool newlines_in_values = false;
// Whether empty lines are ignored. If false, an empty line represents
// a single empty value (assuming a one-column CSV file).
/// Whether empty lines are ignored. If false, an empty line represents
/// a single empty value (assuming a one-column CSV file).
bool ignore_empty_lines = true;

/// Create parsing options with default values
static ParseOptions Defaults();
};

struct ARROW_EXPORT ConvertOptions {
// Conversion options

// Whether to check UTF8 validity of string columns
/// Whether to check UTF8 validity of string columns
bool check_utf8 = true;
// Optional per-column types (disabling type inference on those columns)
/// Optional per-column types (disabling type inference on those columns)
std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
// Recognized spellings for null values
/// Recognized spellings for null values
std::vector<std::string> null_values;
// Recognized spellings for boolean values
/// Recognized spellings for boolean true values
std::vector<std::string> true_values;
/// Recognized spellings for boolean false values
std::vector<std::string> false_values;
// Whether string / binary columns can have null values.
// If true, then strings in "null_values" are considered null for string columns.
// If false, then all strings are valid string values.
/// Whether string / binary columns can have null values.
///
/// If true, then strings in "null_values" are considered null for string columns.
/// If false, then all strings are valid string values.
bool strings_can_be_null = false;

// XXX Should we have a separate FilterOptions?

// If non-empty, indicates the names of columns from the CSV file that should
// be actually read and converted (in the vector's order).
// Columns not in this vector will be ignored.
/// If non-empty, indicates the names of columns from the CSV file that
/// should actually be read and converted (in the vector's order).
/// Columns not in this vector will be ignored.
std::vector<std::string> include_columns;
// If false, columns in `include_columns` but not in the CSV file will error out.
// If true, columns in `include_columns` but not in the CSV file will produce
// a column of nulls (whose type is selected using `column_types`,
// or null by default)
// This option is ignored if `include_columns` is empty.
/// If false, columns in `include_columns` but not in the CSV file will error out.
/// If true, columns in `include_columns` but not in the CSV file will produce
/// a column of nulls (whose type is selected using `column_types`,
/// or null by default).
/// This option is ignored if `include_columns` is empty.
bool include_missing_columns = false;

/// Create conversion options with default values, including conventional
/// values for `null_values`, `true_values` and `false_values`
static ConvertOptions Defaults();
};

struct ARROW_EXPORT ReadOptions {
// Reader options

// Whether to use the global CPU thread pool
/// Whether to use the global CPU thread pool
bool use_threads = true;
// Block size we request from the IO layer; also determines the size of
// chunks when use_threads is true
/// Block size we request from the IO layer; also determines the size of
/// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB

// Number of header rows to skip (not including the row of column names, if any)
/// Number of header rows to skip (not including the row of column names, if any)
int32_t skip_rows = 0;
// Column names for the target table.
// If empty, fall back on autogenerate_column_names.
/// Column names for the target table.
/// If empty, fall back on autogenerate_column_names.
std::vector<std::string> column_names;
// Whether to autogenerate column names if `column_names` is empty.
// If true, column names will be of the form "f0", "f1"...
// If false, column names will be read from the first CSV row after `skip_rows`.
/// Whether to autogenerate column names if `column_names` is empty.
/// If true, column names will be of the form "f0", "f1"...
/// If false, column names will be read from the first CSV row after `skip_rows`.
bool autogenerate_column_names = false;

/// Create read options with default values
static ReadOptions Defaults();
};

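As an illustration of the options above, a minimal sketch of customizing them after starting from `Defaults()` (the field values are illustrative, not the defaults; this sketch is not part of the commit):

#include "arrow/csv/options.h"

// Start from the documented defaults, then override individual fields.
arrow::csv::ParseOptions MakeParseOptions() {
  auto parse_options = arrow::csv::ParseOptions::Defaults();
  parse_options.delimiter = ';';            // semicolon-separated input
  parse_options.newlines_in_values = true;  // allow embedded CR/LF in values
  return parse_options;
}

arrow::csv::ConvertOptions MakeConvertOptions() {
  auto convert_options = arrow::csv::ConvertOptions::Defaults();
  // Only materialize these two columns; a column missing from the file
  // comes back as all-null thanks to include_missing_columns.
  convert_options.include_columns = {"id", "score"};
  convert_options.include_missing_columns = true;
  return convert_options;
}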
3 changes: 3 additions & 0 deletions cpp/src/arrow/csv/reader.h
@@ -35,12 +35,15 @@ class InputStream;

namespace csv {

/// A class that reads an entire CSV file into an Arrow Table
class ARROW_EXPORT TableReader {
public:
virtual ~TableReader() = default;

/// Read the entire CSV file and convert it to an Arrow Table
virtual Status Read(std::shared_ptr<Table>* out) = 0;

/// Create a TableReader instance
static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
const ReadOptions&, const ParseOptions&, const ConvertOptions&,
std::shared_ptr<TableReader>* out);
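Putting the pieces together, an illustrative sketch of reading a whole CSV file with default options (it assumes the Status-returning `arrow::io::ReadableFile::Open` of this Arrow version; error handling beyond status propagation is elided):

#include <memory>
#include <string>

#include "arrow/csv/options.h"
#include "arrow/csv/reader.h"
#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"

arrow::Status ReadCsv(const std::string& path, std::shared_ptr<arrow::Table>* out) {
  std::shared_ptr<arrow::io::ReadableFile> input;
  ARROW_RETURN_NOT_OK(arrow::io::ReadableFile::Open(path, &input));

  std::shared_ptr<arrow::csv::TableReader> reader;
  ARROW_RETURN_NOT_OK(arrow::csv::TableReader::Make(
      arrow::default_memory_pool(), input, arrow::csv::ReadOptions::Defaults(),
      arrow::csv::ParseOptions::Defaults(), arrow::csv::ConvertOptions::Defaults(),
      &reader));
  return reader->Read(out);  // reads and converts the entire file
}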
27 changes: 19 additions & 8 deletions cpp/src/arrow/json/options.h
@@ -29,33 +29,44 @@ class Schema;

namespace json {

enum class UnexpectedFieldBehavior : char { Ignore, Error, InferType };
enum class UnexpectedFieldBehavior : char {
/// Unexpected JSON fields are ignored
Ignore,
/// Unexpected JSON fields error out
Error,
/// Unexpected JSON fields are type-inferred and included in the output
InferType
};

struct ARROW_EXPORT ParseOptions {
// Parsing options

// Optional explicit schema (no type inference, ignores other fields)
/// Optional explicit schema (disables type inference on those fields)
std::shared_ptr<Schema> explicit_schema;

// Whether objects may be printed across multiple lines (for example pretty printed)
// NB: if false, input must end with an empty line
/// Whether objects may be printed across multiple lines (for example pretty-printed)
///
/// If true, parsing may be slower.
/// If false, the input must end with an empty line.
bool newlines_in_values = false;

// How should parse handle fields outside the explicit_schema?
/// How JSON fields outside of explicit_schema (if given) are treated
UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType;

/// Create parsing options with default values
static ParseOptions Defaults();
};

struct ARROW_EXPORT ReadOptions {
// Reader options

// Whether to use the global CPU thread pool
/// Whether to use the global CPU thread pool
bool use_threads = true;
// Block size we request from the IO layer; also determines the size of
// chunks when use_threads is true
/// Block size we request from the IO layer; also determines the size of
/// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB

/// Create read options with default values
static ReadOptions Defaults();
};

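For instance, a hypothetical sketch of pinning one field's type up front while ignoring unexpected fields (the field name "id" is illustrative):

#include "arrow/json/options.h"
#include "arrow/type.h"

// Fix the type of "id" in advance; any field outside the schema is dropped.
arrow::json::ParseOptions MakeJsonParseOptions() {
  auto parse_options = arrow::json::ParseOptions::Defaults();
  parse_options.explicit_schema =
      arrow::schema({arrow::field("id", arrow::int64())});
  parse_options.unexpected_field_behavior =
      arrow::json::UnexpectedFieldBehavior::Ignore;
  return parse_options;
}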
5 changes: 5 additions & 0 deletions cpp/src/arrow/json/reader.h
@@ -39,12 +39,17 @@ class InputStream;

namespace json {

/// A class that reads an entire JSON file into an Arrow Table
///
/// The file is expected to consist of individual line-separated JSON objects
class ARROW_EXPORT TableReader {
public:
virtual ~TableReader() = default;

/// Read the entire JSON file and convert it to an Arrow Table
virtual Status Read(std::shared_ptr<Table>* out) = 0;

/// Create a TableReader instance
static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
const ReadOptions&, const ParseOptions&,
std::shared_ptr<TableReader>* out);
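A sketch mirroring the CSV example, for line-delimited JSON (same Status-based I/O assumption; not from the commit itself):

#include <memory>
#include <string>

#include "arrow/io/file.h"
#include "arrow/json/options.h"
#include "arrow/json/reader.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"

arrow::Status ReadJson(const std::string& path, std::shared_ptr<arrow::Table>* out) {
  std::shared_ptr<arrow::io::ReadableFile> input;
  ARROW_RETURN_NOT_OK(arrow::io::ReadableFile::Open(path, &input));

  std::shared_ptr<arrow::json::TableReader> reader;
  ARROW_RETURN_NOT_OK(arrow::json::TableReader::Make(
      arrow::default_memory_pool(), input, arrow::json::ReadOptions::Defaults(),
      arrow::json::ParseOptions::Defaults(), &reader));
  return reader->Read(out);  // one JSON object per line -> one Table
}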
60 changes: 39 additions & 21 deletions cpp/src/parquet/arrow/reader.h
@@ -48,23 +48,22 @@ class ColumnChunkReader;
class ColumnReader;
class RowGroupReader;

// Arrow read adapter class for deserializing Parquet files as Arrow row
// batches.
//
// This interfaces caters for different use cases and thus provides different
// interfaces. In its most simplistic form, we cater for a user that wants to
// read the whole Parquet at once with the FileReader::ReadTable method.
//
// More advanced users that also want to implement parallelism on top of each
// single Parquet files should do this on the RowGroup level. For this, they can
// call FileReader::RowGroup(i)->ReadTable to receive only the specified
// RowGroup as a table.
//
// In the most advanced situation, where a consumer wants to independently read
// RowGroups in parallel and consume each column individually, they can call
// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column
// instance.
//
/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
///
/// This interface caters to different use cases and thus provides several
/// entry points. In its simplest form, a user who wants to read the whole
/// Parquet file at once can use the `FileReader::ReadTable` method.
///
/// More advanced users that also want to implement parallelism on top of each
/// single Parquet file should do this on the RowGroup level. For this, they can
/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
/// RowGroup as a table.
///
/// In the most advanced situation, where a consumer wants to independently read
/// RowGroups in parallel and consume each column individually, they can call
/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
/// instance.
///
// TODO(wesm): nested data does not always make sense with this user
// interface unless you are only reading a single leaf node from a branch of
// a table. For example:
@@ -106,11 +105,13 @@ class RowGroupReader;
// arrays
class PARQUET_EXPORT FileReader {
public:
/// Factory function to create a FileReader from a ParquetFileReader and properties
static ::arrow::Status Make(::arrow::MemoryPool* pool,
std::unique_ptr<ParquetFileReader> reader,
const ArrowReaderProperties& properties,
std::unique_ptr<FileReader>* out);

/// Factory function to create a FileReader from a ParquetFileReader
static ::arrow::Status Make(::arrow::MemoryPool* pool,
std::unique_ptr<ParquetFileReader> reader,
std::unique_ptr<FileReader>* out);
@@ -127,7 +128,9 @@ class PARQUET_EXPORT FileReader {
/// \brief Return arrow schema for all the columns.
virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;

// Read column as a whole into an Array.
/// \brief Read column as a whole into a chunked array.
///
/// The indicated column index is relative to the schema
virtual ::arrow::Status ReadColumn(int i,
std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

@@ -183,11 +186,12 @@ class PARQUET_EXPORT FileReader {
return ::arrow::Status::OK();
}

// Read a table of columns into a Table
/// Read all columns into a Table
virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

// Read a table of columns into a Table. Read only the indicated column
// indices (relative to the schema)
/// \brief Read the given columns into a Table
///
/// The indicated column indices are relative to the schema
virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
std::shared_ptr<::arrow::Table>* out) = 0;

@@ -212,6 +216,7 @@
/// FileReader.
virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;

/// \brief The number of row groups in the file
virtual int num_row_groups() const = 0;

virtual ParquetFileReader* parquet_reader() const = 0;
@@ -270,14 +275,18 @@ class PARQUET_EXPORT FileReaderBuilder {
public:
FileReaderBuilder();

/// Create FileReaderBuilder from Arrow file and optional properties / metadata
::arrow::Status Open(const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
const ReaderProperties& properties = default_reader_properties(),
const std::shared_ptr<FileMetaData>& metadata = NULLPTR);

ParquetFileReader* raw_reader() { return raw_reader_.get(); }

/// Set Arrow MemoryPool for memory allocation
FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
/// Set Arrow reader properties
FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
/// Build FileReader instance
::arrow::Status Build(std::unique_ptr<FileReader>* out);

private:
@@ -286,6 +295,13 @@
std::unique_ptr<ParquetFileReader> raw_reader_;
};

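A usage sketch of the builder path documented above; the chaining relies on `memory_pool()` returning the builder pointer, as declared above (the sketch itself is not part of the commit):

#include <memory>

#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "parquet/arrow/reader.h"

// Build a FileReader from an already-opened file, overriding the pool.
::arrow::Status MakeReader(
    const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
    std::unique_ptr<parquet::arrow::FileReader>* out) {
  parquet::arrow::FileReaderBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Open(file));
  // memory_pool() returns the builder, so the call can chain into Build().
  return builder.memory_pool(::arrow::default_memory_pool())->Build(out);
}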
/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
///
/// @{

/// \brief Build FileReader from Arrow file and MemoryPool
///
/// Advanced settings are supported through the FileReaderBuilder class.
PARQUET_EXPORT
::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
::arrow::MemoryPool* allocator,
@@ -306,6 +322,8 @@ ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& f
const ArrowReaderProperties& properties,
std::unique_ptr<FileReader>* reader);

/// @}

PARQUET_EXPORT
::arrow::Status FromParquetSchema(
const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
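To make the three access levels from the FileReader class comment concrete, an illustrative sketch (the `RowGroup(i)->ReadTable` form follows that comment; ChunkedArray is assumed to come via arrow/table.h in this version; error handling is collapsed into status propagation):

#include <memory>

#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"

::arrow::Status ReadThreeWays(
    const std::shared_ptr<::arrow::io::RandomAccessFile>& file) {
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(
      parquet::arrow::OpenFile(file, ::arrow::default_memory_pool(), &reader));

  // 1. Simplest form: the whole file as one Table.
  std::shared_ptr<::arrow::Table> table;
  ARROW_RETURN_NOT_OK(reader->ReadTable(&table));

  // 2. One row group as a Table, e.g. for per-row-group parallelism.
  std::shared_ptr<::arrow::Table> row_group_table;
  ARROW_RETURN_NOT_OK(reader->RowGroup(0)->ReadTable(&row_group_table));

  // 3. One column of the whole file as a chunked array.
  std::shared_ptr<::arrow::ChunkedArray> column;
  return reader->ReadColumn(0, &column);
}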
17 changes: 7 additions & 10 deletions cpp/src/parquet/arrow/writer.h
@@ -42,11 +42,10 @@ class ParquetFileWriter;

namespace arrow {

/**
* Iterative API:
* Start a new RowGroup/Chunk with NewRowGroup
* Write column-by-column the whole column chunk
*/
/// \brief Iterative FileWriter class
///
/// Start a new RowGroup or Chunk with NewRowGroup.
/// Write column-by-column the whole column chunk.
class PARQUET_EXPORT FileWriter {
public:
static ::arrow::Status Make(
@@ -99,11 +98,9 @@ PARQUET_EXPORT
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);

/**
* Write a Table to Parquet.
*
* The table shall only consist of columns of primitive type or of primitive lists.
*/
/// \brief Write a Table to Parquet.
///
/// The table shall only consist of columns of primitive type or of primitive lists.
::arrow::Status PARQUET_EXPORT WriteTable(
const ::arrow::Table& table, MemoryPool* pool,
const std::shared_ptr<::arrow::io::OutputStream>& sink, int64_t chunk_size,
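A sketch of the `WriteTable` entry point above (it assumes the Status-returning `FileOutputStream::Open` of this Arrow version and relies on the defaulted trailing writer-properties arguments not shown in the diff):

#include <memory>
#include <string>

#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/writer.h"

// Write a table to a local Parquet file in row groups of 64K rows.
::arrow::Status WriteParquet(const ::arrow::Table& table, const std::string& path) {
  std::shared_ptr<::arrow::io::FileOutputStream> sink;
  ARROW_RETURN_NOT_OK(::arrow::io::FileOutputStream::Open(path, &sink));
  return parquet::arrow::WriteTable(table, ::arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/64 * 1024);
}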
1 change: 1 addition & 0 deletions docs/source/cpp/api.rst
@@ -31,6 +31,7 @@ API Reference
api/tensor
api/utilities
api/io
api/formats
api/cuda
api/flight
api/filesystem
