diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile
index 09ac5b9868517..60551910b96ee 100644
--- a/cpp/apidoc/Doxyfile
+++ b/cpp/apidoc/Doxyfile
@@ -2074,6 +2074,7 @@ INCLUDE_FILE_PATTERNS  =
 PREDEFINED             = __attribute__(x)= \
                          __declspec(x)= \
+                         PARQUET_EXPORT= \
                          ARROW_EXPORT= \
                          ARROW_FLIGHT_EXPORT= \
                          ARROW_EXTERN_TEMPLATE= \
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 4daaa9f1887bc..48af5a6b8ea4c 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -32,82 +32,91 @@ class DataType;
 
 namespace csv {
 
+// Silly workaround for https://github.com/michaeljones/breathe/issues/453
+constexpr char kDefaultEscapeChar = '\\';
+
 struct ARROW_EXPORT ParseOptions {
   // Parsing options
 
-  // Field delimiter
+  /// Field delimiter
   char delimiter = ',';
-  // Whether quoting is used
+  /// Whether quoting is used
   bool quoting = true;
-  // Quoting character (if `quoting` is true)
+  /// Quoting character (if `quoting` is true)
   char quote_char = '"';
-  // Whether a quote inside a value is double-quoted
+  /// Whether a quote inside a value is double-quoted
   bool double_quote = true;
-  // Whether escaping is used
+  /// Whether escaping is used
   bool escaping = false;
-  // Escaping character (if `escaping` is true)
-  char escape_char = '\\';
-  // Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
+  /// Escaping character (if `escaping` is true)
+  char escape_char = kDefaultEscapeChar;
+  /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
   bool newlines_in_values = false;
-  // Whether empty lines are ignored. If false, an empty line represents
-  // a single empty value (assuming a one-column CSV file).
+  /// Whether empty lines are ignored. If false, an empty line represents
+  /// a single empty value (assuming a one-column CSV file).
   bool ignore_empty_lines = true;
 
+  /// Create parsing options with default values
   static ParseOptions Defaults();
 };
 
 struct ARROW_EXPORT ConvertOptions {
   // Conversion options
 
-  // Whether to check UTF8 validity of string columns
+  /// Whether to check UTF8 validity of string columns
   bool check_utf8 = true;
-  // Optional per-column types (disabling type inference on those columns)
+  /// Optional per-column types (disabling type inference on those columns)
   std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
-  // Recognized spellings for null values
+  /// Recognized spellings for null values
   std::vector<std::string> null_values;
-  // Recognized spellings for boolean values
+  /// Recognized spellings for boolean true values
   std::vector<std::string> true_values;
+  /// Recognized spellings for boolean false values
   std::vector<std::string> false_values;
-  // Whether string / binary columns can have null values.
-  // If true, then strings in "null_values" are considered null for string columns.
-  // If false, then all strings are valid string values.
+  /// Whether string / binary columns can have null values.
+  ///
+  /// If true, then strings in "null_values" are considered null for string columns.
+  /// If false, then all strings are valid string values.
  bool strings_can_be_null = false;
 
   // XXX Should we have a separate FilterOptions?
 
-  // If non-empty, indicates the names of columns from the CSV file that should
-  // be actually read and converted (in the vector's order).
-  // Columns not in this vector will be ignored.
+  /// If non-empty, indicates the names of columns from the CSV file that should
+  /// actually be read and converted (in the vector's order).
+  /// Columns not in this vector will be ignored.
   std::vector<std::string> include_columns;
-  // If false, columns in `include_columns` but not in the CSV file will error out.
-  // If true, columns in `include_columns` but not in the CSV file will produce
-  // a column of nulls (whose type is selected using `column_types`,
-  // or null by default)
-  // This option is ignored if `include_columns` is empty.
+  /// If false, columns in `include_columns` but not in the CSV file will error out.
+  /// If true, columns in `include_columns` but not in the CSV file will produce
+  /// a column of nulls (whose type is selected using `column_types`,
+  /// or null by default).
+  /// This option is ignored if `include_columns` is empty.
   bool include_missing_columns = false;
 
+  /// Create conversion options with default values, including conventional
+  /// values for `null_values`, `true_values` and `false_values`
   static ConvertOptions Defaults();
 };
 
 struct ARROW_EXPORT ReadOptions {
   // Reader options
 
-  // Whether to use the global CPU thread pool
+  /// Whether to use the global CPU thread pool
   bool use_threads = true;
-  // Block size we request from the IO layer; also determines the size of
-  // chunks when use_threads is true
+  /// Block size we request from the IO layer; also determines the size of
+  /// chunks when use_threads is true
   int32_t block_size = 1 << 20;  // 1 MB
-  // Number of header rows to skip (not including the row of column names, if any)
+  /// Number of header rows to skip (not including the row of column names, if any)
   int32_t skip_rows = 0;
-  // Column names for the target table.
-  // If empty, fall back on autogenerate_column_names.
+  /// Column names for the target table.
+  /// If empty, fall back on autogenerate_column_names.
   std::vector<std::string> column_names;
-  // Whether to autogenerate column names if `column_names` is empty.
-  // If true, column names will be of the form "f0", "f1"...
-  // If false, column names will be read from the first CSV row after `skip_rows`.
+  /// Whether to autogenerate column names if `column_names` is empty.
+  /// If true, column names will be of the form "f0", "f1"...
+  /// If false, column names will be read from the first CSV row after `skip_rows`.
   bool autogenerate_column_names = false;
 
+  /// Create read options with default values
   static ReadOptions Defaults();
 };
 
diff --git a/cpp/src/arrow/csv/reader.h b/cpp/src/arrow/csv/reader.h
index 53255f9677815..8645d24d083e9 100644
--- a/cpp/src/arrow/csv/reader.h
+++ b/cpp/src/arrow/csv/reader.h
@@ -35,12 +35,15 @@ class InputStream;
 
 namespace csv {
 
+/// A class that reads an entire CSV file into an Arrow Table
 class ARROW_EXPORT TableReader {
  public:
   virtual ~TableReader() = default;
 
+  /// Read the entire CSV file and convert it to an Arrow Table
   virtual Status Read(std::shared_ptr<Table>* out) = 0;
 
+  /// Create a TableReader instance
   static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
                      const ReadOptions&, const ParseOptions&, const ConvertOptions&,
                      std::shared_ptr<TableReader>* out);
diff --git a/cpp/src/arrow/json/options.h b/cpp/src/arrow/json/options.h
index 8d27faabea2da..f075041880386 100644
--- a/cpp/src/arrow/json/options.h
+++ b/cpp/src/arrow/json/options.h
@@ -29,33 +29,44 @@ class Schema;
 
 namespace json {
 
-enum class UnexpectedFieldBehavior : char { Ignore, Error, InferType };
+enum class UnexpectedFieldBehavior : char {
+  /// Unexpected JSON fields are ignored
+  Ignore,
+  /// Unexpected JSON fields error out
+  Error,
+  /// Unexpected JSON fields are type-inferred and included in the output
+  InferType
+};
 
 struct ARROW_EXPORT ParseOptions {
   // Parsing options
 
-  // Optional explicit schema (no type inference, ignores other fields)
+  /// Optional explicit schema (disables type inference on those fields)
   std::shared_ptr<Schema> explicit_schema;
-  // Whether objects may be printed across multiple lines (for example pretty printed)
-  // NB: if false, input must end with an empty line
+  /// Whether objects may be printed across multiple lines (for example pretty-printed)
+  ///
+  /// If true, parsing may be slower.
+  /// If false, input must end with an empty line.
   bool newlines_in_values = false;
-  // How should parse handle fields outside the explicit_schema?
+  /// How JSON fields outside of explicit_schema (if given) are treated
   UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
 
+  /// Create parsing options with default values
   static ParseOptions Defaults();
 };
 
 struct ARROW_EXPORT ReadOptions {
   // Reader options
 
-  // Whether to use the global CPU thread pool
+  /// Whether to use the global CPU thread pool
   bool use_threads = true;
-  // Block size we request from the IO layer; also determines the size of
-  // chunks when use_threads is true
+  /// Block size we request from the IO layer; also determines the size of
+  /// chunks when use_threads is true
   int32_t block_size = 1 << 20;  // 1 MB
 
+  /// Create read options with default values
   static ReadOptions Defaults();
 };
diff --git a/cpp/src/arrow/json/reader.h b/cpp/src/arrow/json/reader.h
index 51a3473a04e19..9ffe47de08c6a 100644
--- a/cpp/src/arrow/json/reader.h
+++ b/cpp/src/arrow/json/reader.h
@@ -39,12 +39,17 @@ class InputStream;
 
 namespace json {
 
+/// A class that reads an entire JSON file into an Arrow Table
+///
+/// The file is expected to consist of individual line-separated JSON objects.
 class ARROW_EXPORT TableReader {
  public:
   virtual ~TableReader() = default;
 
+  /// Read the entire JSON file and convert it to an Arrow Table
   virtual Status Read(std::shared_ptr<Table>* out) = 0;
 
+  /// Create a TableReader instance
   static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
                      const ReadOptions&, const ParseOptions&,
                      std::shared_ptr<TableReader>* out);
diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h
index 17ebf925bb91b..f027de1fddbb9 100644
--- a/cpp/src/parquet/arrow/reader.h
+++ b/cpp/src/parquet/arrow/reader.h
@@ -48,23 +48,22 @@ class ColumnChunkReader;
 class ColumnReader;
 class RowGroupReader;
 
-// Arrow read adapter class for deserializing Parquet files as Arrow row
-// batches.
-//
-// This interfaces caters for different use cases and thus provides different
-// interfaces. In its most simplistic form, we cater for a user that wants to
-// read the whole Parquet at once with the FileReader::ReadTable method.
-//
-// More advanced users that also want to implement parallelism on top of each
-// single Parquet files should do this on the RowGroup level. For this, they can
-// call FileReader::RowGroup(i)->ReadTable to receive only the specified
-// RowGroup as a table.
-//
-// In the most advanced situation, where a consumer wants to independently read
-// RowGroups in parallel and consume each column individually, they can call
-// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column
-// instance.
-//
+/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
+///
+/// This interface caters to different use cases and thus provides several
+/// entry points. In its simplest form, it caters to a user that wants to
+/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
+///
+/// More advanced users that also want to implement parallelism on top of each
+/// single Parquet file should do this on the RowGroup level. For this, they can
+/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
+/// RowGroup as a table.
+///
+/// In the most advanced situation, where a consumer wants to independently read
+/// RowGroups in parallel and consume each column individually, they can call
+/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
+/// instance.
+///
 // TODO(wesm): nested data does not always make sense with this user
 // interface unless you are only reading a single leaf node from a branch of
 // a table. For example:
@@ -106,11 +105,13 @@ class RowGroupReader;
 // arrays
 class PARQUET_EXPORT FileReader {
  public:
+  /// Factory function to create a FileReader from a ParquetFileReader and properties
   static ::arrow::Status Make(::arrow::MemoryPool* pool,
                               std::unique_ptr<ParquetFileReader> reader,
                               const ArrowReaderProperties& properties,
                               std::unique_ptr<FileReader>* out);
 
+  /// Factory function to create a FileReader from a ParquetFileReader
   static ::arrow::Status Make(::arrow::MemoryPool* pool,
                               std::unique_ptr<ParquetFileReader> reader,
                               std::unique_ptr<FileReader>* out);
@@ -127,7 +128,9 @@ class PARQUET_EXPORT FileReader {
   /// \brief Return arrow schema for all the columns.
   virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
 
-  // Read column as a whole into an Array.
+  /// \brief Read column as a whole into a chunked array.
+  ///
+  /// The indicated column index is relative to the schema.
   virtual ::arrow::Status ReadColumn(int i,
                                      std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
 
@@ -183,11 +186,12 @@ class PARQUET_EXPORT FileReader {
     return ::arrow::Status::OK();
   }
 
-  // Read a table of columns into a Table
+  /// Read all columns into a Table
   virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
 
-  // Read a table of columns into a Table. Read only the indicated column
-  // indices (relative to the schema)
+  /// \brief Read the given columns into a Table
+  ///
+  /// The indicated column indices are relative to the schema.
   virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                     std::shared_ptr<::arrow::Table>* out) = 0;
 
@@ -212,6 +216,7 @@ class PARQUET_EXPORT FileReader {
   /// FileReader.
   virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
 
+  /// \brief The number of row groups in the file
   virtual int num_row_groups() const = 0;
 
   virtual ParquetFileReader* parquet_reader() const = 0;
@@ -270,14 +275,18 @@ class PARQUET_EXPORT FileReaderBuilder {
  public:
   FileReaderBuilder();
 
+  /// Create FileReaderBuilder from Arrow file and optional properties / metadata
   ::arrow::Status Open(const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
                        const ReaderProperties& properties = default_reader_properties(),
                        const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
 
   ParquetFileReader* raw_reader() { return raw_reader_.get(); }
 
+  /// Set Arrow MemoryPool for memory allocation
   FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
+  /// Set Arrow reader properties
   FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
+  /// Build FileReader instance
   ::arrow::Status Build(std::unique_ptr<FileReader>* out);
 
  private:
@@ -286,6 +295,13 @@ class PARQUET_EXPORT FileReaderBuilder {
   std::unique_ptr<ParquetFileReader> raw_reader_;
 };
 
+/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
+///
+/// @{
+
+/// \brief Build FileReader from Arrow file and MemoryPool
+///
+/// Advanced settings are supported through the FileReaderBuilder class.
 PARQUET_EXPORT
 ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
                          ::arrow::MemoryPool* allocator,
@@ -306,6 +322,8 @@ ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& f
                          const ArrowReaderProperties& properties,
                          std::unique_ptr<FileReader>* reader);
 
+/// @}
+
 PARQUET_EXPORT
 ::arrow::Status FromParquetSchema(
     const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h
index cfcb6eb0c7bd6..354c2ba8de00e 100644
--- a/cpp/src/parquet/arrow/writer.h
+++ b/cpp/src/parquet/arrow/writer.h
@@ -42,11 +42,10 @@ class ParquetFileWriter;
 
 namespace arrow {
 
-/**
- * Iterative API:
- * Start a new RowGroup/Chunk with NewRowGroup
- * Write column-by-column the whole column chunk
- */
+/// \brief Iterative FileWriter class
+///
+/// Start a new RowGroup or Chunk with NewRowGroup.
+/// Write column-by-column the whole column chunk.
 class PARQUET_EXPORT FileWriter {
  public:
   static ::arrow::Status Make(
@@ -99,11 +98,9 @@ PARQUET_EXPORT
 ::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
                                   ::arrow::io::OutputStream* sink);
 
-/**
- * Write a Table to Parquet.
- *
- * The table shall only consist of columns of primitive type or of primitive lists.
- */
+/// \brief Write a Table to Parquet.
+///
+/// The table shall only consist of columns of primitive type or of primitive lists.
 ::arrow::Status PARQUET_EXPORT WriteTable(
     const ::arrow::Table& table, MemoryPool* pool,
     const std::shared_ptr<::arrow::io::OutputStream>& sink, int64_t chunk_size,
diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst
index f62fe52b6196a..9b7d356980b8b 100644
--- a/docs/source/cpp/api.rst
+++ b/docs/source/cpp/api.rst
@@ -31,6 +31,7 @@ API Reference
    api/tensor
    api/utilities
    api/io
+   api/formats
    api/cuda
    api/flight
    api/filesystem
diff --git a/docs/source/cpp/api/formats.rst b/docs/source/cpp/api/formats.rst
new file mode 100644
index 0000000000000..5713b034d623d
--- /dev/null
+++ b/docs/source/cpp/api/formats.rst
@@ -0,0 +1,86 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+============
+File Formats
+============
+
+CSV
+===
+
+.. doxygenstruct:: arrow::csv::ReadOptions
+   :members:
+
+.. doxygenstruct:: arrow::csv::ParseOptions
+   :members:
+
+.. doxygenstruct:: arrow::csv::ConvertOptions
+   :members:
+
+.. doxygenclass:: arrow::csv::TableReader
+   :members:
+
+Line-separated JSON
+===================
+
+.. doxygenenum:: arrow::json::UnexpectedFieldBehavior
+
+.. doxygenstruct:: arrow::json::ReadOptions
+   :members:
+
+.. doxygenstruct:: arrow::json::ParseOptions
+   :members:
+
+.. doxygenclass:: arrow::json::TableReader
+   :members:
+
+Parquet reader
+==============
+
+.. doxygenclass:: parquet::ReaderProperties
+   :members:
+
+.. doxygenclass:: parquet::ArrowReaderProperties
+   :members:
+
+.. doxygenclass:: parquet::ParquetFileReader
+   :members:
+
+.. doxygenclass:: parquet::arrow::FileReader
+   :members:
+
+.. doxygenclass:: parquet::arrow::FileReaderBuilder
+   :members:
+
+.. doxygengroup:: parquet-arrow-reader-factories
+   :content-only:
+
+Parquet writer
+==============
+
+.. doxygenclass:: parquet::WriterProperties
+   :members:
+
+.. doxygenclass:: parquet::ArrowWriterProperties
+   :members:
+
+.. doxygenclass:: parquet::arrow::FileWriter
+   :members:
+
+.. doxygenfunction:: parquet::arrow::WriteTable
+
+.. TODO ORC
diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst
new file mode 100644
index 0000000000000..5be5483ac4c93
--- /dev/null
+++ b/docs/source/cpp/csv.rst
@@ -0,0 +1,144 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+.. cpp:namespace:: arrow::csv
+
+=================
+Reading CSV files
+=================
+
+Arrow provides a fast CSV reader allowing ingestion of external data
+as Arrow tables.
+
+Basic usage
+===========
+
+A CSV file is read from a :class:`~arrow::io::InputStream`.
+
+.. code-block:: cpp
+
+   #include "arrow/csv/api.h"
+
+   {
+      // ...
+      arrow::Status st;
+      arrow::MemoryPool* pool = arrow::default_memory_pool();
+      std::shared_ptr<arrow::io::InputStream> input = ...;
+
+      auto read_options = arrow::csv::ReadOptions::Defaults();
+      auto parse_options = arrow::csv::ParseOptions::Defaults();
+      auto convert_options = arrow::csv::ConvertOptions::Defaults();
+
+      // Instantiate TableReader from input stream and options
+      std::shared_ptr<arrow::csv::TableReader> reader;
+      st = arrow::csv::TableReader::Make(pool, input, read_options,
+                                         parse_options, convert_options,
+                                         &reader);
+      if (!st.ok()) {
+         // Handle TableReader instantiation error...
+      }
+
+      std::shared_ptr<arrow::Table> table;
+      // Read table from CSV file
+      st = reader->Read(&table);
+      if (!st.ok()) {
+         // Handle CSV read error
+         // (for example a CSV syntax error or failed type conversion)
+      }
+   }
+
+Column names
+============
+
+There are three possible ways to infer column names from the CSV file:
+
+* By default, the column names are read from the first row in the CSV file
+* If :member:`ReadOptions::column_names` is set, it forces the column
+  names in the table to these values (the first row in the CSV file is
+  read as data)
+* If :member:`ReadOptions::autogenerate_column_names` is true, column names
+  will be autogenerated with the pattern "f0", "f1"... (the first row in the
+  CSV file is read as data)
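+
+As an illustration, here is a sketch of the latter two settings (the column
+names below are hypothetical):
+
+.. code-block:: cpp
+
+   auto read_options = arrow::csv::ReadOptions::Defaults();
+   // Force explicit column names; the first row in the CSV file
+   // is then read as data
+   read_options.column_names = {"street", "city", "zip"};
+   // Or, instead, leave `column_names` empty and autogenerate
+   // names of the form "f0", "f1"...
+   // read_options.autogenerate_column_names = true;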
+
+Column selection
+================
+
+By default, Arrow reads all columns in the CSV file. You can narrow the
+selection of columns with the :member:`ConvertOptions::include_columns`
+option. If some columns in :member:`ConvertOptions::include_columns`
+are missing from the CSV file, an error will be emitted unless
+:member:`ConvertOptions::include_missing_columns` is true, in which case
+the missing columns are assumed to contain all-null values.
+
+Interaction with column names
+-----------------------------
+
+If both :member:`ReadOptions::column_names` and
+:member:`ConvertOptions::include_columns` are specified,
+the :member:`ReadOptions::column_names` are assumed to map to CSV columns,
+and :member:`ConvertOptions::include_columns` is a subset of those column
+names that will be part of the Arrow Table.
+
+Data types
+==========
+
+By default, the CSV reader infers the most appropriate data type for each
+column. Type inference considers the following data types, in order:
+
+* Null
+* Int64
+* Boolean
+* Timestamp (with seconds unit)
+* Float64
+* String
+* Binary
+
+It is possible to override type inference for select columns by setting
+the :member:`ConvertOptions::column_types` option. Explicit data types
+can be chosen from the following list:
+
+* Null
+* All Integer types
+* Float32 and Float64
+* Decimal128
+* Boolean
+* Timestamp
+* Binary and Large Binary
+* String and Large String (with optional UTF8 input validation)
+* Fixed-Size Binary
+
+Other data types do not support conversion from CSV values and will error out.
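+
+For example, a hypothetical "price" column could be read as a decimal type
+rather than left to type inference (a minimal sketch; the column name and
+the precision and scale are only illustrative):
+
+.. code-block:: cpp
+
+   auto convert_options = arrow::csv::ConvertOptions::Defaults();
+   // Convert the "price" column to decimal(10, 2) instead of relying on
+   // type inference (which would presumably yield Float64)
+   convert_options.column_types["price"] = arrow::decimal(10, 2);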
+
+Nulls
+-----
+
+Null values are recognized from the spellings stored in
+:member:`ConvertOptions::null_values`. The :func:`ConvertOptions::Defaults`
+factory method will initialize a number of conventional null spellings such
+as ``N/A``.
+
+Performance
+===========
+
+By default, the CSV reader will parallelize reads in order to exploit all
+CPU cores on your machine. You can change this setting in
+:member:`ReadOptions::use_threads`. A reasonable expectation is at least
+100 MB/s per core on a modern desktop machine (measured in source CSV bytes,
+not target Arrow data bytes).
diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst
index 4c918b3d96c30..05fea989c668a 100644
--- a/docs/source/cpp/getting_started.rst
+++ b/docs/source/cpp/getting_started.rst
@@ -18,8 +18,8 @@
 .. default-domain:: cpp
 .. highlight:: cpp
 
-Getting Started
-===============
+User Guide
+==========
 
 .. toctree::
 
@@ -30,4 +30,7 @@ Getting Started
    datatypes
    tables
    io
+   parquet
+   csv
+   json
    flight
diff --git a/docs/source/cpp/json.rst b/docs/source/cpp/json.rst
new file mode 100644
index 0000000000000..93dcdfa3c47fb
--- /dev/null
+++ b/docs/source/cpp/json.rst
@@ -0,0 +1,125 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+.. cpp:namespace:: arrow::json
+
+==================
+Reading JSON files
+==================
+
+Arrow allows reading line-separated JSON files as Arrow tables. Each
+independent JSON object in the input file is converted to a row in
+the target Arrow table.
+
+Basic usage
+===========
+
+A JSON file is read from a :class:`~arrow::io::InputStream`.
+
+.. code-block:: cpp
+
+   #include "arrow/json/api.h"
+
+   {
+      // ...
+      arrow::Status st;
+      arrow::MemoryPool* pool = arrow::default_memory_pool();
+      std::shared_ptr<arrow::io::InputStream> input = ...;
+
+      auto read_options = arrow::json::ReadOptions::Defaults();
+      auto parse_options = arrow::json::ParseOptions::Defaults();
+
+      // Instantiate TableReader from input stream and options
+      std::shared_ptr<arrow::json::TableReader> reader;
+      st = arrow::json::TableReader::Make(pool, input, read_options,
+                                          parse_options, &reader);
+      if (!st.ok()) {
+         // Handle TableReader instantiation error...
+      }
+
+      std::shared_ptr<arrow::Table> table;
+      // Read table from JSON file
+      st = reader->Read(&table);
+      if (!st.ok()) {
+         // Handle JSON read error
+         // (for example a JSON syntax error or failed type conversion)
+      }
+   }
+
+Data types
+==========
+
+Since JSON values are typed, the possible Arrow data types on output
+depend on the input value types. Top-level JSON values should always be
+objects. The fields of top-level objects are taken to represent columns
+in the Arrow data. For each name/value pair in a JSON object, there are
+two possible modes of deciding the output data type:
+
+* if the name is in :member:`ParseOptions::explicit_schema`,
+  conversion of the JSON value to the corresponding Arrow data type is
+  attempted;
+
+* otherwise, the Arrow data type is determined via type inference on
+  the JSON value, trying out a number of Arrow data types in order.
+
+The following tables show the possible combinations for each of those
+two modes.
+
+.. table:: Explicit conversions from JSON to Arrow
+   :align: center
+
+   +-----------------+----------------------------------------------------+
+   | JSON value type | Allowed Arrow data types                           |
+   +=================+====================================================+
+   | Null            | Any (including Null)                               |
+   +-----------------+----------------------------------------------------+
+   | Number          | All Integer types, Float32, Float64,               |
+   |                 | Date32, Date64, Time32, Time64                     |
+   +-----------------+----------------------------------------------------+
+   | Boolean         | Boolean                                            |
+   +-----------------+----------------------------------------------------+
+   | String          | Binary, LargeBinary, String, LargeString,          |
+   |                 | Timestamp                                          |
+   +-----------------+----------------------------------------------------+
+   | Array           | List                                               |
+   +-----------------+----------------------------------------------------+
+   | Object (nested) | Struct                                             |
+   +-----------------+----------------------------------------------------+
+
+.. table:: Implicit type inference from JSON to Arrow
+   :align: center
+
+   +-----------------+----------------------------------------------------+
+   | JSON value type | Inferred Arrow data types (in order)               |
+   +=================+====================================================+
+   | Null            | Null, any other                                    |
+   +-----------------+----------------------------------------------------+
+   | Number          | Int64, Float64                                     |
+   +-----------------+----------------------------------------------------+
+   | Boolean         | Boolean                                            |
+   +-----------------+----------------------------------------------------+
+   | String          | Timestamp (with seconds unit), String              |
+   +-----------------+----------------------------------------------------+
+   | Array           | List                                               |
+   +-----------------+----------------------------------------------------+
+   | Object (nested) | Struct                                             |
+   +-----------------+----------------------------------------------------+
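+
+As an illustration, here is a sketch that pins down the type of one field
+and ignores any unexpected fields (the field name is hypothetical):
+
+.. code-block:: cpp
+
+   auto parse_options = arrow::json::ParseOptions::Defaults();
+   // Read the "timestamp" field as a string column, disabling
+   // type inference for it
+   parse_options.explicit_schema =
+       arrow::schema({arrow::field("timestamp", arrow::utf8())});
+   // Fields not covered by the explicit schema are ignored
+   parse_options.unexpected_field_behavior =
+       arrow::json::UnexpectedFieldBehavior::Ignore;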
diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst
new file mode 100644
index 0000000000000..62e342e9e69d5
--- /dev/null
+++ b/docs/source/cpp/parquet.rst
@@ -0,0 +1,69 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+.. cpp:namespace:: parquet::arrow
+
+=================================
+Reading and writing Parquet files
+=================================
+
+The Parquet C++ library is part of the Apache Arrow project and benefits
+from tight integration with Arrow C++.
+
+Reading
+=======
+
+The Parquet :class:`FileReader` requires a :class:`::arrow::io::RandomAccessFile`
+instance representing the input file.
+
+.. code-block:: cpp
+
+   #include "parquet/arrow/reader.h"
+
+   {
+      // ...
+      arrow::Status st;
+      arrow::MemoryPool* pool = arrow::default_memory_pool();
+      std::shared_ptr<arrow::io::RandomAccessFile> input = ...;
+
+      // Open Parquet file reader
+      std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
+      st = parquet::arrow::OpenFile(input, pool, &arrow_reader);
+      if (!st.ok()) {
+         // Handle error instantiating file reader...
+      }
+
+      // Read entire file as a single Arrow table
+      std::shared_ptr<arrow::Table> table;
+      st = arrow_reader->ReadTable(&table);
+      if (!st.ok()) {
+         // Handle error reading Parquet data...
+      }
+   }
+
+Finer-grained options are available through the :class:`FileReaderBuilder`
+helper class.
+
+.. TODO write section about performance and memory efficiency
+
+Writing
+=======
+
+TODO: write this
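+
+In the meantime, the :func:`WriteTable` function can be used; here is a
+minimal sketch (the chunk size of 2048 rows is arbitrary):
+
+.. code-block:: cpp
+
+   #include "parquet/arrow/writer.h"
+
+   {
+      // ...
+      arrow::Status st;
+      std::shared_ptr<arrow::Table> table = ...;
+      std::shared_ptr<arrow::io::OutputStream> outfile = ...;
+
+      // Write the table to a Parquet file, cutting it into
+      // row groups of (at most) 2048 rows
+      st = parquet::arrow::WriteTable(*table, arrow::default_memory_pool(),
+                                      outfile, /*chunk_size=*/2048);
+      if (!st.ok()) {
+         // Handle error writing Parquet data...
+      }
+   }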