Skip to content

Commit

Permalink
Merge branch 'master' into ARROW-6657
Browse files Browse the repository at this point in the history
  • Loading branch information
sinistersnare authored Sep 26, 2019
2 parents 6c57262 + dec0cfb commit b93c61a
Show file tree
Hide file tree
Showing 102 changed files with 3,607 additions and 450 deletions.
32 changes: 30 additions & 2 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1841,8 +1841,8 @@ This project includes code from the autobrew project.
* r/tools/autobrew and dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb
are based on code from the autobrew project.

Copyright: Copyright (c) 2017 - 2019, Jeroen Ooms.
All rights reserved.
Copyright (c) 2019, Jeroen Ooms
License: MIT
Homepage: https://github.com/jeroen/autobrew

--------------------------------------------------------------------------------
Expand Down Expand Up @@ -1874,3 +1874,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

----------------------------------------------------------------------

cpp/src/arrow/vendored/base64.cpp has the following license

ZLIB License

Copyright (C) 2004-2017 René Nyffenegger

This source code is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages arising
from the use of this software.

Permission is granted to anyone to use this software for any purpose, including
commercial applications, and to alter it and redistribute it freely, subject to
the following restrictions:

1. The origin of this source code must not be misrepresented; you must not
claim that you wrote the original source code. If you use this source code
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.

2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original source code.

3. This notice may not be removed or altered from any source distribution.

René Nyffenegger rene.nyffenegger@adp-gmbh.ch
2 changes: 2 additions & 0 deletions cpp/apidoc/Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -2074,7 +2074,9 @@ INCLUDE_FILE_PATTERNS =

PREDEFINED = __attribute__(x)= \
__declspec(x)= \
PARQUET_EXPORT= \
ARROW_EXPORT= \
ARROW_FLIGHT_EXPORT= \
ARROW_EXTERN_TEMPLATE= \
ARROW_DEPRECATED(x)=

Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ set(ARROW_SRCS
filesystem/filesystem.cc
filesystem/localfs.cc
filesystem/mockfs.cc
filesystem/path_tree.cc
filesystem/path_util.cc
filesystem/util_internal.cc
io/buffered.cc
Expand Down Expand Up @@ -145,6 +146,7 @@ set(ARROW_SRCS
util/thread_pool.cc
util/trie.cc
util/utf8.cc
vendored/base64.cpp
vendored/datetime/tz.cpp)

# Add dependencies for third-party allocators.
Expand Down
10 changes: 9 additions & 1 deletion cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1234,6 +1234,7 @@ struct ValidateVisitor {
}

Status Visit(const StructArray& array) {
const auto& struct_type = checked_cast<const StructType&>(*array.type());
if (array.num_fields() > 0) {
// Validate fields
int64_t array_length = array.field(0)->length();
Expand All @@ -1245,10 +1246,17 @@ struct ValidateVisitor {
it->type()->ToString(), " at position [", idx, "]");
}

auto it_type = struct_type.child(i)->type();
if (!it->type()->Equals(it_type)) {
return Status::Invalid("Child array at position [", idx,
"] does not match type field: ", it->type()->ToString(),
" vs ", it_type->ToString());
}

const Status child_valid = it->Validate();
if (!child_valid.ok()) {
return Status::Invalid("Child array invalid: ", child_valid.ToString(),
" at position [", idx, "}");
" at position [", idx, "]");
}
++idx;
}
Expand Down
77 changes: 43 additions & 34 deletions cpp/src/arrow/csv/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,82 +32,91 @@ class DataType;

namespace csv {

// Silly workaround for https://github.com/michaeljones/breathe/issues/453
constexpr char kDefaultEscapeChar = '\\';

struct ARROW_EXPORT ParseOptions {
// Parsing options

// Field delimiter
/// Field delimiter
char delimiter = ',';
// Whether quoting is used
/// Whether quoting is used
bool quoting = true;
// Quoting character (if `quoting` is true)
/// Quoting character (if `quoting` is true)
char quote_char = '"';
// Whether a quote inside a value is double-quoted
/// Whether a quote inside a value is double-quoted
bool double_quote = true;
// Whether escaping is used
/// Whether escaping is used
bool escaping = false;
// Escaping character (if `escaping` is true)
char escape_char = '\\';
// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
/// Escaping character (if `escaping` is true)
char escape_char = kDefaultEscapeChar;
/// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
bool newlines_in_values = false;
// Whether empty lines are ignored. If false, an empty line represents
// a single empty value (assuming a one-column CSV file).
/// Whether empty lines are ignored. If false, an empty line represents
/// a single empty value (assuming a one-column CSV file).
bool ignore_empty_lines = true;

/// Create parsing options with default values
static ParseOptions Defaults();
};

struct ARROW_EXPORT ConvertOptions {
// Conversion options

// Whether to check UTF8 validity of string columns
/// Whether to check UTF8 validity of string columns
bool check_utf8 = true;
// Optional per-column types (disabling type inference on those columns)
/// Optional per-column types (disabling type inference on those columns)
std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
// Recognized spellings for null values
/// Recognized spellings for null values
std::vector<std::string> null_values;
// Recognized spellings for boolean values
/// Recognized spellings for boolean true values
std::vector<std::string> true_values;
/// Recognized spellings for boolean false values
std::vector<std::string> false_values;
// Whether string / binary columns can have null values.
// If true, then strings in "null_values" are considered null for string columns.
// If false, then all strings are valid string values.
/// Whether string / binary columns can have null values.
///
/// If true, then strings in "null_values" are considered null for string columns.
/// If false, then all strings are valid string values.
bool strings_can_be_null = false;

// XXX Should we have a separate FilterOptions?

// If non-empty, indicates the names of columns from the CSV file that should
// be actually read and converted (in the vector's order).
// Columns not in this vector will be ignored.
/// If non-empty, indicates the names of columns from the CSV file that should
/// be actually read and converted (in the vector's order).
/// Columns not in this vector will be ignored.
std::vector<std::string> include_columns;
// If false, columns in `include_columns` but not in the CSV file will error out.
// If true, columns in `include_columns` but not in the CSV file will produce
// a column of nulls (whose type is selected using `column_types`,
// or null by default)
// This option is ignored if `include_columns` is empty.
/// If false, columns in `include_columns` but not in the CSV file will error out.
/// If true, columns in `include_columns` but not in the CSV file will produce
/// a column of nulls (whose type is selected using `column_types`,
/// or null by default)
/// This option is ignored if `include_columns` is empty.
bool include_missing_columns = false;

/// Create conversion options with default values, including conventional
/// values for `null_values`, `true_values` and `false_values`
static ConvertOptions Defaults();
};

struct ARROW_EXPORT ReadOptions {
// Reader options

// Whether to use the global CPU thread pool
/// Whether to use the global CPU thread pool
bool use_threads = true;
// Block size we request from the IO layer; also determines the size of
// chunks when use_threads is true
/// Block size we request from the IO layer; also determines the size of
/// chunks when use_threads is true
int32_t block_size = 1 << 20; // 1 MB

// Number of header rows to skip (not including the row of column names, if any)
/// Number of header rows to skip (not including the row of column names, if any)
int32_t skip_rows = 0;
// Column names for the target table.
// If empty, fall back on autogenerate_column_names.
/// Column names for the target table.
/// If empty, fall back on autogenerate_column_names.
std::vector<std::string> column_names;
// Whether to autogenerate column names if `column_names` is empty.
// If true, column names will be of the form "f0", "f1"...
// If false, column names will be read from the first CSV row after `skip_rows`.
/// Whether to autogenerate column names if `column_names` is empty.
/// If true, column names will be of the form "f0", "f1"...
/// If false, column names will be read from the first CSV row after `skip_rows`.
bool autogenerate_column_names = false;

/// Create read options with default values
static ReadOptions Defaults();
};

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/csv/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,15 @@ class InputStream;

namespace csv {

/// A class that reads an entire CSV file into an Arrow Table
class ARROW_EXPORT TableReader {
public:
virtual ~TableReader() = default;

/// Read the entire CSV file and convert it to an Arrow Table
virtual Status Read(std::shared_ptr<Table>* out) = 0;

/// Create a TableReader instance
static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input,
const ReadOptions&, const ParseOptions&, const ConvertOptions&,
std::shared_ptr<TableReader>* out);
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ arrow_install_all_headers("arrow/filesystem")

add_arrow_test(filesystem_test)
add_arrow_test(localfs_test)
add_arrow_test(path_tree_test)

if(ARROW_S3)
add_arrow_test(s3fs_test)
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ std::string FileStats::base_name() const {
return internal::GetAbstractPathParent(path_).second;
}

// Directory part of the path: the `first` member of the result of
// internal::GetAbstractPathParent() (base_name() returns its `second` member).
std::string FileStats::dir_name() const {
  auto parent_and_base = internal::GetAbstractPathParent(path_);
  return parent_and_base.first;
}

// Debug helper
std::ostream& operator<<(std::ostream& os, const FileStats& stats) {
return os << "FileStats(" << stats.type() << ", " << stats.path() << ")";
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ struct ARROW_EXPORT FileStats {
/// The file base name (component after the last directory separator)
std::string base_name() const;

/// The directory name (component before the file base name)
std::string dir_name() const;

/// The size in bytes, if available
///
/// Only regular files are guaranteed to have a size.
Expand All @@ -110,6 +113,9 @@ struct ARROW_EXPORT FileStats {
TimePoint mtime() const { return mtime_; }
void set_mtime(TimePoint mtime) { mtime_ = mtime; }

/// Whether the type of this entry is FileType::File
bool IsFile() const {
  return FileType::File == type_;
}
/// Whether the type of this entry is FileType::Directory
bool IsDirectory() const {
  return FileType::Directory == type_;
}

bool operator==(const FileStats& other) const {
return type() == other.type() && path() == other.path() && size() == other.size() &&
mtime() == other.mtime();
Expand Down
Loading

0 comments on commit b93c61a

Please sign in to comment.