Skip to content

Commit

Permalink
Use arrow C++ for CSV parsing in JS
Browse files Browse the repository at this point in the history
  • Loading branch information
texodus committed Sep 26, 2020
1 parent 459c78b commit 229f764
Show file tree
Hide file tree
Showing 29 changed files with 1,307 additions and 319 deletions.
32 changes: 22 additions & 10 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,12 @@ jobs:
- bash: yarn
displayName: 'Install Deps'

- bash: python -m pip install "black==20.8b1" flake8-black
condition: and(succeeded(), ne(variables['python.version'], '2.7'))
displayName: 'Install Python deps'
- script: |
python -m pip install "black==20.8b1" flake8-black
displayName: "Python3 deps"
condition: not(eq(variables['python.version'], '2.7'))
- bash: yarn build_python --ci $(python_flag) $(manylinux_flag)
- bash: yarn build_python --ci $(python_flag) $(manylinux_flag)
displayName: 'build'
env:
PSP_DOCKER: 1
Expand Down Expand Up @@ -181,7 +182,7 @@ jobs:
# displayName: "Which python"

# - script: |
# python -m pip install numpy "pyarrow>=0.16.0,<1"
# python -m pip install numpy "pyarrow>=1.0.1,<2"
# displayName: "Python deps"

# - script: npm install -g yarn
Expand Down Expand Up @@ -252,8 +253,14 @@ jobs:
displayName: "Which python"
- script: |
python -m pip install delocate wheel numpy "black==20.8b1" flake8-black "pyarrow>=0.16.0,<1"
displayName: "Python deps"
python -m pip install delocate wheel numpy "black==20.8b1" flake8-black "pyarrow>=1.0.1,<2"
displayName: "Python3 deps"
condition: not(eq(variables['python.version'], '2.7'))
- script: |
python -m pip install delocate wheel numpy "black==20.8b1" flake8-black "pyarrow<2"
displayName: "Python2 deps"
condition: eq(variables['python.version'], '2.7')
- script: npm install -g yarn
displayName: "Install Yarn"
Expand Down Expand Up @@ -295,7 +302,6 @@ jobs:

strategy:
matrix:

Python27:
python.version: '2.7'
python_flag: '--python2'
Expand Down Expand Up @@ -338,8 +344,14 @@ jobs:
displayName: "Which python"
- script: |
python -m pip install delocate wheel numpy "pyarrow>=0.16.0,<1"
displayName: "Python deps"
python -m pip install delocate wheel numpy "pyarrow>=1.0.1,<2"
displayName: "Python3 deps"
condition: not(eq(variables['python.version'], '2.7'))
- script: |
python -m pip install delocate wheel numpy "pyarrow<2"
displayName: "Python2 deps"
condition: eq(variables['python.version'], '2.7')
- script: npm install -g yarn
displayName: "Install Yarn"
Expand Down
12 changes: 10 additions & 2 deletions cmake/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ set(ARROW_SRCS
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/converter.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/chunker.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/column_builder.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/column_decoder.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/options.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/parser.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/reader.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/filesystem/filesystem.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/filesystem/localfs.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/filesystem/mockfs.cc
Expand All @@ -58,20 +58,22 @@ set(ARROW_SRCS
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/json/reader.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/buffered.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/compressed.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/file.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/interfaces.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/memory.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/testing/util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/basic_decimal.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bit_block_counter.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bit_util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bitmap_builders.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bitmap_ops.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/compression.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/cpu_info.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/decimal.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/future.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/delimiting.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/int_util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/io_util.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/iterator.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/logging.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/key_value_metadata.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/memory.cc
Expand All @@ -81,6 +83,12 @@ set(ARROW_SRCS
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/thread_pool.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/trie.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/utf8.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/value_parsing.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/double-conversion.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/cached-powers.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/diy-fp.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/bignum.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/strtod.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/datetime/tz.cpp
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/dictionary.cc
${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/feather.cc
Expand Down
1 change: 0 additions & 1 deletion cmake/modules/FindFlatbuffers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ if(NOT ${FLATBUFFERS_INCLUDE_DIR})
set(FLATBUFFERS_INCLUDE_DIR /usr/local/include)
endif()

message("${FLATBUFFERS_COMPILER}")
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FLATBUFFERS REQUIRED_VARS
FLATBUFFERS_INCLUDE_DIR FLATBUFFERS_COMPILER)
43 changes: 35 additions & 8 deletions cmake/modules/FindPyArrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -24,32 +24,59 @@ execute_process(
"from __future__ import print_function\ntry: import pyarrow; print(' '.join(pyarrow.get_libraries()), end='')\nexcept:pass"
OUTPUT_VARIABLE __pyarrow_libraries)

# And the version
execute_process(
COMMAND "${Python_EXECUTABLE}" -c
"from __future__ import print_function\ntry: import pyarrow; print(pyarrow.__version__, end='')\nexcept:pass"
OUTPUT_VARIABLE __pyarrow_version)

find_path(PYTHON_PYARROW_INCLUDE_DIR arrow/python/api.h
HINTS "${__pyarrow_path}" "${PYTHON_INCLUDE_PATH}" NO_DEFAULT_PATH)

set(PYTHON_PYARROW_LIBRARY_DIR ${__pyarrow_library_dirs})

# Figure out the major version for the .so/.dylibs
string(REPLACE "." ";" PYARROW_VERSION_LIST ${__pyarrow_version})
list(GET PYARROW_VERSION_LIST 0 PYARROW_VERSION_MAJOR)
list(GET PYARROW_VERSION_LIST 1 PYARROW_VERSION_MINOR)
list(GET PYARROW_VERSION_LIST 2 PYARROW_VERSION_PATCH)

if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
# On Windows it's just "arrow.dll"
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY "arrow_python")
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY "arrow")
set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
# Link against pre-built libarrow on MacOS
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND ${PYARROW_VERSION_MAJOR} EQUAL "1")
# Link against pre-built libarrow on MacOS
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python.100.dylib)
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow.100.dylib)
set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})
else()
elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
# Link against pre-built libarrow on MacOS
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python.${PYARROW_VERSION_MINOR}.dylib)
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow.${PYARROW_VERSION_MINOR}.dylib)
elseif (${PYARROW_VERSION_MAJOR} EQUAL "1")
# linux
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python${CMAKE_SHARED_LIBRARY_SUFFIX})
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}arrow${CMAKE_SHARED_LIBRARY_SUFFIX})
set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python${CMAKE_SHARED_LIBRARY_SUFFIX}.100)
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow${CMAKE_SHARED_LIBRARY_SUFFIX}.100)
else()
set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})
set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${PYTHON_PYARROW_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}arrow${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})
endif()

set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})

if(PYTHON_PYARROW_INCLUDE_DIR AND PYTHON_PYARROW_LIBRARIES)
set(PYTHON_PYARROW_FOUND 1 CACHE INTERNAL "Python pyarrow found")
endif()


# set(PYTHON_PYARROW_LIBRARIES ${PYTHON_PYARROW_PYTHON_SHARED_LIBRARY} ${PYTHON_PYARROW_ARROW_SHARED_LIBRARY})
# else()
# # linux
# set(PYTHON_PYARROW_PYTHON_SHARED_LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}arrow_python${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})
# set(PYTHON_PYARROW_ARROW_SHARED_LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}arrow${CMAKE_SHARED_LIBRARY_SUFFIX}.${PYARROW_VERSION_MINOR})



include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PyArrow REQUIRED_VARS PYTHON_PYARROW_INCLUDE_DIR PYTHON_PYARROW_LIBRARIES PYTHON_PYARROW_LIBRARY_DIR
VERSION_VAR __pyarrow_version)
34 changes: 16 additions & 18 deletions cpp/perspective/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,10 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
include_directories( ${Python_INCLUDE_DIRS} )

if(MACOS)
# don't link against build python
# https://blog.tim-smith.us/2015/09/python-extension-modules-os-x/
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -undefined dynamic_lookup")

# on mac, use the vanilla pybind11 finder
find_package(pybind11)
if(pybind11_FOUND)
Expand All @@ -383,11 +387,6 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
else()
# Check if pip installed PyBind is available
find_package(Pybind)
if(PYTHON_PYBIND_FOUND)
# Need to add extra flags due to pybind weirdness
# https://github.com/pybind/pybind11/blob/7830e8509f2adc97ce9ee32bf99cd4b82089cc4c/tools/pybind11Tools.cmake#L103
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -undefined dynamic_lookup")
endif()
endif()

else()
Expand Down Expand Up @@ -549,7 +548,13 @@ set (SOURCE_FILES

set(PYTHON_SOURCE_FILES ${SOURCE_FILES}
${PSP_PYTHON_SRC}/src/column.cpp
)
)

set(WASM_SOURCE_FILES ${SOURCE_FILES}
${PSP_CPP_SRC}/src/cpp/arrow_csv.cpp
${PSP_CPP_SRC}/src/cpp/vendor/arrow_single_threaded_reader.cpp
)


set (PYTHON_BINDING_SOURCE_FILES
${PSP_PYTHON_SRC}/src/accessor.cpp
Expand All @@ -571,7 +576,7 @@ else()
endif()

if (PSP_WASM_BUILD)
add_library(psp ${SOURCE_FILES})
add_library(psp ${WASM_SOURCE_FILES})
target_compile_definitions(psp PRIVATE PSP_ENABLE_WASM=1)
set_target_properties(psp PROPERTIES COMPILE_FLAGS "${ASYNC_MODE_FLAGS}")
target_link_libraries(psp arrow)
Expand Down Expand Up @@ -642,20 +647,13 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
endif()

target_link_libraries(psp ${PYTHON_PYARROW_LIBRARIES})
target_link_libraries(binding ${PYTHON_PYARROW_LIBRARIES})

if(WIN32)
# Don't link

else()
target_link_libraries(psp ${PYTHON_LIBRARIES})
target_link_libraries(binding ${PYTHON_LIBRARIES})
endif()
target_link_libraries(binding ${PYTHON_PYARROW_LIBRARIES})

target_link_libraries(psp tbb)
target_link_libraries(binding tbb)

target_link_libraries(binding psp)
target_compile_options(psp PRIVATE -Wno-deprecated-register)
target_compile_options(binding PRIVATE -Wno-deprecated-register)

# The compiled libraries will be put in CMAKE_LIBRARY_OUTPUT_DIRECTORY by default. In the
# setup.py file, we designate this to be in the build/lib.<platform> directory. However,
Expand All @@ -675,7 +673,7 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
endif()
########################
else()
add_library(psp SHARED ${SOURCE_FILES})
add_library(psp SHARED ${WASM_SOURCE_FILES})

# Link perspective against custom-built minimal arrow
target_link_libraries(psp arrow)
Expand Down
60 changes: 60 additions & 0 deletions cpp/perspective/src/cpp/arrow_csv.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/******************************************************************************
*
* Copyright (c) 2019, the Perspective Authors.
*
* This file is part of the Perspective library, distributed under the terms of
* the Apache License 2.0. The full license can be found in the LICENSE file.
*
*/

#include <perspective/base.h>
#include <perspective/arrow_csv.h>
#include <arrow/util/value_parsing.h>
#include <arrow/io/memory.h>

// This causes build warnings
// https://github.com/emscripten-core/emscripten/issues/8574
#include <perspective/vendor/arrow_single_threaded_reader.h>

namespace perspective {
namespace apachearrow {

/**
 * @brief Parse CSV text into an Arrow table with Arrow's CSV reader.
 *
 * The reader is configured with `use_threads = false` and with an explicit
 * list of timestamp parsers (ISO-8601 plus several strptime formats) that
 * are tried during type conversion.
 *
 * @param csv       CSV text to parse. It is handed to an
 *                  `arrow::io::BufferReader`; NOTE(review): the reader
 *                  presumably views rather than copies this string, so it
 *                  must stay alive for the duration of the call - confirm.
 * @param is_update When true, `schema` is passed to the reader as the
 *                  explicit per-column types. When false, `schema` is not
 *                  read.
 * @param schema    Column name -> Arrow data type. Copied, never modified.
 * @return The table produced by the reader. Failure to construct the reader
 *         or to read the table is reported through PSP_COMPLAIN_AND_ABORT
 *         with the Arrow status text.
 */
std::shared_ptr<::arrow::Table>
csvToTable(std::string& csv, bool is_update,
    std::unordered_map<std::string, std::shared_ptr<arrow::DataType>>&
        schema) {
    arrow::MemoryPool* pool = arrow::default_memory_pool();
    auto input = std::make_shared<arrow::io::BufferReader>(csv);
    auto read_options = arrow::csv::ReadOptions::Defaults();
    auto parse_options = arrow::csv::ParseOptions::Defaults();
    auto convert_options = arrow::csv::ConvertOptions::Defaults();

    read_options.use_threads = false;
    convert_options.timestamp_parsers
        = std::vector<std::shared_ptr<arrow::TimestampParser>>{
            arrow::TimestampParser::MakeISO8601(),
            arrow::TimestampParser::MakeStrptime("%Y-%m-%d\\D%H:%M:%S.%f"),
            arrow::TimestampParser::MakeStrptime("%m-%d-%Y"),
            arrow::TimestampParser::MakeStrptime("%m/%d/%Y"),
            arrow::TimestampParser::MakeStrptime("%d %m %Y"),
            arrow::TimestampParser::MakeStrptime("%H:%M:%S.%f"),
        };

    if (is_update) {
        // Copy instead of moving: `schema` is a caller-owned lvalue
        // reference, and moving from it would leave the caller's map in a
        // moved-from state.
        convert_options.column_types = schema;
    }

    auto maybe_reader = arrow::csv::TableReader::Make(
        pool, input, read_options, parse_options, convert_options);
    // Check before dereferencing so a construction failure goes through the
    // same error path as a read failure below.
    if (!maybe_reader.ok()) {
        PSP_COMPLAIN_AND_ABORT(maybe_reader.status().ToString());
    }

    std::shared_ptr<arrow::csv::TableReader> reader = *maybe_reader;

    auto maybe_table = reader->Read();
    if (!maybe_table.ok()) {
        PSP_COMPLAIN_AND_ABORT(maybe_table.status().ToString());
    }
    return *maybe_table;
}

} // namespace apachearrow
} // namespace perspective
Loading

0 comments on commit 229f764

Please sign in to comment.