From 9be080fc7747a701c57c86cbe48b33bd48dcba07 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 17 Jan 2018 12:21:08 -0500 Subject: [PATCH 1/5] ARROW-1991: [Website] Fix Docker documentation build @kou something is failing here, I'm not sure what's different here vs. Travis CI (not sure if this is what's failing the doc build): ``` DOC Building HTML ../arrow-glib-docs.xml:25: warning: failed to load external entity "../xml/gtkdocentities.ent" %gtkdocentities; ^ Entity: line 1: %gtkdocentities; ^ ../arrow-glib-docs.xml:29: parser error : Entity 'package_name' not defined &package_name; Reference Manual ^ ../arrow-glib-docs.xml:31: parser error : Entity 'package_string' not defined for &package_string;. ^ warning: failed to load external entity "../xml/basic-array.xml" ../arrow-glib-docs.xml:43: element include: XInclude error : could not load ../xml/basic-array.xml, and no fallback was found warning: failed to load external entity "../xml/composite-array.xml" ../arrow-glib-docs.xml:44: element include: XInclude error : could not load ../xml/composite-array.xml, and no fallback was found ../xml/array-builder.xml:25: warning: failed to load external entity "../xml/xml/gtkdocentities.ent" %gtkdocentities; ``` Author: Wes McKinney Author: Kouhei Sutou Closes #1472 from wesm/fix-gen-apidocs and squashes the following commits: 5b907acd [Wes McKinney] Add explicit instructions for uploading API docs to website 5734a651 [Wes McKinney] Use JDK7 for Java on Ubuntu 16.04 0fcb1e11 [Wes McKinney] Use gcc 4.9 rather than default gcc because of gcc5 ABI issues dbf8be8e [Kouhei Sutou] Disable auto-reconfigure b1b5050b [Kouhei Sutou] Fix GLib doc build 8b2d7e43 [Wes McKinney] Fixes for glib doc build 9da9e143 [Wes McKinney] Add BOOST_ROOT --- c_glib/arrow-glib/Makefile.am | 7 ++- dev/docker-compose.yml | 4 +- dev/gen_apidocs/Dockerfile | 20 +++++--- dev/gen_apidocs/create_documents.sh | 73 +++++++++++++++++++---------- dev/release/RELEASE_MANAGEMENT.md | 9 ++++ 5 files changed, 78 insertions(+), 35 deletions(-) diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am index 16d0703142798..9148f8a583a14 100644 --- a/c_glib/arrow-glib/Makefile.am +++ b/c_glib/arrow-glib/Makefile.am @@ -16,6 +16,7 @@ # under the License. CLEANFILES = +DISTCLEANFILES = EXTRA_DIST = \ meson.build @@ -169,6 +170,10 @@ BUILT_SOURCES = \ stamp-enums.c \ stamp-enums.h +DISTCLEANFILES += \ + stamp-enums.c \ + stamp-enums.h + EXTRA_DIST += \ enums.c.template \ enums.h.template @@ -214,7 +219,7 @@ INTROSPECTION_SCANNER_ARGS = INTROSPECTION_SCANNER_ENV = if USE_ARROW_BUILD_DIR INTROSPECTION_SCANNER_ENV += \ - LD_LIBRARY_PATH=$(ARROW_LIB_DIR):$${PKG_CONFIG_PATH} + LD_LIBRARY_PATH=$(ARROW_LIB_DIR):$${LD_LIBRARY_PATH} endif if OS_MACOS INTROSPECTION_SCANNER_ENV += \ diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index 4b9014894003b..a73fd1bfbbaf0 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -17,7 +17,7 @@ version: '3' services: gen_apidocs: - build: + build: context: gen_apidocs volumes: - ../..:/apache-arrow @@ -29,7 +29,7 @@ services: volumes: - ../..:/apache-arrow dask_integration: - build: + build: context: dask_integration volumes: - ../..:/apache-arrow diff --git a/dev/gen_apidocs/Dockerfile b/dev/gen_apidocs/Dockerfile index ca4718e637842..da740ee0773d2 100644 --- a/dev/gen_apidocs/Dockerfile +++ b/dev/gen_apidocs/Dockerfile @@ -14,19 +14,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 #
-FROM ubuntu:14.04
-# Prerequsites for apt-add-repository
-RUN apt-get update && apt-get install -y \
-    software-properties-common python-software-properties
+FROM ubuntu:16.04
+
 # Basic OS dependencies
-RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
-    apt-get update && apt-get install -y \
+RUN apt-get update && apt-get install -y \
        wget \
        rsync \
        git \
        gcc-4.9 \
        g++-4.9 \
-       build-essential
+       build-essential \
+       software-properties-common
+
+# Java build fails with default JDK8
+RUN add-apt-repository ppa:openjdk-r/ppa &&\
+    apt-get update &&\
+    apt-get install -y openjdk-7-jdk &&\
+    update-java-alternatives -s java-1.7.0-openjdk-amd64
+
 # This will install conda in /home/ubuntu/miniconda
 RUN wget -O /tmp/miniconda.sh \
     https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
@@ -73,6 +78,7 @@ RUN /home/ubuntu/miniconda/bin/conda create -y -q -n pyarrow-dev \
     doxygen \
     maven \
     -c conda-forge
+
 ADD . /apache-arrow
 WORKDIR /apache-arrow
 CMD arrow/dev/gen_apidocs/create_documents.sh
diff --git a/dev/gen_apidocs/create_documents.sh b/dev/gen_apidocs/create_documents.sh
index 566d9cee79c85..54031262b3a5d 100755
--- a/dev/gen_apidocs/create_documents.sh
+++ b/dev/gen_apidocs/create_documents.sh
@@ -27,6 +27,7 @@ export ARROW_HOME=$(pwd)/dist
 export PARQUET_HOME=$(pwd)/dist
 CONDA_BASE=/home/ubuntu/miniconda
 export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH}
+export PKG_CONFIG_PATH=$(pwd)/dist/lib/pkgconfig:${PKG_CONFIG_PATH}
 export PATH=${CONDA_BASE}/bin:${PATH}
 
 # Prepare the asf-site before copying api docs
@@ -38,16 +39,38 @@ git clone --branch=asf-site \
     https://git-wip-us.apache.org/repos/asf/arrow-site.git asf-site
 popd
 
+# Make Java documentation
+export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
+wget http://mirrors.gigenet.com/apache/maven/maven-3/3.5.2/binaries/apache-maven-3.5.2-bin.tar.gz
+tar xvf apache-maven-3.5.2-bin.tar.gz
+export PATH=$(pwd)/apache-maven-3.5.2/bin:$PATH
+
+pushd arrow/java
+rm -rf target/site/apidocs/*
+mvn -Drat.skip=true install
+mvn -Drat.skip=true site
+mkdir -p ../site/asf-site/docs/java/
+rsync -r target/site/apidocs/ ../site/asf-site/docs/java/
+popd
+
 # Make Python documentation (Depends on C++ )
 # Build Arrow C++
 source activate pyarrow-dev
 export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX
+export BOOST_ROOT=$CONDA_PREFIX
 export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX
+export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:${LD_LIBRARY_PATH}
+export PKG_CONFIG_PATH=$CONDA_PREFIX/lib/pkgconfig:${PKG_CONFIG_PATH}
+
+export CC=gcc-4.9
+export CXX=g++-4.9
 
-rm -rf arrow/cpp/build_docs
-mkdir arrow/cpp/build_docs
-pushd arrow/cpp/build_docs
+CPP_BUILD_DIR=$(pwd)/arrow/cpp/build_docs
+
+rm -rf $CPP_BUILD_DIR
+mkdir $CPP_BUILD_DIR
+pushd $CPP_BUILD_DIR
 cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
       -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
       -DARROW_PYTHON=on \
@@ -58,6 +81,28 @@ make -j4
 make install
 popd
 
+# Build c_glib documentation
+pushd arrow/c_glib
+if [ -f Makefile ]; then
+  # Update timestamps so that make does not trigger an automatic re-configure
+  touch configure **/Makefile
+  make distclean
+  # Work around 'make distclean' removing doc/reference/xml/
+  git checkout doc/reference/xml
+fi
+./autogen.sh
+rm -rf build_docs
+mkdir build_docs
+pushd build_docs
+../configure \
+  --prefix=${ARROW_HOME} \
+  --enable-gtk-doc
+make -j4 GTK_DOC_V_XREF=": "
+mkdir -p ../../site/asf-site/docs/c_glib
+rsync -r doc/reference/html/ ../../site/asf-site/docs/c_glib
+popd
+popd
+
 # Build Parquet C++
 rm -rf parquet-cpp/build_docs
 mkdir parquet-cpp/build_docs
@@ -83,19 +128,6 @@ mkdir -p ../site/asf-site/docs/python
 rsync -r doc/_build/html/ ../site/asf-site/docs/python
 popd
 
-# Build c_glib documentation
-pushd arrow/c_glib
-rm -rf doc/reference/html/*
-./autogen.sh
-./configure \
-    --with-arrow-cpp-build-dir=$(pwd)/../cpp/build \
-    --with-arrow-cpp-build-type=$ARROW_BUILD_TYPE \
-    --enable-gtk-doc
-LD_LIBRARY_PATH=$(pwd)/../cpp/build/$ARROW_BUILD_TYPE make GTK_DOC_V_XREF=": "
-mkdir -p ../site/asf-site/docs/c_glib
-rsync -r doc/reference/html/ ../site/asf-site/docs/c_glib
-popd
-
 # Make C++ documentation
 pushd arrow/cpp/apidoc
 rm -rf html/*
@@ -103,12 +135,3 @@ doxygen Doxyfile
 mkdir -p ../../site/asf-site/docs/cpp
 rsync -r html/ ../../site/asf-site/docs/cpp
 popd
-
-# Make Java documentation
-pushd arrow/java
-rm -rf target/site/apidocs/*
-mvn -Drat.skip=true install
-mvn -Drat.skip=true site
-mkdir -p ../site/asf-site/docs/java/
-rsync -r target/site/apidocs/ ../site/asf-site/docs/java/
-popd
diff --git a/dev/release/RELEASE_MANAGEMENT.md b/dev/release/RELEASE_MANAGEMENT.md
index 73eaf5f95b3b0..0f8c2202fef51 100644
--- a/dev/release/RELEASE_MANAGEMENT.md
+++ b/dev/release/RELEASE_MANAGEMENT.md
@@ -112,6 +112,15 @@ software must be built in order to create the documentation, so this step may
 take some time to run, especially the first time around as the Docker
 container will also have to be built.
 
+To upload the updated documentation to the website, navigate to `site/asf-site`
+and commit all changes:
+
+```
+pushd site/asf-site
+git add .
+git commit -m "Updated API documentation for version X.Y.Z"
+```
+
 After successfully creating the API documentation the website can be run
 locally to browse the API documentation from the top level `Documentation`
 menu. To run the website issue the command:

From 9e2fc04ddf80d2a04ec789a7ca82c2929d31f967 Mon Sep 17 00:00:00 2001
From: kmiku7
Date: Wed, 17 Jan 2018 12:26:31 -0500
Subject: [PATCH 2/5] ARROW-2002: [Python] Check that write_queue is not full
 and writer_thread is alive before enqueuing a new record when downloading a
 file

Downloading a file with pyarrow will sometimes raise queue.Full exceptions.

jira: https://issues.apache.org/jira/browse/ARROW-2002

Author: kmiku7

Closes #1485 from kmiku7/master and squashes the following commits:

8d5f905d [kmiku7] fix queue.Full exception when writer thread writes data slowly.
722182b8 [kmiku7] Merge pull request #1 from apache/master --- python/pyarrow/io.pxi | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 619ba365c2df7..5449872ff101f 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -312,6 +312,12 @@ cdef class NativeFile: pybuf = cp.PyBytes_FromStringAndSize(buf, bytes_read) + if writer_thread.is_alive(): + while write_queue.full(): + time.sleep(0.01) + else: + break + write_queue.put_nowait(pybuf) finally: free(buf) From 1ffce26e3fc659521f42984d8e87bbb93be0ed2c Mon Sep 17 00:00:00 2001 From: "Korn, Uwe" Date: Wed, 17 Jan 2018 14:50:09 -0500 Subject: [PATCH 3/5] ARROW-1856: [Python] Auto-detect Parquet ABI version when using PARQUET_HOME Author: Korn, Uwe Closes #1477 from xhochy/ARROW-1856 and squashes the following commits: a34ade44 [Korn, Uwe] ARROW-1856: [Python] Auto-detect Parquet ABI version when using PARQUET_HOME --- cpp/cmake_modules/FindParquet.cmake | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake index 0339ec56ae20a..8bbe05f127f87 100644 --- a/cpp/cmake_modules/FindParquet.cmake +++ b/cpp/cmake_modules/FindParquet.cmake @@ -60,8 +60,22 @@ if(PARQUET_HOME) PATHS ${PARQUET_HOME} NO_DEFAULT_PATH PATH_SUFFIXES "lib") get_filename_component(PARQUET_LIBS ${PARQUET_LIBRARIES} PATH ) - set(PARQUET_ABI_VERSION "1.0.0") - set(PARQUET_SO_VERSION "1") + + # Try to autodiscover the Parquet ABI version + get_filename_component(PARQUET_LIB_REALPATH ${PARQUET_LIBRARIES} REALPATH) + get_filename_component(PARQUET_EXT_REALPATH ${PARQUET_LIB_REALPATH} EXT) + string(REGEX MATCH ".([0-9]+.[0-9]+.[0-9]+)" HAS_ABI_VERSION ${PARQUET_EXT_REALPATH}) + if (HAS_ABI_VERSION) + if (APPLE) + string(REGEX REPLACE ".([0-9]+.[0-9]+.[0-9]+).dylib" "\\1" PARQUET_ABI_VERSION ${PARQUET_EXT_REALPATH}) + else() + string(REGEX REPLACE ".so.([0-9]+.[0-9]+.[0-9]+)" "\\1" PARQUET_ABI_VERSION ${PARQUET_EXT_REALPATH}) + endif() + string(REGEX REPLACE "([0-9]+).[0-9]+.[0-9]+" "\\1" PARQUET_SO_VERSION ${PARQUET_ABI_VERSION}) + else() + set(PARQUET_ABI_VERSION "1.0.0") + set(PARQUET_SO_VERSION "1") + endif() else() pkg_check_modules(PARQUET parquet) if (PARQUET_FOUND) From 58a24c5ba243f89302f90ef62550cee252968c9d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 18 Jan 2018 10:56:15 -0500 Subject: [PATCH 4/5] ARROW-2004: [C++] Add shrink_to_fit parameter to BufferBuilder::Resize, add Reserve method I also relaxed the requirement to pass `const uint8_t*` so that one can pass `const void*` when writing to a `BufferBuilder`. 
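For illustration, a minimal sketch of how the revised API fits together. `FillBuilder` is a hypothetical helper, not part of the patch, and error handling is abbreviated:

```cpp
#include "arrow/buffer.h"
#include "arrow/status.h"

// Hypothetical example of the API after this patch: reserve capacity up
// front, then append raw bytes without casting to const uint8_t*.
arrow::Status FillBuilder(arrow::BufferBuilder* builder) {
  const char data[] = "some data";
  const int64_t length = sizeof(data) - 1;  // exclude the trailing NUL
  // Reserve only ever grows capacity; it calls Resize(size_ + size, false).
  arrow::Status st = builder->Reserve(length);
  if (!st.ok()) return st;
  // Append now accepts const void*, so a char array can be passed directly.
  return builder->Append(data, length);
}
```

With the defaulted MemoryPool argument added below, `arrow::BufferBuilder builder;` also compiles without explicitly passing a pool.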
This will not affect any downstream users.

Author: Wes McKinney

Closes #1486 from wesm/ARROW-2004 and squashes the following commits:

2d6660a8 [Wes McKinney] Add shrink_to_fit parameter to BufferBuilder::Resize, add Reserve method, relax pointer type in Append
---
 cpp/src/arrow/buffer-test.cc | 25 ++++++++++++++++++++++
 cpp/src/arrow/buffer.h       | 41 ++++++++++++++++++++++++++----------
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc
index 5fd2706f0466b..398cc06363a6f 100644
--- a/cpp/src/arrow/buffer-test.cc
+++ b/cpp/src/arrow/buffer-test.cc
@@ -194,4 +194,29 @@ TEST(TestBuffer, SliceMutableBuffer) {
   ASSERT_TRUE(slice->Equals(expected));
 }
 
+TEST(TestBufferBuilder, ResizeReserve) {
+  const std::string data = "some data";
+  auto data_ptr = data.c_str();
+
+  BufferBuilder builder;
+
+  ASSERT_OK(builder.Append(data_ptr, 9));
+  ASSERT_EQ(9, builder.length());
+
+  ASSERT_OK(builder.Resize(128));
+  ASSERT_EQ(128, builder.capacity());
+
+  // Do not shrink to fit
+  ASSERT_OK(builder.Resize(64, false));
+  ASSERT_EQ(128, builder.capacity());
+
+  // Shrink to fit
+  ASSERT_OK(builder.Resize(64));
+  ASSERT_EQ(64, builder.capacity());
+
+  // Reserve elements
+  ASSERT_OK(builder.Reserve(60));
+  ASSERT_EQ(128, builder.capacity());
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index 450a4c78b5bbb..b50b1a1aa041d 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -25,6 +25,7 @@
 #include
 #include
+#include "arrow/memory_pool.h"
 #include "arrow/status.h"
 #include "arrow/util/bit-util.h"
 #include "arrow/util/macros.h"
@@ -32,13 +33,12 @@
 
 namespace arrow {
 
-class MemoryPool;
-
 // ----------------------------------------------------------------------
 // Buffer classes
 
-/// Immutable API for a chunk of bytes which may or may not be owned by the
-/// class instance.
+/// \class Buffer
+/// \brief Object containing a pointer to a piece of contiguous memory with a
+/// particular size. Base class does not own its memory
 ///
 /// Buffers have two related notions of length: size and capacity. Size is
 /// the number of bytes that might have valid data. Capacity is the number
@@ -133,7 +133,8 @@ ARROW_EXPORT
 std::shared_ptr<Buffer> SliceMutableBuffer(const std::shared_ptr<Buffer>& buffer,
                                            const int64_t offset, const int64_t length);
 
-/// A Buffer whose contents can be mutated. May or may not own its data.
+/// \class MutableBuffer
+/// \brief A Buffer whose contents can be mutated. May or may not own its data.
 class ARROW_EXPORT MutableBuffer : public Buffer {
  public:
   MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) {
@@ -148,6 +149,8 @@ class ARROW_EXPORT MutableBuffer : public Buffer {
   MutableBuffer() : Buffer(NULLPTR, 0) {}
 };
 
+/// \class ResizableBuffer
+/// \brief A mutable buffer that can be resized
 class ARROW_EXPORT ResizableBuffer : public MutableBuffer {
  public:
   /// Change buffer reported size to indicated size, allocating memory if
@@ -190,13 +193,22 @@ class ARROW_EXPORT PoolBuffer : public ResizableBuffer {
   MemoryPool* pool_;
 };
 
+/// \class BufferBuilder
+/// \brief A class for incrementally building a contiguous chunk of in-memory data
 class ARROW_EXPORT BufferBuilder {
  public:
-  explicit BufferBuilder(MemoryPool* pool)
+  explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
       : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {}
 
-  /// Resizes the buffer to the nearest multiple of 64 bytes per Layout.md
-  Status Resize(const int64_t elements) {
+  /// \brief Resizes the buffer to the nearest multiple of 64 bytes
+  ///
+  /// \param elements the new capacity of the builder. Will be rounded
+  /// up to a multiple of 64 bytes for padding
+  /// \param shrink_to_fit if the new capacity is smaller than the existing
+  /// size, reallocate the internal buffer. Set to false to avoid
+  /// reallocations when shrinking the builder
+  /// \return Status
+  Status Resize(const int64_t elements, bool shrink_to_fit = true) {
     // Resize(0) is a no-op
     if (elements == 0) {
       return Status::OK();
@@ -205,7 +217,7 @@ class ARROW_EXPORT BufferBuilder {
       buffer_ = std::make_shared<PoolBuffer>(pool_);
     }
     int64_t old_capacity = capacity_;
-    RETURN_NOT_OK(buffer_->Resize(elements));
+    RETURN_NOT_OK(buffer_->Resize(elements, shrink_to_fit));
     capacity_ = buffer_->capacity();
     data_ = buffer_->mutable_data();
     if (capacity_ > old_capacity) {
@@ -214,7 +226,14 @@ class ARROW_EXPORT BufferBuilder {
     return Status::OK();
   }
 
-  Status Append(const uint8_t* data, int64_t length) {
+  /// \brief Ensure that the builder can accommodate an additional number of
+  /// bytes without the need to perform allocations
+  ///
+  /// \param size the number of additional bytes to make space for
+  /// \return Status
+  Status Reserve(const int64_t size) { return Resize(size_ + size, false); }
+
+  Status Append(const void* data, int64_t length) {
     if (capacity_ < length + size_) {
       int64_t new_capacity = BitUtil::NextPower2(length + size_);
       RETURN_NOT_OK(Resize(new_capacity));
@@ -248,7 +267,7 @@ class ARROW_EXPORT BufferBuilder {
   }
 
   // Unsafe methods don't check existing size
-  void UnsafeAppend(const uint8_t* data, int64_t length) {
+  void UnsafeAppend(const void* data, int64_t length) {
     memcpy(data_ + size_, data, static_cast<size_t>(length));
     size_ += length;
   }

From bc9f9e532ea2a16810d5ce14e1dfc3272628cb95 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Thu, 18 Jan 2018 11:01:23 -0500
Subject: [PATCH 5/5] ARROW-1966: [C++] Accommodate JAVA_HOME on Linux that
 includes the jre/ directory, or is the full path to the directory containing
 libjvm

Some users ran into a rough edge where they had a non-standard JRE directory
(possibly related to some recent changes by Oracle in their JDK installer).

Author: Wes McKinney

Closes #1487 from wesm/ARROW-1966 and squashes the following commits:

7e14923d [Wes McKinney] Add note to API documentation about JAVA_HOME
f77b31e6 [Wes McKinney] Accommodate a JAVA_HOME containing the jre/ directory, or an absolute path to directory containing libjvm
---
 cpp/apidoc/HDFS.md                | 4 ++++
 cpp/src/arrow/io/hdfs-internal.cc | 4 ++--
 2
files changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/apidoc/HDFS.md b/cpp/apidoc/HDFS.md index d54ad270c05f4..d3671fb7691ba 100644 --- a/cpp/apidoc/HDFS.md +++ b/cpp/apidoc/HDFS.md @@ -50,6 +50,10 @@ export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` * `ARROW_LIBHDFS_DIR` (optional): explicit location of `libhdfs.so` if it is installed somewhere other than `$HADOOP_HOME/lib/native`. +To accommodate distribution-specific nuances, the `JAVA_HOME` variable may be +set to the root path for the Java SDK, the JRE path itself, or to the directory +containing the `libjvm` library. + ### Mac Specifics The installed location of Java on OS X can vary, however the following snippet diff --git a/cpp/src/arrow/io/hdfs-internal.cc b/cpp/src/arrow/io/hdfs-internal.cc index 9cd1c5052fe8d..545b2d17d2e78 100644 --- a/cpp/src/arrow/io/hdfs-internal.cc +++ b/cpp/src/arrow/io/hdfs-internal.cc @@ -147,7 +147,7 @@ static std::vector get_potential_libjvm_paths() { file_name = "jvm.dll"; #elif __APPLE__ search_prefixes = {""}; - search_suffixes = {"", "/jre/lib/server"}; + search_suffixes = {"", "/jre/lib/server", "/lib/server"}; file_name = "libjvm.dylib"; // SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now we are @@ -175,7 +175,7 @@ static std::vector get_potential_libjvm_paths() { "/usr/lib/jvm/default", // alt centos "/usr/java/latest", // alt centos }; - search_suffixes = {"/jre/lib/amd64/server"}; + search_suffixes = {"", "/jre/lib/amd64/server", "/lib/amd64/server"}; file_name = "libjvm.so"; #endif // From direct environment variable
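As an illustration of the expanded search in this commit, each of the following `JAVA_HOME` settings should now let the libjvm lookup succeed on a typical Linux OpenJDK layout (the paths below are examples, not taken from the patch):

```
# JDK root: libjvm.so is found via the "/jre/lib/amd64/server" suffix
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

# JRE directory: found via the new "/lib/amd64/server" suffix
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre

# Directory containing libjvm.so itself: found via the new empty suffix
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server
```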