From b3d8b1b14487ae910b8de3179382b665d04db79d Mon Sep 17 00:00:00 2001 From: zhixingheyi-tian Date: Tue, 9 Nov 2021 11:29:08 +0800 Subject: [PATCH] [NSE-550] Support ORC Format Reading in Gazelle (#551) * [NSE-550][WIP] Support ORC Format Reading in Gazelle * Add ORC SQL UT * Add Catalog UT * Add -DARROW_ORC=ON * Add -DARROW_ORC=ON --- .github/workflows/tpch.yml | 2 +- .github/workflows/unittests.yml | 6 +-- arrow-data-source/.travis.yml | 2 +- arrow-data-source/README.md | 2 +- arrow-data-source/script/build_arrow.sh | 1 + .../datasources/v2/arrow/ArrowUtils.scala | 1 + .../standard/src/test/resources/people.orc | Bin 0 -> 507 bytes .../arrow/ArrowDataSourceTest.scala | 48 +++++++++++++++++- docs/ApacheArrowInstallation.md | 2 +- native-sql-engine/cpp/src/CMakeLists.txt | 1 + 10 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 arrow-data-source/standard/src/test/resources/people.orc diff --git a/.github/workflows/tpch.yml b/.github/workflows/tpch.yml index a66f00ee0..55995e960 100644 --- a/.github/workflows/tpch.yml +++ b/.github/workflows/tpch.yml @@ -53,7 +53,7 @@ jobs: git clone https://github.com/oap-project/arrow.git cd arrow && git checkout arrow-4.0.0-oap && cd cpp mkdir build && cd build - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DARROW_JEMALLOC=OFF && make -j2 + cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DARROW_JEMALLOC=OFF && make -j2 sudo make install cd ../../java mvn clean install -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 708db07ee..2edf1b79f 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -47,7 +47,7 @@ jobs: git clone https://github.com/oap-project/arrow.git cd arrow && git checkout arrow-4.0.0-oap && cd cpp mkdir build && cd build - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2 + cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2 sudo make install - name: Run unit tests run: | @@ -90,7 +90,7 @@ jobs: git clone https://github.com/oap-project/arrow.git cd arrow && git checkout arrow-4.0.0-oap && cd cpp mkdir build && cd build - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2 + cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2 sudo make install cd ../../java mvn clean install -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip @@ -137,7 +137,7 @@ jobs: git clone https://github.com/oap-project/arrow.git cd arrow && git checkout arrow-4.0.0-oap && cd cpp mkdir build && cd build - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2 + cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2 sudo make install cd ../../java mvn clean install -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip diff --git a/arrow-data-source/.travis.yml b/arrow-data-source/.travis.yml index 5c938a101..6be7abed0 100644 --- a/arrow-data-source/.travis.yml +++ b/arrow-data-source/.travis.yml @@ -26,7 +26,7 @@ jobs: - cd arrow && git checkout oap-master && cd cpp - sed -i "s/\${Python3_EXECUTABLE}/\/opt\/pyenv\/shims\/python3/g" CMakeLists.txt - mkdir build && cd build - - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON && make + - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON && make - sudo make install - cd ../../java - mvn clean install -q -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip diff --git a/arrow-data-source/README.md b/arrow-data-source/README.md index 4e71ee514..5ba77b1fe 100644 --- a/arrow-data-source/README.md +++ b/arrow-data-source/README.md @@ -121,7 +121,7 @@ git clone -b arrow-4.0.0-oap https://github.com/oap-project/arrow.git cd arrow/cpp mkdir build cd build -cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. +cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. make // build and install arrow jvm library diff --git a/arrow-data-source/script/build_arrow.sh b/arrow-data-source/script/build_arrow.sh index 410e31070..d8ec40128 100755 --- a/arrow-data-source/script/build_arrow.sh +++ b/arrow-data-source/script/build_arrow.sh @@ -71,6 +71,7 @@ cmake ./cpp \ -DARROW_GANDIVA_JAVA=ON \ -DARROW_GANDIVA=ON \ -DARROW_PARQUET=ON \ + -DARROW_ORC=ON \ -DARROW_HDFS=ON \ -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_JNI=ON \ diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala index 07c572cdd..fdc104606 100644 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala +++ b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala @@ -116,6 +116,7 @@ object ArrowUtils { val paramMap = options.parameters.toMap.asJava options.originalFormat match { case "parquet" => org.apache.arrow.dataset.file.format.ParquetFileFormat.create(paramMap) + case "orc" => org.apache.arrow.dataset.file.format.OrcFileFormat.create(paramMap) case "csv" => org.apache.arrow.dataset.file.format.CsvFileFormat.create(paramMap) case _ => throw new IllegalArgumentException("Unrecognizable format") } diff --git a/arrow-data-source/standard/src/test/resources/people.orc b/arrow-data-source/standard/src/test/resources/people.orc new file mode 100644 index 0000000000000000000000000000000000000000..ae4b08c34ecd8ae7af73c13abfdb8cfeb62b69f9 GIT binary patch literal 507 zcmaiw-AV#M6vt=IS=VtA8wRPtNNkaiE*7P`#fv3Fq6-zf*}GU{uupec!KKS@~P9s!A9~WnS=ZnMtKjY~@`q~pPVRBC*V@%ft1T~2#*-V$tvtn+IqM?>Rv#J2? zi@u7@BlX&@*Q9+TE2nm4@3Y(%`ZG1{33Dgt*pOe0ct4Ig8?yDsvh;^p*~ zJEA!xa>d;oi0EoCo+G*%)P_i`Xb_A7TzCQ%mlL_MxAT*)WJ}h*dP+Rm`rA*kO;=xd zLlQ1cvo-5Luq&z>Ibg$Eqz|@%>71V1Y!-khQ{X#~4y2*q4AVTly0+mmPm8;7WsBua uw$P>Huow2a5IL#QFH)m#Qlo#R#_Q1;OSpLEiUq)`ooToH^ols14(e|j{#IB3 literal 0 HcmV?d00001 diff --git a/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala b/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala index 5ad7596b9..536004fe4 100644 --- a/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala +++ b/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala @@ -26,7 +26,6 @@ import com.intel.oap.spark.sql.DataFrameWriterImplicits._ import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowOptions import com.sun.management.UnixOperatingSystemMXBean import org.apache.commons.io.FileUtils - import org.apache.spark.SparkConf import org.apache.spark.sql.SaveMode import org.apache.spark.sql.{DataFrame, QueryTest, Row} @@ -35,7 +34,7 @@ import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.SPARK_SESSION_EXTENSIONS import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} +import org.apache.spark.sql.types._ class ArrowDataSourceTest extends QueryTest with SharedSparkSession { private val parquetFile1 = "parquet-1.parquet" @@ -297,6 +296,51 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession { assert(fdGrowth < 100) } + private val orcFile = "people.orc" + test("read orc file") { + val path = ArrowDataSourceTest.locateResourcePath(orcFile) + verifyFrame( + spark.read + .format("arrow") + .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc") + .load(path), 2, 3) + } + + test("read orc file - programmatic API ") { + val path = ArrowDataSourceTest.locateResourcePath(orcFile) + verifyFrame( + spark.read + .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc") + .arrow(path), 2, 3) + } + + test("create catalog table for orc") { + val path = ArrowDataSourceTest.locateResourcePath(orcFile) + // spark.catalog.createTable("people", path, "arrow") + spark.catalog.createTable("people", "arrow", Map("path" -> path, "originalFormat" -> "orc")) + val sql = "select * from people" + spark.sql(sql).explain() + verifyFrame(spark.sql(sql), 2, 3) + } + + test("simple SQL query on orc file ") { + val path = ArrowDataSourceTest.locateResourcePath(orcFile) + val frame = spark.read + .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc") + .arrow(path) + frame.createOrReplaceTempView("people") + val sqlFrame = spark.sql("select * from people") + assert( + sqlFrame.schema === + StructType(Seq(StructField("name", StringType), + StructField("age", IntegerType), StructField("job", StringType)))) + val rows = sqlFrame.collect() + assert(rows(0).get(0) == "Jorge") + assert(rows(0).get(1) == 30) + assert(rows(0).get(2) == "Developer") + assert(rows.length === 2) + } + private val csvFile1 = "people.csv" private val csvFile2 = "example.csv" private val csvFile3 = "example-tab.csv" diff --git a/docs/ApacheArrowInstallation.md b/docs/ApacheArrowInstallation.md index 746e887e2..c40734dda 100644 --- a/docs/ApacheArrowInstallation.md +++ b/docs/ApacheArrowInstallation.md @@ -33,7 +33,7 @@ git clone https://github.com/oap-project/arrow.git cd arrow && git checkout arrow-4.0.0-oap mkdir -p arrow/cpp/release-build cd arrow/cpp/release-build -cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. +cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. make -j make install diff --git a/native-sql-engine/cpp/src/CMakeLists.txt b/native-sql-engine/cpp/src/CMakeLists.txt index a0443633e..d6aa0258e 100644 --- a/native-sql-engine/cpp/src/CMakeLists.txt +++ b/native-sql-engine/cpp/src/CMakeLists.txt @@ -153,6 +153,7 @@ macro(build_arrow STATIC_ARROW) -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON + -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=OFF