Skip to content
This repository has been archived by the owner on Sep 18, 2023. It is now read-only.

[NSE-550] Support ORC Format Reading in Gazelle #551

Merged
merged 5 commits into from
Nov 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tpch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
git clone https://github.com/oap-project/arrow.git
cd arrow && git checkout arrow-4.0.0-oap && cd cpp
mkdir build && cd build
cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DARROW_JEMALLOC=OFF && make -j2
cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DARROW_JEMALLOC=OFF && make -j2
sudo make install
cd ../../java
mvn clean install -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
git clone https://github.com/oap-project/arrow.git
cd arrow && git checkout arrow-4.0.0-oap && cd cpp
mkdir build && cd build
cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@zhouyuan

Add -DARROW_ORC=ON again for remaining 6 files.

sudo make install
- name: Run unit tests
run: |
Expand Down Expand Up @@ -90,7 +90,7 @@ jobs:
git clone https://github.com/oap-project/arrow.git
cd arrow && git checkout arrow-4.0.0-oap && cd cpp
mkdir build && cd build
cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
sudo make install
cd ../../java
mvn clean install -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip
Expand Down Expand Up @@ -137,7 +137,7 @@ jobs:
git clone https://github.com/oap-project/arrow.git
cd arrow && git checkout arrow-4.0.0-oap && cd cpp
mkdir build && cd build
cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
sudo make install
cd ../../java
mvn clean install -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip
Expand Down
2 changes: 1 addition & 1 deletion arrow-data-source/.travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- cd arrow && git checkout oap-master && cd cpp
- sed -i "s/\${Python3_EXECUTABLE}/\/opt\/pyenv\/shims\/python3/g" CMakeLists.txt
- mkdir build && cd build
- cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON && make
- cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON && make
- sudo make install
- cd ../../java
- mvn clean install -q -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip
Expand Down
2 changes: 1 addition & 1 deletion arrow-data-source/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ git clone -b arrow-4.0.0-oap https://github.com/oap-project/arrow.git
cd arrow/cpp
mkdir build
cd build
cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON ..
cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON ..
make

// build and install arrow jvm library
Expand Down
1 change: 1 addition & 0 deletions arrow-data-source/script/build_arrow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ cmake ./cpp \
-DARROW_GANDIVA_JAVA=ON \
-DARROW_GANDIVA=ON \
-DARROW_PARQUET=ON \
-DARROW_ORC=ON \
-DARROW_HDFS=ON \
-DARROW_BOOST_USE_SHARED=OFF \
-DARROW_JNI=ON \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ object ArrowUtils {
val paramMap = options.parameters.toMap.asJava
options.originalFormat match {
case "parquet" => org.apache.arrow.dataset.file.format.ParquetFileFormat.create(paramMap)
case "orc" => org.apache.arrow.dataset.file.format.OrcFileFormat.create(paramMap)
case "csv" => org.apache.arrow.dataset.file.format.CsvFileFormat.create(paramMap)
case _ => throw new IllegalArgumentException("Unrecognizable format")
}
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import com.intel.oap.spark.sql.DataFrameWriterImplicits._
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowOptions
import com.sun.management.UnixOperatingSystemMXBean
import org.apache.commons.io.FileUtils

import org.apache.spark.SparkConf
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.{DataFrame, QueryTest, Row}
Expand All @@ -35,7 +34,7 @@ import org.apache.spark.sql.functions.col
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.StaticSQLConf.SPARK_SESSION_EXTENSIONS
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.sql.types._

class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
private val parquetFile1 = "parquet-1.parquet"
Expand Down Expand Up @@ -297,6 +296,51 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
assert(fdGrowth < 100)
}

private val orcFile = "people.orc"
test("read orc file") {
val path = ArrowDataSourceTest.locateResourcePath(orcFile)
verifyFrame(
spark.read
.format("arrow")
.option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc")
.load(path), 2, 3)
}

test("read orc file - programmatic API ") {
val path = ArrowDataSourceTest.locateResourcePath(orcFile)
verifyFrame(
spark.read
.option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc")
.arrow(path), 2, 3)
}

test("create catalog table for orc") {
val path = ArrowDataSourceTest.locateResourcePath(orcFile)
// spark.catalog.createTable("people", path, "arrow")
spark.catalog.createTable("people", "arrow", Map("path" -> path, "originalFormat" -> "orc"))
val sql = "select * from people"
spark.sql(sql).explain()
verifyFrame(spark.sql(sql), 2, 3)
}

test("simple SQL query on orc file ") {
val path = ArrowDataSourceTest.locateResourcePath(orcFile)
val frame = spark.read
.option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc")
.arrow(path)
frame.createOrReplaceTempView("people")
val sqlFrame = spark.sql("select * from people")
assert(
sqlFrame.schema ===
StructType(Seq(StructField("name", StringType),
StructField("age", IntegerType), StructField("job", StringType))))
val rows = sqlFrame.collect()
assert(rows(0).get(0) == "Jorge")
assert(rows(0).get(1) == 30)
assert(rows(0).get(2) == "Developer")
assert(rows.length === 2)
}

private val csvFile1 = "people.csv"
private val csvFile2 = "example.csv"
private val csvFile3 = "example-tab.csv"
Expand Down
2 changes: 1 addition & 1 deletion docs/ApacheArrowInstallation.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ git clone https://github.com/oap-project/arrow.git
cd arrow && git checkout arrow-4.0.0-oap
mkdir -p arrow/cpp/release-build
cd arrow/cpp/release-build
cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON ..
cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON ..
make -j
make install

Expand Down
1 change: 1 addition & 0 deletions native-sql-engine/cpp/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ macro(build_arrow STATIC_ARROW)
-DARROW_GANDIVA_JAVA=ON
-DARROW_GANDIVA=ON
-DARROW_PARQUET=ON
-DARROW_ORC=ON
-DARROW_CSV=ON
-DARROW_HDFS=ON
-DARROW_BOOST_USE_SHARED=OFF
Expand Down