This repository has been archived by the owner on Sep 18, 2023. It is now read-only.

[NSE-728] Upgrade to Arrow 7.0.0 #729

Merged: 29 commits, merged on Apr 13, 2022

Commits (29):
- de1ac2f: fix (zhztheplayer, Mar 1, 2022)
- da58325: fix (zhztheplayer, Mar 1, 2022)
- 245e20e: fixup (zhztheplayer, Mar 3, 2022)
- f479ff7: fixup (zhztheplayer, Mar 7, 2022)
- 4e69c45: fixup (zhztheplayer, Mar 7, 2022)
- c2f4b4d: Revert "fixup" (zhztheplayer, Mar 8, 2022)
- 2bc26d9: Revert "fixup" (zhztheplayer, Mar 8, 2022)
- f22fb98: fix empty buffer bug in distinct count (zhztheplayer, Mar 9, 2022)
- 15bd214: fixup (zhztheplayer, Mar 10, 2022)
- a7a339a: minimize arrow build (zhztheplayer, Mar 14, 2022)
- ba5aa4e: Disable memory pool to address async changes in Arrow 7 (zhztheplayer, Mar 20, 2022)
- c5eddea: Revert "Disable memory pool to address async changes in Arrow 7" (zhztheplayer, Mar 20, 2022)
- 1543aff: Memory pool fix for Arrow 7.0 (zhztheplayer, Mar 20, 2022)
- 0e0291a: enable debuginfo (zhztheplayer, Mar 21, 2022)
- decd738: enable debuginfo 2 (zhztheplayer, Mar 21, 2022)
- d9c8dbd: Revert debug changes (zhztheplayer, Mar 27, 2022)
- 1d15b45: Revert debug changes (zhztheplayer, Mar 27, 2022)
- 47b074d: update arrow version (zhztheplayer, Mar 28, 2022)
- 324a3c9: Fix unit test failures (zhztheplayer, Mar 29, 2022)
- d2d6260: Code style (zhztheplayer, Mar 29, 2022)
- 7558964: update hashing.h (zhztheplayer, Mar 31, 2022)
- 7b942fe: ut failure (zhztheplayer, Apr 1, 2022)
- b495cc7: Revert "fix empty buffer bug in distinct count" (zhztheplayer, Apr 2, 2022)
- 8712e5f: UT failures (orc, write) (zhztheplayer, Apr 2, 2022)
- 6adb9b8: Revert "Revert "fix empty buffer bug in distinct count"" (zhztheplayer, Apr 2, 2022)
- fddba3e: fix (zhztheplayer, Apr 2, 2022)
- 72cab70: debug (zhztheplayer, Apr 10, 2022)
- 4cead0e: debug (zhztheplayer, Apr 11, 2022)
- 753b4a4: Revert "Memory pool fix for Arrow 7.0" (zhztheplayer, Apr 13, 2022)
2 changes: 1 addition & 1 deletion .github/workflows/tpch.yml
@@ -51,7 +51,7 @@ jobs:
       run: |
         cd /tmp
         git clone https://github.com/oap-project/arrow.git
-        cd arrow && git checkout arrow-4.0.0-oap && cd cpp
+        cd arrow && git checkout arrow-7.0.0-oap && cd cpp
         mkdir build && cd build
         cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DARROW_JEMALLOC=OFF && make -j2
         sudo make install
6 changes: 3 additions & 3 deletions .github/workflows/unittests.yml
@@ -45,7 +45,7 @@ jobs:
       run: |
         cd /tmp
         git clone https://github.com/oap-project/arrow.git
-        cd arrow && git checkout arrow-4.0.0-oap && cd cpp
+        cd arrow && git checkout arrow-7.0.0-oap && cd cpp
         mkdir build && cd build
         cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
         sudo make install
@@ -88,7 +88,7 @@ jobs:
       run: |
         cd /tmp
         git clone https://github.com/oap-project/arrow.git
-        cd arrow && git checkout arrow-4.0.0-oap && cd cpp
+        cd arrow && git checkout arrow-7.0.0-oap && cd cpp
         mkdir build && cd build
         cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
         sudo make install
@@ -133,7 +133,7 @@ jobs:
       run: |
         cd /tmp
         git clone https://github.com/oap-project/arrow.git
-        cd arrow && git checkout arrow-4.0.0-oap && cd cpp
+        cd arrow && git checkout arrow-7.0.0-oap && cd cpp
         mkdir build && cd build
         cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2
         sudo make install
2 changes: 1 addition & 1 deletion arrow-data-source/README.md
@@ -117,7 +117,7 @@ You have to use a customized Arrow to support for our datasets Java API.
 
 ```
 // build arrow-cpp
-git clone -b arrow-4.0.0-oap https://github.com/oap-project/arrow.git
+git clone -b arrow-7.0.0-oap https://github.com/oap-project/arrow.git
 cd arrow/cpp
 mkdir build
 cd build
@@ -27,7 +27,6 @@ import java.util.regex.Pattern
 
 import com.intel.oap.spark.sql.ArrowWriteQueue.EOS_BATCH
 import com.intel.oap.spark.sql.ArrowWriteQueue.ScannerImpl
-import org.apache.arrow.dataset.file.DatasetFileWriter
 import org.apache.arrow.dataset.file.format.FileFormat
 import org.apache.arrow.dataset.scanner.Scanner
 import org.apache.arrow.dataset.scanner.ScanTask
@@ -47,12 +46,15 @@ class ArrowWriteQueue(schema: Schema, fileFormat: FileFormat, outputFileURI: Str
     val dirURI = matcher.group(1)
     val fileName = matcher.group(2)
 
-    DatasetFileWriter.write(scanner, fileFormat, dirURI, Array(), 1, fileName)
+    // disable write by arrow 7.0.0
+    // DatasetFileWriter.write(scanner, fileFormat, dirURI, Array(), 1, fileName)
   }, "ArrowWriteQueue - " + UUID.randomUUID().toString)
 
   writeThread.start()
 
   def enqueue(batch: ArrowRecordBatch): Unit = {
+    // disable write by arrow 7.0.0
+    throw new UnsupportedOperationException("write is disabled by arrow 7.0.0 rebase")
     scanner.enqueue(batch)
   }
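Together with the dropped `DatasetFileWriter` import, the effect is that the write path fails fast: `enqueue` throws before any batch reaches the disabled writer. A minimal sketch of the failure mode a caller observes (a stand-alone stand-in, not the actual `ArrowWriteQueue` class):

```scala
import scala.util.{Failure, Success, Try}

object WriteDisabledSketch extends App {
  // Stand-in for ArrowWriteQueue.enqueue after this patch: it throws
  // unconditionally instead of forwarding the batch to the writer.
  def enqueue(): Unit =
    throw new UnsupportedOperationException("write is disabled by arrow 7.0.0 rebase")

  Try(enqueue()) match {
    case Failure(e: UnsupportedOperationException) => println(s"write rejected: ${e.getMessage}")
    case Success(_)                                => println("unexpected: write succeeded")
    case Failure(other)                            => println(s"unexpected failure: $other")
  }
}
```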
4 changes: 2 additions & 2 deletions arrow-data-source/script/build_arrow.sh
@@ -62,7 +62,7 @@ echo "ARROW_SOURCE_DIR=${ARROW_SOURCE_DIR}"
 echo "ARROW_INSTALL_DIR=${ARROW_INSTALL_DIR}"
 mkdir -p $ARROW_SOURCE_DIR
 mkdir -p $ARROW_INSTALL_DIR
-git clone https://github.com/oap-project/arrow.git --branch arrow-4.0.0-oap $ARROW_SOURCE_DIR
+git clone https://github.com/oap-project/arrow.git --branch arrow-7.0.0-oap $ARROW_SOURCE_DIR
 pushd $ARROW_SOURCE_DIR
 
 cmake ./cpp \
@@ -98,7 +98,7 @@ make -j$NPROC
 make install
 
 cd java
-mvn clean install -P arrow-jni -am -Darrow.cpp.build.dir=${ARROW_INSTALL_DIR}/lib -DskipTests -Dcheckstyle.skip
+mvn clean install -P arrow-jni -pl dataset,gandiva -am -Darrow.cpp.build.dir=${ARROW_INSTALL_DIR}/lib -DskipTests -Dcheckstyle.skip
 popd
 echo "Finish to build Arrow from Source !!!"
 else
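The added `-pl dataset,gandiva -am` flags are standard Maven options: `-pl` restricts the reactor to the listed modules and `-am` also builds whatever those modules depend on, so only the Java artifacts this project consumes get built. This matches the "minimize arrow build" commit in the list above.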
@@ -23,7 +23,7 @@ import scala.collection.JavaConverters._
 
 import com.intel.oap.spark.sql.ArrowWriteExtension.FakeRow
 import com.intel.oap.spark.sql.ArrowWriteQueue
-import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions, ArrowUtils}
+import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowOptions, ArrowUtils}
 import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._
 import com.intel.oap.vectorized.ArrowWritableColumnVector
 import org.apache.arrow.dataset.scanner.ScanOptions
@@ -128,7 +128,8 @@ class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializab
     val dataset = factory.finish();
 
     val filter = if (enableFilterPushDown) {
-      ArrowFilters.translateFilters(filters)
+      // disable filter pushdown by arrow 7.0.0
+      org.apache.arrow.dataset.filter.Filter.EMPTY
     } else {
       org.apache.arrow.dataset.filter.Filter.EMPTY
     }
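After this change both branches of the conditional return `Filter.EMPTY`, so scan-side filter pushdown is off no matter how `enableFilterPushDown` is set, and every predicate is re-evaluated by Spark above the scan; the same substitution appears in `ArrowPartitionReaderFactory` below. A minimal sketch of the collapsed logic (hypothetical names, not the project's API):

```scala
object PushdownDisabledSketch extends App {
  sealed trait ScanFilter
  case object Empty extends ScanFilter // stand-in for org.apache.arrow.dataset.filter.Filter.EMPTY

  // Mirror of the patched conditional: both branches now produce the same value.
  def chooseFilter(enableFilterPushDown: Boolean): ScanFilter =
    if (enableFilterPushDown) Empty // was: ArrowFilters.translateFilters(...)
    else Empty

  println(chooseFilter(enableFilterPushDown = true))  // Empty
  println(chooseFilter(enableFilterPushDown = false)) // Empty
}
```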

This file was deleted.

@@ -61,7 +61,8 @@ case class ArrowPartitionReaderFactory(
         partitionedFile.start, partitionedFile.length, options)
     val dataset = factory.finish()
     val filter = if (enableFilterPushDown) {
-      ArrowFilters.translateFilters(ArrowFilters.pruneWithSchema(pushedFilters, readDataSchema))
+      // disable filter pushdown by arrow 7.0.0
+      org.apache.arrow.dataset.filter.Filter.EMPTY
     } else {
       org.apache.arrow.dataset.filter.Filter.EMPTY
     }
@@ -117,7 +117,8 @@ object ArrowUtils {
     val paramMap = options.parameters.toMap.asJava
     options.originalFormat match {
       case "parquet" => org.apache.arrow.dataset.file.format.ParquetFileFormat.create(paramMap)
-      case "orc" => org.apache.arrow.dataset.file.format.OrcFileFormat.create(paramMap)
+      // disable orc by arrow 7.0.0
+      // case "orc" => org.apache.arrow.dataset.file.format.OrcFileFormat.create(paramMap)
       case "csv" => org.apache.arrow.dataset.file.format.CsvFileFormat.create(paramMap)
       case _ => throw new IllegalArgumentException("Unrecognizable format")
     }
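With the `"orc"` case commented out, a request for ORC now falls through to the catch-all and throws `IllegalArgumentException`, which is why the ORC tests below are flipped to `ignore`. A minimal sketch of the resulting dispatch (a simplified stand-in, not the actual `ArrowUtils` code):

```scala
import scala.util.Try

object OrcDisabledSketch extends App {
  // Simplified mirror of the patched match: "orc" is no longer handled.
  def toFormat(originalFormat: String): String = originalFormat match {
    case "parquet" => "parquet-format"
    case "csv"     => "csv-format"
    case _         => throw new IllegalArgumentException("Unrecognizable format")
  }

  println(toFormat("parquet"))  // parquet-format
  println(Try(toFormat("orc"))) // Failure(java.lang.IllegalArgumentException: Unrecognizable format)
}
```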
@@ -203,7 +203,7 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
     assert(rows.length === 3)
   }
 
-  test("simple parquet write") {
+  ignore("simple parquet write") {
     val path = ArrowDataSourceTest.locateResourcePath(parquetFile3)
     val frame = spark.read
       .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet")
@@ -339,7 +339,7 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
     df.show()
   }
 
-  test("orc reader on data type: struct, array, map") {
+  ignore("orc reader on data type: struct, array, map") {
     val path = ArrowDataSourceTest.locateResourcePath(orcFile1)
     val frame = spark.read
       .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc")
@@ -363,7 +363,7 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
   }
 
   private val orcFile = "people.orc"
-  test("read orc file") {
+  ignore("read orc file") {
     val path = ArrowDataSourceTest.locateResourcePath(orcFile)
     verifyFrame(
       spark.read
@@ -372,15 +372,15 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
         .load(path), 2, 3)
   }
 
-  test("read orc file - programmatic API ") {
+  ignore("read orc file - programmatic API ") {
     val path = ArrowDataSourceTest.locateResourcePath(orcFile)
     verifyFrame(
       spark.read
         .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc")
         .arrow(path), 2, 3)
   }
 
-  test("create catalog table for orc") {
+  ignore("create catalog table for orc") {
     val path = ArrowDataSourceTest.locateResourcePath(orcFile)
     // spark.catalog.createTable("people", path, "arrow")
     spark.catalog.createTable("people", "arrow", Map("path" -> path, "originalFormat" -> "orc"))
@@ -389,7 +389,7 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
     verifyFrame(spark.sql(sql), 2, 3)
   }
 
-  test("simple SQL query on orc file ") {
+  ignore("simple SQL query on orc file ") {
     val path = ArrowDataSourceTest.locateResourcePath(orcFile)
     val frame = spark.read
       .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "orc")
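The suite disables the affected cases by flipping `test` to `ignore`. In ScalaTest, which these Spark test base classes build on, `ignore` has the same signature as `test`: the body still compiles but is never executed, and the runner reports it as ignored. A minimal sketch, assuming a plain `AnyFunSuite` rather than the project's actual `QueryTest`/`SharedSparkSession` stack:

```scala
import org.scalatest.funsuite.AnyFunSuite

class IgnoreSketch extends AnyFunSuite {
  test("still runs") {
    assert(1 + 1 == 2)
  }

  // Same shape as test(...), but the body is only compiled, never run;
  // the report shows it as ignored rather than passed or failed.
  ignore("skipped until orc and write support return") {
    assert(false, "would fail if it ever ran")
  }
}
```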
2 changes: 1 addition & 1 deletion docs/ApacheArrowInstallation.md
@@ -30,7 +30,7 @@ Please make sure your cmake version is qualified based on the prerequisite.
 # Arrow
 ``` shell
 git clone https://github.com/oap-project/arrow.git
-cd arrow && git checkout arrow-4.0.0-oap
+cd arrow && git checkout arrow-7.0.0-oap
 mkdir -p arrow/cpp/release-build
 cd arrow/cpp/release-build
 cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_ORC=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON ..