From 977b08b07c82dc390c8cc3f73b1e129b27a2a065 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Wed, 17 Nov 2021 22:36:59 +0800 Subject: [PATCH 1/7] refine doc Signed-off-by: Yuan Zhou --- docs/Configuration.md | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/docs/Configuration.md b/docs/Configuration.md index 277243d94..86da1caf8 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -12,9 +12,10 @@ You can add these configuration into spark-defaults.conf to enable or disable th | spark.executor.memory| To set up how much memory to be used for Spark Executor. | | | spark.memory.offHeap.size| To set up how much memory to be used for Java OffHeap.
Please note that Gazelle Plugin will leverage this setting to allocate memory space for native usage even when offHeap is disabled.
The value is based on your system and it is recommended to set it larger if you are facing Out of Memory issue in Gazelle Plugin | 30G |
| spark.sql.sources.useV1SourceList | Choose to use V1 source | avro |
-| spark.sql.join.preferSortMergeJoin | To turn off preferSortMergeJoin in Spark | false |
+| spark.sql.join.preferSortMergeJoin | To turn on/off preferSortMergeJoin in Spark. In Gazelle we recommend turning this off to get better performance | true |
| spark.plugins | To turn on Gazelle Plugin | com.intel.oap.GazellePlugin |
| spark.shuffle.manager | To turn on Gazelle Columnar Shuffle Plugin | org.apache.spark.shuffle.sort.ColumnarShuffleManager |
+| spark.sql.shuffle.partitions | Shuffle partition number; it is recommended to set it to the same number as the total cores in your cluster | 200 |
| spark.oap.sql.columnar.batchscan | Enable or Disable Columnar Batchscan, default is true | true |
| spark.oap.sql.columnar.hashagg | Enable or Disable Columnar Hash Aggregate, default is true | true |
| spark.oap.sql.columnar.projfilter | Enable or Disable Columnar Project and Filter, default is true | true |
@@ -32,18 +33,18 @@ You can add these configuration into spark-defaults.conf to enable or disable th
| spark.oap.sql.columnar.preferColumnar | Enable or Disable Columnar Operators, default is false.
This parameter could impact the performance in different case. In some cases, to set false can get some performance boost. | false | | spark.oap.sql.columnar.joinOptimizationLevel | Fallback to row operators if there are several continous joins | 6 | | spark.sql.execution.arrow.maxRecordsPerBatch | Set up the Max Records per Batch | 10000 | -| spark.sql.execution.sort.spillThreshold | Set up the Max sort in memory threshold | 256M | +| spark.sql.execution.sort.spillThreshold | Set up the Max sort in memory threshold in bytes, default is disabled | -1 | | spark.oap.sql.columnar.wholestagecodegen.breakdownTime | Enable or Disable metrics in Columnar WholeStageCodeGen | false | -| spark.oap.sql.columnar.tmp_dir | Set up a folder to store the codegen files | /tmp | +| spark.oap.sql.columnar.tmp_dir | Set up a folder to store the codegen files, default is disabled | "" | | spark.oap.sql.columnar.shuffle.customizedCompression.codec | Set up the codec to be used for Columnar Shuffle, default is lz4| lz4 | | spark.oap.sql.columnar.numaBinding | Set up NUMABinding, default is false| true | -| spark.oap.sql.columnar.coreRange | Set up the core range for NUMABinding, only works when numaBinding set to true.
The setting is based on the number of cores in your system. Use 72 cores as an example. | 0-17,36-53 |18-35,54-71 | +| spark.oap.sql.columnar.coreRange | Set up the core range for NUMABinding, only works when numaBinding set to true.
The setting is based on the number of cores in your system(lscpu | grep node[0-4]). Use 72 cores as an example. | 0-17,36-53 |18-35,54-71 |

Below is an example for spark-default.conf, if you are using conda to install OAP project.

-```
-##### Columnar Process Configuration
+## Example spark-defaults.conf
+```
spark.sql.sources.useV1SourceList avro
spark.sql.join.preferSortMergeJoin false
spark.plugins com.intel.oap.GazellePlugin
@@ -64,4 +65,12 @@ Before you start spark, you must use below command to add some environment varia
export CC=$HOME/miniconda2/envs/oapenv/bin/gcc
export LIBARROW_DIR=$HOME/miniconda2/envs/oapenv/
```
-
+## Notes on driver
+In Gazelle the Spark driver is used for C++ code generation for different operators. This means the driver takes on more work than in vanilla Spark, so it's better to allocate more resources to the driver. By default, the driver will compile the C++ code with the best optimizations targeting the local CPU architecture:
+```
+-O3 -march=native
+```
+This can be overridden with a local environment variable before starting the driver:
+```
+export CODEGEN_OPTION=" -O1 -mavx2 -fno-semantic-interposition "
+```
From cc0e46750a756c6deb05bbe49016c5c7b20aec8d Mon Sep 17 00:00:00 2001
From: Yuan Zhou
Date: Wed, 17 Nov 2021 22:42:05 +0800
Subject: [PATCH 2/7] refine

Signed-off-by: Yuan Zhou
---
 docs/Configuration.md | 48 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/docs/Configuration.md b/docs/Configuration.md
index 86da1caf8..028cabb1a 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -29,6 +29,7 @@ You can add these configuration into spark-defaults.conf to enable or disable th
| spark.oap.sql.columnar.nanCheck | Enable or Disable Nan Check, default is true | true |
| spark.oap.sql.columnar.hashCompare | Enable or Disable Hash Compare in HashJoins or HashAgg, default is true | true |
| spark.oap.sql.columnar.broadcastJoin | Enable or Disable Columnar BradcastHashJoin, default is true | true |
+| spark.oap.sql.columnar.sortmergejoin.lazyread | Enable or Disable lazy reading of the Sort result. When disabled, the whole partition will be cached before doing SortMergeJoin | false |
| spark.oap.sql.columnar.wholestagecodegen | Enable or Disable Columnar WholeStageCodeGen, default is true | true |
| spark.oap.sql.columnar.preferColumnar | Enable or Disable Columnar Operators, default is false.
This parameter could impact the performance in different case. In some cases, to set false can get some performance boost. | false | | spark.oap.sql.columnar.joinOptimizationLevel | Fallback to row operators if there are several continous joins | 6 | @@ -40,6 +41,53 @@ You can add these configuration into spark-defaults.conf to enable or disable th | spark.oap.sql.columnar.numaBinding | Set up NUMABinding, default is false| true | | spark.oap.sql.columnar.coreRange | Set up the core range for NUMABinding, only works when numaBinding set to true.
The setting is based on the number of cores in your system(lscpu | grep node[0-4]). Use 72 cores as an example. | 0-17,36-53 |18-35,54-71 |

+## Example thrift-server configuration
+Here's one example of the thrift-server configuration:
+```
+THRIFTSERVER_CONFIG="--name ${runname}
+--num-executors 72
+--driver-memory 20g
+--executor-memory 6g
+--executor-cores 6
+--master yarn
+--deploy-mode client
+--conf spark.executor.memoryOverhead=384
+--conf spark.executorEnv.CC=/home/sparkuser/miniconda3/envs/arrow-new/bin/gcc
+--conf spark.plugins=com.intel.oap.GazellePlugin
+--conf spark.executorEnv.LD_LIBRARY_PATH=/home/sparkuser/miniconda3/envs/arrow-new/lib/:/home/sparkuser/miniconda3/envs/arrow-new/lib64/
+--conf spark.executorEnv.LIBARROW_DIR=/home/sparkuser/miniconda3/envs/arrow-new
+--conf spark.driver.extraClassPath=${nativesql_jars}
+--conf spark.executor.extraClassPath=${nativesql_jars}
+--conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager
+--conf spark.sql.join.preferSortMergeJoin=false
+--conf spark.sql.inMemoryColumnarStorage.batchSize=${batchsize}
+--conf spark.sql.execution.arrow.maxRecordsPerBatch=${batchsize}
+--conf spark.sql.parquet.columnarReaderBatchSize=${batchsize}
+--conf spark.sql.autoBroadcastJoinThreshold=10M
+--conf spark.sql.broadcastTimeout=600
+--conf spark.sql.crossJoin.enabled=true
+--conf spark.driver.maxResultSize=20g
+--hiveconf hive.server2.thrift.port=10001
+--hiveconf hive.server2.thrift.bind.host=sr270
+--conf spark.sql.codegen.wholeStage=true
+--conf spark.sql.shuffle.partitions=432
+--conf spark.memory.offHeap.enabled=true
+--conf spark.memory.offHeap.size=15g
+--conf spark.kryoserializer.buffer.max=128m
+--conf spark.kryoserializer.buffer=32m
+--conf spark.oap.sql.columnar.preferColumnar=false
+--conf spark.oap.sql.columnar.sortmergejoin.lazyread=true
+--conf spark.sql.execution.sort.spillThreshold=2147483648
+--conf spark.executorEnv.LD_PRELOAD=/home/sparkuser/miniconda3/envs/arrow-new/lib/libjemalloc.so
+--conf spark.executorEnv.MALLOC_CONF=background_thread:true,dirty_decay_ms:0,muzzy_decay_ms:0,narenas:2
+--conf spark.executorEnv.MALLOC_ARENA_MAX=2
+--conf spark.oap.sql.columnar.numaBinding=true
+--conf spark.oap.sql.columnar.coreRange=0-35,72-107|36-71,108-143
+--conf spark.oap.sql.columnar.joinOptimizationLevel=18
+--conf spark.oap.sql.columnar.shuffle.customizedCompression.codec=lz4
+--conf spark.yarn.appMasterEnv.LD_PRELOAD=/home/sparkuser/miniconda3/envs/arrow-new/lib/libjemalloc.so"
+```
+
 Below is an example for spark-default.conf, if you are using conda to install OAP project.

From 43938f4d8674c381a79dc7fff68f19927c388fd1 Mon Sep 17 00:00:00 2001
From: Yuan Zhou
Date: Thu, 18 Nov 2021 08:06:02 +0800
Subject: [PATCH 3/7] adding perf tuning

Signed-off-by: Yuan Zhou
---
 docs/performance.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 docs/performance.md

diff --git a/docs/performance.md b/docs/performance.md
new file mode 100644
index 000000000..1d4bb6e20
--- /dev/null
+++ b/docs/performance.md
@@ -0,0 +1,21 @@
+# Performance tuning for Gazelle Plugin
+
+Tuning Spark workloads is complicated as each workload varies a lot. Here are several general tuning options for the most popular TPC-H/TPC-DS benchmarks.
+
+## Data Generating
+For non-partitioned tables, it's better to set the number of files to a multiple of the total HT cores in your cluster, e.g., if there are 384 cores in your cluster, set the number of files to 2*384 or 3*384.
In this way Spark would only issue one task for each file(together with spark.sql.files.maxPartitionBytes option).
+
+Note the spark.sql.files.maxRecordsPerFile will also split the files in to smaller ones. We may just disable this feature to -1.
+
+## Shuffle Partitions
+As Gazelle currently only supports hash-based shuffle, it's recommended to set the shuffle partitions to 1 or 2 times the total HT cores in the cluster, e.g., if there are 384 cores in the cluster, it's better to use spark.sql.shuffle.partitions = 384 or 768. In this way it's most efficient for Gazelle.
+
+## On-heap/Off-heap Memory Size
+Unlike Spark, most of the memory usage in Gazelle would be off-heap based. So a big off-heap size is recommended. There are still some small objects allocated on-heap, thus a properly sized on-heap is also required. We recommend the configurations below for memory-related settings.
+```
+--executor-cores 6
+--executor-memory 6g // on-heap memory: 1G per core
+--conf spark.executor.memoryOverhead=384 // not used, 384M should be enough
+--conf spark.memory.offHeap.enabled=true // enable off-heap thus Spark can control the memory
+--conf spark.memory.offHeap.size=15g // a big off-heap is required
+```
From d216b5c5a5c63ed59b71b9736f3dce7a267ba3b5 Mon Sep 17 00:00:00 2001
From: Yuan Zhou
Date: Thu, 18 Nov 2021 08:11:02 +0800
Subject: [PATCH 4/7] fix index page

Signed-off-by: Yuan Zhou
---
 docs/index.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/index.md b/docs/index.md
index 43972520a..f7a5debac 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -96,8 +96,10 @@ ${SPARK_HOME}/bin/spark-shell \
   --verbose \
   --master yarn \
   --driver-memory 10G \
+  --conf --conf spark.plugins=com.intel.oap.GazellePlugin \
   --conf spark.driver.extraClassPath=$PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar:$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar \
   --conf spark.executor.extraClassPath=$PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar:$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar \
+  --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
   --conf spark.driver.cores=1 \
   --conf spark.executor.instances=12 \
   --conf spark.executor.cores=6 \
@@ -155,7 +157,7 @@ We pick up 10 queries which can be fully supported in OAP v1.0-Gazelle Plugin an

 ![Performance](./image/decision_support_bench2_result_by_query.png)

-Please notes the performance data is not an official from TPC-H and TPC-DS. The actual performance result may vary by individual workloads. Please try your workloads with Gazelle Plugin first and check the DAG or log file to see if all the operators can be supported in OAP-Gazelle Plugin.
+Please notes the performance data is not an official from TPC-H and TPC-DS. The actual performance result may vary by individual workloads. Please try your workloads with Gazelle Plugin first and check the DAG or log file to see if all the operators can be supported in OAP-Gazelle Plugin. Please check the [detailed page](./performance.md) on performance tuning for TPC-H and TPC-DS workloads.

 ## Memory allocation
 The memory usage in Gazelle Plugin is high. The allocations goes to two parts: 1) Java based allocation which is widely used in Arrow Java API. 2) Native side memory allocation used in each native kernel. We investigated the memory allocation behavior and made more turnings [here](./memory.md), the memroy footprint is stable during a TPC-DS power run.
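As a quick cross-check of the shuffle-partition guideline from PATCH 3 against the spark-shell examples in these docs (12 executors of 6 cores each and spark.sql.shuffle.partitions=72): below is a minimal, hypothetical shell sketch of that arithmetic, not part of the patches; the variable names are illustrative and the numbers should be adjusted to your own cluster.
```
# Hypothetical sizing helper: derive shuffle partitions from the executor layout
EXECUTORS=12           # spark.executor.instances
CORES_PER_EXECUTOR=6   # spark.executor.cores
TOTAL_CORES=$((EXECUTORS * CORES_PER_EXECUTOR))               # 72 total HT cores
echo "spark.sql.shuffle.partitions = ${TOTAL_CORES}"          # 1x total cores: 72
echo "spark.sql.shuffle.partitions = $((TOTAL_CORES * 2))"    # or 2x total cores: 144
```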
From 192e4d780d4a100297231cc3e3a39152787cf06c Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 18 Nov 2021 12:07:13 +0800 Subject: [PATCH 5/7] update doc Signed-off-by: Yuan Zhou --- README.md | 15 +++++++++------ docs/index.md | 7 ++++--- docs/performance.md | 15 +++++++++++---- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 959bc5bd0..9759bdc44 100644 --- a/README.md +++ b/README.md @@ -104,13 +104,16 @@ ${SPARK_HOME}/bin/spark-shell \ --verbose \ --master yarn \ --driver-memory 10G \ + --conf spark.plugins=com.intel.oap.GazellePlugin \ --conf spark.driver.extraClassPath=$PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar:$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar \ --conf spark.executor.extraClassPath=$PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar:$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar \ + --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ --conf spark.driver.cores=1 \ --conf spark.executor.instances=12 \ --conf spark.executor.cores=6 \ --conf spark.executor.memory=20G \ - --conf spark.memory.offHeap.size=80G \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=20G \ --conf spark.task.cpus=1 \ --conf spark.locality.wait=0s \ --conf spark.sql.shuffle.partitions=72 \ @@ -130,9 +133,9 @@ The result should showup on Spark console and you can check the DAG diagram with ## Could/K8s Integration -### Amazone EMR +### Amazon EMR -Please refer to [Gazelle_on_Dataproc](https://github.com/oap-project/oap-tools/tree/master/integrations/oap/emr) to find details about how to use OAP Gazelle on Amazon EMR Cloud. +Please refer to [Gazelle_on_EMR](https://github.com/oap-project/oap-tools/tree/master/integrations/oap/emr) to find details about how to use OAP Gazelle on Amazon EMR Cloud. ### Google Cloud Dataproc @@ -167,7 +170,7 @@ We pick up 10 queries which can be fully supported in OAP v1.0-Gazelle Plugin an ![Performance](./docs/image/decision_support_bench2_result_in_total_v1.1.png) -Please notes the performance data is not an official from TPC-H and TPC-DS. The actual performance result may vary by individual workloads. Please try your workloads with Gazelle Plugin first and check the DAG or log file to see if all the operators can be supported in OAP-Gazelle Plugin. +Please notes the performance data is not an official from TPC-H and TPC-DS. The actual performance result may vary by individual workloads. Please try your workloads with Gazelle Plugin first and check the DAG or log file to see if all the operators can be supported in OAP-Gazelle Plugin. Please check the [detailed page](./performance.md) on performance tuning for TPC-H and TPC-DS workloads. ## Memory allocation The memory usage in Gazelle Plugin is high. The allocations goes to two parts: 1) Java based allocation which is widely used in Arrow Java API. 2) Native side memory allocation used in each native kernel. We investigated the memory allocation behavior and made more turnings [here](./docs/memory.md), the memroy footprint is stable during a TPC-DS power run. @@ -183,5 +186,5 @@ The memory usage in Gazelle Plugin is high. 
The allocations goes to two parts: 1) Java based allocation which is widely used in Arrow Java API. 2) Native side memory allocation used in each native kernel. We investigated the memory allocation behavior and made more turnings [here](./docs/memory.md), the memroy footprint is stable during a TPC-DS power run.
@@ -183,5 +186,5 @@ The memory usage in Gazelle Plugin is high. The allocations goes to two parts: 1

 ## Contact

-chendi.xue@intel.com
-binwei.yang@intel.com
+weiting.chen@intel.com
+binwei.yang@intel.com
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index f7a5debac..1d31edbd3 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -96,7 +96,7 @@ ${SPARK_HOME}/bin/spark-shell \
   --verbose \
   --master yarn \
   --driver-memory 10G \
-  --conf --conf spark.plugins=com.intel.oap.GazellePlugin \
+  --conf spark.plugins=com.intel.oap.GazellePlugin \
   --conf spark.driver.extraClassPath=$PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar:$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar \
   --conf spark.executor.extraClassPath=$PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar:$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar \
   --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
@@ -104,7 +104,8 @@ ${SPARK_HOME}/bin/spark-shell \
   --conf spark.executor.instances=12 \
   --conf spark.executor.cores=6 \
   --conf spark.executor.memory=20G \
-  --conf spark.memory.offHeap.size=80G \
+  --conf spark.memory.offHeap.enabled=true \
+  --conf spark.memory.offHeap.size=20G \
   --conf spark.task.cpus=1 \
   --conf spark.locality.wait=0s \
   --conf spark.sql.shuffle.partitions=72 \
@@ -173,5 +174,5 @@ The memory usage in Gazelle Plugin is high. The allocations goes to two parts: 1

 ## Contact

-chendi.xue@intel.com
+weiting.chen@intel.com
 binwei.yang@intel.com
diff --git a/docs/performance.md b/docs/performance.md
index 1d4bb6e20..899b4f1db 100644
--- a/docs/performance.md
+++ b/docs/performance.md
@@ -2,10 +2,13 @@

 Tuning Spark workloads is complicated as each workload varies a lot. Here are several general tuning options for the most popular TPC-H/TPC-DS benchmarks.

-## Data Generating
-For non-partitioned tables, it's better to set the number of files to a multiple of the total HT cores in your cluster, e.g., if there are 384 cores in your cluster, set the number of files to 2*384 or 3*384. In this way Spark would only issue one task for each file(together with spark.sql.files.maxPartitionBytes option).
-
-Note the spark.sql.files.maxRecordsPerFile will also split the files in to smaller ones. We may just disable this feature to -1.
+## Columnar Batch size
+Spark has several options to control the batch size at different operators. We suggest using a bigger value, as this brings better cache efficiency, especially for columnar processing.
+```
+--conf spark.sql.inMemoryColumnarStorage.batchSize=20480
+--conf spark.sql.execution.arrow.maxRecordsPerBatch=20480
+--conf spark.sql.parquet.columnarReaderBatchSize=20480
+```

 ## Shuffle Partitions
 As Gazelle currently only supports hash-based shuffle, it's recommended to set the shuffle partitions to 1 or 2 times the total HT cores in the cluster, e.g., if there are 384 cores in the cluster, it's better to use spark.sql.shuffle.partitions = 384 or 768. In this way it's most efficient for Gazelle.
@@ -19,3 +22,7 @@ Unlike Spark, most of the memory usage in Gazelle would be off-heap based. So a
 --conf spark.memory.offHeap.enabled=true // enable off-heap thus Spark can control the memory
 --conf spark.memory.offHeap.size=15g // a big off-heap is required
 ```
+## Data Generating
+For non-partitioned tables, it's better to set the number of files to a multiple of the total HT cores in your cluster, e.g., if there are 384 cores in your cluster, set the number of files to 2x384 or 3x384.
In this way Spark would only issue one task for each file(together with spark.sql.files.maxPartitionBytes option). + +Note the spark.sql.files.maxRecordsPerFile will also split the files in to smaller ones. We may just disable this feature to -1. \ No newline at end of file From da8be01432a68099fa26c3a76dbb20614b2cc2bc Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 18 Nov 2021 12:07:37 +0800 Subject: [PATCH 6/7] remove duplicate page Signed-off-by: Yuan Zhou --- docs/index.md | 178 -------------------------------------------------- 1 file changed, 178 deletions(-) delete mode 100644 docs/index.md diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 1d31edbd3..000000000 --- a/docs/index.md +++ /dev/null @@ -1,178 +0,0 @@ -# Gazelle Plugin - -A Native Engine for Spark SQL with vectorized SIMD optimizations - -## Introduction - -![Overview](./image/nativesql_arch.png) - -Spark SQL works very well with structured row-based data. It used WholeStageCodeGen to improve the performance by Java JIT code. However Java JIT is usually not working very well on utilizing latest SIMD instructions, especially under complicated queries. [Apache Arrow](https://arrow.apache.org/) provided CPU-cache friendly columnar in-memory layout, its SIMD-optimized kernels and LLVM-based SQL engine Gandiva are also very efficient. - -Gazelle Plugin reimplements Spark SQL execution layer with SIMD-friendly columnar data processing based on Apache Arrow, -and leverages Arrow's CPU-cache friendly columnar in-memory layout, SIMD-optimized kernels and LLVM-based expression engine to bring better performance to Spark SQL. - - -## Key Features - -### Apache Arrow formatted intermediate data among Spark operator - -![Overview](./image/columnar.png) - -With [Spark 27396](https://issues.apache.org/jira/browse/SPARK-27396) its possible to pass a RDD of Columnarbatch to operators. We implemented this API with Arrow columnar format. - -### Apache Arrow based Native Readers for Parquet and other formats - -![Overview](./image/dataset.png) - -A native parquet reader was developed to speed up the data loading. it's based on Apache Arrow Dataset. For details please check [Arrow Data Source](https://github.com/oap-project/native-sql-engine/tree/master/arrow-data-source) - -### Apache Arrow Compute/Gandiva based operators - -![Overview](./image/kernel.png) - -We implemented common operators based on Apache Arrow Compute and Gandiva. The SQL expression was compiled to one expression tree with protobuf and passed to native kernels. The native kernels will then evaluate the these expressions based on the input columnar batch. - -### Native Columnar Shuffle Operator with efficient compression support - -![Overview](./image/shuffle.png) - -We implemented columnar shuffle to improve the shuffle performance. With the columnar layout we could do very efficient data compression for different data format. - -Please check the operator supporting details [here](./operators.md) - -## How to use OAP: Gazelle Plugin - -There are three ways to use OAP: Gazelle Plugin, -1. Use precompiled jars -2. Building by Conda Environment -3. Building by Yourself - -### Use precompiled jars - -Please go to [OAP's Maven Central Repository](https://repo1.maven.org/maven2/com/intel/oap/) to find Gazelle Plugin jars. -For usage, you will require below two jar files: -1. spark-arrow-datasource-standard--jar-with-dependencies.jar is located in com/intel/oap/spark-arrow-datasource-standard// -2. 
spark-columnar-core--jar-with-dependencies.jar is located in com/intel/oap/spark-columnar-core// -Please notice the files are fat jars shipped with our custom Arrow library and pre-compiled from our server(using GCC 9.3.0 and LLVM 7.0.1), which means you will require to pre-install GCC 9.3.0 and LLVM 7.0.1 in your system for normal usage. - -### Building by Conda - -If you already have a working Hadoop Spark Cluster, we provide a Conda package which will automatically install dependencies needed by OAP, you can refer to [OAP-Installation-Guide](./OAP-Installation-Guide.md) for more information. Once finished [OAP-Installation-Guide](./OAP-Installation-Guide.md), you can find built `spark-columnar-core--jar-with-dependencies.jar` under `$HOME/miniconda2/envs/oapenv/oap_jars`. -Then you can just skip below steps and jump to [Get Started](#get-started). - -### Building by yourself - -If you prefer to build from the source code on your hand, please follow below steps to set up your environment. - -#### Prerequisite - -There are some requirements before you build the project. -Please check the document [Prerequisite](./Prerequisite.md) and make sure you have already installed the software in your system. -If you are running a SPARK Cluster, please make sure all the software are installed in every single node. - -#### Installation - -Please check the document [Installation Guide](./Installation.md) - -## Get started - -To enable Gazelle Plugin, the previous built jar `spark-columnar-core--jar-with-dependencies.jar` should be added to Spark configuration. We also recommend to use `spark-arrow-datasource-standard--jar-with-dependencies.jar`. We will demonstrate an example by using both jar files. -SPARK related options are: - -* `spark.driver.extraClassPath` : Set to load jar file to driver. -* `spark.executor.extraClassPath` : Set to load jar file to executor. -* `jars` : Set to copy jar file to the executors when using yarn cluster mode. -* `spark.executorEnv.ARROW_LIBHDFS3_DIR` : Optional if you are using a custom libhdfs3.so. -* `spark.executorEnv.LD_LIBRARY_PATH` : Optional if you are using a custom libhdfs3.so. - -For Spark Standalone Mode, please set the above value as relative path to the jar file. -For Spark Yarn Cluster Mode, please set the above value as absolute path to the jar file. 
- -More Configuration, please check the document [Configuration Guide](./Configuration.md) - -Example to run Spark Shell with ArrowDataSource jar file -``` -${SPARK_HOME}/bin/spark-shell \ - --verbose \ - --master yarn \ - --driver-memory 10G \ - --conf spark.plugins=com.intel.oap.GazellePlugin \ - --conf spark.driver.extraClassPath=$PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar:$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar \ - --conf spark.executor.extraClassPath=$PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar:$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar \ - --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ - --conf spark.driver.cores=1 \ - --conf spark.executor.instances=12 \ - --conf spark.executor.cores=6 \ - --conf spark.executor.memory=20G \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=20G \ - --conf spark.task.cpus=1 \ - --conf spark.locality.wait=0s \ - --conf spark.sql.shuffle.partitions=72 \ - --conf spark.executorEnv.ARROW_LIBHDFS3_DIR="$PATH_TO_LIBHDFS3_DIR/" \ - --conf spark.executorEnv.LD_LIBRARY_PATH="$PATH_TO_LIBHDFS3_DEPENDENCIES_DIR" - --jars $PATH_TO_JAR/spark-arrow-datasource-standard--jar-with-dependencies.jar,$PATH_TO_JAR/spark-columnar-core--jar-with-dependencies.jar -``` - -Here is one example to verify if Gazelle Plugin works, make sure you have TPC-H dataset. We could do a simple projection on one parquet table. For detailed testing scripts, please refer to [Solution Guide](https://github.com/Intel-bigdata/Solution_navigator/tree/master/nativesql). -``` -val orders = spark.read.format("arrow").load("hdfs:////user/root/date_tpch_10/orders") -orders.createOrReplaceTempView("orders") -spark.sql("select * from orders where o_orderdate > date '1998-07-26'").show(20000, false) -``` - -The result should showup on Spark console and you can check the DAG diagram with some Columnar Processing stage. Gazelle Plugin still lacks some features, please check out the [limitations](./limitations.md). - -## Could/K8s Integration - -### Google Cloud Dataproc - -Gazelle Plugin now supports to run on Dataproc 2.0, we provide a guide to help quickly install Gazelle Plugin and run TPC-DS with notebooks or scripts. - -Please refer to [Gazelle_on_Dataproc](https://github.com/oap-project/oap-tools/blob/master/docs/Gazelle_on_Dataproc.md) to find details about: - -1. Create a cluster on Dataproc 2.0 with initialization actions. -Gazelle Plugin jars compiled with `-Pdataproc-2.0` parameter will installed by Conda in all cluster nodes. - -2. Config for enabling Gazelle Plugin. - -3. Run TPC-DS with notebooks or scripts. - - -## Performance data - -For advanced performance testing, below charts show the results by using two benchmarks: 1. Decision Support Benchmark1 and 2. Decision Support Benchmark2. -All the testing environment for Decision Support Benchmark1&2 are using 1 master + 3 workers and Intel(r) Xeon(r) Gold 6252 CPU|384GB memory|NVMe SSD x3 per single node with 1.5TB dataset. -* Decision Support Benchmark1 is a query set modified from [TPC-H benchmark](http://tpc.org/tpch/default5.asp). We change Decimal to Double since Decimal hasn't been supported in OAP v1.0-Gazelle Plugin. -Overall, the result shows a 1.49X performance speed up from OAP v1.0-Gazelle Plugin comparing to Vanilla SPARK 3.0.0. -We also put the detail result by queries, most of queries in Decision Support Benchmark1 can take the advantages from Gazelle Plugin. 
The performance boost ratio may depend on the individual query. - -![Performance](./image/decision_support_bench1_result_in_total.png) - -![Performance](./image/decision_support_bench1_result_by_query.png) - -* Decision Support Benchmark2 is a query set modified from [TPC-DS benchmark](http://tpc.org/tpcds/default5.asp). We change Decimal to Doubel since Decimal hasn't been supported in OAP v1.0-Gazelle Plugin. -We pick up 10 queries which can be fully supported in OAP v1.0-Gazelle Plugin and the result shows a 1.26X performance speed up comparing to Vanilla SPARK 3.0.0. - -![Performance](./image/decision_support_bench2_result_in_total.png) - -![Performance](./image/decision_support_bench2_result_by_query.png) - -Please notes the performance data is not an official from TPC-H and TPC-DS. The actual performance result may vary by individual workloads. Please try your workloads with Gazelle Plugin first and check the DAG or log file to see if all the operators can be supported in OAP-Gazelle Plugin. Please check the [detailed page](./performance.md) on performance tuning for TPC-H and TPC-DS workloads. - -## Memory allocation -The memory usage in Gazelle Plugin is high. The allocations goes to two parts: 1) Java based allocation which is widely used in Arrow Java API. 2) Native side memory allocation used in each native kernel. We investigated the memory allocation behavior and made more turnings [here](./memory.md), the memroy footprint is stable during a TPC-DS power run. -![Memory](./image/rssmem.png) - - - -## Coding Style - -* For Java code, we used [google-java-format](https://github.com/google/google-java-format) -* For Scala code, we used [Spark Scala Format](https://github.com/apache/spark/blob/master/dev/.scalafmt.conf), please use [scalafmt](https://github.com/scalameta/scalafmt) or run ./scalafmt for scala codes format -* For Cpp codes, we used Clang-Format, check on this link [google-vim-codefmt](https://github.com/google/vim-codefmt) for details. - -## Contact - -weiting.chen@intel.com -binwei.yang@intel.com From 9b4c60f3950b4e5ca2645df28288f11d00b61c7a Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 18 Nov 2021 15:36:23 +0800 Subject: [PATCH 7/7] fix link Signed-off-by: Yuan Zhou --- README.md | 2 +- docs/Configuration.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9759bdc44..c39ecd223 100644 --- a/README.md +++ b/README.md @@ -170,7 +170,7 @@ We pick up 10 queries which can be fully supported in OAP v1.0-Gazelle Plugin an ![Performance](./docs/image/decision_support_bench2_result_in_total_v1.1.png) -Please notes the performance data is not an official from TPC-H and TPC-DS. The actual performance result may vary by individual workloads. Please try your workloads with Gazelle Plugin first and check the DAG or log file to see if all the operators can be supported in OAP-Gazelle Plugin. Please check the [detailed page](./performance.md) on performance tuning for TPC-H and TPC-DS workloads. +Please notes the performance data is not an official from TPC-H and TPC-DS. The actual performance result may vary by individual workloads. Please try your workloads with Gazelle Plugin first and check the DAG or log file to see if all the operators can be supported in OAP-Gazelle Plugin. Please check the [detailed page](./docs/performance.md) on performance tuning for TPC-H and TPC-DS workloads. ## Memory allocation The memory usage in Gazelle Plugin is high. 
The allocations goes to two parts: 1) Java based allocation which is widely used in Arrow Java API. 2) Native side memory allocation used in each native kernel. We investigated the memory allocation behavior and made more turnings [here](./docs/memory.md), the memroy footprint is stable during a TPC-DS power run.

diff --git a/docs/Configuration.md b/docs/Configuration.md
index 028cabb1a..adaf65c0b 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -32,12 +32,12 @@ You can add these configuration into spark-defaults.conf to enable or disable th
| spark.oap.sql.columnar.sortmergejoin.lazyread | Enable or Disable lazy reading of the Sort result. When disabled, the whole partition will be cached before doing SortMergeJoin | false |
| spark.oap.sql.columnar.wholestagecodegen | Enable or Disable Columnar WholeStageCodeGen, default is true | true |
| spark.oap.sql.columnar.preferColumnar | Enable or Disable Columnar Operators, default is false.
This parameter could impact the performance in different case. In some cases, to set false can get some performance boost. | false |
-| spark.oap.sql.columnar.joinOptimizationLevel | Fallback to row operators if there are several continous joins | 6 |
+| spark.oap.sql.columnar.joinOptimizationLevel | Fallback to row operators if there are several continuous joins | 18 |
| spark.sql.execution.arrow.maxRecordsPerBatch | Set up the Max Records per Batch | 10000 |
| spark.sql.execution.sort.spillThreshold | Set up the Max sort in memory threshold in bytes, default is disabled | -1 |
| spark.oap.sql.columnar.wholestagecodegen.breakdownTime | Enable or Disable metrics in Columnar WholeStageCodeGen | false |
| spark.oap.sql.columnar.tmp_dir | Set up a folder to store the codegen files, default is disabled | "" |
-| spark.oap.sql.columnar.shuffle.customizedCompression.codec | Set up the codec to be used for Columnar Shuffle, default is lz4| lz4 |
+| spark.oap.sql.columnar.shuffle.customizedCompression.codec | Set up the codec to be used for Columnar Shuffle, default is lz4. The other option is fastpfor, which can bring better performance when compressing fixed-size contents such as int | lz4 |
| spark.oap.sql.columnar.numaBinding | Set up NUMABinding, default is false| true |
| spark.oap.sql.columnar.coreRange | Set up the core range for NUMABinding, only works when numaBinding set to true.
The setting is based on the number of cores in your system(lscpu | grep node[0-4]). Use 72 cores as an example. | 0-17,36-53 |18-35,54-71 |
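The coreRange values above come from the machine's NUMA topology, as hinted by the lscpu reference in the row. A hedged sketch of how to read them, reusing the 72-core example system from the table; the command is plain lscpu, and the exact ranges will differ on your hardware:
```
# Inspect the NUMA CPU ranges used to build spark.oap.sql.columnar.coreRange
lscpu | grep "NUMA node[0-9]"
# Example output on the 72-core system from the table above:
#   NUMA node0 CPU(s):   0-17,36-53
#   NUMA node1 CPU(s):   18-35,54-71
# -> --conf spark.oap.sql.columnar.coreRange=0-17,36-53|18-35,54-71
```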