From 159e874675edb5b95588e3b73288f2f9fc48eed3 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 7 Jul 2022 11:58:30 -0500 Subject: [PATCH] [SPARK-39385][SQL] Translate linear regression aggregate functions for pushdown ### What changes were proposed in this pull request? Spark supports a lot of linear regression aggregate functions now. Because `REGR_AVGX`, `REGR_AVGY`, `REGR_COUNT`, `REGR_SXX` and `REGR_SYY` are replaced with other expressions at runtime, this PR only translates `REGR_INTERCEPT`, `REGR_R2`, `REGR_SLOPE` and `REGR_SXY` for pushdown. After this change, users can override `JdbcDialect.compileAggregate` to implement the linear regression aggregate functions supported by their database. ### Why are the changes needed? Allow `*Dialect` implementations to compile `REGR_INTERCEPT`, `REGR_R2`, `REGR_SLOPE` and `REGR_SXY`. ### Does this PR introduce _any_ user-facing change? 'No'. New feature. ### How was this patch tested? New tests. Closes #36773 from beliefer/SPARK-39385. Authored-by: Jiaan Geng Signed-off-by: Wenchen Fan --- R/pkg/R/WindowSpec.R | 2 +- R/pkg/tests/fulltests/test_sparkSQL.R | 10 +- R/run-tests.sh | 4 +- .../sql/jdbc/v2/MySQLIntegrationSuite.scala | 2 + .../apache/spark/sql/jdbc/v2/V2JDBCTest.scala | 30 +- ...StatusesConvertBenchmark-jdk11-results.txt | 13 + ...StatusesConvertBenchmark-jdk17-results.txt | 13 + .../MapStatusesConvertBenchmark-results.txt | 10 +- .../scala/org/apache/spark/SparkContext.scala | 28 +- .../scala/org/apache/spark/SparkEnv.scala | 2 +- .../apache/spark/api/python/PythonRDD.scala | 2 +- .../apache/spark/api/python/SerDeUtil.scala | 2 +- .../spark/deploy/ApplicationDescription.scala | 15 +- .../apache/spark/deploy/DeployMessage.scala | 5 +- .../spark/deploy/ExecutorDescription.scala | 5 +- .../deploy/client/StandaloneAppClient.scala | 18 +- .../spark/deploy/history/EventFilter.scala | 3 +- .../spark/deploy/master/ApplicationInfo.scala | 89 +- .../spark/deploy/master/ExecutorDesc.scala | 3 +- 
.../master/ExecutorResourceDescription.scala | 32 + .../apache/spark/deploy/master/Master.scala | 111 +- .../deploy/master/ui/ApplicationPage.scala | 11 +- .../spark/deploy/worker/ExecutorRunner.scala | 2 + .../apache/spark/deploy/worker/Worker.scala | 7 +- .../spark/resource/ResourceInformation.scala | 4 +- .../spark/resource/ResourceProfile.scala | 36 +- .../resource/ResourceProfileManager.scala | 23 +- .../apache/spark/resource/ResourceUtils.scala | 6 + .../scheduler/EventLoggingListener.scala | 13 +- .../spark/scheduler/ReplayListenerBus.scala | 3 +- .../cluster/StandaloneSchedulerBackend.scala | 21 +- .../apache/spark/ui/ConsoleProgressBar.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala | 1734 ++++++++++------- .../apache/spark/SecurityManagerSuite.scala | 11 +- .../org/apache/spark/SparkFunSuite.scala | 16 +- .../apache/spark/benchmark/Benchmarks.scala | 2 - .../apache/spark/deploy/DeployTestUtils.scala | 46 +- .../spark/deploy/JsonProtocolSuite.scala | 8 +- .../StandaloneDynamicAllocationSuite.scala | 2 +- .../spark/deploy/client/AppClientSuite.scala | 59 +- .../history/EventLogFileCompactorSuite.scala | 3 +- .../deploy/history/EventLogTestHelper.scala | 3 +- .../history/FsHistoryProviderSuite.scala | 7 +- .../spark/deploy/master/MasterSuite.scala | 197 +- .../deploy/worker/ExecutorRunnerTest.scala | 16 +- .../spark/metrics/MetricsSystemSuite.scala | 2 +- .../ResourceProfileManagerSuite.scala | 21 +- .../spark/resource/ResourceProfileSuite.scala | 36 + .../scheduler/EventLoggingListenerSuite.scala | 21 +- .../spark/scheduler/ReplayListenerSuite.scala | 30 +- .../HostLocalShuffleReadingSuite.scala | 6 +- .../apache/spark/util/JsonProtocolSuite.scala | 397 +++- .../ExternalAppendOnlyMapSuite.scala | 2 +- dev/deps/spark-deps-hadoop-2-hive-2.3 | 8 +- dev/deps/spark-deps-hadoop-3-hive-2.3 | 8 +- dev/infra/Dockerfile | 56 + dev/requirements.txt | 2 + docs/README.md | 4 +- docs/configuration.md | 2 +- docs/job-scheduling.md | 4 + docs/spark-standalone.md 
| 8 + .../GeneralizedLinearRegression.scala | 3 +- .../ml/regression/LinearRegression.scala | 1 + .../pmml/export/KMeansPMMLModelExport.scala | 2 +- .../org/apache/spark/ml/MLEventsSuite.scala | 12 +- .../spark/ml/feature/InstanceSuite.scala | 2 +- .../RandomForestRegressorSuite.scala | 2 +- .../ml/tuning/TrainValidationSplitSuite.scala | 2 +- .../classification/NaiveBayesSuite.scala | 1 + pom.xml | 24 +- project/MimaExcludes.scala | 10 +- .../source/reference/pyspark.sql/catalog.rst | 6 + python/pyspark/mllib/linalg/distributed.py | 4 +- python/pyspark/pandas/numpy_compat.py | 2 +- python/pyspark/pandas/series.py | 2 +- python/pyspark/pandas/tests/test_resample.py | 16 +- python/pyspark/sql/catalog.py | 136 +- python/pyspark/sql/column.py | 117 +- python/pyspark/sql/tests/test_catalog.py | 60 +- python/run-tests.py | 3 +- .../features/BasicExecutorFeatureStep.scala | 5 +- .../cluster/k8s/ExecutorPodsAllocator.scala | 28 +- .../BasicExecutorFeatureStepSuite.scala | 31 +- .../k8s/ExecutorPodsAllocatorSuite.scala | 10 +- ...bernetesClusterSchedulerBackendSuite.scala | 4 + .../k8s/KubernetesExecutorBuilderSuite.scala | 5 + .../mesos/MesosClusterDispatcherSuite.scala | 2 +- .../spark/deploy/yarn/YarnAllocator.scala | 12 +- .../aggregate/GeneralAggregateFunc.java | 4 + .../catalyst/analysis/FunctionRegistry.scala | 2 +- .../sql/catalyst/catalog/SessionCatalog.scala | 7 + .../spark/sql/catalyst/csv/CSVOptions.scala | 6 +- .../catalyst/encoders/ExpressionEncoder.scala | 3 +- ...CodeGeneratorWithInterpretedFallback.scala | 5 +- .../sql/catalyst/expressions/SortOrder.scala | 4 +- .../expressions/numberFormatExpressions.scala | 69 - .../expressions/objects/objects.scala | 32 +- .../expressions/regexpExpressions.scala | 85 +- .../spark/sql/catalyst/identifiers.scala | 2 +- .../optimizer/NestedColumnAliasing.scala | 19 - .../sql/catalyst/optimizer/Optimizer.scala | 15 +- .../BasicStatsPlanVisitor.scala | 4 +- .../SizeInBytesOnlyStatsPlanVisitor.scala | 2 +- 
.../spark/sql/catalyst/trees/TreeNode.scala | 6 +- .../apache/spark/sql/internal/SQLConf.scala | 3 +- .../expressions/StringExpressionsSuite.scala | 56 - .../optimizer/ColumnPruningSuite.scala | 32 - .../BasicStatsEstimationSuite.scala | 2 +- .../FilterPushdownBenchmark-jdk11-results.txt | 764 ++++---- .../FilterPushdownBenchmark-jdk17-results.txt | 904 +++++---- .../FilterPushdownBenchmark-results.txt | 764 ++++---- .../TPCDSQueryBenchmark-jdk11-results.txt | 810 ++++---- .../TPCDSQueryBenchmark-jdk17-results.txt | 810 ++++++++ .../TPCDSQueryBenchmark-results.txt | 810 ++++---- ...deredAndProjectBenchmark-jdk11-results.txt | 12 + ...deredAndProjectBenchmark-jdk17-results.txt | 12 + ...TakeOrderedAndProjectBenchmark-results.txt | 12 + .../vectorized/ColumnVectorUtils.java | 62 - .../apache/spark/sql/catalog/interface.scala | 19 +- .../analysis/ResolveSessionCatalog.scala | 9 +- .../adaptive/AdaptiveSparkPlanExec.scala | 2 +- .../datasources/DataSourceStrategy.scala | 48 +- .../datasources/PartitioningUtils.scala | 2 +- .../datasources/v2/DataSourceV2Strategy.scala | 9 + .../v2/DescribeNamespaceExec.scala | 3 +- .../datasources/v2/ShowFunctionsExec.scala | 67 + .../v2/parquet/ParquetScanBuilder.scala | 2 +- .../apache/spark/sql/execution/limit.scala | 16 +- .../StateSchemaCompatibilityChecker.scala | 26 +- .../streaming/state/StateStore.scala | 7 +- .../streaming/state/StateStoreConf.scala | 7 +- .../streaming/statefulOperators.scala | 4 +- .../spark/sql/expressions/WindowSpec.scala | 2 +- .../spark/sql/internal/CatalogImpl.scala | 118 +- .../org/apache/spark/sql/jdbc/H2Dialect.scala | 27 +- .../apache/spark/sql/jdbc/JdbcDialects.scala | 2 +- .../apache/spark/sql/jdbc/MySQLDialect.scala | 2 +- .../sql-functions/sql-expression-schema.md | 2 +- .../sql-tests/inputs/regexp-functions.sql | 9 + .../results/regexp-functions.sql.out | 56 + .../commits/.0.crc | Bin 0 -> 12 bytes .../commits/.1.crc | Bin 0 -> 12 bytes .../commits/0 | 2 + .../commits/1 | 2 + .../metadata | 1 
+ .../offsets/.0.crc | Bin 0 -> 16 bytes .../offsets/.1.crc | Bin 0 -> 16 bytes .../offsets/0 | 3 + .../offsets/1 | 3 + .../state/0/0/.1.delta.crc | Bin 0 -> 12 bytes .../state/0/0/.2.delta.crc | Bin 0 -> 12 bytes .../state/0/0/1.delta | Bin 0 -> 77 bytes .../state/0/0/2.delta | Bin 0 -> 46 bytes .../state/0/0/_metadata/.schema.crc | Bin 0 -> 12 bytes .../state/0/0/_metadata/schema | Bin 0 -> 254 bytes .../state/0/1/.1.delta.crc | Bin 0 -> 12 bytes .../state/0/1/.2.delta.crc | Bin 0 -> 12 bytes .../state/0/1/1.delta | Bin 0 -> 46 bytes .../state/0/1/2.delta | Bin 0 -> 77 bytes .../state/0/2/.1.delta.crc | Bin 0 -> 12 bytes .../state/0/2/.2.delta.crc | Bin 0 -> 12 bytes .../state/0/2/1.delta | Bin 0 -> 46 bytes .../state/0/2/2.delta | Bin 0 -> 46 bytes .../state/0/3/.1.delta.crc | Bin 0 -> 12 bytes .../state/0/3/.2.delta.crc | Bin 0 -> 12 bytes .../state/0/3/1.delta | Bin 0 -> 46 bytes .../state/0/3/2.delta | Bin 0 -> 46 bytes .../state/0/4/.1.delta.crc | Bin 0 -> 12 bytes .../state/0/4/.2.delta.crc | Bin 0 -> 12 bytes .../state/0/4/1.delta | Bin 0 -> 46 bytes .../state/0/4/2.delta | Bin 0 -> 46 bytes .../approved-plans-v1_4/q83.ansi/explain.txt | 28 +- .../q83.ansi/simplified.txt | 14 +- .../q83.sf100.ansi/explain.txt | 28 +- .../q83.sf100.ansi/simplified.txt | 14 +- .../org/apache/spark/sql/DataFrameSuite.scala | 5 + .../sql/DataFrameWindowFunctionsSuite.scala | 2 +- .../sql/DynamicPartitionPruningSuite.scala | 19 + .../org/apache/spark/sql/SQLQuerySuite.scala | 2 +- .../apache/spark/sql/SQLQueryTestSuite.scala | 3 + .../sql/execution/SQLJsonProtocolSuite.scala | 6 +- .../TakeOrderedAndProjectSuite.scala | 34 + .../ConstantColumnVectorBenchmark.scala | 34 +- .../benchmark/TPCDSQueryBenchmark.scala | 31 +- .../TPCDSQueryBenchmarkArguments.scala | 2 +- .../TakeOrderedAndProjectBenchmark.scala | 76 + .../command/ShowFunctionsSuiteBase.scala | 122 +- .../command/v1/ShowFunctionsSuite.scala | 56 +- .../command/v2/CommandSuiteBase.scala | 18 +- 
.../command/v2/DescribeNamespaceSuite.scala | 2 +- .../command/v2/ShowFunctionsSuite.scala | 29 +- .../datasources/FileFormatWriterSuite.scala | 2 +- .../execution/datasources/csv/CSVSuite.scala | 35 + .../v2/jdbc/JDBCTableCatalogSuite.scala | 1 - .../execution/metric/SQLMetricsSuite.scala | 7 +- ...StateSchemaCompatibilityCheckerSuite.scala | 49 +- .../ui/SQLAppStatusListenerSuite.scala | 8 +- .../sql/execution/ui/SparkPlanInfoSuite.scala | 2 +- .../vectorized/ColumnVectorSuite.scala | 11 +- .../spark/sql/internal/CatalogSuite.scala | 90 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 4 +- .../apache/spark/sql/jdbc/JDBCV2Suite.scala | 119 +- .../StreamingDeduplicationSuite.scala | 70 + .../StreamingQueryListenerSuite.scala | 6 +- .../command/ShowFunctionsSuite.scala | 36 - .../api/java/JavaStreamingListener.scala | 2 +- 206 files changed, 6666 insertions(+), 3809 deletions(-) create mode 100644 core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt create mode 100644 core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt create mode 100644 core/src/main/scala/org/apache/spark/deploy/master/ExecutorResourceDescription.scala create mode 100644 dev/infra/Dockerfile create mode 100644 sql/core/benchmarks/TPCDSQueryBenchmark-jdk17-results.txt create mode 100644 sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk11-results.txt create mode 100644 sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk17-results.txt create mode 100644 sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowFunctionsExec.scala create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/.0.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/.1.crc create mode 100644 
sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/0 create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/1 create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/metadata create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/.0.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/.1.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/0 create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/1 create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/.1.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/.2.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/1.delta create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/2.delta create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/_metadata/.schema.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/_metadata/schema create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/.1.delta.crc create mode 100644 
sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/.2.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/1.delta create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/2.delta create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/.1.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/.2.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/1.delta create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/2.delta create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/.1.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/.2.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/1.delta create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/2.delta create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/.1.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/.2.delta.crc create mode 100644 sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/1.delta create mode 100644 
sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/2.delta create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TakeOrderedAndProjectBenchmark.scala diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index be47d0117e..5c1de0beac 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -135,7 +135,7 @@ setMethod("orderBy", #' An offset indicates the number of rows above or below the current row, the frame for the #' current row starts or ends. For instance, given a row based sliding frame with a lower bound #' offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from -#' index 4 to index 6. +#' index 4 to index 7. #' #' @param x a WindowSpec #' @param start boundary start, inclusive. diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 2acb7a9ceb..b3218abb13 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -4015,8 +4015,8 @@ test_that("catalog APIs, currentDatabase, setCurrentDatabase, listDatabases", { expect_equal(currentDatabase(), "default") expect_error(setCurrentDatabase("default"), NA) expect_error(setCurrentDatabase("zxwtyswklpf"), - paste0("Error in setCurrentDatabase : analysis error - Database ", - "'zxwtyswklpf' does not exist")) + paste0("Error in setCurrentDatabase : no such database - Database ", + "'zxwtyswklpf' not found")) dbs <- collect(listDatabases()) expect_equal(names(dbs), c("name", "catalog", "description", "locationUri")) expect_equal(which(dbs[, 1] == "default"), 1) @@ -4050,12 +4050,12 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { f <- listFunctions() expect_true(nrow(f) >= 200) # 250 expect_equal(colnames(f), - c("name", "database", "description", "className", "isTemporary")) + c("name", "catalog", "namespace", "description", "className", "isTemporary")) expect_equal(take(orderBy(f, 
"className"), 1)$className, "org.apache.spark.sql.catalyst.expressions.Abs") expect_error(listFunctions("zxwtyswklpf_db"), - paste("Error in listFunctions : analysis error - Database", - "'zxwtyswklpf_db' does not exist")) + paste("Error in listFunctions : no such database - Database", + "'zxwtyswklpf_db' not found")) # recoverPartitions does not work with temporary view expect_error(recoverPartitions("cars"), diff --git a/R/run-tests.sh b/R/run-tests.sh index 9a90afe768..ca5b661127 100755 --- a/R/run-tests.sh +++ b/R/run-tests.sh @@ -30,9 +30,9 @@ if [[ $(echo $SPARK_AVRO_JAR_PATH | wc -l) -eq 1 ]]; then fi if [ -z "$SPARK_JARS" ]; then - SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE + SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE else - SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE + SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options 
"-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE fi FAILED=$((PIPESTATUS[0]||$FAILED)) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala index 97f521a378..6e76b74c7d 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala @@ -119,6 +119,8 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest override def supportsIndex: Boolean = true + override def supportListIndexes: Boolean = true + override def indexOptions: String = "KEY_BLOCK_SIZE=10" testVarPop() diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala index 5f0033490d..0f85bd534c 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala @@ -197,6 +197,8 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu def supportsIndex: Boolean = false + def supportListIndexes: Boolean = false + def indexOptions: String = "" test("SPARK-36895: Test INDEX Using SQL") { @@ -219,11 +221,21 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu s" The supported Index Types are:")) sql(s"CREATE index i1 ON 
$catalogName.new_table USING BTREE (col1)") + assert(jdbcTable.indexExists("i1")) + if (supportListIndexes) { + val indexes = jdbcTable.listIndexes() + assert(indexes.size == 1) + assert(indexes.head.indexName() == "i1") + } + sql(s"CREATE index i2 ON $catalogName.new_table (col2, col3, col5)" + s" OPTIONS ($indexOptions)") - - assert(jdbcTable.indexExists("i1") == true) - assert(jdbcTable.indexExists("i2") == true) + assert(jdbcTable.indexExists("i2")) + if (supportListIndexes) { + val indexes = jdbcTable.listIndexes() + assert(indexes.size == 2) + assert(indexes.map(_.indexName()).sorted === Array("i1", "i2")) + } // This should pass without exception sql(s"CREATE index IF NOT EXISTS i1 ON $catalogName.new_table (col1)") @@ -234,10 +246,18 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu assert(m.contains("Failed to create index i1 in new_table")) sql(s"DROP index i1 ON $catalogName.new_table") - sql(s"DROP index i2 ON $catalogName.new_table") - assert(jdbcTable.indexExists("i1") == false) + if (supportListIndexes) { + val indexes = jdbcTable.listIndexes() + assert(indexes.size == 1) + assert(indexes.head.indexName() == "i2") + } + + sql(s"DROP index i2 ON $catalogName.new_table") assert(jdbcTable.indexExists("i2") == false) + if (supportListIndexes) { + assert(jdbcTable.listIndexes().isEmpty) + } // This should pass without exception sql(s"DROP index IF EXISTS i1 ON $catalogName.new_table") diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt new file mode 100644 index 0000000000..96fa24175c --- /dev/null +++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt @@ -0,0 +1,13 @@ +================================================================================================ +MapStatuses Convert Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 
11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Num Maps: 50000 Fetch partitions:500 1324 1333 7 0.0 1324283680.0 1.0X +Num Maps: 50000 Fetch partitions:1000 2650 2670 32 0.0 2650318387.0 0.5X +Num Maps: 50000 Fetch partitions:1500 4018 4059 53 0.0 4017921009.0 0.3X + + diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt new file mode 100644 index 0000000000..0ba8d756df --- /dev/null +++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt @@ -0,0 +1,13 @@ +================================================================================================ +MapStatuses Convert Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Num Maps: 50000 Fetch partitions:500 1092 1104 22 0.0 1091691925.0 1.0X +Num Maps: 50000 Fetch partitions:1000 2172 2192 29 0.0 2171702137.0 0.5X +Num Maps: 50000 Fetch partitions:1500 3268 3291 27 0.0 3267904436.0 0.3X + + diff --git a/core/benchmarks/MapStatusesConvertBenchmark-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-results.txt index f41401bbe2..ae84abfdcc 100644 --- a/core/benchmarks/MapStatusesConvertBenchmark-results.txt +++ b/core/benchmarks/MapStatusesConvertBenchmark-results.txt @@ -2,12 +2,12 @@ MapStatuses Convert Benchmark 
================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1025-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Num Maps: 50000 Fetch partitions:500 1330 1359 26 0.0 1329827185.0 1.0X -Num Maps: 50000 Fetch partitions:1000 2648 2666 20 0.0 2647944453.0 0.5X -Num Maps: 50000 Fetch partitions:1500 4155 4436 383 0.0 4154563448.0 0.3X +Num Maps: 50000 Fetch partitions:500 1001 1033 36 0.0 1000638934.0 1.0X +Num Maps: 50000 Fetch partitions:1000 1699 1705 7 0.0 1699358972.0 0.6X +Num Maps: 50000 Fetch partitions:1500 2647 2855 314 0.0 2646904255.0 0.4X diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 1914c7eac2..6cb4f04ac7 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -373,12 +373,6 @@ class SparkContext(config: SparkConf) extends Logging { | stop() method to be called. | * ------------------------------------------------------------------------------------- */ - private def warnSparkMem(value: String): String = { - logWarning("Using SPARK_MEM to set amount of memory to use per executor process is " + - "deprecated, please use spark.executor.memory instead.") - value - } - /** Control our logLevel. This overrides any user-defined log settings. * @param logLevel The desired log level as a string. 
* Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN @@ -530,12 +524,7 @@ class SparkContext(config: SparkConf) extends Logging { } } - _executorMemory = _conf.getOption(EXECUTOR_MEMORY.key) - .orElse(Option(System.getenv("SPARK_EXECUTOR_MEMORY"))) - .orElse(Option(System.getenv("SPARK_MEM")) - .map(warnSparkMem)) - .map(Utils.memoryStringToMb) - .getOrElse(1024) + _executorMemory = SparkContext.executorMemoryInMb(_conf) // Convert java options to env vars as a work around // since we can't set env vars directly in sbt. @@ -2890,6 +2879,21 @@ object SparkContext extends Logging { } } + private[spark] def executorMemoryInMb(conf: SparkConf): Int = { + conf.getOption(EXECUTOR_MEMORY.key) + .orElse(Option(System.getenv("SPARK_EXECUTOR_MEMORY"))) + .orElse(Option(System.getenv("SPARK_MEM")) + .map(warnSparkMem)) + .map(Utils.memoryStringToMb) + .getOrElse(1024) + } + + private def warnSparkMem(value: String): String = { + logWarning("Using SPARK_MEM to set amount of memory to use per executor process is " + + "deprecated, please use spark.executor.memory instead.") + value + } + /** * Create a task scheduler based on a given master URL. * Return a 2-tuple of the scheduler backend and the task scheduler. diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 2f9152d31a..28ab9dc074 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -78,7 +78,7 @@ class SparkEnv ( // A general, soft-reference map for metadata needed during HadoopRDD split computation // (e.g., HadoopFileRDD uses this to cache JobConfs and InputFormats). 
private[spark] val hadoopJobMetadata = - CacheBuilder.newBuilder().softValues().build[String, AnyRef]().asMap() + CacheBuilder.newBuilder().maximumSize(1000).softValues().build[String, AnyRef]().asMap() private[spark] var driverTmpDir: Option[String] = None diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 33f2b18cb2..d11d2e7a4f 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -697,7 +697,7 @@ private[spark] class PythonAccumulatorV2( @transient private val serverHost: String, private val serverPort: Int, private val secretToken: String) - extends CollectionAccumulator[Array[Byte]] with Logging{ + extends CollectionAccumulator[Array[Byte]] with Logging { Utils.checkHost(serverHost) diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index dd962ca11e..a2a7fb5c10 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -48,7 +48,7 @@ private[spark] object SerDeUtil extends Logging { // This should be called before trying to unpickle array.array from Python // In cluster mode, this should be put in closure def initialize(): Unit = { - synchronized{ + synchronized { if (!initialized) { Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor()) Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor()) diff --git a/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala b/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala index e11f497b4b..39c2af0184 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala +++ 
b/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala @@ -19,23 +19,28 @@ package org.apache.spark.deploy import java.net.URI -import org.apache.spark.resource.ResourceRequirement +import org.apache.spark.resource.{ResourceProfile, ResourceRequirement, ResourceUtils} +import org.apache.spark.resource.ResourceProfile.getCustomExecutorResources private[spark] case class ApplicationDescription( name: String, maxCores: Option[Int], - memoryPerExecutorMB: Int, command: Command, appUiUrl: String, + defaultProfile: ResourceProfile, eventLogDir: Option[URI] = None, // short name of compression codec used when writing event logs, if any (e.g. lzf) eventLogCodec: Option[String] = None, - coresPerExecutor: Option[Int] = None, // number of executors this application wants to start with, // only used if dynamic allocation is enabled initialExecutorLimit: Option[Int] = None, - user: String = System.getProperty("user.name", ""), - resourceReqsPerExecutor: Seq[ResourceRequirement] = Seq.empty) { + user: String = System.getProperty("user.name", "")) { + + def memoryPerExecutorMB: Int = defaultProfile.getExecutorMemory.map(_.toInt).getOrElse(1024) + def coresPerExecutor: Option[Int] = defaultProfile.getExecutorCores + def resourceReqsPerExecutor: Seq[ResourceRequirement] = + ResourceUtils.executorResourceRequestToRequirement( + getCustomExecutorResources(defaultProfile).values.toSeq.sortBy(_.resourceName)) override def toString: String = "ApplicationDescription(" + name + ")" } diff --git a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala index 727cdbc4ef..4ec0edd590 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala @@ -24,7 +24,7 @@ import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, WorkerInfo} import org.apache.spark.deploy.master.DriverState.DriverState import 
org.apache.spark.deploy.master.RecoveryState.MasterState import org.apache.spark.deploy.worker.{DriverRunner, ExecutorRunner} -import org.apache.spark.resource.ResourceInformation +import org.apache.spark.resource.{ResourceInformation, ResourceProfile} import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef} import org.apache.spark.util.Utils @@ -166,6 +166,7 @@ private[deploy] object DeployMessages { masterUrl: String, appId: String, execId: Int, + rpId: Int, appDesc: ApplicationDescription, cores: Int, memory: Int, @@ -196,7 +197,7 @@ private[deploy] object DeployMessages { case class MasterChangeAcknowledged(appId: String) - case class RequestExecutors(appId: String, requestedTotal: Int) + case class RequestExecutors(appId: String, resourceProfileToTotalExecs: Map[ResourceProfile, Int]) case class KillExecutors(appId: String, executorIds: Seq[String]) diff --git a/core/src/main/scala/org/apache/spark/deploy/ExecutorDescription.scala b/core/src/main/scala/org/apache/spark/deploy/ExecutorDescription.scala index ec23371b52..f613f6d78d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ExecutorDescription.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ExecutorDescription.scala @@ -25,10 +25,13 @@ package org.apache.spark.deploy private[deploy] class ExecutorDescription( val appId: String, val execId: Int, + val rpId: Int, val cores: Int, + val memoryMb: Int, val state: ExecutorState.Value) extends Serializable { override def toString: String = - "ExecutorState(appId=%s, execId=%d, cores=%d, state=%s)".format(appId, execId, cores, state) + "ExecutorState(appId=%s, execId=%d, rpId=%d, cores=%d, memoryMb=%d state=%s)" + .format(appId, execId, rpId, cores, memoryMb, state) } diff --git a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala index e5efb15f6b..b8857ba9dc 100644 --- 
a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala @@ -30,6 +30,7 @@ import org.apache.spark.deploy.{ApplicationDescription, ExecutorState} import org.apache.spark.deploy.DeployMessages._ import org.apache.spark.deploy.master.Master import org.apache.spark.internal.Logging +import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc._ import org.apache.spark.scheduler.ExecutorDecommissionInfo import org.apache.spark.util.{RpcUtils, ThreadUtils} @@ -294,14 +295,25 @@ private[spark] class StandaloneAppClient( } /** - * Request executors from the Master by specifying the total number desired, - * including existing pending and running executors. + * Request executors for default resource profile from the Master by specifying the + * total number desired, including existing pending and running executors. * * @return whether the request is acknowledged. */ def requestTotalExecutors(requestedTotal: Int): Future[Boolean] = { + requestTotalExecutors(Map(appDescription.defaultProfile -> requestedTotal)) + } + + /** + * Request executors from the Master by specifying the total number desired for each + * resource profile, including existing pending and running executors. + * + * @return whether the request is acknowledged. 
+ */ + def requestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Future[Boolean] = { if (endpoint.get != null && appId.get != null) { - endpoint.get.ask[Boolean](RequestExecutors(appId.get, requestedTotal)) + endpoint.get.ask[Boolean](RequestExecutors(appId.get, resourceProfileToTotalExecs)) } else { logWarning("Attempted to request executors before driver fully initialized.") Future.successful(false) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala index a5f2394960..02c01a5598 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala @@ -21,7 +21,6 @@ import scala.io.{Codec, Source} import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} -import org.json4s.jackson.JsonMethods.parse import org.apache.spark.deploy.history.EventFilter.FilterStatistics import org.apache.spark.internal.Logging @@ -81,7 +80,7 @@ private[spark] object EventFilter extends Logging { lines.zipWithIndex.foreach { case (line, lineNum) => try { val event = try { - Some(JsonProtocol.sparkEventFromJson(parse(line))) + Some(JsonProtocol.sparkEventFromJson(line)) } catch { // ignore any exception occurred from unidentified json case NonFatal(_) => diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala index 03965e6dbb..a2926ca64b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala @@ -23,7 +23,8 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.deploy.ApplicationDescription -import org.apache.spark.resource.ResourceInformation +import 
org.apache.spark.resource.{ResourceInformation, ResourceProfile, ResourceUtils} +import org.apache.spark.resource.ResourceProfile.{getCustomExecutorResources, DEFAULT_RESOURCE_PROFILE_ID} import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.util.Utils @@ -43,10 +44,10 @@ private[spark] class ApplicationInfo( @transient var endTime: Long = _ @transient var appSource: ApplicationSource = _ - // A cap on the number of executors this application can have at any given time. - // By default, this is infinite. Only after the first allocation request is issued by the - // application will this be set to a finite value. This is used for dynamic allocation. - @transient private[master] var executorLimit: Int = _ + @transient private var executorsPerResourceProfileId: mutable.HashMap[Int, mutable.Set[Int]] = _ + @transient private var targetNumExecutorsPerResourceProfileId: mutable.HashMap[Int, Int] = _ + @transient private var rpIdToResourceProfile: mutable.HashMap[Int, ResourceProfile] = _ + @transient private var rpIdToResourceDesc: mutable.HashMap[Int, ExecutorResourceDescription] = _ @transient private var nextExecutorId: Int = _ @@ -65,7 +66,68 @@ private[spark] class ApplicationInfo( appSource = new ApplicationSource(this) nextExecutorId = 0 removedExecutors = new ArrayBuffer[ExecutorDesc] - executorLimit = desc.initialExecutorLimit.getOrElse(Integer.MAX_VALUE) + val initialExecutorLimit = desc.initialExecutorLimit.getOrElse(Integer.MAX_VALUE) + + rpIdToResourceProfile = new mutable.HashMap[Int, ResourceProfile]() + rpIdToResourceProfile(DEFAULT_RESOURCE_PROFILE_ID) = desc.defaultProfile + rpIdToResourceDesc = new mutable.HashMap[Int, ExecutorResourceDescription]() + createResourceDescForResourceProfile(desc.defaultProfile) + + targetNumExecutorsPerResourceProfileId = new mutable.HashMap[Int, Int]() + targetNumExecutorsPerResourceProfileId(DEFAULT_RESOURCE_PROFILE_ID) = initialExecutorLimit + + executorsPerResourceProfileId = new mutable.HashMap[Int, 
mutable.Set[Int]]() + } + + private[deploy] def getOrUpdateExecutorsForRPId(rpId: Int): mutable.Set[Int] = { + executorsPerResourceProfileId.getOrElseUpdate(rpId, mutable.HashSet[Int]()) + } + + private[deploy] def getTargetExecutorNumForRPId(rpId: Int): Int = { + targetNumExecutorsPerResourceProfileId.getOrElse(rpId, 0) + } + + private[deploy] def getRequestedRPIds(): Seq[Int] = { + rpIdToResourceProfile.keys.toSeq.sorted + } + + private def createResourceDescForResourceProfile(resourceProfile: ResourceProfile): Unit = { + if (!rpIdToResourceDesc.contains(resourceProfile.id)) { + val defaultMemoryMbPerExecutor = desc.memoryPerExecutorMB + val defaultCoresPerExecutor = desc.coresPerExecutor + val coresPerExecutor = resourceProfile.getExecutorCores + .orElse(defaultCoresPerExecutor) + val memoryMbPerExecutor = resourceProfile.getExecutorMemory + .map(_.toInt) + .getOrElse(defaultMemoryMbPerExecutor) + val customResources = ResourceUtils.executorResourceRequestToRequirement( + getCustomExecutorResources(resourceProfile).values.toSeq.sortBy(_.resourceName)) + + rpIdToResourceDesc(resourceProfile.id) = + ExecutorResourceDescription(coresPerExecutor, memoryMbPerExecutor, customResources) + } + } + + // Get resources required for schedule. 
+ private[deploy] def getResourceDescriptionForRpId(rpId: Int): ExecutorResourceDescription = { + rpIdToResourceDesc(rpId) + } + + private[deploy] def requestExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Unit = { + resourceProfileToTotalExecs.foreach { case (rp, num) => + createResourceDescForResourceProfile(rp) + + if (!rpIdToResourceProfile.contains(rp.id)) { + rpIdToResourceProfile(rp.id) = rp + } + + targetNumExecutorsPerResourceProfileId(rp.id) = num + } + } + + private[deploy] def getResourceProfileById(rpId: Int): ResourceProfile = { + rpIdToResourceProfile(rpId) } private def newExecutorId(useID: Option[Int] = None): Int = { @@ -83,11 +145,14 @@ private[spark] class ApplicationInfo( private[master] def addExecutor( worker: WorkerInfo, cores: Int, + memoryMb: Int, resources: Map[String, ResourceInformation], + rpId: Int, useID: Option[Int] = None): ExecutorDesc = { - val exec = new ExecutorDesc(newExecutorId(useID), this, worker, cores, - desc.memoryPerExecutorMB, resources) + val exec = new ExecutorDesc( + newExecutorId(useID), this, worker, cores, memoryMb, resources, rpId) executors(exec.id) = exec + getOrUpdateExecutorsForRPId(rpId).add(exec.id) coresGranted += cores exec } @@ -96,6 +161,7 @@ private[spark] class ApplicationInfo( if (executors.contains(exec.id)) { removedExecutors += executors(exec.id) executors -= exec.id + executorsPerResourceProfileId(exec.rpId) -= exec.id coresGranted -= exec.cores } } @@ -125,10 +191,11 @@ private[spark] class ApplicationInfo( } /** - * Return the limit on the number of executors this application can have. - * For testing only. + * Return the total limit on the number of executors for all resource profiles. 
*/ - private[deploy] def getExecutorLimit: Int = executorLimit + private[deploy] def getExecutorLimit: Int = { + targetNumExecutorsPerResourceProfileId.values.sum + } def duration: Long = { if (endTime != -1) { diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala b/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala index a598d2a1dd..eaf93b67ee 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala @@ -28,7 +28,8 @@ private[master] class ExecutorDesc( val memory: Int, // resources(e.f. gpu/fpga) allocated to this executor // map from resource name to ResourceInformation - val resources: Map[String, ResourceInformation]) { + val resources: Map[String, ResourceInformation], + val rpId: Int) { var state = ExecutorState.LAUNCHING diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ExecutorResourceDescription.scala b/core/src/main/scala/org/apache/spark/deploy/master/ExecutorResourceDescription.scala new file mode 100644 index 0000000000..2d594e90bf --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/master/ExecutorResourceDescription.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.master + +import org.apache.spark.resource.ResourceRequirement + +/** + * Describe resource requirements for different resource profiles. Used for executor schedule. + * + * @param coresPerExecutor cores for each executor. + * @param memoryMbPerExecutor memory for each executor. + * @param customResourcesPerExecutor custom resource requests for each executor. + */ +private[spark] case class ExecutorResourceDescription( + coresPerExecutor: Option[Int], + memoryMbPerExecutor: Int, + customResourcesPerExecutor: Seq[ResourceRequirement] = Seq.empty) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 2939940698..6085a41963 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -37,7 +37,7 @@ import org.apache.spark.internal.config.Deploy._ import org.apache.spark.internal.config.UI._ import org.apache.spark.internal.config.Worker._ import org.apache.spark.metrics.{MetricsSystem, MetricsSystemInstances} -import org.apache.spark.resource.{ResourceRequirement, ResourceUtils} +import org.apache.spark.resource.{ResourceProfile, ResourceRequirement, ResourceUtils} import org.apache.spark.rpc._ import org.apache.spark.serializer.{JavaSerializer, Serializer} import org.apache.spark.util.{SparkUncaughtExceptionHandler, ThreadUtils, Utils} @@ -348,8 +348,8 @@ private[deploy] class Master( for (exec <- validExecutors) { val (execDesc, execResources) = (exec.desc, exec.resources) val app = idToApp(execDesc.appId) - val execInfo = app.addExecutor( - worker, execDesc.cores, execResources, Some(execDesc.execId)) + val execInfo = app.addExecutor(worker, execDesc.cores, + execDesc.memoryMb, execResources, execDesc.rpId, Some(execDesc.execId)) 
worker.addExecutor(execInfo) worker.recoverResources(execResources) execInfo.copyState(execDesc) @@ -482,8 +482,8 @@ private[deploy] class Master( case BoundPortsRequest => context.reply(BoundPortsResponse(address.port, webUi.boundPort, restServerBoundPort)) - case RequestExecutors(appId, requestedTotal) => - context.reply(handleRequestExecutors(appId, requestedTotal)) + case RequestExecutors(appId, resourceProfileToTotalExecs: Map[ResourceProfile, Int]) => + context.reply(handleRequestExecutors(appId, resourceProfileToTotalExecs)) case KillExecutors(appId, executorIds) => val formattedExecutorIds = formatExecutorIds(executorIds) @@ -647,13 +647,15 @@ private[deploy] class Master( */ private def scheduleExecutorsOnWorkers( app: ApplicationInfo, + rpId: Int, + resourceDesc: ExecutorResourceDescription, usableWorkers: Array[WorkerInfo], spreadOutApps: Boolean): Array[Int] = { - val coresPerExecutor = app.desc.coresPerExecutor + val coresPerExecutor = resourceDesc.coresPerExecutor val minCoresPerExecutor = coresPerExecutor.getOrElse(1) val oneExecutorPerWorker = coresPerExecutor.isEmpty - val memoryPerExecutor = app.desc.memoryPerExecutorMB - val resourceReqsPerExecutor = app.desc.resourceReqsPerExecutor + val memoryPerExecutor = resourceDesc.memoryMbPerExecutor + val resourceReqsPerExecutor = resourceDesc.customResourcesPerExecutor val numUsable = usableWorkers.length val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker @@ -679,7 +681,9 @@ private[deploy] class Master( } val enoughResources = ResourceUtils.resourcesMeetRequirements( resourcesFree, resourceReqsPerExecutor) - val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit + val executorNum = app.getOrUpdateExecutorsForRPId(rpId).size + val executorLimit = app.getTargetExecutorNumForRPId(rpId) + val underLimit = assignedExecutors.sum + executorNum < executorLimit 
keepScheduling && enoughCores && enoughMemory && enoughResources && underLimit } else { // We're adding cores to an existing executor, so no need @@ -725,26 +729,38 @@ private[deploy] class Master( */ private def startExecutorsOnWorkers(): Unit = { // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app - // in the queue, then the second app, etc. + // in the queue, then the second app, etc. And for each app, we will schedule based on + // resource profiles also with a simple FIFO scheduler, resource profile with smaller id + // first. for (app <- waitingApps) { - val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1) - // If the cores left is less than the coresPerExecutor,the cores left will not be allocated - if (app.coresLeft >= coresPerExecutor) { - // Filter out workers that don't have enough resources to launch an executor - val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE) - .filter(canLaunchExecutor(_, app.desc)) - .sortBy(_.coresFree).reverse - val appMayHang = waitingApps.length == 1 && - waitingApps.head.executors.isEmpty && usableWorkers.isEmpty - if (appMayHang) { - logWarning(s"App ${app.id} requires more resource than any of Workers could have.") - } - val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps) - - // Now that we've decided how many cores to allocate on each worker, let's allocate them - for (pos <- usableWorkers.indices if assignedCores(pos) > 0) { - allocateWorkerResourceToExecutors( - app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos)) + for (rpId <- app.getRequestedRPIds()) { + logInfo(s"Start scheduling for app ${app.id} with rpId: $rpId") + val resourceDesc = app.getResourceDescriptionForRpId(rpId) + val coresPerExecutor = resourceDesc.coresPerExecutor.getOrElse(1) + + // If the cores left is less than the coresPerExecutor,the cores left will not be allocated + if (app.coresLeft >= coresPerExecutor) { + // Filter out
workers that don't have enough resources to launch an executor + val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE) + .filter(canLaunchExecutor(_, resourceDesc)) + .sortBy(_.coresFree).reverse + val appMayHang = waitingApps.length == 1 && + waitingApps.head.executors.isEmpty && usableWorkers.isEmpty + if (appMayHang) { + logWarning(s"App ${app.id} requires more resource than any of Workers could have.") + } + val assignedCores = + scheduleExecutorsOnWorkers(app, rpId, resourceDesc, usableWorkers, spreadOutApps) + + // Now that we've decided how many cores to allocate on each worker, let's allocate them + for (pos <- usableWorkers.indices if assignedCores(pos) > 0) { + allocateWorkerResourceToExecutors( + app, + assignedCores(pos), + resourceDesc, + usableWorkers(pos), + rpId) + } } } } @@ -754,22 +770,26 @@ private[deploy] class Master( * Allocate a worker's resources to one or more executors. * @param app the info of the application which the executors belong to * @param assignedCores number of cores on this worker for this application - * @param coresPerExecutor number of cores per executor + * @param resourceDesc resources requested for the executor * @param worker the worker info + * @param rpId resource profile id for the executor */ private def allocateWorkerResourceToExecutors( app: ApplicationInfo, assignedCores: Int, - coresPerExecutor: Option[Int], - worker: WorkerInfo): Unit = { + resourceDesc: ExecutorResourceDescription, + worker: WorkerInfo, + rpId: Int): Unit = { + val coresPerExecutor = resourceDesc.coresPerExecutor // If the number of cores per executor is specified, we divide the cores assigned // to this worker evenly among the executors with no remainder. // Otherwise, we launch a single executor that grabs all the assignedCores on this worker. 
val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1) val coresToAssign = coresPerExecutor.getOrElse(assignedCores) for (i <- 1 to numExecutors) { - val allocated = worker.acquireResources(app.desc.resourceReqsPerExecutor) - val exec = app.addExecutor(worker, coresToAssign, allocated) + val allocated = worker.acquireResources(resourceDesc.customResourcesPerExecutor) + val exec = app.addExecutor( + worker, coresToAssign, resourceDesc.memoryMbPerExecutor, allocated, rpId) launchExecutor(worker, exec) app.state = ApplicationState.RUNNING } @@ -798,12 +818,14 @@ private[deploy] class Master( /** * @return whether the worker could launch the executor according to application's requirement */ - private def canLaunchExecutor(worker: WorkerInfo, desc: ApplicationDescription): Boolean = { + private def canLaunchExecutor( + worker: WorkerInfo, + resourceDesc: ExecutorResourceDescription): Boolean = { canLaunch( worker, - desc.memoryPerExecutorMB, - desc.coresPerExecutor.getOrElse(1), - desc.resourceReqsPerExecutor) + resourceDesc.memoryMbPerExecutor, + resourceDesc.coresPerExecutor.getOrElse(1), + resourceDesc.customResourcesPerExecutor) } /** @@ -849,7 +871,7 @@ private[deploy] class Master( logInfo("Launching executor " + exec.fullId + " on worker " + worker.id) worker.addExecutor(exec) worker.endpoint.send(LaunchExecutor(masterUrl, exec.application.id, exec.id, - exec.application.desc, exec.cores, exec.memory, exec.resources)) + exec.rpId, exec.application.desc, exec.cores, exec.memory, exec.resources)) exec.application.driver.send( ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory)) } @@ -986,7 +1008,7 @@ private[deploy] class Master( new ApplicationInfo(now, appId, desc, date, driver, defaultCores) } - private def registerApplication(app: ApplicationInfo): Unit = { + private[master] def registerApplication(app: ApplicationInfo): Unit = { val appAddress = app.driver.address if (addressToApp.contains(appAddress)) { 
logInfo("Attempted to re-register application at same address: " + appAddress) @@ -1049,15 +1071,18 @@ private[deploy] class Master( * * @return whether the application has previously registered with this Master. */ - private def handleRequestExecutors(appId: String, requestedTotal: Int): Boolean = { + private def handleRequestExecutors( + appId: String, + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Boolean = { idToApp.get(appId) match { case Some(appInfo) => - logInfo(s"Application $appId requested to set total executors to $requestedTotal.") - appInfo.executorLimit = requestedTotal + logInfo(s"Application $appId requested executors: ${resourceProfileToTotalExecs}.") + appInfo.requestExecutors(resourceProfileToTotalExecs) schedule() true case None => - logWarning(s"Unknown application $appId requested $requestedTotal total executors.") + logWarning(s"Unknown application $appId requested executors:" + + s" ${resourceProfileToTotalExecs}.") false } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index 10d3acaa4e..9e10a0bbf3 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -43,8 +43,8 @@ private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") return UIUtils.basicSparkPage(request, msg, "Not Found") } - val executorHeaders = Seq("ExecutorID", "Worker", "Cores", "Memory", "Resources", - "State", "Logs") + val executorHeaders = Seq("ExecutorID", "Worker", "Cores", "Memory", "Resource Profile Id", + "Resources", "State", "Logs") val allExecutors = (app.executors.values ++ app.removedExecutors).toSet.toSeq // This includes executors that are either still running or have exited cleanly val executors = allExecutors.filter { exec => @@ -76,17 +76,17 @@ private[ui] class ApplicationPage(parent: 
MasterWebUI) extends WebUIPage("app") data-placement="top"> Executor Limit: { - if (app.executorLimit == Int.MaxValue) "Unlimited" else app.executorLimit + if (app.getExecutorLimit == Int.MaxValue) "Unlimited" else app.getExecutorLimit } ({app.executors.size} granted)
  • - Executor Memory: + Executor Memory - Default Resource Profile: {Utils.megabytesToString(app.desc.memoryPerExecutorMB)}
  • - Executor Resources: + Executor Resources - Default Resource Profile: {formatResourceRequirements(app.desc.resourceReqsPerExecutor)}
  • Submit Date: {UIUtils.formatDate(app.submitDate)}
  • @@ -145,6 +145,7 @@ private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") {executor.cores} {executor.memory} + {executor.rpId} {formatResourcesAddresses(executor.resources)} {executor.state} diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index 40d940778e..bf5d889a87 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -58,6 +58,7 @@ private[deploy] class ExecutorRunner( conf: SparkConf, val appLocalDirs: Seq[String], @volatile var state: ExecutorState.Value, + val rpId: Int, val resources: Map[String, ResourceInformation] = Map.empty) extends Logging { @@ -139,6 +140,7 @@ private[deploy] class ExecutorRunner( case "{{HOSTNAME}}" => host case "{{CORES}}" => cores.toString case "{{APP_ID}}" => appId + case "{{RESOURCE_PROFILE_ID}}" => rpId.toString case other => other } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 8d4f390b2c..04ef9cc012 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -485,7 +485,7 @@ private[deploy] class Worker( } val execs = executors.values.map { e => - new ExecutorDescription(e.appId, e.execId, e.cores, e.state) + new ExecutorDescription(e.appId, e.execId, e.rpId, e.cores, e.memory, e.state) } masterRef.send(WorkerLatestState(workerId, execs.toList, drivers.keys.toSeq)) @@ -555,7 +555,7 @@ private[deploy] class Worker( val executorResponses = executors.values.map { e => WorkerExecutorStateResponse(new ExecutorDescription( - e.appId, e.execId, e.cores, e.state), e.resources) + e.appId, e.execId, e.rpId, e.cores, e.memory, e.state), e.resources) } val driverResponses = drivers.keys.map { id => 
WorkerDriverStateResponse(id, drivers(id).resources)} @@ -566,7 +566,7 @@ private[deploy] class Worker( logInfo(s"Master with url $masterUrl requested this worker to reconnect.") registerWithMaster() - case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_, resources_) => + case LaunchExecutor(masterUrl, appId, execId, rpId, appDesc, cores_, memory_, resources_) => if (masterUrl != activeMasterUrl) { logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.") } else if (decommissioned) { @@ -622,6 +622,7 @@ private[deploy] class Worker( conf, appLocalDirs, ExecutorState.LAUNCHING, + rpId, resources_) executors(appId + "/" + execId) = manager manager.start() diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceInformation.scala b/core/src/main/scala/org/apache/spark/resource/ResourceInformation.scala index be056e15b6..7f7bb36512 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceInformation.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceInformation.scala @@ -55,7 +55,9 @@ class ResourceInformation( override def hashCode(): Int = Seq(name, addresses.toSeq).hashCode() - def toJson(): JValue = ResourceInformationJson(name, addresses).toJValue + // TODO(SPARK-39658): reconsider whether we want to expose a third-party library's + // symbols as part of a public API: + final def toJson(): JValue = ResourceInformationJson(name, addresses).toJValue } private[spark] object ResourceInformation { diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala index 087897ff73..5e02c61459 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala @@ -24,7 +24,7 @@ import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.{SparkConf, 
SparkException} +import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.annotation.{Evolving, Since} import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ @@ -90,6 +90,10 @@ class ResourceProfile( executorResources.get(ResourceProfile.PYSPARK_MEM).map(_.amount) } + private[spark] def getExecutorMemory: Option[Long] = { + executorResources.get(ResourceProfile.MEMORY).map(_.amount) + } + /* * This function takes into account fractional amounts for the task resource requirement. * Spark only supports fractional amounts < 1 to basically allow for multiple tasks @@ -336,9 +340,25 @@ object ResourceProfile extends Logging { private def getDefaultExecutorResources(conf: SparkConf): Map[String, ExecutorResourceRequest] = { val ereqs = new ExecutorResourceRequests() - val cores = conf.get(EXECUTOR_CORES) - ereqs.cores(cores) - val memory = conf.get(EXECUTOR_MEMORY) + + val isStandalone = conf.getOption("spark.master").exists(_.startsWith("spark://")) + // Since local-cluster and standalone share the same StandaloneSchedulerBackend and Master, + // and the Master will schedule based on resource profile, so we also need to create default + // resource profile for local-cluster here as well as standalone. + val isLocalCluster = conf.getOption("spark.master").exists(_.startsWith("local-cluster")) + // By default, standalone executors take all available cores, do not have a specific value. 
+ val cores = if (isStandalone || isLocalCluster) { + conf.getOption(EXECUTOR_CORES.key).map(_.toInt) + } else { + Some(conf.get(EXECUTOR_CORES)) + } + cores.foreach(ereqs.cores) + + val memory = if (isStandalone || isLocalCluster) { + SparkContext.executorMemoryInMb(conf) + } else { + conf.get(EXECUTOR_MEMORY) + } ereqs.memory(memory.toString) val overheadMem = conf.get(EXECUTOR_MEMORY_OVERHEAD) overheadMem.map(mem => ereqs.memoryOverhead(mem.toString)) @@ -360,7 +380,7 @@ object ResourceProfile extends Logging { } // for testing only - private[spark] def reInitDefaultProfile(conf: SparkConf): Unit = { + private[spark] def reInitDefaultProfile(conf: SparkConf): ResourceProfile = { clearDefaultProfile() // force recreate it after clearing getOrCreateDefaultProfile(conf) @@ -402,7 +422,7 @@ object ResourceProfile extends Logging { } private[spark] case class ExecutorResourcesOrDefaults( - cores: Int, + cores: Option[Int], // Can only be None for standalone and local-cluster. executorMemoryMiB: Long, memoryOffHeapMiB: Long, pysparkMemoryMiB: Long, @@ -411,7 +431,7 @@ object ResourceProfile extends Logging { customResources: Map[String, ExecutorResourceRequest]) private[spark] case class DefaultProfileExecutorResources( - cores: Int, + cores: Option[Int], // Can only be None for standalone and local-cluster.
executorMemoryMiB: Long, memoryOffHeapMiB: Long, pysparkMemoryMiB: Option[Long], @@ -461,7 +481,7 @@ object ResourceProfile extends Logging { case ResourceProfile.OFFHEAP_MEM => memoryOffHeapMiB = executorOffHeapMemorySizeAsMb(conf, execReq) case ResourceProfile.CORES => - cores = execReq.amount.toInt + cores = Some(execReq.amount.toInt) case rName => val nameToUse = resourceMappings.getOrElse(rName, rName) customResources(nameToUse) = execReq diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala index 2858443c7c..489d9c3e85 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.HashMap import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.annotation.Evolving -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{config, Logging} import org.apache.spark.internal.config.Tests._ import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerResourceProfileAdded} import org.apache.spark.util.Utils @@ -54,6 +54,7 @@ private[spark] class ResourceProfileManager(sparkConf: SparkConf, private val master = sparkConf.getOption("spark.master") private val isYarn = master.isDefined && master.get.equals("yarn") private val isK8s = master.isDefined && master.get.startsWith("k8s://") + private val isStandalone = master.isDefined && master.get.startsWith("spark://") private val notRunningUnitTests = !isTesting private val testExceptionThrown = sparkConf.get(RESOURCE_PROFILE_MANAGER_TESTING) @@ -63,17 +64,27 @@ private[spark] class ResourceProfileManager(sparkConf: SparkConf, */ private[spark] def isSupported(rp: ResourceProfile): Boolean = { val isNotDefaultProfile = rp.id != ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID - val notYarnOrK8sAndNotDefaultProfile = 
isNotDefaultProfile && !(isYarn || isK8s) - val YarnOrK8sNotDynAllocAndNotDefaultProfile = - isNotDefaultProfile && (isYarn || isK8s) && !dynamicEnabled + val notYarnOrK8sOrStandaloneAndNotDefaultProfile = + isNotDefaultProfile && !(isYarn || isK8s || isStandalone) + val YarnOrK8sOrStandaloneNotDynAllocAndNotDefaultProfile = + isNotDefaultProfile && (isYarn || isK8s || isStandalone) && !dynamicEnabled // We want the exception to be thrown only when we are specifically testing for the // exception or in a real application. Otherwise in all other testing scenarios we want // to skip throwing the exception so that we can test in other modes to make testing easier. if ((notRunningUnitTests || testExceptionThrown) && - (notYarnOrK8sAndNotDefaultProfile || YarnOrK8sNotDynAllocAndNotDefaultProfile)) { + (notYarnOrK8sOrStandaloneAndNotDefaultProfile || + YarnOrK8sOrStandaloneNotDynAllocAndNotDefaultProfile)) { throw new SparkException("ResourceProfiles are only supported on YARN and Kubernetes " + - "with dynamic allocation enabled.") + "and Standalone with dynamic allocation enabled.") } + + if (isStandalone && rp.getExecutorCores.isEmpty && + sparkConf.getOption(config.EXECUTOR_CORES.key).isEmpty) { + logWarning("Neither executor cores is set for resource profile, nor spark.executor.cores " + + "is explicitly set, you may get more executors allocated than expected. It's recommended " + + "to set executor cores explicitly. 
Please check SPARK-30299 for more details.") + } + true } diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 3f0a0d36df..5b5a51fc76 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -222,6 +222,12 @@ private[spark] object ResourceUtils extends Logging { } } + def executorResourceRequestToRequirement(resourceRequest: Seq[ExecutorResourceRequest]) + : Seq[ResourceRequirement] = { + resourceRequest.map(request => + ResourceRequirement(request.resourceName, request.amount.toInt, 1)) + } + def resourcesMeetRequirements( resourcesFree: Map[String, Int], resourceRequirements: Seq[ResourceRequirement]) diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index cfbaa46ab6..b52a0f2f99 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -24,8 +24,6 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.hadoop.conf.Configuration -import org.json4s.JsonAST.JValue -import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SPARK_VERSION, SparkConf, SparkContext} import org.apache.spark.deploy.SparkHadoopUtil @@ -66,7 +64,7 @@ private[spark] class EventLoggingListener( EventLogFileWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) // For testing. Keep track of all JSON serialized events that have been logged. 
- private[scheduler] val loggedEvents = new mutable.ArrayBuffer[JValue] + private[scheduler] val loggedEvents = new mutable.ArrayBuffer[String] private val shouldLogBlockUpdates = sparkConf.get(EVENT_LOG_BLOCK_UPDATES) private val shouldLogStageExecutorMetrics = sparkConf.get(EVENT_LOG_STAGE_EXECUTOR_METRICS) @@ -86,9 +84,8 @@ private[spark] class EventLoggingListener( private def initEventLog(): Unit = { val metadata = SparkListenerLogStart(SPARK_VERSION) - val eventJson = JsonProtocol.logStartToJson(metadata) - val metadataJson = compact(eventJson) - logWriter.writeEvent(metadataJson, flushLogger = true) + val eventJson = JsonProtocol.sparkEventToJsonString(metadata) + logWriter.writeEvent(eventJson, flushLogger = true) if (testing && loggedEvents != null) { loggedEvents += eventJson } @@ -96,8 +93,8 @@ private[spark] class EventLoggingListener( /** Log the event as JSON. */ private def logEvent(event: SparkListenerEvent, flushLogger: Boolean = false): Unit = { - val eventJson = JsonProtocol.sparkEventToJson(event) - logWriter.writeEvent(compact(render(eventJson)), flushLogger) + val eventJson = JsonProtocol.sparkEventToJsonString(event) + logWriter.writeEvent(eventJson, flushLogger) if (testing) { loggedEvents += eventJson } diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 60b6fe7a60..dbb4fa74de 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -23,7 +23,6 @@ import scala.io.{Codec, Source} import com.fasterxml.jackson.core.JsonParseException import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException -import org.json4s.jackson.JsonMethods._ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ReplayListenerBus._ @@ -86,7 +85,7 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with 
Logging { currentLine = entry._1 lineNumber = entry._2 + 1 - postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine))) + postToAll(JsonProtocol.sparkEventFromJson(currentLine)) } catch { case e: ClassNotFoundException => // Ignore unknown events, parse through the event log file. diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index 7a05569601..befc59b821 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -29,7 +29,7 @@ import org.apache.spark.deploy.client.{StandaloneAppClient, StandaloneAppClientL import org.apache.spark.internal.{config, Logging} import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} -import org.apache.spark.resource.{ResourceProfile, ResourceUtils} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.RpcEndpointAddress import org.apache.spark.scheduler._ import org.apache.spark.util.Utils @@ -82,7 +82,8 @@ private[spark] class StandaloneSchedulerBackend( "--hostname", "{{HOSTNAME}}", "--cores", "{{CORES}}", "--app-id", "{{APP_ID}}", - "--worker-url", "{{WORKER_URL}}") + "--worker-url", "{{WORKER_URL}}", + "--resourceProfileId", "{{RESOURCE_PROFILE_ID}}") val extraJavaOpts = sc.conf.get(config.EXECUTOR_JAVA_OPTIONS) .map(Utils.splitCommandString).getOrElse(Seq.empty) val classPathEntries = sc.conf.get(config.EXECUTOR_CLASS_PATH) @@ -111,15 +112,18 @@ private[spark] class StandaloneSchedulerBackend( // ExecutorAllocationManager will send the real initial limit to the Master later. 
val initialExecutorLimit = if (Utils.isDynamicAllocationEnabled(conf)) { + if (coresPerExecutor.isEmpty) { + logWarning("Dynamic allocation enabled without spark.executor.cores explicitly " + + "set, you may get more executors allocated than expected. It's recommended to " + + "set spark.executor.cores explicitly. Please check SPARK-30299 for more details.") + } + Some(0) } else { None } - val executorResourceReqs = ResourceUtils.parseResourceRequirements(conf, - config.SPARK_EXECUTOR_PREFIX) - val appDesc = ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command, - webUrl, sc.eventLogDir, sc.eventLogCodec, coresPerExecutor, initialExecutorLimit, - resourceReqsPerExecutor = executorResourceReqs) + val appDesc = ApplicationDescription(sc.appName, maxCores, command, + webUrl, defaultProfile = defaultProf, sc.eventLogDir, sc.eventLogCodec, initialExecutorLimit) client = new StandaloneAppClient(sc.env.rpcEnv, masters, appDesc, this, conf) client.start() launcherBackend.setState(SparkAppHandle.State.SUBMITTED) @@ -215,8 +219,7 @@ private[spark] class StandaloneSchedulerBackend( // resources profiles not supported Option(client) match { case Some(c) => - val numExecs = resourceProfileToTotalExecs.getOrElse(defaultProf, 0) - c.requestTotalExecutors(numExecs) + c.requestTotalExecutors(resourceProfileToTotalExecs) case None => logWarning("Attempted to request executors before driver fully initialized.") Future.successful(false) diff --git a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala index d3a061fae7..64a786e582 100644 --- a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala +++ b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala @@ -47,7 +47,7 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { // Schedule a refresh thread to run periodically private val timer = new Timer("refresh progress", true) - 
timer.schedule(new TimerTask{ + timer.schedule(new TimerTask { override def run(): Unit = { refresh() } diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index f0755b04be..5820a50fb7 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -17,17 +17,17 @@ package org.apache.spark.util +import java.io.ByteArrayOutputStream +import java.nio.charset.StandardCharsets import java.util.{Properties, UUID} import scala.collection.JavaConverters._ import scala.collection.Map -import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} +import com.fasterxml.jackson.core.{JsonEncoding, JsonGenerator} +import com.fasterxml.jackson.databind.{DeserializationFeature, JsonNode, ObjectMapper} import com.fasterxml.jackson.module.scala.DefaultScalaModule -import org.json4s.DefaultFormats -import org.json4s.JsonAST._ -import org.json4s.JsonDSL._ -import org.json4s.jackson.JsonMethods._ +import org.json4s.jackson.JsonMethods.compact import org.apache.spark._ import org.apache.spark.executor._ @@ -57,8 +57,6 @@ import org.apache.spark.util.Utils.weakIntern private[spark] object JsonProtocol { // TODO: Remove this file and put JSON serialization into each individual class. 
- private implicit val format = DefaultFormats - private val mapper = new ObjectMapper().registerModule(DefaultScalaModule) .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) @@ -66,289 +64,397 @@ private[spark] object JsonProtocol { * JSON serialization methods for SparkListenerEvents | * -------------------------------------------------- */ - def sparkEventToJson(event: SparkListenerEvent): JValue = { + def sparkEventToJsonString(event: SparkListenerEvent): String = { + toJsonString { generator => + writeSparkEventToJson(event, generator) + } + } + + def toJsonString(block: JsonGenerator => Unit): String = { + val baos = new ByteArrayOutputStream() + val generator = mapper.createGenerator(baos, JsonEncoding.UTF8) + block(generator) + generator.close() + baos.close() + new String(baos.toByteArray, StandardCharsets.UTF_8) + } + + def writeSparkEventToJson(event: SparkListenerEvent, g: JsonGenerator): Unit = { event match { case stageSubmitted: SparkListenerStageSubmitted => - stageSubmittedToJson(stageSubmitted) + stageSubmittedToJson(stageSubmitted, g) case stageCompleted: SparkListenerStageCompleted => - stageCompletedToJson(stageCompleted) + stageCompletedToJson(stageCompleted, g) case taskStart: SparkListenerTaskStart => - taskStartToJson(taskStart) + taskStartToJson(taskStart, g) case taskGettingResult: SparkListenerTaskGettingResult => - taskGettingResultToJson(taskGettingResult) + taskGettingResultToJson(taskGettingResult, g) case taskEnd: SparkListenerTaskEnd => - taskEndToJson(taskEnd) + taskEndToJson(taskEnd, g) case jobStart: SparkListenerJobStart => - jobStartToJson(jobStart) + jobStartToJson(jobStart, g) case jobEnd: SparkListenerJobEnd => - jobEndToJson(jobEnd) + jobEndToJson(jobEnd, g) case environmentUpdate: SparkListenerEnvironmentUpdate => - environmentUpdateToJson(environmentUpdate) + environmentUpdateToJson(environmentUpdate, g) case blockManagerAdded: SparkListenerBlockManagerAdded => - 
blockManagerAddedToJson(blockManagerAdded) + blockManagerAddedToJson(blockManagerAdded, g) case blockManagerRemoved: SparkListenerBlockManagerRemoved => - blockManagerRemovedToJson(blockManagerRemoved) + blockManagerRemovedToJson(blockManagerRemoved, g) case unpersistRDD: SparkListenerUnpersistRDD => - unpersistRDDToJson(unpersistRDD) + unpersistRDDToJson(unpersistRDD, g) case applicationStart: SparkListenerApplicationStart => - applicationStartToJson(applicationStart) + applicationStartToJson(applicationStart, g) case applicationEnd: SparkListenerApplicationEnd => - applicationEndToJson(applicationEnd) + applicationEndToJson(applicationEnd, g) case executorAdded: SparkListenerExecutorAdded => - executorAddedToJson(executorAdded) + executorAddedToJson(executorAdded, g) case executorRemoved: SparkListenerExecutorRemoved => - executorRemovedToJson(executorRemoved) + executorRemovedToJson(executorRemoved, g) case logStart: SparkListenerLogStart => - logStartToJson(logStart) + logStartToJson(logStart, g) case metricsUpdate: SparkListenerExecutorMetricsUpdate => - executorMetricsUpdateToJson(metricsUpdate) + executorMetricsUpdateToJson(metricsUpdate, g) case stageExecutorMetrics: SparkListenerStageExecutorMetrics => - stageExecutorMetricsToJson(stageExecutorMetrics) + stageExecutorMetricsToJson(stageExecutorMetrics, g) case blockUpdate: SparkListenerBlockUpdated => - blockUpdateToJson(blockUpdate) + blockUpdateToJson(blockUpdate, g) case resourceProfileAdded: SparkListenerResourceProfileAdded => - resourceProfileAddedToJson(resourceProfileAdded) - case _ => parse(mapper.writeValueAsString(event)) + resourceProfileAddedToJson(resourceProfileAdded, g) + case _ => + mapper.writeValue(g, event) } } - def stageSubmittedToJson(stageSubmitted: SparkListenerStageSubmitted): JValue = { - val stageInfo = stageInfoToJson(stageSubmitted.stageInfo) - val properties = propertiesToJson(stageSubmitted.properties) - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageSubmitted) 
~ - ("Stage Info" -> stageInfo) ~ - ("Properties" -> properties) + def stageSubmittedToJson(stageSubmitted: SparkListenerStageSubmitted, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageSubmitted) + g.writeFieldName("Stage Info") + stageInfoToJson(stageSubmitted.stageInfo, g) + Option(stageSubmitted.properties).foreach { properties => + g.writeFieldName("Properties") + propertiesToJson(properties, g) + } + g.writeEndObject() } - def stageCompletedToJson(stageCompleted: SparkListenerStageCompleted): JValue = { - val stageInfo = stageInfoToJson(stageCompleted.stageInfo) - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageCompleted) ~ - ("Stage Info" -> stageInfo) + def stageCompletedToJson(stageCompleted: SparkListenerStageCompleted, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageCompleted) + g.writeFieldName("Stage Info") + stageInfoToJson(stageCompleted.stageInfo, g) + g.writeEndObject() } - def taskStartToJson(taskStart: SparkListenerTaskStart): JValue = { - val taskInfo = taskStart.taskInfo - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.taskStart) ~ - ("Stage ID" -> taskStart.stageId) ~ - ("Stage Attempt ID" -> taskStart.stageAttemptId) ~ - ("Task Info" -> taskInfoToJson(taskInfo)) + def taskStartToJson(taskStart: SparkListenerTaskStart, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.taskStart) + g.writeNumberField("Stage ID", taskStart.stageId) + g.writeNumberField("Stage Attempt ID", taskStart.stageAttemptId) + g.writeFieldName("Task Info") + taskInfoToJson(taskStart.taskInfo, g) + g.writeEndObject() } - def taskGettingResultToJson(taskGettingResult: SparkListenerTaskGettingResult): JValue = { + def taskGettingResultToJson( + taskGettingResult: SparkListenerTaskGettingResult, + g: JsonGenerator): Unit = { 
val taskInfo = taskGettingResult.taskInfo - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.taskGettingResult) ~ - ("Task Info" -> taskInfoToJson(taskInfo)) - } - - def taskEndToJson(taskEnd: SparkListenerTaskEnd): JValue = { - val taskEndReason = taskEndReasonToJson(taskEnd.reason) - val taskInfo = taskEnd.taskInfo - val executorMetrics = taskEnd.taskExecutorMetrics - val taskMetrics = taskEnd.taskMetrics - val taskMetricsJson = if (taskMetrics != null) taskMetricsToJson(taskMetrics) else JNothing - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.taskEnd) ~ - ("Stage ID" -> taskEnd.stageId) ~ - ("Stage Attempt ID" -> taskEnd.stageAttemptId) ~ - ("Task Type" -> taskEnd.taskType) ~ - ("Task End Reason" -> taskEndReason) ~ - ("Task Info" -> taskInfoToJson(taskInfo)) ~ - ("Task Executor Metrics" -> executorMetricsToJson(executorMetrics)) ~ - ("Task Metrics" -> taskMetricsJson) - } - - def jobStartToJson(jobStart: SparkListenerJobStart): JValue = { - val properties = propertiesToJson(jobStart.properties) - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.jobStart) ~ - ("Job ID" -> jobStart.jobId) ~ - ("Submission Time" -> jobStart.time) ~ - ("Stage Infos" -> jobStart.stageInfos.map(stageInfoToJson)) ~ // Added in Spark 1.2.0 - ("Stage IDs" -> jobStart.stageIds) ~ - ("Properties" -> properties) - } - - def jobEndToJson(jobEnd: SparkListenerJobEnd): JValue = { - val jobResult = jobResultToJson(jobEnd.jobResult) - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.jobEnd) ~ - ("Job ID" -> jobEnd.jobId) ~ - ("Completion Time" -> jobEnd.time) ~ - ("Job Result" -> jobResult) - } - - def environmentUpdateToJson(environmentUpdate: SparkListenerEnvironmentUpdate): JValue = { - val environmentDetails = environmentUpdate.environmentDetails - val jvmInformation = mapToJson(environmentDetails("JVM Information").toMap) - val sparkProperties = mapToJson(environmentDetails("Spark Properties").toMap) - val hadoopProperties = mapToJson(environmentDetails("Hadoop 
Properties").toMap) - val systemProperties = mapToJson(environmentDetails("System Properties").toMap) - val metricsProperties = mapToJson(environmentDetails("Metrics Properties").toMap) - val classpathEntries = mapToJson(environmentDetails("Classpath Entries").toMap) - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.environmentUpdate) ~ - ("JVM Information" -> jvmInformation) ~ - ("Spark Properties" -> sparkProperties) ~ - ("Hadoop Properties" -> hadoopProperties) ~ - ("System Properties" -> systemProperties) ~ - ("Metrics Properties"-> metricsProperties) ~ - ("Classpath Entries" -> classpathEntries) - } - - def blockManagerAddedToJson(blockManagerAdded: SparkListenerBlockManagerAdded): JValue = { - val blockManagerId = blockManagerIdToJson(blockManagerAdded.blockManagerId) - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.blockManagerAdded) ~ - ("Block Manager ID" -> blockManagerId) ~ - ("Maximum Memory" -> blockManagerAdded.maxMem) ~ - ("Timestamp" -> blockManagerAdded.time) ~ - ("Maximum Onheap Memory" -> blockManagerAdded.maxOnHeapMem) ~ - ("Maximum Offheap Memory" -> blockManagerAdded.maxOffHeapMem) - } - - def blockManagerRemovedToJson(blockManagerRemoved: SparkListenerBlockManagerRemoved): JValue = { - val blockManagerId = blockManagerIdToJson(blockManagerRemoved.blockManagerId) - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.blockManagerRemoved) ~ - ("Block Manager ID" -> blockManagerId) ~ - ("Timestamp" -> blockManagerRemoved.time) - } - - def unpersistRDDToJson(unpersistRDD: SparkListenerUnpersistRDD): JValue = { - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.unpersistRDD) ~ - ("RDD ID" -> unpersistRDD.rddId) - } - - def applicationStartToJson(applicationStart: SparkListenerApplicationStart): JValue = { - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.applicationStart) ~ - ("App Name" -> applicationStart.appName) ~ - ("App ID" -> applicationStart.appId.map(JString(_)).getOrElse(JNothing)) ~ - ("Timestamp" -> 
applicationStart.time) ~ - ("User" -> applicationStart.sparkUser) ~ - ("App Attempt ID" -> applicationStart.appAttemptId.map(JString(_)).getOrElse(JNothing)) ~ - ("Driver Logs" -> applicationStart.driverLogs.map(mapToJson).getOrElse(JNothing)) ~ - ("Driver Attributes" -> applicationStart.driverAttributes.map(mapToJson).getOrElse(JNothing)) - } - - def applicationEndToJson(applicationEnd: SparkListenerApplicationEnd): JValue = { - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.applicationEnd) ~ - ("Timestamp" -> applicationEnd.time) - } - - def resourceProfileAddedToJson(profileAdded: SparkListenerResourceProfileAdded): JValue = { - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.resourceProfileAdded) ~ - ("Resource Profile Id" -> profileAdded.resourceProfile.id) ~ - ("Executor Resource Requests" -> - executorResourceRequestMapToJson(profileAdded.resourceProfile.executorResources)) ~ - ("Task Resource Requests" -> - taskResourceRequestMapToJson(profileAdded.resourceProfile.taskResources)) - } - - def executorAddedToJson(executorAdded: SparkListenerExecutorAdded): JValue = { - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.executorAdded) ~ - ("Timestamp" -> executorAdded.time) ~ - ("Executor ID" -> executorAdded.executorId) ~ - ("Executor Info" -> executorInfoToJson(executorAdded.executorInfo)) - } + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.taskGettingResult) + g.writeFieldName("Task Info") + taskInfoToJson(taskInfo, g) + g.writeEndObject() + } + + def taskEndToJson(taskEnd: SparkListenerTaskEnd, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.taskEnd) + g.writeNumberField("Stage ID", taskEnd.stageId) + g.writeNumberField("Stage Attempt ID", taskEnd.stageAttemptId) + g.writeStringField("Task Type", taskEnd.taskType) + g.writeFieldName("Task End Reason") + taskEndReasonToJson(taskEnd.reason, g) + g.writeFieldName("Task 
Info") + taskInfoToJson(taskEnd.taskInfo, g) + g.writeFieldName("Task Executor Metrics") + executorMetricsToJson(taskEnd.taskExecutorMetrics, g) + Option(taskEnd.taskMetrics).foreach { m => + g.writeFieldName("Task Metrics") + taskMetricsToJson(m, g) + } + g.writeEndObject() + } + + def jobStartToJson(jobStart: SparkListenerJobStart, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.jobStart) + g.writeNumberField("Job ID", jobStart.jobId) + g.writeNumberField("Submission Time", jobStart.time) + g.writeArrayFieldStart("Stage Infos") // Added in Spark 1.2.0 + jobStart.stageInfos.foreach(stageInfoToJson(_, g)) + g.writeEndArray() + g.writeArrayFieldStart("Stage IDs") + jobStart.stageIds.foreach(g.writeNumber) + g.writeEndArray() + Option(jobStart.properties).foreach { properties => + g.writeFieldName("Properties") + propertiesToJson(properties, g) + } - def executorRemovedToJson(executorRemoved: SparkListenerExecutorRemoved): JValue = { - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.executorRemoved) ~ - ("Timestamp" -> executorRemoved.time) ~ - ("Executor ID" -> executorRemoved.executorId) ~ - ("Removed Reason" -> executorRemoved.reason) + g.writeEndObject() } - def logStartToJson(logStart: SparkListenerLogStart): JValue = { - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.logStart) ~ - ("Spark Version" -> SPARK_VERSION) + def jobEndToJson(jobEnd: SparkListenerJobEnd, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.jobEnd) + g.writeNumberField("Job ID", jobEnd.jobId) + g.writeNumberField("Completion Time", jobEnd.time) + g.writeFieldName("Job Result") + jobResultToJson(jobEnd.jobResult, g) + g.writeEndObject() } - def executorMetricsUpdateToJson(metricsUpdate: SparkListenerExecutorMetricsUpdate): JValue = { + def environmentUpdateToJson( + environmentUpdate: SparkListenerEnvironmentUpdate, + g: 
JsonGenerator): Unit = { + val environmentDetails = environmentUpdate.environmentDetails + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.environmentUpdate) + writeMapField("JVM Information", environmentDetails("JVM Information").toMap, g) + writeMapField("Spark Properties", environmentDetails("Spark Properties").toMap, g) + writeMapField("Hadoop Properties", environmentDetails("Hadoop Properties").toMap, g) + writeMapField("System Properties", environmentDetails("System Properties").toMap, g) + writeMapField("Metrics Properties", environmentDetails("Metrics Properties").toMap, g) + writeMapField("Classpath Entries", environmentDetails("Classpath Entries").toMap, g) + g.writeEndObject() + } + + def blockManagerAddedToJson( + blockManagerAdded: SparkListenerBlockManagerAdded, + g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.blockManagerAdded) + g.writeFieldName("Block Manager ID") + blockManagerIdToJson(blockManagerAdded.blockManagerId, g) + g.writeNumberField("Maximum Memory", blockManagerAdded.maxMem) + g.writeNumberField("Timestamp", blockManagerAdded.time) + blockManagerAdded.maxOnHeapMem.foreach(g.writeNumberField("Maximum Onheap Memory", _)) + blockManagerAdded.maxOffHeapMem.foreach(g.writeNumberField("Maximum Offheap Memory", _)) + g.writeEndObject() + } + + def blockManagerRemovedToJson( + blockManagerRemoved: SparkListenerBlockManagerRemoved, + g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.blockManagerRemoved) + g.writeFieldName("Block Manager ID") + blockManagerIdToJson(blockManagerRemoved.blockManagerId, g) + g.writeNumberField("Timestamp", blockManagerRemoved.time) + g.writeEndObject() + } + + def unpersistRDDToJson(unpersistRDD: SparkListenerUnpersistRDD, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", 
SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.unpersistRDD) + g.writeNumberField("RDD ID", unpersistRDD.rddId) + g.writeEndObject() + } + + def applicationStartToJson( + applicationStart: SparkListenerApplicationStart, + g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.applicationStart) + g.writeStringField("App Name", applicationStart.appName) + applicationStart.appId.foreach(g.writeStringField("App ID", _)) + g.writeNumberField("Timestamp", applicationStart.time) + g.writeStringField("User", applicationStart.sparkUser) + applicationStart.appAttemptId.foreach(g.writeStringField("App Attempt ID", _)) + applicationStart.driverLogs.foreach(writeMapField("Driver Logs", _, g)) + applicationStart.driverAttributes.foreach(writeMapField("Driver Attributes", _, g)) + g.writeEndObject() + } + + def applicationEndToJson( + applicationEnd: SparkListenerApplicationEnd, + g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.applicationEnd) + g.writeNumberField("Timestamp", applicationEnd.time) + g.writeEndObject() + } + + def resourceProfileAddedToJson( + profileAdded: SparkListenerResourceProfileAdded, + g: JsonGenerator + ): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.resourceProfileAdded) + g.writeNumberField("Resource Profile Id", profileAdded.resourceProfile.id) + g.writeFieldName("Executor Resource Requests") + executorResourceRequestMapToJson(profileAdded.resourceProfile.executorResources, g) + g.writeFieldName("Task Resource Requests") + taskResourceRequestMapToJson(profileAdded.resourceProfile.taskResources, g) + g.writeEndObject() + } + + def executorAddedToJson(executorAdded: SparkListenerExecutorAdded, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.executorAdded) + 
g.writeNumberField("Timestamp", executorAdded.time) + g.writeStringField("Executor ID", executorAdded.executorId) + g.writeFieldName("Executor Info") + executorInfoToJson(executorAdded.executorInfo, g) + g.writeEndObject() + } + + def executorRemovedToJson( + executorRemoved: SparkListenerExecutorRemoved, + g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.executorRemoved) + g.writeNumberField("Timestamp", executorRemoved.time) + g.writeStringField("Executor ID", executorRemoved.executorId) + g.writeStringField("Removed Reason", executorRemoved.reason) + g.writeEndObject() + } + + def logStartToJson(logStart: SparkListenerLogStart, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.logStart) + g.writeStringField("Spark Version", SPARK_VERSION) + g.writeEndObject() + } + + def executorMetricsUpdateToJson( + metricsUpdate: SparkListenerExecutorMetricsUpdate, + g: JsonGenerator): Unit = { val execId = metricsUpdate.execId val accumUpdates = metricsUpdate.accumUpdates val executorUpdates = metricsUpdate.executorUpdates - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.metricsUpdate) ~ - ("Executor ID" -> execId) ~ - ("Metrics Updated" -> accumUpdates.map { case (taskId, stageId, stageAttemptId, updates) => - ("Task ID" -> taskId) ~ - ("Stage ID" -> stageId) ~ - ("Stage Attempt ID" -> stageAttemptId) ~ - ("Accumulator Updates" -> JArray(updates.map(accumulableInfoToJson).toList)) - }) ~ - ("Executor Metrics Updated" -> executorUpdates.map { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.metricsUpdate) + g.writeStringField("Executor ID", execId) + g.writeArrayFieldStart("Metrics Updated") + accumUpdates.foreach { case (taskId, stageId, stageAttemptId, updates) => + g.writeStartObject() + g.writeNumberField("Task ID", taskId) + g.writeNumberField("Stage ID", stageId) + 
g.writeNumberField("Stage Attempt ID", stageAttemptId) + g.writeArrayFieldStart("Accumulator Updates") + updates.foreach(accumulableInfoToJson(_, g)) + g.writeEndArray() + g.writeEndObject() + } + g.writeEndArray() + g.writeArrayFieldStart("Executor Metrics Updated") + executorUpdates.foreach { case ((stageId, stageAttemptId), metrics) => - ("Stage ID" -> stageId) ~ - ("Stage Attempt ID" -> stageAttemptId) ~ - ("Executor Metrics" -> executorMetricsToJson(metrics)) - }) + g.writeStartObject() + g.writeNumberField("Stage ID", stageId) + g.writeNumberField("Stage Attempt ID", stageAttemptId) + g.writeFieldName("Executor Metrics") + executorMetricsToJson(metrics, g) + g.writeEndObject() + } + g.writeEndArray() + g.writeEndObject() } - def stageExecutorMetricsToJson(metrics: SparkListenerStageExecutorMetrics): JValue = { - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageExecutorMetrics) ~ - ("Executor ID" -> metrics.execId) ~ - ("Stage ID" -> metrics.stageId) ~ - ("Stage Attempt ID" -> metrics.stageAttemptId) ~ - ("Executor Metrics" -> executorMetricsToJson(metrics.executorMetrics)) + def stageExecutorMetricsToJson( + metrics: SparkListenerStageExecutorMetrics, + g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageExecutorMetrics) + g.writeStringField("Executor ID", metrics.execId) + g.writeNumberField("Stage ID", metrics.stageId) + g.writeNumberField("Stage Attempt ID", metrics.stageAttemptId) + g.writeFieldName("Executor Metrics") + executorMetricsToJson(metrics.executorMetrics, g) + g.writeEndObject() } - def blockUpdateToJson(blockUpdate: SparkListenerBlockUpdated): JValue = { - val blockUpdatedInfo = blockUpdatedInfoToJson(blockUpdate.blockUpdatedInfo) - ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.blockUpdate) ~ - ("Block Updated Info" -> blockUpdatedInfo) + def blockUpdateToJson(blockUpdate: SparkListenerBlockUpdated, g: JsonGenerator): Unit = { + 
g.writeStartObject() + g.writeStringField("Event", SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.blockUpdate) + g.writeFieldName("Block Updated Info") + blockUpdatedInfoToJson(blockUpdate.blockUpdatedInfo, g) + g.writeEndObject() } /** ------------------------------------------------------------------- * * JSON serialization methods for classes SparkListenerEvents depend on | * -------------------------------------------------------------------- */ - def stageInfoToJson(stageInfo: StageInfo): JValue = { - val rddInfo = JArray(stageInfo.rddInfos.map(rddInfoToJson).toList) - val parentIds = JArray(stageInfo.parentIds.map(JInt(_)).toList) - val submissionTime = stageInfo.submissionTime.map(JInt(_)).getOrElse(JNothing) - val completionTime = stageInfo.completionTime.map(JInt(_)).getOrElse(JNothing) - val failureReason = stageInfo.failureReason.map(JString(_)).getOrElse(JNothing) - ("Stage ID" -> stageInfo.stageId) ~ - ("Stage Attempt ID" -> stageInfo.attemptNumber) ~ - ("Stage Name" -> stageInfo.name) ~ - ("Number of Tasks" -> stageInfo.numTasks) ~ - ("RDD Info" -> rddInfo) ~ - ("Parent IDs" -> parentIds) ~ - ("Details" -> stageInfo.details) ~ - ("Submission Time" -> submissionTime) ~ - ("Completion Time" -> completionTime) ~ - ("Failure Reason" -> failureReason) ~ - ("Accumulables" -> accumulablesToJson(stageInfo.accumulables.values)) ~ - ("Resource Profile Id" -> stageInfo.resourceProfileId) - } - - def taskInfoToJson(taskInfo: TaskInfo): JValue = { - ("Task ID" -> taskInfo.taskId) ~ - ("Index" -> taskInfo.index) ~ - ("Attempt" -> taskInfo.attemptNumber) ~ - ("Partition ID" -> taskInfo.partitionId) ~ - ("Launch Time" -> taskInfo.launchTime) ~ - ("Executor ID" -> taskInfo.executorId) ~ - ("Host" -> taskInfo.host) ~ - ("Locality" -> taskInfo.taskLocality.toString) ~ - ("Speculative" -> taskInfo.speculative) ~ - ("Getting Result Time" -> taskInfo.gettingResultTime) ~ - ("Finish Time" -> taskInfo.finishTime) ~ - ("Failed" -> taskInfo.failed) ~ - ("Killed" -> 
taskInfo.killed) ~ - ("Accumulables" -> accumulablesToJson(taskInfo.accumulables)) + def stageInfoToJson(stageInfo: StageInfo, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeNumberField("Stage ID", stageInfo.stageId) + g.writeNumberField("Stage Attempt ID", stageInfo.attemptNumber) + g.writeStringField("Stage Name", stageInfo.name) + g.writeNumberField ("Number of Tasks", stageInfo.numTasks) + g.writeArrayFieldStart("RDD Info") + stageInfo.rddInfos.foreach(rddInfoToJson(_, g)) + g.writeEndArray() + g.writeArrayFieldStart("Parent IDs") + stageInfo.parentIds.foreach(g.writeNumber) + g.writeEndArray() + g.writeStringField("Details", stageInfo.details) + stageInfo.submissionTime.foreach(g.writeNumberField("Submission Time", _)) + stageInfo.completionTime.foreach(g.writeNumberField("Completion Time", _)) + stageInfo.failureReason.foreach(g.writeStringField("Failure Reason", _)) + g.writeFieldName("Accumulables") + accumulablesToJson(stageInfo.accumulables.values, g) + g.writeNumberField("Resource Profile Id", stageInfo.resourceProfileId) + g.writeEndObject() + } + + def taskInfoToJson(taskInfo: TaskInfo, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeNumberField("Task ID", taskInfo.taskId) + g.writeNumberField("Index", taskInfo.index) + g.writeNumberField("Attempt", taskInfo.attemptNumber) + g.writeNumberField("Partition ID", taskInfo.partitionId) + g.writeNumberField("Launch Time", taskInfo.launchTime) + g.writeStringField("Executor ID", taskInfo.executorId) + g.writeStringField("Host", taskInfo.host) + g.writeStringField("Locality", taskInfo.taskLocality.toString) + g.writeBooleanField("Speculative", taskInfo.speculative) + g.writeNumberField("Getting Result Time", taskInfo.gettingResultTime) + g.writeNumberField("Finish Time", taskInfo.finishTime) + g.writeBooleanField("Failed", taskInfo.failed) + g.writeBooleanField("Killed", taskInfo.killed) + g.writeFieldName("Accumulables") + accumulablesToJson(taskInfo.accumulables, g) + 
g.writeEndObject() } private lazy val accumulableExcludeList = Set("internal.metrics.updatedBlockStatuses") - def accumulablesToJson(accumulables: Iterable[AccumulableInfo]): JArray = { - JArray(accumulables + def accumulablesToJson(accumulables: Iterable[AccumulableInfo], g: JsonGenerator): Unit = { + g.writeStartArray() + accumulables .filterNot(_.name.exists(accumulableExcludeList.contains)) - .toList.sortBy(_.id).map(accumulableInfoToJson)) + .toList.sortBy(_.id).foreach(a => accumulableInfoToJson(a, g)) + g.writeEndArray() } - def accumulableInfoToJson(accumulableInfo: AccumulableInfo): JValue = { + def accumulableInfoToJson(accumulableInfo: AccumulableInfo, g: JsonGenerator): Unit = { val name = accumulableInfo.name - ("ID" -> accumulableInfo.id) ~ - ("Name" -> name) ~ - ("Update" -> accumulableInfo.update.map { v => accumValueToJson(name, v) }) ~ - ("Value" -> accumulableInfo.value.map { v => accumValueToJson(name, v) }) ~ - ("Internal" -> accumulableInfo.internal) ~ - ("Count Failed Values" -> accumulableInfo.countFailedValues) ~ - ("Metadata" -> accumulableInfo.metadata) + g.writeStartObject() + g.writeNumberField("ID", accumulableInfo.id) + name.foreach(g.writeStringField("Name", _)) + accumulableInfo.update.foreach { v => + accumValueToJson(name, v, g, fieldName = Some("Update")) + } + accumulableInfo.value.foreach { v => + accumValueToJson(name, v, g, fieldName = Some("Value")) + } + g.writeBooleanField("Internal", accumulableInfo.internal) + g.writeBooleanField("Count Failed Values", accumulableInfo.countFailedValues) + accumulableInfo.metadata.foreach(g.writeStringField("Metadata", _)) + g.writeEndObject() } /** @@ -360,256 +466,343 @@ private[spark] object JsonProtocol { * * The behavior here must match that of [[accumValueFromJson]]. Exposed for testing. 
*/ - private[util] def accumValueToJson(name: Option[String], value: Any): JValue = { + private[util] def accumValueToJson( + name: Option[String], + value: Any, + g: JsonGenerator, + fieldName: Option[String] = None): Unit = { if (name.exists(_.startsWith(InternalAccumulator.METRICS_PREFIX))) { value match { - case v: Int => JInt(v) - case v: Long => JInt(v) + case v: Int => + fieldName.foreach(g.writeFieldName) + g.writeNumber(v) + case v: Long => + fieldName.foreach(g.writeFieldName) + g.writeNumber(v) // We only have 3 kind of internal accumulator types, so if it's not int or long, it must be // the blocks accumulator, whose type is `java.util.List[(BlockId, BlockStatus)]` case v: java.util.List[_] => - JArray(v.asScala.toList.flatMap { + fieldName.foreach(g.writeFieldName) + g.writeStartArray() + v.asScala.foreach { case (id: BlockId, status: BlockStatus) => - Some( - ("Block ID" -> id.toString) ~ - ("Status" -> blockStatusToJson(status)) - ) + g.writeStartObject() + g.writeStringField("Block ID", id.toString) + g.writeFieldName("Status") + blockStatusToJson(status, g) + g.writeEndObject() case _ => // Ignore unsupported types. A user may put `METRICS_PREFIX` in the name. We should // not crash. - None - }) + } + g.writeEndArray() case _ => // Ignore unsupported types. A user may put `METRICS_PREFIX` in the name. We should not // crash. 
- JNothing } } else { // For all external accumulators, just use strings - JString(value.toString) + fieldName.foreach(g.writeFieldName) + g.writeString(value.toString) } } - def taskMetricsToJson(taskMetrics: TaskMetrics): JValue = { - val shuffleReadMetrics: JValue = - ("Remote Blocks Fetched" -> taskMetrics.shuffleReadMetrics.remoteBlocksFetched) ~ - ("Local Blocks Fetched" -> taskMetrics.shuffleReadMetrics.localBlocksFetched) ~ - ("Fetch Wait Time" -> taskMetrics.shuffleReadMetrics.fetchWaitTime) ~ - ("Remote Bytes Read" -> taskMetrics.shuffleReadMetrics.remoteBytesRead) ~ - ("Remote Bytes Read To Disk" -> taskMetrics.shuffleReadMetrics.remoteBytesReadToDisk) ~ - ("Local Bytes Read" -> taskMetrics.shuffleReadMetrics.localBytesRead) ~ - ("Total Records Read" -> taskMetrics.shuffleReadMetrics.recordsRead) - val shuffleWriteMetrics: JValue = - ("Shuffle Bytes Written" -> taskMetrics.shuffleWriteMetrics.bytesWritten) ~ - ("Shuffle Write Time" -> taskMetrics.shuffleWriteMetrics.writeTime) ~ - ("Shuffle Records Written" -> taskMetrics.shuffleWriteMetrics.recordsWritten) - val inputMetrics: JValue = - ("Bytes Read" -> taskMetrics.inputMetrics.bytesRead) ~ - ("Records Read" -> taskMetrics.inputMetrics.recordsRead) - val outputMetrics: JValue = - ("Bytes Written" -> taskMetrics.outputMetrics.bytesWritten) ~ - ("Records Written" -> taskMetrics.outputMetrics.recordsWritten) - val updatedBlocks = - JArray(taskMetrics.updatedBlockStatuses.toList.map { case (id, status) => - ("Block ID" -> id.toString) ~ - ("Status" -> blockStatusToJson(status)) - }) - ("Executor Deserialize Time" -> taskMetrics.executorDeserializeTime) ~ - ("Executor Deserialize CPU Time" -> taskMetrics.executorDeserializeCpuTime) ~ - ("Executor Run Time" -> taskMetrics.executorRunTime) ~ - ("Executor CPU Time" -> taskMetrics.executorCpuTime) ~ - ("Peak Execution Memory" -> taskMetrics.peakExecutionMemory) ~ - ("Result Size" -> taskMetrics.resultSize) ~ - ("JVM GC Time" -> taskMetrics.jvmGCTime) ~ - 
("Result Serialization Time" -> taskMetrics.resultSerializationTime) ~ - ("Memory Bytes Spilled" -> taskMetrics.memoryBytesSpilled) ~ - ("Disk Bytes Spilled" -> taskMetrics.diskBytesSpilled) ~ - ("Shuffle Read Metrics" -> shuffleReadMetrics) ~ - ("Shuffle Write Metrics" -> shuffleWriteMetrics) ~ - ("Input Metrics" -> inputMetrics) ~ - ("Output Metrics" -> outputMetrics) ~ - ("Updated Blocks" -> updatedBlocks) + def taskMetricsToJson(taskMetrics: TaskMetrics, g: JsonGenerator): Unit = { + def writeShuffleReadMetrics(): Unit = { + g.writeStartObject() + g.writeNumberField( + "Remote Blocks Fetched", taskMetrics.shuffleReadMetrics.remoteBlocksFetched) + g.writeNumberField("Local Blocks Fetched", taskMetrics.shuffleReadMetrics.localBlocksFetched) + g.writeNumberField("Fetch Wait Time", taskMetrics.shuffleReadMetrics.fetchWaitTime) + g.writeNumberField("Remote Bytes Read", taskMetrics.shuffleReadMetrics.remoteBytesRead) + g.writeNumberField( + "Remote Bytes Read To Disk", taskMetrics.shuffleReadMetrics.remoteBytesReadToDisk) + g.writeNumberField("Local Bytes Read", taskMetrics.shuffleReadMetrics.localBytesRead) + g.writeNumberField("Total Records Read", taskMetrics.shuffleReadMetrics.recordsRead) + g.writeEndObject() + } + def writeShuffleWriteMetrics(): Unit = { + g.writeStartObject() + g.writeNumberField("Shuffle Bytes Written", taskMetrics.shuffleWriteMetrics.bytesWritten) + g.writeNumberField("Shuffle Write Time", taskMetrics.shuffleWriteMetrics.writeTime) + g.writeNumberField("Shuffle Records Written", taskMetrics.shuffleWriteMetrics.recordsWritten) + g.writeEndObject() + } + def writeInputMetrics(): Unit = { + g.writeStartObject() + g.writeNumberField("Bytes Read", taskMetrics.inputMetrics.bytesRead) + g.writeNumberField("Records Read", taskMetrics.inputMetrics.recordsRead) + g.writeEndObject() + } + def writeOutputMetrics(): Unit = { + g.writeStartObject() + g.writeNumberField("Bytes Written", taskMetrics.outputMetrics.bytesWritten) + g.writeNumberField("Records 
Written", taskMetrics.outputMetrics.recordsWritten) + g.writeEndObject() + } + def writeUpdatedBlocks(): Unit = { + g.writeStartArray() + taskMetrics.updatedBlockStatuses.foreach { case (id, status) => + g.writeStartObject() + g.writeStringField("Block ID", id.toString) + g.writeFieldName("Status") + blockStatusToJson(status, g) + g.writeEndObject() + } + g.writeEndArray() + } + + g.writeStartObject() + g.writeNumberField("Executor Deserialize Time", taskMetrics.executorDeserializeTime) + g.writeNumberField("Executor Deserialize CPU Time", taskMetrics.executorDeserializeCpuTime) + g.writeNumberField("Executor Run Time", taskMetrics.executorRunTime) + g.writeNumberField("Executor CPU Time", taskMetrics.executorCpuTime) + g.writeNumberField("Peak Execution Memory", taskMetrics.peakExecutionMemory) + g.writeNumberField("Result Size", taskMetrics.resultSize) + g.writeNumberField("JVM GC Time", taskMetrics.jvmGCTime) + g.writeNumberField("Result Serialization Time", taskMetrics.resultSerializationTime) + g.writeNumberField("Memory Bytes Spilled", taskMetrics.memoryBytesSpilled) + g.writeNumberField("Disk Bytes Spilled", taskMetrics.diskBytesSpilled) + g.writeFieldName("Shuffle Read Metrics") + writeShuffleReadMetrics() + g.writeFieldName("Shuffle Write Metrics") + writeShuffleWriteMetrics() + g.writeFieldName("Input Metrics") + writeInputMetrics() + g.writeFieldName("Output Metrics") + writeOutputMetrics() + g.writeFieldName("Updated Blocks") + writeUpdatedBlocks() + g.writeEndObject() } /** Convert executor metrics to JSON. 
*/ - def executorMetricsToJson(executorMetrics: ExecutorMetrics): JValue = { - val metrics = ExecutorMetricType.metricToOffset.map { case (m, _) => - JField(m, executorMetrics.getMetricValue(m)) + def executorMetricsToJson(executorMetrics: ExecutorMetrics, g: JsonGenerator): Unit = { + g.writeStartObject() + ExecutorMetricType.metricToOffset.foreach { case (m, _) => + g.writeNumberField(m, executorMetrics.getMetricValue(m)) } - JObject(metrics.toSeq: _*) + g.writeEndObject() } - def taskEndReasonToJson(taskEndReason: TaskEndReason): JValue = { - val reason = Utils.getFormattedClassName(taskEndReason) - val json: JObject = taskEndReason match { + def taskEndReasonToJson(taskEndReason: TaskEndReason, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Reason", Utils.getFormattedClassName(taskEndReason)) + taskEndReason match { case fetchFailed: FetchFailed => - val blockManagerAddress = Option(fetchFailed.bmAddress). - map(blockManagerIdToJson).getOrElse(JNothing) - ("Block Manager Address" -> blockManagerAddress) ~ - ("Shuffle ID" -> fetchFailed.shuffleId) ~ - ("Map ID" -> fetchFailed.mapId) ~ - ("Map Index" -> fetchFailed.mapIndex) ~ - ("Reduce ID" -> fetchFailed.reduceId) ~ - ("Message" -> fetchFailed.message) + Option(fetchFailed.bmAddress).foreach { id => + g.writeFieldName("Block Manager Address") + blockManagerIdToJson(id, g) + } + g.writeNumberField("Shuffle ID", fetchFailed.shuffleId) + g.writeNumberField("Map ID", fetchFailed.mapId) + g.writeNumberField("Map Index", fetchFailed.mapIndex) + g.writeNumberField("Reduce ID", fetchFailed.reduceId) + g.writeStringField("Message", fetchFailed.message) case exceptionFailure: ExceptionFailure => - val stackTrace = stackTraceToJson(exceptionFailure.stackTrace) - val accumUpdates = accumulablesToJson(exceptionFailure.accumUpdates) - ("Class Name" -> exceptionFailure.className) ~ - ("Description" -> exceptionFailure.description) ~ - ("Stack Trace" -> stackTrace) ~ - ("Full Stack Trace" -> 
exceptionFailure.fullStackTrace) ~ - ("Accumulator Updates" -> accumUpdates) + g.writeStringField("Class Name", exceptionFailure.className) + g.writeStringField("Description", exceptionFailure.description) + g.writeFieldName("Stack Trace") + stackTraceToJson(exceptionFailure.stackTrace, g) + g.writeStringField("Full Stack Trace", exceptionFailure.fullStackTrace) + g.writeFieldName("Accumulator Updates") + accumulablesToJson(exceptionFailure.accumUpdates, g) case taskCommitDenied: TaskCommitDenied => - ("Job ID" -> taskCommitDenied.jobID) ~ - ("Partition ID" -> taskCommitDenied.partitionID) ~ - ("Attempt Number" -> taskCommitDenied.attemptNumber) + g.writeNumberField("Job ID", taskCommitDenied.jobID) + g.writeNumberField("Partition ID", taskCommitDenied.partitionID) + g.writeNumberField("Attempt Number", taskCommitDenied.attemptNumber) case ExecutorLostFailure(executorId, exitCausedByApp, reason) => - ("Executor ID" -> executorId) ~ - ("Exit Caused By App" -> exitCausedByApp) ~ - ("Loss Reason" -> reason) + g.writeStringField("Executor ID", executorId) + g.writeBooleanField("Exit Caused By App", exitCausedByApp) + reason.foreach(g.writeStringField("Loss Reason", _)) case taskKilled: TaskKilled => - val accumUpdates = JArray(taskKilled.accumUpdates.map(accumulableInfoToJson).toList) - ("Kill Reason" -> taskKilled.reason) ~ - ("Accumulator Updates" -> accumUpdates) - case _ => emptyJson + g.writeStringField("Kill Reason", taskKilled.reason) + g.writeArrayFieldStart("Accumulator Updates") + taskKilled.accumUpdates.foreach { info => + accumulableInfoToJson(info, g) + } + g.writeEndArray() + case _ => + // no extra fields to write } - ("Reason" -> reason) ~ json + g.writeEndObject() } - def blockManagerIdToJson(blockManagerId: BlockManagerId): JValue = { - ("Executor ID" -> blockManagerId.executorId) ~ - ("Host" -> blockManagerId.host) ~ - ("Port" -> blockManagerId.port) + def blockManagerIdToJson(blockManagerId: BlockManagerId, g: JsonGenerator): Unit = { + 
g.writeStartObject() + g.writeStringField("Executor ID", blockManagerId.executorId) + g.writeStringField("Host", blockManagerId.host) + g.writeNumberField("Port", blockManagerId.port) + g.writeEndObject() } - def jobResultToJson(jobResult: JobResult): JValue = { - val result = Utils.getFormattedClassName(jobResult) - val json = jobResult match { - case JobSucceeded => emptyJson + def jobResultToJson(jobResult: JobResult, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Result", Utils.getFormattedClassName(jobResult)) + jobResult match { case jobFailed: JobFailed => - JObject("Exception" -> exceptionToJson(jobFailed.exception)) + g.writeFieldName("Exception") + exceptionToJson(jobFailed.exception, g) + case JobSucceeded => + // Nothing else to write in case of success } - ("Result" -> result) ~ json - } - - def rddInfoToJson(rddInfo: RDDInfo): JValue = { - val storageLevel = storageLevelToJson(rddInfo.storageLevel) - val parentIds = JArray(rddInfo.parentIds.map(JInt(_)).toList) - ("RDD ID" -> rddInfo.id) ~ - ("Name" -> rddInfo.name) ~ - ("Scope" -> rddInfo.scope.map(_.toJson)) ~ - ("Callsite" -> rddInfo.callSite) ~ - ("Parent IDs" -> parentIds) ~ - ("Storage Level" -> storageLevel) ~ - ("Barrier" -> rddInfo.isBarrier) ~ - ("DeterministicLevel" -> rddInfo.outputDeterministicLevel.toString) ~ - ("Number of Partitions" -> rddInfo.numPartitions) ~ - ("Number of Cached Partitions" -> rddInfo.numCachedPartitions) ~ - ("Memory Size" -> rddInfo.memSize) ~ - ("Disk Size" -> rddInfo.diskSize) - } - - def storageLevelToJson(storageLevel: StorageLevel): JValue = { - ("Use Disk" -> storageLevel.useDisk) ~ - ("Use Memory" -> storageLevel.useMemory) ~ - ("Use Off Heap" -> storageLevel.useOffHeap) ~ - ("Deserialized" -> storageLevel.deserialized) ~ - ("Replication" -> storageLevel.replication) - } - - def blockStatusToJson(blockStatus: BlockStatus): JValue = { - val storageLevel = storageLevelToJson(blockStatus.storageLevel) - ("Storage Level" -> 
storageLevel) ~ - ("Memory Size" -> blockStatus.memSize) ~ - ("Disk Size" -> blockStatus.diskSize) - } - - def executorInfoToJson(executorInfo: ExecutorInfo): JValue = { - ("Host" -> executorInfo.executorHost) ~ - ("Total Cores" -> executorInfo.totalCores) ~ - ("Log Urls" -> mapToJson(executorInfo.logUrlMap)) ~ - ("Attributes" -> mapToJson(executorInfo.attributes)) ~ - ("Resources" -> resourcesMapToJson(executorInfo.resourcesInfo)) ~ - ("Resource Profile Id" -> executorInfo.resourceProfileId) ~ - ("Registration Time" -> executorInfo.registrationTime) ~ - ("Request Time" -> executorInfo.requestTime) - } - - def resourcesMapToJson(m: Map[String, ResourceInformation]): JValue = { - val jsonFields = m.map { - case (k, v) => JField(k, v.toJson) - } - JObject(jsonFields.toList) - } - - def blockUpdatedInfoToJson(blockUpdatedInfo: BlockUpdatedInfo): JValue = { - ("Block Manager ID" -> blockManagerIdToJson(blockUpdatedInfo.blockManagerId)) ~ - ("Block ID" -> blockUpdatedInfo.blockId.toString) ~ - ("Storage Level" -> storageLevelToJson(blockUpdatedInfo.storageLevel)) ~ - ("Memory Size" -> blockUpdatedInfo.memSize) ~ - ("Disk Size" -> blockUpdatedInfo.diskSize) + g.writeEndObject() } - def executorResourceRequestToJson(execReq: ExecutorResourceRequest): JValue = { - ("Resource Name" -> execReq.resourceName) ~ - ("Amount" -> execReq.amount) ~ - ("Discovery Script" -> execReq.discoveryScript) ~ - ("Vendor" -> execReq.vendor) - } - - def executorResourceRequestMapToJson(m: Map[String, ExecutorResourceRequest]): JValue = { - val jsonFields = m.map { - case (k, execReq) => - JField(k, executorResourceRequestToJson(execReq)) + def rddInfoToJson(rddInfo: RDDInfo, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeNumberField("RDD ID", rddInfo.id) + g.writeStringField("Name", rddInfo.name) + rddInfo.scope.foreach { s => + g.writeStringField("Scope", s.toJson) + } + g.writeStringField("Callsite", rddInfo.callSite) + g.writeArrayFieldStart("Parent IDs") + 
rddInfo.parentIds.foreach(g.writeNumber) + g.writeEndArray() + g.writeFieldName("Storage Level") + storageLevelToJson(rddInfo.storageLevel, g) + g.writeBooleanField("Barrier", rddInfo.isBarrier) + g.writeStringField("DeterministicLevel", rddInfo.outputDeterministicLevel.toString) + g.writeNumberField("Number of Partitions", rddInfo.numPartitions) + g.writeNumberField("Number of Cached Partitions", rddInfo.numCachedPartitions) + g.writeNumberField("Memory Size", rddInfo.memSize) + g.writeNumberField("Disk Size", rddInfo.diskSize) + g.writeEndObject() + } + + def storageLevelToJson(storageLevel: StorageLevel, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeBooleanField("Use Disk", storageLevel.useDisk) + g.writeBooleanField("Use Memory", storageLevel.useMemory) + g.writeBooleanField("Use Off Heap", storageLevel.useOffHeap) + g.writeBooleanField("Deserialized", storageLevel.deserialized) + g.writeNumberField("Replication", storageLevel.replication) + g.writeEndObject() + } + + def blockStatusToJson(blockStatus: BlockStatus, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeFieldName("Storage Level") + storageLevelToJson(blockStatus.storageLevel, g) + g.writeNumberField("Memory Size", blockStatus.memSize) + g.writeNumberField("Disk Size", blockStatus.diskSize) + g.writeEndObject() + } + + def executorInfoToJson(executorInfo: ExecutorInfo, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Host", executorInfo.executorHost) + g.writeNumberField("Total Cores", executorInfo.totalCores) + writeMapField("Log Urls", executorInfo.logUrlMap, g) + writeMapField("Attributes", executorInfo.attributes, g) + g.writeObjectFieldStart("Resources") + // TODO(SPARK-39658): here we are taking a Json4s JValue and are converting it to + // a JSON string then are combining that string with Jackson-generated JSON. This is + // done because ResourceInformation.toJson is a public class and exposes Json4s + // JValues as part of its public API. 
We should reconsider the design of that interface + // and explore whether we can avoid exposing third-party symbols in this public API. + executorInfo.resourcesInfo.foreach { case (k, v) => + g.writeFieldName(k) + g.writeRawValue(compact(v.toJson())) } - JObject(jsonFields.toList) + g.writeEndObject() + g.writeNumberField("Resource Profile Id", executorInfo.resourceProfileId) + executorInfo.registrationTime.foreach(g.writeNumberField("Registration Time", _)) + executorInfo.requestTime.foreach(g.writeNumberField("Request Time", _)) + g.writeEndObject() + } + + def blockUpdatedInfoToJson(blockUpdatedInfo: BlockUpdatedInfo, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeFieldName("Block Manager ID") + blockManagerIdToJson(blockUpdatedInfo.blockManagerId, g) + g.writeStringField("Block ID", blockUpdatedInfo.blockId.toString) + g.writeFieldName("Storage Level") + storageLevelToJson(blockUpdatedInfo.storageLevel, g) + g.writeNumberField("Memory Size", blockUpdatedInfo.memSize) + g.writeNumberField("Disk Size", blockUpdatedInfo.diskSize) + g.writeEndObject() + } + + def executorResourceRequestToJson(execReq: ExecutorResourceRequest, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Resource Name", execReq.resourceName) + g.writeNumberField("Amount", execReq.amount) + g.writeStringField("Discovery Script", execReq.discoveryScript) + g.writeStringField("Vendor", execReq.vendor) + g.writeEndObject() + } + + def executorResourceRequestMapToJson( + m: Map[String, ExecutorResourceRequest], + g: JsonGenerator): Unit = { + g.writeStartObject() + m.foreach { case (k, execReq) => + g.writeFieldName(k) + executorResourceRequestToJson(execReq, g) + } + g.writeEndObject() } - def taskResourceRequestToJson(taskReq: TaskResourceRequest): JValue = { - ("Resource Name" -> taskReq.resourceName) ~ - ("Amount" -> taskReq.amount) + def taskResourceRequestToJson(taskReq: TaskResourceRequest, g: JsonGenerator): Unit = { + g.writeStartObject() + 
g.writeStringField("Resource Name", taskReq.resourceName) + g.writeNumberField("Amount", taskReq.amount) + g.writeEndObject() } - def taskResourceRequestMapToJson(m: Map[String, TaskResourceRequest]): JValue = { - val jsonFields = m.map { - case (k, taskReq) => - JField(k, taskResourceRequestToJson(taskReq)) + def taskResourceRequestMapToJson(m: Map[String, TaskResourceRequest], g: JsonGenerator): Unit = { + g.writeStartObject() + m.foreach { case (k, taskReq) => + g.writeFieldName(k) + taskResourceRequestToJson(taskReq, g) } - JObject(jsonFields.toList) + g.writeEndObject() } /** ------------------------------ * * Util JSON serialization methods | * ------------------------------- */ - def mapToJson(m: Map[String, String]): JValue = { - val jsonFields = m.map { case (k, v) => JField(k, JString(v)) } - JObject(jsonFields.toList) + def writeMapField(name: String, m: Map[String, String], g: JsonGenerator): Unit = { + g.writeObjectFieldStart(name) + m.foreach { case (k, v) => g.writeStringField(k, v) } + g.writeEndObject() } - def propertiesToJson(properties: Properties): JValue = { - Option(properties).map { p => - mapToJson(p.asScala) - }.getOrElse(JNothing) + def propertiesToJson(properties: Properties, g: JsonGenerator): Unit = { + g.writeStartObject() + properties.asScala.foreach { case (k, v) => g.writeStringField(k, v) } + g.writeEndObject() } - def UUIDToJson(id: UUID): JValue = { - ("Least Significant Bits" -> id.getLeastSignificantBits) ~ - ("Most Significant Bits" -> id.getMostSignificantBits) + def UUIDToJson(id: UUID, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeNumberField("Least Significant Bits", id.getLeastSignificantBits) + g.writeNumberField("Most Significant Bits", id.getMostSignificantBits) + g.writeEndObject() } - def stackTraceToJson(stackTrace: Array[StackTraceElement]): JValue = { - JArray(stackTrace.map { case line => - ("Declaring Class" -> line.getClassName) ~ - ("Method Name" -> line.getMethodName) ~ - ("File Name" -> 
line.getFileName) ~ - ("Line Number" -> line.getLineNumber) - }.toList) + def stackTraceToJson(stackTrace: Array[StackTraceElement], g: JsonGenerator): Unit = { + g.writeStartArray() + stackTrace.foreach { line => + g.writeStartObject() + g.writeStringField("Declaring Class", line.getClassName) + g.writeStringField("Method Name", line.getMethodName) + g.writeStringField("File Name", line.getFileName) + g.writeNumberField("Line Number", line.getLineNumber) + g.writeEndObject() + } + g.writeEndArray() } - def exceptionToJson(exception: Exception): JValue = { - ("Message" -> exception.getMessage) ~ - ("Stack Trace" -> stackTraceToJson(exception.getStackTrace)) + def exceptionToJson(exception: Exception, g: JsonGenerator): Unit = { + g.writeStartObject() + g.writeStringField("Message", exception.getMessage) + g.writeFieldName("Stack Trace") + stackTraceToJson(exception.getStackTrace, g) + g.writeEndObject() } @@ -640,10 +833,14 @@ private[spark] object JsonProtocol { val resourceProfileAdded = Utils.getFormattedClassName(SparkListenerResourceProfileAdded) } - def sparkEventFromJson(json: JValue): SparkListenerEvent = { + def sparkEventFromJson(json: String): SparkListenerEvent = { + sparkEventFromJson(mapper.readTree(json)) + } + + def sparkEventFromJson(json: JsonNode): SparkListenerEvent = { import SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES._ - (json \ "Event").extract[String] match { + json.get("Event").asText match { case `stageSubmitted` => stageSubmittedFromJson(json) case `stageCompleted` => stageCompletedFromJson(json) case `taskStart` => taskStartFromJson(json) @@ -664,66 +861,69 @@ private[spark] object JsonProtocol { case `stageExecutorMetrics` => stageExecutorMetricsFromJson(json) case `blockUpdate` => blockUpdateFromJson(json) case `resourceProfileAdded` => resourceProfileAddedFromJson(json) - case other => mapper.readValue(compact(render(json)), Utils.classForName(other)) + case other => mapper.readValue(json.toString, Utils.classForName(other)) 
.asInstanceOf[SparkListenerEvent] } } - def stageSubmittedFromJson(json: JValue): SparkListenerStageSubmitted = { - val stageInfo = stageInfoFromJson(json \ "Stage Info") - val properties = propertiesFromJson(json \ "Properties") + def stageSubmittedFromJson(json: JsonNode): SparkListenerStageSubmitted = { + val stageInfo = stageInfoFromJson(json.get("Stage Info")) + val properties = propertiesFromJson(json.get("Properties")) SparkListenerStageSubmitted(stageInfo, properties) } - def stageCompletedFromJson(json: JValue): SparkListenerStageCompleted = { - val stageInfo = stageInfoFromJson(json \ "Stage Info") + def stageCompletedFromJson(json: JsonNode): SparkListenerStageCompleted = { + val stageInfo = stageInfoFromJson(json.get("Stage Info")) SparkListenerStageCompleted(stageInfo) } - def taskStartFromJson(json: JValue): SparkListenerTaskStart = { - val stageId = (json \ "Stage ID").extract[Int] + def taskStartFromJson(json: JsonNode): SparkListenerTaskStart = { + val stageId = json.get("Stage ID").extractInt val stageAttemptId = - jsonOption(json \ "Stage Attempt ID").map(_.extract[Int]).getOrElse(0) - val taskInfo = taskInfoFromJson(json \ "Task Info") + jsonOption(json.get("Stage Attempt ID")).map(_.extractInt).getOrElse(0) + val taskInfo = taskInfoFromJson(json.get("Task Info")) SparkListenerTaskStart(stageId, stageAttemptId, taskInfo) } - def taskGettingResultFromJson(json: JValue): SparkListenerTaskGettingResult = { - val taskInfo = taskInfoFromJson(json \ "Task Info") + def taskGettingResultFromJson(json: JsonNode): SparkListenerTaskGettingResult = { + val taskInfo = taskInfoFromJson(json.get("Task Info")) SparkListenerTaskGettingResult(taskInfo) } /** Extract the executor metrics from JSON. */ - def executorMetricsFromJson(json: JValue): ExecutorMetrics = { + def executorMetricsFromJson(maybeJson: JsonNode): ExecutorMetrics = { + // Executor metrics might be absent in JSON from very old Spark versions. 
+ // In this case we return zero values for each metric. val metrics = ExecutorMetricType.metricToOffset.map { case (metric, _) => - metric -> jsonOption(json \ metric).map(_.extract[Long]).getOrElse(0L) + val metricValueJson = jsonOption(maybeJson).flatMap(json => jsonOption(json.get(metric))) + metric -> metricValueJson.map(_.extractLong).getOrElse(0L) } new ExecutorMetrics(metrics.toMap) } - def taskEndFromJson(json: JValue): SparkListenerTaskEnd = { - val stageId = (json \ "Stage ID").extract[Int] + def taskEndFromJson(json: JsonNode): SparkListenerTaskEnd = { + val stageId = json.get("Stage ID").extractInt val stageAttemptId = - jsonOption(json \ "Stage Attempt ID").map(_.extract[Int]).getOrElse(0) - val taskType = (json \ "Task Type").extract[String] - val taskEndReason = taskEndReasonFromJson(json \ "Task End Reason") - val taskInfo = taskInfoFromJson(json \ "Task Info") - val executorMetrics = executorMetricsFromJson(json \ "Task Executor Metrics") - val taskMetrics = taskMetricsFromJson(json \ "Task Metrics") + jsonOption(json.get("Stage Attempt ID")).map(_.extractInt).getOrElse(0) + val taskType = json.get("Task Type").extractString + val taskEndReason = taskEndReasonFromJson(json.get("Task End Reason")) + val taskInfo = taskInfoFromJson(json.get("Task Info")) + val executorMetrics = executorMetricsFromJson(json.get("Task Executor Metrics")) + val taskMetrics = taskMetricsFromJson(json.get("Task Metrics")) SparkListenerTaskEnd(stageId, stageAttemptId, taskType, taskEndReason, taskInfo, executorMetrics, taskMetrics) } - def jobStartFromJson(json: JValue): SparkListenerJobStart = { - val jobId = (json \ "Job ID").extract[Int] + def jobStartFromJson(json: JsonNode): SparkListenerJobStart = { + val jobId = json.get("Job ID").extractInt val submissionTime = - jsonOption(json \ "Submission Time").map(_.extract[Long]).getOrElse(-1L) - val stageIds = (json \ "Stage IDs").extract[List[JValue]].map(_.extract[Int]) - val properties = propertiesFromJson(json \ 
"Properties") + jsonOption(json.get("Submission Time")).map(_.extractLong).getOrElse(-1L) + val stageIds = json.get("Stage IDs").extractElements.map(_.extractInt).toArray.toSeq + val properties = propertiesFromJson(json.get("Properties")) // The "Stage Infos" field was added in Spark 1.2.0 - val stageInfos = jsonOption(json \ "Stage Infos") - .map(_.extract[Seq[JValue]].map(stageInfoFromJson)).getOrElse { + val stageInfos = jsonOption(json.get("Stage Infos")) + .map(_.extractElements.map(stageInfoFromJson).toArray.toSeq).getOrElse { stageIds.map { id => new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown", resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) @@ -732,155 +932,153 @@ private[spark] object JsonProtocol { SparkListenerJobStart(jobId, submissionTime, stageInfos, properties) } - def jobEndFromJson(json: JValue): SparkListenerJobEnd = { - val jobId = (json \ "Job ID").extract[Int] + def jobEndFromJson(json: JsonNode): SparkListenerJobEnd = { + val jobId = json.get("Job ID").extractInt val completionTime = - jsonOption(json \ "Completion Time").map(_.extract[Long]).getOrElse(-1L) - val jobResult = jobResultFromJson(json \ "Job Result") + jsonOption(json.get("Completion Time")).map(_.extractLong).getOrElse(-1L) + val jobResult = jobResultFromJson(json.get("Job Result")) SparkListenerJobEnd(jobId, completionTime, jobResult) } - def resourceProfileAddedFromJson(json: JValue): SparkListenerResourceProfileAdded = { - val profId = (json \ "Resource Profile Id").extract[Int] - val executorReqs = executorResourceRequestMapFromJson(json \ "Executor Resource Requests") - val taskReqs = taskResourceRequestMapFromJson(json \ "Task Resource Requests") + def resourceProfileAddedFromJson(json: JsonNode): SparkListenerResourceProfileAdded = { + val profId = json.get("Resource Profile Id").extractInt + val executorReqs = executorResourceRequestMapFromJson(json.get("Executor Resource Requests")) + val taskReqs = 
taskResourceRequestMapFromJson(json.get("Task Resource Requests")) val rp = new ResourceProfile(executorReqs.toMap, taskReqs.toMap) rp.setResourceProfileId(profId) SparkListenerResourceProfileAdded(rp) } - def executorResourceRequestFromJson(json: JValue): ExecutorResourceRequest = { - val rName = (json \ "Resource Name").extract[String] - val amount = (json \ "Amount").extract[Long] - val discoveryScript = (json \ "Discovery Script").extract[String] - val vendor = (json \ "Vendor").extract[String] + def executorResourceRequestFromJson(json: JsonNode): ExecutorResourceRequest = { + val rName = json.get("Resource Name").extractString + val amount = json.get("Amount").extractLong + val discoveryScript = json.get("Discovery Script").extractString + val vendor = json.get("Vendor").extractString new ExecutorResourceRequest(rName, amount, discoveryScript, vendor) } - def taskResourceRequestFromJson(json: JValue): TaskResourceRequest = { - val rName = (json \ "Resource Name").extract[String] - val amount = (json \ "Amount").extract[Double] + def taskResourceRequestFromJson(json: JsonNode): TaskResourceRequest = { + val rName = json.get("Resource Name").extractString + val amount = json.get("Amount").extractDouble new TaskResourceRequest(rName, amount) } - def taskResourceRequestMapFromJson(json: JValue): Map[String, TaskResourceRequest] = { - val jsonFields = json.asInstanceOf[JObject].obj - jsonFields.collect { case JField(k, v) => - val req = taskResourceRequestFromJson(v) - (k, req) + def taskResourceRequestMapFromJson(json: JsonNode): Map[String, TaskResourceRequest] = { + json.fields().asScala.collect { case field => + val req = taskResourceRequestFromJson(field.getValue) + (field.getKey, req) }.toMap } - def executorResourceRequestMapFromJson(json: JValue): Map[String, ExecutorResourceRequest] = { - val jsonFields = json.asInstanceOf[JObject].obj - jsonFields.collect { case JField(k, v) => - val req = executorResourceRequestFromJson(v) - (k, req) + def 
executorResourceRequestMapFromJson(json: JsonNode): Map[String, ExecutorResourceRequest] = { + json.fields().asScala.collect { case field => + val req = executorResourceRequestFromJson(field.getValue) + (field.getKey, req) }.toMap } - def environmentUpdateFromJson(json: JValue): SparkListenerEnvironmentUpdate = { + def environmentUpdateFromJson(json: JsonNode): SparkListenerEnvironmentUpdate = { // For compatible with previous event logs - val hadoopProperties = jsonOption(json \ "Hadoop Properties").map(mapFromJson(_).toSeq) + val hadoopProperties = jsonOption(json.get("Hadoop Properties")).map(mapFromJson(_).toSeq) .getOrElse(Seq.empty) - val metricsProperties = jsonOption(json \ "Metrics Properties").map(mapFromJson(_).toSeq) + // The "Metrics Properties" field was added in Spark 3.4.0: + val metricsProperties = jsonOption(json.get("Metrics Properties")).map(mapFromJson(_).toSeq) .getOrElse(Seq.empty) val environmentDetails = Map[String, Seq[(String, String)]]( - "JVM Information" -> mapFromJson(json \ "JVM Information").toSeq, - "Spark Properties" -> mapFromJson(json \ "Spark Properties").toSeq, + "JVM Information" -> mapFromJson(json.get("JVM Information")).toSeq, + "Spark Properties" -> mapFromJson(json.get("Spark Properties")).toSeq, "Hadoop Properties" -> hadoopProperties, - "System Properties" -> mapFromJson(json \ "System Properties").toSeq, + "System Properties" -> mapFromJson(json.get("System Properties")).toSeq, "Metrics Properties" -> metricsProperties, - "Classpath Entries" -> mapFromJson(json \ "Classpath Entries").toSeq) + "Classpath Entries" -> mapFromJson(json.get("Classpath Entries")).toSeq) SparkListenerEnvironmentUpdate(environmentDetails) } - def blockManagerAddedFromJson(json: JValue): SparkListenerBlockManagerAdded = { - val blockManagerId = blockManagerIdFromJson(json \ "Block Manager ID") - val maxMem = (json \ "Maximum Memory").extract[Long] - val time = jsonOption(json \ "Timestamp").map(_.extract[Long]).getOrElse(-1L) - val 
maxOnHeapMem = jsonOption(json \ "Maximum Onheap Memory").map(_.extract[Long]) - val maxOffHeapMem = jsonOption(json \ "Maximum Offheap Memory").map(_.extract[Long]) + def blockManagerAddedFromJson(json: JsonNode): SparkListenerBlockManagerAdded = { + val blockManagerId = blockManagerIdFromJson(json.get("Block Manager ID")) + val maxMem = json.get("Maximum Memory").extractLong + val time = jsonOption(json.get("Timestamp")).map(_.extractLong).getOrElse(-1L) + val maxOnHeapMem = jsonOption(json.get("Maximum Onheap Memory")).map(_.extractLong) + val maxOffHeapMem = jsonOption(json.get("Maximum Offheap Memory")).map(_.extractLong) SparkListenerBlockManagerAdded(time, blockManagerId, maxMem, maxOnHeapMem, maxOffHeapMem) } - def blockManagerRemovedFromJson(json: JValue): SparkListenerBlockManagerRemoved = { - val blockManagerId = blockManagerIdFromJson(json \ "Block Manager ID") - val time = jsonOption(json \ "Timestamp").map(_.extract[Long]).getOrElse(-1L) + def blockManagerRemovedFromJson(json: JsonNode): SparkListenerBlockManagerRemoved = { + val blockManagerId = blockManagerIdFromJson(json.get("Block Manager ID")) + val time = jsonOption(json.get("Timestamp")).map(_.extractLong).getOrElse(-1L) SparkListenerBlockManagerRemoved(time, blockManagerId) } - def unpersistRDDFromJson(json: JValue): SparkListenerUnpersistRDD = { - SparkListenerUnpersistRDD((json \ "RDD ID").extract[Int]) + def unpersistRDDFromJson(json: JsonNode): SparkListenerUnpersistRDD = { + SparkListenerUnpersistRDD(json.get("RDD ID").extractInt) } - def applicationStartFromJson(json: JValue): SparkListenerApplicationStart = { - val appName = (json \ "App Name").extract[String] - val appId = jsonOption(json \ "App ID").map(_.extract[String]) - val time = (json \ "Timestamp").extract[Long] - val sparkUser = (json \ "User").extract[String] - val appAttemptId = jsonOption(json \ "App Attempt ID").map(_.extract[String]) - val driverLogs = jsonOption(json \ "Driver Logs").map(mapFromJson) - val 
driverAttributes = jsonOption(json \ "Driver Attributes").map(mapFromJson) + def applicationStartFromJson(json: JsonNode): SparkListenerApplicationStart = { + val appName = json.get("App Name").extractString + val appId = jsonOption(json.get("App ID")).map(_.asText()) + val time = json.get("Timestamp").extractLong + val sparkUser = json.get("User").extractString + val appAttemptId = jsonOption(json.get("App Attempt ID")).map(_.asText()) + val driverLogs = jsonOption(json.get("Driver Logs")).map(mapFromJson) + val driverAttributes = jsonOption(json.get("Driver Attributes")).map(mapFromJson) SparkListenerApplicationStart(appName, appId, time, sparkUser, appAttemptId, driverLogs, driverAttributes) } - def applicationEndFromJson(json: JValue): SparkListenerApplicationEnd = { - SparkListenerApplicationEnd((json \ "Timestamp").extract[Long]) + def applicationEndFromJson(json: JsonNode): SparkListenerApplicationEnd = { + SparkListenerApplicationEnd(json.get("Timestamp").extractLong) } - def executorAddedFromJson(json: JValue): SparkListenerExecutorAdded = { - val time = (json \ "Timestamp").extract[Long] - val executorId = (json \ "Executor ID").extract[String] - val executorInfo = executorInfoFromJson(json \ "Executor Info") + def executorAddedFromJson(json: JsonNode): SparkListenerExecutorAdded = { + val time = json.get("Timestamp").extractLong + val executorId = json.get("Executor ID").extractString + val executorInfo = executorInfoFromJson(json.get("Executor Info")) SparkListenerExecutorAdded(time, executorId, executorInfo) } - def executorRemovedFromJson(json: JValue): SparkListenerExecutorRemoved = { - val time = (json \ "Timestamp").extract[Long] - val executorId = (json \ "Executor ID").extract[String] - val reason = (json \ "Removed Reason").extract[String] + def executorRemovedFromJson(json: JsonNode): SparkListenerExecutorRemoved = { + val time = json.get("Timestamp").extractLong + val executorId = json.get("Executor ID").extractString + val reason = 
json.get("Removed Reason").extractString SparkListenerExecutorRemoved(time, executorId, reason) } - def logStartFromJson(json: JValue): SparkListenerLogStart = { - val sparkVersion = (json \ "Spark Version").extract[String] + def logStartFromJson(json: JsonNode): SparkListenerLogStart = { + val sparkVersion = json.get("Spark Version").extractString SparkListenerLogStart(sparkVersion) } - def executorMetricsUpdateFromJson(json: JValue): SparkListenerExecutorMetricsUpdate = { - val execInfo = (json \ "Executor ID").extract[String] - val accumUpdates = (json \ "Metrics Updated").extract[List[JValue]].map { json => - val taskId = (json \ "Task ID").extract[Long] - val stageId = (json \ "Stage ID").extract[Int] - val stageAttemptId = (json \ "Stage Attempt ID").extract[Int] + def executorMetricsUpdateFromJson(json: JsonNode): SparkListenerExecutorMetricsUpdate = { + val execInfo = json.get("Executor ID").extractString + val accumUpdates = json.get("Metrics Updated").extractElements.map { json => + val taskId = json.get("Task ID").extractLong + val stageId = json.get("Stage ID").extractInt + val stageAttemptId = json.get("Stage Attempt ID").extractInt val updates = - (json \ "Accumulator Updates").extract[List[JValue]].map(accumulableInfoFromJson) + json.get("Accumulator Updates").extractElements.map(accumulableInfoFromJson).toArray.toSeq (taskId, stageId, stageAttemptId, updates) - } - val executorUpdates = (json \ "Executor Metrics Updated") match { - case JNothing => Map.empty[(Int, Int), ExecutorMetrics] - case value: JValue => value.extract[List[JValue]].map { json => - val stageId = (json \ "Stage ID").extract[Int] - val stageAttemptId = (json \ "Stage Attempt ID").extract[Int] - val executorMetrics = executorMetricsFromJson(json \ "Executor Metrics") + }.toArray.toSeq + val executorUpdates = jsonOption(json.get("Executor Metrics Updated")).map { value => + value.extractElements.map { json => + val stageId = json.get("Stage ID").extractInt + val stageAttemptId = 
json.get("Stage Attempt ID").extractInt + val executorMetrics = executorMetricsFromJson(json.get("Executor Metrics")) ((stageId, stageAttemptId) -> executorMetrics) }.toMap - } + }.getOrElse(Map.empty[(Int, Int), ExecutorMetrics]) SparkListenerExecutorMetricsUpdate(execInfo, accumUpdates, executorUpdates) } - def stageExecutorMetricsFromJson(json: JValue): SparkListenerStageExecutorMetrics = { - val execId = (json \ "Executor ID").extract[String] - val stageId = (json \ "Stage ID").extract[Int] - val stageAttemptId = (json \ "Stage Attempt ID").extract[Int] - val executorMetrics = executorMetricsFromJson(json \ "Executor Metrics") + def stageExecutorMetricsFromJson(json: JsonNode): SparkListenerStageExecutorMetrics = { + val execId = json.get("Executor ID").extractString + val stageId = json.get("Stage ID").extractInt + val stageAttemptId = json.get("Stage Attempt ID").extractInt + val executorMetrics = executorMetricsFromJson(json.get("Executor Metrics")) SparkListenerStageExecutorMetrics(execId, stageId, stageAttemptId, executorMetrics) } - def blockUpdateFromJson(json: JValue): SparkListenerBlockUpdated = { - val blockUpdatedInfo = blockUpdatedInfoFromJson(json \ "Block Updated Info") + def blockUpdateFromJson(json: JsonNode): SparkListenerBlockUpdated = { + val blockUpdatedInfo = blockUpdatedInfoFromJson(json.get("Block Updated Info")) SparkListenerBlockUpdated(blockUpdatedInfo) } @@ -888,27 +1086,27 @@ private[spark] object JsonProtocol { * JSON deserialization methods for classes SparkListenerEvents depend on | * ---------------------------------------------------------------------- */ - def stageInfoFromJson(json: JValue): StageInfo = { - val stageId = (json \ "Stage ID").extract[Int] - val attemptId = jsonOption(json \ "Stage Attempt ID").map(_.extract[Int]).getOrElse(0) - val stageName = (json \ "Stage Name").extract[String] - val numTasks = (json \ "Number of Tasks").extract[Int] - val rddInfos = (json \ "RDD 
Info").extract[List[JValue]].map(rddInfoFromJson) - val parentIds = jsonOption(json \ "Parent IDs") - .map { l => l.extract[List[JValue]].map(_.extract[Int]) } + def stageInfoFromJson(json: JsonNode): StageInfo = { + val stageId = json.get("Stage ID").extractInt + val attemptId = jsonOption(json.get("Stage Attempt ID")).map(_.extractInt).getOrElse(0) + val stageName = json.get("Stage Name").extractString + val numTasks = json.get("Number of Tasks").extractInt + val rddInfos = json.get("RDD Info").extractElements.map(rddInfoFromJson).toArray + val parentIds = jsonOption(json.get("Parent IDs")) + .map { l => l.extractElements.map(_.extractInt).toArray.toSeq } .getOrElse(Seq.empty) - val details = jsonOption(json \ "Details").map(_.extract[String]).getOrElse("") - val submissionTime = jsonOption(json \ "Submission Time").map(_.extract[Long]) - val completionTime = jsonOption(json \ "Completion Time").map(_.extract[Long]) - val failureReason = jsonOption(json \ "Failure Reason").map(_.extract[String]) + val details = jsonOption(json.get("Details")).map(_.asText).getOrElse("") + val submissionTime = jsonOption(json.get("Submission Time")).map(_.extractLong) + val completionTime = jsonOption(json.get("Completion Time")).map(_.extractLong) + val failureReason = jsonOption(json.get("Failure Reason")).map(_.asText) val accumulatedValues = { - jsonOption(json \ "Accumulables").map(_.extract[List[JValue]]) match { + jsonOption(json.get("Accumulables")).map(_.extractElements) match { case Some(values) => values.map(accumulableInfoFromJson) case None => Seq.empty[AccumulableInfo] } } - val rpId = jsonOption(json \ "Resource Profile Id").map(_.extract[Int]) + val rpId = jsonOption(json.get("Resource Profile Id")).map(_.extractInt) val stageProf = rpId.getOrElse(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val stageInfo = new StageInfo(stageId, attemptId, stageName, numTasks, rddInfos, parentIds, details, resourceProfileId = stageProf) @@ -921,22 +1119,23 @@ private[spark] object 
JsonProtocol { stageInfo } - def taskInfoFromJson(json: JValue): TaskInfo = { - val taskId = (json \ "Task ID").extract[Long] - val index = (json \ "Index").extract[Int] - val attempt = jsonOption(json \ "Attempt").map(_.extract[Int]).getOrElse(1) - val partitionId = jsonOption(json \ "Partition ID").map(_.extract[Int]).getOrElse(-1) - val launchTime = (json \ "Launch Time").extract[Long] - val executorId = weakIntern((json \ "Executor ID").extract[String]) - val host = weakIntern((json \ "Host").extract[String]) - val taskLocality = TaskLocality.withName((json \ "Locality").extract[String]) - val speculative = jsonOption(json \ "Speculative").exists(_.extract[Boolean]) - val gettingResultTime = (json \ "Getting Result Time").extract[Long] - val finishTime = (json \ "Finish Time").extract[Long] - val failed = (json \ "Failed").extract[Boolean] - val killed = jsonOption(json \ "Killed").exists(_.extract[Boolean]) - val accumulables = jsonOption(json \ "Accumulables").map(_.extract[Seq[JValue]]) match { - case Some(values) => values.map(accumulableInfoFromJson) + def taskInfoFromJson(json: JsonNode): TaskInfo = { + val taskId = json.get("Task ID").extractLong + val index = json.get("Index").extractInt + val attempt = jsonOption(json.get("Attempt")).map(_.extractInt).getOrElse(1) + // The "Partition ID" field was added in Spark 3.3.0: + val partitionId = jsonOption(json.get("Partition ID")).map(_.extractInt).getOrElse(-1) + val launchTime = json.get("Launch Time").extractLong + val executorId = weakIntern(json.get("Executor ID").extractString) + val host = weakIntern(json.get("Host").extractString) + val taskLocality = TaskLocality.withName(json.get("Locality").extractString) + val speculative = jsonOption(json.get("Speculative")).exists(_.extractBoolean) + val gettingResultTime = json.get("Getting Result Time").extractLong + val finishTime = json.get("Finish Time").extractLong + val failed = json.get("Failed").extractBoolean + val killed = 
jsonOption(json.get("Killed")).exists(_.extractBoolean) + val accumulables = jsonOption(json.get("Accumulables")).map(_.extractElements) match { + case Some(values) => values.map(accumulableInfoFromJson).toArray.toSeq case None => Seq.empty[AccumulableInfo] } @@ -951,15 +1150,15 @@ private[spark] object JsonProtocol { taskInfo } - def accumulableInfoFromJson(json: JValue): AccumulableInfo = { - val id = (json \ "ID").extract[Long] - val name = jsonOption(json \ "Name").map(_.extract[String]) - val update = jsonOption(json \ "Update").map { v => accumValueFromJson(name, v) } - val value = jsonOption(json \ "Value").map { v => accumValueFromJson(name, v) } - val internal = jsonOption(json \ "Internal").exists(_.extract[Boolean]) + def accumulableInfoFromJson(json: JsonNode): AccumulableInfo = { + val id = json.get("ID").extractLong + val name = jsonOption(json.get("Name")).map(_.asText) + val update = jsonOption(json.get("Update")).map { v => accumValueFromJson(name, v) } + val value = jsonOption(json.get("Value")).map { v => accumValueFromJson(name, v) } + val internal = jsonOption(json.get("Internal")).exists(_.extractBoolean) val countFailedValues = - jsonOption(json \ "Count Failed Values").exists(_.extract[Boolean]) - val metadata = jsonOption(json \ "Metadata").map(_.extract[String]) + jsonOption(json.get("Count Failed Values")).exists(_.extractBoolean) + val metadata = jsonOption(json.get("Metadata")).map(_.asText) new AccumulableInfo(id, name, update, value, internal, countFailedValues, metadata) } @@ -972,98 +1171,96 @@ private[spark] object JsonProtocol { * * The behavior here must match that of [[accumValueToJson]]. Exposed for testing. 
*/ - private[util] def accumValueFromJson(name: Option[String], value: JValue): Any = { + private[util] def accumValueFromJson(name: Option[String], value: JsonNode): Any = { if (name.exists(_.startsWith(InternalAccumulator.METRICS_PREFIX))) { - value match { - case JInt(v) => v.toLong - case JArray(v) => - v.map { blockJson => - val id = BlockId((blockJson \ "Block ID").extract[String]) - val status = blockStatusFromJson(blockJson \ "Status") - (id, status) - }.asJava - case _ => throw new IllegalArgumentException(s"unexpected json value $value for " + + if (value.isIntegralNumber) { + value.extractLong + } else if (value.isArray) { + value.extractElements.map { blockJson => + val id = BlockId(blockJson.get("Block ID").extractString) + val status = blockStatusFromJson(blockJson.get("Status")) + (id, status) + }.toArray.toSeq.asJava + } else { + throw new IllegalArgumentException(s"unexpected json value $value for " + "accumulator " + name.get) } } else { - value.extract[String] + value.asText } } - def taskMetricsFromJson(json: JValue): TaskMetrics = { + def taskMetricsFromJson(json: JsonNode): TaskMetrics = { val metrics = TaskMetrics.empty - if (json == JNothing) { + if (json == null || json.isNull) { return metrics } - metrics.setExecutorDeserializeTime((json \ "Executor Deserialize Time").extract[Long]) - metrics.setExecutorDeserializeCpuTime((json \ "Executor Deserialize CPU Time") match { - case JNothing => 0 - case x => x.extract[Long] - }) - metrics.setExecutorRunTime((json \ "Executor Run Time").extract[Long]) - metrics.setExecutorCpuTime((json \ "Executor CPU Time") match { - case JNothing => 0 - case x => x.extract[Long] - }) - metrics.setPeakExecutionMemory((json \ "Peak Execution Memory") match { - case JNothing => 0 - case x => x.extract[Long] - }) - metrics.setResultSize((json \ "Result Size").extract[Long]) - metrics.setJvmGCTime((json \ "JVM GC Time").extract[Long]) - metrics.setResultSerializationTime((json \ "Result Serialization 
Time").extract[Long]) - metrics.incMemoryBytesSpilled((json \ "Memory Bytes Spilled").extract[Long]) - metrics.incDiskBytesSpilled((json \ "Disk Bytes Spilled").extract[Long]) + metrics.setExecutorDeserializeTime(json.get("Executor Deserialize Time").extractLong) + // The "Executor Deserialize CPU Time" field was added in Spark 2.1.0: + metrics.setExecutorDeserializeCpuTime( + jsonOption(json.get("Executor Deserialize CPU Time")).map(_.extractLong).getOrElse(0)) + metrics.setExecutorRunTime(json.get("Executor Run Time").extractLong) + // The "Executor CPU Time" field was added in Spark 2.1.0: + metrics.setExecutorCpuTime( + jsonOption(json.get("Executor CPU Time")).map(_.extractLong).getOrElse(0)) + // The "Peak Execution Memory" field was added in Spark 3.0.0: + metrics.setPeakExecutionMemory( + jsonOption(json.get("Peak Execution Memory")).map(_.extractLong).getOrElse(0)) + metrics.setResultSize(json.get("Result Size").extractLong) + metrics.setJvmGCTime(json.get("JVM GC Time").extractLong) + metrics.setResultSerializationTime(json.get("Result Serialization Time").extractLong) + metrics.incMemoryBytesSpilled(json.get("Memory Bytes Spilled").extractLong) + metrics.incDiskBytesSpilled(json.get("Disk Bytes Spilled").extractLong) // Shuffle read metrics - jsonOption(json \ "Shuffle Read Metrics").foreach { readJson => + jsonOption(json.get("Shuffle Read Metrics")).foreach { readJson => val readMetrics = metrics.createTempShuffleReadMetrics() - readMetrics.incRemoteBlocksFetched((readJson \ "Remote Blocks Fetched").extract[Int]) - readMetrics.incLocalBlocksFetched((readJson \ "Local Blocks Fetched").extract[Int]) - readMetrics.incRemoteBytesRead((readJson \ "Remote Bytes Read").extract[Long]) - jsonOption(readJson \ "Remote Bytes Read To Disk") - .foreach { v => readMetrics.incRemoteBytesReadToDisk(v.extract[Long])} + readMetrics.incRemoteBlocksFetched(readJson.get("Remote Blocks Fetched").extractInt) + readMetrics.incLocalBlocksFetched(readJson.get("Local Blocks 
Fetched").extractInt) + readMetrics.incRemoteBytesRead(readJson.get("Remote Bytes Read").extractLong) + jsonOption(readJson.get("Remote Bytes Read To Disk")) + .foreach { v => readMetrics.incRemoteBytesReadToDisk(v.extractLong)} readMetrics.incLocalBytesRead( - jsonOption(readJson \ "Local Bytes Read").map(_.extract[Long]).getOrElse(0L)) - readMetrics.incFetchWaitTime((readJson \ "Fetch Wait Time").extract[Long]) + jsonOption(readJson.get("Local Bytes Read")).map(_.extractLong).getOrElse(0L)) + readMetrics.incFetchWaitTime(readJson.get("Fetch Wait Time").extractLong) readMetrics.incRecordsRead( - jsonOption(readJson \ "Total Records Read").map(_.extract[Long]).getOrElse(0L)) + jsonOption(readJson.get("Total Records Read")).map(_.extractLong).getOrElse(0L)) metrics.mergeShuffleReadMetrics() } // Shuffle write metrics // TODO: Drop the redundant "Shuffle" since it's inconsistent with related classes. - jsonOption(json \ "Shuffle Write Metrics").foreach { writeJson => + jsonOption(json.get("Shuffle Write Metrics")).foreach { writeJson => val writeMetrics = metrics.shuffleWriteMetrics - writeMetrics.incBytesWritten((writeJson \ "Shuffle Bytes Written").extract[Long]) + writeMetrics.incBytesWritten(writeJson.get("Shuffle Bytes Written").extractLong) writeMetrics.incRecordsWritten( - jsonOption(writeJson \ "Shuffle Records Written").map(_.extract[Long]).getOrElse(0L)) - writeMetrics.incWriteTime((writeJson \ "Shuffle Write Time").extract[Long]) + jsonOption(writeJson.get("Shuffle Records Written")).map(_.extractLong).getOrElse(0L)) + writeMetrics.incWriteTime(writeJson.get("Shuffle Write Time").extractLong) } // Output metrics - jsonOption(json \ "Output Metrics").foreach { outJson => + jsonOption(json.get("Output Metrics")).foreach { outJson => val outputMetrics = metrics.outputMetrics - outputMetrics.setBytesWritten((outJson \ "Bytes Written").extract[Long]) + outputMetrics.setBytesWritten(outJson.get("Bytes Written").extractLong) outputMetrics.setRecordsWritten( - 
jsonOption(outJson \ "Records Written").map(_.extract[Long]).getOrElse(0L)) + jsonOption(outJson.get("Records Written")).map(_.extractLong).getOrElse(0L)) } // Input metrics - jsonOption(json \ "Input Metrics").foreach { inJson => + jsonOption(json.get("Input Metrics")).foreach { inJson => val inputMetrics = metrics.inputMetrics - inputMetrics.incBytesRead((inJson \ "Bytes Read").extract[Long]) + inputMetrics.incBytesRead(inJson.get("Bytes Read").extractLong) inputMetrics.incRecordsRead( - jsonOption(inJson \ "Records Read").map(_.extract[Long]).getOrElse(0L)) + jsonOption(inJson.get("Records Read")).map(_.extractLong).getOrElse(0L)) } // Updated blocks - jsonOption(json \ "Updated Blocks").foreach { blocksJson => - metrics.setUpdatedBlockStatuses(blocksJson.extract[List[JValue]].map { blockJson => - val id = BlockId((blockJson \ "Block ID").extract[String]) - val status = blockStatusFromJson(blockJson \ "Status") + jsonOption(json.get("Updated Blocks")).foreach { blocksJson => + metrics.setUpdatedBlockStatuses(blocksJson.extractElements.map { blockJson => + val id = BlockId(blockJson.get("Block ID").extractString) + val status = blockStatusFromJson(blockJson.get("Status")) (id, status) - }) + }.toArray.toSeq) } metrics @@ -1081,61 +1278,61 @@ private[spark] object JsonProtocol { val unknownReason = Utils.getFormattedClassName(UnknownReason) } - def taskEndReasonFromJson(json: JValue): TaskEndReason = { + def taskEndReasonFromJson(json: JsonNode): TaskEndReason = { import TASK_END_REASON_FORMATTED_CLASS_NAMES._ - (json \ "Reason").extract[String] match { + json.get("Reason").extractString match { case `success` => Success case `resubmitted` => Resubmitted case `fetchFailed` => - val blockManagerAddress = blockManagerIdFromJson(json \ "Block Manager Address") - val shuffleId = (json \ "Shuffle ID").extract[Int] - val mapId = (json \ "Map ID").extract[Long] - val mapIndex = json \ "Map Index" match { - case JNothing => - // Note, we use the invalid value Int.MinValue 
here to fill the map index for backward - // compatibility. Otherwise, the fetch failed event will be dropped when the history - // server loads the event log written by the Spark version before 3.0. - Int.MinValue - case x => x.extract[Int] + val blockManagerAddress = blockManagerIdFromJson(json.get("Block Manager Address")) + val shuffleId = json.get("Shuffle ID").extractInt + val mapId = json.get("Map ID").extractLong + val mapIndex = jsonOption(json.get("Map Index")).map(_.extractInt).getOrElse { + // Note, we use the invalid value Int.MinValue here to fill the map index for backward + // compatibility. Otherwise, the fetch failed event will be dropped when the history + // server loads the event log written by the Spark version before 3.0. + Int.MinValue } - val reduceId = (json \ "Reduce ID").extract[Int] - val message = jsonOption(json \ "Message").map(_.extract[String]) + val reduceId = json.get("Reduce ID").extractInt + val message = jsonOption(json.get("Message")).map(_.asText) new FetchFailed(blockManagerAddress, shuffleId, mapId, mapIndex, reduceId, message.getOrElse("Unknown reason")) case `exceptionFailure` => - val className = (json \ "Class Name").extract[String] - val description = (json \ "Description").extract[String] - val stackTrace = stackTraceFromJson(json \ "Stack Trace") + val className = json.get("Class Name").extractString + val description = json.get("Description").extractString + val stackTrace = stackTraceFromJson(json.get("Stack Trace")) val fullStackTrace = - jsonOption(json \ "Full Stack Trace").map(_.extract[String]).orNull + jsonOption(json.get("Full Stack Trace")).map(_.asText).orNull // Fallback on getting accumulator updates from TaskMetrics, which was logged in Spark 1.x - val accumUpdates = jsonOption(json \ "Accumulator Updates") - .map(_.extract[List[JValue]].map(accumulableInfoFromJson)) - .getOrElse(taskMetricsFromJson(json \ "Metrics").accumulators().map(acc => { + val accumUpdates = jsonOption(json.get("Accumulator 
Updates")) + .map(_.extractElements.map(accumulableInfoFromJson).toArray.toSeq) + .getOrElse(taskMetricsFromJson(json.get("Metrics")).accumulators().map(acc => { acc.toInfo(Some(acc.value), None) - })) + }).toArray.toSeq) ExceptionFailure(className, description, stackTrace, fullStackTrace, None, accumUpdates) case `taskResultLost` => TaskResultLost case `taskKilled` => - val killReason = jsonOption(json \ "Kill Reason") - .map(_.extract[String]).getOrElse("unknown reason") - val accumUpdates = jsonOption(json \ "Accumulator Updates") - .map(_.extract[List[JValue]].map(accumulableInfoFromJson)) + // The "Kill Reason" field was added in Spark 2.2.0: + val killReason = jsonOption(json.get("Kill Reason")) + .map(_.asText).getOrElse("unknown reason") + // The "Accumulator Updates" field was added in Spark 2.4.0: + val accumUpdates = jsonOption(json.get("Accumulator Updates")) + .map(_.extractElements.map(accumulableInfoFromJson).toArray.toSeq) .getOrElse(Seq[AccumulableInfo]()) TaskKilled(killReason, accumUpdates) case `taskCommitDenied` => // Unfortunately, the `TaskCommitDenied` message was introduced in 1.3.0 but the JSON // de/serialization logic was not added until 1.5.1. To provide backward compatibility // for reading those logs, we need to provide default values for all the fields. 
- val jobId = jsonOption(json \ "Job ID").map(_.extract[Int]).getOrElse(-1) - val partitionId = jsonOption(json \ "Partition ID").map(_.extract[Int]).getOrElse(-1) - val attemptNo = jsonOption(json \ "Attempt Number").map(_.extract[Int]).getOrElse(-1) + val jobId = jsonOption(json.get("Job ID")).map(_.extractInt).getOrElse(-1) + val partitionId = jsonOption(json.get("Partition ID")).map(_.extractInt).getOrElse(-1) + val attemptNo = jsonOption(json.get("Attempt Number")).map(_.extractInt).getOrElse(-1) TaskCommitDenied(jobId, partitionId, attemptNo) case `executorLostFailure` => - val exitCausedByApp = jsonOption(json \ "Exit Caused By App").map(_.extract[Boolean]) - val executorId = jsonOption(json \ "Executor ID").map(_.extract[String]) - val reason = jsonOption(json \ "Loss Reason").map(_.extract[String]) + val exitCausedByApp = jsonOption(json.get("Exit Caused By App")).map(_.extractBoolean) + val executorId = jsonOption(json.get("Executor ID")).map(_.asText) + val reason = jsonOption(json.get("Loss Reason")).map(_.asText) ExecutorLostFailure( executorId.getOrElse("Unknown"), exitCausedByApp.getOrElse(true), @@ -1144,14 +1341,14 @@ private[spark] object JsonProtocol { } } - def blockManagerIdFromJson(json: JValue): BlockManagerId = { + def blockManagerIdFromJson(json: JsonNode): BlockManagerId = { // On metadata fetch fail, block manager ID can be null (SPARK-4471) - if (json == JNothing) { + if (json == null || json.isNull) { return null } - val executorId = weakIntern((json \ "Executor ID").extract[String]) - val host = weakIntern((json \ "Host").extract[String]) - val port = (json \ "Port").extract[Int] + val executorId = weakIntern(json.get("Executor ID").extractString) + val host = weakIntern(json.get("Host").extractString) + val port = json.get("Port").extractInt BlockManagerId(executorId, host, port) } @@ -1160,36 +1357,37 @@ private[spark] object JsonProtocol { val jobFailed = Utils.getFormattedClassName(JobFailed) } - def jobResultFromJson(json: 
JValue): JobResult = { + def jobResultFromJson(json: JsonNode): JobResult = { import JOB_RESULT_FORMATTED_CLASS_NAMES._ - (json \ "Result").extract[String] match { + json.get("Result").extractString match { case `jobSucceeded` => JobSucceeded case `jobFailed` => - val exception = exceptionFromJson(json \ "Exception") + val exception = exceptionFromJson(json.get("Exception")) new JobFailed(exception) } } - def rddInfoFromJson(json: JValue): RDDInfo = { - val rddId = (json \ "RDD ID").extract[Int] - val name = (json \ "Name").extract[String] - val scope = jsonOption(json \ "Scope") - .map(_.extract[String]) + def rddInfoFromJson(json: JsonNode): RDDInfo = { + val rddId = json.get("RDD ID").extractInt + val name = json.get("Name").extractString + val scope = jsonOption(json.get("Scope")) + .map(_.asText) .map(RDDOperationScope.fromJson) - val callsite = jsonOption(json \ "Callsite").map(_.extract[String]).getOrElse("") - val parentIds = jsonOption(json \ "Parent IDs") - .map { l => l.extract[List[JValue]].map(_.extract[Int]) } + val callsite = jsonOption(json.get("Callsite")).map(_.asText).getOrElse("") + val parentIds = jsonOption(json.get("Parent IDs")) + .map { l => l.extractElements.map(_.extractInt).toArray.toSeq } .getOrElse(Seq.empty) - val storageLevel = storageLevelFromJson(json \ "Storage Level") - val isBarrier = jsonOption(json \ "Barrier").map(_.extract[Boolean]).getOrElse(false) - val numPartitions = (json \ "Number of Partitions").extract[Int] - val numCachedPartitions = (json \ "Number of Cached Partitions").extract[Int] - val memSize = (json \ "Memory Size").extract[Long] - val diskSize = (json \ "Disk Size").extract[Long] + val storageLevel = storageLevelFromJson(json.get("Storage Level")) + // The "Barrier" field was added in Spark 3.0.0: + val isBarrier = jsonOption(json.get("Barrier")).map(_.extractBoolean).getOrElse(false) + val numPartitions = json.get("Number of Partitions").extractInt + val numCachedPartitions = json.get("Number of Cached 
Partitions").extractInt + val memSize = json.get("Memory Size").extractLong + val diskSize = json.get("Disk Size").extractLong val outputDeterministicLevel = DeterministicLevel.withName( - jsonOption(json \ "DeterministicLevel").map(_.extract[String]).getOrElse("DETERMINATE")) + jsonOption(json.get("DeterministicLevel")).map(_.asText).getOrElse("DETERMINATE")) val rddInfo = new RDDInfo(rddId, name, numPartitions, storageLevel, isBarrier, parentIds, callsite, scope, @@ -1200,16 +1398,16 @@ private[spark] object JsonProtocol { rddInfo } - def storageLevelFromJson(json: JValue): StorageLevel = { - val useDisk = (json \ "Use Disk").extract[Boolean] - val useMemory = (json \ "Use Memory").extract[Boolean] + def storageLevelFromJson(json: JsonNode): StorageLevel = { + val useDisk = json.get("Use Disk").extractBoolean + val useMemory = json.get("Use Memory").extractBoolean // The "Use Off Heap" field was added in Spark 3.4.0 - val useOffHeap = jsonOption(json \ "Use Off Heap") match { - case Some(value) => value.extract[Boolean] + val useOffHeap = jsonOption(json.get("Use Off Heap")) match { + case Some(value) => value.extractBoolean case None => false } - val deserialized = (json \ "Deserialized").extract[Boolean] - val replication = (json \ "Replication").extract[Int] + val deserialized = json.get("Deserialized").extractBoolean + val replication = json.get("Replication").extractInt StorageLevel( useDisk = useDisk, useMemory = useMemory, @@ -1218,54 +1416,59 @@ private[spark] object JsonProtocol { replication = replication) } - def blockStatusFromJson(json: JValue): BlockStatus = { - val storageLevel = storageLevelFromJson(json \ "Storage Level") - val memorySize = (json \ "Memory Size").extract[Long] - val diskSize = (json \ "Disk Size").extract[Long] + def blockStatusFromJson(json: JsonNode): BlockStatus = { + val storageLevel = storageLevelFromJson(json.get("Storage Level")) + val memorySize = json.get("Memory Size").extractLong + val diskSize = json.get("Disk 
Size").extractLong BlockStatus(storageLevel, memorySize, diskSize) } - def executorInfoFromJson(json: JValue): ExecutorInfo = { - val executorHost = (json \ "Host").extract[String] - val totalCores = (json \ "Total Cores").extract[Int] - val logUrls = mapFromJson(json \ "Log Urls").toMap - val attributes = jsonOption(json \ "Attributes") match { + def executorInfoFromJson(json: JsonNode): ExecutorInfo = { + val executorHost = json.get("Host").extractString + val totalCores = json.get("Total Cores").extractInt + val logUrls = mapFromJson(json.get("Log Urls")).toMap + // The "Attributes" field was added in Spark 3.0.0: + val attributes = jsonOption(json.get("Attributes")) match { case Some(attr) => mapFromJson(attr).toMap case None => Map.empty[String, String] } - val resources = jsonOption(json \ "Resources") match { + // The "Resources" field was added in Spark 3.0.0: + val resources = jsonOption(json.get("Resources")) match { case Some(resources) => resourcesMapFromJson(resources).toMap case None => Map.empty[String, ResourceInformation] } - val resourceProfileId = jsonOption(json \ "Resource Profile Id") match { - case Some(id) => id.extract[Int] + // The "Resource Profile Id" field was added in Spark 3.4.0 + val resourceProfileId = jsonOption(json.get("Resource Profile Id")) match { + case Some(id) => id.extractInt case None => ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID } - val registrationTs = jsonOption(json \ "Registration Time") map { ts => - ts.extract[Long] + // The "Registration Time" field was added in Spark 3.4.0 + val registrationTs = jsonOption(json.get("Registration Time")) map { ts => + ts.extractLong } - val requestTs = jsonOption(json \ "Request Time") map { ts => - ts.extract[Long] + // The "Request Time" field was added in Spark 3.4.0 + val requestTs = jsonOption(json.get("Request Time")) map { ts => + ts.extractLong } new ExecutorInfo(executorHost, totalCores, logUrls, attributes.toMap, resources.toMap, resourceProfileId, registrationTs, 
requestTs) } - def blockUpdatedInfoFromJson(json: JValue): BlockUpdatedInfo = { - val blockManagerId = blockManagerIdFromJson(json \ "Block Manager ID") - val blockId = BlockId((json \ "Block ID").extract[String]) - val storageLevel = storageLevelFromJson(json \ "Storage Level") - val memorySize = (json \ "Memory Size").extract[Long] - val diskSize = (json \ "Disk Size").extract[Long] + def blockUpdatedInfoFromJson(json: JsonNode): BlockUpdatedInfo = { + val blockManagerId = blockManagerIdFromJson(json.get("Block Manager ID")) + val blockId = BlockId(json.get("Block ID").extractString) + val storageLevel = storageLevelFromJson(json.get("Storage Level")) + val memorySize = json.get("Memory Size").extractLong + val diskSize = json.get("Disk Size").extractLong BlockUpdatedInfo(blockManagerId, blockId, storageLevel, memorySize, diskSize) } - def resourcesMapFromJson(json: JValue): Map[String, ResourceInformation] = { - val jsonFields = json.asInstanceOf[JObject].obj - jsonFields.collect { case JField(k, v) => - val resourceInfo = ResourceInformation.parseJson(v) - (k, resourceInfo) + def resourcesMapFromJson(json: JsonNode): Map[String, ResourceInformation] = { + assert(json.isObject, s"expected object, got ${json.getNodeType}") + json.fields.asScala.map { field => + val resourceInfo = ResourceInformation.parseJson(field.getValue.toString) + (field.getKey, resourceInfo) }.toMap } @@ -1273,49 +1476,86 @@ private[spark] object JsonProtocol { * Util JSON deserialization methods | * --------------------------------- */ - def mapFromJson(json: JValue): Map[String, String] = { - val jsonFields = json.asInstanceOf[JObject].obj - jsonFields.collect { case JField(k, JString(v)) => (k, v) }.toMap + def mapFromJson(json: JsonNode): Map[String, String] = { + assert(json.isObject, s"expected object, got ${json.getNodeType}") + json.fields.asScala.map { field => + (field.getKey, field.getValue.extractString) + }.toMap } - def propertiesFromJson(json: JValue): Properties = { + def 
propertiesFromJson(json: JsonNode): Properties = { jsonOption(json).map { value => val properties = new Properties - mapFromJson(json).foreach { case (k, v) => properties.setProperty(k, v) } + mapFromJson(value).foreach { case (k, v) => properties.setProperty(k, v) } properties }.orNull } - def UUIDFromJson(json: JValue): UUID = { - val leastSignificantBits = (json \ "Least Significant Bits").extract[Long] - val mostSignificantBits = (json \ "Most Significant Bits").extract[Long] + def UUIDFromJson(json: JsonNode): UUID = { + val leastSignificantBits = json.get("Least Significant Bits").extractLong + val mostSignificantBits = json.get("Most Significant Bits").extractLong new UUID(leastSignificantBits, mostSignificantBits) } - def stackTraceFromJson(json: JValue): Array[StackTraceElement] = { - json.extract[List[JValue]].map { line => - val declaringClass = (line \ "Declaring Class").extract[String] - val methodName = (line \ "Method Name").extract[String] - val fileName = (line \ "File Name").extract[String] - val lineNumber = (line \ "Line Number").extract[Int] + def stackTraceFromJson(json: JsonNode): Array[StackTraceElement] = { + json.extractElements.map { line => + val declaringClass = line.get("Declaring Class").extractString + val methodName = line.get("Method Name").extractString + val fileName = line.get("File Name").extractString + val lineNumber = line.get("Line Number").extractInt new StackTraceElement(declaringClass, methodName, fileName, lineNumber) }.toArray } - def exceptionFromJson(json: JValue): Exception = { - val e = new Exception((json \ "Message").extract[String]) - e.setStackTrace(stackTraceFromJson(json \ "Stack Trace")) + def exceptionFromJson(json: JsonNode): Exception = { + val e = new Exception(json.get("Message").extractString) + e.setStackTrace(stackTraceFromJson(json.get("Stack Trace"))) e } - /** Return an option that translates JNothing to None */ - private def jsonOption(json: JValue): Option[JValue] = { - json match { - case 
JNothing => None - case value: JValue => Some(value) + /** Return an option that translates NullNode to None */ + private def jsonOption(json: JsonNode): Option[JsonNode] = { + if (json == null || json.isNull) { + None + } else { + Some(json) } } - private def emptyJson: JObject = JObject(List[JField]()) + /** + * Implicit conversions to add methods to JsonNode that perform type-checking when + * reading fields. This ensures that JSON parsing will fail if we process JSON with + * unexpected input types (instead of silently falling back to default values). + */ + private implicit class JsonNodeImplicits(json: JsonNode) { + def extractElements: Iterator[JsonNode] = { + require(json.isContainerNode, s"Expected container, got ${json.getNodeType}") + json.elements.asScala + } + + def extractBoolean: Boolean = { + require(json.isBoolean, s"Expected boolean, got ${json.getNodeType}") + json.booleanValue + } + + def extractInt: Int = { + require(json.isNumber, s"Expected number, got ${json.getNodeType}") + json.intValue + } + + def extractLong: Long = { + require(json.isNumber, s"Expected number, got ${json.getNodeType}") + json.longValue + } + + def extractDouble: Double = { + require(json.isNumber, s"Expected number, got ${json.getNodeType}") + json.doubleValue + } + def extractString: String = { + require(json.isTextual, s"Expected string, got ${json.getNodeType}") + json.textValue + } + } } diff --git a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala index 44e338c6f0..a11ecc22d0 100644 --- a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala @@ -29,7 +29,7 @@ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ import org.apache.spark.launcher.SparkLauncher import org.apache.spark.security.GroupMappingServiceProvider -import org.apache.spark.util.{ResetSystemProperties, 
SparkConfWithEnv, Utils} +import org.apache.spark.util.{ResetSystemProperties, SparkConfWithEnv} class DummyGroupMappingServiceProvider extends GroupMappingServiceProvider { @@ -513,14 +513,5 @@ class SecurityManagerSuite extends SparkFunSuite with ResetSystemProperties { private def encodeFileAsBase64(secretFile: File) = { Base64.getEncoder.encodeToString(Files.readAllBytes(secretFile.toPath)) } - - private def withSecretFile(contents: String = "test-secret")(f: File => Unit): Unit = { - val secretDir = Utils.createTempDir("temp-secrets") - val secretFile = new File(secretDir, "temp-secret.txt") - Files.write(secretFile.toPath, contents.getBytes(UTF_8)) - try f(secretFile) finally { - Utils.deleteRecursively(secretDir) - } - } } diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala index 7922e13db6..b17aacc0a9 100644 --- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala @@ -18,7 +18,8 @@ package org.apache.spark import java.io.File -import java.nio.file.Path +import java.nio.charset.StandardCharsets.UTF_8 +import java.nio.file.{Files, Path} import java.util.{Locale, TimeZone} import scala.annotation.tailrec @@ -223,6 +224,19 @@ abstract class SparkFunSuite } } + /** + * Creates a temporary directory containing a secret file, which is then passed to `f` and + * will be deleted after `f` returns. 
+ */ + protected def withSecretFile(contents: String = "test-secret")(f: File => Unit): Unit = { + val secretDir = Utils.createTempDir("temp-secrets") + val secretFile = new File(secretDir, "temp-secret.txt") + Files.write(secretFile.toPath, contents.getBytes(UTF_8)) + try f(secretFile) finally { + Utils.deleteRecursively(secretDir) + } + } + /** * Adds a log appender and optionally sets a log level to the root logger or the logger with * the specified name, then executes the specified function, and in the end removes the log diff --git a/core/src/test/scala/org/apache/spark/benchmark/Benchmarks.scala b/core/src/test/scala/org/apache/spark/benchmark/Benchmarks.scala index 2bb70bc75f..9799eab113 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/Benchmarks.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/Benchmarks.scala @@ -96,8 +96,6 @@ object Benchmarks { require(args.length > 0, "Benchmark class to run should be specified.") if ( info.getName.endsWith("Benchmark") && - // TODO(SPARK-34927): Support TPCDSQueryBenchmark in Benchmarks - !info.getName.endsWith("TPCDSQueryBenchmark") && matcher.matches(Paths.get(info.getName)) && Try(runBenchmark).isSuccess && // Does this has a main method? !Modifier.isAbstract(clazz.getModifiers) // Is this a regular class? 
diff --git a/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala b/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala index b182b11a0e..f8cbaf8190 100644 --- a/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala +++ b/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala @@ -22,24 +22,53 @@ import java.io.File import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, WorkerInfo, WorkerResourceInfo} import org.apache.spark.deploy.worker.{DriverRunner, ExecutorRunner} -import org.apache.spark.resource.{ResourceInformation, ResourceRequirement} +import org.apache.spark.resource.{ExecutorResourceRequests, ResourceInformation, ResourceProfile, ResourceRequirement, TaskResourceRequests} import org.apache.spark.resource.ResourceUtils.{FPGA, GPU} private[deploy] object DeployTestUtils { - def createAppDesc(): ApplicationDescription = { + def defaultResourceProfile: ResourceProfile = { + createDefaultResourceProfile(1234) + } + + def createAppDesc(customResources: Map[String, Int] = Map.empty): ApplicationDescription = { val cmd = new Command("mainClass", List("arg1", "arg2"), Map(), Seq(), Seq(), Seq()) - new ApplicationDescription("name", Some(4), 1234, cmd, "appUiUrl") + val rp = createDefaultResourceProfile(1234, customResources) + new ApplicationDescription("name", Some(4), cmd, "appUiUrl", rp) } - def createAppInfo() : ApplicationInfo = { - val appDesc = createAppDesc() + def createAppInfo(): ApplicationInfo = { + val customResources = Map( + GPU -> 3, + FPGA -> 3) + val appDesc = createAppDesc(customResources) val appInfo = new ApplicationInfo(JsonConstants.appInfoStartTime, - "id", appDesc.copy(resourceReqsPerExecutor = createResourceRequirement), - JsonConstants.submitDate, null, Int.MaxValue) + "id", appDesc, JsonConstants.submitDate, null, Int.MaxValue) appInfo.endTime = JsonConstants.currTimeInMillis appInfo } + def createDefaultResourceProfile( 
+ memoryPerExecutorMb: Int, + customResources: Map[String, Int] = Map.empty, + coresPerExecutor: Option[Int] = None): ResourceProfile = { + val rp = createResourceProfile(Some(memoryPerExecutorMb), customResources, coresPerExecutor) + rp.setToDefaultProfile() + rp + } + + def createResourceProfile( + memoryPerExecutorMb: Option[Int] = None, + customResources: Map[String, Int] = Map.empty, + coresPerExecutor: Option[Int] = None): ResourceProfile = { + val treqs = new TaskResourceRequests().cpus(1) + val ereqs = new ExecutorResourceRequests() + memoryPerExecutorMb.foreach(value => ereqs.memory(s"${value}m")) + customResources.foreach { case (resource, amount) => + ereqs.resource(resource, amount) } + coresPerExecutor.foreach(ereqs.cores) + new ResourceProfile(ereqs.requests, treqs.requests) + } + def createDriverCommand(): Command = new Command( "org.apache.spark.FakeClass", Seq("WORKER_URL", "USER_JAR", "mainClass"), Map(("K1", "V1"), ("K2", "V2")), Seq("cp1", "cp2"), Seq("lp1", "lp2"), Seq("-Dfoo") @@ -89,6 +118,7 @@ private[deploy] object DeployTestUtils { new SparkConf, Seq("localDir"), ExecutorState.RUNNING, + ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID, resources) } @@ -113,6 +143,6 @@ private[deploy] object DeployTestUtils { } private def createResourceRequirement: Seq[ResourceRequirement] = { - Seq(ResourceRequirement("gpu", 3), ResourceRequirement("fpga", 3)) + Seq(ResourceRequirement(GPU, 3), ResourceRequirement(FPGA, 3)) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala index 7d3eb7c6b0..5e62323770 100644 --- a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala @@ -107,11 +107,11 @@ object JsonConstants { |{"id":"id","starttime":3,"name":"name", |"cores":0,"user":"%s", |"memoryperexecutor":1234, - |"resourcesperexecutor":[{"name":"gpu", - 
|"amount":3},{"name":"fpga","amount":3}], + |"resourcesperexecutor":[{"name":"fpga", + |"amount":3},{"name":"gpu","amount":3}], |"memoryperslave":1234, - |"resourcesperslave":[{"name":"gpu", - |"amount":3},{"name":"fpga","amount":3}], + |"resourcesperslave":[{"name":"fpga", + |"amount":3},{"name":"gpu","amount":3}], |"submitdate":"%s", |"state":"WAITING","duration":%d} """.format(System.getProperty("user.name", ""), diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index e47181719a..1a21fc3809 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -649,7 +649,7 @@ class StandaloneDynamicAllocationSuite override def receive: PartialFunction[Any, Unit] = testReceive.orElse(super.receive) private def testReceive: PartialFunction[Any, Unit] = synchronized { - case LaunchExecutor(_, appId, execId, _, _, _, _) => + case LaunchExecutor(_, appId, execId, _, _, _, _, _) => self.send(ExecutorStateChanged(appId, execId, ExecutorState.RUNNING, None, None)) } diff --git a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala index 93c0aa000e..7a67c4c8f7 100644 --- a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala @@ -26,11 +26,13 @@ import org.scalatest.BeforeAndAfterAll import org.scalatest.concurrent.{Eventually, ScalaFutures} import org.apache.spark._ -import org.apache.spark.deploy.{ApplicationDescription, Command} +import org.apache.spark.deploy.{ApplicationDescription, Command, DeployTestUtils} import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState, WorkerDecommissioning} import 
org.apache.spark.deploy.master.{ApplicationInfo, Master} import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.{config, Logging} +import org.apache.spark.resource.{ExecutorResourceRequests, ResourceProfileBuilder} +import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID import org.apache.spark.rpc.RpcEnv import org.apache.spark.scheduler.ExecutorDecommissionInfo import org.apache.spark.util.Utils @@ -166,6 +168,57 @@ class AppClientSuite } } + test("request executors with multi resource profiles") { + Utils.tryWithResource(new AppClientInst(masterRpcEnv.address.toSparkURL)) { ci => + ci.client.start() + + // Client should connect with one Master which registers the application + eventually(timeout(10.seconds), interval(10.millis)) { + val apps = getApplications() + assert(ci.listener.connectedIdList.size === 1, "client listener should have one connection") + assert(apps.size === 1, "master should have 1 registered app") + } + + // Send message to Master to request Executors with multiple resource profiles. 
+ val rpBuilder = new ResourceProfileBuilder() + val ereqs = new ExecutorResourceRequests() + ereqs.cores(5) + ereqs.memory("1024m") + rpBuilder.require(ereqs) + val rp = rpBuilder.build() + val resourceProfileToTotalExecs = Map( + ci.desc.defaultProfile -> 1, + rp -> 2 + ) + whenReady( + ci.client.requestTotalExecutors(resourceProfileToTotalExecs), + timeout(10.seconds), + interval(10.millis)) { acknowledged => + assert(acknowledged) + } + + eventually(timeout(10.seconds), interval(10.millis)) { + val app = getApplications().head + assert(app.getRequestedRPIds().length == 2) + assert(app.getResourceProfileById(DEFAULT_RESOURCE_PROFILE_ID) + === ci.desc.defaultProfile) + assert(app.getResourceProfileById(rp.id) === rp) + assert(app.getTargetExecutorNumForRPId(DEFAULT_RESOURCE_PROFILE_ID) === 1) + assert(app.getTargetExecutorNumForRPId(rp.id) === 2) + } + + // Issue stop command for Client to disconnect from Master + ci.client.stop() + + // Verify Client is marked dead and unregistered from Master + eventually(timeout(10.seconds), interval(10.millis)) { + val apps = getApplications() + assert(ci.listener.deadReasonList.size === 1, "client should have been marked dead") + assert(apps.isEmpty, "master should have 0 registered apps") + } + } + } + test("request from AppClient before initialized with master") { Utils.tryWithResource(new AppClientInst(masterRpcEnv.address.toSparkURL)) { ci => @@ -266,7 +319,9 @@ class AppClientSuite val rpcEnv = RpcEnv.create("spark", Utils.localHostName(), 0, conf, securityManager) private val cmd = new Command(TestExecutor.getClass.getCanonicalName.stripSuffix("$"), List(), Map(), Seq(), Seq(), Seq()) - private val desc = new ApplicationDescription("AppClientSuite", Some(1), 512, cmd, "ignored") + private val defaultRp = DeployTestUtils.createDefaultResourceProfile(512) + val desc = + ApplicationDescription("AppClientSuite", Some(1), cmd, "ignored", defaultRp) val listener = new AppClientCollector val client = new 
StandaloneAppClient(rpcEnv, Array(masterUrl), desc, listener, new SparkConf) diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala index 7d07af4d72..5297ac5aac 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala @@ -21,7 +21,6 @@ import scala.collection.mutable import scala.io.{Codec, Source} import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} -import org.json4s.jackson.JsonMethods.parse import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil @@ -162,7 +161,7 @@ class EventLogFileCompactorSuite extends SparkFunSuite { val lines = Source.fromInputStream(is)(Codec.UTF8).getLines().toList assert(lines.length === 2, "Compacted file should have only two events being accepted") lines.foreach { line => - val event = JsonProtocol.sparkEventFromJson(parse(line)) + val event = JsonProtocol.sparkEventFromJson(line) assert(!event.isInstanceOf[SparkListenerJobStart] && !event.isInstanceOf[SparkListenerJobEnd]) } diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala index 298fd65f29..a68086256d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala @@ -22,7 +22,6 @@ import java.nio.charset.StandardCharsets import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} -import org.json4s.jackson.JsonMethods.{compact, render} import org.apache.spark.SparkConf import org.apache.spark.internal.config._ @@ -107,6 +106,6 @@ object EventLogTestHelper { } def convertEvent(event: SparkListenerEvent): String = 
{ - compact(render(JsonProtocol.sparkEventToJson(event))) + JsonProtocol.sparkEventToJsonString(event) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 541b283c13..afe8c61a52 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -30,7 +30,6 @@ import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.{FileStatus, FileSystem, FSDataInputStream, Path} import org.apache.hadoop.hdfs.{DFSInputStream, DistributedFileSystem} import org.apache.hadoop.security.AccessControlException -import org.json4s.jackson.JsonMethods._ import org.mockito.ArgumentMatchers.{any, argThat} import org.mockito.Mockito.{doThrow, mock, spy, verify, when} import org.scalatest.PrivateMethodTester @@ -1730,13 +1729,13 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with L val bstream = new BufferedOutputStream(cstream) val metadata = SparkListenerLogStart(org.apache.spark.SPARK_VERSION) - val eventJson = JsonProtocol.logStartToJson(metadata) - val metadataJson = compact(eventJson) + "\n" + val eventJsonString = JsonProtocol.sparkEventToJsonString(metadata) + val metadataJson = eventJsonString + "\n" bstream.write(metadataJson.getBytes(StandardCharsets.UTF_8)) val writer = new OutputStreamWriter(bstream, StandardCharsets.UTF_8) Utils.tryWithSafeFinally { - events.foreach(e => writer.write(compact(render(JsonProtocol.sparkEventToJson(e))) + "\n")) + events.foreach(e => writer.write(JsonProtocol.sparkEventToJsonString(e) + "\n")) } { writer.close() } diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala index b66b39c3c0..1cec863b1e 100644 --- 
a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala @@ -30,7 +30,8 @@ import scala.reflect.ClassTag import org.json4s._ import org.json4s.jackson.JsonMethods._ -import org.mockito.Mockito.{mock, when} +import org.mockito.ArgumentMatchers.any +import org.mockito.Mockito.{doNothing, mock, when} import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.scalatest.concurrent.Eventually import org.scalatest.matchers.must.Matchers @@ -44,7 +45,8 @@ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Deploy._ import org.apache.spark.internal.config.UI._ import org.apache.spark.internal.config.Worker._ -import org.apache.spark.resource.{ResourceInformation, ResourceRequirement} +import org.apache.spark.resource.{ResourceInformation, ResourceProfile, ResourceRequirement} +import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID import org.apache.spark.resource.ResourceUtils.{FPGA, GPU} import org.apache.spark.rpc.{RpcAddress, RpcEndpoint, RpcEndpointRef, RpcEnv} import org.apache.spark.serializer @@ -78,11 +80,13 @@ class MockWorker(master: RpcEndpointRef, conf: SparkConf = new SparkConf) extend val drivers = mutable.HashSet[String]() val driverResources = new mutable.HashMap[String, Map[String, Set[String]]] val execResources = new mutable.HashMap[String, Map[String, Set[String]]] + val launchedExecutors = new mutable.HashMap[String, LaunchExecutor] override def receive: PartialFunction[Any, Unit] = { case RegisteredWorker(masterRef, _, _, _) => masterRef.send(WorkerLatestState(id, Nil, drivers.toSeq)) - case LaunchExecutor(_, appId, execId, _, _, _, resources_) => + case l @ LaunchExecutor(_, appId, execId, _, _, _, _, resources_) => execResources(appId + "/" + execId) = resources_.map(r => (r._1, r._2.addresses.toSet)) + launchedExecutors(appId + "/" + execId) = l case LaunchDriver(driverId, desc, resources_) => 
drivers += driverId driverResources(driverId) = resources_.map(r => (r._1, r._2.addresses.toSet)) @@ -126,7 +130,7 @@ class MockExecutorLaunchFailWorker(master: Master, conf: SparkConf = new SparkCo } appRegistered.countDown() - case LaunchExecutor(_, appId, execId, _, _, _, _) => + case LaunchExecutor(_, appId, execId, _, _, _, _, _) => assert(appRegistered.await(10, TimeUnit.SECONDS)) if (failedCnt == 0) { @@ -135,7 +139,8 @@ class MockExecutorLaunchFailWorker(master: Master, conf: SparkConf = new SparkCo assert(master.idToApp.contains(appId)) appIdsToLaunchExecutor += appId failedCnt += 1 - master.self.askSync(ExecutorStateChanged(appId, execId, ExecutorState.FAILED, None, None)) + master.self.askSync(ExecutorStateChanged(appId, execId, + ExecutorState.FAILED, None, None)) case otherMsg => super.receive(otherMsg) } @@ -181,12 +186,11 @@ class MasterSuite extends SparkFunSuite desc = new ApplicationDescription( name = "", maxCores = None, - memoryPerExecutorMB = 0, command = commandToPersist, appUiUrl = "", + defaultProfile = DeployTestUtils.defaultResourceProfile, eventLogDir = None, - eventLogCodec = None, - coresPerExecutor = None), + eventLogCodec = None), submitDate = new Date(), driver = null, defaultCores = 0 @@ -281,9 +285,10 @@ class MasterSuite extends SparkFunSuite master.workers should be(Set(fakeWorkerInfo)) // Notify Master about the executor and driver info to make it correctly recovered. 
+ val rpId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID val fakeExecutors = List( - new ExecutorDescription(fakeAppInfo.id, 0, 8, ExecutorState.RUNNING), - new ExecutorDescription(fakeAppInfo.id, 0, 7, ExecutorState.RUNNING)) + new ExecutorDescription(fakeAppInfo.id, 0, rpId, 8, 1024, ExecutorState.RUNNING), + new ExecutorDescription(fakeAppInfo.id, 0, rpId, 7, 1024, ExecutorState.RUNNING)) fakeAppInfo.state should be(ApplicationState.UNKNOWN) fakeWorkerInfo.coresFree should be(16) @@ -530,6 +535,97 @@ class MasterSuite extends SparkFunSuite schedulingWithEverything(spreadOut = false) } + test("scheduling for app with multiple resource profiles") { + scheduleExecutorsForAppWithMultiRPs(withMaxCores = false) + } + + test("scheduling for app with multiple resource profiles with max cores") { + scheduleExecutorsForAppWithMultiRPs(withMaxCores = true) + } + + private def scheduleExecutorsForAppWithMultiRPs(withMaxCores: Boolean): Unit = { + val appInfo: ApplicationInfo = if (withMaxCores) { + makeAppInfo( + 1024, maxCores = Some(30), initialExecutorLimit = Some(0)) + } else { + makeAppInfo( + 1024, maxCores = None, initialExecutorLimit = Some(0)) + } + + val master = makeAliveMaster() + val conf = new SparkConf() + val workers = (1 to 4).map { idx => + val worker = new MockWorker(master.self, conf) + worker.rpcEnv.setupEndpoint(s"worker-$idx", worker) + val workerReg = RegisterWorker( + worker.id, + "localhost", + worker.self.address.port, + worker.self, + 10, + 4096, + "http://localhost:8080", + RpcAddress("localhost", 10000)) + master.self.send(workerReg) + worker + } + + // Register app and schedule. + master.registerApplication(appInfo) + startExecutorsOnWorkers(master) + assert(appInfo.executors.isEmpty) + + // Request executors with multiple resource profile. + // rp1 with 15 cores per executor, rp2 with 8192MB memory per executor, no worker can + // fulfill the resource requirement. 
+ val rp1 = DeployTestUtils.createResourceProfile(Some(2048), Map.empty, Some(15)) + val rp2 = DeployTestUtils.createResourceProfile(Some(8192), Map.empty, Some(5)) + val rp3 = DeployTestUtils.createResourceProfile(Some(2048), Map.empty, Some(5)) + val rp4 = DeployTestUtils.createResourceProfile(Some(2048), Map.empty, Some(10)) + val requests = Map( + appInfo.desc.defaultProfile -> 1, + rp1 -> 1, + rp2 -> 1, + rp3 -> 1, + rp4 -> 2 + ) + eventually(timeout(10.seconds)) { + master.self.askSync[Boolean](RequestExecutors(appInfo.id, requests)) + assert(appInfo.executors.size === workers.map(_.launchedExecutors.size).sum) + } + + if (withMaxCores) { + assert(appInfo.executors.size === 3) + assert(appInfo.getOrUpdateExecutorsForRPId(DEFAULT_RESOURCE_PROFILE_ID).size === 1) + assert(appInfo.getOrUpdateExecutorsForRPId(rp1.id).size === 0) + assert(appInfo.getOrUpdateExecutorsForRPId(rp2.id).size === 0) + assert(appInfo.getOrUpdateExecutorsForRPId(rp3.id).size === 1) + assert(appInfo.getOrUpdateExecutorsForRPId(rp4.id).size === 1) + } else { + assert(appInfo.executors.size === 4) + assert(appInfo.getOrUpdateExecutorsForRPId(DEFAULT_RESOURCE_PROFILE_ID).size === 1) + assert(appInfo.getOrUpdateExecutorsForRPId(rp1.id).size === 0) + assert(appInfo.getOrUpdateExecutorsForRPId(rp2.id).size === 0) + assert(appInfo.getOrUpdateExecutorsForRPId(rp3.id).size === 1) + assert(appInfo.getOrUpdateExecutorsForRPId(rp4.id).size === 2) + } + + // Verify executor information. + val executorForRp3 = appInfo.executors(appInfo.getOrUpdateExecutorsForRPId(rp3.id).head) + assert(executorForRp3.cores === 5) + assert(executorForRp3.memory === 2048) + assert(executorForRp3.rpId === rp3.id) + + // Verify LaunchExecutor message. 
+ val launchExecutorMsg = workers + .find(_.id === executorForRp3.worker.id) + .map(_.launchedExecutors(appInfo.id + "/" + executorForRp3.id)) + .get + assert(launchExecutorMsg.cores === 5) + assert(launchExecutorMsg.memory === 2048) + assert(launchExecutorMsg.rpId === rp3.id) + } + private def basicScheduling(spreadOut: Boolean): Unit = { val master = makeMaster() val appInfo = makeAppInfo(1024) @@ -595,11 +691,11 @@ class MasterSuite extends SparkFunSuite private def schedulingWithExecutorLimit(spreadOut: Boolean): Unit = { val master = makeMaster() val appInfo = makeAppInfo(256) - appInfo.executorLimit = 0 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 0)) val scheduledCores1 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) - appInfo.executorLimit = 2 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 2)) val scheduledCores2 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) - appInfo.executorLimit = 5 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 5)) val scheduledCores3 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) assert(scheduledCores1 === Array(0, 0, 0)) assert(scheduledCores2 === Array(10, 10, 0)) @@ -609,11 +705,11 @@ class MasterSuite extends SparkFunSuite private def schedulingWithExecutorLimitAndMaxCores(spreadOut: Boolean): Unit = { val master = makeMaster() val appInfo = makeAppInfo(256, maxCores = Some(16)) - appInfo.executorLimit = 0 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 0)) val scheduledCores1 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) - appInfo.executorLimit = 2 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 2)) val scheduledCores2 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) - appInfo.executorLimit = 5 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 5)) val scheduledCores3 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, 
spreadOut) assert(scheduledCores1 === Array(0, 0, 0)) if (spreadOut) { @@ -628,11 +724,11 @@ class MasterSuite extends SparkFunSuite private def schedulingWithExecutorLimitAndCoresPerExecutor(spreadOut: Boolean): Unit = { val master = makeMaster() val appInfo = makeAppInfo(256, coresPerExecutor = Some(4)) - appInfo.executorLimit = 0 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 0)) val scheduledCores1 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) - appInfo.executorLimit = 2 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 2)) val scheduledCores2 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) - appInfo.executorLimit = 5 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 5)) val scheduledCores3 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) assert(scheduledCores1 === Array(0, 0, 0)) if (spreadOut) { @@ -647,11 +743,11 @@ class MasterSuite extends SparkFunSuite private def schedulingWithEverything(spreadOut: Boolean): Unit = { val master = makeMaster() val appInfo = makeAppInfo(256, coresPerExecutor = Some(4), maxCores = Some(18)) - appInfo.executorLimit = 0 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 0)) val scheduledCores1 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) - appInfo.executorLimit = 2 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 2)) val scheduledCores2 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) - appInfo.executorLimit = 5 + appInfo.requestExecutors(Map(appInfo.desc.defaultProfile -> 5)) val scheduledCores3 = scheduleExecutorsOnWorkers(master, appInfo, workerInfos, spreadOut) assert(scheduledCores1 === Array(0, 0, 0)) if (spreadOut) { @@ -669,6 +765,8 @@ class MasterSuite extends SparkFunSuite private val _scheduleExecutorsOnWorkers = PrivateMethod[Array[Int]](Symbol("scheduleExecutorsOnWorkers")) + private val _startExecutorsOnWorkers = + 
PrivateMethod[Unit](Symbol("startExecutorsOnWorkers")) private val _drivers = PrivateMethod[HashSet[DriverInfo]](Symbol("drivers")) private val _state = PrivateMethod[RecoveryState.Value](Symbol("state")) @@ -696,13 +794,19 @@ class MasterSuite extends SparkFunSuite private def makeAppInfo( memoryPerExecutorMb: Int, coresPerExecutor: Option[Int] = None, - maxCores: Option[Int] = None): ApplicationInfo = { + maxCores: Option[Int] = None, + customResources: Map[String, Int] = Map.empty, + initialExecutorLimit: Option[Int] = None): ApplicationInfo = { + val rp = DeployTestUtils.createDefaultResourceProfile( + memoryPerExecutorMb, customResources, coresPerExecutor) + val desc = new ApplicationDescription( - "test", maxCores, memoryPerExecutorMb, null, "", None, None, coresPerExecutor) + "test", maxCores, null, "", rp, None, None, initialExecutorLimit) val appId = System.currentTimeMillis.toString val endpointRef = mock(classOf[RpcEndpointRef]) val mockAddress = mock(classOf[RpcAddress]) when(endpointRef.address).thenReturn(mockAddress) + doNothing().when(endpointRef).send(any()) new ApplicationInfo(0, appId, desc, new Date, endpointRef, Int.MaxValue) } @@ -715,12 +819,19 @@ class MasterSuite extends SparkFunSuite endpointRef, "http://localhost:80", Map.empty) } + // Schedule executors for default resource profile. 
private def scheduleExecutorsOnWorkers( master: Master, appInfo: ApplicationInfo, workerInfos: Array[WorkerInfo], spreadOut: Boolean): Array[Int] = { - master.invokePrivate(_scheduleExecutorsOnWorkers(appInfo, workerInfos, spreadOut)) + val defaultResourceDesc = appInfo.getResourceDescriptionForRpId(DEFAULT_RESOURCE_PROFILE_ID) + master.invokePrivate(_scheduleExecutorsOnWorkers( + appInfo, DEFAULT_RESOURCE_PROFILE_ID, defaultResourceDesc, workerInfos, spreadOut)) + } + + private def startExecutorsOnWorkers(master: Master): Unit = { + master.invokePrivate(_startExecutorsOnWorkers()) } test("SPARK-13604: Master should ask Worker kill unknown executors and drivers") { @@ -746,7 +857,8 @@ class MasterSuite extends SparkFunSuite "http://localhost:8080", RpcAddress("localhost", 9999))) val executors = (0 until 3).map { i => - new ExecutorDescription(appId = i.toString, execId = i, 2, ExecutorState.RUNNING) + new ExecutorDescription(appId = i.toString, execId = i, + ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID, 2, 1024, ExecutorState.RUNNING) } master.self.send(WorkerLatestState("1", executors, driverIds = Seq("0", "1", "2"))) @@ -1026,7 +1138,7 @@ class MasterSuite extends SparkFunSuite val masterRef = master.self val resourceReqs = Seq(ResourceRequirement(GPU, 3), ResourceRequirement(FPGA, 3)) val worker = makeWorkerAndRegister(masterRef, Map(GPU -> 6, FPGA -> 6)) - worker.appDesc = worker.appDesc.copy(resourceReqsPerExecutor = resourceReqs) + worker.appDesc = DeployTestUtils.createAppDesc(Map(GPU -> 3, FPGA -> 3)) val driver = DeployTestUtils.createDriverDesc().copy(resourceReqs = resourceReqs) val driverId = masterRef.askSync[SubmitDriverResponse](RequestSubmitDriver(driver)).driverId val status = masterRef.askSync[DriverStatusResponse](RequestDriverStatus(driverId.get)) @@ -1053,6 +1165,37 @@ class MasterSuite extends SparkFunSuite } } + test("resource description with multiple resource profiles") { + val appInfo = makeAppInfo(1024, Some(4), None, Map(GPU -> 2)) + val 
rp1 = DeployTestUtils.createResourceProfile(None, Map(FPGA -> 2), None) + val rp2 = DeployTestUtils.createResourceProfile(Some(2048), Map(GPU -> 3, FPGA -> 3), Some(2)) + + val resourceProfileToTotalExecs = Map( + appInfo.desc.defaultProfile -> 1, + rp1 -> 2, + rp2 -> 3 + ) + appInfo.requestExecutors(resourceProfileToTotalExecs) + + // Default resource profile takes its own resource request. + var resourceDesc = appInfo.getResourceDescriptionForRpId(DEFAULT_RESOURCE_PROFILE_ID) + assert(resourceDesc.memoryMbPerExecutor === 1024) + assert(resourceDesc.coresPerExecutor === Some(4)) + assert(resourceDesc.customResourcesPerExecutor === Seq(ResourceRequirement(GPU, 2))) + + // Non-default resource profiles take cores and memory from default profile if not specified. + resourceDesc = appInfo.getResourceDescriptionForRpId(rp1.id) + assert(resourceDesc.memoryMbPerExecutor === 1024) + assert(resourceDesc.coresPerExecutor === Some(4)) + assert(resourceDesc.customResourcesPerExecutor === Seq(ResourceRequirement(FPGA, 2))) + + resourceDesc = appInfo.getResourceDescriptionForRpId(rp2.id) + assert(resourceDesc.memoryMbPerExecutor === 2048) + assert(resourceDesc.coresPerExecutor === Some(2)) + assert(resourceDesc.customResourcesPerExecutor === + Seq(ResourceRequirement(FPGA, 3), ResourceRequirement(GPU, 3))) + } + private def getDrivers(master: Master): HashSet[DriverInfo] = { master.invokePrivate(_drivers()) } diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala index 988c65fd20..650d6f594c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala @@ -20,20 +20,22 @@ package org.apache.spark.deploy.worker import java.io.File import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} -import org.apache.spark.deploy.{ApplicationDescription, Command,
ExecutorState} +import org.apache.spark.deploy.{ApplicationDescription, Command, DeployTestUtils, ExecutorState} +import org.apache.spark.resource.ResourceProfile class ExecutorRunnerTest extends SparkFunSuite { test("command includes appId") { val appId = "12345-worker321-9876" - val conf = new SparkConf + val conf = new SparkConf() val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) - val appDesc = new ApplicationDescription("app name", Some(8), 500, - Command("foo", Seq(appId), Map(), Seq(), Seq(), Seq()), "appUiUrl") - val er = new ExecutorRunner(appId, 1, appDesc, 8, 500, null, "blah", "http://", "worker321", + val appDesc = new ApplicationDescription("app name", Some(8), + Command("foo", Seq(appId), Map(), Seq(), Seq(), Seq()), "appUiUrl", + DeployTestUtils.defaultResourceProfile) + val er = new ExecutorRunner(appId, 1, appDesc, 8, 1234, null, "blah", "http://", "worker321", 123, "publicAddr", new File(sparkHome), new File("ooga"), "blah", conf, Seq("localDir"), - ExecutorState.RUNNING) + ExecutorState.RUNNING, ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val builder = CommandUtils.buildProcessBuilder( - appDesc.command, new SecurityManager(conf), 512, sparkHome, er.substituteVariables) + appDesc.command, new SecurityManager(conf), 1234, sparkHome, er.substituteVariables) val builderCommand = builder.command() assert(builderCommand.get(builderCommand.size() - 1) === appId) } diff --git a/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala b/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala index 0d4be5b1d3..80dc4ff758 100644 --- a/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.internal.config._ import org.apache.spark.metrics.sink.Sink import org.apache.spark.metrics.source.{Source, StaticSources} -class MetricsSystemSuite extends SparkFunSuite 
with BeforeAndAfter with PrivateMethodTester{ +class MetricsSystemSuite extends SparkFunSuite with BeforeAndAfter with PrivateMethodTester { var filePath: String = _ var conf: SparkConf = null var securityMgr: SecurityManager = null diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala index 65e41986ff..aa00813560 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala @@ -68,7 +68,8 @@ class ResourceProfileManagerSuite extends SparkFunSuite { }.getMessage() assert(error.contains( - "ResourceProfiles are only supported on YARN and Kubernetes with dynamic allocation")) + "ResourceProfiles are only supported on YARN and Kubernetes and Standalone" + + " with dynamic allocation")) } test("isSupported yarn with dynamic allocation") { @@ -100,6 +101,21 @@ class ResourceProfileManagerSuite extends SparkFunSuite { assert(rpmanager.isSupported(immrprof) == true) } + test("isSupported standalone with dynamic allocation") { + val conf = new SparkConf().setMaster("spark://foo").set(EXECUTOR_CORES, 4) + conf.set(DYN_ALLOCATION_ENABLED, true) + conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED, true) + conf.set(RESOURCE_PROFILE_MANAGER_TESTING.key, "true") + val rpmanager = new ResourceProfileManager(conf, listenerBus) + // default profile should always work + val defaultProf = rpmanager.defaultResourceProfile + val rprof = new ResourceProfileBuilder() + val gpuExecReq = + new ExecutorResourceRequests().resource("gpu", 2, "someScript") + val immrprof = rprof.require(gpuExecReq).build() + assert(rpmanager.isSupported(immrprof)) + } + test("isSupported with local mode") { val conf = new SparkConf().setMaster("local").set(EXECUTOR_CORES, 4) conf.set(RESOURCE_PROFILE_MANAGER_TESTING.key, "true") @@ -115,7 +131,8 @@ class 
ResourceProfileManagerSuite extends SparkFunSuite { }.getMessage() assert(error.contains( - "ResourceProfiles are only supported on YARN and Kubernetes with dynamic allocation")) + "ResourceProfiles are only supported on YARN and Kubernetes and Standalone" + + " with dynamic allocation")) } test("ResourceProfileManager has equivalent profile") { diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala index 27cc44a099..6c36f5c855 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala @@ -65,6 +65,42 @@ class ResourceProfileSuite extends SparkFunSuite { "Task resources should have 1 cpu") } + test("Executor cores should be None by default for standalone cluster") { + val sparkConf = new SparkConf() + .setMaster("spark://ut.cluster") + .remove(EXECUTOR_CORES.key) + val rprof = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + assert(rprof.id === ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + assert(!rprof.executorResources.contains(ResourceProfile.CORES), + "Executor cores should be None by default for standalone cluster") + assert(rprof.getExecutorCores.isEmpty, + "Executor cores should be None by default for standalone cluster") + } + + test("Get resource for standalone cluster") { + val sparkConf = new SparkConf() + .setMaster("spark://ut.cluster") + .remove(EXECUTOR_CORES.key) + val defaultExecutorResource = ResourceProfile.getDefaultProfileExecutorResources(sparkConf) + assert(defaultExecutorResource.cores.isEmpty) + assert(defaultExecutorResource.executorMemoryMiB === 1024L) + assert(defaultExecutorResource.memoryOffHeapMiB === 0L) + assert(defaultExecutorResource.memoryOverheadMiB.isEmpty) + assert(defaultExecutorResource.pysparkMemoryMiB.isEmpty) + assert(defaultExecutorResource.customResources.isEmpty) + + val rpBuilder = new 
ResourceProfileBuilder() + val taskReq = new TaskResourceRequests().resource("cpu", 2) + val execReq = + new ExecutorResourceRequests().cores(4) + val rp = rpBuilder.require(taskReq).require(execReq).build() + val executorResourceForRp = ResourceProfile.getResourcesForClusterManager( + rp.id, rp.executorResources, 0.0, sparkConf, false, Map.empty) + // Standalone cluster only takes cores and executor memory as built-in resources. + assert(executorResourceForRp.cores.get === 4) + assert(executorResourceForRp.executorMemoryMiB === 1024L) + } + test("Default ResourceProfile with app level resources specified") { val conf = new SparkConf conf.set(PYSPARK_EXECUTOR_MEMORY.key, "2g") diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index d790c35a33..2a5a44ad78 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -26,7 +26,6 @@ import scala.collection.mutable.Set import scala.io.{Codec, Source} import org.apache.hadoop.fs.Path -import org.json4s.jackson.JsonMethods._ import org.mockito.Mockito import org.scalatest.BeforeAndAfter @@ -140,7 +139,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit assert(lines(2).contains("SparkListenerJobStart")) lines.foreach{ - line => JsonProtocol.sparkEventFromJson(parse(line)) match { + line => JsonProtocol.sparkEventFromJson(line) match { case logStartEvent: SparkListenerLogStart => assert(logStartEvent == logStart) @@ -180,7 +179,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit sc.stop() val eventLogStream = EventLogFileReader.openEventLog(new Path(testDirPath, appId), fileSystem) - val events = readLines(eventLogStream).map(line => JsonProtocol.sparkEventFromJson(parse(line))) + val events =
readLines(eventLogStream).map(line => JsonProtocol.sparkEventFromJson(line)) val jobStartEvents = events .filter(event => event.isInstanceOf[SparkListenerJobStart]) .map(_.asInstanceOf[SparkListenerJobStart]) @@ -248,9 +247,9 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit assert(lines(0).contains("SparkListenerLogStart")) assert(lines(1).contains("SparkListenerApplicationStart")) assert(lines(2).contains("SparkListenerApplicationEnd")) - assert(JsonProtocol.sparkEventFromJson(parse(lines(0))) === logStart) - assert(JsonProtocol.sparkEventFromJson(parse(lines(1))) === applicationStart) - assert(JsonProtocol.sparkEventFromJson(parse(lines(2))) === applicationEnd) + assert(JsonProtocol.sparkEventFromJson(lines(0)) === logStart) + assert(JsonProtocol.sparkEventFromJson(lines(1)) === applicationStart) + assert(JsonProtocol.sparkEventFromJson(lines(2)) === applicationEnd) } finally { logData.close() } @@ -307,7 +306,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit lines.foreach { line => eventSet.foreach { event => if (line.contains(event)) { - val parsedEvent = JsonProtocol.sparkEventFromJson(parse(line)) + val parsedEvent = JsonProtocol.sparkEventFromJson(line) val eventType = Utils.getFormattedClassName(parsedEvent) if (eventType == event) { eventSet.remove(event) @@ -315,7 +314,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit } } } - assert(JsonProtocol.sparkEventFromJson(parse(lines(0))) === logStart) + assert(JsonProtocol.sparkEventFromJson(lines(0)) === logStart) assert(eventSet.isEmpty, "The following events are missing: " + eventSet.toSeq) } { logData.close() @@ -518,7 +517,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit assert(lines.size === 25) assert(lines(0).contains("SparkListenerLogStart")) assert(lines(1).contains("SparkListenerApplicationStart")) - assert(JsonProtocol.sparkEventFromJson(parse(lines(0))) === 
logStart) + assert(JsonProtocol.sparkEventFromJson(lines(0)) === logStart) var logIdx = 1 events.foreach { event => event match { @@ -609,7 +608,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit /** Check that the Spark history log line matches the expected event. */ private def checkEvent(line: String, event: SparkListenerEvent): Unit = { assert(line.contains(event.getClass.toString.split("\\.").last)) - val parsed = JsonProtocol.sparkEventFromJson(parse(line)) + val parsed = JsonProtocol.sparkEventFromJson(line) assert(parsed.getClass === event.getClass) (event, parsed) match { case (expected: SparkListenerStageSubmitted, actual: SparkListenerStageSubmitted) => @@ -641,7 +640,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit line: String, stageId: Int, expectedEvents: Map[(Int, String), SparkListenerStageExecutorMetrics]): String = { - JsonProtocol.sparkEventFromJson(parse(line)) match { + JsonProtocol.sparkEventFromJson(line) match { case executorMetrics: SparkListenerStageExecutorMetrics => expectedEvents.get((stageId, executorMetrics.execId)) match { case Some(expectedMetrics) => diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala index cb50c7c959..77d9ae88fb 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala @@ -24,8 +24,6 @@ import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path -import org.json4s.JsonAST.JValue -import org.json4s.jackson.JsonMethods._ import org.scalatest.BeforeAndAfter import org.apache.spark._ @@ -60,8 +58,8 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp val applicationEnd = SparkListenerApplicationEnd(1000L) Utils.tryWithResource(new 
PrintWriter(fwriter)) { writer => // scalastyle:off println - writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationStart)))) - writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationEnd)))) + writer.println(JsonProtocol.sparkEventToJsonString(applicationStart)) + writer.println(JsonProtocol.sparkEventToJsonString(applicationEnd)) // scalastyle:on println } @@ -76,8 +74,8 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp logData.close() } assert(eventMonster.loggedEvents.size === 2) - assert(eventMonster.loggedEvents(0) === JsonProtocol.sparkEventToJson(applicationStart)) - assert(eventMonster.loggedEvents(1) === JsonProtocol.sparkEventToJson(applicationEnd)) + assert(eventMonster.loggedEvents(0) === JsonProtocol.sparkEventToJsonString(applicationStart)) + assert(eventMonster.loggedEvents(1) === JsonProtocol.sparkEventToJsonString(applicationEnd)) } /** @@ -99,8 +97,8 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp val applicationEnd = SparkListenerApplicationEnd(1000L) // scalastyle:off println - writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationStart)))) - writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationEnd)))) + writer.println(JsonProtocol.sparkEventToJsonString(applicationStart)) + writer.println(JsonProtocol.sparkEventToJsonString(applicationEnd)) // scalastyle:on println } @@ -144,9 +142,9 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp val applicationEnd = SparkListenerApplicationEnd(1000L) Utils.tryWithResource(new PrintWriter(fwriter)) { writer => // scalastyle:off println - writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationStart)))) + writer.println(JsonProtocol.sparkEventToJsonString(applicationStart)) writer.println("""{"Event":"UnrecognizedEventOnlyForTest","Timestamp":1477593059313}""") - 
writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationEnd)))) + writer.println(JsonProtocol.sparkEventToJsonString(applicationEnd)) // scalastyle:on println } @@ -161,8 +159,8 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp logData.close() } assert(eventMonster.loggedEvents.size === 2) - assert(eventMonster.loggedEvents(0) === JsonProtocol.sparkEventToJson(applicationStart)) - assert(eventMonster.loggedEvents(1) === JsonProtocol.sparkEventToJson(applicationEnd)) + assert(eventMonster.loggedEvents(0) === JsonProtocol.sparkEventToJsonString(applicationStart)) + assert(eventMonster.loggedEvents(1) === JsonProtocol.sparkEventToJsonString(applicationEnd)) } // This assumes the correctness of EventLoggingListener @@ -226,9 +224,9 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp // Verify the same events are replayed in the same order assert(sc.eventLogger.isDefined) val originalEvents = sc.eventLogger.get.loggedEvents - .map(JsonProtocol.sparkEventFromJson(_)) + .map(JsonProtocol.sparkEventFromJson) val replayedEvents = eventMonster.loggedEvents - .map(JsonProtocol.sparkEventFromJson(_)) + .map(JsonProtocol.sparkEventFromJson) originalEvents.zip(replayedEvents).foreach { case (e1, e2) => JsonProtocolSuite.assertEquals(e1, e1) } @@ -245,10 +243,10 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp */ private class EventBufferingListener extends SparkFirehoseListener { - private[scheduler] val loggedEvents = new ArrayBuffer[JValue] + private[scheduler] val loggedEvents = new ArrayBuffer[String] override def onEvent(event: SparkListenerEvent): Unit = { - val eventJson = JsonProtocol.sparkEventToJson(event) + val eventJson = JsonProtocol.sparkEventToJsonString(event) loggedEvents += eventJson } } diff --git a/core/src/test/scala/org/apache/spark/shuffle/HostLocalShuffleReadingSuite.scala 
b/core/src/test/scala/org/apache/spark/shuffle/HostLocalShuffleReadingSuite.scala index 4e74036e11..571f57a6d6 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/HostLocalShuffleReadingSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/HostLocalShuffleReadingSuite.scala @@ -38,13 +38,13 @@ class HostLocalShuffleReadingSuite extends SparkFunSuite with Matchers with Loca override def afterEach(): Unit = { Option(rpcHandler).foreach { handler => - Utils.tryLogNonFatalError{ + Utils.tryLogNonFatalError { server.close() } - Utils.tryLogNonFatalError{ + Utils.tryLogNonFatalError { handler.close() } - Utils.tryLogNonFatalError{ + Utils.tryLogNonFatalError { transportContext.close() } server = null diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 7a18223ec5..7f93051680 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -21,7 +21,10 @@ import java.util.Properties import scala.collection.JavaConverters._ import scala.collection.Map +import scala.language.implicitConversions +import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper} +import com.fasterxml.jackson.databind.node.{ObjectNode, TextNode} import org.json4s.JsonAST.{JArray, JInt, JString, JValue} import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ @@ -33,17 +36,21 @@ import org.apache.spark.executor._ import org.apache.spark.metrics.ExecutorMetricType import org.apache.spark.rdd.{DeterministicLevel, RDDOperationScope} import org.apache.spark.resource._ +import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.shuffle.MetadataFetchFailedException import org.apache.spark.storage._ class JsonProtocolSuite extends SparkFunSuite { + import 
JsonProtocol.toJsonString import JsonProtocolSuite._ test("SparkListenerEvent") { val stageSubmitted = SparkListenerStageSubmitted(makeStageInfo(100, 200, 300, 400L, 500L), properties) + val stageSubmittedWithNullProperties = + SparkListenerStageSubmitted(makeStageInfo(100, 200, 300, 400L, 500L), properties = null) val stageCompleted = SparkListenerStageCompleted(makeStageInfo(101, 201, 301, 401L, 501L)) val taskStart = SparkListenerTaskStart(111, 0, makeTaskInfo(222L, 333, 1, 333, 444L, false)) val taskGettingResult = @@ -72,6 +79,9 @@ class JsonProtocolSuite extends SparkFunSuite { makeStageInfo(x, x * 200, x * 300, x * 400L, x * 500L)) SparkListenerJobStart(10, jobSubmissionTime, stageInfos, properties) } + val jobStartWithNullProperties = { + SparkListenerJobStart(10, jobSubmissionTime, stageInfos = Seq.empty, properties = null) + } val jobEnd = SparkListenerJobEnd(20, jobCompletionTime, JobSucceeded) val environmentUpdate = SparkListenerEnvironmentUpdate(Map[String, Seq[(String, String)]]( "JVM Information" -> Seq(("GC speed", "9999 objects/s"), ("Java home", "Land of coffee")), @@ -149,6 +159,7 @@ class JsonProtocolSuite extends SparkFunSuite { resourceProfile.setResourceProfileId(21) val resourceProfileAdded = SparkListenerResourceProfileAdded(resourceProfile) testEvent(stageSubmitted, stageSubmittedJsonString) + testEvent(stageSubmittedWithNullProperties, stageSubmittedWithNullPropertiesJsonString) testEvent(stageCompleted, stageCompletedJsonString) testEvent(taskStart, taskStartJsonString) testEvent(taskGettingResult, taskGettingResultJsonString) @@ -156,6 +167,7 @@ class JsonProtocolSuite extends SparkFunSuite { testEvent(taskEndWithHadoopInput, taskEndWithHadoopInputJsonString) testEvent(taskEndWithOutput, taskEndWithOutputJsonString) testEvent(jobStart, jobStartJsonString) + testEvent(jobStartWithNullProperties, jobStartWithNullPropertiesJsonString) testEvent(jobEnd, jobEndJsonString) testEvent(environmentUpdate, environmentUpdateJsonString) 
testEvent(blockManagerAdded, blockManagerAddedJsonString) @@ -248,21 +260,21 @@ class JsonProtocolSuite extends SparkFunSuite { test("ExceptionFailure backward compatibility: full stack trace") { val exceptionFailure = ExceptionFailure("To be", "or not to be", stackTrace, null, None) - val oldEvent = JsonProtocol.taskEndReasonToJson(exceptionFailure) - .removeField({ _._1 == "Full Stack Trace" }) + val oldEvent = toJsonString(JsonProtocol.taskEndReasonToJson(exceptionFailure, _)) + .removeField("Full Stack Trace") assertEquals(exceptionFailure, JsonProtocol.taskEndReasonFromJson(oldEvent)) } test("StageInfo backward compatibility (details, accumulables)") { val info = makeStageInfo(1, 2, 3, 4L, 5L) - val newJson = JsonProtocol.stageInfoToJson(info) + val newJson = toJsonString(JsonProtocol.stageInfoToJson(info, _)) // Fields added after 1.0.0. assert(info.details.nonEmpty) assert(info.accumulables.nonEmpty) val oldJson = newJson - .removeField { case (field, _) => field == "Details" } - .removeField { case (field, _) => field == "Accumulables" } + .removeField("Details") + .removeField("Accumulables") val newInfo = JsonProtocol.stageInfoFromJson(oldJson) @@ -273,7 +285,7 @@ class JsonProtocolSuite extends SparkFunSuite { test("StageInfo resourceProfileId") { val info = makeStageInfo(1, 2, 3, 4L, 5L, 5) - val json = JsonProtocol.stageInfoToJson(info) + val json = toJsonString(JsonProtocol.stageInfoToJson(info, _)) // Fields added after 1.0.0. assert(info.details.nonEmpty) @@ -288,18 +300,21 @@ class JsonProtocolSuite extends SparkFunSuite { test("InputMetrics backward compatibility") { // InputMetrics were added after 1.0.1. 
val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = true, hasOutput = false) - val newJson = JsonProtocol.taskMetricsToJson(metrics) - val oldJson = newJson.removeField { case (field, _) => field == "Input Metrics" } + val newJson = toJsonString(JsonProtocol.taskMetricsToJson(metrics, _)) + val oldJson = newJson.removeField("Input Metrics") val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) + assert(newMetrics.inputMetrics.recordsRead == 0) + assert(newMetrics.inputMetrics.bytesRead == 0) } test("Input/Output records backwards compatibility") { // records read were added after 1.2 val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = true, hasOutput = true, hasRecords = false) - val newJson = JsonProtocol.taskMetricsToJson(metrics) - val oldJson = newJson.removeField { case (field, _) => field == "Records Read" } - .removeField { case (field, _) => field == "Records Written" } + val newJson = toJsonString(JsonProtocol.taskMetricsToJson(metrics, _)) + val oldJson = newJson + .removeField("Records Read") + .removeField("Records Written") val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) assert(newMetrics.inputMetrics.recordsRead == 0) assert(newMetrics.outputMetrics.recordsWritten == 0) @@ -307,22 +322,46 @@ class JsonProtocolSuite extends SparkFunSuite { test("Shuffle Read/Write records backwards compatibility") { // records read were added after 1.2 + // "Remote Bytes Read To Disk" was added in 2.3.0 val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = false, hasOutput = false, hasRecords = false) - val newJson = JsonProtocol.taskMetricsToJson(metrics) - val oldJson = newJson.removeField { case (field, _) => field == "Total Records Read" } - .removeField { case (field, _) => field == "Shuffle Records Written" } + val newJson = toJsonString(JsonProtocol.taskMetricsToJson(metrics, _)) + val oldJson = newJson + .removeField("Total Records Read") + .removeField("Shuffle Records Written") + 
.removeField("Remote Bytes Read To Disk") val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) assert(newMetrics.shuffleReadMetrics.recordsRead == 0) + assert(newMetrics.shuffleReadMetrics.remoteBytesReadToDisk == 0) assert(newMetrics.shuffleWriteMetrics.recordsWritten == 0) } test("OutputMetrics backward compatibility") { // OutputMetrics were added after 1.1 val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = false, hasOutput = true) - val newJson = JsonProtocol.taskMetricsToJson(metrics) - val oldJson = newJson.removeField { case (field, _) => field == "Output Metrics" } + val newJson = toJsonString(JsonProtocol.taskMetricsToJson(metrics, _)) + val oldJson = newJson.removeField("Output Metrics") + val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) + assert(newMetrics.outputMetrics.recordsWritten == 0) + assert(newMetrics.outputMetrics.bytesWritten == 0) + } + + test("TaskMetrics backward compatibility") { + // "Executor Deserialize CPU Time" and "Executor CPU Time" were introduced in Spark 2.1.0 + // "Peak Execution Memory" was introduced in Spark 3.0.0 + val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = false, hasOutput = true) + metrics.setExecutorDeserializeCpuTime(100L) + metrics.setExecutorCpuTime(100L) + metrics.setPeakExecutionMemory(100L) + val newJson = toJsonString(JsonProtocol.taskMetricsToJson(metrics, _)) + val oldJson = newJson + .removeField("Executor Deserialize CPU Time") + .removeField("Executor CPU Time") + .removeField("Peak Execution Memory") val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) + assert(newMetrics.executorDeserializeCpuTime == 0) + assert(newMetrics.executorCpuTime == 0) + assert(newMetrics.peakExecutionMemory == 0) } test("StorageLevel backward compatibility") { @@ -334,8 +373,8 @@ class JsonProtocolSuite extends SparkFunSuite { deserialized = false, replication = 1 ) - val newJson = JsonProtocol.storageLevelToJson(level) - val oldJson = newJson.removeField { case 
(field, _) => field == "Use Off Heap" } + val newJson = toJsonString(JsonProtocol.storageLevelToJson(level, _)) + val oldJson = newJson.removeField("Use Off Heap") val newLevel = JsonProtocol.storageLevelFromJson(oldJson) assert(newLevel.useOffHeap === false) } @@ -347,15 +386,15 @@ class JsonProtocolSuite extends SparkFunSuite { val blockManagerRemoved = SparkListenerBlockManagerRemoved(2L, BlockManagerId("Scarce", "to be counted...", 100)) - val oldBmAdded = JsonProtocol.blockManagerAddedToJson(blockManagerAdded) - .removeField({ _._1 == "Timestamp" }) + val oldBmAdded = toJsonString(JsonProtocol.blockManagerAddedToJson(blockManagerAdded, _)) + .removeField("Timestamp") val deserializedBmAdded = JsonProtocol.blockManagerAddedFromJson(oldBmAdded) assert(SparkListenerBlockManagerAdded(-1L, blockManagerAdded.blockManagerId, blockManagerAdded.maxMem) === deserializedBmAdded) - val oldBmRemoved = JsonProtocol.blockManagerRemovedToJson(blockManagerRemoved) - .removeField({ _._1 == "Timestamp" }) + val oldBmRemoved = toJsonString(JsonProtocol.blockManagerRemovedToJson(blockManagerRemoved, _)) + .removeField("Timestamp") val deserializedBmRemoved = JsonProtocol.blockManagerRemovedFromJson(oldBmRemoved) assert(SparkListenerBlockManagerRemoved(-1L, blockManagerRemoved.blockManagerId) === @@ -366,8 +405,8 @@ class JsonProtocolSuite extends SparkFunSuite { // FetchFailed in Spark 1.1.0 does not have a "Message" property. 
val fetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, 18, 19, "ignored") - val oldEvent = JsonProtocol.taskEndReasonToJson(fetchFailed) - .removeField({ _._1 == "Message" }) + val oldEvent = toJsonString(JsonProtocol.taskEndReasonToJson(fetchFailed, _)) + .removeField("Message") val expectedFetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, 18, 19, "Unknown reason") assert(expectedFetchFailed === JsonProtocol.taskEndReasonFromJson(oldEvent)) @@ -377,8 +416,8 @@ class JsonProtocolSuite extends SparkFunSuite { // FetchFailed in Spark 2.4.0 does not have "Map Index" property. val fetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, 18, 19, "ignored") - val oldEvent = JsonProtocol.taskEndReasonToJson(fetchFailed) - .removeField({ _._1 == "Map Index" }) + val oldEvent = toJsonString(JsonProtocol.taskEndReasonToJson(fetchFailed, _)) + .removeField("Map Index") val expectedFetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, Int.MinValue, 19, "ignored") assert(expectedFetchFailed === JsonProtocol.taskEndReasonFromJson(oldEvent)) @@ -388,8 +427,8 @@ class JsonProtocolSuite extends SparkFunSuite { // Metrics about local shuffle bytes read were added in 1.3.1. val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = false, hasOutput = false, hasRecords = false) - val newJson = JsonProtocol.taskMetricsToJson(metrics) - val oldJson = newJson.removeField { case (field, _) => field == "Local Bytes Read" } + val newJson = toJsonString(JsonProtocol.taskMetricsToJson(metrics, _)) + val oldJson = newJson.removeField("Local Bytes Read") val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) assert(newMetrics.shuffleReadMetrics.localBytesRead == 0) } @@ -399,18 +438,18 @@ class JsonProtocolSuite extends SparkFunSuite { // SparkListenerApplicationStart pre-Spark 1.4 does not have "appAttemptId". 
// SparkListenerApplicationStart pre-Spark 1.5 does not have "driverLogs val applicationStart = SparkListenerApplicationStart("test", None, 1L, "user", None, None) - val oldEvent = JsonProtocol.applicationStartToJson(applicationStart) - .removeField({ _._1 == "App ID" }) - .removeField({ _._1 == "App Attempt ID" }) - .removeField({ _._1 == "Driver Logs"}) + val oldEvent = toJsonString(JsonProtocol.applicationStartToJson(applicationStart, _)) + .removeField("App ID") + .removeField("App Attempt ID") + .removeField( "Driver Logs") assert(applicationStart === JsonProtocol.applicationStartFromJson(oldEvent)) } test("ExecutorLostFailure backward compatibility") { // ExecutorLostFailure in Spark 1.1.0 does not have an "Executor ID" property. val executorLostFailure = ExecutorLostFailure("100", true, Some("Induced failure")) - val oldEvent = JsonProtocol.taskEndReasonToJson(executorLostFailure) - .removeField({ _._1 == "Executor ID" }) + val oldEvent = toJsonString(JsonProtocol.taskEndReasonToJson(executorLostFailure, _)) + .removeField("Executor ID") val expectedExecutorLostFailure = ExecutorLostFailure("Unknown", true, Some("Induced failure")) assert(expectedExecutorLostFailure === JsonProtocol.taskEndReasonFromJson(oldEvent)) } @@ -423,7 +462,7 @@ class JsonProtocolSuite extends SparkFunSuite { stageIds.map(id => new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown", resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) val jobStart = SparkListenerJobStart(10, jobSubmissionTime, stageInfos, properties) - val oldEvent = JsonProtocol.jobStartToJson(jobStart).removeField({_._1 == "Stage Infos"}) + val oldEvent = toJsonString(JsonProtocol.jobStartToJson(jobStart, _)).removeField("Stage Infos") val expectedJobStart = SparkListenerJobStart(10, jobSubmissionTime, dummyStageInfos, properties) assertEquals(expectedJobStart, JsonProtocol.jobStartFromJson(oldEvent)) @@ -435,29 +474,34 @@ class JsonProtocolSuite extends SparkFunSuite { val stageIds = 
Seq[Int](1, 2, 3, 4) val stageInfos = stageIds.map(x => makeStageInfo(x * 10, x * 20, x * 30, x * 40L, x * 50L)) val jobStart = SparkListenerJobStart(11, jobSubmissionTime, stageInfos, properties) - val oldStartEvent = JsonProtocol.jobStartToJson(jobStart) - .removeField({ _._1 == "Submission Time"}) + val oldStartEvent = toJsonString(JsonProtocol.jobStartToJson(jobStart, _)) + .removeField("Submission Time") val expectedJobStart = SparkListenerJobStart(11, -1, stageInfos, properties) assertEquals(expectedJobStart, JsonProtocol.jobStartFromJson(oldStartEvent)) val jobEnd = SparkListenerJobEnd(11, jobCompletionTime, JobSucceeded) - val oldEndEvent = JsonProtocol.jobEndToJson(jobEnd) - .removeField({ _._1 == "Completion Time"}) + val oldEndEvent = toJsonString(JsonProtocol.jobEndToJson(jobEnd, _)) + .removeField("Completion Time") val expectedJobEnd = SparkListenerJobEnd(11, -1, JobSucceeded) assertEquals(expectedJobEnd, JsonProtocol.jobEndFromJson(oldEndEvent)) } - test("RDDInfo backward compatibility (scope, parent IDs, callsite)") { + test("RDDInfo backward compatibility") { // "Scope" and "Parent IDs" were introduced in Spark 1.4.0 // "Callsite" was introduced in Spark 1.6.0 - val rddInfo = new RDDInfo(1, "one", 100, StorageLevel.NONE, false, Seq(1, 6, 8), - "callsite", Some(new RDDOperationScope("fable"))) - val oldRddInfoJson = JsonProtocol.rddInfoToJson(rddInfo) - .removeField({ _._1 == "Parent IDs"}) - .removeField({ _._1 == "Scope"}) - .removeField({ _._1 == "Callsite"}) + // "Barrier" was introduced in Spark 3.0.0 + // "DeterministicLevel" was introduced in Spark 3.2.0 + val rddInfo = new RDDInfo(1, "one", 100, StorageLevel.NONE, true, Seq(1, 6, 8), + "callsite", Some(new RDDOperationScope("fable")), DeterministicLevel.INDETERMINATE) + val oldRddInfoJson = toJsonString(JsonProtocol.rddInfoToJson(rddInfo, _)) + .removeField("Parent IDs") + .removeField("Scope") + .removeField("Callsite") + .removeField("Barrier") + .removeField("DeterministicLevel") val 
expectedRddInfo = new RDDInfo( - 1, "one", 100, StorageLevel.NONE, false, Seq.empty, "", scope = None) + 1, "one", 100, StorageLevel.NONE, false, Seq.empty, "", scope = None, + outputDeterministicLevel = DeterministicLevel.INDETERMINATE) assertEquals(expectedRddInfo, JsonProtocol.rddInfoFromJson(oldRddInfoJson)) } @@ -465,7 +509,8 @@ class JsonProtocolSuite extends SparkFunSuite { // Prior to Spark 1.4.0, StageInfo did not have the "Parent IDs" property val stageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq(1, 2, 3), "details", resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) - val oldStageInfo = JsonProtocol.stageInfoToJson(stageInfo).removeField({ _._1 == "Parent IDs"}) + val oldStageInfo = toJsonString(JsonProtocol.stageInfoToJson(stageInfo, _)) + .removeField("Parent IDs") val expectedStageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq.empty, "details", resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) assertEquals(expectedStageInfo, JsonProtocol.stageInfoFromJson(oldStageInfo)) @@ -474,10 +519,10 @@ class JsonProtocolSuite extends SparkFunSuite { // `TaskCommitDenied` was added in 1.3.0 but JSON de/serialization logic was added in 1.5.1 test("TaskCommitDenied backward compatibility") { val denied = TaskCommitDenied(1, 2, 3) - val oldDenied = JsonProtocol.taskEndReasonToJson(denied) - .removeField({ _._1 == "Job ID" }) - .removeField({ _._1 == "Partition ID" }) - .removeField({ _._1 == "Attempt Number" }) + val oldDenied = toJsonString(JsonProtocol.taskEndReasonToJson(denied, _)) + .removeField("Job ID") + .removeField("Partition ID") + .removeField("Attempt Number") val expectedDenied = TaskCommitDenied(-1, -1, -1) assertEquals(expectedDenied, JsonProtocol.taskEndReasonFromJson(oldDenied)) } @@ -485,16 +530,16 @@ class JsonProtocolSuite extends SparkFunSuite { test("AccumulableInfo backward compatibility") { // "Internal" property of AccumulableInfo was added in 1.5.1 val accumulableInfo = 
makeAccumulableInfo(1, internal = true, countFailedValues = true) - val accumulableInfoJson = JsonProtocol.accumulableInfoToJson(accumulableInfo) - val oldJson = accumulableInfoJson.removeField({ _._1 == "Internal" }) + val accumulableInfoJson = toJsonString(JsonProtocol.accumulableInfoToJson(accumulableInfo, _)) + val oldJson = accumulableInfoJson.removeField("Internal") val oldInfo = JsonProtocol.accumulableInfoFromJson(oldJson) assert(!oldInfo.internal) // "Count Failed Values" property of AccumulableInfo was added in 2.0.0 - val oldJson2 = accumulableInfoJson.removeField({ _._1 == "Count Failed Values" }) + val oldJson2 = accumulableInfoJson.removeField("Count Failed Values") val oldInfo2 = JsonProtocol.accumulableInfoFromJson(oldJson2) assert(!oldInfo2.countFailedValues) // "Metadata" property of AccumulableInfo was added in 2.0.0 - val oldJson3 = accumulableInfoJson.removeField({ _._1 == "Metadata" }) + val oldJson3 = accumulableInfoJson.removeField("Metadata") val oldInfo3 = JsonProtocol.accumulableInfoFromJson(oldJson3) assert(oldInfo3.metadata.isEmpty) } @@ -504,14 +549,15 @@ class JsonProtocolSuite extends SparkFunSuite { // we should still be able to fallback to constructing the accumulator updates from the // "Task Metrics" field, if it exists. 
val tm = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = true, hasOutput = true) - val tmJson = JsonProtocol.taskMetricsToJson(tm) + val tmJson = toJsonString(JsonProtocol.taskMetricsToJson(tm, _)) val accumUpdates = tm.accumulators().map(AccumulatorSuite.makeInfo) val exception = new SparkException("sentimental") val exceptionFailure = new ExceptionFailure(exception, accumUpdates) - val exceptionFailureJson = JsonProtocol.taskEndReasonToJson(exceptionFailure) - val tmFieldJson: JValue = "Task Metrics" -> tmJson - val oldExceptionFailureJson: JValue = - exceptionFailureJson.removeField { _._1 == "Accumulator Updates" }.merge(tmFieldJson) + val exceptionFailureJson = toJsonString(JsonProtocol.taskEndReasonToJson(exceptionFailure, _)) + val oldExceptionFailureJson = + exceptionFailureJson + .removeField("Accumulator Updates") + .addStringField("Task Metrics", tmJson) val oldExceptionFailure = JsonProtocol.taskEndReasonFromJson(oldExceptionFailureJson).asInstanceOf[ExceptionFailure] assert(exceptionFailure.className === oldExceptionFailure.className) @@ -523,12 +569,30 @@ class JsonProtocolSuite extends SparkFunSuite { exceptionFailure.accumUpdates, oldExceptionFailure.accumUpdates, (x, y) => x == y) } + test("TaskKilled backward compatibility") { + // The "Kill Reason" field was added in Spark 2.2.0 + // The "Accumulator Updates" field was added in Spark 2.4.0 + val tm = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = true, hasOutput = true) + val accumUpdates = tm.accumulators().map(AccumulatorSuite.makeInfo) + val taskKilled = TaskKilled(reason = "test", accumUpdates) + val taskKilledJson = toJsonString(JsonProtocol.taskEndReasonToJson(taskKilled, _)) + val oldExceptionFailureJson = + taskKilledJson + .removeField("Kill Reason") + .removeField("Accumulator Updates") + val oldTaskKilled = + JsonProtocol.taskEndReasonFromJson(oldExceptionFailureJson).asInstanceOf[TaskKilled] + assert(oldTaskKilled.reason === "unknown reason") + 
assert(oldTaskKilled.accums.isEmpty) + assert(oldTaskKilled.accumUpdates.isEmpty) + } + test("ExecutorMetricsUpdate backward compatibility: executor metrics update") { // executorMetricsUpdate was added in 2.4.0. val executorMetricsUpdate = makeExecutorMetricsUpdate("1", true, true) val oldExecutorMetricsUpdateJson = - JsonProtocol.executorMetricsUpdateToJson(executorMetricsUpdate) - .removeField( _._1 == "Executor Metrics Updated") + toJsonString(JsonProtocol.executorMetricsUpdateToJson(executorMetricsUpdate, _)) + .removeField("Executor Metrics Updated") val expectedExecutorMetricsUpdate = makeExecutorMetricsUpdate("1", true, false) assertEquals(expectedExecutorMetricsUpdate, JsonProtocol.executorMetricsUpdateFromJson(oldExecutorMetricsUpdateJson)) @@ -539,14 +603,91 @@ class JsonProtocolSuite extends SparkFunSuite { val executorMetrics = new ExecutorMetrics(Array(12L, 23L, 45L, 67L, 78L, 89L, 90L, 123L, 456L, 789L, 40L, 20L, 20L, 10L, 20L, 10L, 301L)) val oldExecutorMetricsJson = - JsonProtocol.executorMetricsToJson(executorMetrics) - .removeField( _._1 == "MappedPoolMemory") + toJsonString(JsonProtocol.executorMetricsToJson(executorMetrics, _)) + .removeField("MappedPoolMemory") val expectedExecutorMetrics = new ExecutorMetrics(Array(12L, 23L, 45L, 67L, 78L, 89L, 90L, 123L, 456L, 0L, 40L, 20L, 20L, 10L, 20L, 10L, 301L)) assertEquals(expectedExecutorMetrics, JsonProtocol.executorMetricsFromJson(oldExecutorMetricsJson)) } + test("EnvironmentUpdate backward compatibility: handle missing metrics properties") { + // The "Metrics Properties" field was added in Spark 3.4.0: + val expectedEvent: SparkListenerEnvironmentUpdate = { + val e = JsonProtocol.environmentUpdateFromJson(environmentUpdateJsonString) + e.copy(environmentDetails = + e.environmentDetails + ("Metrics Properties" -> Seq.empty[(String, String)])) + } + val oldEnvironmentUpdateJson = environmentUpdateJsonString + .removeField("Metrics Properties") + assertEquals(expectedEvent, 
JsonProtocol.environmentUpdateFromJson(oldEnvironmentUpdateJson)) + } + + test("ExecutorInfo backward compatibility") { + // The "Attributes" and "Resources" fields were added in Spark 3.0.0 + // The "Resource Profile Id", "Registration Time", and "Request Time" + // fields were added in Spark 3.4.0 + val resourcesInfo = Map(ResourceUtils.GPU -> + new ResourceInformation(ResourceUtils.GPU, Array("0", "1"))).toMap + val attributes = Map("ContainerId" -> "ct1", "User" -> "spark").toMap + val executorInfo = + new ExecutorInfo( + "Hostee.awesome.com", + 11, + logUrlMap = Map.empty[String, String].toMap, + attributes = attributes, + resourcesInfo = resourcesInfo, + resourceProfileId = 123, + registrationTime = Some(2L), + requestTime = Some(1L)) + val oldExecutorInfoJson = toJsonString(JsonProtocol.executorInfoToJson(executorInfo, _)) + .removeField("Attributes") + .removeField("Resources") + .removeField("Resource Profile Id") + .removeField("Registration Time") + .removeField("Request Time") + val oldEvent = JsonProtocol.executorInfoFromJson(oldExecutorInfoJson) + assert(oldEvent.attributes.isEmpty) + assert(oldEvent.resourcesInfo.isEmpty) + assert(oldEvent.resourceProfileId == DEFAULT_RESOURCE_PROFILE_ID) + assert(oldEvent.registrationTime.isEmpty) + assert(oldEvent.requestTime.isEmpty) + } + + test("TaskInfo backward compatibility: handle missing partition ID field") { + // The "Partition ID" field was added in Spark 3.3.0: + val newJson = + """ + |{ + | "Task ID": 222, + | "Index": 333, + | "Attempt": 1, + | "Partition ID": 333, + | "Launch Time": 444, + | "Executor ID": "executor", + | "Host": "your kind sir", + | "Locality": "NODE_LOCAL", + | "Speculative": false, + | "Getting Result Time": 0, + | "Finish Time": 0, + | "Failed": false, + | "Killed": false, + | "Accumulables": [ + | { + | "ID": 1, + | "Name": "Accumulable1", + | "Update": "delta1", + | "Value": "val1", + | "Internal": false, + | "Count Failed Values": false + | } + | ] + |} + """.stripMargin + val 
oldJson = newJson.removeField("Partition ID") + assert(JsonProtocol.taskInfoFromJson(oldJson).partitionId === -1) + } + test("AccumulableInfo value de/serialization") { import InternalAccumulator._ val blocks = Seq[(BlockId, BlockStatus)]( @@ -554,7 +695,7 @@ class JsonProtocolSuite extends SparkFunSuite { (TestBlockId("feebo"), BlockStatus(StorageLevel.DISK_ONLY, 3L, 4L))) val blocksJson = JArray(blocks.toList.map { case (id, status) => ("Block ID" -> id.toString) ~ - ("Status" -> JsonProtocol.blockStatusToJson(status)) + ("Status" -> parse(toJsonString(JsonProtocol.blockStatusToJson(status, _)))) }) testAccumValue(Some(RESULT_SIZE), 3L, JInt(3)) testAccumValue(Some(shuffleRead.REMOTE_BLOCKS_FETCHED), 2, JInt(2)) @@ -577,7 +718,7 @@ class JsonProtocolSuite extends SparkFunSuite { value = value, internal = isInternal, countFailedValues = false) - val json = JsonProtocol.accumulableInfoToJson(accum) + val json = toJsonString(JsonProtocol.accumulableInfoToJson(accum, _)) val newAccum = JsonProtocol.accumulableInfoFromJson(json) assert(newAccum == accum.copy(update = expectedValue, value = expectedValue)) } @@ -621,7 +762,7 @@ class JsonProtocolSuite extends SparkFunSuite { | "bar" : 123, | "unknown" : "unknown" |}""".stripMargin - assert(JsonProtocol.sparkEventFromJson(parse(unknownFieldsJson)) === expected) + assert(JsonProtocol.sparkEventFromJson(unknownFieldsJson) === expected) } test("SPARK-30936: backwards compatibility - set default values for missing fields") { @@ -631,13 +772,34 @@ class JsonProtocolSuite extends SparkFunSuite { | "Event" : "org.apache.spark.util.TestListenerEvent", | "foo" : "foo" |}""".stripMargin - assert(JsonProtocol.sparkEventFromJson(parse(unknownFieldsJson)) === expected) + assert(JsonProtocol.sparkEventFromJson(unknownFieldsJson) === expected) } } private[spark] object JsonProtocolSuite extends Assertions { import InternalAccumulator._ + import JsonProtocol.toJsonString + + private val mapper = new ObjectMapper() + + private implicit 
class JsonStringImplicits(json: String) { + def removeField(field: String): String = { + val tree = mapper.readTree(json) + Option(tree.asInstanceOf[ObjectNode].findParent(field)).foreach(_.remove(field)) + tree.toString + } + + def addStringField(k: String, v: String): String = { + val tree = mapper.readTree(json) + tree.asInstanceOf[ObjectNode].set(k, new TextNode(v)) + tree.toString + } + } + + private implicit def toJsonNode(json: String): JsonNode = { + mapper.readTree(json) + } private val jobSubmissionTime = 1421191042750L private val jobCompletionTime = 1421191296660L @@ -648,50 +810,62 @@ private[spark] object JsonProtocolSuite extends Assertions { private val nodeExcludedTime = 1421458952000L private val nodeUnexcludedTime = 1421458962000L + implicit def jValueToJsonNode(value: JValue): JsonNode = { + mapper.readTree(pretty(value)) + } + private def testEvent(event: SparkListenerEvent, jsonString: String): Unit = { - val actualJsonString = compact(render(JsonProtocol.sparkEventToJson(event))) - val newEvent = JsonProtocol.sparkEventFromJson(parse(actualJsonString)) + val actualJsonString = JsonProtocol.sparkEventToJsonString(event) + val newEvent = JsonProtocol.sparkEventFromJson(actualJsonString) assertJsonStringEquals(jsonString, actualJsonString, event.getClass.getSimpleName) assertEquals(event, newEvent) } private def testRDDInfo(info: RDDInfo): Unit = { - val newInfo = JsonProtocol.rddInfoFromJson(JsonProtocol.rddInfoToJson(info)) + val newInfo = JsonProtocol.rddInfoFromJson( + toJsonString(JsonProtocol.rddInfoToJson(info, _))) assertEquals(info, newInfo) } private def testStageInfo(info: StageInfo): Unit = { - val newInfo = JsonProtocol.stageInfoFromJson(JsonProtocol.stageInfoToJson(info)) + val newInfo = JsonProtocol.stageInfoFromJson( + toJsonString(JsonProtocol.stageInfoToJson(info, _))) assertEquals(info, newInfo) } private def testStorageLevel(level: StorageLevel): Unit = { - val newLevel = 
JsonProtocol.storageLevelFromJson(JsonProtocol.storageLevelToJson(level)) + val newLevel = JsonProtocol.storageLevelFromJson( + toJsonString(JsonProtocol.storageLevelToJson(level, _))) assertEquals(level, newLevel) } private def testTaskMetrics(metrics: TaskMetrics): Unit = { - val newMetrics = JsonProtocol.taskMetricsFromJson(JsonProtocol.taskMetricsToJson(metrics)) + val newMetrics = JsonProtocol.taskMetricsFromJson( + toJsonString(JsonProtocol.taskMetricsToJson(metrics, _))) assertEquals(metrics, newMetrics) } private def testBlockManagerId(id: BlockManagerId): Unit = { - val newId = JsonProtocol.blockManagerIdFromJson(JsonProtocol.blockManagerIdToJson(id)) + val newId = JsonProtocol.blockManagerIdFromJson( + toJsonString(JsonProtocol.blockManagerIdToJson(id, _))) assert(id === newId) } private def testTaskInfo(info: TaskInfo): Unit = { - val newInfo = JsonProtocol.taskInfoFromJson(JsonProtocol.taskInfoToJson(info)) + val newInfo = JsonProtocol.taskInfoFromJson( + toJsonString(JsonProtocol.taskInfoToJson(info, _))) assertEquals(info, newInfo) } private def testJobResult(result: JobResult): Unit = { - val newResult = JsonProtocol.jobResultFromJson(JsonProtocol.jobResultToJson(result)) + val newResult = JsonProtocol.jobResultFromJson( + toJsonString(JsonProtocol.jobResultToJson(result, _))) assertEquals(result, newResult) } private def testTaskEndReason(reason: TaskEndReason): Unit = { - val newReason = JsonProtocol.taskEndReasonFromJson(JsonProtocol.taskEndReasonToJson(reason)) + val newReason = JsonProtocol.taskEndReasonFromJson( + toJsonString(JsonProtocol.taskEndReasonToJson(reason, _))) assertEquals(reason, newReason) } @@ -701,12 +875,13 @@ private[spark] object JsonProtocolSuite extends Assertions { } private def testExecutorInfo(info: ExecutorInfo): Unit = { - val newInfo = JsonProtocol.executorInfoFromJson(JsonProtocol.executorInfoToJson(info)) + val newInfo = JsonProtocol.executorInfoFromJson( + toJsonString(JsonProtocol.executorInfoToJson(info, _))) 
assertEquals(info, newInfo) } private def testAccumValue(name: Option[String], value: Any, expectedJson: JValue): Unit = { - val json = JsonProtocol.accumValueToJson(name, value) + val json = parse(toJsonString(JsonProtocol.accumValueToJson(name, value, _))) assert(json === expectedJson) val newValue = JsonProtocol.accumValueFromJson(name, json) val expectedValue = if (name.exists(_.startsWith(METRICS_PREFIX))) value else value.toString @@ -820,6 +995,8 @@ private[spark] object JsonProtocolSuite extends Assertions { assert(info1.taskId === info2.taskId) assert(info1.index === info2.index) assert(info1.attemptNumber === info2.attemptNumber) + // The "Partition ID" field was added in Spark 3.3.0 + assert(info1.partitionId === info2.partitionId) assert(info1.launchTime === info2.launchTime) assert(info1.executorId === info2.executorId) assert(info1.host === info2.host) @@ -936,14 +1113,18 @@ private[spark] object JsonProtocolSuite extends Assertions { } } + private def prettyString(json: JsonNode): String = { + mapper.writerWithDefaultPrettyPrinter().writeValueAsString(json) + } + private def assertJsonStringEquals(expected: String, actual: String, metadata: String): Unit = { - val expectedJson = parse(expected) - val actualJson = parse(actual) + val expectedJson = mapper.readTree(expected) + val actualJson = mapper.readTree(actual) if (expectedJson != actualJson) { // scalastyle:off // This prints something useful if the JSON strings don't match - println(s"=== EXPECTED ===\n${pretty(expectedJson)}\n") - println(s"=== ACTUAL ===\n${pretty(actualJson)}\n") + println(s"=== EXPECTED ===\n${prettyString(expectedJson)}\n") + println(s"=== ACTUAL ===\n${prettyString(actualJson)}\n") // scalastyle:on throw new TestFailedException(s"$metadata JSON did not equal", 1) } @@ -1192,6 +1373,41 @@ private[spark] object JsonProtocolSuite extends Assertions { |} """.stripMargin + private val stageSubmittedWithNullPropertiesJsonString = + """ + |{ + | "Event": 
"SparkListenerStageSubmitted", + | "Stage Info": { + | "Stage ID": 100, + | "Stage Attempt ID": 0, + | "Stage Name": "greetings", + | "Number of Tasks": 200, + | "RDD Info": [], + | "Parent IDs" : [100, 200, 300], + | "Details": "details", + | "Accumulables": [ + | { + | "ID": 1, + | "Name": "Accumulable1", + | "Update": "delta1", + | "Value": "val1", + | "Internal": false, + | "Count Failed Values": false + | }, + | { + | "ID": 2, + | "Name": "Accumulable2", + | "Update": "delta2", + | "Value": "val2", + | "Internal": false, + | "Count Failed Values": false + | } + | ], + | "Resource Profile Id" : 0 + | } + |} + """.stripMargin + private val stageCompletedJsonString = """ |{ @@ -2055,6 +2271,17 @@ private[spark] object JsonProtocolSuite extends Assertions { |} """.stripMargin + private val jobStartWithNullPropertiesJsonString = + """ + |{ + | "Event": "SparkListenerJobStart", + | "Job ID": 10, + | "Submission Time": 1421191042750, + | "Stage Infos": [], + | "Stage IDs": [] + |} + """.stripMargin + private val jobEndJsonString = """ |{ diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala index 3f6cb2475a..4b63f1dace 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala @@ -34,7 +34,7 @@ import org.apache.spark.util.CompletionIterator class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext with Eventually - with Matchers{ + with Matchers { import TestUtils.{assertNotSpilled, assertSpilled} private val allCompressionCodecs = CompressionCodec.ALL_COMPRESSION_CODECS diff --git a/dev/deps/spark-deps-hadoop-2-hive-2.3 b/dev/deps/spark-deps-hadoop-2-hive-2.3 index 85bcb52790..e3342935c1 100644 --- a/dev/deps/spark-deps-hadoop-2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2-hive-2.3 @@ 
-29,8 +29,8 @@ avro/1.11.0//avro-1.11.0.jar azure-storage/2.0.0//azure-storage-2.0.0.jar blas/2.2.1//blas-2.2.1.jar bonecp/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar -breeze-macros_2.12/1.2//breeze-macros_2.12-1.2.jar -breeze_2.12/1.2//breeze_2.12-1.2.jar +breeze-macros_2.12/2.0//breeze-macros_2.12-2.0.jar +breeze_2.12/2.0//breeze_2.12-2.0.jar cats-kernel_2.12/2.1.1//cats-kernel_2.12-2.1.1.jar chill-java/0.10.0//chill-java-0.10.0.jar chill_2.12/0.10.0//chill_2.12-0.10.0.jar @@ -55,7 +55,6 @@ commons-net/3.1//commons-net-3.1.jar commons-pool/1.5.4//commons-pool-1.5.4.jar commons-text/1.9//commons-text-1.9.jar compress-lzf/1.1//compress-lzf-1.1.jar -core/1.1.2//core-1.1.2.jar curator-client/2.7.1//curator-client-2.7.1.jar curator-framework/2.7.1//curator-framework-2.7.1.jar curator-recipes/2.7.1//curator-recipes-2.7.1.jar @@ -236,13 +235,12 @@ protobuf-java/2.5.0//protobuf-java-2.5.0.jar py4j/0.10.9.5//py4j-0.10.9.5.jar remotetea-oncrpc/1.1.2//remotetea-oncrpc-1.1.2.jar rocksdbjni/7.3.1//rocksdbjni-7.3.1.jar -scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar +scala-collection-compat_2.12/2.2.0//scala-collection-compat_2.12-2.2.0.jar scala-compiler/2.12.16//scala-compiler-2.12.16.jar scala-library/2.12.16//scala-library-2.12.16.jar scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar scala-reflect/2.12.16//scala-reflect-2.12.16.jar scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar -shapeless_2.12/2.3.9//shapeless_2.12-2.3.9.jar shims/0.9.30//shims-0.9.30.jar slf4j-api/1.7.32//slf4j-api-1.7.32.jar snakeyaml/1.30//snakeyaml-1.30.jar diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index a91f97face..a2dfd894af 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -31,8 +31,8 @@ azure-keyvault-core/1.0.0//azure-keyvault-core-1.0.0.jar azure-storage/7.0.1//azure-storage-7.0.1.jar blas/2.2.1//blas-2.2.1.jar 
bonecp/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar -breeze-macros_2.12/1.2//breeze-macros_2.12-1.2.jar -breeze_2.12/1.2//breeze_2.12-1.2.jar +breeze-macros_2.12/2.0//breeze-macros_2.12-2.0.jar +breeze_2.12/2.0//breeze_2.12-2.0.jar cats-kernel_2.12/2.1.1//cats-kernel_2.12-2.1.1.jar chill-java/0.10.0//chill-java-0.10.0.jar chill_2.12/0.10.0//chill_2.12-0.10.0.jar @@ -52,7 +52,6 @@ commons-math3/3.6.1//commons-math3-3.6.1.jar commons-pool/1.5.4//commons-pool-1.5.4.jar commons-text/1.9//commons-text-1.9.jar compress-lzf/1.1//compress-lzf-1.1.jar -core/1.1.2//core-1.1.2.jar cos_api-bundle/5.6.19//cos_api-bundle-5.6.19.jar curator-client/2.13.0//curator-client-2.13.0.jar curator-framework/2.13.0//curator-framework-2.13.0.jar @@ -225,13 +224,12 @@ protobuf-java/2.5.0//protobuf-java-2.5.0.jar py4j/0.10.9.5//py4j-0.10.9.5.jar remotetea-oncrpc/1.1.2//remotetea-oncrpc-1.1.2.jar rocksdbjni/7.3.1//rocksdbjni-7.3.1.jar -scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar +scala-collection-compat_2.12/2.2.0//scala-collection-compat_2.12-2.2.0.jar scala-compiler/2.12.16//scala-compiler-2.12.16.jar scala-library/2.12.16//scala-library-2.12.16.jar scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar scala-reflect/2.12.16//scala-reflect-2.12.16.jar scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar -shapeless_2.12/2.3.9//shapeless_2.12-2.3.9.jar shims/0.9.30//shims-0.9.30.jar slf4j-api/1.7.32//slf4j-api-1.7.32.jar snakeyaml/1.30//snakeyaml-1.30.jar diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile new file mode 100644 index 0000000000..e3ba4f6110 --- /dev/null +++ b/dev/infra/Dockerfile @@ -0,0 +1,56 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Image for building and testing Spark branches. Based on Ubuntu 20.04. +FROM ubuntu:20.04 + +ENV FULL_REFRESH_DATE 20220706 + +ENV DEBIAN_FRONTEND noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN true + +ARG APT_INSTALL="apt-get install --no-install-recommends -y" + +RUN apt-get clean +RUN apt-get update +RUN $APT_INSTALL software-properties-common git libxml2-dev pkg-config curl wget openjdk-8-jdk libpython3-dev python3-pip python3-setuptools python3.8 python3.9 +RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java + +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 +RUN python3.9 -m pip install 'numpy<1.23.0' pyarrow 'pandas<1.4.0' scipy xmlrunner plotly>=4.8 sklearn 'mlflow>=1.0' coverage matplotlib + +RUN add-apt-repository ppa:pypy/ppa +RUN apt update +RUN $APT_INSTALL gfortran libopenblas-dev liblapack-dev +RUN $APT_INSTALL build-essential + +RUN mkdir -p /usr/local/pypy/pypy3.7 && \ + curl -sqL https://downloads.python.org/pypy/pypy3.7-v7.3.7-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.7 --strip-components=1 && \ + ln -sf /usr/local/pypy/pypy3.7/bin/pypy /usr/local/bin/pypy3.7 && \ + ln -sf /usr/local/pypy/pypy3.7/bin/pypy /usr/local/bin/pypy3 + +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3 +RUN pypy3 -m pip install 'numpy<1.23.0' 'pandas<1.4.0' scipy coverage matplotlib + +RUN $APT_INSTALL gnupg ca-certificates pandoc +RUN 
echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> /etc/apt/sources.list +RUN gpg --keyserver keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN gpg -a --export E084DAB9 | apt-key add - +RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' +RUN apt update +RUN $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev +RUN Rscript -e "install.packages(c('knitr', 'markdown', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2', 'xml2'), repos='https://cloud.r-project.org/')" diff --git a/dev/requirements.txt b/dev/requirements.txt index e7e0a4b427..c25b5dd9fb 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -36,6 +36,8 @@ jinja2<3.0.0 sphinx<3.1.0 sphinx-plotly-directive docutils<0.18.0 +# See SPARK-38279. +markupsafe==2.0.1 # Development scripts jira diff --git a/docs/README.md b/docs/README.md index 6bb83d8953..27238964f0 100644 --- a/docs/README.md +++ b/docs/README.md @@ -59,9 +59,9 @@ See also https://github.com/sphinx-doc/sphinx/issues/7551. TODO(SPARK-35375): Jinja2 3.0.0+ causes error when building with Sphinx. See also https://issues.apache.org/jira/browse/SPARK-35375. --> - +Run the following command from $SPARK_HOME: ```sh -$ sudo pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc sphinx-plotly-directive 'jinja2<3.0.0' +$ sudo pip install -r dev/requirements.txt ``` ### R API Documentation (Optional) diff --git a/docs/configuration.md b/docs/configuration.md index fd189aa88b..26addffe88 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -3239,7 +3239,7 @@ See your cluster manager specific page for requirements and details on each of - # Stage Level Scheduling Overview The stage level scheduling feature allows users to specify task and executor resource requirements at the stage level. This allows for different stages to run with executors that have different resources. 
A prime example of this is one ETL stage runs with executors with just CPUs, the next stage is an ML stage that needs GPUs. Stage level scheduling allows for user to request different executors that have GPUs when the ML stage runs rather then having to acquire executors with GPUs at the start of the application and them be idle while the ETL stage is being run. -This is only available for the RDD API in Scala, Java, and Python. It is available on YARN and Kubernetes when dynamic allocation is enabled. See the [YARN](running-on-yarn.html#stage-level-scheduling-overview) page or [Kubernetes](running-on-kubernetes.html#stage-level-scheduling-overview) page for more implementation details. +This is only available for the RDD API in Scala, Java, and Python. It is available on YARN, Kubernetes and Standalone when dynamic allocation is enabled. See the [YARN](running-on-yarn.html#stage-level-scheduling-overview) page or [Kubernetes](running-on-kubernetes.html#stage-level-scheduling-overview) page or [Standalone](spark-standalone.html#stage-level-scheduling-overview) page for more implementation details. See the `RDD.withResources` and `ResourceProfileBuilder` API's for using this feature. The current implementation acquires new executors for each `ResourceProfile` created and currently has to be an exact match. Spark does not try to fit tasks into an executor that require a different ResourceProfile than the executor was created with. Executors that are not in use will idle timeout with the dynamic allocation logic. The default configuration for this feature is to only allow one ResourceProfile per stage. If the user associates more then 1 ResourceProfile to an RDD, Spark will throw an exception by default. See config `spark.scheduler.resource.profileMergeConflicts` to control that behavior. The current merge strategy Spark implements when `spark.scheduler.resource.profileMergeConflicts` is enabled is a simple max of each resource within the conflicting ResourceProfiles. 
Spark will create a new ResourceProfile with the max of each of the resources. diff --git a/docs/job-scheduling.md b/docs/job-scheduling.md index f44ed8245e..69dd678aed 100644 --- a/docs/job-scheduling.md +++ b/docs/job-scheduling.md @@ -83,6 +83,10 @@ This feature is disabled by default and available on all coarse-grained cluster [Mesos coarse-grained mode](running-on-mesos.html#mesos-run-modes) and [K8s mode](running-on-kubernetes.html). +### Caveats + +- In [standalone mode](spark-standalone.html), without explicitly setting `spark.executor.cores`, each executor will get all the available cores of a worker. In this case, when dynamic allocation enabled, spark will possibly acquire much more executors than expected. When you want to use dynamic allocation in [standalone mode](spark-standalone.html), you are recommended to explicitly set cores for each executor before the issue [SPARK-30299](https://issues.apache.org/jira/browse/SPARK-30299) got fixed. + ### Configuration and Setup There are two ways for using this feature. diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 1991d64fe4..8d1d05fbbb 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -455,6 +455,14 @@ if the worker has enough cores and memory. Otherwise, each executor grabs all th on the worker by default, in which case only one executor per application may be launched on each worker during one single schedule iteration. +# Stage Level Scheduling Overview + +Stage level scheduling is supported on Standalone when dynamic allocation is enabled. Currently, when the Master allocates executors for one application, it will schedule based on the order of the ResourceProfile ids for multiple ResourceProfiles. The ResourceProfile with smaller id will be scheduled firstly. Normally this won’t matter as Spark finishes one stage before starting another one, the only case this might have an affect is in a job server type scenario, so its something to keep in mind. 
For scheduling, we will only take executor memory and executor cores from built-in executor resources and all other custom resources from a ResourceProfile, other built-in executor resources such as offHeap and memoryOverhead won't take any effect. The base default profile will be created based on the spark configs when you submit an application. Executor memory and executor cores from the base default profile can be propagated to custom ResourceProfiles, but all other custom resources can not be propagated. + +## Caveats + +As mentioned in [Dynamic Resource Allocation](job-scheduling.html#dynamic-resource-allocation), if cores for each executor is not explicitly specified with dynamic allocation enabled, spark will possibly acquire much more executors than expected. So you are recommended to explicitly set executor cores for each resource profile when using stage level scheduling. + # Monitoring and Logging Spark's standalone mode offers a web-based user interface to monitor the cluster. The master and each worker has its own web UI that shows cluster and job statistics. By default, you can access the web UI for the master at port 8080. The port can be changed either in the configuration file or via command-line options. 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 6d8507239e..15202202dd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.regression import java.util.Locale import breeze.stats.{distributions => dist} +import breeze.stats.distributions.Rand.FixedSeed.randBasis import org.apache.commons.lang3.StringUtils import org.apache.hadoop.fs.Path @@ -679,7 +680,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine } } - private[regression] object Tweedie{ + private[regression] object Tweedie { /** Constant used in initialization and deviance to avoid numerical issues. */ val delta: Double = 0.1 diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 46986249e0..09425fe60f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -21,6 +21,7 @@ import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} import breeze.optimize.{CachedDiffFunction, DiffFunction, FirstOrderMinimizer, LBFGS => BreezeLBFGS, LBFGSB => BreezeLBFGSB, OWLQN => BreezeOWLQN} +import breeze.stats.distributions.Rand.FixedSeed.randBasis import breeze.stats.distributions.StudentsT import org.apache.hadoop.fs.Path diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala index d043c9e58e..d86410c1ae 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala @@ -28,7 +28,7 @@ import org.apache.spark.mllib.clustering.KMeansModel /** * PMML Model Export for KMeansModel class */ -private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModelExport{ +private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModelExport { populateKMeansPMML(model) diff --git a/mllib/src/test/scala/org/apache/spark/ml/MLEventsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/MLEventsSuite.scala index 1226ad9be5..f7e7e7fe2a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/MLEventsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/MLEventsSuite.scala @@ -146,7 +146,7 @@ class MLEventsSuite } // Test if they can be ser/de via JSON protocol. assert(events.nonEmpty) - events.map(JsonProtocol.sparkEventToJson).foreach { event => + events.map(JsonProtocol.sparkEventToJsonString).foreach { event => assert(JsonProtocol.sparkEventFromJson(event).isInstanceOf[MLEvent]) } } @@ -204,7 +204,7 @@ class MLEventsSuite } // Test if they can be ser/de via JSON protocol. assert(events.nonEmpty) - events.map(JsonProtocol.sparkEventToJson).foreach { event => + events.map(JsonProtocol.sparkEventToJsonString).foreach { event => assert(JsonProtocol.sparkEventFromJson(event).isInstanceOf[MLEvent]) } } @@ -236,7 +236,7 @@ class MLEventsSuite // Test if they can be ser/de via JSON protocol. eventually(timeout(10.seconds), interval(1.second)) { assert(events.nonEmpty) - events.map(JsonProtocol.sparkEventToJson).foreach { event => + events.map(JsonProtocol.sparkEventToJsonString).foreach { event => assert(JsonProtocol.sparkEventFromJson(event).isInstanceOf[MLEvent]) } } @@ -264,7 +264,7 @@ class MLEventsSuite // Test if they can be ser/de via JSON protocol. 
eventually(timeout(10.seconds), interval(1.second)) { assert(events.nonEmpty) - events.map(JsonProtocol.sparkEventToJson).foreach { event => + events.map(JsonProtocol.sparkEventToJsonString).foreach { event => assert(JsonProtocol.sparkEventFromJson(event).isInstanceOf[MLEvent]) } } @@ -299,7 +299,7 @@ class MLEventsSuite // Test if they can be ser/de via JSON protocol. eventually(timeout(10.seconds), interval(1.second)) { assert(events.nonEmpty) - events.map(JsonProtocol.sparkEventToJson).foreach { event => + events.map(JsonProtocol.sparkEventToJsonString).foreach { event => assert(JsonProtocol.sparkEventFromJson(event).isInstanceOf[MLEvent]) } } @@ -327,7 +327,7 @@ class MLEventsSuite // Test if they can be ser/de via JSON protocol. eventually(timeout(10.seconds), interval(1.second)) { assert(events.nonEmpty) - events.map(JsonProtocol.sparkEventToJson).foreach { event => + events.map(JsonProtocol.sparkEventToJsonString).foreach { event => assert(JsonProtocol.sparkEventFromJson(event).isInstanceOf[MLEvent]) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuite.scala index f1e071357b..53be2444ec 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.internal.config.Kryo._ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.serializer.KryoSerializer -class InstanceSuite extends SparkFunSuite{ +class InstanceSuite extends SparkFunSuite { test("Kryo class register") { val conf = new SparkConf(false) conf.set(KRYO_REGISTRATION_REQUIRED, true) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala index 4047e6d719..ff17be1fc5 100644 --- 
a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.{DataFrame, Row} /** * Test suite for [[RandomForestRegressor]]. */ -class RandomForestRegressorSuite extends MLTest with DefaultReadWriteTest{ +class RandomForestRegressorSuite extends MLTest with DefaultReadWriteTest { import RandomForestRegressorSuite.compareAPIs import testImplicits._ diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala index 289db336ec..20ba69a5ad 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala @@ -321,7 +321,7 @@ class TrainValidationSplitSuite } } -object TrainValidationSplitSuite extends SparkFunSuite{ +object TrainValidationSplitSuite extends SparkFunSuite { abstract class MyModel extends Model[MyModel] diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 41c8feb8a6..287ef127e6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -21,6 +21,7 @@ import scala.util.Random import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, Vector => BV} import breeze.stats.distributions.{Multinomial => BrzMultinomial} +import breeze.stats.distributions.Rand.FixedSeed.randBasis import org.scalatest.exceptions.TestFailedException import org.apache.spark.{SparkException, SparkFunSuite} diff --git a/pom.xml b/pom.xml index 7b45220780..5465ca50e4 100644 --- a/pom.xml +++ b/pom.xml @@ -198,7 +198,7 @@ 4.8 1.1 3.141.59 - 2.50.0 + 
2.62.0 1.8 1.1.0 1.5.0 @@ -1084,7 +1084,7 @@ org.scalanlp breeze_${scala.binary.version} - 1.2 + 2.0 org.apache.commons @@ -3746,26 +3746,6 @@ ${test.debug.suite} - - - - netlib-lgpl - - - com.github.fommil.netlib - all - ${netlib.java.version} - pom - - - only-eclipse diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index fb71155657..3f3d857547 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -54,7 +54,15 @@ object MimaExcludes { ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.Classifier.getNumClasses"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.Classifier.getNumClasses$default$2"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.OneVsRest.extractInstances"), - ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.OneVsRestModel.extractInstances") + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.OneVsRestModel.extractInstances"), + + // [SPARK-39703][SPARK-39062] Mima complains with Scala 2.13 for the changes in DeployMessages + ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.deploy.DeployMessages$LaunchExecutor$"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.DeployMessages#RequestExecutors.requestedTotal"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.deploy.DeployMessages#RequestExecutors.copy"), + ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.deploy.DeployMessages#RequestExecutors.copy$default$2"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.deploy.DeployMessages#RequestExecutors.this"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.deploy.DeployMessages#RequestExecutors.apply") ) // Exclude rules for 3.3.x from 3.2.0 diff --git 
a/python/docs/source/reference/pyspark.sql/catalog.rst b/python/docs/source/reference/pyspark.sql/catalog.rst index 8267e06410..742af104df 100644 --- a/python/docs/source/reference/pyspark.sql/catalog.rst +++ b/python/docs/source/reference/pyspark.sql/catalog.rst @@ -29,12 +29,17 @@ Catalog Catalog.clearCache Catalog.createExternalTable Catalog.createTable + Catalog.currentCatalog Catalog.currentDatabase Catalog.databaseExists Catalog.dropGlobalTempView Catalog.dropTempView Catalog.functionExists + Catalog.getDatabase + Catalog.getFunction + Catalog.getTable Catalog.isCached + Catalog.listCatalogs Catalog.listColumns Catalog.listDatabases Catalog.listFunctions @@ -43,6 +48,7 @@ Catalog Catalog.refreshByPath Catalog.refreshTable Catalog.registerFunction + Catalog.setCurrentCatalog Catalog.setCurrentDatabase Catalog.tableExists Catalog.uncacheTable diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index d49af66479..1a2e38f81e 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -424,7 +424,7 @@ def computeSVD( >>> svd_model.s DenseVector([3.4641, 3.1623]) >>> svd_model.V - DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0) + DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, ...0.0], 0) """ j_model = self._java_matrix_wrapper.call("computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) @@ -858,7 +858,7 @@ def computeSVD( >>> svd_model.s DenseVector([3.4641, 3.1623]) >>> svd_model.V - DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0) + DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, ...0.0], 0) """ j_model = self._java_matrix_wrapper.call("computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) diff --git a/python/pyspark/pandas/numpy_compat.py b/python/pyspark/pandas/numpy_compat.py index ea72fa658e..f9b7bd67a9 100644 
--- a/python/pyspark/pandas/numpy_compat.py +++ b/python/pyspark/pandas/numpy_compat.py @@ -166,7 +166,7 @@ def maybe_dispatch_ufunc_to_dunder_op( "true_divide": "truediv", "power": "pow", "remainder": "mod", - "divide": "div", + "divide": "truediv", "equal": "eq", "not_equal": "ne", "less": "lt", diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index a7852c110f..838077ed7c 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -5024,7 +5024,7 @@ def replace( else: if regex: # to_replace must be a string - cond = self.spark.column.rlike(to_replace) + cond = self.spark.column.rlike(cast(str, to_replace)) else: cond = self.spark.column.isin(to_replace) # to_replace may be a scalar diff --git a/python/pyspark/pandas/tests/test_resample.py b/python/pyspark/pandas/tests/test_resample.py index e9359b0a8a..390b41fa30 100644 --- a/python/pyspark/pandas/tests/test_resample.py +++ b/python/pyspark/pandas/tests/test_resample.py @@ -94,7 +94,7 @@ def pdf5(self): @property def pdf6(self): np.random.seed(55) - index = pd.date_range(start="2022-05-02 03:04:05", end="2022-05-03 06:07:08", freq="1S") + index = pd.date_range(start="2022-05-02 03:04:05", end="2022-05-02 06:07:08", freq="1S") return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB")) @property @@ -229,14 +229,14 @@ def test_dataframe_resample(self): self._test_resample( self.pdf1, self.psdf1, - ["Y", "3Y", "M", "9M", "D", "17D"], + ["3Y", "9M", "17D"], ["min", "max", "sum", "mean", "std", "var"], ) - self._test_resample(self.pdf2, self.psdf2, ["3A", "A", "11M", "D"], ["sum"]) - self._test_resample(self.pdf3, self.psdf3, ["27H", "1D", "2D", "1M"], ["sum"]) - self._test_resample(self.pdf4, self.psdf4, ["1H", "5H", "D", "2D"], ["sum"]) - self._test_resample(self.pdf5, self.psdf5, ["1T", "2T", "5MIN", "1H", "2H", "D"], ["sum"]) - self._test_resample(self.pdf6, self.psdf6, ["1S", "2S", "1MIN", "H", "2H"], ["sum"]) + 
self._test_resample(self.pdf2, self.psdf2, ["3A", "11M", "D"], ["sum"]) + self._test_resample(self.pdf3, self.psdf3, ["2D", "1M"], ["sum"]) + self._test_resample(self.pdf4, self.psdf4, ["1H", "2D"], ["sum"]) + self._test_resample(self.pdf5, self.psdf5, ["11T", "55MIN", "2H", "D"], ["sum"]) + self._test_resample(self.pdf6, self.psdf6, ["29S", "10MIN", "3H"], ["sum"]) def test_series_resample(self): self._test_resample(self.pdf1.A, self.psdf1.A, ["4Y"], ["sum"]) @@ -244,7 +244,7 @@ def test_series_resample(self): self._test_resample(self.pdf3.A, self.psdf3.A, ["18H"], ["sum"]) self._test_resample(self.pdf4.A, self.psdf4.A, ["6D"], ["sum"]) self._test_resample(self.pdf5.A, self.psdf5.A, ["47T"], ["sum"]) - self._test_resample(self.pdf6.A, self.psdf6.A, ["37S"], ["sum"]) + self._test_resample(self.pdf6.A, self.psdf6.A, ["111S"], ["sum"]) def test_resample_on(self): np.random.seed(77) diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 6d38a37f6a..548750d712 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -36,6 +36,7 @@ class CatalogMetadata(NamedTuple): class Database(NamedTuple): name: str + catalog: Optional[str] description: Optional[str] locationUri: str @@ -67,6 +68,8 @@ class Column(NamedTuple): class Function(NamedTuple): name: str + catalog: Optional[str] + namespace: Optional[List[str]] description: Optional[str] className: str isTemporary: bool @@ -139,11 +142,40 @@ def listDatabases(self) -> List[Database]: jdb = iter.next() databases.append( Database( - name=jdb.name(), description=jdb.description(), locationUri=jdb.locationUri() + name=jdb.name(), + catalog=jdb.catalog(), + description=jdb.description(), + locationUri=jdb.locationUri(), ) ) return databases + def getDatabase(self, dbName: str) -> Database: + """Get the database with the specified name. + This throws an :class:`AnalysisException` when the database cannot be found. + + .. 
versionadded:: 3.4.0 + + Parameters + ---------- + dbName : str + name of the database to check existence. + + Examples + -------- + >>> spark.catalog.getDatabase("default") + Database(name='default', catalog=None, description='default database', ... + >>> spark.catalog.getDatabase("spark_catalog.default") + Database(name='default', catalog='spark_catalog', description='default database', ... + """ + jdb = self._jcatalog.getDatabase(dbName) + return Database( + name=jdb.name(), + catalog=jdb.catalog(), + description=jdb.description(), + locationUri=jdb.locationUri(), + ) + def databaseExists(self, dbName: str) -> bool: """Check if the database with the specified name exists. @@ -212,7 +244,7 @@ def listTables(self, dbName: Optional[str] = None) -> List[Table]: def getTable(self, tableName: str) -> Table: """Get the table or view with the specified name. This table can be a temporary view or a - table/view. This throws an AnalysisException when no Table can be found. + table/view. This throws an :class:`AnalysisException` when no Table can be found. .. versionadded:: 3.4.0 @@ -257,6 +289,9 @@ def listFunctions(self, dbName: Optional[str] = None) -> List[Function]: If no database is specified, the current database is used. This includes all temporary functions. + + .. versionchanged:: 3.4 + Allowed ``dbName`` to be qualified with catalog name. 
""" if dbName is None: dbName = self.currentDatabase() @@ -264,9 +299,17 @@ def listFunctions(self, dbName: Optional[str] = None) -> List[Function]: functions = [] while iter.hasNext(): jfunction = iter.next() + jnamespace = jfunction.namespace() + if jnamespace is not None: + namespace = [jnamespace[i] for i in range(0, len(jnamespace))] + else: + namespace = None + functions.append( Function( name=jfunction.name(), + catalog=jfunction.catalog(), + namespace=namespace, description=jfunction.description(), className=jfunction.className(), isTemporary=jfunction.isTemporary(), @@ -288,19 +331,75 @@ def functionExists(self, functionName: str, dbName: Optional[str] = None) -> boo name of the database to check function existence in. If no database is specified, the current database is used + .. deprecated:: 3.4.0 + + Returns ------- bool Indicating whether the function exists + .. versionchanged:: 3.4 + Allowed ``functionName`` to be qualified with catalog name + Examples -------- >>> spark.catalog.functionExists("unexisting_function") False + >>> spark.catalog.functionExists("default.unexisting_function") + False + >>> spark.catalog.functionExists("spark_catalog.default.unexisting_function") + False """ if dbName is None: - dbName = self.currentDatabase() - return self._jcatalog.functionExists(dbName, functionName) + return self._jcatalog.functionExists(functionName) + else: + warnings.warn( + "`dbName` has been deprecated since Spark 3.4 and might be removed in " + "a future version. Use functionExists(`dbName.tableName`) instead.", + FutureWarning, + ) + return self._jcatalog.functionExists(dbName, functionName) + + def getFunction(self, functionName: str) -> Function: + """Get the function with the specified name. This function can be a temporary function or a + function. This throws an :class:`AnalysisException` when the function cannot be found. + + .. versionadded:: 3.4.0 + + Parameters + ---------- + tableName : str + name of the function to check existence. 
+ + Examples + -------- + >>> func = spark.sql("CREATE FUNCTION my_func1 AS 'test.org.apache.spark.sql.MyDoubleAvg'") + >>> spark.catalog.getFunction("my_func1") + Function(name='my_func1', catalog=None, namespace=['default'], ... + >>> spark.catalog.getFunction("default.my_func1") + Function(name='my_func1', catalog=None, namespace=['default'], ... + >>> spark.catalog.getFunction("spark_catalog.default.my_func1") + Function(name='my_func1', catalog='spark_catalog', namespace=['default'], ... + >>> spark.catalog.getFunction("my_func2") + Traceback (most recent call last): + ... + pyspark.sql.utils.AnalysisException: ... + """ + jfunction = self._jcatalog.getFunction(functionName) + jnamespace = jfunction.namespace() + if jnamespace is not None: + namespace = [jnamespace[i] for i in range(0, len(jnamespace))] + else: + namespace = None + return Function( + name=jfunction.name(), + catalog=jfunction.catalog(), + namespace=namespace, + description=jfunction.description(), + className=jfunction.className(), + isTemporary=jfunction.isTemporary(), + ) def listColumns(self, tableName: str, dbName: Optional[str] = None) -> List[Column]: """Returns a list of columns for the given table/view in the specified database. @@ -309,14 +408,33 @@ def listColumns(self, tableName: str, dbName: Optional[str] = None) -> List[Colu .. versionadded:: 2.0.0 + Parameters + ---------- + tableName : str + name of the table to check existence + dbName : str, optional + name of the database to check table existence in. + + .. deprecated:: 3.4.0 + + .. versionchanged:: 3.4 + Allowed ``tableName`` to be qualified with catalog name when ``dbName`` is None. + Notes ----- the order of arguments here is different from that of its JVM counterpart because Python does not support method overloading. 
""" if dbName is None: - dbName = self.currentDatabase() - iter = self._jcatalog.listColumns(dbName, tableName).toLocalIterator() + iter = self._jcatalog.listColumns(tableName).toLocalIterator() + else: + warnings.warn( + "`dbName` has been deprecated since Spark 3.4 and might be removed in " + "a future version. Use listColumns(`dbName.tableName`) instead.", + FutureWarning, + ) + iter = self._jcatalog.listColumns(dbName, tableName).toLocalIterator() + columns = [] while iter.hasNext(): jcolumn = iter.next() @@ -590,7 +708,11 @@ def clearCache(self) -> None: @since(2.0) def refreshTable(self, tableName: str) -> None: - """Invalidates and refreshes all the cached data and metadata of the given table.""" + """Invalidates and refreshes all the cached data and metadata of the given table. + + .. versionchanged:: 3.4 + Allowed ``tableName`` to be qualified with catalog name. + """ self._jcatalog.refreshTable(tableName) @since("2.1.1") diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 04458d560e..31954a9569 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -573,57 +573,6 @@ def __iter__(self) -> None: >>> df.filter(df.name.contains('o')).collect() [Row(age=5, name='Bob')] """ - _rlike_doc = """ - SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex - match. - - Parameters - ---------- - other : str - an extended regex expression - - Examples - -------- - >>> df.filter(df.name.rlike('ice$')).collect() - [Row(age=2, name='Alice')] - """ - _like_doc = """ - SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match. - - Parameters - ---------- - other : str - a SQL LIKE pattern - - See Also - -------- - pyspark.sql.Column.rlike - - Examples - -------- - >>> df.filter(df.name.like('Al%')).collect() - [Row(age=2, name='Alice')] - """ - _ilike_doc = """ - SQL ILIKE expression (case insensitive LIKE). 
Returns a boolean :class:`Column` - based on a case insensitive match. - - .. versionadded:: 3.3.0 - - Parameters - ---------- - other : str - a SQL LIKE pattern - - See Also - -------- - pyspark.sql.Column.rlike - - Examples - -------- - >>> df.filter(df.name.ilike('%Ice')).collect() - [Row(age=2, name='Alice')] - """ _startswith_doc = """ String starts with. Returns a boolean :class:`Column` based on a string match. @@ -656,12 +605,72 @@ def __iter__(self) -> None: """ contains = _bin_op("contains", _contains_doc) - rlike = _bin_op("rlike", _rlike_doc) - like = _bin_op("like", _like_doc) - ilike = _bin_op("ilike", _ilike_doc) startswith = _bin_op("startsWith", _startswith_doc) endswith = _bin_op("endsWith", _endswith_doc) + def like(self: "Column", other: str) -> "Column": + """ + SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match. + + Parameters + ---------- + other : str + a SQL LIKE pattern + + See Also + -------- + pyspark.sql.Column.rlike + + Examples + -------- + >>> df.filter(df.name.like('Al%')).collect() + [Row(age=2, name='Alice')] + """ + njc = getattr(self._jc, "like")(other) + return Column(njc) + + def rlike(self: "Column", other: str) -> "Column": + """ + SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex + match. + + Parameters + ---------- + other : str + an extended regex expression + + Examples + -------- + >>> df.filter(df.name.rlike('ice$')).collect() + [Row(age=2, name='Alice')] + """ + njc = getattr(self._jc, "rlike")(other) + return Column(njc) + + def ilike(self: "Column", other: str) -> "Column": + """ + SQL ILIKE expression (case insensitive LIKE). Returns a boolean :class:`Column` + based on a case insensitive match. + + .. 
versionadded:: 3.3.0 + + Parameters + ---------- + other : str + a SQL LIKE pattern + + See Also + -------- + pyspark.sql.Column.rlike + + Examples + -------- + >>> df.filter(df.name.ilike('%Ice')).collect() + [Row(age=2, name='Alice')] + """ + njc = getattr(self._jc, "ilike")(other) + return Column(njc) + @overload def substr(self, startPos: int, length: int) -> "Column": ... diff --git a/python/pyspark/sql/tests/test_catalog.py b/python/pyspark/sql/tests/test_catalog.py index 53c2489015..7d81234bce 100644 --- a/python/pyspark/sql/tests/test_catalog.py +++ b/python/pyspark/sql/tests/test_catalog.py @@ -53,6 +53,14 @@ def test_database_exists(self): self.assertTrue(spark.catalog.databaseExists("spark_catalog.some_db")) self.assertFalse(spark.catalog.databaseExists("spark_catalog.some_db2")) + def test_get_database(self): + spark = self.spark + with self.database("some_db"): + spark.sql("CREATE DATABASE some_db") + db = spark.catalog.getDatabase("spark_catalog.some_db") + self.assertEqual(db.name, "some_db") + self.assertEqual(db.catalog, "spark_catalog") + def test_list_tables(self): from pyspark.sql.catalog import Table @@ -176,8 +184,6 @@ def compareTables(t1, t2): ) def test_list_functions(self): - from pyspark.sql.catalog import Function - spark = self.spark with self.database("some_db"): spark.sql("CREATE DATABASE some_db") @@ -191,15 +197,12 @@ def test_list_functions(self): self.assertTrue("to_timestamp" in functions) self.assertTrue("to_unix_timestamp" in functions) self.assertTrue("current_database" in functions) + self.assertEqual(functions["+"].name, "+") + self.assertEqual(functions["+"].description, None) self.assertEqual( - functions["+"], - Function( - name="+", - description=None, - className="org.apache.spark.sql.catalyst.expressions.Add", - isTemporary=True, - ), + functions["+"].className, "org.apache.spark.sql.catalyst.expressions.Add" ) + self.assertTrue(functions["+"].isTemporary) self.assertEqual(functions, functionsDefault) with 
self.function("func1", "some_db.func2"): @@ -229,11 +232,26 @@ def test_function_exists(self): spark = self.spark with self.function("func1"): self.assertFalse(spark.catalog.functionExists("func1")) + self.assertFalse(spark.catalog.functionExists("default.func1")) + self.assertFalse(spark.catalog.functionExists("spark_catalog.default.func1")) self.assertFalse(spark.catalog.functionExists("func1", "default")) spark.sql("CREATE FUNCTION func1 AS 'org.apache.spark.data.bricks'") self.assertTrue(spark.catalog.functionExists("func1")) + self.assertTrue(spark.catalog.functionExists("default.func1")) + self.assertTrue(spark.catalog.functionExists("spark_catalog.default.func1")) self.assertTrue(spark.catalog.functionExists("func1", "default")) + def test_get_function(self): + spark = self.spark + with self.function("func1"): + spark.sql("CREATE FUNCTION func1 AS 'org.apache.spark.data.bricks'") + func1 = spark.catalog.getFunction("spark_catalog.default.func1") + self.assertTrue(func1.name == "func1") + self.assertTrue(func1.namespace == ["default"]) + self.assertTrue(func1.catalog == "spark_catalog") + self.assertTrue(func1.className == "org.apache.spark.data.bricks") + self.assertFalse(func1.isTemporary) + def test_list_columns(self): from pyspark.sql.catalog import Column @@ -245,7 +263,9 @@ def test_list_columns(self): spark.sql( "CREATE TABLE some_db.tab2 (nickname STRING, tolerance FLOAT) USING parquet" ) - columns = sorted(spark.catalog.listColumns("tab1"), key=lambda c: c.name) + columns = sorted( + spark.catalog.listColumns("spark_catalog.default.tab1"), key=lambda c: c.name + ) columnsDefault = sorted( spark.catalog.listColumns("tab1", "default"), key=lambda c: c.name ) @@ -352,6 +372,26 @@ def test_get_table(self): self.assertEqual(spark.catalog.getTable("default.tab1").catalog, "spark_catalog") self.assertEqual(spark.catalog.getTable("spark_catalog.default.tab1").name, "tab1") + def test_refresh_table(self): + import os + import tempfile + + spark = self.spark + 
with tempfile.TemporaryDirectory() as tmp_dir: + with self.table("my_tab"): + spark.sql( + "CREATE TABLE my_tab (col STRING) USING TEXT LOCATION '{}'".format(tmp_dir) + ) + spark.sql("INSERT INTO my_tab SELECT 'abc'") + spark.catalog.cacheTable("my_tab") + self.assertEqual(spark.table("my_tab").count(), 1) + + os.system("rm -rf {}/*".format(tmp_dir)) + self.assertEqual(spark.table("my_tab").count(), 1) + + spark.catalog.refreshTable("spark_catalog.default.my_tab") + self.assertEqual(spark.table("my_tab").count(), 0) + if __name__ == "__main__": import unittest diff --git a/python/run-tests.py b/python/run-tests.py index 1e3c1e8544..436b29fa18 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -96,7 +96,8 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): os.mkdir(metastore_dir) # Also override the JVM's temp directory by setting driver and executor options. - java_options = "-Djava.io.tmpdir={0} -Dio.netty.tryReflectionSetAccessible=true".format(tmp_dir) + java_options = "-Djava.io.tmpdir={0}".format(tmp_dir) + java_options = java_options + " -Dio.netty.tryReflectionSetAccessible=true -Xss4M" spark_args = [ "--conf", "spark.driver.extraJavaOptions='{0}'".format(java_options), "--conf", "spark.executor.extraJavaOptions='{0}'".format(java_options), diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala index 8102ca84af..deb0efffe6 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala @@ -72,6 +72,7 @@ private[spark] class BasicExecutorFeatureStep( kubernetesConf.sparkConf, isPythonApp, Map.empty) + assert(execResources.cores.nonEmpty) private val 
executorMemoryString = s"${execResources.executorMemoryMiB}m" // we don't include any kubernetes conf specific requests or limits when using custom @@ -80,7 +81,7 @@ private[spark] class BasicExecutorFeatureStep( if (isDefaultProfile && kubernetesConf.sparkConf.contains(KUBERNETES_EXECUTOR_REQUEST_CORES)) { kubernetesConf.get(KUBERNETES_EXECUTOR_REQUEST_CORES).get } else { - execResources.cores.toString + execResources.cores.get.toString } private val executorLimitCores = kubernetesConf.get(KUBERNETES_EXECUTOR_LIMIT_CORES) @@ -123,7 +124,7 @@ private[spark] class BasicExecutorFeatureStep( val executorEnv: Seq[EnvVar] = { (Seq( (ENV_DRIVER_URL, driverUrl), - (ENV_EXECUTOR_CORES, execResources.cores.toString), + (ENV_EXECUTOR_CORES, execResources.cores.get.toString), (ENV_EXECUTOR_MEMORY, executorMemoryString), (ENV_APPLICATION_ID, kubernetesConf.appId), // This is to set the SPARK_CONF_DIR to be /opt/spark/conf diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index 3519efd3fc..9bdc30e446 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -25,7 +25,7 @@ import scala.collection.mutable import scala.util.control.NonFatal import io.fabric8.kubernetes.api.model.{HasMetadata, PersistentVolumeClaim, Pod, PodBuilder} -import io.fabric8.kubernetes.client.KubernetesClient +import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException} import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.k8s.Config._ @@ -360,16 +360,22 @@ class ExecutorPodsAllocator( private def getReusablePVCs(applicationId: String, pvcsInUse: 
Seq[String]) = { if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && conf.get(KUBERNETES_DRIVER_REUSE_PVC) && driverPod.nonEmpty) { - val createdPVCs = kubernetesClient - .persistentVolumeClaims - .withLabel("spark-app-selector", applicationId) - .list() - .getItems - .asScala - - val reusablePVCs = createdPVCs.filterNot(pvc => pvcsInUse.contains(pvc.getMetadata.getName)) - logInfo(s"Found ${reusablePVCs.size} reusable PVCs from ${createdPVCs.size} PVCs") - reusablePVCs + try { + val createdPVCs = kubernetesClient + .persistentVolumeClaims + .withLabel("spark-app-selector", applicationId) + .list() + .getItems + .asScala + + val reusablePVCs = createdPVCs.filterNot(pvc => pvcsInUse.contains(pvc.getMetadata.getName)) + logInfo(s"Found ${reusablePVCs.size} reusable PVCs from ${createdPVCs.size} PVCs") + reusablePVCs + } catch { + case _: KubernetesClientException => + logInfo("Cannot list PVC resources. Please check account permissions.") + mutable.Buffer.empty[PersistentVolumeClaim] + } } else { mutable.Buffer.empty[PersistentVolumeClaim] } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala index 84c4f3b8ba..420edddb69 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala @@ -16,10 +16,6 @@ */ package org.apache.spark.deploy.k8s.features -import java.io.File -import java.nio.charset.StandardCharsets -import java.nio.file.Files - import scala.collection.JavaConverters._ import com.google.common.net.InternetDomainName @@ -283,21 +279,20 @@ class BasicExecutorFeatureStepSuite extends SparkFunSuite with BeforeAndAfter { } test("Auth secret shouldn't propagate if files 
are loaded.") { - val secretDir = Utils.createTempDir("temp-secret") - val secretFile = new File(secretDir, "secret-file.txt") - Files.write(secretFile.toPath, "some-secret".getBytes(StandardCharsets.UTF_8)) - val conf = baseConf.clone() - .set(config.NETWORK_AUTH_ENABLED, true) - .set(config.AUTH_SECRET_FILE, secretFile.getAbsolutePath) - .set("spark.master", "k8s://127.0.0.1") - val secMgr = new SecurityManager(conf) - secMgr.initializeAuth() - val step = new BasicExecutorFeatureStep(KubernetesTestConf.createExecutorConf(sparkConf = conf), - secMgr, defaultProfile) + withSecretFile("some-secret") { secretFile => + val conf = baseConf.clone() + .set(config.NETWORK_AUTH_ENABLED, true) + .set(config.AUTH_SECRET_FILE, secretFile.getAbsolutePath) + .set("spark.master", "k8s://127.0.0.1") + val secMgr = new SecurityManager(conf) + secMgr.initializeAuth() + val step = new BasicExecutorFeatureStep( + KubernetesTestConf.createExecutorConf(sparkConf = conf), secMgr, defaultProfile) - val executor = step.configurePod(SparkPod.initialPod()) - assert(!KubernetesFeaturesTestUtils.containerHasEnvVar( - executor.container, SecurityManager.ENV_AUTH_SECRET)) + val executor = step.configurePod(SparkPod.initialPod()) + assert(!KubernetesFeaturesTestUtils.containerHasEnvVar( + executor.container, SecurityManager.ENV_AUTH_SECRET)) + } } test("SPARK-32661 test executor offheap memory") { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala index 87bd8ef3d9..7ce0b57d1e 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala @@ -20,9 +20,10 @@ import java.time.Instant import 
java.util.concurrent.atomic.AtomicInteger import scala.collection.JavaConverters._ +import scala.collection.mutable import io.fabric8.kubernetes.api.model._ -import io.fabric8.kubernetes.client.KubernetesClient +import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException} import io.fabric8.kubernetes.client.dsl.PodResource import org.mockito.{Mock, MockitoAnnotations} import org.mockito.ArgumentMatchers.{any, eq => meq} @@ -762,6 +763,13 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { " namespace default")) } + test("SPARK-39688: getReusablePVCs should handle accounts with no PVC permission") { + val getReusablePVCs = + PrivateMethod[mutable.Buffer[PersistentVolumeClaim]](Symbol("getReusablePVCs")) + when(persistentVolumeClaimList.getItems).thenThrow(new KubernetesClientException("Error")) + podsAllocatorUnderTest invokePrivate getReusablePVCs("appId", Seq.empty[String]) + } + private def executorPodAnswer(): Answer[KubernetesExecutorSpec] = (invocation: InvocationOnMock) => { val k8sConf: KubernetesExecutorConf = invocation.getArgument(0) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala index 9c31f9f912..c3af83118f 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala @@ -131,6 +131,10 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn pollEvents) } + after { + ResourceProfile.clearDefaultProfile() + } + test("Start all components") { schedulerBackendUnderTest.start() 
verify(podAllocator).setTotalExpectedExecutors(Map(defaultProfile -> 3)) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilderSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilderSuite.scala index 97f7f4876e..17c2d4a938 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilderSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBuilderSuite.scala @@ -29,6 +29,11 @@ class KubernetesExecutorBuilderSuite extends PodBuilderSuite { val TEST_ANNOTATION_KEY: String = "executor-annotation-key" val TEST_ANNOTATION_VALUE: String = "executor-annotation-value" + override protected def afterEach(): Unit = { + ResourceProfile.clearDefaultProfile() + super.afterEach() + } + override protected def templateFileConf: ConfigEntry[_] = { Config.KUBERNETES_EXECUTOR_PODTEMPLATE_FILE } diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala index 7484e3b836..e628453740 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.deploy.TestPrematureExit class MesosClusterDispatcherSuite extends SparkFunSuite - with TestPrematureExit{ + with TestPrematureExit { test("prints usage on empty input") { testPrematureExit(Array[String](), diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala 
b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 6158fbf8cd..a90ab180d8 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -342,8 +342,10 @@ private[yarn] class YarnAllocator( } else { customSparkResources } - val resource = - Resource.newInstance(resourcesWithDefaults.totalMemMiB.toInt, resourcesWithDefaults.cores) + + assert(resourcesWithDefaults.cores.nonEmpty) + val resource = Resource.newInstance( + resourcesWithDefaults.totalMemMiB.toInt, resourcesWithDefaults.cores.get) ResourceRequestHelper.setResourceRequests(customResources, resource) logDebug(s"Created resource capability: $resource") rpIdToYarnResource.putIfAbsent(rp.id, resource) @@ -753,7 +755,11 @@ private[yarn] class YarnAllocator( val defaultResources = ResourceProfile.getDefaultProfileExecutorResources(sparkConf) val containerMem = rp.executorResources.get(ResourceProfile.MEMORY). 
map(_.amount).getOrElse(defaultResources.executorMemoryMiB).toInt - val containerCores = rp.getExecutorCores.getOrElse(defaultResources.cores) + + assert(defaultResources.cores.nonEmpty) + val defaultCores = defaultResources.cores.get + val containerCores = rp.getExecutorCores.getOrElse(defaultCores) + val rpRunningExecs = getOrUpdateRunningExecutorForRPId(rpId).size if (rpRunningExecs < getOrUpdateTargetNumExecutorsForRPId(rpId)) { getOrUpdateNumExecutorsStartingForRPId(rpId).incrementAndGet() diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/GeneralAggregateFunc.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/GeneralAggregateFunc.java index 81838074fb..04d48e2434 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/GeneralAggregateFunc.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/expressions/aggregate/GeneralAggregateFunc.java @@ -35,6 +35,10 @@ *
  • COVAR_POP(input1, input2)
    Since 3.3.0
  • *
  • COVAR_SAMP(input1, input2)
    Since 3.3.0
  • *
  • CORR(input1, input2)
    Since 3.3.0
  • + *
  • REGR_INTERCEPT(input1, input2)
    Since 3.4.0
  • + *
  • REGR_R2(input1, input2)
    Since 3.4.0
  • + *
  • REGR_SLOPE(input1, input2)
    Since 3.4.0
  • + *
  • REGR_SXY(input1, input2)
    Since 3.4.0
  • * * * @since 3.3.0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index b236cf33af..20c719aec6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -530,7 +530,6 @@ object FunctionRegistry { expression[ToNumber]("to_number"), expression[TryToNumber]("try_to_number"), expression[ToCharacter]("to_char"), - expression[TryToCharacter]("try_to_char"), expression[GetJsonObject]("get_json_object"), expression[InitCap]("initcap"), expression[StringInstr]("instr"), @@ -586,6 +585,7 @@ object FunctionRegistry { expression[XPathShort]("xpath_short"), expression[XPathString]("xpath_string"), expression[RegExpCount]("regexp_count"), + expression[RegExpSubStr]("regexp_substr"), // datetime functions expression[AddMonths]("add_months"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 8128babacc..16d89c9b2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -1805,6 +1805,13 @@ class SessionCatalog( }.distinct } + /** + * List all temporary functions. 
+ */ + def listTemporaryFunctions(): Seq[FunctionIdentifier] = { + (functionRegistry.listFunction() ++ tableFunctionRegistry.listFunction()) + .filter(isTemporaryFunction) + } // ----------------- // | Other methods | diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 9daa50ba5a..3e92c3d25e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -215,7 +215,11 @@ class CSVOptions( */ val lineSeparator: Option[String] = parameters.get("lineSep").map { sep => require(sep.nonEmpty, "'lineSep' cannot be an empty string.") - require(sep.length == 1, "'lineSep' can contain only 1 character.") + // Intentionally allow it up to 2 for Window's CRLF although multiple + // characters have an issue with quotes. This is intentionally undocumented. + require(sep.length <= 2, "'lineSep' can contain only 1 character.") + if (sep.length == 2) logWarning("It is not recommended to set 'lineSep' " + + "with 2 characters due to the limitation of supporting multi-char 'lineSep' within quotes.") sep } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index c97dfe1970..a53914b5f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -25,7 +25,6 @@ import org.apache.spark.sql.catalyst.{InternalRow, JavaTypeInference, ScalaRefle import org.apache.spark.sql.catalyst.analysis.{Analyzer, GetColumnByOrdinal, SimpleAnalyzer, UnresolvedAttribute, UnresolvedExtractValue} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.{Deserializer, 
Serializer} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, InitializeJavaBean, Invoke, NewInstance} import org.apache.spark.sql.catalyst.optimizer.{ReassignLambdaVariableID, SimplifyCasts} import org.apache.spark.sql.catalyst.plans.logical.{CatalystSerde, DeserializeToObject, LeafNode, LocalRelation} @@ -201,7 +200,7 @@ object ExpressionEncoder { override def apply(t: T): InternalRow = try { if (extractProjection == null) { inputRow = new GenericInternalRow(1) - extractProjection = GenerateUnsafeProjection.generate(expressions) + extractProjection = UnsafeProjection.create(expressions) } inputRow(0) = t extractProjection(inputRow) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallback.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallback.scala index 3b7219477b..0509b852cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallback.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallback.scala @@ -21,7 +21,6 @@ import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.util.Utils /** * Defines values for `SQLConf` config of fallback mode. Use for test only. 
@@ -43,9 +42,9 @@ abstract class CodeGeneratorWithInterpretedFallback[IN, OUT] extends Logging { val fallbackMode = CodegenObjectFactoryMode.withName(config) fallbackMode match { - case CodegenObjectFactoryMode.CODEGEN_ONLY if Utils.isTesting => + case CodegenObjectFactoryMode.CODEGEN_ONLY => createCodeGeneratedObject(in) - case CodegenObjectFactoryMode.NO_CODEGEN if Utils.isTesting => + case CodegenObjectFactoryMode.NO_CODEGEN => createInterpretedObject(in) case _ => try { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala index 974d4b5f86..9b52c7b07e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala @@ -44,11 +44,11 @@ case object Descending extends SortDirection { override def defaultNullOrdering: NullOrdering = NullsLast } -case object NullsFirst extends NullOrdering{ +case object NullsFirst extends NullOrdering { override def sql: String = "NULLS FIRST" } -case object NullsLast extends NullOrdering{ +case object NullsLast extends NullOrdering { override def sql: String = "NULLS LAST" } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala index 0bf573334f..ee8ef4633d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala @@ -255,72 +255,3 @@ case class ToCharacter(left: Expression, right: Expression) newLeft: Expression, newRight: Expression): ToCharacter = copy(left = newLeft, right = newRight) } - -/** - * A function that converts decimal values to strings, returning 
NULL if the decimal value fails to - * match the format string. - */ -@ExpressionDescription( - usage = """ - - _FUNC_(numberExpr, formatExpr) - Convert `numberExpr` to a string based on the `formatExpr`. - Returns NULL if the conversion fails. The format follows the same semantics as the - to_char function. - """, - examples = """ - Examples: - > SELECT _FUNC_(454, '999'); - 454 - > SELECT _FUNC_(454.00, '000D00'); - 454.00 - > SELECT _FUNC_(12454, '99G999'); - 12,454 - > SELECT _FUNC_(78.12, '$99.99'); - $78.12 - > SELECT _FUNC_(-12454.8, '99G999D9S'); - 12,454.8- - """, - since = "3.4.0", - group = "string_funcs") -case class TryToCharacter(left: Expression, right: Expression) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { - private lazy val numberFormat = right.eval().toString.toUpperCase(Locale.ROOT) - private lazy val numberFormatter = new ToNumberParser(numberFormat, false) - - override def dataType: DataType = StringType - override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType, StringType) - override def nullable: Boolean = true - override def checkInputDataTypes(): TypeCheckResult = - ToCharacter(left, right).checkInputDataTypes() - override def prettyName: String = "try_to_char" - override def nullSafeEval(decimal: Any, format: Any): Any = { - val input = decimal.asInstanceOf[Decimal] - numberFormatter.format(input) - } - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val builder = - ctx.addReferenceObj("builder", numberFormatter, classOf[ToNumberParser].getName) - val eval = left.genCode(ctx) - ev.copy(code = - code""" - |${eval.code} - |boolean ${ev.isNull} = ${eval.isNull}; - |${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; - |if (!${ev.isNull}) { - | UTF8String result = $builder.format(${eval.value}); - | if (result == null) { - | ${ev.isNull} = true; - | ${ev.value} = null; - | } else { - | ${ev.isNull} = false; - | ${ev.value} = result; - | 
} - |} - """.stripMargin) - } - - override protected def withNewChildrenInternal( - newLeft: Expression, - newRight: Expression): TryToCharacter = - copy(left = newLeft, right = newRight) -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index fe982b2382..12e73f2e9f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -555,8 +555,27 @@ case class NewInstance( } override def eval(input: InternalRow): Any = { - val argValues = arguments.map(_.eval(input)) - constructor(argValues.map(_.asInstanceOf[AnyRef])) + var i = 0 + val len = arguments.length + var resultNull = false + while (i < len) { + val result = arguments(i).eval(input).asInstanceOf[Object] + evaluatedArgs(i) = result + resultNull = resultNull || (result == null && needNullCheckForIndex(i)) + i += 1 + } + if (needNullCheck && resultNull) { + // return null if one of arguments is null + null + } else { + try { + constructor(evaluatedArgs) + } catch { + // Re-throw the original exception. 
+ case e: java.lang.reflect.InvocationTargetException if e.getCause != null => + throw e.getCause + } + } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -1916,12 +1935,11 @@ case class ValidateExternalType(child: Expression, expected: DataType, lenient: } } - override def eval(input: InternalRow): Any = { - val result = child.eval(input) - if (checkType(result)) { - result + override def nullSafeEval(input: Any): Any = { + if (checkType(input)) { + input } else { - throw new RuntimeException(s"${result.getClass.getName}$errMsg") + throw new RuntimeException(s"${input.getClass.getName}$errMsg") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 126e0b6dc1..b240e849f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -84,16 +84,12 @@ abstract class StringRegexExpression extends BinaryExpression Arguments: * str - a string expression * pattern - a string expression. The pattern is a string which is matched literally, with - exception to the following special symbols: - - _ matches any one character in the input (similar to . in posix regular expressions) - + exception to the following special symbols:

    + _ matches any one character in the input (similar to . in posix regular expressions)\ % matches zero or more characters in the input (similar to .* in posix regular - expressions) - + expressions)

    Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order - to match "\abc", the pattern should be "\\abc". - + to match "\abc", the pattern should be "\\abc".

    When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back to Spark 1.6 behavior regarding string literal parsing. For example, if the config is enabled, the pattern to match "\abc" should be "\abc". @@ -189,7 +185,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) copy(left = newLeft, right = newRight) } -// scalastyle:off line.contains.tab +// scalastyle:off line.contains.tab line.size.limit /** * Simple RegEx case-insensitive pattern matching function */ @@ -200,16 +196,12 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) Arguments: * str - a string expression * pattern - a string expression. The pattern is a string which is matched literally and - case-insensitively, with exception to the following special symbols: - - _ matches any one character in the input (similar to . in posix regular expressions) - + case-insensitively, with exception to the following special symbols:

    + _ matches any one character in the input (similar to . in posix regular expressions)

    % matches zero or more characters in the input (similar to .* in posix regular - expressions) - + expressions)

    Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order - to match "\abc", the pattern should be "\\abc". - + to match "\abc", the pattern should be "\\abc".

    When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back to Spark 1.6 behavior regarding string literal parsing. For example, if the config is enabled, the pattern to match "\abc" should be "\abc". @@ -237,7 +229,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) """, since = "3.3.0", group = "predicate_funcs") -// scalastyle:on line.contains.tab +// scalastyle:on line.contains.tab line.size.limit case class ILike( left: Expression, right: Expression, @@ -574,12 +566,10 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression) Arguments: * str - a string expression to search for a regular expression pattern match. * regexp - a string representing a regular expression. The regex string should be a - Java regular expression. - + Java regular expression.

    Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL parser. For example, to match "\abc", a regular expression for `regexp` can be - "^\\abc$". - + "^\\abc$".

    There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to fallback to the Spark 1.6 behavior regarding string literal parsing. For example, if the config is enabled, the `regexp` that can match "\abc" is "^\abc$". @@ -783,12 +773,10 @@ abstract class RegExpExtractBase Arguments: * str - a string expression. * regexp - a string representing a regular expression. The regex string should be a - Java regular expression. - + Java regular expression.

    Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL parser. For example, to match "\abc", a regular expression for `regexp` can be - "^\\abc$". - + "^\\abc$".

    There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to fallback to the Spark 1.6 behavior regarding string literal parsing. For example, if the config is enabled, the `regexp` that can match "\abc" is "^\abc$". @@ -888,12 +876,10 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio Arguments: * str - a string expression. * regexp - a string representing a regular expression. The regex string should be a - Java regular expression. - + Java regular expression.

    Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL parser. For example, to match "\abc", a regular expression for `regexp` can be - "^\\abc$". - + "^\\abc$".

    There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to fallback to the Spark 1.6 behavior regarding string literal parsing. For example, if the config is enabled, the `regexp` that can match "\abc" is "^\abc$". @@ -1018,3 +1004,42 @@ case class RegExpCount(left: Expression, right: Expression) newChildren: IndexedSeq[Expression]): RegExpCount = copy(left = newChildren(0), right = newChildren(1)) } + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(str, regexp) - Returns the substring that matches the regular expression `regexp` within the string `str`. If the regular expression is not found, the result is null. + """, + arguments = """ + Arguments: + * str - a string expression. + * regexp - a string representing a regular expression. The regex string should be a Java regular expression. + """, + examples = """ + Examples: + > SELECT _FUNC_('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en'); + Steven + > SELECT _FUNC_('Steven Jones and Stephen Smith are the best players', 'Jeck'); + NULL + """, + since = "3.4.0", + group = "string_funcs") +// scalastyle:on line.size.limit +case class RegExpSubStr(left: Expression, right: Expression) + extends RuntimeReplaceable with ImplicitCastInputTypes { + + override lazy val replacement: Expression = + new NullIf( + RegExpExtract(subject = left, regexp = right, idx = Literal(0)), + Literal("")) + + override def prettyName: String = "regexp_substr" + + override def children: Seq[Expression] = Seq(left, right) + + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + + override protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression]): RegExpSubStr = + copy(left = newChildren(0), right = newChildren(1)) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/identifiers.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/identifiers.scala index 9cae2b622a..2de44d6f34 
100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/identifiers.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/identifiers.scala @@ -142,7 +142,7 @@ case class FunctionIdentifier(funcName: String, database: Option[String], catalo override val identifier: String = funcName def this(funcName: String) = this(funcName, None, None) - def this(table: String, database: Option[String]) = this(table, database, None) + def this(funcName: String, database: Option[String]) = this(funcName, database, None) override def toString: String = unquotedString } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index 6ba7907fda..977e9b1ab1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -314,25 +314,6 @@ object NestedColumnAliasing { } } -object GeneratorUnrequiredChildrenPruning { - def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match { - case p @ Project(_, g: Generate) => - val requiredAttrs = p.references ++ g.generator.references - val newChild = ColumnPruning.prunedChild(g.child, requiredAttrs) - val unrequired = g.generator.references -- p.references - val unrequiredIndices = newChild.output.zipWithIndex.filter(t => unrequired.contains(t._1)) - .map(_._2) - if (!newChild.fastEquals(g.child) || - unrequiredIndices.toSet != g.unrequiredChildIndex.toSet) { - Some(p.copy(child = g.copy(child = newChild, unrequiredChildIndex = unrequiredIndices))) - } else { - None - } - case _ => None - } -} - - /** * This prunes unnecessary nested columns from [[Generate]], or [[Project]] -> [[Generate]] */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index eda42a9adb..fa012aac4f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -862,12 +862,13 @@ object ColumnPruning extends Rule[LogicalPlan] { e.copy(child = prunedChild(child, e.references)) // prune unrequired references - // There are 2 types of pruning here: - // 1. For attributes in g.child.outputSet that is not used by the generator nor the project, - // we directly remove it from the output list of g.child. - // 2. For attributes that is not used by the project but it is used by the generator, we put - // it in g.unrequiredChildIndex to save memory usage. - case GeneratorUnrequiredChildrenPruning(rewrittenPlan) => rewrittenPlan + case p @ Project(_, g: Generate) if p.references != g.outputSet => + val requiredAttrs = p.references -- g.producedAttributes ++ g.generator.references + val newChild = prunedChild(g.child, requiredAttrs) + val unrequired = g.generator.references -- p.references + val unrequiredIndices = newChild.output.zipWithIndex.filter(t => unrequired.contains(t._1)) + .map(_._2) + p.copy(child = g.copy(child = newChild, unrequiredChildIndex = unrequiredIndices)) // prune unrequired nested fields from `Generate`. 
case GeneratorNestedColumnAliasing(rewrittenPlan) => rewrittenPlan @@ -928,7 +929,7 @@ object ColumnPruning extends Rule[LogicalPlan] { }) /** Applies a projection only when the child is producing unnecessary attributes */ - def prunedChild(c: LogicalPlan, allReferences: AttributeSet): LogicalPlan = + private def prunedChild(c: LogicalPlan, allReferences: AttributeSet) = if (!c.outputSet.subsetOf(allReferences)) { Project(c.output.filter(allReferences.contains), c) } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala index 59a302b1af..21799a5c68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala @@ -102,9 +102,7 @@ object BasicStatsPlanVisitor extends LogicalPlanVisitor[Statistics] { override def visitWindow(p: Window): Statistics = fallback(p) - override def visitSort(p: Sort): Statistics = { - BasicStatsPlanVisitor.visit(p.child) - } + override def visitSort(p: Sort): Statistics = fallback(p) override def visitTail(p: Tail): Statistics = { fallback(p) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala index 311dd31a96..77c728ba7c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala @@ -162,7 +162,7 @@ object SizeInBytesOnlyStatsPlanVisitor 
extends LogicalPlanVisitor[Statistics] { override def visitWindow(p: Window): Statistics = visitUnaryNode(p) - override def visitSort(p: Sort): Statistics = default(p) + override def visitSort(p: Sort): Statistics = p.child.stats override def visitTail(p: Tail): Statistics = { val limit = p.limitExpr.eval().asInstanceOf[Int] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 8081a3edc8..71d8a0740b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -886,7 +886,11 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product with Tre private def stringArgsForCatalogTable(table: CatalogTable): Seq[Any] = { table.storage.serde match { - case Some(serde) => table.identifier :: serde :: Nil + case Some(serde) + // SPARK-39564: don't print out serde to avoid introducing complicated and error-prone + // regex magic. + if !SQLConf.get.getConfString("spark.test.noSerdeInExplain", "false").toBoolean => + table.identifier :: serde :: Nil case _ => table.identifier :: Nil } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index f0ecbd0f9b..1b7857ead5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1548,7 +1548,8 @@ object SQLConf { "during tests. `FALLBACK` means trying codegen first and then falling back to " + "interpreted if any compile error happens. Disabling fallback if `CODEGEN_ONLY`. " + "`NO_CODEGEN` skips codegen and goes interpreted path always. 
Note that " + - "this config works only for tests.") + "this configuration is only for the internal usage, and NOT supposed to be set by " + + "end users.") .version("2.4.0") .internal() .stringConf diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 537ffb815b..cae8cb9957 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -1076,14 +1076,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { case TypeCheckResult.TypeCheckFailure(message) => assert(message.contains(expectedErrMsg)) } - - val tryToCharResult = TryToCharacter(Decimal(456), Literal(format)).checkInputDataTypes() - assert(tryToCharResult != TypeCheckResult.TypeCheckSuccess, - s"The format string should have been invalid: $format") - tryToCharResult match { - case TypeCheckResult.TypeCheckFailure(message) => - assert(message.contains(expectedErrMsg)) - } } } @@ -1156,10 +1148,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { var expr: Expression = ToCharacter(Literal(decimal), Literal(format)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } // Test '.' 
and 'D' @@ -1194,14 +1182,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { expr = ToCharacter(Literal(decimal), Literal(format2)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format2)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } Seq( @@ -1228,10 +1208,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { var expr: Expression = ToCharacter(Literal(decimal), Literal(format)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } // Test ',' and 'G' @@ -1263,14 +1239,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { expr = ToCharacter(Literal(decimal), Literal(format2)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format2)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } Seq( @@ -1323,10 +1291,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { var expr: Expression = ToCharacter(Literal(decimal), Literal(format)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), 
Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } // Test '$' @@ -1341,10 +1305,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { var expr: Expression = ToCharacter(Literal(decimal), Literal(format)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } // Test 'S' @@ -1380,10 +1340,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { var expr: Expression = ToCharacter(Literal(decimal), Literal(format)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } // Test 'MI' @@ -1416,10 +1372,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { var expr: Expression = ToCharacter(Literal(decimal), Literal(format)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } // Test 'PR' @@ -1455,10 +1407,6 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { var expr: Expression = ToCharacter(Literal(decimal), Literal(format)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } // Test overflows @@ -1481,10 +1429,6 @@ 
class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { var expr: Expression = ToCharacter(Literal(decimal), Literal(format)) assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) checkEvaluation(expr, expected) - - expr = TryToCharacter(Literal(decimal), Literal(format)) - assert(expr.checkInputDataTypes() == TypeCheckResult.TypeCheckSuccess) - checkEvaluation(expr, expected) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala index 933519b3dd..f28df3839d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ColumnPruningSuite.scala @@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.optimizer.NestedColumnAliasingSuite.collectGeneratedAliases import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor @@ -465,37 +464,6 @@ class ColumnPruningSuite extends PlanTest { comparePlans(Optimize.execute(plan1.analyze), correctAnswer1) } - test("SPARK-38531: Nested field pruning for Project and PosExplode") { - val name = StructType.fromDDL("first string, middle string, last string") - val employer = StructType.fromDDL("id int, company struct") - val contact = LocalRelation( - 'id.int, - 'name.struct(name), - 'address.string, - 'friends.array(name), - 'relatives.map(StringType, name), - 'employer.struct(employer)) - - val query = contact - .select('id, 'friends) - .generate(PosExplode('friends)) - .select('col.getField("middle")) - 
.analyze - val optimized = Optimize.execute(query) - - val aliases = collectGeneratedAliases(optimized) - - val expected = contact - // GetStructField is pushed down, unused id column is pruned. - .select( - 'friends.getField("middle").as(aliases(0))) - .generate(PosExplode($"${aliases(0)}"), - unrequiredChildIndex = Seq(0)) // unrequiredChildIndex is added. - .select('col.as("col.middle")) - .analyze - comparePlans(optimized, expected) - } - test("SPARK-39445: Remove the window if windowExpressions is empty in column pruning") { object CustomOptimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Column pruning", FixedPoint(10), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala index 86a1cb4c3c..4362e0c517 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/BasicStatsEstimationSuite.scala @@ -360,7 +360,7 @@ class BasicStatsEstimationSuite extends PlanTest with StatsEstimationTestBase { checkStats( sort, expectedStatsCboOn = expectedSortStats, - expectedStatsCboOff = Statistics(sizeInBytes = expectedSize)) + expectedStatsCboOff = expectedSortStats) } /** Check estimated stats when cbo is turned on/off. 
*/ diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-jdk11-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-jdk11-results.txt index 4118ac2ca7..d30b3327ff 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-jdk11-results.txt @@ -2,669 +2,733 @@ Pushdown for many distinct value case ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9287 9340 45 1.7 590.4 1.0X -Parquet Vectorized (Pushdown) 586 608 25 26.8 37.3 15.8X -Native ORC Vectorized 6930 7023 100 2.3 440.6 1.3X -Native ORC Vectorized (Pushdown) 482 500 24 32.7 30.6 19.3X +Parquet Vectorized 10439 10509 98 1.5 663.7 1.0X +Parquet Vectorized (Pushdown) 578 602 31 27.2 36.7 18.1X +Native ORC Vectorized 7076 7112 44 2.2 449.9 1.5X +Native ORC Vectorized (Pushdown) 487 496 7 32.3 31.0 21.4X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9454 9481 24 1.7 601.1 1.0X -Parquet Vectorized (Pushdown) 575 594 18 27.4 36.6 16.4X -Native ORC Vectorized 7092 7109 19 2.2 450.9 1.3X -Native ORC Vectorized (Pushdown) 493 500 12 31.9 31.3 19.2X +Parquet Vectorized 10580 10604 
17 1.5 672.7 1.0X +Parquet Vectorized (Pushdown) 559 571 8 28.1 35.6 18.9X +Native ORC Vectorized 7188 7208 21 2.2 457.0 1.5X +Native ORC Vectorized (Pushdown) 473 481 9 33.3 30.0 22.4X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9427 9460 22 1.7 599.3 1.0X -Parquet Vectorized (Pushdown) 563 598 36 27.9 35.8 16.7X -Native ORC Vectorized 7073 7090 10 2.2 449.7 1.3X -Native ORC Vectorized (Pushdown) 479 489 9 32.8 30.5 19.7X +Parquet Vectorized 10526 10540 20 1.5 669.2 1.0X +Parquet Vectorized (Pushdown) 545 554 9 28.9 34.6 19.3X +Native ORC Vectorized 7144 7189 33 2.2 454.2 1.5X +Native ORC Vectorized (Pushdown) 448 462 8 35.1 28.5 23.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9418 9443 20 1.7 598.8 1.0X -Parquet Vectorized (Pushdown) 563 572 7 27.9 35.8 16.7X -Native ORC Vectorized 7238 7268 24 2.2 460.2 1.3X -Native ORC Vectorized (Pushdown) 467 477 9 33.7 29.7 20.2X +Parquet Vectorized 10506 10515 9 1.5 667.9 1.0X +Parquet Vectorized (Pushdown) 529 537 10 29.8 33.6 19.9X +Native ORC Vectorized 7118 7156 23 2.2 452.6 1.5X +Native ORC Vectorized (Pushdown) 436 445 10 36.0 27.7 24.1X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 
5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9433 9476 39 1.7 599.8 1.0X -Parquet Vectorized (Pushdown) 568 573 6 27.7 36.1 16.6X -Native ORC Vectorized 7117 7230 76 2.2 452.5 1.3X -Native ORC Vectorized (Pushdown) 472 482 9 33.3 30.0 20.0X +Parquet Vectorized 10505 10519 12 1.5 667.9 1.0X +Parquet Vectorized (Pushdown) 517 528 8 30.4 32.9 20.3X +Native ORC Vectorized 7232 7271 43 2.2 459.8 1.5X +Native ORC Vectorized (Pushdown) 439 448 7 35.8 27.9 23.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 17447 17456 9 0.9 1109.2 1.0X -Parquet Vectorized (Pushdown) 17535 17548 12 0.9 1114.9 1.0X -Native ORC Vectorized 15176 15205 23 1.0 964.8 1.1X -Native ORC Vectorized (Pushdown) 15332 15368 31 1.0 974.8 1.1X +Parquet Vectorized 19062 19147 82 0.8 1211.9 1.0X +Parquet Vectorized (Pushdown) 19151 19167 13 0.8 1217.6 1.0X +Native ORC Vectorized 15455 15470 15 1.0 982.6 1.2X +Native ORC Vectorized (Pushdown) 15605 15614 9 1.0 992.2 1.2X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8956 8989 43 1.8 569.4 1.0X -Parquet Vectorized (Pushdown) 546 561 24 28.8 34.7 16.4X -Native ORC Vectorized 6480 6558 47 2.4 412.0 1.4X -Native ORC Vectorized (Pushdown) 451 461 10 34.9 28.7 19.8X +Parquet Vectorized 9694 9721 30 1.6 616.3 1.0X +Parquet Vectorized (Pushdown) 475 484 10 33.1 30.2 20.4X +Native ORC Vectorized 6570 6598 32 2.4 417.7 1.5X +Native ORC Vectorized (Pushdown) 413 421 7 38.1 26.3 23.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9121 9177 35 1.7 579.9 1.0X -Parquet Vectorized (Pushdown) 552 561 7 28.5 35.1 16.5X -Native ORC Vectorized 6522 6565 45 2.4 414.7 1.4X -Native ORC Vectorized (Pushdown) 458 463 6 34.4 29.1 19.9X +Parquet Vectorized 9684 9695 15 1.6 615.7 1.0X +Parquet Vectorized (Pushdown) 494 501 7 31.8 31.4 19.6X +Native ORC Vectorized 6590 6607 23 2.4 419.0 1.5X +Native ORC Vectorized (Pushdown) 418 432 11 37.6 26.6 23.2X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9014 9036 18 1.7 573.1 1.0X -Parquet Vectorized (Pushdown) 556 573 17 28.3 35.3 16.2X -Native ORC Vectorized 6520 6591 53 2.4 414.5 1.4X -Native ORC Vectorized 
(Pushdown) 457 465 10 34.4 29.0 19.7X +Parquet Vectorized 9720 9736 24 1.6 618.0 1.0X +Parquet Vectorized (Pushdown) 489 498 8 32.2 31.1 19.9X +Native ORC Vectorized 6665 6680 12 2.4 423.8 1.5X +Native ORC Vectorized (Pushdown) 423 427 6 37.2 26.9 23.0X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9015 9024 15 1.7 573.1 1.0X -Parquet Vectorized (Pushdown) 550 567 25 28.6 35.0 16.4X -Native ORC Vectorized 6563 6587 23 2.4 417.3 1.4X -Native ORC Vectorized (Pushdown) 450 455 6 35.0 28.6 20.0X +Parquet Vectorized 9728 9738 15 1.6 618.5 1.0X +Parquet Vectorized (Pushdown) 484 492 9 32.5 30.8 20.1X +Native ORC Vectorized 6672 6685 12 2.4 424.2 1.5X +Native ORC Vectorized (Pushdown) 418 426 8 37.6 26.6 23.3X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9010 9033 20 1.7 572.9 1.0X -Parquet Vectorized (Pushdown) 548 562 12 28.7 34.9 16.4X -Native ORC Vectorized 6576 6608 26 2.4 418.1 1.4X -Native ORC Vectorized (Pushdown) 451 459 9 34.9 28.7 20.0X +Parquet Vectorized 9732 9741 6 1.6 618.8 1.0X +Parquet Vectorized (Pushdown) 489 494 3 32.2 31.1 19.9X +Native ORC Vectorized 6667 6682 15 2.4 423.9 1.5X +Native ORC Vectorized (Pushdown) 419 426 6 37.6 26.6 23.2X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 
5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8998 9025 33 1.7 572.0 1.0X -Parquet Vectorized (Pushdown) 556 571 23 28.3 35.4 16.2X -Native ORC Vectorized 6534 6577 33 2.4 415.4 1.4X -Native ORC Vectorized (Pushdown) 457 462 6 34.4 29.1 19.7X +Parquet Vectorized 9733 9739 9 1.6 618.8 1.0X +Parquet Vectorized (Pushdown) 484 515 46 32.5 30.8 20.1X +Native ORC Vectorized 6563 6578 15 2.4 417.2 1.5X +Native ORC Vectorized (Pushdown) 422 428 6 37.2 26.9 23.0X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9817 10010 204 1.6 624.2 1.0X -Parquet Vectorized (Pushdown) 2196 2205 11 7.2 139.6 4.5X -Native ORC Vectorized 7341 7388 49 2.1 466.7 1.3X -Native ORC Vectorized (Pushdown) 1867 1879 8 8.4 118.7 5.3X +Parquet Vectorized 10574 10600 33 1.5 672.3 1.0X +Parquet Vectorized (Pushdown) 2263 2273 14 7.0 143.9 4.7X +Native ORC Vectorized 7420 7462 24 2.1 471.8 1.4X +Native ORC Vectorized (Pushdown) 1866 1876 8 8.4 118.6 5.7X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 12672 12699 24 1.2 805.6 1.0X -Parquet Vectorized (Pushdown) 8390 8398 10 1.9 533.4 1.5X -Native ORC Vectorized 10113 10152 31 1.6 642.9 1.3X -Native ORC Vectorized (Pushdown) 7137 7147 10 2.2 453.8 1.8X +Parquet Vectorized 13597 13631 26 1.2 864.4 1.0X +Parquet Vectorized (Pushdown) 9042 9050 7 1.7 574.9 1.5X +Native ORC Vectorized 10326 10368 45 1.5 656.5 1.3X +Native ORC Vectorized (Pushdown) 7330 7345 16 2.1 466.0 1.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 15422 15497 55 1.0 980.5 1.0X -Parquet Vectorized (Pushdown) 14643 14650 6 1.1 931.0 1.1X -Native ORC Vectorized 12906 12951 28 1.2 820.5 1.2X -Native ORC Vectorized (Pushdown) 12454 12459 4 1.3 791.8 1.2X +Parquet Vectorized 16661 16679 13 0.9 1059.3 1.0X +Parquet Vectorized (Pushdown) 15807 15821 17 1.0 1005.0 1.1X +Native ORC Vectorized 13294 13314 34 1.2 845.2 1.3X +Native ORC Vectorized (Pushdown) 12779 12788 9 1.2 812.5 1.3X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 16116 16132 15 1.0 1024.7 1.0X -Parquet Vectorized (Pushdown) 16176 16191 12 1.0 1028.4 1.0X -Native ORC Vectorized 13596 13609 10 1.2 
864.4 1.2X -Native ORC Vectorized (Pushdown) 13741 13754 8 1.1 873.6 1.2X +Parquet Vectorized 17368 17394 23 0.9 1104.2 1.0X +Parquet Vectorized (Pushdown) 17456 17466 8 0.9 1109.8 1.0X +Native ORC Vectorized 13925 13947 24 1.1 885.3 1.2X +Native ORC Vectorized (Pushdown) 14073 14084 10 1.1 894.8 1.2X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 16166 16189 26 1.0 1027.8 1.0X -Parquet Vectorized (Pushdown) 16223 16233 9 1.0 1031.4 1.0X -Native ORC Vectorized 13584 13596 8 1.2 863.6 1.2X -Native ORC Vectorized (Pushdown) 13729 13739 8 1.1 872.9 1.2X +Parquet Vectorized 17396 17408 9 0.9 1106.0 1.0X +Parquet Vectorized (Pushdown) 17443 17458 12 0.9 1109.0 1.0X +Native ORC Vectorized 13993 13999 5 1.1 889.6 1.2X +Native ORC Vectorized (Pushdown) 14129 14136 7 1.1 898.3 1.2X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 16117 16134 14 1.0 1024.7 1.0X -Parquet Vectorized (Pushdown) 16187 16197 9 1.0 1029.1 1.0X -Native ORC Vectorized 13612 13625 12 1.2 865.4 1.2X -Native ORC Vectorized (Pushdown) 13770 13782 10 1.1 875.5 1.2X +Parquet Vectorized 17392 17425 24 0.9 1105.8 1.0X +Parquet Vectorized (Pushdown) 17477 17483 5 0.9 1111.2 1.0X +Native ORC Vectorized 13960 14024 75 1.1 887.6 1.2X +Native ORC Vectorized (Pushdown) 14089 
14106 20 1.1 895.8 1.2X ================================================================================================ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8544 8581 31 1.8 543.2 1.0X -Parquet Vectorized (Pushdown) 475 483 8 33.1 30.2 18.0X -Native ORC Vectorized 7688 7708 30 2.0 488.8 1.1X -Native ORC Vectorized (Pushdown) 839 843 4 18.8 53.3 10.2X +Parquet Vectorized 8190 8235 67 1.9 520.7 1.0X +Parquet Vectorized (Pushdown) 420 426 7 37.4 26.7 19.5X +Native ORC Vectorized 7879 7890 8 2.0 500.9 1.0X +Native ORC Vectorized (Pushdown) 774 782 8 20.3 49.2 10.6X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8511 8530 13 1.8 541.1 1.0X -Parquet Vectorized (Pushdown) 473 494 34 33.3 30.0 18.0X -Native ORC Vectorized 7932 7943 7 2.0 504.3 1.1X -Native ORC Vectorized (Pushdown) 842 857 21 18.7 53.5 10.1X +Parquet Vectorized 8292 8310 15 1.9 527.2 1.0X +Parquet Vectorized (Pushdown) 424 431 6 37.1 27.0 19.5X +Native ORC Vectorized 8107 8117 10 1.9 515.4 1.0X +Native ORC Vectorized (Pushdown) 776 786 7 20.3 49.3 
10.7X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8406 8427 21 1.9 534.4 1.0X -Parquet Vectorized (Pushdown) 537 548 9 29.3 34.1 15.7X -Native ORC Vectorized 7886 7895 11 2.0 501.3 1.1X -Native ORC Vectorized (Pushdown) 897 906 12 17.5 57.0 9.4X +Parquet Vectorized 8232 8245 8 1.9 523.4 1.0X +Parquet Vectorized (Pushdown) 494 501 8 31.8 31.4 16.7X +Native ORC Vectorized 8073 8094 20 1.9 513.3 1.0X +Native ORC Vectorized (Pushdown) 833 843 6 18.9 53.0 9.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8399 8410 10 1.9 534.0 1.0X -Parquet Vectorized (Pushdown) 538 552 22 29.2 34.2 15.6X -Native ORC Vectorized 7887 7895 6 2.0 501.4 1.1X -Native ORC Vectorized (Pushdown) 890 897 8 17.7 56.6 9.4X +Parquet Vectorized 8209 8232 21 1.9 521.9 1.0X +Parquet Vectorized (Pushdown) 481 494 10 32.7 30.6 17.1X +Native ORC Vectorized 8058 8068 6 2.0 512.3 1.0X +Native ORC Vectorized (Pushdown) 831 840 7 18.9 52.9 9.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8503 8521 13 1.8 540.6 1.0X -Parquet Vectorized (Pushdown) 538 546 9 29.3 34.2 15.8X -Native ORC Vectorized 7965 7966 1 2.0 506.4 1.1X -Native ORC Vectorized (Pushdown) 897 902 7 17.5 57.0 9.5X +Parquet Vectorized 8321 8330 12 1.9 529.0 1.0X +Parquet Vectorized (Pushdown) 488 495 4 32.2 31.0 17.0X +Native ORC Vectorized 8149 8156 9 1.9 518.1 1.0X +Native ORC Vectorized (Pushdown) 841 846 5 18.7 53.5 9.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 17851 17872 20 0.9 1134.9 1.0X -Parquet Vectorized (Pushdown) 17909 17919 6 0.9 1138.6 1.0X -Native ORC Vectorized 17337 17351 14 0.9 1102.2 1.0X -Native ORC Vectorized (Pushdown) 17649 17703 76 0.9 1122.1 1.0X +Parquet Vectorized 18379 18399 18 0.9 1168.5 1.0X +Parquet Vectorized (Pushdown) 18453 18469 19 0.9 1173.2 1.0X +Native ORC Vectorized 17678 17685 8 0.9 1123.9 1.0X +Native ORC Vectorized (Pushdown) 17936 17943 4 0.9 1140.4 1.0X ================================================================================================ Pushdown benchmark for StringStartsWith ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9884 9952 44 1.6 628.4 1.0X -Parquet Vectorized (Pushdown) 1393 1436 32 11.3 88.6 7.1X -Native ORC Vectorized 7347 7472 146 2.1 467.1 1.3X -Native ORC Vectorized (Pushdown) 7547 7573 18 2.1 479.8 1.3X +Parquet Vectorized 9928 9964 43 1.6 631.2 1.0X +Parquet Vectorized (Pushdown) 1349 1367 25 11.7 85.8 7.4X +Native ORC Vectorized 7484 7554 81 2.1 475.8 1.3X +Native ORC Vectorized (Pushdown) 7588 7598 8 2.1 482.4 1.3X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9668 9692 23 1.6 614.7 1.0X -Parquet Vectorized (Pushdown) 556 574 24 28.3 35.3 17.4X -Native ORC Vectorized 7181 7214 33 2.2 456.6 1.3X -Native ORC Vectorized (Pushdown) 7317 7364 34 2.1 465.2 1.3X +Parquet Vectorized 9751 9781 30 1.6 620.0 1.0X +Parquet Vectorized (Pushdown) 493 500 6 31.9 31.4 19.8X +Native ORC Vectorized 7231 7254 15 2.2 459.7 1.3X +Native ORC Vectorized (Pushdown) 7359 7398 25 2.1 467.9 1.3X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9639 9676 23 1.6 612.8 1.0X -Parquet Vectorized (Pushdown) 544 550 7 28.9 34.6 17.7X -Native ORC Vectorized 7218 7231 9 
2.2 458.9 1.3X -Native ORC Vectorized (Pushdown) 7338 7373 30 2.1 466.5 1.3X +Parquet Vectorized 9755 9772 15 1.6 620.2 1.0X +Parquet Vectorized (Pushdown) 483 492 8 32.6 30.7 20.2X +Native ORC Vectorized 7212 7341 230 2.2 458.5 1.4X +Native ORC Vectorized (Pushdown) 7356 7393 21 2.1 467.7 1.3X + + +================================================================================================ +Pushdown benchmark for StringEndsWith +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 8261 8282 27 1.9 525.2 1.0X +Parquet Vectorized (Pushdown) 595 603 6 26.4 37.8 13.9X +Native ORC Vectorized 8112 8128 13 1.9 515.7 1.0X +Native ORC Vectorized (Pushdown) 8368 8375 5 1.9 532.1 1.0X + +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 8208 8223 13 1.9 521.9 1.0X +Parquet Vectorized (Pushdown) 471 478 5 33.4 30.0 17.4X +Native ORC Vectorized 8043 8056 11 2.0 511.4 1.0X +Native ORC Vectorized (Pushdown) 8305 8314 11 1.9 528.0 1.0X + +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+----------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 8197 8221 24 1.9 521.2 1.0X +Parquet Vectorized (Pushdown) 465 475 11 33.8 29.6 17.6X +Native ORC Vectorized 8037 8049 15 2.0 511.0 1.0X +Native ORC Vectorized (Pushdown) 8308 8313 4 1.9 528.2 1.0X + + +================================================================================================ +Pushdown benchmark for StringContains +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 8512 8545 42 1.8 541.2 1.0X +Parquet Vectorized (Pushdown) 1178 1198 22 13.4 74.9 7.2X +Native ORC Vectorized 8335 8349 13 1.9 529.9 1.0X +Native ORC Vectorized (Pushdown) 8590 8604 12 1.8 546.1 1.0X + +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +---------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 8189 8215 26 1.9 520.7 1.0X +Parquet Vectorized (Pushdown) 463 473 8 34.0 29.4 17.7X +Native ORC Vectorized 8036 8041 5 2.0 510.9 1.0X +Native ORC Vectorized (Pushdown) 8292 8301 9 1.9 527.2 1.0X + +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------------ +Parquet Vectorized 8203 8216 12 1.9 521.5 1.0X +Parquet Vectorized (Pushdown) 464 472 9 33.9 29.5 17.7X +Native ORC Vectorized 8027 8036 5 2.0 510.3 1.0X +Native ORC Vectorized (Pushdown) 8301 8311 10 1.9 527.8 1.0X ================================================================================================ Pushdown benchmark for decimal ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3177 3189 18 5.0 202.0 1.0X -Parquet Vectorized (Pushdown) 123 129 6 127.4 7.8 25.7X -Native ORC Vectorized 4736 4821 141 3.3 301.1 0.7X -Native ORC Vectorized (Pushdown) 152 158 7 103.7 9.6 20.9X +Parquet Vectorized 3552 3561 14 4.4 225.8 1.0X +Parquet Vectorized (Pushdown) 113 118 5 138.8 7.2 31.3X +Native ORC Vectorized 4946 4953 8 3.2 314.5 0.7X +Native ORC Vectorized (Pushdown) 142 150 7 110.6 9.0 25.0X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4821 4831 7 3.3 306.5 1.0X -Parquet Vectorized (Pushdown) 2367 2385 25 6.6 150.5 2.0X -Native ORC Vectorized 6437 6448 9 2.4 409.2 
0.7X -Native ORC Vectorized (Pushdown) 2772 2779 5 5.7 176.2 1.7X +Parquet Vectorized 5246 5255 6 3.0 333.5 1.0X +Parquet Vectorized (Pushdown) 2472 2480 8 6.4 157.2 2.1X +Native ORC Vectorized 6648 6652 4 2.4 422.7 0.8X +Native ORC Vectorized (Pushdown) 2797 2808 13 5.6 177.8 1.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10194 10200 6 1.5 648.1 1.0X -Parquet Vectorized (Pushdown) 9807 9819 12 1.6 623.5 1.0X -Native ORC Vectorized 12069 12077 5 1.3 767.3 0.8X -Native ORC Vectorized (Pushdown) 11497 11502 4 1.4 731.0 0.9X +Parquet Vectorized 10788 10805 16 1.5 685.9 1.0X +Parquet Vectorized (Pushdown) 10330 10339 6 1.5 656.8 1.0X +Native ORC Vectorized 12284 12352 132 1.3 781.0 0.9X +Native ORC Vectorized (Pushdown) 11707 11723 12 1.3 744.3 0.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11617 11629 13 1.4 738.6 1.0X -Parquet Vectorized (Pushdown) 11658 11671 15 1.3 741.2 1.0X -Native ORC Vectorized 13461 13485 20 1.2 855.8 0.9X -Native ORC Vectorized (Pushdown) 13520 13530 9 1.2 859.6 0.9X +Parquet Vectorized 12126 12143 17 1.3 771.0 1.0X +Parquet Vectorized (Pushdown) 12179 12197 19 1.3 774.3 1.0X +Native ORC Vectorized 13729 13745 12 1.1 872.8 0.9X +Native ORC 
Vectorized (Pushdown) 13791 13814 17 1.1 876.8 0.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3394 3413 32 4.6 215.8 1.0X -Parquet Vectorized (Pushdown) 125 140 32 125.6 8.0 27.1X -Native ORC Vectorized 4735 4753 30 3.3 301.0 0.7X -Native ORC Vectorized (Pushdown) 148 154 6 106.2 9.4 22.9X +Parquet Vectorized 3764 3784 16 4.2 239.3 1.0X +Parquet Vectorized (Pushdown) 115 121 7 137.1 7.3 32.8X +Native ORC Vectorized 4941 4960 21 3.2 314.2 0.8X +Native ORC Vectorized (Pushdown) 139 145 5 112.9 8.9 27.0X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4266 4284 24 3.7 271.2 1.0X -Parquet Vectorized (Pushdown) 1325 1331 8 11.9 84.3 3.2X -Native ORC Vectorized 5637 5649 7 2.8 358.4 0.8X -Native ORC Vectorized (Pushdown) 1499 1504 5 10.5 95.3 2.8X +Parquet Vectorized 4659 4674 11 3.4 296.2 1.0X +Parquet Vectorized (Pushdown) 1356 1368 16 11.6 86.2 3.4X +Native ORC Vectorized 5857 5876 15 2.7 372.4 0.8X +Native ORC Vectorized (Pushdown) 1501 1509 11 10.5 95.4 3.1X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 50% decimal(18, 2) rows (value < 
7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7643 7657 12 2.1 486.0 1.0X -Parquet Vectorized (Pushdown) 6057 6064 4 2.6 385.1 1.3X -Native ORC Vectorized 9201 9209 7 1.7 585.0 0.8X -Native ORC Vectorized (Pushdown) 6861 6873 12 2.3 436.2 1.1X +Parquet Vectorized 8098 8110 8 1.9 514.9 1.0X +Parquet Vectorized (Pushdown) 6292 6301 8 2.5 400.0 1.3X +Native ORC Vectorized 9416 9433 14 1.7 598.7 0.9X +Native ORC Vectorized (Pushdown) 7014 7032 11 2.2 445.9 1.2X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10877 10888 6 1.4 691.6 1.0X -Parquet Vectorized (Pushdown) 10611 10615 4 1.5 674.7 1.0X -Native ORC Vectorized 12554 12564 9 1.3 798.2 0.9X -Native ORC Vectorized (Pushdown) 12065 12074 6 1.3 767.1 0.9X +Parquet Vectorized 11443 11455 11 1.4 727.5 1.0X +Parquet Vectorized (Pushdown) 11093 11118 17 1.4 705.3 1.0X +Native ORC Vectorized 12868 12886 23 1.2 818.1 0.9X +Native ORC Vectorized (Pushdown) 12401 12422 14 1.3 788.5 0.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5000 5009 13 3.1 317.9 1.0X 
-Parquet Vectorized (Pushdown) 138 146 19 114.3 8.7 36.3X -Native ORC Vectorized 4709 4731 22 3.3 299.4 1.1X -Native ORC Vectorized (Pushdown) 148 153 6 106.6 9.4 33.9X +Parquet Vectorized 5430 5440 6 2.9 345.2 1.0X +Parquet Vectorized (Pushdown) 126 130 4 124.7 8.0 43.0X +Native ORC Vectorized 4972 4986 27 3.2 316.1 1.1X +Native ORC Vectorized (Pushdown) 138 143 4 114.4 8.7 39.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6147 6164 17 2.6 390.8 1.0X -Parquet Vectorized (Pushdown) 1738 1754 14 9.1 110.5 3.5X -Native ORC Vectorized 5772 5779 7 2.7 367.0 1.1X -Native ORC Vectorized (Pushdown) 1651 1653 2 9.5 105.0 3.7X +Parquet Vectorized 6513 6531 23 2.4 414.1 1.0X +Parquet Vectorized (Pushdown) 1765 1772 6 8.9 112.2 3.7X +Native ORC Vectorized 6049 6053 3 2.6 384.6 1.1X +Native ORC Vectorized (Pushdown) 1675 1684 9 9.4 106.5 3.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10534 10539 4 1.5 669.8 1.0X -Parquet Vectorized (Pushdown) 8085 8092 5 1.9 514.1 1.3X -Native ORC Vectorized 9894 9904 10 1.6 629.0 1.1X -Native ORC Vectorized (Pushdown) 7626 7643 16 2.1 484.8 1.4X +Parquet Vectorized 10962 10974 9 1.4 697.0 1.0X +Parquet Vectorized (Pushdown) 8358 8368 9 1.9 531.4 
1.3X +Native ORC Vectorized 10319 10343 31 1.5 656.1 1.1X +Native ORC Vectorized (Pushdown) 7923 7933 7 2.0 503.7 1.4X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14726 14754 49 1.1 936.2 1.0X -Parquet Vectorized (Pushdown) 14309 14330 14 1.1 909.8 1.0X -Native ORC Vectorized 13912 13927 12 1.1 884.5 1.1X -Native ORC Vectorized (Pushdown) 13510 13524 15 1.2 859.0 1.1X +Parquet Vectorized 15327 15330 2 1.0 974.5 1.0X +Parquet Vectorized (Pushdown) 14830 14844 9 1.1 942.9 1.0X +Native ORC Vectorized 14520 14568 91 1.1 923.2 1.1X +Native ORC Vectorized (Pushdown) 14066 14080 10 1.1 894.3 1.1X ================================================================================================ Pushdown benchmark for InSet -> InFilters ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9132 9167 37 1.7 580.6 1.0X -Parquet Vectorized (Pushdown) 555 578 24 28.3 35.3 16.4X -Native ORC Vectorized 6587 6617 26 2.4 418.8 1.4X -Native ORC Vectorized (Pushdown) 463 469 7 34.0 29.5 19.7X +Parquet Vectorized 9158 9202 40 1.7 582.3 1.0X +Parquet Vectorized (Pushdown) 494 502 7 31.9 31.4 
18.6X +Native ORC Vectorized 6735 6795 81 2.3 428.2 1.4X +Native ORC Vectorized (Pushdown) 427 430 6 36.8 27.2 21.4X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9148 9178 30 1.7 581.6 1.0X -Parquet Vectorized (Pushdown) 554 562 7 28.4 35.2 16.5X -Native ORC Vectorized 6581 6607 31 2.4 418.4 1.4X -Native ORC Vectorized (Pushdown) 461 465 5 34.2 29.3 19.9X +Parquet Vectorized 9194 9207 12 1.7 584.5 1.0X +Parquet Vectorized (Pushdown) 494 500 6 31.8 31.4 18.6X +Native ORC Vectorized 6727 6755 22 2.3 427.7 1.4X +Native ORC Vectorized (Pushdown) 427 434 10 36.9 27.1 21.6X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9158 9208 59 1.7 582.2 1.0X -Parquet Vectorized (Pushdown) 556 578 30 28.3 35.3 16.5X -Native ORC Vectorized 6458 6571 64 2.4 410.6 1.4X -Native ORC Vectorized (Pushdown) 463 472 11 34.0 29.4 19.8X +Parquet Vectorized 9190 9207 15 1.7 584.3 1.0X +Parquet Vectorized (Pushdown) 502 507 4 31.4 31.9 18.3X +Native ORC Vectorized 6717 6747 20 2.3 427.0 1.4X +Native ORC Vectorized (Pushdown) 429 437 7 36.7 27.3 21.4X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure 
Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9198 9228 19 1.7 584.8 1.0X -Parquet Vectorized (Pushdown) 572 582 9 27.5 36.4 16.1X -Native ORC Vectorized 6592 6612 19 2.4 419.1 1.4X -Native ORC Vectorized (Pushdown) 479 487 6 32.9 30.4 19.2X +Parquet Vectorized 9243 9264 25 1.7 587.6 1.0X +Parquet Vectorized (Pushdown) 525 535 7 29.9 33.4 17.6X +Native ORC Vectorized 6741 6761 14 2.3 428.6 1.4X +Native ORC Vectorized (Pushdown) 452 459 7 34.8 28.7 20.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8967 9004 26 1.8 570.1 1.0X -Parquet Vectorized (Pushdown) 583 619 36 27.0 37.1 15.4X -Native ORC Vectorized 6518 6601 49 2.4 414.4 1.4X -Native ORC Vectorized (Pushdown) 483 491 10 32.6 30.7 18.6X +Parquet Vectorized 9238 9279 81 1.7 587.3 1.0X +Parquet Vectorized (Pushdown) 519 530 12 30.3 33.0 17.8X +Native ORC Vectorized 6741 6784 48 2.3 428.6 1.4X +Native ORC Vectorized (Pushdown) 449 457 6 35.0 28.6 20.6X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9170 9197 20 1.7 583.0 1.0X -Parquet Vectorized (Pushdown) 583 589 4 27.0 37.0 15.7X -Native ORC Vectorized 6589 6602 9 2.4 418.9 1.4X -Native ORC Vectorized (Pushdown) 481 489 9 32.7 30.6 19.1X +Parquet Vectorized 9224 9250 26 1.7 586.5 1.0X +Parquet Vectorized (Pushdown) 519 526 6 30.3 33.0 17.8X +Native ORC Vectorized 6734 6766 20 2.3 428.1 1.4X +Native ORC Vectorized (Pushdown) 449 456 6 35.0 28.5 20.6X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9968 10003 25 1.6 633.8 1.0X -Parquet Vectorized (Pushdown) 1397 1440 31 11.3 88.8 7.1X -Native ORC Vectorized 6897 6950 38 2.3 438.5 1.4X -Native ORC Vectorized (Pushdown) 619 633 22 25.4 39.3 16.1X +Parquet Vectorized 9491 9500 9 1.7 603.4 1.0X +Parquet Vectorized (Pushdown) 1294 1329 31 12.2 82.3 7.3X +Native ORC Vectorized 7033 7051 21 2.2 447.1 1.3X +Native ORC Vectorized (Pushdown) 582 588 7 27.0 37.0 16.3X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10000 10037 42 1.6 635.8 1.0X -Parquet Vectorized (Pushdown) 5187 5208 19 3.0 329.8 1.9X 
-Native ORC Vectorized 6903 6945 29 2.3 438.9 1.4X -Native ORC Vectorized (Pushdown) 632 641 8 24.9 40.2 15.8X +Parquet Vectorized 9503 9520 16 1.7 604.2 1.0X +Parquet Vectorized (Pushdown) 4804 4842 37 3.3 305.5 2.0X +Native ORC Vectorized 7057 7085 24 2.2 448.7 1.3X +Native ORC Vectorized (Pushdown) 612 618 7 25.7 38.9 15.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9973 10017 33 1.6 634.0 1.0X -Parquet Vectorized (Pushdown) 9003 9036 20 1.7 572.4 1.1X -Native ORC Vectorized 6911 6971 74 2.3 439.4 1.4X -Native ORC Vectorized (Pushdown) 647 663 21 24.3 41.1 15.4X +Parquet Vectorized 9501 9510 10 1.7 604.0 1.0X +Parquet Vectorized (Pushdown) 8354 8370 11 1.9 531.1 1.1X +Native ORC Vectorized 7058 7090 27 2.2 448.7 1.3X +Native ORC Vectorized (Pushdown) 627 631 3 25.1 39.9 15.2X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9889 9955 53 1.6 628.8 1.0X -Parquet Vectorized (Pushdown) 1448 1468 22 10.9 92.0 6.8X -Native ORC Vectorized 6810 6824 18 2.3 433.0 1.5X -Native ORC Vectorized (Pushdown) 732 759 25 21.5 46.5 13.5X +Parquet Vectorized 9423 9450 18 1.7 599.1 1.0X +Parquet Vectorized (Pushdown) 1338 1375 30 11.8 85.1 7.0X +Native ORC 
Vectorized 6988 7010 13 2.3 444.3 1.3X +Native ORC Vectorized (Pushdown) 738 752 20 21.3 46.9 12.8X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9926 9955 27 1.6 631.1 1.0X -Parquet Vectorized (Pushdown) 5174 5223 29 3.0 329.0 1.9X -Native ORC Vectorized 6799 6825 15 2.3 432.3 1.5X -Native ORC Vectorized (Pushdown) 830 854 17 19.0 52.8 12.0X +Parquet Vectorized 9435 9468 23 1.7 599.9 1.0X +Parquet Vectorized (Pushdown) 4934 4972 22 3.2 313.7 1.9X +Native ORC Vectorized 6969 7001 24 2.3 443.1 1.4X +Native ORC Vectorized (Pushdown) 800 813 21 19.7 50.9 11.8X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9850 9953 94 1.6 626.2 1.0X -Parquet Vectorized (Pushdown) 8653 8686 32 1.8 550.2 1.1X -Native ORC Vectorized 6745 6782 34 2.3 428.8 1.5X -Native ORC Vectorized (Pushdown) 826 846 41 19.0 52.5 11.9X +Parquet Vectorized 9460 9466 9 1.7 601.4 1.0X +Parquet Vectorized (Pushdown) 8537 8562 18 1.8 542.8 1.1X +Native ORC Vectorized 6991 7015 17 2.2 444.5 1.4X +Native ORC Vectorized (Pushdown) 826 833 6 19.0 52.5 11.4X ================================================================================================ Pushdown benchmark for tinyint 
================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3598 3613 11 4.4 228.7 1.0X -Parquet Vectorized (Pushdown) 172 179 6 91.5 10.9 20.9X -Native ORC Vectorized 3184 3205 24 4.9 202.4 1.1X -Native ORC Vectorized (Pushdown) 205 212 7 76.6 13.1 17.5X +Parquet Vectorized 3993 4003 11 3.9 253.9 1.0X +Parquet Vectorized (Pushdown) 163 169 7 96.5 10.4 24.5X +Native ORC Vectorized 3381 3411 26 4.7 214.9 1.2X +Native ORC Vectorized (Pushdown) 193 201 6 81.4 12.3 20.7X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4352 4367 17 3.6 276.7 1.0X -Parquet Vectorized (Pushdown) 1233 1243 10 12.8 78.4 3.5X -Native ORC Vectorized 3848 3868 16 4.1 244.6 1.1X -Native ORC Vectorized (Pushdown) 1163 1172 8 13.5 74.0 3.7X +Parquet Vectorized 4725 4747 42 3.3 300.4 1.0X +Parquet Vectorized (Pushdown) 1244 1254 10 12.6 79.1 3.8X +Native ORC Vectorized 4060 4087 34 3.9 258.1 1.2X +Native ORC Vectorized (Pushdown) 1163 1177 10 13.5 73.9 4.1X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 
8370C CPU @ 2.80GHz Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7477 7484 7 2.1 475.4 1.0X -Parquet Vectorized (Pushdown) 5761 5769 5 2.7 366.3 1.3X -Native ORC Vectorized 6686 6718 39 2.4 425.1 1.1X -Native ORC Vectorized (Pushdown) 5218 5225 8 3.0 331.8 1.4X +Parquet Vectorized 7870 7878 8 2.0 500.4 1.0X +Parquet Vectorized (Pushdown) 5961 5966 3 2.6 379.0 1.3X +Native ORC Vectorized 6851 6884 29 2.3 435.6 1.1X +Native ORC Vectorized (Pushdown) 5311 5339 29 3.0 337.7 1.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10627 10638 12 1.5 675.7 1.0X -Parquet Vectorized (Pushdown) 10341 10346 5 1.5 657.4 1.0X -Native ORC Vectorized 9499 9543 36 1.7 603.9 1.1X -Native ORC Vectorized (Pushdown) 9257 9266 6 1.7 588.5 1.1X +Parquet Vectorized 10976 11000 14 1.4 697.8 1.0X +Parquet Vectorized (Pushdown) 10637 10661 22 1.5 676.3 1.0X +Native ORC Vectorized 9730 9749 16 1.6 618.6 1.1X +Native ORC Vectorized (Pushdown) 9488 9494 3 1.7 603.3 1.2X ================================================================================================ Pushdown benchmark for Timestamp ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 
2.80GHz Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3648 3658 14 4.3 232.0 1.0X -Parquet Vectorized (Pushdown) 3641 3656 12 4.3 231.5 1.0X -Native ORC Vectorized 3053 3109 47 5.2 194.1 1.2X -Native ORC Vectorized (Pushdown) 123 127 5 128.1 7.8 29.7X +Parquet Vectorized 4024 4033 8 3.9 255.9 1.0X +Parquet Vectorized (Pushdown) 4038 4058 15 3.9 256.7 1.0X +Native ORC Vectorized 3374 3392 20 4.7 214.5 1.2X +Native ORC Vectorized (Pushdown) 117 122 5 134.9 7.4 34.5X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4460 4474 9 3.5 283.6 1.0X -Parquet Vectorized (Pushdown) 4466 4475 7 3.5 283.9 1.0X -Native ORC Vectorized 3841 3848 9 4.1 244.2 1.2X -Native ORC Vectorized (Pushdown) 1220 1230 10 12.9 77.6 3.7X +Parquet Vectorized 4876 4889 10 3.2 310.0 1.0X +Parquet Vectorized (Pushdown) 4889 4895 5 3.2 310.9 1.0X +Native ORC Vectorized 4155 4165 13 3.8 264.2 1.2X +Native ORC Vectorized (Pushdown) 1240 1252 8 12.7 78.8 3.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7700 7713 10 2.0 489.5 1.0X -Parquet Vectorized (Pushdown) 7701 7705 5 2.0 489.6 1.0X -Native ORC Vectorized 6945 6961 21 2.3 441.6 1.1X -Native ORC Vectorized (Pushdown) 5498 5504 7 2.9 349.6 1.4X +Parquet Vectorized 8161 8185 16 1.9 518.9 1.0X +Parquet Vectorized (Pushdown) 8167 8179 8 1.9 519.2 1.0X +Native ORC Vectorized 7253 7265 9 2.2 461.1 1.1X +Native ORC Vectorized (Pushdown) 5636 5644 10 2.8 358.4 1.4X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10809 10817 5 1.5 687.2 1.0X -Parquet Vectorized (Pushdown) 10810 10815 6 1.5 687.3 1.0X -Native ORC Vectorized 9958 9966 6 1.6 633.1 1.1X -Native ORC Vectorized (Pushdown) 9702 9713 6 1.6 616.9 1.1X +Parquet Vectorized 11364 11372 10 1.4 722.5 1.0X +Parquet Vectorized (Pushdown) 11372 11376 4 1.4 723.0 1.0X +Native ORC Vectorized 10310 10321 8 1.5 655.5 1.1X +Native ORC Vectorized (Pushdown) 10029 10044 12 1.6 637.7 1.1X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3363 3386 24 4.7 213.8 1.0X -Parquet Vectorized (Pushdown) 124 145 35 126.6 7.9 27.1X -Native ORC Vectorized 3077 3086 14 5.1 195.6 1.1X -Native ORC Vectorized (Pushdown) 122 126 5 129.0 7.8 27.6X +Parquet Vectorized 3688 3702 10 4.3 234.5 1.0X +Parquet Vectorized (Pushdown) 118 122 5 132.9 7.5 31.2X +Native ORC Vectorized 3358 3369 19 4.7 213.5 1.1X +Native ORC Vectorized (Pushdown) 116 120 5 136.0 7.4 31.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4112 4135 35 3.8 261.5 1.0X -Parquet Vectorized (Pushdown) 1268 1278 8 12.4 80.6 3.2X -Native ORC Vectorized 3839 3860 23 4.1 244.1 1.1X -Native ORC Vectorized (Pushdown) 1223 1228 7 12.9 77.8 3.4X +Parquet Vectorized 4515 4529 10 3.5 287.1 1.0X +Parquet Vectorized (Pushdown) 1298 1312 11 12.1 82.5 3.5X +Native ORC Vectorized 4159 4167 4 3.8 264.4 1.1X +Native ORC Vectorized (Pushdown) 1234 1235 1 12.7 78.5 3.7X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7334 7345 10 2.1 466.3 1.0X -Parquet Vectorized (Pushdown) 5792 5798 4 2.7 368.2 1.3X -Native ORC Vectorized 6939 6954 21 2.3 441.1 1.1X -Native ORC Vectorized (Pushdown) 5496 5508 8 2.9 349.4 1.3X +Parquet Vectorized 7786 7806 16 2.0 495.0 1.0X +Parquet Vectorized (Pushdown) 6041 6047 4 2.6 384.1 1.3X +Native ORC Vectorized 7243 7256 8 2.2 460.5 1.1X +Native ORC Vectorized (Pushdown) 5638 5645 9 2.8 358.5 1.4X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10447 10451 4 1.5 664.2 1.0X -Parquet Vectorized (Pushdown) 10175 10180 4 1.5 646.9 1.0X -Native ORC Vectorized 9946 10087 262 1.6 632.4 1.1X -Native ORC Vectorized (Pushdown) 9699 9706 5 1.6 616.7 1.1X +Parquet Vectorized 10990 10999 11 1.4 698.7 1.0X +Parquet Vectorized (Pushdown) 10674 10680 7 1.5 678.7 1.0X +Native ORC Vectorized 10307 10319 12 1.5 655.3 1.1X +Native ORC Vectorized (Pushdown) 10041 10048 4 1.6 638.4 1.1X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3409 3421 16 4.6 216.7 1.0X -Parquet Vectorized (Pushdown) 125 135 17 126.2 7.9 27.3X -Native ORC Vectorized 3077 3093 32 5.1 195.6 1.1X -Native ORC Vectorized (Pushdown) 121 126 5 129.5 7.7 28.1X +Parquet Vectorized 3716 3755 71 4.2 236.3 1.0X +Parquet Vectorized (Pushdown) 114 118 4 138.5 7.2 32.7X +Native ORC Vectorized 3347 3355 8 4.7 212.8 1.1X +Native ORC Vectorized (Pushdown) 113 117 4 138.8 7.2 32.8X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4151 4160 7 3.8 263.9 1.0X -Parquet Vectorized (Pushdown) 1271 1272 2 12.4 80.8 3.3X -Native ORC Vectorized 3831 3852 28 4.1 243.6 1.1X -Native ORC Vectorized (Pushdown) 1215 1220 8 12.9 77.3 3.4X +Parquet Vectorized 4546 4557 13 3.5 289.0 1.0X +Parquet Vectorized (Pushdown) 1298 1316 13 12.1 82.5 3.5X +Native ORC Vectorized 4140 4148 10 3.8 263.2 1.1X +Native ORC Vectorized (Pushdown) 1230 1242 8 12.8 78.2 3.7X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7365 7373 7 2.1 468.2 1.0X -Parquet Vectorized (Pushdown) 5807 5812 3 2.7 369.2 1.3X -Native ORC Vectorized 6941 6957 26 2.3 441.3 1.1X -Native ORC Vectorized (Pushdown) 5492 5501 6 2.9 349.1 1.3X +Parquet Vectorized 7826 7841 17 2.0 497.6 1.0X +Parquet Vectorized (Pushdown) 6053 6058 5 2.6 384.8 1.3X +Native ORC Vectorized 7235 7238 5 2.2 460.0 1.1X +Native ORC Vectorized (Pushdown) 5642 5653 8 2.8 358.7 1.4X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10482 10486 4 1.5 666.4 1.0X -Parquet Vectorized (Pushdown) 10208 10208 1 1.5 649.0 1.0X -Native ORC Vectorized 9953 9965 10 1.6 632.8 1.1X -Native ORC Vectorized (Pushdown) 9700 9711 10 1.6 616.7 1.1X +Parquet Vectorized 10998 11011 8 1.4 699.2 1.0X +Parquet Vectorized (Pushdown) 10684 10693 6 1.5 679.3 1.0X +Native ORC Vectorized 10296 10307 9 1.5 654.6 1.1X +Native ORC Vectorized (Pushdown) 10024 10028 6 1.6 637.3 1.1X ================================================================================================ Pushdown benchmark with many filters ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 row with 1 
filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 152 160 8 0.0 152064240.0 1.0X -Parquet Vectorized (Pushdown) 154 159 5 0.0 153810651.0 1.0X -Native ORC Vectorized 144 148 5 0.0 144416828.0 1.1X -Native ORC Vectorized (Pushdown) 155 162 10 0.0 154769410.0 1.0X +Parquet Vectorized 141 147 5 0.0 141156296.0 1.0X +Parquet Vectorized (Pushdown) 141 146 4 0.0 141219096.0 1.0X +Native ORC Vectorized 135 139 5 0.0 135256253.0 1.0X +Native ORC Vectorized (Pushdown) 145 150 5 0.0 144968981.0 1.0X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 1255 1322 59 0.0 1254527065.0 1.0X -Parquet Vectorized (Pushdown) 1309 1369 71 0.0 1308503240.0 1.0X -Native ORC Vectorized 1256 1318 64 0.0 1256393786.0 1.0X -Native ORC Vectorized (Pushdown) 1277 1300 19 0.0 1276552021.0 1.0X +Parquet Vectorized 1547 1593 60 0.0 1546942527.0 1.0X +Parquet Vectorized (Pushdown) 1613 1648 37 0.0 1612664638.0 1.0X +Native ORC Vectorized 1553 1593 54 0.0 1552761511.0 1.0X +Native ORC Vectorized (Pushdown) 1575 1619 47 0.0 1574744637.0 1.0X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 5400 5578 230 0.0 
5399589388.0 1.0X -Parquet Vectorized (Pushdown) 5590 5724 179 0.0 5590208406.0 1.0X -Native ORC Vectorized 5403 5533 162 0.0 5402710816.0 1.0X -Native ORC Vectorized (Pushdown) 5442 5534 126 0.0 5442267827.0 1.0X +Parquet Vectorized 6791 6927 175 0.0 6790722699.0 1.0X +Parquet Vectorized (Pushdown) 6996 7118 208 0.0 6996423296.0 1.0X +Native ORC Vectorized 6785 6897 203 0.0 6785074307.0 1.0X +Native ORC Vectorized (Pushdown) 6859 6945 159 0.0 6859258499.0 1.0X diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-jdk17-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-jdk17-results.txt index 230ff9873c..6663cc6d10 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-jdk17-results.txt @@ -2,669 +2,733 @@ Pushdown for many distinct value case ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 12765 12814 75 1.2 811.6 1.0X -Parquet Vectorized (Pushdown) 642 657 15 24.5 40.8 19.9X -Native ORC Vectorized 8167 8208 47 1.9 519.3 1.6X -Native ORC Vectorized (Pushdown) 608 624 11 25.9 38.6 21.0X +Parquet Vectorized 8574 8745 248 1.8 545.1 1.0X +Parquet Vectorized (Pushdown) 521 540 14 30.2 33.1 16.4X +Native ORC Vectorized 7117 7171 76 2.2 452.5 1.2X +Native ORC Vectorized (Pushdown) 499 507 15 31.6 31.7 17.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 
5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12909 12921 9 1.2 820.7 1.0X -Parquet Vectorized (Pushdown) 647 669 38 24.3 41.1 19.9X -Native ORC Vectorized 8325 8333 6 1.9 529.3 1.6X -Native ORC Vectorized (Pushdown) 592 612 26 26.6 37.7 21.8X +Parquet Vectorized 8658 8673 12 1.8 550.4 1.0X +Parquet Vectorized (Pushdown) 498 509 17 31.6 31.6 17.4X +Native ORC Vectorized 7231 7238 8 2.2 459.7 1.2X +Native ORC Vectorized (Pushdown) 494 506 13 31.8 31.4 17.5X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 12915 12942 20 1.2 821.1 1.0X -Parquet Vectorized (Pushdown) 626 642 10 25.1 39.8 20.6X -Native ORC Vectorized 8332 8359 29 1.9 529.8 1.6X -Native ORC Vectorized (Pushdown) 579 585 6 27.2 36.8 22.3X +Parquet Vectorized 8627 8642 13 1.8 548.5 1.0X +Parquet Vectorized (Pushdown) 496 509 13 31.7 31.5 17.4X +Native ORC Vectorized 7203 7212 11 2.2 457.9 1.2X +Native ORC Vectorized (Pushdown) 494 498 8 31.9 31.4 17.5X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12881 12905 30 1.2 819.0 1.0X -Parquet Vectorized (Pushdown) 623 633 9 25.2 39.6 20.7X -Native ORC Vectorized 8363 8386 20 1.9 531.7 1.5X -Native ORC Vectorized (Pushdown) 579 598 32 27.2 36.8 22.2X +Parquet Vectorized 8579 8612 32 1.8 545.4 1.0X +Parquet Vectorized (Pushdown) 479 487 7 32.8 30.5 17.9X +Native ORC Vectorized 7194 7211 17 2.2 457.4 1.2X +Native ORC Vectorized (Pushdown) 474 480 6 33.2 30.1 18.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12983 13017 44 1.2 825.4 1.0X -Parquet Vectorized (Pushdown) 636 648 15 24.7 40.5 20.4X -Native ORC Vectorized 8344 8376 33 1.9 530.5 1.6X -Native ORC Vectorized (Pushdown) 578 583 6 27.2 36.7 22.5X +Parquet Vectorized 8637 8644 10 1.8 549.1 1.0X +Parquet Vectorized (Pushdown) 485 500 10 32.5 30.8 17.8X +Native ORC Vectorized 7182 7192 8 2.2 456.6 1.2X +Native ORC Vectorized (Pushdown) 469 474 4 33.5 29.8 18.4X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 23020 23057 29 0.7 1463.6 1.0X -Parquet 
Vectorized (Pushdown) 23149 23187 22 0.7 1471.8 1.0X -Native ORC Vectorized 18361 18387 18 0.9 1167.4 1.3X -Native ORC Vectorized (Pushdown) 18672 18719 36 0.8 1187.1 1.2X +Parquet Vectorized 16691 16738 33 0.9 1061.2 1.0X +Parquet Vectorized (Pushdown) 16714 16745 26 0.9 1062.6 1.0X +Native ORC Vectorized 15315 15347 26 1.0 973.7 1.1X +Native ORC Vectorized (Pushdown) 15495 15524 35 1.0 985.1 1.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11761 11834 92 1.3 747.7 1.0X -Parquet Vectorized (Pushdown) 600 616 11 26.2 38.1 19.6X -Native ORC Vectorized 7481 7524 49 2.1 475.6 1.6X -Native ORC Vectorized (Pushdown) 543 548 3 29.0 34.5 21.7X +Parquet Vectorized 8048 8096 82 2.0 511.7 1.0X +Parquet Vectorized (Pushdown) 462 473 8 34.1 29.4 17.4X +Native ORC Vectorized 6436 6479 47 2.4 409.2 1.3X +Native ORC Vectorized (Pushdown) 445 453 6 35.4 28.3 18.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11790 11806 10 1.3 749.6 1.0X -Parquet Vectorized (Pushdown) 616 626 8 25.5 39.2 19.1X -Native ORC Vectorized 7555 7582 18 2.1 480.3 1.6X -Native ORC Vectorized (Pushdown) 558 566 8 28.2 35.5 21.1X +Parquet Vectorized 8055 
8072 13 2.0 512.1 1.0X +Parquet Vectorized (Pushdown) 471 485 11 33.4 29.9 17.1X +Native ORC Vectorized 6425 6447 15 2.4 408.5 1.3X +Native ORC Vectorized (Pushdown) 456 459 2 34.5 29.0 17.7X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11858 11944 134 1.3 753.9 1.0X -Parquet Vectorized (Pushdown) 611 616 4 25.7 38.8 19.4X -Native ORC Vectorized 7590 7605 18 2.1 482.6 1.6X -Native ORC Vectorized (Pushdown) 547 554 6 28.8 34.8 21.7X +Parquet Vectorized 8119 8128 11 1.9 516.2 1.0X +Parquet Vectorized (Pushdown) 473 479 4 33.2 30.1 17.1X +Native ORC Vectorized 6488 6497 9 2.4 412.5 1.3X +Native ORC Vectorized (Pushdown) 446 455 6 35.2 28.4 18.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11866 11882 17 1.3 754.4 1.0X -Parquet Vectorized (Pushdown) 607 616 9 25.9 38.6 19.6X -Native ORC Vectorized 7549 7567 13 2.1 480.0 1.6X -Native ORC Vectorized (Pushdown) 547 553 7 28.7 34.8 21.7X +Parquet Vectorized 8098 8105 8 1.9 514.8 1.0X +Parquet Vectorized (Pushdown) 458 464 5 34.3 29.1 17.7X +Native ORC Vectorized 6504 6512 6 2.4 413.5 1.2X +Native ORC Vectorized (Pushdown) 455 461 4 34.6 28.9 17.8X -OpenJDK 64-Bit Server VM 
17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11876 11913 39 1.3 755.1 1.0X -Parquet Vectorized (Pushdown) 606 611 6 25.9 38.5 19.6X -Native ORC Vectorized 7578 7608 30 2.1 481.8 1.6X -Native ORC Vectorized (Pushdown) 543 553 10 29.0 34.5 21.9X +Parquet Vectorized 8088 8110 20 1.9 514.2 1.0X +Parquet Vectorized (Pushdown) 467 472 4 33.7 29.7 17.3X +Native ORC Vectorized 6486 6497 10 2.4 412.4 1.2X +Native ORC Vectorized (Pushdown) 446 450 3 35.3 28.4 18.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11885 11913 32 1.3 755.6 1.0X -Parquet Vectorized (Pushdown) 602 610 6 26.1 38.3 19.7X -Native ORC Vectorized 7563 7582 19 2.1 480.9 1.6X -Native ORC Vectorized (Pushdown) 541 559 19 29.0 34.4 22.0X +Parquet Vectorized 8117 8135 18 1.9 516.0 1.0X +Parquet Vectorized (Pushdown) 457 465 11 34.4 29.1 17.8X +Native ORC Vectorized 6533 6540 7 2.4 415.4 1.2X +Native ORC Vectorized (Pushdown) 445 450 4 35.3 28.3 18.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 
2.60GHz Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 12837 12874 26 1.2 816.2 1.0X -Parquet Vectorized (Pushdown) 2745 2753 7 5.7 174.5 4.7X -Native ORC Vectorized 8541 8554 10 1.8 543.1 1.5X -Native ORC Vectorized (Pushdown) 2248 2264 10 7.0 143.0 5.7X +Parquet Vectorized 8924 8937 18 1.8 567.4 1.0X +Parquet Vectorized (Pushdown) 2043 2104 128 7.7 129.9 4.4X +Native ORC Vectorized 7331 7338 6 2.1 466.1 1.2X +Native ORC Vectorized (Pushdown) 1891 1898 4 8.3 120.2 4.7X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 16394 16422 18 1.0 1042.3 1.0X -Parquet Vectorized (Pushdown) 10836 10845 8 1.5 688.9 1.5X -Native ORC Vectorized 12096 12113 11 1.3 769.1 1.4X -Native ORC Vectorized (Pushdown) 8736 8745 9 1.8 555.4 1.9X +Parquet Vectorized 11730 11775 49 1.3 745.8 1.0X +Parquet Vectorized (Pushdown) 7984 7998 10 2.0 507.6 1.5X +Native ORC Vectorized 10196 10222 16 1.5 648.3 1.2X +Native ORC Vectorized (Pushdown) 7270 7288 26 2.2 462.2 1.6X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 19941 19973 22 0.8 1267.8 1.0X -Parquet Vectorized (Pushdown) 18899 18962 71 0.8 1201.6 1.1X -Native ORC Vectorized 15626 15639 8 1.0 993.5 1.3X -Native ORC Vectorized (Pushdown) 15207 15232 18 1.0 966.8 1.3X +Parquet Vectorized 14566 14601 26 1.1 926.1 1.0X +Parquet Vectorized (Pushdown) 13905 13931 28 1.1 884.0 1.0X +Native ORC Vectorized 13103 13143 32 1.2 833.1 1.1X +Native ORC Vectorized (Pushdown) 12662 12673 15 1.2 805.0 1.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 20800 20839 26 0.8 1322.4 1.0X -Parquet Vectorized (Pushdown) 20938 21045 140 0.8 1331.2 1.0X -Native ORC Vectorized 16524 16685 162 1.0 1050.6 1.3X -Native ORC Vectorized (Pushdown) 16759 16879 152 0.9 1065.5 1.2X +Parquet Vectorized 15339 15362 13 1.0 975.3 1.0X +Parquet Vectorized (Pushdown) 15433 15446 12 1.0 981.2 1.0X +Native ORC Vectorized 13781 13797 12 1.1 876.1 1.1X +Native ORC Vectorized (Pushdown) 13946 13953 8 1.1 886.7 1.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 20830 20851 25 0.8 1324.3 1.0X 
-Parquet Vectorized (Pushdown) 20940 21030 113 0.8 1331.3 1.0X -Native ORC Vectorized 16530 16553 22 1.0 1050.9 1.3X -Native ORC Vectorized (Pushdown) 16691 16756 94 0.9 1061.2 1.2X +Parquet Vectorized 15314 15371 48 1.0 973.6 1.0X +Parquet Vectorized (Pushdown) 15440 15451 9 1.0 981.6 1.0X +Native ORC Vectorized 13805 13832 28 1.1 877.7 1.1X +Native ORC Vectorized (Pushdown) 13950 13975 22 1.1 886.9 1.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 20794 20854 54 0.8 1322.0 1.0X -Parquet Vectorized (Pushdown) 20845 20896 88 0.8 1325.3 1.0X -Native ORC Vectorized 16537 16558 14 1.0 1051.4 1.3X -Native ORC Vectorized (Pushdown) 16801 16810 12 0.9 1068.2 1.2X +Parquet Vectorized 15288 15327 29 1.0 972.0 1.0X +Parquet Vectorized (Pushdown) 15412 15434 13 1.0 979.9 1.0X +Native ORC Vectorized 13805 13824 16 1.1 877.7 1.1X +Native ORC Vectorized (Pushdown) 13997 14021 24 1.1 889.9 1.1X ================================================================================================ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11227 11272 51 1.4 713.8 1.0X -Parquet Vectorized (Pushdown) 533 544 11 29.5 33.9 21.1X -Native ORC Vectorized 8937 8959 37 1.8 568.2 1.3X -Native ORC Vectorized (Pushdown) 991 1016 36 15.9 63.0 11.3X +Parquet Vectorized 7565 7605 35 2.1 481.0 1.0X +Parquet Vectorized (Pushdown) 403 415 13 39.0 25.6 18.8X +Native ORC Vectorized 7768 7781 19 2.0 493.8 1.0X +Native ORC Vectorized (Pushdown) 818 823 3 19.2 52.0 9.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11419 11432 14 1.4 726.0 1.0X -Parquet Vectorized (Pushdown) 529 538 10 29.7 33.7 21.6X -Native ORC Vectorized 9244 9269 22 1.7 587.7 1.2X -Native ORC Vectorized (Pushdown) 1009 1030 39 15.6 64.1 11.3X +Parquet Vectorized 7742 7755 10 2.0 492.2 1.0X +Parquet Vectorized (Pushdown) 406 413 5 38.8 25.8 19.1X +Native ORC Vectorized 8019 8030 11 2.0 509.8 1.0X +Native ORC Vectorized (Pushdown) 822 826 3 19.1 52.2 9.4X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11275 11304 19 1.4 716.9 1.0X 
-Parquet Vectorized (Pushdown) 615 639 40 25.6 39.1 18.3X -Native ORC Vectorized 9160 9181 15 1.7 582.4 1.2X -Native ORC Vectorized (Pushdown) 1073 1079 4 14.7 68.2 10.5X +Parquet Vectorized 7575 7582 6 2.1 481.6 1.0X +Parquet Vectorized (Pushdown) 459 468 7 34.3 29.2 16.5X +Native ORC Vectorized 7934 7944 10 2.0 504.4 1.0X +Native ORC Vectorized (Pushdown) 879 889 16 17.9 55.9 8.6X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11263 11276 12 1.4 716.1 1.0X -Parquet Vectorized (Pushdown) 615 619 4 25.6 39.1 18.3X -Native ORC Vectorized 9133 9158 17 1.7 580.6 1.2X -Native ORC Vectorized (Pushdown) 1064 1081 33 14.8 67.7 10.6X +Parquet Vectorized 7590 7611 16 2.1 482.6 1.0X +Parquet Vectorized (Pushdown) 457 463 5 34.4 29.0 16.6X +Native ORC Vectorized 7936 7942 6 2.0 504.6 1.0X +Native ORC Vectorized (Pushdown) 873 879 4 18.0 55.5 8.7X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11342 11441 148 1.4 721.1 1.0X -Parquet Vectorized (Pushdown) 615 623 7 25.6 39.1 18.4X -Native ORC Vectorized 9206 9221 12 1.7 585.3 1.2X -Native ORC Vectorized (Pushdown) 1063 1071 6 14.8 67.6 10.7X 
+Parquet Vectorized 7715 7724 8 2.0 490.5 1.0X +Parquet Vectorized (Pushdown) 463 471 5 33.9 29.5 16.6X +Native ORC Vectorized 8038 8043 7 2.0 511.1 1.0X +Native ORC Vectorized (Pushdown) 874 884 8 18.0 55.6 8.8X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 22292 22350 104 0.7 1417.3 1.0X -Parquet Vectorized (Pushdown) 22393 22474 89 0.7 1423.7 1.0X -Native ORC Vectorized 20424 20439 15 0.8 1298.6 1.1X -Native ORC Vectorized (Pushdown) 20750 20803 36 0.8 1319.3 1.1X +Parquet Vectorized 16502 16555 40 1.0 1049.1 1.0X +Parquet Vectorized (Pushdown) 16597 16624 20 0.9 1055.2 1.0X +Native ORC Vectorized 17396 17568 116 0.9 1106.0 0.9X +Native ORC Vectorized (Pushdown) 17437 17572 188 0.9 1108.6 0.9X ================================================================================================ Pushdown benchmark for StringStartsWith ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 13199 13292 80 1.2 839.2 1.0X -Parquet Vectorized (Pushdown) 1776 1785 8 8.9 112.9 7.4X -Native ORC Vectorized 8608 8650 
52 1.8 547.3 1.5X -Native ORC Vectorized (Pushdown) 8720 8742 22 1.8 554.4 1.5X +Parquet Vectorized 8868 8996 74 1.8 563.8 1.0X +Parquet Vectorized (Pushdown) 1235 1272 36 12.7 78.5 7.2X +Native ORC Vectorized 7295 7418 112 2.2 463.8 1.2X +Native ORC Vectorized (Pushdown) 7519 7620 82 2.1 478.0 1.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12843 12858 14 1.2 816.5 1.0X -Parquet Vectorized (Pushdown) 612 629 19 25.7 38.9 21.0X -Native ORC Vectorized 8280 8288 10 1.9 526.4 1.6X -Native ORC Vectorized (Pushdown) 8436 8445 7 1.9 536.4 1.5X +Parquet Vectorized 8767 8846 67 1.8 557.4 1.0X +Parquet Vectorized (Pushdown) 469 477 5 33.5 29.8 18.7X +Native ORC Vectorized 7156 7273 78 2.2 454.9 1.2X +Native ORC Vectorized (Pushdown) 7483 7494 10 2.1 475.7 1.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12921 12946 19 1.2 821.5 1.0X -Parquet Vectorized (Pushdown) 599 609 7 26.2 38.1 21.6X -Native ORC Vectorized 8291 8300 10 1.9 527.1 1.6X -Native ORC Vectorized (Pushdown) 8461 8480 21 1.9 537.9 1.5X +Parquet Vectorized 8672 8843 108 1.8 551.4 1.0X +Parquet Vectorized (Pushdown) 461 466 5 34.1 29.3 
18.8X +Native ORC Vectorized 7029 7099 54 2.2 446.9 1.2X +Native ORC Vectorized (Pushdown) 7297 7374 63 2.2 464.0 1.2X + + +================================================================================================ +Pushdown benchmark for StringEndsWith +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 7611 7660 63 2.1 483.9 1.0X +Parquet Vectorized (Pushdown) 551 581 29 28.6 35.0 13.8X +Native ORC Vectorized 7859 7965 103 2.0 499.6 1.0X +Native ORC Vectorized (Pushdown) 8131 8226 63 1.9 516.9 0.9X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 7386 7511 72 2.1 469.6 1.0X +Parquet Vectorized (Pushdown) 461 469 5 34.1 29.3 16.0X +Native ORC Vectorized 7813 7886 47 2.0 496.7 0.9X +Native ORC Vectorized (Pushdown) 7979 8222 312 2.0 507.3 0.9X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 7521 7625 180 2.1 478.1 1.0X +Parquet Vectorized (Pushdown) 464 466 2 33.9 29.5 16.2X +Native ORC Vectorized 7780 
7873 60 2.0 494.6 1.0X +Native ORC Vectorized (Pushdown) 8166 8205 46 1.9 519.2 0.9X + + +================================================================================================ +Pushdown benchmark for StringContains +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 7883 7893 16 2.0 501.2 1.0X +Parquet Vectorized (Pushdown) 1128 1132 7 13.9 71.7 7.0X +Native ORC Vectorized 8256 8280 27 1.9 524.9 1.0X +Native ORC Vectorized (Pushdown) 8522 8533 7 1.8 541.8 0.9X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +---------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 7576 7581 7 2.1 481.7 1.0X +Parquet Vectorized (Pushdown) 468 477 8 33.6 29.8 16.2X +Native ORC Vectorized 7772 7899 99 2.0 494.1 1.0X +Native ORC Vectorized (Pushdown) 8069 8175 74 1.9 513.0 0.9X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------ +Parquet Vectorized 7401 7461 36 2.1 470.5 1.0X +Parquet Vectorized (Pushdown) 464 471 5 33.9 29.5 16.0X +Native ORC Vectorized 7720 7812 90 2.0 490.9 1.0X +Native ORC 
Vectorized (Pushdown) 8031 8176 82 2.0 510.6 0.9X ================================================================================================ Pushdown benchmark for decimal ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6087 6102 23 2.6 387.0 1.0X -Parquet Vectorized (Pushdown) 150 158 8 104.5 9.6 40.5X -Native ORC Vectorized 5614 5631 21 2.8 356.9 1.1X -Native ORC Vectorized (Pushdown) 184 188 4 85.3 11.7 33.0X +Parquet Vectorized 3601 3623 25 4.4 229.0 1.0X +Parquet Vectorized (Pushdown) 118 122 5 132.9 7.5 30.4X +Native ORC Vectorized 4979 4997 26 3.2 316.6 0.7X +Native ORC Vectorized (Pushdown) 155 159 5 101.7 9.8 23.3X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8032 8051 18 2.0 510.7 1.0X -Parquet Vectorized (Pushdown) 3277 3284 6 4.8 208.3 2.5X -Native ORC Vectorized 7586 7596 6 2.1 482.3 1.1X -Native ORC Vectorized (Pushdown) 3244 3255 8 4.8 206.3 2.5X +Parquet Vectorized 5223 5232 9 3.0 332.0 1.0X +Parquet Vectorized (Pushdown) 2442 2453 13 6.4 155.3 2.1X +Native ORC Vectorized 6650 
6651 2 2.4 422.8 0.8X +Native ORC Vectorized (Pushdown) 2777 2781 3 5.7 176.6 1.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14463 14475 16 1.1 919.5 1.0X -Parquet Vectorized (Pushdown) 13700 13816 217 1.1 871.0 1.1X -Native ORC Vectorized 14172 14194 19 1.1 901.0 1.0X -Native ORC Vectorized (Pushdown) 13518 13537 19 1.2 859.4 1.1X +Parquet Vectorized 10591 10617 33 1.5 673.3 1.0X +Parquet Vectorized (Pushdown) 10190 10200 8 1.5 647.9 1.0X +Native ORC Vectorized 12103 12126 16 1.3 769.5 0.9X +Native ORC Vectorized (Pushdown) 11524 11533 7 1.4 732.6 0.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 16161 16182 15 1.0 1027.5 1.0X -Parquet Vectorized (Pushdown) 16195 16225 22 1.0 1029.6 1.0X -Native ORC Vectorized 15883 15917 25 1.0 1009.8 1.0X -Native ORC Vectorized (Pushdown) 16004 16038 24 1.0 1017.5 1.0X +Parquet Vectorized 11946 12068 197 1.3 759.5 1.0X +Parquet Vectorized (Pushdown) 12013 12030 15 1.3 763.7 1.0X +Native ORC Vectorized 13455 13478 19 1.2 855.5 0.9X +Native ORC Vectorized (Pushdown) 13553 13577 21 1.2 861.7 0.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 
5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6321 6349 29 2.5 401.9 1.0X -Parquet Vectorized (Pushdown) 149 158 10 105.5 9.5 42.4X -Native ORC Vectorized 5584 5602 12 2.8 355.1 1.1X -Native ORC Vectorized (Pushdown) 180 186 8 87.5 11.4 35.2X +Parquet Vectorized 3789 3802 14 4.2 240.9 1.0X +Parquet Vectorized (Pushdown) 118 123 5 133.3 7.5 32.1X +Native ORC Vectorized 4973 4985 25 3.2 316.2 0.8X +Native ORC Vectorized (Pushdown) 150 155 5 104.9 9.5 25.3X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7339 7379 45 2.1 466.6 1.0X -Parquet Vectorized (Pushdown) 1780 1790 7 8.8 113.1 4.1X -Native ORC Vectorized 6664 6674 8 2.4 423.7 1.1X -Native ORC Vectorized (Pushdown) 1759 1775 15 8.9 111.8 4.2X +Parquet Vectorized 4605 4623 20 3.4 292.8 1.0X +Parquet Vectorized (Pushdown) 1309 1320 20 12.0 83.2 3.5X +Native ORC Vectorized 5829 5836 8 2.7 370.6 0.8X +Native ORC Vectorized (Pushdown) 1468 1473 4 10.7 93.3 3.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 
50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11393 11414 19 1.4 724.3 1.0X -Parquet Vectorized (Pushdown) 8339 8350 11 1.9 530.2 1.4X -Native ORC Vectorized 10815 10856 55 1.5 687.6 1.1X -Native ORC Vectorized (Pushdown) 8108 8122 16 1.9 515.5 1.4X +Parquet Vectorized 7862 7871 8 2.0 499.8 1.0X +Parquet Vectorized (Pushdown) 6056 6064 9 2.6 385.0 1.3X +Native ORC Vectorized 9165 9179 11 1.7 582.7 0.9X +Native ORC Vectorized (Pushdown) 6774 6781 6 2.3 430.7 1.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 15297 15319 21 1.0 972.6 1.0X -Parquet Vectorized (Pushdown) 14771 14794 22 1.1 939.1 1.0X -Native ORC Vectorized 14853 14864 11 1.1 944.3 1.0X -Native ORC Vectorized (Pushdown) 14364 14376 11 1.1 913.2 1.1X +Parquet Vectorized 11016 11026 8 1.4 700.4 1.0X +Parquet Vectorized (Pushdown) 10730 10744 12 1.5 682.2 1.0X +Native ORC Vectorized 12419 12427 9 1.3 789.6 0.9X +Native ORC Vectorized (Pushdown) 12015 12030 12 1.3 763.9 0.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8333 8368 43 1.9 529.8 1.0X -Parquet Vectorized (Pushdown) 162 172 14 97.0 10.3 51.4X -Native ORC Vectorized 5561 5584 21 2.8 353.6 1.5X -Native ORC Vectorized (Pushdown) 180 184 4 87.6 11.4 46.4X +Parquet Vectorized 5415 5429 17 2.9 344.3 1.0X +Parquet Vectorized (Pushdown) 127 132 5 123.9 8.1 42.7X +Native ORC Vectorized 4982 4998 14 3.2 316.7 1.1X +Native ORC Vectorized (Pushdown) 152 156 4 103.2 9.7 35.5X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9597 9614 18 1.6 610.2 1.0X -Parquet Vectorized (Pushdown) 2225 2241 12 7.1 141.5 4.3X -Native ORC Vectorized 6778 6799 21 2.3 430.9 1.4X -Native ORC Vectorized (Pushdown) 1894 1898 5 8.3 120.4 5.1X +Parquet Vectorized 6415 6445 31 2.5 407.9 1.0X +Parquet Vectorized (Pushdown) 1661 1663 3 9.5 105.6 3.9X +Native ORC Vectorized 5923 5933 7 2.7 376.6 1.1X +Native ORC Vectorized (Pushdown) 1593 1594 1 9.9 101.3 4.0X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14523 14644 239 1.1 923.3 1.0X 
-Parquet Vectorized (Pushdown) 10466 10492 30 1.5 665.4 1.4X -Native ORC Vectorized 11500 11577 144 1.4 731.2 1.3X -Native ORC Vectorized (Pushdown) 8835 8879 75 1.8 561.7 1.6X +Parquet Vectorized 10410 10426 16 1.5 661.9 1.0X +Parquet Vectorized (Pushdown) 7809 7819 9 2.0 496.5 1.3X +Native ORC Vectorized 9825 9834 9 1.6 624.6 1.1X +Native ORC Vectorized (Pushdown) 7419 7426 9 2.1 471.7 1.4X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 19354 19459 206 0.8 1230.5 1.0X -Parquet Vectorized (Pushdown) 18751 18770 17 0.8 1192.2 1.0X -Native ORC Vectorized 16128 16195 74 1.0 1025.4 1.2X -Native ORC Vectorized (Pushdown) 15596 15619 31 1.0 991.6 1.2X +Parquet Vectorized 14383 14398 16 1.1 914.5 1.0X +Parquet Vectorized (Pushdown) 13876 13900 14 1.1 882.2 1.0X +Native ORC Vectorized 13625 13638 12 1.2 866.2 1.1X +Native ORC Vectorized (Pushdown) 13194 13226 19 1.2 838.9 1.1X ================================================================================================ Pushdown benchmark for InSet -> InFilters ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11822 11863 27 1.3 751.6 1.0X -Parquet Vectorized (Pushdown) 626 649 37 25.1 39.8 18.9X -Native ORC Vectorized 7601 7643 46 2.1 483.3 1.6X -Native ORC Vectorized (Pushdown) 551 568 23 28.5 35.0 21.4X +Parquet Vectorized 8086 8129 75 1.9 514.1 1.0X +Parquet Vectorized (Pushdown) 472 484 9 33.3 30.0 17.1X +Native ORC Vectorized 6513 6533 32 2.4 414.1 1.2X +Native ORC Vectorized (Pushdown) 443 472 50 35.5 28.1 18.3X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11775 11791 14 1.3 748.7 1.0X -Parquet Vectorized (Pushdown) 631 645 19 24.9 40.1 18.7X -Native ORC Vectorized 7530 7543 12 2.1 478.7 1.6X -Native ORC Vectorized (Pushdown) 548 558 10 28.7 34.9 21.5X +Parquet Vectorized 8079 8103 22 1.9 513.6 1.0X +Parquet Vectorized (Pushdown) 468 476 6 33.6 29.7 17.3X +Native ORC Vectorized 6487 6495 5 2.4 412.4 1.2X +Native ORC Vectorized (Pushdown) 446 454 6 35.3 28.4 18.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 
11764 11775 17 1.3 747.9 1.0X -Parquet Vectorized (Pushdown) 631 649 27 24.9 40.1 18.6X -Native ORC Vectorized 7509 7534 28 2.1 477.4 1.6X -Native ORC Vectorized (Pushdown) 555 564 13 28.4 35.3 21.2X +Parquet Vectorized 8016 8028 8 2.0 509.7 1.0X +Parquet Vectorized (Pushdown) 473 477 5 33.3 30.1 17.0X +Native ORC Vectorized 6488 6492 3 2.4 412.5 1.2X +Native ORC Vectorized (Pushdown) 448 456 5 35.1 28.5 17.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11822 11843 14 1.3 751.6 1.0X -Parquet Vectorized (Pushdown) 660 676 26 23.8 42.0 17.9X -Native ORC Vectorized 7548 7561 14 2.1 479.9 1.6X -Native ORC Vectorized (Pushdown) 574 590 20 27.4 36.5 20.6X +Parquet Vectorized 8057 8068 8 2.0 512.2 1.0X +Parquet Vectorized (Pushdown) 484 491 5 32.5 30.8 16.7X +Native ORC Vectorized 6519 6525 5 2.4 414.5 1.2X +Native ORC Vectorized (Pushdown) 476 481 4 33.1 30.2 16.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11841 11869 24 1.3 752.8 1.0X -Parquet Vectorized (Pushdown) 672 681 15 23.4 42.8 17.6X -Native ORC Vectorized 7561 7574 18 2.1 480.7 1.6X -Native ORC 
Vectorized (Pushdown) 580 585 5 27.1 36.9 20.4X +Parquet Vectorized 8126 8147 20 1.9 516.6 1.0X +Parquet Vectorized (Pushdown) 495 501 5 31.8 31.5 16.4X +Native ORC Vectorized 6501 6508 8 2.4 413.3 1.2X +Native ORC Vectorized (Pushdown) 470 473 3 33.5 29.9 17.3X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11788 11807 18 1.3 749.5 1.0X -Parquet Vectorized (Pushdown) 661 678 34 23.8 42.0 17.8X -Native ORC Vectorized 7545 7572 20 2.1 479.7 1.6X -Native ORC Vectorized (Pushdown) 572 580 6 27.5 36.4 20.6X +Parquet Vectorized 8088 8096 6 1.9 514.2 1.0X +Parquet Vectorized (Pushdown) 498 500 2 31.6 31.7 16.2X +Native ORC Vectorized 6489 6499 8 2.4 412.6 1.2X +Native ORC Vectorized (Pushdown) 467 472 5 33.7 29.7 17.3X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12133 12148 11 1.3 771.4 1.0X -Parquet Vectorized (Pushdown) 1762 1771 6 8.9 112.1 6.9X -Native ORC Vectorized 7857 7877 13 2.0 499.6 1.5X -Native ORC Vectorized (Pushdown) 744 760 31 21.1 47.3 16.3X +Parquet Vectorized 8388 8396 7 1.9 533.3 1.0X +Parquet Vectorized (Pushdown) 1251 1259 7 12.6 79.5 6.7X 
+Native ORC Vectorized 6763 6768 3 2.3 430.0 1.2X +Native ORC Vectorized (Pushdown) 603 608 4 26.1 38.3 13.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12137 12244 202 1.3 771.6 1.0X -Parquet Vectorized (Pushdown) 6283 6302 12 2.5 399.5 1.9X -Native ORC Vectorized 7856 7859 3 2.0 499.5 1.5X -Native ORC Vectorized (Pushdown) 772 778 5 20.4 49.1 15.7X +Parquet Vectorized 8422 8459 25 1.9 535.5 1.0X +Parquet Vectorized (Pushdown) 4326 4329 3 3.6 275.0 1.9X +Native ORC Vectorized 6833 6839 4 2.3 434.4 1.2X +Native ORC Vectorized (Pushdown) 632 636 5 24.9 40.2 13.3X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12112 12157 78 1.3 770.0 1.0X -Parquet Vectorized (Pushdown) 10496 10511 14 1.5 667.3 1.2X -Native ORC Vectorized 7842 7855 12 2.0 498.6 1.5X -Native ORC Vectorized (Pushdown) 783 790 12 20.1 49.8 15.5X +Parquet Vectorized 8395 8399 3 1.9 533.8 1.0X +Parquet Vectorized (Pushdown) 7601 7780 162 2.1 483.3 1.1X +Native ORC Vectorized 6768 6777 9 2.3 430.3 1.2X +Native ORC Vectorized (Pushdown) 638 641 3 24.7 40.6 13.2X -OpenJDK 64-Bit Server VM 
17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12049 12065 15 1.3 766.0 1.0X -Parquet Vectorized (Pushdown) 1761 1769 7 8.9 111.9 6.8X -Native ORC Vectorized 7880 7894 13 2.0 501.0 1.5X -Native ORC Vectorized (Pushdown) 881 888 8 17.8 56.0 13.7X +Parquet Vectorized 8731 8759 22 1.8 555.1 1.0X +Parquet Vectorized (Pushdown) 1282 1286 4 12.3 81.5 6.8X +Native ORC Vectorized 6718 6762 38 2.3 427.1 1.3X +Native ORC Vectorized (Pushdown) 724 730 8 21.7 46.0 12.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12022 12052 24 1.3 764.3 1.0X -Parquet Vectorized (Pushdown) 6298 6310 14 2.5 400.4 1.9X -Native ORC Vectorized 7778 7789 10 2.0 494.5 1.5X -Native ORC Vectorized (Pushdown) 959 989 49 16.4 61.0 12.5X +Parquet Vectorized 8738 8745 5 1.8 555.6 1.0X +Parquet Vectorized (Pushdown) 4570 4588 12 3.4 290.5 1.9X +Native ORC Vectorized 6759 6792 21 2.3 429.7 1.3X +Native ORC Vectorized (Pushdown) 853 859 4 18.4 54.2 10.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 
5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12050 12073 19 1.3 766.1 1.0X -Parquet Vectorized (Pushdown) 10998 11013 18 1.4 699.3 1.1X -Native ORC Vectorized 7777 7788 8 2.0 494.5 1.5X -Native ORC Vectorized (Pushdown) 997 1006 13 15.8 63.4 12.1X +Parquet Vectorized 8671 8681 7 1.8 551.3 1.0X +Parquet Vectorized (Pushdown) 7576 7581 6 2.1 481.7 1.1X +Native ORC Vectorized 6710 6720 8 2.3 426.6 1.3X +Native ORC Vectorized (Pushdown) 837 840 3 18.8 53.2 10.4X ================================================================================================ Pushdown benchmark for tinyint ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6615 6664 58 2.4 420.6 1.0X -Parquet Vectorized (Pushdown) 221 228 5 71.1 14.1 29.9X -Native ORC Vectorized 3569 3596 35 4.4 226.9 1.9X -Native ORC Vectorized (Pushdown) 251 259 9 62.7 16.0 26.4X +Parquet Vectorized 3958 3978 26 4.0 251.6 1.0X +Parquet Vectorized (Pushdown) 166 171 6 94.8 10.5 23.9X +Native ORC Vectorized 2992 3000 12 5.3 190.2 1.3X +Native ORC Vectorized (Pushdown) 205 208 3 76.7 13.0 19.3X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 
8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7480 7506 25 2.1 475.6 1.0X -Parquet Vectorized (Pushdown) 1668 1675 7 9.4 106.0 4.5X -Native ORC Vectorized 4416 4429 14 3.6 280.8 1.7X -Native ORC Vectorized (Pushdown) 1396 1402 6 11.3 88.7 5.4X +Parquet Vectorized 4705 4714 11 3.3 299.1 1.0X +Parquet Vectorized (Pushdown) 1235 1239 3 12.7 78.5 3.8X +Native ORC Vectorized 3685 3691 4 4.3 234.3 1.3X +Native ORC Vectorized (Pushdown) 1155 1165 10 13.6 73.4 4.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11098 11116 12 1.4 705.6 1.0X -Parquet Vectorized (Pushdown) 7947 7961 11 2.0 505.3 1.4X -Native ORC Vectorized 7965 7972 5 2.0 506.4 1.4X -Native ORC Vectorized (Pushdown) 6308 6312 3 2.5 401.1 1.8X +Parquet Vectorized 7695 7703 6 2.0 489.2 1.0X +Parquet Vectorized (Pushdown) 5820 5830 14 2.7 370.0 1.3X +Native ORC Vectorized 6567 6574 4 2.4 417.5 1.2X +Native ORC Vectorized (Pushdown) 5187 5193 5 3.0 329.8 1.5X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% tinyint rows 
(value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14771 14814 46 1.1 939.1 1.0X -Parquet Vectorized (Pushdown) 14179 14193 12 1.1 901.5 1.0X -Native ORC Vectorized 11496 11521 19 1.4 730.9 1.3X -Native ORC Vectorized (Pushdown) 11222 11248 18 1.4 713.5 1.3X +Parquet Vectorized 10714 10724 11 1.5 681.2 1.0X +Parquet Vectorized (Pushdown) 10379 10390 13 1.5 659.9 1.0X +Native ORC Vectorized 9434 9444 10 1.7 599.8 1.1X +Native ORC Vectorized (Pushdown) 9221 9242 15 1.7 586.3 1.2X ================================================================================================ Pushdown benchmark for Timestamp ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6682 6692 9 2.4 424.9 1.0X -Parquet Vectorized (Pushdown) 6670 6687 17 2.4 424.0 1.0X -Native ORC Vectorized 3709 3712 3 4.2 235.8 1.8X -Native ORC Vectorized (Pushdown) 150 155 5 104.8 9.5 44.5X +Parquet Vectorized 4149 4162 13 3.8 263.8 1.0X +Parquet Vectorized (Pushdown) 4141 4151 9 3.8 263.2 1.0X +Native ORC Vectorized 3080 3091 12 5.1 195.8 1.3X +Native ORC Vectorized (Pushdown) 123 128 4 127.4 7.9 33.6X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 
64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7637 7737 194 2.1 485.5 1.0X -Parquet Vectorized (Pushdown) 7621 7651 26 2.1 484.5 1.0X -Native ORC Vectorized 4623 4630 8 3.4 293.9 1.7X -Native ORC Vectorized (Pushdown) 1436 1439 3 11.0 91.3 5.3X +Parquet Vectorized 4902 4923 13 3.2 311.7 1.0X +Parquet Vectorized (Pushdown) 4894 4899 5 3.2 311.2 1.0X +Native ORC Vectorized 3851 3858 10 4.1 244.8 1.3X +Native ORC Vectorized (Pushdown) 1191 1194 4 13.2 75.7 4.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11468 11522 74 1.4 729.1 1.0X -Parquet Vectorized (Pushdown) 11474 11490 13 1.4 729.5 1.0X -Native ORC Vectorized 8263 8267 5 1.9 525.3 1.4X -Native ORC Vectorized (Pushdown) 6526 6542 10 2.4 414.9 1.8X +Parquet Vectorized 8112 8137 15 1.9 515.7 1.0X +Parquet Vectorized (Pushdown) 8092 8108 18 1.9 514.4 1.0X +Native ORC Vectorized 6838 6842 4 2.3 434.7 1.2X +Native ORC Vectorized (Pushdown) 5377 5381 4 2.9 341.8 1.5X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 
5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 15138 15150 13 1.0 962.5 1.0X -Parquet Vectorized (Pushdown) 15137 15169 21 1.0 962.4 1.0X -Native ORC Vectorized 11930 11943 12 1.3 758.5 1.3X -Native ORC Vectorized (Pushdown) 11658 11676 16 1.3 741.2 1.3X +Parquet Vectorized 11222 11234 12 1.4 713.5 1.0X +Parquet Vectorized (Pushdown) 11234 11262 36 1.4 714.2 1.0X +Native ORC Vectorized 9837 9847 11 1.6 625.4 1.1X +Native ORC Vectorized (Pushdown) 9599 9604 5 1.6 610.3 1.2X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6205 6239 21 2.5 394.5 1.0X -Parquet Vectorized (Pushdown) 147 151 6 107.0 9.3 42.2X -Native ORC Vectorized 3712 3721 7 4.2 236.0 1.7X -Native ORC Vectorized (Pushdown) 149 156 8 105.4 9.5 41.6X +Parquet Vectorized 3663 3672 10 4.3 232.9 1.0X +Parquet Vectorized (Pushdown) 115 118 4 137.2 7.3 31.9X +Native ORC Vectorized 3079 3085 5 5.1 195.7 1.2X +Native ORC Vectorized (Pushdown) 122 125 4 128.5 7.8 29.9X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) 
Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7186 7205 27 2.2 456.9 1.0X -Parquet Vectorized (Pushdown) 1708 1717 8 9.2 108.6 4.2X -Native ORC Vectorized 4617 4629 7 3.4 293.5 1.6X -Native ORC Vectorized (Pushdown) 1433 1438 5 11.0 91.1 5.0X +Parquet Vectorized 4466 4476 13 3.5 284.0 1.0X +Parquet Vectorized (Pushdown) 1266 1270 5 12.4 80.5 3.5X +Native ORC Vectorized 3846 3851 5 4.1 244.5 1.2X +Native ORC Vectorized (Pushdown) 1190 1193 3 13.2 75.6 3.8X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10971 11022 90 1.4 697.5 1.0X -Parquet Vectorized (Pushdown) 7990 7998 7 2.0 508.0 1.4X -Native ORC Vectorized 8276 8285 14 1.9 526.2 1.3X -Native ORC Vectorized (Pushdown) 6527 6548 19 2.4 415.0 1.7X +Parquet Vectorized 7632 7648 18 2.1 485.2 1.0X +Parquet Vectorized (Pushdown) 5900 5904 4 2.7 375.1 1.3X +Native ORC Vectorized 6850 6856 7 2.3 435.5 1.1X +Native ORC Vectorized (Pushdown) 5390 5396 7 2.9 342.7 1.4X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) 
Platinum 8272CL CPU @ 2.60GHz Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14721 14817 102 1.1 935.9 1.0X -Parquet Vectorized (Pushdown) 14084 14129 41 1.1 895.4 1.0X -Native ORC Vectorized 11948 11957 14 1.3 759.6 1.2X -Native ORC Vectorized (Pushdown) 11636 11644 9 1.4 739.8 1.3X +Parquet Vectorized 10772 10787 14 1.5 684.9 1.0X +Parquet Vectorized (Pushdown) 10425 10436 8 1.5 662.8 1.0X +Native ORC Vectorized 9836 9850 11 1.6 625.3 1.1X +Native ORC Vectorized (Pushdown) 9578 9583 5 1.6 609.0 1.1X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6234 6276 57 2.5 396.4 1.0X -Parquet Vectorized (Pushdown) 146 150 4 107.9 9.3 42.8X -Native ORC Vectorized 3696 3712 18 4.3 235.0 1.7X -Native ORC Vectorized (Pushdown) 149 154 5 105.6 9.5 41.8X +Parquet Vectorized 3702 3713 8 4.2 235.4 1.0X +Parquet Vectorized (Pushdown) 114 117 3 138.1 7.2 32.5X +Native ORC Vectorized 3081 3089 8 5.1 195.9 1.2X +Native ORC Vectorized (Pushdown) 121 125 4 129.6 7.7 30.5X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 
8272CL CPU @ 2.60GHz Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7194 7228 29 2.2 457.4 1.0X -Parquet Vectorized (Pushdown) 1719 1726 5 9.2 109.3 4.2X -Native ORC Vectorized 4622 4632 17 3.4 293.8 1.6X -Native ORC Vectorized (Pushdown) 1439 1446 6 10.9 91.5 5.0X +Parquet Vectorized 4489 4494 9 3.5 285.4 1.0X +Parquet Vectorized (Pushdown) 1270 1282 14 12.4 80.7 3.5X +Native ORC Vectorized 3846 3856 7 4.1 244.5 1.2X +Native ORC Vectorized (Pushdown) 1187 1190 4 13.3 75.5 3.8X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11025 11081 89 1.4 700.9 1.0X -Parquet Vectorized (Pushdown) 8024 8045 22 2.0 510.2 1.4X -Native ORC Vectorized 8279 8300 27 1.9 526.4 1.3X -Native ORC Vectorized (Pushdown) 6536 6550 9 2.4 415.6 1.7X +Parquet Vectorized 7663 7679 17 2.1 487.2 1.0X +Parquet Vectorized (Pushdown) 5877 5891 11 2.7 373.6 1.3X +Native ORC Vectorized 6847 6850 3 2.3 435.3 1.1X +Native ORC Vectorized (Pushdown) 5373 5382 8 2.9 341.6 1.4X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 
2.60GHz Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14737 14751 10 1.1 937.0 1.0X -Parquet Vectorized (Pushdown) 14138 14163 15 1.1 898.9 1.0X -Native ORC Vectorized 11952 12008 101 1.3 759.9 1.2X -Native ORC Vectorized (Pushdown) 11616 11623 7 1.4 738.5 1.3X +Parquet Vectorized 10752 10862 206 1.5 683.6 1.0X +Parquet Vectorized (Pushdown) 10471 10479 7 1.5 665.7 1.0X +Native ORC Vectorized 9861 9868 9 1.6 627.0 1.1X +Native ORC Vectorized (Pushdown) 9604 9614 7 1.6 610.6 1.1X ================================================================================================ Pushdown benchmark with many filters ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 199 203 6 0.0 198570345.0 1.0X -Parquet Vectorized (Pushdown) 200 206 6 0.0 200485033.0 1.0X -Native ORC Vectorized 193 208 8 0.0 193123650.0 1.0X -Native ORC Vectorized (Pushdown) 206 219 9 0.0 206190335.0 1.0X +Parquet Vectorized 151 155 4 0.0 150695208.0 1.0X +Parquet Vectorized (Pushdown) 153 157 5 0.0 152853214.0 1.0X +Native ORC Vectorized 144 154 9 0.0 143923196.0 1.0X +Native ORC Vectorized (Pushdown) 152 159 8 0.0 152359467.0 1.0X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure 
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 1631 1822 248 0.0 1631451389.0 1.0X -Parquet Vectorized (Pushdown) 1665 1705 56 0.0 1665363203.0 1.0X -Native ORC Vectorized 1596 1657 88 0.0 1595629417.0 1.0X -Native ORC Vectorized (Pushdown) 1627 1660 46 0.0 1626580577.0 1.0X +Parquet Vectorized 1582 1614 31 0.0 1581508551.0 1.0X +Parquet Vectorized (Pushdown) 1626 1664 73 0.0 1625795573.0 1.0X +Native ORC Vectorized 1563 1586 23 0.0 1563053391.0 1.0X +Native ORC Vectorized (Pushdown) 1583 1619 64 0.0 1583075187.0 1.0X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.13.0-1021-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 7019 7077 68 0.0 7019082096.0 1.0X -Parquet Vectorized (Pushdown) 7256 7434 175 0.0 7255521503.0 1.0X -Native ORC Vectorized 7048 7298 228 0.0 7047696613.0 1.0X -Native ORC Vectorized (Pushdown) 7066 7258 169 0.0 7065891513.0 1.0X +Parquet Vectorized 7027 7227 148 0.0 7026812316.0 1.0X +Parquet Vectorized (Pushdown) 7197 7404 199 0.0 7197391646.0 1.0X +Native ORC Vectorized 7143 7258 117 0.0 7143461935.0 1.0X +Native ORC Vectorized (Pushdown) 7137 7323 228 0.0 7137286231.0 1.0X diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index 1a02b1611e..24e1594e56 100644 --- 
a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -2,669 +2,733 @@ Pushdown for many distinct value case ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8874 9713 1323 1.8 564.2 1.0X -Parquet Vectorized (Pushdown) 607 617 9 25.9 38.6 14.6X -Native ORC Vectorized 6195 6885 1498 2.5 393.9 1.4X -Native ORC Vectorized (Pushdown) 522 554 39 30.1 33.2 17.0X +Parquet Vectorized 10541 10920 716 1.5 670.2 1.0X +Parquet Vectorized (Pushdown) 616 639 29 25.5 39.2 17.1X +Native ORC Vectorized 6367 7100 1513 2.5 404.8 1.7X +Native ORC Vectorized (Pushdown) 523 557 47 30.1 33.3 20.1X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9007 9112 197 1.7 572.6 1.0X -Parquet Vectorized (Pushdown) 594 606 14 26.5 37.8 15.2X -Native ORC Vectorized 6387 6398 11 2.5 406.1 1.4X -Native ORC Vectorized (Pushdown) 526 590 83 29.9 33.4 17.1X +Parquet Vectorized 10524 10666 134 1.5 669.1 1.0X +Parquet Vectorized (Pushdown) 585 609 20 26.9 37.2 18.0X +Native ORC Vectorized 6429 6511 77 2.4 408.7 1.6X +Native ORC Vectorized (Pushdown) 524 551 32 30.0 33.3 20.1X 
-OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9065 9107 77 1.7 576.3 1.0X -Parquet Vectorized (Pushdown) 585 594 13 26.9 37.2 15.5X -Native ORC Vectorized 6421 6450 51 2.4 408.2 1.4X -Native ORC Vectorized (Pushdown) 508 533 31 31.0 32.3 17.9X +Parquet Vectorized 10295 10478 207 1.5 654.6 1.0X +Parquet Vectorized (Pushdown) 553 583 20 28.4 35.2 18.6X +Native ORC Vectorized 6201 6450 178 2.5 394.2 1.7X +Native ORC Vectorized (Pushdown) 502 528 39 31.4 31.9 20.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9037 9086 90 1.7 574.5 1.0X -Parquet Vectorized (Pushdown) 581 590 12 27.1 36.9 15.6X -Native ORC Vectorized 6414 6423 6 2.5 407.8 1.4X -Native ORC Vectorized (Pushdown) 501 523 36 31.4 31.9 18.0X +Parquet Vectorized 10360 10471 112 1.5 658.7 1.0X +Parquet Vectorized (Pushdown) 550 571 17 28.6 35.0 18.8X +Native ORC Vectorized 6465 6472 8 2.4 411.0 1.6X +Native ORC Vectorized (Pushdown) 520 571 62 30.2 33.1 19.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9040 9074 36 1.7 574.8 1.0X -Parquet Vectorized (Pushdown) 579 587 13 27.2 36.8 15.6X -Native ORC Vectorized 6424 6443 19 2.4 408.4 1.4X -Native ORC Vectorized (Pushdown) 506 524 36 31.1 32.2 17.9X +Parquet Vectorized 10401 10562 106 1.5 661.3 1.0X +Parquet Vectorized (Pushdown) 544 568 24 28.9 34.6 19.1X +Native ORC Vectorized 6475 6544 65 2.4 411.7 1.6X +Native ORC Vectorized (Pushdown) 508 533 31 31.0 32.3 20.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 16926 16949 19 0.9 1076.1 1.0X -Parquet Vectorized (Pushdown) 17017 17027 11 0.9 1081.9 1.0X -Native ORC Vectorized 14405 14426 18 1.1 915.8 1.2X -Native ORC Vectorized (Pushdown) 14547 14566 16 1.1 924.9 1.2X +Parquet Vectorized 17904 18027 125 0.9 1138.3 1.0X +Parquet Vectorized (Pushdown) 17931 17979 48 0.9 1140.0 1.0X +Native ORC Vectorized 12862 13722 489 1.2 817.7 1.4X +Native ORC Vectorized (Pushdown) 14091 14156 55 1.1 895.9 1.3X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8459 9267 1292 1.9 537.8 1.0X -Parquet Vectorized (Pushdown) 568 583 23 27.7 36.1 14.9X -Native ORC Vectorized 5805 
6369 1208 2.7 369.1 1.5X -Native ORC Vectorized (Pushdown) 488 525 61 32.2 31.0 17.3X +Parquet Vectorized 9693 10784 1430 1.6 616.2 1.0X +Parquet Vectorized (Pushdown) 538 555 20 29.2 34.2 18.0X +Native ORC Vectorized 5904 6535 1244 2.7 375.4 1.6X +Native ORC Vectorized (Pushdown) 484 531 67 32.5 30.8 20.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8463 8541 156 1.9 538.0 1.0X -Parquet Vectorized (Pushdown) 577 589 15 27.3 36.7 14.7X -Native ORC Vectorized 5824 5861 60 2.7 370.3 1.5X -Native ORC Vectorized (Pushdown) 492 512 38 32.0 31.3 17.2X +Parquet Vectorized 9731 10023 498 1.6 618.7 1.0X +Parquet Vectorized (Pushdown) 549 584 33 28.7 34.9 17.7X +Native ORC Vectorized 5916 5945 19 2.7 376.2 1.6X +Native ORC Vectorized (Pushdown) 482 510 40 32.7 30.6 20.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8511 8560 78 1.8 541.1 1.0X -Parquet Vectorized (Pushdown) 575 584 8 27.4 36.5 14.8X -Native ORC Vectorized 5869 5881 7 2.7 373.1 1.5X -Native ORC Vectorized (Pushdown) 495 520 52 31.8 31.4 17.2X +Parquet Vectorized 9773 10040 468 1.6 621.4 1.0X +Parquet Vectorized (Pushdown) 546 550 6 28.8 34.7 17.9X +Native ORC Vectorized 5765 5958 110 2.7 366.5 1.7X +Native ORC Vectorized (Pushdown) 477 535 49 33.0 
30.3 20.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8495 8523 46 1.9 540.1 1.0X -Parquet Vectorized (Pushdown) 569 577 11 27.7 36.2 14.9X -Native ORC Vectorized 5881 5883 2 2.7 373.9 1.4X -Native ORC Vectorized (Pushdown) 484 503 33 32.5 30.8 17.5X +Parquet Vectorized 9747 10046 494 1.6 619.7 1.0X +Parquet Vectorized (Pushdown) 541 551 16 29.1 34.4 18.0X +Native ORC Vectorized 5983 6001 17 2.6 380.4 1.6X +Native ORC Vectorized (Pushdown) 475 517 46 33.1 30.2 20.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8489 8510 28 1.9 539.7 1.0X -Parquet Vectorized (Pushdown) 570 576 9 27.6 36.3 14.9X -Native ORC Vectorized 5883 5889 4 2.7 374.1 1.4X -Native ORC Vectorized (Pushdown) 490 508 31 32.1 31.1 17.3X +Parquet Vectorized 9721 9838 98 1.6 618.1 1.0X +Parquet Vectorized (Pushdown) 548 564 15 28.7 34.9 17.7X +Native ORC Vectorized 5975 5982 6 2.6 379.9 1.6X +Native ORC Vectorized (Pushdown) 479 500 40 32.9 30.4 20.3X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8492 8502 6 1.9 539.9 1.0X -Parquet Vectorized (Pushdown) 570 579 9 27.6 36.3 14.9X -Native ORC Vectorized 5879 5887 5 2.7 373.8 1.4X -Native ORC Vectorized (Pushdown) 487 507 34 32.3 31.0 17.4X +Parquet Vectorized 9267 9697 243 1.7 589.2 1.0X +Parquet Vectorized (Pushdown) 543 556 15 28.9 34.5 17.1X +Native ORC Vectorized 5821 5963 85 2.7 370.1 1.6X +Native ORC Vectorized (Pushdown) 477 525 49 33.0 30.3 19.4X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9264 9538 551 1.7 589.0 1.0X -Parquet Vectorized (Pushdown) 2118 2142 15 7.4 134.6 4.4X -Native ORC Vectorized 6645 6662 18 2.4 422.5 1.4X -Native ORC Vectorized (Pushdown) 1807 1831 15 8.7 114.9 5.1X +Parquet Vectorized 10522 10682 246 1.5 669.0 1.0X +Parquet Vectorized (Pushdown) 2111 2214 61 7.5 134.2 5.0X +Native ORC Vectorized 6657 6731 52 2.4 423.3 1.6X +Native ORC Vectorized (Pushdown) 1733 1813 72 9.1 110.2 6.1X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11961 11975 9 1.3 760.5 1.0X -Parquet Vectorized (Pushdown) 8018 8027 7 2.0 509.8 1.5X -Native ORC Vectorized 9329 9340 8 1.7 593.2 1.3X -Native ORC 
Vectorized (Pushdown) 6739 6747 6 2.3 428.5 1.8X +Parquet Vectorized 13171 13221 75 1.2 837.4 1.0X +Parquet Vectorized (Pushdown) 8576 8601 17 1.8 545.2 1.5X +Native ORC Vectorized 8951 9351 230 1.8 569.1 1.5X +Native ORC Vectorized (Pushdown) 6567 6735 97 2.4 417.5 2.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 14621 14635 13 1.1 929.6 1.0X -Parquet Vectorized (Pushdown) 13884 13894 11 1.1 882.7 1.1X -Native ORC Vectorized 12085 12111 21 1.3 768.3 1.2X -Native ORC Vectorized (Pushdown) 11715 11723 6 1.3 744.8 1.2X +Parquet Vectorized 14256 15569 737 1.1 906.4 1.0X +Parquet Vectorized (Pushdown) 14932 14996 87 1.1 949.3 1.0X +Native ORC Vectorized 12178 12304 92 1.3 774.2 1.2X +Native ORC Vectorized (Pushdown) 11609 11719 94 1.4 738.1 1.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 15248 15266 12 1.0 969.4 1.0X -Parquet Vectorized (Pushdown) 15311 15320 7 1.0 973.4 1.0X -Native ORC Vectorized 12631 12650 18 1.2 803.1 1.2X -Native ORC Vectorized (Pushdown) 12816 12845 22 1.2 814.8 1.2X +Parquet Vectorized 16179 16414 134 1.0 1028.6 1.0X +Parquet Vectorized (Pushdown) 16469 16502 35 1.0 1047.0 1.0X +Native ORC Vectorized 12527 12717 170 1.3 796.5 1.3X +Native ORC Vectorized (Pushdown) 12259 12758 324 1.3 
779.4 1.3X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 15223 15238 20 1.0 967.8 1.0X -Parquet Vectorized (Pushdown) 15273 15284 11 1.0 971.0 1.0X -Native ORC Vectorized 12636 12645 6 1.2 803.4 1.2X -Native ORC Vectorized (Pushdown) 12808 12825 10 1.2 814.3 1.2X +Parquet Vectorized 16485 16519 32 1.0 1048.1 1.0X +Parquet Vectorized (Pushdown) 16313 16530 155 1.0 1037.1 1.0X +Native ORC Vectorized 12357 12592 147 1.3 785.6 1.3X +Native ORC Vectorized (Pushdown) 12212 12625 237 1.3 776.4 1.3X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 15277 15403 257 1.0 971.3 1.0X -Parquet Vectorized (Pushdown) 15342 15354 15 1.0 975.4 1.0X -Native ORC Vectorized 12634 12644 10 1.2 803.2 1.2X -Native ORC Vectorized (Pushdown) 12805 12820 14 1.2 814.1 1.2X +Parquet Vectorized 14628 15501 735 1.1 930.0 1.0X +Parquet Vectorized (Pushdown) 16459 16488 34 1.0 1046.5 0.9X +Native ORC Vectorized 12628 12723 60 1.2 802.9 1.2X +Native ORC Vectorized (Pushdown) 11419 12132 626 1.4 726.0 1.3X ================================================================================================ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ 
-OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8114 8948 1033 1.9 515.9 1.0X -Parquet Vectorized (Pushdown) 498 503 8 31.6 31.6 16.3X -Native ORC Vectorized 7355 7906 1198 2.1 467.6 1.1X -Native ORC Vectorized (Pushdown) 893 917 45 17.6 56.8 9.1X +Parquet Vectorized 9249 9597 435 1.7 588.0 1.0X +Parquet Vectorized (Pushdown) 471 484 26 33.4 29.9 19.6X +Native ORC Vectorized 7615 8225 1196 2.1 484.1 1.2X +Native ORC Vectorized (Pushdown) 872 934 66 18.0 55.4 10.6X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8262 8279 16 1.9 525.3 1.0X -Parquet Vectorized (Pushdown) 505 510 9 31.1 32.1 16.3X -Native ORC Vectorized 7631 7653 25 2.1 485.1 1.1X -Native ORC Vectorized (Pushdown) 902 918 32 17.4 57.3 9.2X +Parquet Vectorized 9078 9471 226 1.7 577.2 1.0X +Parquet Vectorized (Pushdown) 478 490 15 32.9 30.4 19.0X +Native ORC Vectorized 7763 7823 60 2.0 493.6 1.2X +Native ORC Vectorized (Pushdown) 816 892 77 19.3 51.9 11.1X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8178 8195 14 1.9 520.0 1.0X -Parquet Vectorized (Pushdown) 565 575 11 27.8 35.9 14.5X -Native ORC Vectorized 7522 7534 12 2.1 478.2 1.1X -Native ORC Vectorized (Pushdown) 952 983 40 16.5 60.5 8.6X +Parquet Vectorized 8486 9028 394 1.9 539.5 1.0X +Parquet Vectorized (Pushdown) 549 563 13 28.6 34.9 15.4X +Native ORC Vectorized 7788 7844 80 2.0 495.1 1.1X +Native ORC Vectorized (Pushdown) 926 987 46 17.0 58.8 9.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8170 8190 23 1.9 519.4 1.0X -Parquet Vectorized (Pushdown) 557 566 9 28.2 35.4 14.7X -Native ORC Vectorized 7537 7550 11 2.1 479.2 1.1X -Native ORC Vectorized (Pushdown) 946 977 42 16.6 60.1 8.6X +Parquet Vectorized 9384 10306 NaN 1.7 596.6 1.0X +Parquet Vectorized (Pushdown) 482 521 32 32.6 30.6 19.5X +Native ORC Vectorized 7479 7771 167 2.1 475.5 1.3X +Native ORC Vectorized (Pushdown) 922 946 38 17.1 58.6 10.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8274 8287 10 1.9 526.1 1.0X -Parquet Vectorized (Pushdown) 559 568 10 28.1 35.6 14.8X 
-Native ORC Vectorized 7638 7649 14 2.1 485.6 1.1X -Native ORC Vectorized (Pushdown) 951 985 43 16.5 60.4 8.7X +Parquet Vectorized 8984 9473 301 1.8 571.2 1.0X +Parquet Vectorized (Pushdown) 492 541 29 32.0 31.3 18.3X +Native ORC Vectorized 7459 7812 198 2.1 474.2 1.2X +Native ORC Vectorized (Pushdown) 928 990 79 17.0 59.0 9.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 16320 16340 21 1.0 1037.6 1.0X -Parquet Vectorized (Pushdown) 16350 16377 17 1.0 1039.5 1.0X -Native ORC Vectorized 15504 15519 12 1.0 985.7 1.1X -Native ORC Vectorized (Pushdown) 15903 15915 14 1.0 1011.1 1.0X +Parquet Vectorized 17691 17789 84 0.9 1124.7 1.0X +Parquet Vectorized (Pushdown) 17775 17863 72 0.9 1130.1 1.0X +Native ORC Vectorized 15730 15964 203 1.0 1000.1 1.1X +Native ORC Vectorized (Pushdown) 16405 16521 106 1.0 1043.0 1.1X ================================================================================================ Pushdown benchmark for StringStartsWith ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9291 10470 1189 1.7 590.7 1.0X -Parquet Vectorized (Pushdown) 1382 1393 10 11.4 87.9 6.7X -Native 
ORC Vectorized 6549 7152 1317 2.4 416.4 1.4X -Native ORC Vectorized (Pushdown) 6724 6728 3 2.3 427.5 1.4X +Parquet Vectorized 11328 12986 1138 1.4 720.2 1.0X +Parquet Vectorized (Pushdown) 1315 1387 74 12.0 83.6 8.6X +Native ORC Vectorized 6337 7318 NaN 2.5 402.9 1.8X +Native ORC Vectorized (Pushdown) 6808 6869 89 2.3 432.8 1.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9066 9124 110 1.7 576.4 1.0X -Parquet Vectorized (Pushdown) 575 583 10 27.3 36.6 15.8X -Native ORC Vectorized 6364 6412 86 2.5 404.6 1.4X -Native ORC Vectorized (Pushdown) 6548 6559 11 2.4 416.3 1.4X +Parquet Vectorized 10598 10792 374 1.5 673.8 1.0X +Parquet Vectorized (Pushdown) 545 555 13 28.9 34.6 19.5X +Native ORC Vectorized 6241 6513 182 2.5 396.8 1.7X +Native ORC Vectorized (Pushdown) 6337 6632 176 2.5 402.9 1.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9056 9110 104 1.7 575.7 1.0X -Parquet Vectorized (Pushdown) 568 577 15 27.7 36.1 15.9X -Native ORC Vectorized 6354 6364 10 2.5 403.9 1.4X -Native ORC Vectorized (Pushdown) 6527 6533 9 2.4 415.0 1.4X +Parquet Vectorized 9642 10349 602 1.6 613.0 1.0X +Parquet Vectorized (Pushdown) 482 547 46 32.6 30.6 20.0X +Native ORC Vectorized 6162 6470 193 2.6 391.8 
1.6X +Native ORC Vectorized (Pushdown) 6634 6655 22 2.4 421.8 1.5X + + +================================================================================================ +Pushdown benchmark for StringEndsWith +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 9713 10936 981 1.6 617.6 1.0X +Parquet Vectorized (Pushdown) 633 682 38 24.9 40.2 15.3X +Native ORC Vectorized 7075 8071 1484 2.2 449.8 1.4X +Native ORC Vectorized (Pushdown) 8120 8160 34 1.9 516.3 1.2X + +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 8963 9371 232 1.8 569.8 1.0X +Parquet Vectorized (Pushdown) 539 554 18 29.2 34.3 16.6X +Native ORC Vectorized 7745 7802 45 2.0 492.4 1.2X +Native ORC Vectorized (Pushdown) 7912 8118 147 2.0 503.0 1.1X + +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +----------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 9440 9500 65 1.7 600.2 1.0X +Parquet Vectorized (Pushdown) 538 549 20 29.2 34.2 17.6X +Native ORC Vectorized 7002 7473 303 2.2 445.2 1.3X +Native ORC Vectorized 
(Pushdown) 8052 8098 52 2.0 511.9 1.2X + + +================================================================================================ +Pushdown benchmark for StringContains +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 9782 10528 968 1.6 621.9 1.0X +Parquet Vectorized (Pushdown) 1297 1317 14 12.1 82.4 7.5X +Native ORC Vectorized 7995 8568 1153 2.0 508.3 1.2X +Native ORC Vectorized (Pushdown) 7814 8229 232 2.0 496.8 1.3X + +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +---------------------------------------------------------------------------------------------------------------------------- +Parquet Vectorized 9408 9438 29 1.7 598.1 1.0X +Parquet Vectorized (Pushdown) 538 553 17 29.2 34.2 17.5X +Native ORC Vectorized 7779 7847 91 2.0 494.6 1.2X +Native ORC Vectorized (Pushdown) 7669 7970 183 2.1 487.6 1.2X + +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------------ +Parquet Vectorized 9413 9491 137 1.7 598.4 1.0X +Parquet Vectorized (Pushdown) 530 542 17 29.7 33.7 17.7X +Native ORC Vectorized 7626 7813 106 2.1 484.9 1.2X +Native ORC Vectorized (Pushdown) 8053 8139 
105 2.0 512.0 1.2X ================================================================================================ Pushdown benchmark for decimal ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3530 3593 113 4.5 224.4 1.0X -Parquet Vectorized (Pushdown) 135 141 12 116.3 8.6 26.1X -Native ORC Vectorized 4375 4457 146 3.6 278.1 0.8X -Native ORC Vectorized (Pushdown) 169 180 35 93.2 10.7 20.9X +Parquet Vectorized 4796 4822 33 3.3 304.9 1.0X +Parquet Vectorized (Pushdown) 136 142 9 115.8 8.6 35.3X +Native ORC Vectorized 4364 4515 87 3.6 277.5 1.1X +Native ORC Vectorized (Pushdown) 167 178 20 94.2 10.6 28.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4919 4930 12 3.2 312.7 1.0X -Parquet Vectorized (Pushdown) 2211 2217 8 7.1 140.6 2.2X -Native ORC Vectorized 5758 5762 2 2.7 366.1 0.9X -Native ORC Vectorized (Pushdown) 2389 2400 7 6.6 151.9 2.1X +Parquet Vectorized 6219 6274 48 2.5 395.4 1.0X +Parquet Vectorized (Pushdown) 2259 2493 136 7.0 143.6 2.8X +Native ORC Vectorized 5867 5925 57 2.7 373.0 1.1X +Native ORC Vectorized (Pushdown) 2285 2437 130 6.9 145.3 2.7X -OpenJDK 64-Bit Server VM 
1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9490 9502 12 1.7 603.4 1.0X -Parquet Vectorized (Pushdown) 9048 9057 13 1.7 575.2 1.0X -Native ORC Vectorized 10320 10335 13 1.5 656.1 0.9X -Native ORC Vectorized (Pushdown) 9816 9827 11 1.6 624.1 1.0X +Parquet Vectorized 10201 10778 341 1.5 648.6 1.0X +Parquet Vectorized (Pushdown) 9422 9912 362 1.7 599.0 1.1X +Native ORC Vectorized 10277 10550 170 1.5 653.4 1.0X +Native ORC Vectorized (Pushdown) 9784 9985 161 1.6 622.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10666 10694 25 1.5 678.1 1.0X -Parquet Vectorized (Pushdown) 10693 10705 13 1.5 679.9 1.0X -Native ORC Vectorized 13144 13155 9 1.2 835.7 0.8X -Native ORC Vectorized (Pushdown) 13208 13219 15 1.2 839.8 0.8X +Parquet Vectorized 11899 12161 163 1.3 756.5 1.0X +Parquet Vectorized (Pushdown) 11348 12024 409 1.4 721.5 1.0X +Native ORC Vectorized 11676 11822 93 1.3 742.4 1.0X +Native ORC Vectorized (Pushdown) 11736 11847 90 1.3 746.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3672 3696 20 4.3 233.5 1.0X -Parquet Vectorized (Pushdown) 136 140 5 115.5 8.7 27.0X -Native ORC Vectorized 4377 4392 12 3.6 278.3 0.8X -Native ORC Vectorized (Pushdown) 164 170 15 95.8 10.4 22.4X +Parquet Vectorized 4986 4990 5 3.2 317.0 1.0X +Parquet Vectorized (Pushdown) 138 143 10 114.3 8.8 36.2X +Native ORC Vectorized 4586 4649 49 3.4 291.5 1.1X +Native ORC Vectorized (Pushdown) 163 176 27 96.3 10.4 30.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4430 4458 34 3.6 281.7 1.0X -Parquet Vectorized (Pushdown) 1231 1239 6 12.8 78.3 3.6X -Native ORC Vectorized 5101 5106 7 3.1 324.3 0.9X -Native ORC Vectorized (Pushdown) 1302 1311 7 12.1 82.8 3.4X +Parquet Vectorized 5753 5836 76 2.7 365.8 1.0X +Parquet Vectorized (Pushdown) 1358 1380 19 11.6 86.3 4.2X +Native ORC Vectorized 5117 5342 140 3.1 325.3 1.1X +Native ORC Vectorized (Pushdown) 1311 1337 36 12.0 83.3 4.4X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7353 7373 15 2.1 467.5 1.0X -Parquet Vectorized (Pushdown) 5592 5601 7 2.8 355.5 1.3X -Native 
ORC Vectorized 7943 7957 10 2.0 505.0 0.9X -Native ORC Vectorized (Pushdown) 5870 5880 13 2.7 373.2 1.3X +Parquet Vectorized 8633 8649 21 1.8 548.9 1.0X +Parquet Vectorized (Pushdown) 5565 5736 371 2.8 353.8 1.6X +Native ORC Vectorized 8083 8113 25 1.9 513.9 1.1X +Native ORC Vectorized (Pushdown) 5767 5968 114 2.7 366.7 1.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10221 10235 14 1.5 649.9 1.0X -Parquet Vectorized (Pushdown) 9875 9907 29 1.6 627.8 1.0X -Native ORC Vectorized 10756 10768 8 1.5 683.8 1.0X -Native ORC Vectorized (Pushdown) 10389 10400 9 1.5 660.5 1.0X +Parquet Vectorized 10562 11064 425 1.5 671.5 1.0X +Parquet Vectorized (Pushdown) 10224 10722 393 1.5 650.1 1.0X +Native ORC Vectorized 10843 10862 22 1.5 689.4 1.0X +Native ORC Vectorized (Pushdown) 10148 10381 177 1.5 645.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5580 5603 20 2.8 354.8 1.0X -Parquet Vectorized (Pushdown) 149 152 4 105.4 9.5 37.4X -Native ORC Vectorized 4389 4399 7 3.6 279.0 1.3X -Native ORC Vectorized (Pushdown) 164 171 18 95.9 10.4 34.0X +Parquet Vectorized 6258 6872 366 2.5 397.8 1.0X +Parquet Vectorized (Pushdown) 147 152 8 107.0 9.3 42.6X +Native ORC Vectorized 4590 4651 50 3.4 
291.8 1.4X +Native ORC Vectorized (Pushdown) 152 179 22 103.3 9.7 41.1X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6516 6534 16 2.4 414.3 1.0X -Parquet Vectorized (Pushdown) 1620 1623 3 9.7 103.0 4.0X -Native ORC Vectorized 5284 5288 6 3.0 335.9 1.2X -Native ORC Vectorized (Pushdown) 1466 1473 12 10.7 93.2 4.4X +Parquet Vectorized 7711 7916 173 2.0 490.3 1.0X +Parquet Vectorized (Pushdown) 1751 1773 18 9.0 111.3 4.4X +Native ORC Vectorized 5327 5464 81 3.0 338.7 1.4X +Native ORC Vectorized (Pushdown) 1481 1499 36 10.6 94.1 5.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10168 10189 18 1.5 646.4 1.0X -Parquet Vectorized (Pushdown) 7472 7517 33 2.1 475.0 1.4X -Native ORC Vectorized 8782 8795 11 1.8 558.4 1.2X -Native ORC Vectorized (Pushdown) 6671 6696 21 2.4 424.2 1.5X +Parquet Vectorized 11614 11665 56 1.4 738.4 1.0X +Parquet Vectorized (Pushdown) 7807 8159 237 2.0 496.3 1.5X +Native ORC Vectorized 8932 8998 129 1.8 567.9 1.3X +Native ORC Vectorized (Pushdown) 6776 6841 48 2.3 430.8 1.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 
Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 13808 14420 1196 1.1 877.9 1.0X -Parquet Vectorized (Pushdown) 13319 13391 90 1.2 846.8 1.0X -Native ORC Vectorized 12205 12227 24 1.3 776.0 1.1X -Native ORC Vectorized (Pushdown) 11830 11860 28 1.3 752.2 1.2X +Parquet Vectorized 14324 14958 451 1.1 910.7 1.0X +Parquet Vectorized (Pushdown) 14529 14614 64 1.1 923.7 1.0X +Native ORC Vectorized 12359 12471 113 1.3 785.8 1.2X +Native ORC Vectorized (Pushdown) 11961 12048 66 1.3 760.5 1.2X ================================================================================================ Pushdown benchmark for InSet -> InFilters ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8737 9914 NaN 1.8 555.5 1.0X -Parquet Vectorized (Pushdown) 585 618 72 26.9 37.2 14.9X -Native ORC Vectorized 6016 6776 1444 2.6 382.5 1.5X -Native ORC Vectorized (Pushdown) 497 536 81 31.6 31.6 17.6X +Parquet Vectorized 9590 11618 NaN 1.6 609.7 1.0X +Parquet Vectorized (Pushdown) 558 569 12 28.2 35.5 17.2X +Native ORC Vectorized 5954 6612 1410 2.6 378.5 1.6X +Native ORC Vectorized (Pushdown) 441 485 53 35.6 28.1 21.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU 
@ 2.60GHz InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8522 8575 45 1.8 541.8 1.0X -Parquet Vectorized (Pushdown) 583 590 7 27.0 37.1 14.6X -Native ORC Vectorized 5907 5917 12 2.7 375.6 1.4X -Native ORC Vectorized (Pushdown) 498 544 65 31.6 31.7 17.1X +Parquet Vectorized 9258 9693 281 1.7 588.6 1.0X +Parquet Vectorized (Pushdown) 563 584 15 28.0 35.8 16.5X +Native ORC Vectorized 5926 5974 52 2.7 376.8 1.6X +Native ORC Vectorized (Pushdown) 486 523 47 32.4 30.9 19.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8505 8600 154 1.8 540.7 1.0X -Parquet Vectorized (Pushdown) 585 589 8 26.9 37.2 14.5X -Native ORC Vectorized 5909 5927 21 2.7 375.7 1.4X -Native ORC Vectorized (Pushdown) 498 518 40 31.6 31.6 17.1X +Parquet Vectorized 9787 9944 149 1.6 622.2 1.0X +Parquet Vectorized (Pushdown) 558 564 7 28.2 35.5 17.5X +Native ORC Vectorized 5954 6015 62 2.6 378.5 1.6X +Native ORC Vectorized (Pushdown) 485 524 45 32.4 30.8 20.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8520 8631 184 1.8 541.7 1.0X -Parquet Vectorized (Pushdown) 605 610 6 26.0 38.5 14.1X -Native ORC Vectorized 5925 5936 11 2.7 376.7 1.4X -Native ORC Vectorized (Pushdown) 515 534 35 30.5 32.8 16.5X +Parquet Vectorized 9798 10304 801 1.6 622.9 1.0X +Parquet Vectorized (Pushdown) 584 599 17 26.9 37.1 16.8X +Native ORC Vectorized 5969 5984 13 2.6 379.5 1.6X +Native ORC Vectorized (Pushdown) 501 519 36 31.4 31.8 19.6X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8532 8650 223 1.8 542.5 1.0X -Parquet Vectorized (Pushdown) 614 617 3 25.6 39.0 13.9X -Native ORC Vectorized 5951 5963 13 2.6 378.4 1.4X -Native ORC Vectorized (Pushdown) 520 538 36 30.3 33.0 16.4X +Parquet Vectorized 9452 9777 202 1.7 600.9 1.0X +Parquet Vectorized (Pushdown) 581 595 17 27.1 37.0 16.3X +Native ORC Vectorized 5968 5988 36 2.6 379.4 1.6X +Native ORC Vectorized (Pushdown) 504 522 32 31.2 32.1 18.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8515 8550 27 1.8 541.3 1.0X -Parquet Vectorized (Pushdown) 614 617 5 25.6 39.0 13.9X 
-Native ORC Vectorized 5940 5950 8 2.6 377.7 1.4X -Native ORC Vectorized (Pushdown) 513 532 33 30.6 32.6 16.6X +Parquet Vectorized 9607 9792 170 1.6 610.8 1.0X +Parquet Vectorized (Pushdown) 527 573 45 29.8 33.5 18.2X +Native ORC Vectorized 5851 6087 133 2.7 372.0 1.6X +Native ORC Vectorized (Pushdown) 500 521 38 31.4 31.8 19.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8854 8872 18 1.8 563.0 1.0X -Parquet Vectorized (Pushdown) 1390 1396 6 11.3 88.4 6.4X -Native ORC Vectorized 6241 6343 68 2.5 396.8 1.4X -Native ORC Vectorized (Pushdown) 641 668 31 24.5 40.8 13.8X +Parquet Vectorized 9469 9996 298 1.7 602.0 1.0X +Parquet Vectorized (Pushdown) 1480 1493 14 10.6 94.1 6.4X +Native ORC Vectorized 6265 6278 17 2.5 398.3 1.5X +Native ORC Vectorized (Pushdown) 623 691 38 25.2 39.6 15.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8859 8863 3 1.8 563.2 1.0X -Parquet Vectorized (Pushdown) 4449 4454 5 3.5 282.9 2.0X -Native ORC Vectorized 6366 6379 11 2.5 404.7 1.4X -Native ORC Vectorized (Pushdown) 654 675 39 24.1 41.6 13.6X +Parquet Vectorized 9566 10022 334 1.6 608.2 1.0X +Parquet Vectorized (Pushdown) 4660 5049 224 3.4 296.3 2.1X +Native ORC 
Vectorized 6267 6303 50 2.5 398.5 1.5X +Native ORC Vectorized (Pushdown) 656 704 42 24.0 41.7 14.6X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8836 8845 13 1.8 561.8 1.0X -Parquet Vectorized (Pushdown) 7934 7941 9 2.0 504.4 1.1X -Native ORC Vectorized 6215 6220 4 2.5 395.1 1.4X -Native ORC Vectorized (Pushdown) 666 698 36 23.6 42.3 13.3X +Parquet Vectorized 9321 9914 371 1.7 592.6 1.0X +Parquet Vectorized (Pushdown) 8505 8702 118 1.8 540.7 1.1X +Native ORC Vectorized 6089 6240 85 2.6 387.1 1.5X +Native ORC Vectorized (Pushdown) 654 695 37 24.0 41.6 14.3X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8769 8774 6 1.8 557.5 1.0X -Parquet Vectorized (Pushdown) 1379 1391 12 11.4 87.6 6.4X -Native ORC Vectorized 6142 6155 22 2.6 390.5 1.4X -Native ORC Vectorized (Pushdown) 759 785 32 20.7 48.3 11.5X +Parquet Vectorized 9772 9993 127 1.6 621.3 1.0X +Parquet Vectorized (Pushdown) 1345 1466 122 11.7 85.5 7.3X +Native ORC Vectorized 6200 6267 103 2.5 394.2 1.6X +Native ORC Vectorized (Pushdown) 744 783 33 21.1 47.3 13.1X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure 
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8763 8833 143 1.8 557.1 1.0X -Parquet Vectorized (Pushdown) 4659 4674 10 3.4 296.2 1.9X -Native ORC Vectorized 6133 6137 4 2.6 389.9 1.4X -Native ORC Vectorized (Pushdown) 847 871 25 18.6 53.8 10.3X +Parquet Vectorized 9880 10074 152 1.6 628.1 1.0X +Parquet Vectorized (Pushdown) 4670 5085 251 3.4 296.9 2.1X +Native ORC Vectorized 5895 6129 131 2.7 374.8 1.7X +Native ORC Vectorized (Pushdown) 849 934 68 18.5 54.0 11.6X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8784 11179 870 1.8 558.5 1.0X -Parquet Vectorized (Pushdown) 7928 7933 6 2.0 504.0 1.1X -Native ORC Vectorized 6158 7225 1010 2.6 391.5 1.4X -Native ORC Vectorized (Pushdown) 852 875 27 18.5 54.1 10.3X +Parquet Vectorized 9556 9926 213 1.6 607.6 1.0X +Parquet Vectorized (Pushdown) 8856 8905 61 1.8 563.0 1.1X +Native ORC Vectorized 6137 6173 51 2.6 390.2 1.6X +Native ORC Vectorized (Pushdown) 836 898 49 18.8 53.2 11.4X ================================================================================================ Pushdown benchmark for tinyint ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure 
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3932 4116 221 4.0 250.0 1.0X -Parquet Vectorized (Pushdown) 184 187 6 85.5 11.7 21.4X -Native ORC Vectorized 2624 2642 35 6.0 166.8 1.5X -Native ORC Vectorized (Pushdown) 219 226 18 71.9 13.9 18.0X +Parquet Vectorized 5197 5225 33 3.0 330.4 1.0X +Parquet Vectorized (Pushdown) 190 195 7 82.8 12.1 27.3X +Native ORC Vectorized 2550 2585 40 6.2 162.2 2.0X +Native ORC Vectorized (Pushdown) 212 228 27 74.3 13.5 24.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4574 4767 400 3.4 290.8 1.0X -Parquet Vectorized (Pushdown) 1162 1171 6 13.5 73.9 3.9X -Native ORC Vectorized 3250 3263 22 4.8 206.6 1.4X -Native ORC Vectorized (Pushdown) 1059 1070 9 14.9 67.3 4.3X +Parquet Vectorized 5806 5866 58 2.7 369.1 1.0X +Parquet Vectorized (Pushdown) 1254 1316 53 12.5 79.7 4.6X +Native ORC Vectorized 3211 3215 4 4.9 204.1 1.8X +Native ORC Vectorized (Pushdown) 1062 1071 12 14.8 67.5 5.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7254 7269 10 2.2 461.2 1.0X -Parquet Vectorized (Pushdown) 5393 5402 14 2.9 342.9 1.3X -Native ORC Vectorized 5798 5821 14 2.7 368.6 1.3X -Native ORC Vectorized (Pushdown) 4622 4627 4 3.4 293.8 1.6X +Parquet Vectorized 8326 8498 123 1.9 529.3 1.0X +Parquet Vectorized (Pushdown) 6037 6106 66 2.6 383.8 1.4X +Native ORC Vectorized 5724 5796 45 2.7 363.9 1.5X +Native ORC Vectorized (Pushdown) 4638 4652 19 3.4 294.9 1.8X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9951 9964 11 1.6 632.7 1.0X -Parquet Vectorized (Pushdown) 9603 9617 25 1.6 610.5 1.0X -Native ORC Vectorized 8411 8432 27 1.9 534.8 1.2X -Native ORC Vectorized (Pushdown) 8218 8226 7 1.9 522.5 1.2X +Parquet Vectorized 10622 11327 418 1.5 675.3 1.0X +Parquet Vectorized (Pushdown) 10144 10719 333 1.6 644.9 1.0X +Native ORC Vectorized 7425 8222 463 2.1 472.1 1.4X +Native ORC Vectorized (Pushdown) 7305 8035 409 2.2 464.5 1.5X ================================================================================================ Pushdown benchmark for Timestamp ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4187 4209 27 3.8 266.2 1.0X -Parquet Vectorized (Pushdown) 4192 4197 5 3.8 266.5 1.0X -Native ORC Vectorized 2748 2762 17 5.7 174.7 1.5X -Native ORC Vectorized (Pushdown) 138 144 16 114.2 8.8 30.4X +Parquet Vectorized 5508 5573 69 2.9 350.2 1.0X +Parquet Vectorized (Pushdown) 5497 5544 89 2.9 349.5 1.0X +Native ORC Vectorized 2420 2525 131 6.5 153.8 2.3X +Native ORC Vectorized (Pushdown) 137 144 15 115.1 8.7 40.3X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4941 4959 12 3.2 314.1 1.0X -Parquet Vectorized (Pushdown) 4940 4950 9 3.2 314.1 1.0X -Native ORC Vectorized 3441 3450 6 4.6 218.8 1.4X -Native ORC Vectorized (Pushdown) 1103 1114 10 14.3 70.1 4.5X +Parquet Vectorized 6255 6330 100 2.5 397.7 1.0X +Parquet Vectorized (Pushdown) 6170 6252 61 2.5 392.3 1.0X +Native ORC Vectorized 3365 3374 7 4.7 213.9 1.9X +Native ORC Vectorized (Pushdown) 959 976 13 16.4 61.0 6.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7869 7883 17 2.0 500.3 1.0X -Parquet Vectorized (Pushdown) 7867 7881 10 2.0 500.2 1.0X -Native ORC Vectorized 6200 6226 20 2.5 394.2 1.3X -Native ORC Vectorized (Pushdown) 4921 4937 18 3.2 312.9 1.6X +Parquet Vectorized 9044 9134 59 1.7 575.0 1.0X +Parquet Vectorized (Pushdown) 8816 8965 146 1.8 560.5 1.0X +Native ORC Vectorized 6038 6053 14 2.6 383.9 1.5X +Native ORC Vectorized (Pushdown) 4790 4810 16 3.3 304.6 1.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10750 10766 9 1.5 683.5 1.0X -Parquet Vectorized (Pushdown) 10737 10758 20 1.5 682.6 1.0X -Native ORC Vectorized 8947 8981 49 1.8 568.9 1.2X -Native ORC Vectorized (Pushdown) 8720 8728 8 1.8 554.4 1.2X +Parquet Vectorized 11786 11979 151 1.3 749.3 1.0X +Parquet Vectorized (Pushdown) 11463 11795 225 1.4 728.8 1.0X +Native ORC Vectorized 8459 8709 156 1.9 537.8 1.4X +Native ORC Vectorized (Pushdown) 7979 8447 425 2.0 507.3 1.5X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3610 3621 11 4.4 229.5 1.0X -Parquet Vectorized (Pushdown) 135 139 6 116.2 8.6 26.7X -Native ORC Vectorized 2741 2753 19 5.7 174.3 1.3X -Native ORC Vectorized (Pushdown) 135 142 16 116.2 8.6 26.7X +Parquet Vectorized 4436 4800 228 3.5 282.1 1.0X +Parquet Vectorized (Pushdown) 137 143 7 114.5 8.7 32.3X +Native ORC Vectorized 2676 2688 15 5.9 170.2 1.7X +Native ORC Vectorized (Pushdown) 134 143 23 117.0 8.5 33.0X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4365 4378 9 3.6 277.5 1.0X -Parquet Vectorized (Pushdown) 1215 1226 9 13.0 77.2 3.6X -Native ORC Vectorized 3434 3444 7 4.6 218.3 1.3X -Native ORC Vectorized (Pushdown) 1095 1107 15 14.4 69.6 4.0X +Parquet Vectorized 5606 5657 60 2.8 356.5 1.0X +Parquet Vectorized (Pushdown) 1334 1349 23 11.8 84.8 4.2X +Native ORC Vectorized 3373 3408 62 4.7 214.5 1.7X +Native ORC Vectorized (Pushdown) 1076 1110 33 14.6 68.4 5.2X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7289 7310 24 2.2 463.4 1.0X -Parquet Vectorized (Pushdown) 5567 5597 23 2.8 353.9 1.3X -Native ORC Vectorized 6204 6243 33 2.5 394.4 1.2X -Native ORC Vectorized (Pushdown) 4916 4938 21 3.2 312.6 1.5X +Parquet Vectorized 8446 8493 50 1.9 537.0 1.0X +Parquet Vectorized (Pushdown) 6001 6108 67 2.6 381.5 1.4X +Native ORC Vectorized 6034 6082 37 2.6 383.7 1.4X +Native ORC Vectorized (Pushdown) 4791 4806 13 3.3 304.6 1.8X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10119 10148 20 1.6 643.4 1.0X -Parquet Vectorized (Pushdown) 9798 9824 21 1.6 622.9 1.0X -Native ORC Vectorized 8953 8966 7 1.8 569.2 1.1X -Native ORC Vectorized (Pushdown) 8730 8756 25 1.8 555.0 1.2X +Parquet Vectorized 11219 11315 84 1.4 713.3 1.0X +Parquet Vectorized (Pushdown) 10295 10716 264 1.5 654.5 1.1X +Native ORC Vectorized 8693 8798 112 1.8 552.7 1.3X +Native ORC Vectorized (Pushdown) 7998 8362 239 2.0 508.5 1.4X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3782 3826 86 4.2 240.4 1.0X -Parquet Vectorized (Pushdown) 135 138 6 116.4 8.6 28.0X -Native ORC Vectorized 2733 2736 3 5.8 173.8 1.4X -Native ORC Vectorized (Pushdown) 136 142 16 115.3 8.7 27.7X +Parquet Vectorized 4622 5010 228 3.4 293.8 1.0X +Parquet Vectorized (Pushdown) 121 136 15 129.7 7.7 38.1X +Native ORC Vectorized 2395 2602 118 6.6 152.3 1.9X +Native ORC Vectorized (Pushdown) 133 141 21 118.2 8.5 34.7X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4521 4535 11 3.5 287.4 1.0X -Parquet Vectorized (Pushdown) 1233 1244 9 12.8 78.4 3.7X -Native ORC Vectorized 3428 3434 7 4.6 218.0 1.3X -Native ORC Vectorized (Pushdown) 1094 1106 12 14.4 69.6 4.1X +Parquet Vectorized 5694 5797 68 2.8 362.0 1.0X +Parquet Vectorized (Pushdown) 1296 1338 26 12.1 82.4 4.4X +Native ORC Vectorized 3367 3408 52 4.7 214.1 1.7X +Native ORC Vectorized (Pushdown) 960 1047 58 16.4 61.0 5.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7434 7449 16 2.1 472.6 1.0X -Parquet Vectorized (Pushdown) 5631 5664 23 2.8 358.0 1.3X -Native ORC Vectorized 6194 6213 18 2.5 393.8 1.2X -Native ORC Vectorized (Pushdown) 4912 4937 27 3.2 312.3 1.5X +Parquet Vectorized 8593 8688 77 1.8 546.3 1.0X +Parquet Vectorized (Pushdown) 6022 6181 132 2.6 382.9 1.4X +Native ORC Vectorized 5730 6013 195 2.7 364.3 1.5X +Native ORC Vectorized (Pushdown) 4636 4813 103 3.4 294.8 1.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10277 10290 12 1.5 653.4 1.0X -Parquet Vectorized (Pushdown) 9939 9949 9 1.6 631.9 1.0X -Native ORC Vectorized 8925 8932 5 1.8 567.4 1.2X -Native ORC Vectorized (Pushdown) 8711 8722 13 1.8 553.8 1.2X +Parquet Vectorized 11111 11267 98 1.4 706.4 1.0X +Parquet Vectorized (Pushdown) 10828 10901 83 1.5 688.4 1.0X +Native ORC Vectorized 7966 8554 377 2.0 506.5 1.4X +Native ORC Vectorized (Pushdown) 8306 8453 131 1.9 528.1 1.3X ================================================================================================ Pushdown benchmark with many filters ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row 
with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 177 180 4 0.0 176597527.0 1.0X -Parquet Vectorized (Pushdown) 179 184 8 0.0 179049335.0 1.0X -Native ORC Vectorized 165 169 5 0.0 165392336.0 1.1X -Native ORC Vectorized (Pushdown) 177 181 5 0.0 177191031.0 1.0X +Parquet Vectorized 165 181 9 0.0 165274170.0 1.0X +Parquet Vectorized (Pushdown) 182 193 22 0.0 182084552.0 0.9X +Native ORC Vectorized 154 169 10 0.0 153949658.0 1.1X +Native ORC Vectorized (Pushdown) 183 188 5 0.0 183334682.0 0.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 1390 1660 299 0.0 1389636669.0 1.0X -Parquet Vectorized (Pushdown) 1448 1467 16 0.0 1447963287.0 1.0X -Native ORC Vectorized 1371 1396 19 0.0 1370726861.0 1.0X -Native ORC Vectorized (Pushdown) 1388 1415 20 0.0 1387966614.0 1.0X +Parquet Vectorized 1655 2069 688 0.0 1655292270.0 1.0X +Parquet Vectorized (Pushdown) 1910 1918 9 0.0 1909884497.0 0.9X +Native ORC Vectorized 1848 1889 41 0.0 1847853824.0 0.9X +Native ORC Vectorized (Pushdown) 1862 1868 5 0.0 1861974825.0 0.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.13.0-1021-azure +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 6277 7015 
918 0.0 6276800820.0 1.0X -Parquet Vectorized (Pushdown) 6398 6420 13 0.0 6398363663.0 1.0X -Native ORC Vectorized 6154 6202 28 0.0 6153717451.0 1.0X -Native ORC Vectorized (Pushdown) 6150 6191 29 0.0 6150438862.0 1.0X +Parquet Vectorized 7721 8802 1054 0.0 7720771648.0 1.0X +Parquet Vectorized (Pushdown) 8444 8628 173 0.0 8443708092.0 0.9X +Native ORC Vectorized 7854 8339 283 0.0 7854189202.0 1.0X +Native ORC Vectorized (Pushdown) 7812 8325 310 0.0 7811643781.0 1.0X diff --git a/sql/core/benchmarks/TPCDSQueryBenchmark-jdk11-results.txt b/sql/core/benchmarks/TPCDSQueryBenchmark-jdk11-results.txt index 0e1a6d504d..142572bb69 100644 --- a/sql/core/benchmarks/TPCDSQueryBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/TPCDSQueryBenchmark-jdk11-results.txt @@ -1,810 +1,810 @@ -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q1 1460 1941 680 0.0 Infinity 1.0X +q1 1339 1579 339 0.3 2903.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q2 2422 2665 344 0.0 Infinity 1.0X +q2 1321 1500 253 1.7 591.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 
11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q3 566 578 8 0.0 Infinity 1.0X +q3 463 514 48 6.4 156.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q4 15396 15718 456 0.0 Infinity 1.0X +q4 6635 7140 715 0.8 1273.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5 3251 3670 592 0.0 Infinity 1.0X +q5 1638 1754 164 3.4 291.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6 2139 2232 131 0.0 Infinity 1.0X +q6 2024 2407 542 1.5 648.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 
4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q7 1285 1365 113 0.0 Infinity 1.0X +q7 926 935 7 5.3 189.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q8 1038 1085 67 0.0 Infinity 1.0X +q8 1103 1197 133 2.8 355.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q9 2709 2729 28 0.0 Infinity 1.0X +q9 1444 1493 69 0.0 41248439.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10 5975 6075 140 0.0 Infinity 1.0X +q10 3069 3244 247 0.7 
1482.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11 3569 4018 635 0.0 Infinity 1.0X +q11 2494 2670 248 1.5 661.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12 503 541 45 0.0 Infinity 1.0X +q12 270 320 59 3.0 333.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q13 2950 3044 132 0.0 Infinity 1.0X +q13 1331 1484 217 3.7 270.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q14a 24716 25725 1427 0.0 Infinity 1.0X +q14a 7211 8039 1171 0.7 1405.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14b 20165 20747 822 0.0 Infinity 1.0X +q14b 5828 5960 188 0.9 1136.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q15 837 896 55 0.0 Infinity 1.0X +q15 565 605 53 2.9 339.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q16 2124 2190 93 0.0 Infinity 1.0X +q16 1353 1502 211 1.2 865.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL 
CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q17 3178 3351 245 0.0 Infinity 1.0X +q17 2772 2931 225 1.7 589.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18 2523 2653 184 0.0 Infinity 1.0X +q18 1854 1915 86 1.9 514.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q19 818 876 53 0.0 Infinity 1.0X +q19 535 693 150 5.8 171.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20 513 521 7 0.0 Infinity 1.0X +q20 448 472 20 3.4 292.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 
11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q21 1458 1496 54 0.0 Infinity 1.0X +q21 1112 1214 144 10.6 94.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22 4247 4364 166 0.0 Infinity 1.0X +q22 4472 4719 349 2.6 377.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23a 15449 15516 95 0.0 Infinity 1.0X +q23a 7856 8054 279 0.7 1502.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23b 18832 19116 401 0.0 Infinity 1.0X +q23b 8528 8742 303 0.6 1630.7 1.0X -OpenJDK 64-Bit Server VM 
11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24a 3190 3852 937 0.0 Infinity 1.0X +q24a 583 702 107 5.7 174.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24b 3326 3374 68 0.0 Infinity 1.0X +q24b 473 577 77 7.0 141.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q25 3145 3174 40 0.0 Infinity 1.0X +q25 2511 2685 246 1.9 534.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q26 852 
879 25 0.0 Infinity 1.0X +q26 569 592 19 6.1 164.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27 1392 1393 1 0.0 Infinity 1.0X +q27 1140 1158 25 4.3 233.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q28 3913 3932 27 0.0 Infinity 1.0X +q28 2298 2427 182 1.3 798.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q29 3145 3199 77 0.0 Infinity 1.0X +q29 2686 2789 145 1.7 571.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q30 1263 1304 57 0.0 Infinity 1.0X +q30 805 901 89 0.4 2730.2 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q31 2182 2520 479 0.0 Infinity 1.0X +q31 1833 2313 680 2.0 492.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q32 641 652 14 0.0 Infinity 1.0X +q32 341 390 61 4.5 222.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q33 1779 1971 272 0.0 Infinity 1.0X +q33 1137 1221 118 4.6 219.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 
TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34 828 841 11 0.0 Infinity 1.0X +q34 759 795 58 4.0 247.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35 5293 5346 75 0.0 Infinity 1.0X +q35 2550 2866 447 0.8 1231.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36 1222 1226 6 0.0 Infinity 1.0X +q36 841 846 5 3.5 283.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q37 1432 1460 39 0.0 Infinity 1.0X +q37 1483 1569 123 9.0 111.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS 
on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q38 1927 2083 221 0.0 Infinity 1.0X +q38 1303 1431 181 4.0 250.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39a 2762 2911 210 0.0 Infinity 1.0X +q39a 2319 2742 599 5.1 195.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39b 2710 2841 186 0.0 Infinity 1.0X +q39b 2523 2550 38 4.7 213.2 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q40 732 801 64 0.0 Infinity 1.0X +q40 643 781 121 2.6 383.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 
4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q41 412 466 50 0.0 Infinity 1.0X +q41 260 369 73 0.1 14418.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q42 424 444 22 0.0 Infinity 1.0X +q42 273 368 68 10.9 91.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q43 673 683 11 0.0 Infinity 1.0X +q43 551 630 55 5.4 186.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q44 1325 1340 21 0.0 Infinity 1.0X +q44 1026 1057 44 2.8 353.9 1.0X 
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q45 552 618 52 0.0 Infinity 1.0X +q45 469 573 154 2.0 488.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q46 1097 1124 39 0.0 Infinity 1.0X +q46 920 1075 220 3.4 295.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47 4876 5064 266 0.0 Infinity 1.0X +q47 2593 2836 344 1.1 872.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q48 2709 2734 35 0.0 Infinity 1.0X +q48 1213 1227 19 4.1 246.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49 2172 2361 267 0.0 Infinity 1.0X +q49 1387 1639 355 4.0 247.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q50 1467 1516 69 0.0 Infinity 1.0X +q50 1195 1230 49 2.7 368.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51 4681 4847 234 0.0 Infinity 1.0X +q51 3414 3868 641 1.1 929.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 
2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q52 423 443 27 0.0 Infinity 1.0X +q52 345 407 44 8.6 116.2 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q53 723 741 18 0.0 Infinity 1.0X +q53 646 683 33 4.6 217.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q54 3656 3675 27 0.0 Infinity 1.0X +q54 2324 2403 112 2.3 440.2 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q55 416 450 31 0.0 Infinity 1.0X +q55 260 341 72 11.4 87.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 
11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q56 1552 1585 47 0.0 Infinity 1.0X +q56 990 1020 43 5.2 191.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57 2971 3246 388 0.0 Infinity 1.0X +q57 1929 1948 26 0.8 1260.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q58 1843 2035 271 0.0 Infinity 1.0X +q58 1178 1295 166 4.4 229.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q59 2128 2177 69 0.0 Infinity 1.0X +q59 1124 1182 82 2.6 380.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 
4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q60 1478 1498 27 0.0 Infinity 1.0X +q60 1009 1069 85 5.1 194.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q61 1471 1574 145 0.0 Infinity 1.0X +q61 1066 1128 89 2.9 341.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q62 539 550 11 0.0 Infinity 1.0X +q62 374 419 28 2.1 472.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q63 738 751 23 0.0 Infinity 1.0X +q63 558 617 66 5.3 187.8 1.0X 
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64 8378 9105 1028 0.0 Infinity 1.0X +q64 4712 5577 1224 1.5 680.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q65 1642 1685 61 0.0 Infinity 1.0X +q65 1031 1061 43 2.9 347.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q66 2038 2056 24 0.0 Infinity 1.0X +q66 1090 1444 501 2.1 470.2 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q67 10208 10302 133 0.0 Infinity 1.0X +q67 8923 9116 273 0.3 3003.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q68 1021 1050 42 0.0 Infinity 1.0X +q68 1028 1064 51 3.0 330.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q69 5244 5326 116 0.0 Infinity 1.0X +q69 2706 2865 224 0.8 1306.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70 1441 1448 9 0.0 Infinity 1.0X +q70 994 1012 26 3.0 336.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 
2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q71 1230 1240 13 0.0 Infinity 1.0X +q71 756 811 53 6.9 144.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72 21418 22601 1674 0.0 Infinity 1.0X +q72 108953 109207 359 0.1 7098.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q73 779 783 3 0.0 Infinity 1.0X +q73 619 710 81 4.9 202.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74 2947 3332 545 0.0 Infinity 1.0X +q74 2088 2668 820 1.8 553.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server 
VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75 5149 5374 317 0.0 Infinity 1.0X +q75 2545 2998 641 2.2 451.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q76 969 974 5 0.0 Infinity 1.0X +q76 509 592 57 10.1 99.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77 1928 2256 464 0.0 Infinity 1.0X +q77 1463 1475 16 3.8 260.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78 4871 5152 397 0.0 Infinity 1.0X +q78 4133 4242 154 1.4 735.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 
4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q79 906 958 57 0.0 Infinity 1.0X +q79 751 831 94 4.1 245.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80 3756 4051 417 0.0 Infinity 1.0X +q80 2522 2957 615 2.2 446.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q81 1124 1152 40 0.0 Infinity 1.0X +q81 964 1022 82 0.4 2627.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q82 1954 1981 39 0.0 Infinity 1.0X +q82 1992 2130 196 7.4 
135.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q83 1150 1159 13 0.0 Infinity 1.0X +q83 695 731 32 0.9 1168.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q84 1301 1333 46 0.0 Infinity 1.0X +q84 1235 1239 5 1.9 522.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q85 4009 4176 235 0.0 Infinity 1.0X +q85 2841 3052 298 1.0 1002.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q86 657 678 25 0.0 Infinity 1.0X +q86 293 322 43 2.8 362.2 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q87 2230 2470 339 0.0 Infinity 1.0X +q87 1314 1379 92 4.0 252.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q88 2772 2959 265 0.0 Infinity 1.0X +q88 2202 2267 92 1.4 740.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q89 819 856 40 0.0 Infinity 1.0X +q89 609 693 78 4.9 204.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS 
Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q90 436 445 8 0.0 Infinity 1.0X +q90 266 330 32 3.0 327.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q91 753 837 73 0.0 Infinity 1.0X +q91 587 630 34 3.9 255.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q92 532 557 27 0.0 Infinity 1.0X +q92 297 362 58 2.7 366.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q93 1280 1304 34 0.0 Infinity 1.0X +q93 762 828 66 4.2 240.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 
5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q94 1034 1072 53 0.0 Infinity 1.0X +q94 737 886 161 1.1 875.2 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q95 6223 6526 429 0.0 Infinity 1.0X +q95 8426 8574 209 0.1 10007.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q96 392 399 7 0.0 Infinity 1.0X +q96 300 340 46 9.9 100.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q97 1845 1932 124 0.0 Infinity 1.0X +q97 2098 2344 348 2.1 477.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) 
Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98 607 643 26 0.0 Infinity 1.0X +q98 604 636 33 4.9 203.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q99 650 689 51 0.0 Infinity 1.0X +q99 405 480 68 3.7 267.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5a-v2.7 5008 5199 270 0.0 Infinity 1.0X +q5a-v2.7 1939 2146 292 2.9 344.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6-v2.7 1873 1930 80 0.0 Infinity 1.0X +q6-v2.7 1901 2128 320 1.6 609.2 1.0X 
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10a-v2.7 4496 4505 13 0.0 Infinity 1.0X +q10a-v2.7 2513 2818 431 0.8 1213.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11-v2.7 3597 3918 454 0.0 Infinity 1.0X +q11-v2.7 2274 2471 279 1.7 602.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12-v2.7 405 432 22 0.0 Infinity 1.0X +q12-v2.7 213 245 32 3.8 263.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q14-v2.7 18204 18604 565 0.0 Infinity 1.0X +q14-v2.7 5950 6558 859 0.9 1159.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a-v2.7 116778 117402 883 0.0 Infinity 1.0X +q14a-v2.7 10474 10940 659 0.5 2041.8 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18a-v2.7 4616 4850 331 0.0 Infinity 1.0X +q18a-v2.7 3064 3264 283 1.2 850.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20-v2.7 449 487 46 0.0 Infinity 1.0X +q20-v2.7 328 365 35 4.7 214.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 
5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22-v2.7 20882 20987 149 0.0 Infinity 1.0X +q22-v2.7 16149 16356 293 0.7 1364.4 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22a-v2.7 10305 10646 483 0.0 Infinity 1.0X +q22a-v2.7 3107 3290 259 3.8 262.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24-v2.7 2843 3091 350 0.0 Infinity 1.0X +q24-v2.7 533 574 36 6.3 159.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27a-v2.7 2733 2857 177 0.0 Infinity 1.0X +q27a-v2.7 2055 2211 221 2.4 420.1 1.0X -OpenJDK 64-Bit Server VM 
11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34-v2.7 784 834 43 0.0 Infinity 1.0X +q34-v2.7 678 715 33 4.5 221.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35-v2.7 4648 4900 358 0.0 Infinity 1.0X +q35-v2.7 2592 2650 82 0.8 1251.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35a-v2.7 4225 4350 177 0.0 Infinity 1.0X +q35a-v2.7 2176 2329 217 1.0 1050.9 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q36a-v2.7 2651 2830 253 0.0 Infinity 1.0X +q36a-v2.7 795 808 20 3.7 267.6 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47-v2.7 4884 5022 195 0.0 Infinity 1.0X +q47-v2.7 2348 2770 597 1.3 790.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49-v2.7 2126 2311 262 0.0 Infinity 1.0X +q49-v2.7 1158 1380 314 4.8 206.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51a-v2.7 30262 30597 474 0.0 Infinity 1.0X +q51a-v2.7 20194 20996 1135 0.2 5499.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 
5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57-v2.7 2962 3086 176 0.0 Infinity 1.0X +q57-v2.7 1601 1783 258 1.0 1045.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64-v2.7 8345 8680 474 0.0 Infinity 1.0X +q64-v2.7 5253 5424 242 1.3 759.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67a-v2.7 18924 19300 532 0.0 Infinity 1.0X +q67a-v2.7 11145 11459 443 0.3 3751.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70a-v2.7 2575 2677 144 0.0 Infinity 1.0X +q70a-v2.7 1303 1358 78 2.3 441.3 1.0X -OpenJDK 64-Bit Server VM 
11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72-v2.7 21513 22533 1442 0.0 Infinity 1.0X +q72-v2.7 108129 110913 NaN 0.1 7045.1 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74-v2.7 2891 3182 411 0.0 Infinity 1.0X +q74-v2.7 1874 2366 697 2.0 496.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75-v2.7 5079 5308 324 0.0 Infinity 1.0X +q75-v2.7 2672 2955 401 2.1 474.3 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q77a-v2.7 4166 4419 357 0.0 Infinity 1.0X +q77a-v2.7 1696 1758 88 3.3 302.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78-v2.7 4992 5258 376 0.0 Infinity 1.0X +q78-v2.7 4100 4483 541 1.4 730.2 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80a-v2.7 6027 6309 399 0.0 Infinity 1.0X +q80a-v2.7 3268 3309 58 1.7 579.0 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86a-v2.7 1475 1639 232 0.0 Infinity 1.0X +q86a-v2.7 383 453 75 2.1 472.7 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure 
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98-v2.7 596 626 34 0.0 Infinity 1.0X +q98-v2.7 520 617 71 5.7 175.0 1.0X diff --git a/sql/core/benchmarks/TPCDSQueryBenchmark-jdk17-results.txt b/sql/core/benchmarks/TPCDSQueryBenchmark-jdk17-results.txt new file mode 100644 index 0000000000..29801027f7 --- /dev/null +++ b/sql/core/benchmarks/TPCDSQueryBenchmark-jdk17-results.txt @@ -0,0 +1,810 @@ +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q1 1033 1152 168 0.4 2239.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q2 1091 1127 50 2.0 488.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q3 319 344 19 9.3 107.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q4 6286 6719 613 0.8 
1206.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q5 1569 1904 473 3.6 278.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q6 1535 1679 204 2.0 491.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q7 885 935 54 5.5 181.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q8 660 716 72 4.7 212.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q9 1381 1434 76 0.0 39451080.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q10 2354 2505 214 0.9 1136.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q11 2480 2607 180 1.5 657.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q12 243 267 19 3.3 299.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q13 1066 1096 42 4.6 216.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q14a 7371 7682 441 0.7 1436.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q14b 6048 6229 255 0.8 1179.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure 
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q15 565 594 49 2.9 339.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q16 919 1046 179 1.7 587.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q17 2047 2259 300 2.3 435.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q18 1461 1558 137 2.5 405.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q19 530 541 12 5.9 169.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q20 273 285 10 5.6 178.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q21 1003 1017 20 11.8 84.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q22 4546 4606 85 2.6 384.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q23a 8246 8453 294 0.6 1576.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q23b 8717 8809 130 0.6 1666.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q24a 358 440 62 9.3 107.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure 
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q24b 205 406 137 16.3 61.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q25 2259 2417 224 2.1 480.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q26 674 700 22 5.1 195.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q27 956 1016 86 5.1 195.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q28 2062 2425 514 1.4 715.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q29 2317 2339 32 2.0 493.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q30 960 970 13 0.3 3257.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q31 1870 2278 577 2.0 502.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q32 467 632 118 3.3 304.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q33 1086 1233 207 4.8 209.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q34 717 749 28 4.3 234.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) 
Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q35 2308 2440 187 0.9 1114.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q36 955 1004 70 3.1 321.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q37 1416 1449 47 9.4 106.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q38 1282 1308 36 4.1 246.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q39a 2686 2769 118 4.4 226.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q39b 2444 2446 3 4.8 206.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q40 512 599 131 3.3 305.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q41 182 251 77 0.1 10102.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q42 334 367 27 8.9 112.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q43 518 557 40 5.7 175.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q44 596 625 32 4.9 205.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) 
Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q45 322 359 40 3.0 335.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q46 911 950 54 3.4 293.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q47 2642 2771 183 1.1 889.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q48 1288 1309 30 3.8 261.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q49 1308 1356 68 4.3 233.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ 
+q50 1031 1060 41 3.1 318.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q51 3978 4253 388 0.9 1083.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q52 305 342 29 9.7 102.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q53 540 592 56 5.5 181.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q54 2361 2419 82 2.2 447.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q55 241 300 47 12.3 81.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q56 687 763 87 7.5 132.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q57 1271 1558 407 1.2 829.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q58 693 735 39 7.4 135.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q59 952 1011 83 3.1 322.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q60 944 1228 402 5.5 182.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q61 1199 1230 44 2.6 384.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) 
Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q62 356 409 42 2.2 450.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q63 532 546 11 5.6 179.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q64 4899 4991 130 1.4 707.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q65 1304 1324 28 2.3 439.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q66 1240 1620 537 1.9 535.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q67 8365 8506 200 0.4 2815.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q68 938 980 43 3.3 301.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q69 2476 2618 201 0.8 1195.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q70 1058 1069 16 2.8 358.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q71 742 833 81 7.0 142.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q72 102897 104819 2719 0.1 6704.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure 
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q73 627 692 57 4.9 204.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q74 2139 2604 658 1.8 567.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q75 2587 3123 758 2.2 459.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q76 499 571 70 10.3 97.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q77 1553 1625 102 3.6 276.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q78 3020 3287 378 1.9 537.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q79 698 805 104 4.4 228.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q80 2539 2653 161 2.2 449.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q81 481 513 34 0.8 1310.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q82 1627 1665 53 9.0 110.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q83 445 479 23 1.3 748.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) 
Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q84 857 927 61 2.8 362.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q85 2816 2851 50 1.0 993.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q86 328 369 32 2.5 405.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q87 1176 1191 21 4.4 225.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q88 2280 2501 313 1.3 766.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q89 569 593 30 5.2 191.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q90 240 290 25 3.4 295.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q91 463 516 46 5.0 201.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q92 277 405 185 2.9 341.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q93 682 688 6 4.6 215.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q94 629 671 37 1.3 746.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) 
Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q95 9785 9986 285 0.1 11620.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q96 335 352 26 8.9 112.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q97 1786 1921 191 2.5 406.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q98 570 590 19 5.2 191.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q99 450 527 89 3.4 297.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ 
+q5a-v2.7 2379 2579 283 2.4 422.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q6-v2.7 1651 1831 254 1.9 529.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q10a-v2.7 2612 2730 167 0.8 1261.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q11-v2.7 2739 3086 491 1.4 726.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q12-v2.7 190 212 17 4.3 234.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q14-v2.7 6097 6714 872 0.8 1188.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q14a-v2.7 10786 11440 925 0.5 2102.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q18a-v2.7 2742 2969 321 1.3 761.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q20-v2.7 308 353 24 5.0 201.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q22-v2.7 17452 17464 16 0.7 1474.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q22a-v2.7 2990 3106 164 4.0 252.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q24-v2.7 363 523 99 9.2 108.7 1.0X + +OpenJDK 
64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q27a-v2.7 2185 2512 462 2.2 446.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q34-v2.7 666 703 63 4.6 217.6 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q35-v2.7 2309 2412 147 0.9 1114.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q35a-v2.7 2216 2231 21 0.9 1070.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q36a-v2.7 878 929 45 3.4 295.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q47-v2.7 2269 2519 353 1.3 763.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q49-v2.7 1091 1348 363 5.1 194.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q51a-v2.7 20587 20980 556 0.2 5606.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q57-v2.7 1268 1441 245 1.2 828.0 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q64-v2.7 3734 3996 371 1.9 539.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q67a-v2.7 10734 10916 258 0.3 3613.0 1.0X + +OpenJDK 64-Bit Server VM 
17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q70a-v2.7 1077 1107 43 2.7 364.7 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q72-v2.7 106867 106983 164 0.1 6962.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q74-v2.7 2087 2469 539 1.8 553.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q75-v2.7 2573 2996 598 2.2 456.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q77a-v2.7 1706 2061 502 3.3 303.8 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +q78-v2.7 4226 4228 4 1.3 752.5 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q80a-v2.7 3172 3304 186 1.8 561.9 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q86a-v2.7 420 478 69 1.9 518.1 1.0X + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +q98-v2.7 457 530 68 6.5 154.0 1.0X + diff --git a/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt b/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt index 8228e191ec..2a0a11ae15 100644 --- a/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt +++ b/sql/core/benchmarks/TPCDSQueryBenchmark-results.txt @@ -1,810 +1,810 @@ -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q1 1626 1675 69 0.0 
Infinity 1.0X +q1 962 1073 157 0.5 2085.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q2 2166 2277 158 0.0 Infinity 1.0X +q2 1117 1124 10 2.0 500.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q3 465 505 65 0.0 Infinity 1.0X +q3 303 330 35 9.8 102.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q4 15108 15662 784 0.0 Infinity 1.0X +q4 6074 6513 621 0.9 1165.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q5 3087 3281 274 0.0 Infinity 1.0X +q5 1496 1637 200 3.8 265.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6 1780 1873 132 0.0 Infinity 1.0X +q6 1678 1777 139 1.9 537.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q7 1103 1137 49 0.0 Infinity 1.0X +q7 798 827 37 6.1 163.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q8 998 1019 30 0.0 Infinity 1.0X +q8 564 696 94 5.5 181.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS 
Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q9 2445 2463 25 0.0 Infinity 1.0X +q9 1145 1167 30 0.0 32727984.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10 4853 5233 537 0.0 Infinity 1.0X +q10 2611 2721 155 0.8 1261.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11 3370 3417 67 0.0 Infinity 1.0X +q11 2401 2473 101 1.6 636.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12 442 538 67 0.0 Infinity 1.0X +q12 261 274 21 3.1 321.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 
1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q13 2589 2767 253 0.0 Infinity 1.0X +q13 1072 1091 26 4.6 217.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a 23714 24391 957 0.0 Infinity 1.0X +q14a 6406 6813 576 0.8 1248.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14b 19056 19103 66 0.0 Infinity 1.0X +q14b 5288 5334 66 1.0 1030.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q15 771 797 24 0.0 Infinity 1.0X +q15 550 584 28 3.0 331.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 
on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q16 1658 1707 69 0.0 Infinity 1.0X +q16 963 986 29 1.6 616.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q17 2905 2979 104 0.0 Infinity 1.0X +q17 2131 2208 108 2.2 453.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18 2272 2423 213 0.0 Infinity 1.0X +q18 1425 1730 431 2.5 395.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q19 707 730 35 0.0 Infinity 1.0X +q19 483 502 22 
6.5 154.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20 449 506 42 0.0 Infinity 1.0X +q20 268 287 13 5.7 175.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q21 1154 1167 19 0.0 Infinity 1.0X +q21 790 800 15 15.0 66.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22 4056 4476 594 0.0 Infinity 1.0X +q22 4347 4363 22 2.7 367.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q23a 14557 14780 317 0.0 Infinity 1.0X +q23a 8103 8232 183 0.6 1549.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q23b 17887 18451 799 0.0 Infinity 1.0X +q23b 8128 8354 320 0.6 1554.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24a 2930 3193 372 0.0 Infinity 1.0X +q24a 332 507 172 10.0 99.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q24b 2760 2958 280 0.0 Infinity 1.0X +q24b 264 459 233 12.6 79.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 
8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q25 2913 3150 335 0.0 Infinity 1.0X +q25 2068 2244 250 2.3 440.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q26 810 819 15 0.0 Infinity 1.0X +q26 530 558 26 6.5 153.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27 1171 1217 65 0.0 Infinity 1.0X +q27 793 807 14 6.2 162.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q28 3212 3273 86 0.0 Infinity 1.0X +q28 1556 1657 142 1.9 540.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 
64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q29 2917 3107 270 0.0 Infinity 1.0X +q29 2112 2240 181 2.2 449.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q30 1248 1277 40 0.0 Infinity 1.0X +q30 634 661 21 0.5 2149.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q31 1924 2091 237 0.0 Infinity 1.0X +q31 1253 1325 103 3.0 336.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q32 559 597 26 0.0 Infinity 1.0X +q32 317 342 25 4.8 207.1 1.0X -OpenJDK 64-Bit Server VM 
1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q33 1314 1325 16 0.0 Infinity 1.0X +q33 736 741 6 7.0 142.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34 761 773 11 0.0 Infinity 1.0X +q34 513 571 42 6.0 167.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35 4967 4984 24 0.0 Infinity 1.0X +q35 2100 2118 26 1.0 1014.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36 1109 1116 9 
0.0 Infinity 1.0X +q36 800 829 37 3.7 269.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q37 1200 1234 48 0.0 Infinity 1.0X +q37 1123 1152 41 11.8 84.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q38 1898 2035 194 0.0 Infinity 1.0X +q38 911 936 39 5.7 174.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q39a 2252 2362 155 0.0 Infinity 1.0X +q39a 1656 1823 236 7.1 139.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q39b 2142 2248 150 0.0 Infinity 1.0X +q39b 1627 1688 86 7.3 137.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q40 654 687 57 0.0 Infinity 1.0X +q40 511 539 24 3.3 305.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q41 383 448 51 0.0 Infinity 1.0X +q41 183 202 26 0.1 10144.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q42 358 383 21 0.0 Infinity 1.0X +q42 240 255 14 12.4 80.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz 
TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q43 577 619 37 0.0 Infinity 1.0X +q43 417 433 14 7.1 141.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q44 1188 1234 65 0.0 Infinity 1.0X +q44 525 534 8 5.5 181.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q45 529 562 25 0.0 Infinity 1.0X +q45 313 354 32 3.1 325.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q46 993 1023 42 0.0 Infinity 1.0X +q46 692 697 8 4.5 222.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on 
Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47 4547 4741 274 0.0 Infinity 1.0X +q47 2328 2357 42 1.3 783.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q48 2303 2426 174 0.0 Infinity 1.0X +q48 1125 1136 15 4.4 228.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49 2080 2086 9 0.0 Infinity 1.0X +q49 996 1012 22 5.6 177.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q50 1371 1388 24 0.0 Infinity 1.0X +q50 1013 1039 37 3.2 312.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws 
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51 4373 4513 197 0.0 Infinity 1.0X +q51 3408 3446 54 1.1 928.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q52 360 379 28 0.0 Infinity 1.0X +q52 236 244 10 12.6 79.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q53 661 677 23 0.0 Infinity 1.0X +q53 413 430 15 7.2 139.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q54 3454 3611 222 0.0 Infinity 1.0X +q54 1819 1868 69 2.9 344.4 1.0X -OpenJDK 
64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q55 360 383 37 0.0 Infinity 1.0X +q55 233 251 19 12.8 78.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q56 1350 1388 53 0.0 Infinity 1.0X +q56 659 702 37 7.9 127.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57 2910 3156 349 0.0 Infinity 1.0X +q57 1163 1182 27 1.3 759.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q58 1733 1762 42 0.0 Infinity 1.0X +q58 726 739 12 7.1 141.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q59 2021 2044 33 0.0 Infinity 1.0X +q59 889 902 18 3.3 300.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q60 1356 1404 67 0.0 Infinity 1.0X +q60 697 727 36 7.4 134.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q61 1290 1292 3 0.0 Infinity 1.0X +q61 912 913 1 3.4 292.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS 
Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q62 479 506 33 0.0 Infinity 1.0X +q62 276 287 12 2.9 348.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q63 619 647 20 0.0 Infinity 1.0X +q63 407 419 18 7.3 136.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64 7745 8352 859 0.0 Infinity 1.0X +q64 3772 4011 338 1.8 545.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q65 1307 1337 43 0.0 Infinity 1.0X +q65 925 955 32 3.2 311.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on 
Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q66 1879 2128 352 0.0 Infinity 1.0X +q66 998 1009 15 2.3 430.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67 9682 9703 29 0.0 Infinity 1.0X +q67 8340 8436 135 0.4 2807.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q68 928 952 34 0.0 Infinity 1.0X +q68 630 681 45 4.9 202.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q69 4261 4330 97 0.0 Infinity 1.0X +q69 2272 2359 124 0.9 1097.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws 
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70 1345 1361 23 0.0 Infinity 1.0X +q70 843 862 32 3.5 285.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q71 1103 1119 22 0.0 Infinity 1.0X +q71 554 571 13 9.4 106.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72 20211 21316 1562 0.0 Infinity 1.0X +q72 112849 113359 720 0.1 7352.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q73 680 725 49 0.0 Infinity 1.0X +q73 472 482 17 6.5 154.3 1.0X 
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74 3007 3109 144 0.0 Infinity 1.0X +q74 1641 1704 89 2.3 435.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75 4597 4942 487 0.0 Infinity 1.0X +q75 2089 2270 257 2.7 370.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q76 899 937 38 0.0 Infinity 1.0X +q76 457 487 25 11.2 89.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q77 1794 2086 412 0.0 Infinity 1.0X +q77 822 848 22 6.8 146.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78 4717 4827 155 0.0 Infinity 1.0X +q78 2696 2708 17 2.1 480.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q79 830 880 47 0.0 Infinity 1.0X +q79 569 586 25 5.4 185.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80 3233 3315 116 0.0 Infinity 1.0X +q80 2081 2094 19 2.7 368.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz 
TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q81 982 1070 123 0.0 Infinity 1.0X +q81 438 492 38 0.8 1193.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q82 1674 1738 89 0.0 Infinity 1.0X +q82 1584 1597 18 9.3 107.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q83 1067 1104 52 0.0 Infinity 1.0X +q83 441 481 26 1.3 741.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q84 1166 1210 62 0.0 Infinity 1.0X +q84 990 1002 17 2.4 418.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 
1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q85 3682 3831 211 0.0 Infinity 1.0X +q85 2324 2542 309 1.2 819.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86 616 635 18 0.0 Infinity 1.0X +q86 304 318 14 2.7 375.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q87 2101 2230 183 0.0 Infinity 1.0X +q87 858 879 21 6.1 164.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q88 2415 2523 153 0.0 Infinity 1.0X +q88 1664 1778 162 1.8 559.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 
4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q89 677 732 47 0.0 Infinity 1.0X +q89 446 478 25 6.7 150.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q90 414 429 19 0.0 Infinity 1.0X +q90 209 232 26 3.9 256.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q91 793 814 19 0.0 Infinity 1.0X +q91 523 546 21 4.4 227.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q92 508 530 24 0.0 Infinity 1.0X +q92 240 254 11 3.4 296.8 1.0X 
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q93 1138 1155 24 0.0 Infinity 1.0X +q93 591 613 15 5.4 186.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q94 979 1060 115 0.0 Infinity 1.0X +q94 511 538 20 1.6 607.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q95 5805 6024 310 0.0 Infinity 1.0X +q95 6991 7538 773 0.1 8303.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q96 337 345 10 0.0 Infinity 1.0X +q96 242 252 15 12.3 81.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q97 1641 1748 152 0.0 Infinity 1.0X +q97 1491 1501 14 2.9 339.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98 538 587 61 0.0 Infinity 1.0X +q98 394 407 14 7.5 132.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q99 619 633 19 0.0 Infinity 1.0X +q99 378 396 15 4.0 250.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS 
Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q5a-v2.7 4861 4954 132 0.0 Infinity 1.0X +q5a-v2.7 1630 1773 203 3.5 289.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q6-v2.7 1798 1861 89 0.0 Infinity 1.0X +q6-v2.7 1493 1510 25 2.1 478.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q10a-v2.7 4093 4209 164 0.0 Infinity 1.0X +q10a-v2.7 2526 2683 222 0.8 1219.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q11-v2.7 3336 3404 96 0.0 Infinity 1.0X +q11-v2.7 2161 2524 514 1.7 572.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 
@ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q12-v2.7 380 408 30 0.0 Infinity 1.0X +q12-v2.7 208 224 21 3.9 256.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14-v2.7 17331 17776 629 0.0 Infinity 1.0X +q14-v2.7 5125 5232 152 1.0 999.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q14a-v2.7 111982 112268 404 0.0 Infinity 1.0X +q14a-v2.7 10315 10351 51 0.5 2010.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q18a-v2.7 4063 4659 843 0.0 Infinity 1.0X +q18a-v2.7 2708 3029 453 
1.3 751.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q20-v2.7 420 446 29 0.0 Infinity 1.0X +q20-v2.7 233 247 17 6.6 151.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22-v2.7 18976 19164 265 0.0 Infinity 1.0X +q22-v2.7 16824 16841 24 0.7 1421.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q22a-v2.7 9087 9281 275 0.0 Infinity 1.0X +q22a-v2.7 2460 2511 71 4.8 207.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -q24-v2.7 2817 2834 24 0.0 Infinity 1.0X +q24-v2.7 263 402 84 12.7 78.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q27a-v2.7 2301 2401 141 0.0 Infinity 1.0X +q27a-v2.7 1640 1692 74 3.0 335.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q34-v2.7 700 731 48 0.0 Infinity 1.0X +q34-v2.7 469 489 19 6.5 153.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35-v2.7 4158 4513 503 0.0 Infinity 1.0X +q35-v2.7 1901 2004 146 1.1 917.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure 
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q35a-v2.7 3904 3979 106 0.0 Infinity 1.0X +q35a-v2.7 1873 1904 45 1.1 904.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q36a-v2.7 2430 2534 147 0.0 Infinity 1.0X +q36a-v2.7 761 815 51 3.9 256.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q47-v2.7 4502 4808 433 0.0 Infinity 1.0X +q47-v2.7 2070 2150 114 1.4 696.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q49-v2.7 1904 2159 360 0.0 Infinity 1.0X +q49-v2.7 934 949 14 6.0 166.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on 
Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q51a-v2.7 27939 28264 460 0.0 Infinity 1.0X +q51a-v2.7 22864 23113 352 0.2 6226.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q57-v2.7 2813 2981 237 0.0 Infinity 1.0X +q57-v2.7 1180 1197 25 1.3 770.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q64-v2.7 8413 8612 282 0.0 Infinity 1.0X +q64-v2.7 3784 4197 585 1.8 546.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q67a-v2.7 
17696 17858 230 0.0 Infinity 1.0X +q67a-v2.7 11365 11488 173 0.3 3825.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q70a-v2.7 2511 2562 71 0.0 Infinity 1.0X +q70a-v2.7 972 990 24 3.0 329.3 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q72-v2.7 20209 22083 2650 0.0 Infinity 1.0X +q72-v2.7 110614 111100 688 0.1 7207.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q74-v2.7 2870 2912 60 0.0 Infinity 1.0X +q74-v2.7 1681 1732 72 2.2 445.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q75-v2.7 4534 4870 475 0.0 Infinity 1.0X +q75-v2.7 2053 2236 258 2.7 364.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q77a-v2.7 4010 4285 388 0.0 Infinity 1.0X +q77a-v2.7 1488 1587 140 3.8 264.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q78-v2.7 4879 4969 127 0.0 Infinity 1.0X +q78-v2.7 2789 2963 247 2.0 496.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q80a-v2.7 5338 5728 552 0.0 Infinity 1.0X +q80a-v2.7 2535 2561 37 2.2 449.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on 
Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q86a-v2.7 1370 1391 29 0.0 Infinity 1.0X +q86a-v2.7 388 404 16 2.1 478.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz TPCDS Snappy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -q98-v2.7 577 612 31 0.0 Infinity 1.0X +q98-v2.7 389 402 13 7.6 131.1 1.0X diff --git a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk11-results.txt b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk11-results.txt new file mode 100644 index 0000000000..60c67b2f6a --- /dev/null +++ b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk11-results.txt @@ -0,0 +1,12 @@ +================================================================================================ +TakeOrderedAndProject +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +TakeOrderedAndProject with SMJ: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------- +TakeOrderedAndProject with SMJ for doExecute 521 537 14 0.0 52122.1 1.0X +TakeOrderedAndProject with SMJ for executeCollect 247 300 48 0.0 24737.7 2.1X + + diff --git a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk17-results.txt 
b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk17-results.txt new file mode 100644 index 0000000000..21697d5444 --- /dev/null +++ b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-jdk17-results.txt @@ -0,0 +1,12 @@ +================================================================================================ +TakeOrderedAndProject +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +TakeOrderedAndProject with SMJ: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------- +TakeOrderedAndProject with SMJ for doExecute 339 364 30 0.0 33873.3 1.0X +TakeOrderedAndProject with SMJ for executeCollect 129 146 22 0.1 12949.3 2.6X + + diff --git a/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt new file mode 100644 index 0000000000..9cfe6ceb36 --- /dev/null +++ b/sql/core/benchmarks/TakeOrderedAndProjectBenchmark-results.txt @@ -0,0 +1,12 @@ +================================================================================================ +TakeOrderedAndProject +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +TakeOrderedAndProject with SMJ: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------------- +TakeOrderedAndProject with SMJ for doExecute 287 337 43 0.0 28734.9 1.0X +TakeOrderedAndProject with SMJ for executeCollect 150 170 30 0.1 15037.8 1.9X + + diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index 7c885863ff..a6960f733d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -42,68 +42,6 @@ * These utilities are mostly used to convert ColumnVectors into other formats. */ public class ColumnVectorUtils { - /** - * Populates the entire `col` with `row[fieldIdx]` - */ - public static void populate(WritableColumnVector col, InternalRow row, int fieldIdx) { - int capacity = col.capacity; - DataType t = col.dataType(); - - if (row.isNullAt(fieldIdx)) { - col.putNulls(0, capacity); - } else { - if (t == DataTypes.BooleanType) { - col.putBooleans(0, capacity, row.getBoolean(fieldIdx)); - } else if (t == DataTypes.BinaryType) { - col.putByteArray(0, row.getBinary(fieldIdx)); - } else if (t == DataTypes.ByteType) { - col.putBytes(0, capacity, row.getByte(fieldIdx)); - } else if (t == DataTypes.ShortType) { - col.putShorts(0, capacity, row.getShort(fieldIdx)); - } else if (t == DataTypes.IntegerType) { - col.putInts(0, capacity, row.getInt(fieldIdx)); - } else if (t == DataTypes.LongType) { - col.putLongs(0, capacity, row.getLong(fieldIdx)); - } else if (t == DataTypes.FloatType) { - col.putFloats(0, capacity, row.getFloat(fieldIdx)); - } else if (t == DataTypes.DoubleType) { - col.putDoubles(0, capacity, row.getDouble(fieldIdx)); - } else if (t == DataTypes.StringType) { - UTF8String v = row.getUTF8String(fieldIdx); - byte[] bytes = v.getBytes(); - for (int i = 0; i < capacity; i++) { - col.putByteArray(i, bytes); - } - } else if (t instanceof DecimalType) { - DecimalType dt = (DecimalType)t; - Decimal d = row.getDecimal(fieldIdx, dt.precision(), dt.scale()); - if (dt.precision() <= Decimal.MAX_INT_DIGITS()) { - col.putInts(0, capacity, 
(int)d.toUnscaledLong()); - } else if (dt.precision() <= Decimal.MAX_LONG_DIGITS()) { - col.putLongs(0, capacity, d.toUnscaledLong()); - } else { - final BigInteger integer = d.toJavaBigDecimal().unscaledValue(); - byte[] bytes = integer.toByteArray(); - for (int i = 0; i < capacity; i++) { - col.putByteArray(i, bytes, 0, bytes.length); - } - } - } else if (t instanceof CalendarIntervalType) { - CalendarInterval c = (CalendarInterval)row.get(fieldIdx, t); - col.getChild(0).putInts(0, capacity, c.months); - col.getChild(1).putInts(0, capacity, c.days); - col.getChild(2).putLongs(0, capacity, c.microseconds); - } else if (t instanceof DateType || t instanceof YearMonthIntervalType) { - col.putInts(0, capacity, row.getInt(fieldIdx)); - } else if (t instanceof TimestampType || t instanceof TimestampNTZType || - t instanceof DayTimeIntervalType) { - col.putLongs(0, capacity, row.getLong(fieldIdx)); - } else { - throw new RuntimeException(String.format("DataType %s is not supported" + - " in column vectorized reader.", t.sql())); - } - } - } /** * Populates the value of `row[fieldIdx]` into `ConstantColumnVector`. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala index 1f6cb678f1..59f8099cbe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/interface.scala @@ -168,7 +168,8 @@ class Column( * A user-defined function in Spark, as returned by `listFunctions` method in [[Catalog]]. * * @param name name of the function. - * @param database name of the database the function belongs to. + * @param catalog name of the catalog that the function belongs to. + * @param namespace the namespace that the function belongs to. * @param description description of the function; description can be null. * @param className the fully qualified class name of the function.
* @param isTemporary whether the function is a temporary function or not. @@ -177,12 +178,26 @@ class Column( @Stable class Function( val name: String, - @Nullable val database: String, + @Nullable val catalog: String, + @Nullable val namespace: Array[String], @Nullable val description: String, val className: String, val isTemporary: Boolean) extends DefinedByConstructorParams { + def this( + name: String, + database: String, + description: String, + className: String, + isTemporary: Boolean) = { + this(name, null, Array(database), description, className, isTemporary) + } + + def database: String = { + if (namespace != null && namespace.length == 1) namespace(0) else null + } + override def toString: String = { "Function[" + s"name='$name', " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index e629b7129b..3e39863f5b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -400,13 +400,8 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) throw QueryCompilationErrors.missingCatalogAbilityError(catalog, "functions") } - case ShowFunctions(ns: ResolvedNamespace, userScope, systemScope, pattern, output) => - ns match { - case DatabaseInSessionCatalog(db) => - ShowFunctionsCommand(db, pattern, userScope, systemScope, output) - case _ => - throw QueryCompilationErrors.missingCatalogAbilityError(ns.catalog, "functions") - } + case ShowFunctions(DatabaseInSessionCatalog(db), userScope, systemScope, pattern, output) => + ShowFunctionsCommand(db, pattern, userScope, systemScope, output) case DropFunction(ResolvedPersistentFunc(catalog, identifier, _), ifExists) => if (isSessionCatalog(catalog)) { diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index fc7cc10978..44a5ba4a54 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -662,7 +662,7 @@ case class AdaptiveSparkPlanExec( // node to prevent the loss of the `BroadcastExchangeExec` node in DPP subquery. // Here, we also need to avoid to insert the `BroadcastExchangeExec` node when the newPlan is // already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule. - val finalPlan = currentPhysicalPlan match { + val finalPlan = inputPlan match { case b: BroadcastExchangeLike if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan)) case _ => newPlan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 8882261d96..7f30300a39 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -41,7 +41,7 @@ import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 import org.apache.spark.sql.catalyst.util.{ResolveDefaultColumns, V2ExpressionBuilder} import org.apache.spark.sql.connector.catalog.SupportsRead import org.apache.spark.sql.connector.catalog.TableCapability._ -import org.apache.spark.sql.connector.expressions.{Expression => V2Expression, FieldReference, NullOrdering, SortDirection, SortOrder => V2SortOrder, SortValue} +import org.apache.spark.sql.connector.expressions.{Expression => V2Expression, NullOrdering, SortDirection, SortOrder => V2SortOrder, SortValue} import 
org.apache.spark.sql.connector.expressions.aggregate.{AggregateFunc, Aggregation, Avg, Count, CountStar, GeneralAggregateFunc, Max, Min, Sum, UserDefinedAggregateFunc} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.{InSubqueryExec, RowDataSourceScanExec, SparkPlan} @@ -727,30 +727,28 @@ object DataSourceStrategy } case aggregate.Sum(PushableExpression(expr), _) => Some(new Sum(expr, agg.isDistinct)) case aggregate.Average(PushableExpression(expr), _) => Some(new Avg(expr, agg.isDistinct)) - case aggregate.VariancePop(PushableColumnWithoutNestedColumn(name), _) => - Some(new GeneralAggregateFunc( - "VAR_POP", agg.isDistinct, Array(FieldReference.column(name)))) - case aggregate.VarianceSamp(PushableColumnWithoutNestedColumn(name), _) => - Some(new GeneralAggregateFunc( - "VAR_SAMP", agg.isDistinct, Array(FieldReference.column(name)))) - case aggregate.StddevPop(PushableColumnWithoutNestedColumn(name), _) => - Some(new GeneralAggregateFunc( - "STDDEV_POP", agg.isDistinct, Array(FieldReference.column(name)))) - case aggregate.StddevSamp(PushableColumnWithoutNestedColumn(name), _) => - Some(new GeneralAggregateFunc( - "STDDEV_SAMP", agg.isDistinct, Array(FieldReference.column(name)))) - case aggregate.CovPopulation(PushableColumnWithoutNestedColumn(left), - PushableColumnWithoutNestedColumn(right), _) => - Some(new GeneralAggregateFunc("COVAR_POP", agg.isDistinct, - Array(FieldReference.column(left), FieldReference.column(right)))) - case aggregate.CovSample(PushableColumnWithoutNestedColumn(left), - PushableColumnWithoutNestedColumn(right), _) => - Some(new GeneralAggregateFunc("COVAR_SAMP", agg.isDistinct, - Array(FieldReference.column(left), FieldReference.column(right)))) - case aggregate.Corr(PushableColumnWithoutNestedColumn(left), - PushableColumnWithoutNestedColumn(right), _) => - Some(new GeneralAggregateFunc("CORR", agg.isDistinct, - Array(FieldReference.column(left), FieldReference.column(right)))) + case 
aggregate.VariancePop(PushableExpression(expr), _) => + Some(new GeneralAggregateFunc("VAR_POP", agg.isDistinct, Array(expr))) + case aggregate.VarianceSamp(PushableExpression(expr), _) => + Some(new GeneralAggregateFunc("VAR_SAMP", agg.isDistinct, Array(expr))) + case aggregate.StddevPop(PushableExpression(expr), _) => + Some(new GeneralAggregateFunc("STDDEV_POP", agg.isDistinct, Array(expr))) + case aggregate.StddevSamp(PushableExpression(expr), _) => + Some(new GeneralAggregateFunc("STDDEV_SAMP", agg.isDistinct, Array(expr))) + case aggregate.CovPopulation(PushableExpression(left), PushableExpression(right), _) => + Some(new GeneralAggregateFunc("COVAR_POP", agg.isDistinct, Array(left, right))) + case aggregate.CovSample(PushableExpression(left), PushableExpression(right), _) => + Some(new GeneralAggregateFunc("COVAR_SAMP", agg.isDistinct, Array(left, right))) + case aggregate.Corr(PushableExpression(left), PushableExpression(right), _) => + Some(new GeneralAggregateFunc("CORR", agg.isDistinct, Array(left, right))) + case aggregate.RegrIntercept(PushableExpression(left), PushableExpression(right)) => + Some(new GeneralAggregateFunc("REGR_INTERCEPT", agg.isDistinct, Array(left, right))) + case aggregate.RegrR2(PushableExpression(left), PushableExpression(right)) => + Some(new GeneralAggregateFunc("REGR_R2", agg.isDistinct, Array(left, right))) + case aggregate.RegrSlope(PushableExpression(left), PushableExpression(right)) => + Some(new GeneralAggregateFunc("REGR_SLOPE", agg.isDistinct, Array(left, right))) + case aggregate.RegrSXY(PushableExpression(left), PushableExpression(right)) => + Some(new GeneralAggregateFunc("REGR_SXY", agg.isDistinct, Array(left, right))) case aggregate.V2Aggregator(aggrFunc, children, _, _) => val translatedExprs = children.flatMap(PushableExpression.unapply(_)) if (translatedExprs.length == children.length) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index e856bb5b9c..3cc69656bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -61,7 +61,7 @@ object PartitionSpec { val emptySpec = PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[PartitionPath]) } -object PartitioningUtils extends SQLConfHelper{ +object PartitioningUtils extends SQLConfHelper { val timestampPartitionPattern = "yyyy-MM-dd HH:mm:ss[.S]" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index c0fa3e2ba6..b678effbea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -485,6 +485,15 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat s"DropIndex is not supported in this table ${table.name}.") } + case ShowFunctions(ResolvedNamespace(catalog, ns), userScope, systemScope, pattern, output) => + ShowFunctionsExec( + output, + catalog.asFunctionCatalog, + ns, + userScope, + systemScope, + pattern) :: Nil + case _ => Nil } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala index 2e71428bca..7f9a62f42d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala @@ -23,6 +23,7 @@ import scala.collection.mutable.ArrayBuffer import 
org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces} +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ /** * Physical plan node for describing a namespace. @@ -38,7 +39,7 @@ case class DescribeNamespaceExec( val metadata = catalog.loadNamespaceMetadata(ns) rows += toCatalystRow("Catalog Name", catalog.name()) - rows += toCatalystRow("Namespace Name", ns.last) + rows += toCatalystRow("Namespace Name", ns.quoted) CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.foreach { p => rows ++= Option(metadata.get(p)).map(toCatalystRow(p.capitalize, _)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowFunctionsExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowFunctionsExec.scala new file mode 100644 index 0000000000..5ca0b01d42 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowFunctionsExec.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.v2 + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TableFunctionRegistry} +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.util.StringUtils +import org.apache.spark.sql.connector.catalog.FunctionCatalog +import org.apache.spark.sql.execution.LeafExecNode + +/** + * Physical plan node for showing functions. + */ +case class ShowFunctionsExec( + output: Seq[Attribute], + catalog: FunctionCatalog, + namespace: Seq[String], + userScope: Boolean, + systemScope: Boolean, + pattern: Option[String]) extends V2CommandExec with LeafExecNode { + + override protected def run(): Seq[InternalRow] = { + val rows = new ArrayBuffer[InternalRow]() + val systemFunctions = if (systemScope) { + // All built-in functions + (FunctionRegistry.functionSet ++ TableFunctionRegistry.functionSet).map(_.unquotedString) ++ + // Hard code "<>", "!=", "between", "case", and "||" + // for now as there are no corresponding functions. 
+ // "<>", "!=", "between", "case", and "||" are system functions, + // only shown when systemScope=true + FunctionRegistry.builtinOperators.keys.toSeq + } else Seq.empty + val userFunctions = if (userScope) { + // List all temporary functions in the session catalog + session.sessionState.catalog.listTemporaryFunctions().map(_.unquotedString) ++ + // List all functions registered in the given namespace of the catalog + catalog.listFunctions(namespace.toArray).map(_.name()).toSeq + } else Seq.empty + val allFunctions = StringUtils.filterPattern( + userFunctions ++ systemFunctions, + pattern.getOrElse("*")).distinct.sorted + + allFunctions.foreach { fn => + rows += toCatalystRow(fn) + } + + rows.toSeq + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala index 2e3b9b20b5..39a81e6563 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala @@ -39,7 +39,7 @@ case class ParquetScanBuilder( dataSchema: StructType, options: CaseInsensitiveStringMap) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) - with SupportsPushDownAggregates{ + with SupportsPushDownAggregates { lazy val hadoopConf = { val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap // Hadoop Configurations are case sensitive. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala index dbba19002c..49f703fddb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala @@ -283,8 +283,13 @@ case class TakeOrderedAndProjectExec( } override def executeCollect(): Array[InternalRow] = { + val orderingSatisfies = SortOrder.orderingSatisfies(child.outputOrdering, sortOrder) val ord = new LazilyGeneratedOrdering(sortOrder, child.output) - val limited = child.execute().mapPartitionsInternal(_.map(_.copy())).takeOrdered(limit)(ord) + val limited = if (orderingSatisfies) { + child.execute().mapPartitionsInternal(_.map(_.copy()).take(limit)).takeOrdered(limit)(ord) + } else { + child.execute().mapPartitionsInternal(_.map(_.copy())).takeOrdered(limit)(ord) + } val data = if (offset > 0) limited.drop(offset) else limited if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) @@ -303,6 +308,7 @@ case class TakeOrderedAndProjectExec( override lazy val metrics = readMetrics ++ writeMetrics protected override def doExecute(): RDD[InternalRow] = { + val orderingSatisfies = SortOrder.orderingSatisfies(child.outputOrdering, sortOrder) val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val childRDD = child.execute() if (childRDD.getNumPartitions == 0) { @@ -311,8 +317,12 @@ case class TakeOrderedAndProjectExec( val singlePartitionRDD = if (childRDD.getNumPartitions == 1) { childRDD } else { - val localTopK = childRDD.mapPartitionsInternal { iter => - Utils.takeOrdered(iter.map(_.copy()), limit)(ord) + val localTopK = if (orderingSatisfies) { + childRDD.mapPartitionsInternal(_.map(_.copy()).take(limit)) + } else { + childRDD.mapPartitionsInternal { iter => + Utils.takeOrdered(iter.map(_.copy()), limit)(ord) + } } new ShuffledRowRDD( diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala index 0c8cabb75e..80384f8cb3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala @@ -41,20 +41,34 @@ class StateSchemaCompatibilityChecker( fm.mkdirs(schemaFileLocation.getParent) def check(keySchema: StructType, valueSchema: StructType): Unit = { + check(keySchema, valueSchema, ignoreValueSchema = false) + } + + def check(keySchema: StructType, valueSchema: StructType, ignoreValueSchema: Boolean): Unit = { if (fm.exists(schemaFileLocation)) { logDebug(s"Schema file for provider $providerId exists. Comparing with provided schema.") val (storedKeySchema, storedValueSchema) = readSchemaFile() - if (storedKeySchema.equals(keySchema) && storedValueSchema.equals(valueSchema)) { + if (storedKeySchema.equals(keySchema) && + (ignoreValueSchema || storedValueSchema.equals(valueSchema))) { // schema is exactly same } else if (!schemasCompatible(storedKeySchema, keySchema) || - !schemasCompatible(storedValueSchema, valueSchema)) { + (!ignoreValueSchema && !schemasCompatible(storedValueSchema, valueSchema))) { + val errorMsgForKeySchema = s"- Provided key schema: $keySchema\n" + + s"- Existing key schema: $storedKeySchema\n" + + // If it is requested to skip checking the value schema, we also don't expose the value + // schema information to the error message. + val errorMsgForValueSchema = if (!ignoreValueSchema) { + s"- Provided value schema: $valueSchema\n" + + s"- Existing value schema: $storedValueSchema\n" + } else { + "" + } val errorMsg = "Provided schema doesn't match to the schema for existing state! 
" + "Please note that Spark allow difference of field name: check count of fields " + "and data type of each field.\n" + - s"- Provided key schema: $keySchema\n" + - s"- Provided value schema: $valueSchema\n" + - s"- Existing key schema: $storedKeySchema\n" + - s"- Existing value schema: $storedValueSchema\n" + + errorMsgForKeySchema + + errorMsgForValueSchema + s"If you want to force running query without schema validation, please set " + s"${SQLConf.STATE_SCHEMA_CHECK_ENABLED.key} to false.\n" + "Please note running query with incompatible schema could cause indeterministic" + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index 64c9cd11f9..203cb2a287 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -515,7 +515,12 @@ object StateStore extends Logging { val checker = new StateSchemaCompatibilityChecker(storeProviderId, hadoopConf) // regardless of configuration, we check compatibility to at least write schema file // if necessary - val ret = Try(checker.check(keySchema, valueSchema)).toEither.fold(Some(_), _ => None) + // if the format validation for value schema is disabled, we also disable the schema + // compatibility checker for value schema as well. 
+ val ret = Try( + checker.check(keySchema, valueSchema, + ignoreValueSchema = !storeConf.formatValidationCheckValue) + ).toEither.fold(Some(_), _ => None) if (storeConf.stateSchemaCheckEnabled) { ret } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala index 529db2609c..66bb37d7a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala @@ -48,7 +48,12 @@ class StateStoreConf( /** Whether validate the underlying format or not. */ val formatValidationEnabled: Boolean = sqlConf.stateStoreFormatValidationEnabled - /** Whether validate the value format when the format invalidation enabled. */ + /** + * Whether to validate the value side. This config is applied to both validators as below: + * + * - whether to validate the value format when the format validation is enabled. + * - whether to validate the value schema when the state schema check is enabled. 
+ */ val formatValidationCheckValue: Boolean = extraOptions.getOrElse(StateStoreConf.FORMAT_VALIDATION_CHECK_VALUE_CONFIG, "true") == "true" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala index e2a0598644..2b8fc65156 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala @@ -783,13 +783,15 @@ case class StreamingDeduplicateExec( keyExpressions, getStateInfo, conf) :: Nil } + private val schemaForEmptyRow: StructType = StructType(Array(StructField("__dummy__", NullType))) + override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver child.execute().mapPartitionsWithStateStore( getStateInfo, keyExpressions.toStructType, - child.output.toStructType, + schemaForEmptyRow, numColsPrefixKey = 0, session.sessionState, Some(session.streams.stateStoreCoordinator), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala index 93f48ab199..32aa13a29c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala @@ -94,7 +94,7 @@ class WindowSpec private[sql]( * An offset indicates the number of rows above or below the current row, the frame for the * current row starts or ends. For instance, given a row based sliding frame with a lower bound * offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from - * index 4 to index 6. + * index 4 to index 7. 
* * {{{ * import org.apache.spark.sql.expressions.Window diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index 742ca5ccb1..880c084ab6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -23,16 +23,17 @@ import scala.util.control.NonFatal import org.apache.spark.sql._ import org.apache.spark.sql.catalog.{Catalog, CatalogMetadata, Column, Database, Function, Table} import org.apache.spark.sql.catalyst.{DefinedByConstructorParams, FunctionIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{ResolvedNamespace, ResolvedTable, ResolvedView, UnresolvedDBObjectName, UnresolvedNamespace, UnresolvedTable, UnresolvedTableOrView} +import org.apache.spark.sql.catalyst.analysis.{ResolvedNamespace, ResolvedNonPersistentFunc, ResolvedPersistentFunc, ResolvedTable, ResolvedView, UnresolvedDBObjectName, UnresolvedFunc, UnresolvedNamespace, UnresolvedTable, UnresolvedTableOrView} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.plans.logical.{CreateTable, LocalRelation, RecoverPartitions, ShowNamespaces, ShowTables, SubqueryAlias, TableSpec, View} +import org.apache.spark.sql.catalyst.plans.logical.{CreateTable, LocalRelation, RecoverPartitions, ShowFunctions, ShowNamespaces, ShowTables, SubqueryAlias, TableSpec, View} import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, Identifier, SupportsNamespaces, TableCatalog} import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.{CatalogHelper, IdentifierHelper, MultipartIdentifierHelper, TransformHelper} import org.apache.spark.sql.errors.QueryCompilationErrors import 
org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.internal.connector.V1Function import org.apache.spark.sql.types.StructType import org.apache.spark.storage.StorageLevel @@ -59,15 +60,18 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { /** * Returns the current default database in this session. */ - override def currentDatabase: String = sessionCatalog.getCurrentDatabase + override def currentDatabase: String = + sparkSession.sessionState.catalogManager.currentNamespace.toSeq.quoted /** * Sets the current default database in this session. */ @throws[AnalysisException]("database does not exist") override def setCurrentDatabase(dbName: String): Unit = { - requireDatabaseExists(dbName) - sessionCatalog.setCurrentDatabase(dbName) + // we assume dbName will not include the catalog prefix. e.g. if you call + // setCurrentDatabase("catalog.db") it will search for a database catalog.db in the catalog. + val ident = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(dbName) + sparkSession.sessionState.catalogManager.setCurrentNamespace(ident.toArray) } /** @@ -191,11 +195,40 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { */ @throws[AnalysisException]("database does not exist") override def listFunctions(dbName: String): Dataset[Function] = { - requireDatabaseExists(dbName) - val functions = sessionCatalog.listFunctions(dbName).map { case (functIdent, _) => - makeFunction(functIdent) + // `dbName` could be either a single database name (behavior in Spark 3.3 and prior) or + // a qualified namespace with catalog name. We assume it's a single database name + // and check if we can find the dbName in sessionCatalog. If so we listFunctions under + // that database. Otherwise we try 3-part name parsing and locate the database. 
+ if (sessionCatalog.databaseExists(dbName)) { + val functions = sessionCatalog.listFunctions(dbName) + .map { case (functIdent, _) => makeFunction(functIdent) } + CatalogImpl.makeDataset(functions, sparkSession) + } else { + val ident = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(dbName) + val functions = collection.mutable.ArrayBuilder.make[Function] + + // built-in functions + val plan0 = ShowFunctions(UnresolvedNamespace(ident), + userScope = false, systemScope = true, None) + sparkSession.sessionState.executePlan(plan0).toRdd.collect().foreach { row => + // `lookupBuiltinOrTempFunction` and `lookupBuiltinOrTempTableFunction` in Analyzer + // require the input identifier only contains the function name, otherwise, built-in + // functions will be skipped. + val name = row.getString(0) + functions += makeFunction(Seq(name)) + } + + // user functions + val plan1 = ShowFunctions(UnresolvedNamespace(ident), + userScope = true, systemScope = false, None) + sparkSession.sessionState.executePlan(plan1).toRdd.collect().foreach { row => + // `row.getString(0)` may contain dbName like `db.function`, so extract the function name. 
+ val name = row.getString(0).split("\\.").last + functions += makeFunction(ident :+ name) + } + + CatalogImpl.makeDataset(functions.result(), sparkSession) } - CatalogImpl.makeDataset(functions, sparkSession) } private def makeFunction(funcIdent: FunctionIdentifier): Function = { @@ -208,6 +241,39 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { isTemporary = metadata.getDb == null) } + private def makeFunction(ident: Seq[String]): Function = { + val plan = UnresolvedFunc(ident, "Catalog.makeFunction", false, None) + sparkSession.sessionState.executePlan(plan).analyzed match { + case f: ResolvedPersistentFunc => + val className = f.func match { + case f: V1Function => f.info.getClassName + case f => f.getClass.getName + } + new Function( + name = f.identifier.name(), + catalog = f.catalog.name(), + namespace = f.identifier.namespace(), + description = f.func.description(), + className = className, + isTemporary = false) + + case f: ResolvedNonPersistentFunc => + val className = f.func match { + case f: V1Function => f.info.getClassName + case f => f.getClass.getName + } + new Function( + name = f.name, + catalog = null, + namespace = null, + description = f.func.description(), + className = className, + isTemporary = true) + + case _ => throw QueryCompilationErrors.noSuchFunctionError(ident, plan) + } + } + /** * Returns a list of columns for the given table/view or temporary view. */ @@ -377,8 +443,19 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { * function. This throws an `AnalysisException` when no `Function` can be found. */ override def getFunction(functionName: String): Function = { - val functionIdent = sparkSession.sessionState.sqlParser.parseFunctionIdentifier(functionName) - getFunction(functionIdent.database.orNull, functionIdent.funcName) + // calling `sqlParser.parseFunctionIdentifier` to parse functionName. 
If it contains only + // function name and optionally contains a database name(thus a FunctionIdentifier), then + // we look up the function in sessionCatalog. + // Otherwise we try `sqlParser.parseMultipartIdentifier` to have a sequence of string as + // the qualified identifier and resolve the function through SQL analyzer. + try { + val ident = sparkSession.sessionState.sqlParser.parseFunctionIdentifier(functionName) + getFunction(ident.database.orNull, ident.funcName) + } catch { + case e: org.apache.spark.sql.catalyst.parser.ParseException => + val ident = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(functionName) + makeFunction(ident) + } } /** @@ -440,8 +517,23 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { * or a function. */ override def functionExists(functionName: String): Boolean = { - val functionIdent = sparkSession.sessionState.sqlParser.parseFunctionIdentifier(functionName) - functionExists(functionIdent.database.orNull, functionIdent.funcName) + try { + val ident = sparkSession.sessionState.sqlParser.parseFunctionIdentifier(functionName) + functionExists(ident.database.orNull, ident.funcName) + } catch { + case e: org.apache.spark.sql.catalyst.parser.ParseException => + try { + val ident = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(functionName) + val plan = UnresolvedFunc(ident, "Catalog.functionExists", false, None) + sparkSession.sessionState.executePlan(plan).analyzed match { + case _: ResolvedPersistentFunc => true + case _: ResolvedNonPersistentFunc => true + case _ => false + } + } catch { + case _: org.apache.spark.sql.AnalysisException => false + } + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala index 124cb001b5..1202f51ef9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala @@ 
-62,18 +62,27 @@ private[sql] object H2Dialect extends JdbcDialect { assert(f.children().length == 1) val distinct = if (f.isDistinct) "DISTINCT " else "" Some(s"STDDEV_SAMP($distinct${f.children().head})") - case f: GeneralAggregateFunc if f.name() == "COVAR_POP" => + case f: GeneralAggregateFunc if f.name() == "COVAR_POP" && !f.isDistinct => assert(f.children().length == 2) - val distinct = if (f.isDistinct) "DISTINCT " else "" - Some(s"COVAR_POP($distinct${f.children().head}, ${f.children().last})") - case f: GeneralAggregateFunc if f.name() == "COVAR_SAMP" => + Some(s"COVAR_POP(${f.children().head}, ${f.children().last})") + case f: GeneralAggregateFunc if f.name() == "COVAR_SAMP" && !f.isDistinct => assert(f.children().length == 2) - val distinct = if (f.isDistinct) "DISTINCT " else "" - Some(s"COVAR_SAMP($distinct${f.children().head}, ${f.children().last})") - case f: GeneralAggregateFunc if f.name() == "CORR" => + Some(s"COVAR_SAMP(${f.children().head}, ${f.children().last})") + case f: GeneralAggregateFunc if f.name() == "CORR" && !f.isDistinct => assert(f.children().length == 2) - val distinct = if (f.isDistinct) "DISTINCT " else "" - Some(s"CORR($distinct${f.children().head}, ${f.children().last})") + Some(s"CORR(${f.children().head}, ${f.children().last})") + case f: GeneralAggregateFunc if f.name() == "REGR_INTERCEPT" && !f.isDistinct => + assert(f.children().length == 2) + Some(s"REGR_INTERCEPT(${f.children().head}, ${f.children().last})") + case f: GeneralAggregateFunc if f.name() == "REGR_R2" && !f.isDistinct => + assert(f.children().length == 2) + Some(s"REGR_R2(${f.children().head}, ${f.children().last})") + case f: GeneralAggregateFunc if f.name() == "REGR_SLOPE" && !f.isDistinct => + assert(f.children().length == 2) + Some(s"REGR_SLOPE(${f.children().head}, ${f.children().last})") + case f: GeneralAggregateFunc if f.name() == "REGR_SXY" && !f.isDistinct => + assert(f.children().length == 2) + Some(s"REGR_SXY(${f.children().head}, 
${f.children().last})") case _ => None } ) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index fa2a45be18..1c4d2cf0ae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -76,7 +76,7 @@ case class JdbcType(databaseTypeDefinition : String, jdbcNullType : Int) * for the given Catalyst type. */ @DeveloperApi -abstract class JdbcDialect extends Serializable with Logging{ +abstract class JdbcDialect extends Serializable with Logging { /** * Check if this dialect instance can handle a certain jdbc url. * @param url the jdbc url. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala index 24f9bac74f..c4cb5369af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala @@ -206,7 +206,7 @@ private case object MySQLDialect extends JdbcDialect with SQLConfHelper { val indexName = rs.getString("key_name") val colName = rs.getString("column_name") val indexType = rs.getString("index_type") - val indexComment = rs.getString("Index_comment") + val indexComment = rs.getString("index_comment") if (indexMap.contains(indexName)) { val index = indexMap.get(indexName).get val newIndex = new TableIndex(indexName, indexType, diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 369594c902..0305781a48 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -241,6 +241,7 @@ | org.apache.spark.sql.catalyst.expressions.RegExpExtract | regexp_extract | SELECT regexp_extract('100-200', 
'(\\d+)-(\\d+)', 1) | struct | | org.apache.spark.sql.catalyst.expressions.RegExpExtractAll | regexp_extract_all | SELECT regexp_extract_all('100-200, 300-400', '(\\d+)-(\\d+)', 1) | struct> | | org.apache.spark.sql.catalyst.expressions.RegExpReplace | regexp_replace | SELECT regexp_replace('100-200', '(\\d+)', 'num') | struct | +| org.apache.spark.sql.catalyst.expressions.RegExpSubStr | regexp_substr | SELECT regexp_substr('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en') | struct | | org.apache.spark.sql.catalyst.expressions.Remainder | % | SELECT 2 % 1.8 | struct<(2 % 1.8):decimal(2,1)> | | org.apache.spark.sql.catalyst.expressions.Remainder | mod | SELECT 2 % 1.8 | struct<(2 % 1.8):decimal(2,1)> | | org.apache.spark.sql.catalyst.expressions.Reverse | reverse | SELECT reverse('Spark SQL') | struct | @@ -317,7 +318,6 @@ | org.apache.spark.sql.catalyst.expressions.TryMultiply | try_multiply | SELECT try_multiply(2, 3) | struct | | org.apache.spark.sql.catalyst.expressions.TrySubtract | try_subtract | SELECT try_subtract(2, 1) | struct | | org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | SELECT try_to_binary('abc', 'utf-8') | struct | -| org.apache.spark.sql.catalyst.expressions.TryToCharacter | try_to_char | SELECT try_to_char(454, '999') | struct | | org.apache.spark.sql.catalyst.expressions.TryToNumber | try_to_number | SELECT try_to_number('454', '999') | struct | | org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) | struct | | org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT unbase64('U3BhcmsgU1FM') | struct | diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql index a48cfe6848..d828d761b7 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -60,3 +60,12 @@ SELECT 
regexp_count('the fox', '(?i)FOX'); SELECT regexp_count('passwd7 plain A1234 a1234', '(?=[^ ]*[a-z])(?=[^ ]*[0-9])[^ ]+'); SELECT regexp_count(null, 'abc'); SELECT regexp_count('abc', null); + +-- regexp_substr +SELECT regexp_substr('1a 2b 14m', '\\d+'); +SELECT regexp_substr('1a 2b 14m', '\\d+ '); +SELECT regexp_substr('1a 2b 14m', '\\d+(a|b|m)'); +SELECT regexp_substr('1a 2b 14m', '\\d{2}(a|b|m)'); +SELECT regexp_substr('1a 2b 14m', ''); +SELECT regexp_substr('Spark', null); +SELECT regexp_substr(null, '.*'); diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out index a2eb2b2a14..c82b892f48 100644 --- a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -435,3 +435,59 @@ SELECT regexp_count('abc', null) struct -- !query output NULL + + +-- !query +SELECT regexp_substr('1a 2b 14m', '\\d+') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_substr('1a 2b 14m', '\\d+ ') +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT regexp_substr('1a 2b 14m', '\\d+(a|b|m)') +-- !query schema +struct +-- !query output +1a + + +-- !query +SELECT regexp_substr('1a 2b 14m', '\\d{2}(a|b|m)') +-- !query schema +struct +-- !query output +14m + + +-- !query +SELECT regexp_substr('1a 2b 14m', '') +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT regexp_substr('Spark', null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT regexp_substr(null, '.*') +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/.0.crc new file mode 100644 index 
0000000000000000000000000000000000000000..1aee7033161ecac53eda98ef9b64746c31483c89 GIT binary patch literal 12 TcmYc;N@ieSU}E^Jwf`;v6eR=n literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/.1.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/.1.crc new file mode 100644 index 0000000000000000000000000000000000000000..1aee7033161ecac53eda98ef9b64746c31483c89 GIT binary patch literal 12 TcmYc;N@ieSU}E^Jwf`;v6eR=n literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/0 new file mode 100644 index 0000000000..9c1e3021c3 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/1 new file mode 100644 index 0000000000..9c1e3021c3 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/commits/1 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/metadata new file mode 100644 index 0000000000..78bd74a789 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/metadata @@ -0,0 +1 @@ 
+{"id":"33e8de33-00b8-4b60-8246-df2f433257ff"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/.0.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/.0.crc new file mode 100644 index 0000000000000000000000000000000000000000..726c678bc6a292057ba9ba7f414c1237c614317d GIT binary patch literal 16 XcmYc;N@ieSU}D&{nl;z&)w+BDBM%^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/0 new file mode 100644 index 0000000000..443c682435 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/0 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1656644489789,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5"}} +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/1 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/1 new file mode 100644 index 0000000000..67b4217556 --- /dev/null +++ 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/offsets/1 @@ -0,0 +1,3 @@ +v1 +{"batchWatermarkMs":0,"batchTimestampMs":1656644492462,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5"}} +1 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/.1.delta.crc new file mode 100644 index 0000000000000000000000000000000000000000..1992982c58ff232b862a5e00e92235b8895264db GIT binary patch literal 12 TcmYc;N@ieSU}BKGxWp0w5!eF) literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/.2.delta.crc new file mode 100644 index 0000000000000000000000000000000000000000..cf1d68e2acee3bca2b92320c4bafc702a6539ea0 GIT binary patch literal 12 TcmYc;N@ieSU}A7peP;>)5flQ* literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/1.delta new file mode 100644 index 
0000000000000000000000000000000000000000..fec40e83a5471a5624119611a69d7bfdfc01a875 GIT binary patch literal 77 zcmeZ?GI7euPtI0VW?*120pe+5Z`p%^v;+eq!(>JvLjZ^qfdoGTg9rmV6GH&Qe<1LI K>JtUZ!T|s+-3_$> literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/2.delta new file mode 100644 index 0000000000000000000000000000000000000000..6352978051846970ca41a0ca97fd79952105726d GIT binary patch literal 46 icmeZ?GI7euPtF!)VPIeY;oA+q9RGp92POd&g989JFAHe^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/_metadata/.schema.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/_metadata/.schema.crc new file mode 100644 index 0000000000000000000000000000000000000000..022717c6b5016bdd850f6cd11ba7005aa18b2472 GIT binary patch literal 12 TcmYc;N@ieSU}9Ky+`0?^6IlaU literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/_metadata/schema b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/0/_metadata/schema new file mode 100644 index 0000000000000000000000000000000000000000..f132f9601b73a14dc3ce0a5f641c2c91bdd29dd2 GIT binary patch literal 254 zcmZQzDl=kWU|?j3s8%YeEJ#(dQYtPfDorj?(osswOwCCtRBVo9QsRdsD`Y%SRIX|yq22jMa!gv&DXN>bBP Oi%^`FmY7qF={5i=UQx&Z literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/.1.delta.crc new file mode 100644 index 
0000000000000000000000000000000000000000..cf1d68e2acee3bca2b92320c4bafc702a6539ea0 GIT binary patch literal 12 TcmYc;N@ieSU}A7peP;>)5flQ* literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/.2.delta.crc new file mode 100644 index 0000000000000000000000000000000000000000..d18b77b93aff2c88bcfd28423b9f0322d1925578 GIT binary patch literal 12 TcmYc;N@ieSU}9kUlr$Fr5x)a2 literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/1.delta new file mode 100644 index 0000000000000000000000000000000000000000..6352978051846970ca41a0ca97fd79952105726d GIT binary patch literal 46 icmeZ?GI7euPtF!)VPIeY;oA+q9RGp92POd&g989JFAHe^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/1/2.delta new file mode 100644 index 0000000000000000000000000000000000000000..fcbf8df80f5f9699414708afa88c5515e92a5b12 GIT binary patch literal 77 zcmeZ?GI7euPtI0VW?*120b;l7-}r)ov;+eq!(>JvLjZ`AfCN7Sg9rmV6GH&Qe<1LI K>JtUZ!T|tFSPnM; literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/.1.delta.crc new file mode 100644 index 0000000000000000000000000000000000000000..cf1d68e2acee3bca2b92320c4bafc702a6539ea0 GIT binary patch literal 12 TcmYc;N@ieSU}A7peP;>)5flQ* literal 0 HcmV?d00001 diff 
--git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/.2.delta.crc new file mode 100644 index 0000000000000000000000000000000000000000..cf1d68e2acee3bca2b92320c4bafc702a6539ea0 GIT binary patch literal 12 TcmYc;N@ieSU}A7peP;>)5flQ* literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/1.delta new file mode 100644 index 0000000000000000000000000000000000000000..6352978051846970ca41a0ca97fd79952105726d GIT binary patch literal 46 icmeZ?GI7euPtF!)VPIeY;oA+q9RGp92POd&g989JFAHe^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/2/2.delta new file mode 100644 index 0000000000000000000000000000000000000000..6352978051846970ca41a0ca97fd79952105726d GIT binary patch literal 46 icmeZ?GI7euPtF!)VPIeY;oA+q9RGp92POd&g989JFAHe^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/.1.delta.crc new file mode 100644 index 0000000000000000000000000000000000000000..cf1d68e2acee3bca2b92320c4bafc702a6539ea0 GIT binary patch literal 12 TcmYc;N@ieSU}A7peP;>)5flQ* literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/.2.delta.crc 
b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/.2.delta.crc new file mode 100644 index 0000000000000000000000000000000000000000..cf1d68e2acee3bca2b92320c4bafc702a6539ea0 GIT binary patch literal 12 TcmYc;N@ieSU}A7peP;>)5flQ* literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/1.delta new file mode 100644 index 0000000000000000000000000000000000000000..6352978051846970ca41a0ca97fd79952105726d GIT binary patch literal 46 icmeZ?GI7euPtF!)VPIeY;oA+q9RGp92POd&g989JFAHe^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/3/2.delta new file mode 100644 index 0000000000000000000000000000000000000000..6352978051846970ca41a0ca97fd79952105726d GIT binary patch literal 46 icmeZ?GI7euPtF!)VPIeY;oA+q9RGp92POd&g989JFAHe^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/.1.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/.1.delta.crc new file mode 100644 index 0000000000000000000000000000000000000000..cf1d68e2acee3bca2b92320c4bafc702a6539ea0 GIT binary patch literal 12 TcmYc;N@ieSU}A7peP;>)5flQ* literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/.2.delta.crc b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/.2.delta.crc new file mode 100644 index 
0000000000000000000000000000000000000000..cf1d68e2acee3bca2b92320c4bafc702a6539ea0 GIT binary patch literal 12 TcmYc;N@ieSU}A7peP;>)5flQ* literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/1.delta new file mode 100644 index 0000000000000000000000000000000000000000..6352978051846970ca41a0ca97fd79952105726d GIT binary patch literal 46 icmeZ?GI7euPtF!)VPIeY;oA+q9RGp92POd&g989JFAHe^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/2.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/state/0/4/2.delta new file mode 100644 index 0000000000000000000000000000000000000000..6352978051846970ca41a0ca97fd79952105726d GIT binary patch literal 46 icmeZ?GI7euPtF!)VPIeY;oA+q9RGp92POd&g989JFAHe^ literal 0 HcmV?d00001 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.ansi/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.ansi/explain.txt index d281e59c72..905d29293a 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.ansi/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.ansi/explain.txt @@ -13,11 +13,11 @@ TakeOrderedAndProject (46) : : : +- * BroadcastHashJoin Inner BuildRight (8) : : : :- * Filter (3) : : : : +- * ColumnarToRow (2) - : : : : +- Scan parquet default.store_returns (1) + : : : : +- Scan parquet spark_catalog.default.store_returns (1) : : : +- BroadcastExchange (7) : : : +- * Filter (6) : : : +- * ColumnarToRow (5) - : : : +- Scan parquet default.item (4) + : : : +- Scan parquet spark_catalog.default.item (4) : : +- ReusedExchange (10) : +- BroadcastExchange (28) : +- * 
HashAggregate (27) @@ -29,7 +29,7 @@ TakeOrderedAndProject (46) : : +- * BroadcastHashJoin Inner BuildRight (20) : : :- * Filter (18) : : : +- * ColumnarToRow (17) - : : : +- Scan parquet default.catalog_returns (16) + : : : +- Scan parquet spark_catalog.default.catalog_returns (16) : : +- ReusedExchange (19) : +- ReusedExchange (22) +- BroadcastExchange (43) @@ -42,12 +42,12 @@ TakeOrderedAndProject (46) : +- * BroadcastHashJoin Inner BuildRight (35) : :- * Filter (33) : : +- * ColumnarToRow (32) - : : +- Scan parquet default.web_returns (31) + : : +- Scan parquet spark_catalog.default.web_returns (31) : +- ReusedExchange (34) +- ReusedExchange (37) -(1) Scan parquet default.store_returns +(1) Scan parquet spark_catalog.default.store_returns Output [3]: [sr_item_sk#1, sr_return_quantity#2, sr_returned_date_sk#3] Batched: true Location: InMemoryFileIndex [] @@ -62,7 +62,7 @@ Input [3]: [sr_item_sk#1, sr_return_quantity#2, sr_returned_date_sk#3] Input [3]: [sr_item_sk#1, sr_return_quantity#2, sr_returned_date_sk#3] Condition : isnotnull(sr_item_sk#1) -(4) Scan parquet default.item +(4) Scan parquet spark_catalog.default.item Output [2]: [i_item_sk#5, i_item_id#6] Batched: true Location [not included in comparison]/{warehouse_dir}/item] @@ -119,7 +119,7 @@ Functions [1]: [sum(sr_return_quantity#2)] Aggregate Attributes [1]: [sum(sr_return_quantity#2)#10] Results [2]: [i_item_id#6 AS item_id#11, sum(sr_return_quantity#2)#10 AS sr_item_qty#12] -(16) Scan parquet default.catalog_returns +(16) Scan parquet spark_catalog.default.catalog_returns Output [3]: [cr_item_sk#13, cr_return_quantity#14, cr_returned_date_sk#15] Batched: true Location: InMemoryFileIndex [] @@ -189,7 +189,7 @@ Join condition: None Output [3]: [item_id#11, sr_item_qty#12, cr_item_qty#23] Input [4]: [item_id#11, sr_item_qty#12, item_id#22, cr_item_qty#23] -(31) Scan parquet default.web_returns +(31) Scan parquet spark_catalog.default.web_returns Output [3]: [wr_item_sk#24, wr_return_quantity#25, 
wr_returned_date_sk#26] Batched: true Location: InMemoryFileIndex [] @@ -271,20 +271,20 @@ BroadcastExchange (62) +- * BroadcastHashJoin LeftSemi BuildRight (60) :- * Filter (49) : +- * ColumnarToRow (48) - : +- Scan parquet default.date_dim (47) + : +- Scan parquet spark_catalog.default.date_dim (47) +- BroadcastExchange (59) +- * Project (58) +- * BroadcastHashJoin LeftSemi BuildRight (57) :- * ColumnarToRow (51) - : +- Scan parquet default.date_dim (50) + : +- Scan parquet spark_catalog.default.date_dim (50) +- BroadcastExchange (56) +- * Project (55) +- * Filter (54) +- * ColumnarToRow (53) - +- Scan parquet default.date_dim (52) + +- Scan parquet spark_catalog.default.date_dim (52) -(47) Scan parquet default.date_dim +(47) Scan parquet spark_catalog.default.date_dim Output [2]: [d_date_sk#7, d_date#39] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] @@ -298,7 +298,7 @@ Input [2]: [d_date_sk#7, d_date#39] Input [2]: [d_date_sk#7, d_date#39] Condition : isnotnull(d_date_sk#7) -(50) Scan parquet default.date_dim +(50) Scan parquet spark_catalog.default.date_dim Output [2]: [d_date#40, d_week_seq#41] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] @@ -307,7 +307,7 @@ ReadSchema: struct (51) ColumnarToRow [codegen id : 2] Input [2]: [d_date#40, d_week_seq#41] -(52) Scan parquet default.date_dim +(52) Scan parquet spark_catalog.default.date_dim Output [2]: [d_date#42, d_week_seq#43] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.ansi/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.ansi/simplified.txt index 29ff19d745..f2e0a901c5 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.ansi/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.ansi/simplified.txt @@ -16,7 +16,7 @@ 
TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [sr_item_sk] ColumnarToRow InputAdapter - Scan parquet default.store_returns [sr_item_sk,sr_return_quantity,sr_returned_date_sk] + Scan parquet spark_catalog.default.store_returns [sr_item_sk,sr_return_quantity,sr_returned_date_sk] SubqueryBroadcast [d_date_sk] #1 BroadcastExchange #2 WholeStageCodegen (3) @@ -25,7 +25,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [d_date_sk] ColumnarToRow InputAdapter - Scan parquet default.date_dim [d_date_sk,d_date] + Scan parquet spark_catalog.default.date_dim [d_date_sk,d_date] InputAdapter BroadcastExchange #3 WholeStageCodegen (2) @@ -33,7 +33,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty BroadcastHashJoin [d_week_seq,d_week_seq] ColumnarToRow InputAdapter - Scan parquet default.date_dim [d_date,d_week_seq] + Scan parquet spark_catalog.default.date_dim [d_date,d_week_seq] InputAdapter BroadcastExchange #4 WholeStageCodegen (1) @@ -41,14 +41,14 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [d_date] ColumnarToRow InputAdapter - Scan parquet default.date_dim [d_date,d_week_seq] + Scan parquet spark_catalog.default.date_dim [d_date,d_week_seq] InputAdapter BroadcastExchange #5 WholeStageCodegen (1) Filter [i_item_sk,i_item_id] ColumnarToRow InputAdapter - Scan parquet default.item [i_item_sk,i_item_id] + Scan parquet spark_catalog.default.item [i_item_sk,i_item_id] InputAdapter ReusedExchange [d_date_sk] #2 InputAdapter @@ -66,7 +66,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [cr_item_sk] ColumnarToRow InputAdapter - Scan parquet default.catalog_returns [cr_item_sk,cr_return_quantity,cr_returned_date_sk] + Scan parquet spark_catalog.default.catalog_returns [cr_item_sk,cr_return_quantity,cr_returned_date_sk] ReusedSubquery [d_date_sk] #1 InputAdapter ReusedExchange 
[i_item_sk,i_item_id] #5 @@ -87,7 +87,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [wr_item_sk] ColumnarToRow InputAdapter - Scan parquet default.web_returns [wr_item_sk,wr_return_quantity,wr_returned_date_sk] + Scan parquet spark_catalog.default.web_returns [wr_item_sk,wr_return_quantity,wr_returned_date_sk] ReusedSubquery [d_date_sk] #1 InputAdapter ReusedExchange [i_item_sk,i_item_id] #5 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100.ansi/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100.ansi/explain.txt index 885aace0dc..e6a65be7ec 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100.ansi/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100.ansi/explain.txt @@ -13,12 +13,12 @@ TakeOrderedAndProject (46) : : : +- * BroadcastHashJoin Inner BuildRight (5) : : : :- * Filter (3) : : : : +- * ColumnarToRow (2) - : : : : +- Scan parquet default.store_returns (1) + : : : : +- Scan parquet spark_catalog.default.store_returns (1) : : : +- ReusedExchange (4) : : +- BroadcastExchange (10) : : +- * Filter (9) : : +- * ColumnarToRow (8) - : : +- Scan parquet default.item (7) + : : +- Scan parquet spark_catalog.default.item (7) : +- BroadcastExchange (28) : +- * HashAggregate (27) : +- Exchange (26) @@ -29,7 +29,7 @@ TakeOrderedAndProject (46) : : +- * BroadcastHashJoin Inner BuildRight (20) : : :- * Filter (18) : : : +- * ColumnarToRow (17) - : : : +- Scan parquet default.catalog_returns (16) + : : : +- Scan parquet spark_catalog.default.catalog_returns (16) : : +- ReusedExchange (19) : +- ReusedExchange (22) +- BroadcastExchange (43) @@ -42,12 +42,12 @@ TakeOrderedAndProject (46) : +- * BroadcastHashJoin Inner BuildRight (35) : :- * Filter (33) : : +- * ColumnarToRow (32) - : : +- Scan parquet default.web_returns (31) + : : +- Scan parquet 
spark_catalog.default.web_returns (31) : +- ReusedExchange (34) +- ReusedExchange (37) -(1) Scan parquet default.store_returns +(1) Scan parquet spark_catalog.default.store_returns Output [3]: [sr_item_sk#1, sr_return_quantity#2, sr_returned_date_sk#3] Batched: true Location: InMemoryFileIndex [] @@ -74,7 +74,7 @@ Join condition: None Output [2]: [sr_item_sk#1, sr_return_quantity#2] Input [4]: [sr_item_sk#1, sr_return_quantity#2, sr_returned_date_sk#3, d_date_sk#5] -(7) Scan parquet default.item +(7) Scan parquet spark_catalog.default.item Output [2]: [i_item_sk#6, i_item_id#7] Batched: true Location [not included in comparison]/{warehouse_dir}/item] @@ -119,7 +119,7 @@ Functions [1]: [sum(sr_return_quantity#2)] Aggregate Attributes [1]: [sum(sr_return_quantity#2)#10] Results [2]: [i_item_id#7 AS item_id#11, sum(sr_return_quantity#2)#10 AS sr_item_qty#12] -(16) Scan parquet default.catalog_returns +(16) Scan parquet spark_catalog.default.catalog_returns Output [3]: [cr_item_sk#13, cr_return_quantity#14, cr_returned_date_sk#15] Batched: true Location: InMemoryFileIndex [] @@ -189,7 +189,7 @@ Join condition: None Output [3]: [item_id#11, sr_item_qty#12, cr_item_qty#23] Input [4]: [item_id#11, sr_item_qty#12, item_id#22, cr_item_qty#23] -(31) Scan parquet default.web_returns +(31) Scan parquet spark_catalog.default.web_returns Output [3]: [wr_item_sk#24, wr_return_quantity#25, wr_returned_date_sk#26] Batched: true Location: InMemoryFileIndex [] @@ -271,20 +271,20 @@ BroadcastExchange (62) +- * BroadcastHashJoin LeftSemi BuildRight (60) :- * Filter (49) : +- * ColumnarToRow (48) - : +- Scan parquet default.date_dim (47) + : +- Scan parquet spark_catalog.default.date_dim (47) +- BroadcastExchange (59) +- * Project (58) +- * BroadcastHashJoin LeftSemi BuildRight (57) :- * ColumnarToRow (51) - : +- Scan parquet default.date_dim (50) + : +- Scan parquet spark_catalog.default.date_dim (50) +- BroadcastExchange (56) +- * Project (55) +- * Filter (54) +- * ColumnarToRow (53) 
- +- Scan parquet default.date_dim (52) + +- Scan parquet spark_catalog.default.date_dim (52) -(47) Scan parquet default.date_dim +(47) Scan parquet spark_catalog.default.date_dim Output [2]: [d_date_sk#5, d_date#39] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] @@ -298,7 +298,7 @@ Input [2]: [d_date_sk#5, d_date#39] Input [2]: [d_date_sk#5, d_date#39] Condition : isnotnull(d_date_sk#5) -(50) Scan parquet default.date_dim +(50) Scan parquet spark_catalog.default.date_dim Output [2]: [d_date#40, d_week_seq#41] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] @@ -307,7 +307,7 @@ ReadSchema: struct (51) ColumnarToRow [codegen id : 2] Input [2]: [d_date#40, d_week_seq#41] -(52) Scan parquet default.date_dim +(52) Scan parquet spark_catalog.default.date_dim Output [2]: [d_date#42, d_week_seq#43] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100.ansi/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100.ansi/simplified.txt index 7f38503363..0026109bc2 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100.ansi/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100.ansi/simplified.txt @@ -16,7 +16,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [sr_item_sk] ColumnarToRow InputAdapter - Scan parquet default.store_returns [sr_item_sk,sr_return_quantity,sr_returned_date_sk] + Scan parquet spark_catalog.default.store_returns [sr_item_sk,sr_return_quantity,sr_returned_date_sk] SubqueryBroadcast [d_date_sk] #1 BroadcastExchange #2 WholeStageCodegen (3) @@ -25,7 +25,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [d_date_sk] ColumnarToRow InputAdapter - Scan parquet default.date_dim 
[d_date_sk,d_date] + Scan parquet spark_catalog.default.date_dim [d_date_sk,d_date] InputAdapter BroadcastExchange #3 WholeStageCodegen (2) @@ -33,7 +33,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty BroadcastHashJoin [d_week_seq,d_week_seq] ColumnarToRow InputAdapter - Scan parquet default.date_dim [d_date,d_week_seq] + Scan parquet spark_catalog.default.date_dim [d_date,d_week_seq] InputAdapter BroadcastExchange #4 WholeStageCodegen (1) @@ -41,7 +41,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [d_date] ColumnarToRow InputAdapter - Scan parquet default.date_dim [d_date,d_week_seq] + Scan parquet spark_catalog.default.date_dim [d_date,d_week_seq] InputAdapter ReusedExchange [d_date_sk] #2 InputAdapter @@ -50,7 +50,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [i_item_sk,i_item_id] ColumnarToRow InputAdapter - Scan parquet default.item [i_item_sk,i_item_id] + Scan parquet spark_catalog.default.item [i_item_sk,i_item_id] InputAdapter BroadcastExchange #6 WholeStageCodegen (11) @@ -66,7 +66,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [cr_item_sk] ColumnarToRow InputAdapter - Scan parquet default.catalog_returns [cr_item_sk,cr_return_quantity,cr_returned_date_sk] + Scan parquet spark_catalog.default.catalog_returns [cr_item_sk,cr_return_quantity,cr_returned_date_sk] ReusedSubquery [d_date_sk] #1 InputAdapter ReusedExchange [d_date_sk] #2 @@ -87,7 +87,7 @@ TakeOrderedAndProject [item_id,sr_item_qty,sr_dev,cr_item_qty,cr_dev,wr_item_qty Filter [wr_item_sk] ColumnarToRow InputAdapter - Scan parquet default.web_returns [wr_item_sk,wr_return_quantity,wr_returned_date_sk] + Scan parquet spark_catalog.default.web_returns [wr_item_sk,wr_return_quantity,wr_returned_date_sk] ReusedSubquery [d_date_sk] #1 InputAdapter ReusedExchange [d_date_sk] #2 diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 4daa0a1b3b..41593c701a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -3239,6 +3239,11 @@ class DataFrameSuite extends QueryTest } } } + + test("SPARK-39612: exceptAll with following count should work") { + val d1 = Seq("a").toDF + assert(d1.exceptAll(d1).count() === 0) + } } case class GroupByKey(a: Int, b: Int) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index c90fa12470..e872b6aaa6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.types._ */ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSparkSession - with AdaptiveSparkPlanHelper{ + with AdaptiveSparkPlanHelper { import testImplicits._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala index b1b9ed0456..366120fb66 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala @@ -1754,6 +1754,25 @@ class DynamicPartitionPruningV1SuiteAEOff extends DynamicPartitionPruningV1Suite class DynamicPartitionPruningV1SuiteAEOn extends DynamicPartitionPruningV1Suite with EnableAdaptiveExecutionSuite { + test("SPARK-39447: Avoid AssertionError in AdaptiveSparkPlanExec.doExecuteBroadcast") { + val df = sql( + """ + |WITH empty_result AS ( + | SELECT * FROM fact_stats WHERE product_id < 0 + |) + |SELECT * + |FROM (SELECT 
/*+ SHUFFLE_MERGE(fact_sk) */ empty_result.store_id + | FROM fact_sk + | JOIN empty_result + | ON fact_sk.product_id = empty_result.product_id) t2 + | JOIN empty_result + | ON t2.store_id = empty_result.store_id + """.stripMargin) + + checkPartitionPruningPredicate(df, false, false) + checkAnswer(df, Nil) + } + test("SPARK-37995: PlanAdaptiveDynamicPruningFilters should use prepareExecutedPlan " + "rather than createSparkPlan to re-plan subquery") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5e5bc27ff4..15a615ce6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2089,7 +2089,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("SPARK-15327: fail to compile generated code with complex data structure") { - withTempDir{ dir => + withTempDir { dir => val json = """ |{"h": {"b": {"c": [{"e": "adfgd"}], "a": [{"e": "testing", "count": 3}], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 0601dce1d4..bd48d17303 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -147,6 +147,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper .set(SQLConf.SHUFFLE_PARTITIONS, 4) // use Java 8 time API to handle negative years properly .set(SQLConf.DATETIME_JAVA8API_ENABLED, true) + // SPARK-39564: don't print out serde to avoid introducing complicated and error-prone + // regex magic. 
+ .set("spark.test.noSerdeInExplain", "true") // SPARK-32106 Since we add SQL test 'transform.sql' will use `cat` command, // here we need to ignore it. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLJsonProtocolSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLJsonProtocolSuite.scala index 55f1713422..4fd8341b3f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLJsonProtocolSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLJsonProtocolSuite.scala @@ -57,7 +57,7 @@ class SQLJsonProtocolSuite extends SparkFunSuite with LocalSparkSession { |} """.stripMargin - val reconstructedEvent = JsonProtocol.sparkEventFromJson(parse(SQLExecutionStartJsonString)) + val reconstructedEvent = JsonProtocol.sparkEventFromJson(SQLExecutionStartJsonString) if (newExecutionStartEvent) { val expectedEvent = SparkListenerSQLExecutionStart(0, "test desc", "test detail", "test plan", new SparkPlanInfo("TestNode", "test string", Nil, Map(), Nil), 0, @@ -79,8 +79,8 @@ class SQLJsonProtocolSuite extends SparkFunSuite with LocalSparkSession { event.executionName = Some("test") event.qe = qe event.executionFailure = Some(new RuntimeException("test")) - val json = JsonProtocol.sparkEventToJson(event) - assert(json == parse( + val json = JsonProtocol.sparkEventToJsonString(event) + assert(parse(json) == parse( """ |{ | "Event" : "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala index 766b8238bb..647d46f8fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala @@ -93,4 +93,38 @@ class TakeOrderedAndProjectSuite extends SparkPlanTest with SharedSparkSession { } } } + 
+ test("TakeOrderedAndProject.doExecute with local sort") { + withClue(s"seed = $seed") { + val expected = (input: SparkPlan) => { + GlobalLimitExec(limit, + LocalLimitExec(limit, + ProjectExec(Seq(input.output.last), + SortExec(sortOrder, true, input)))) + } + + // test doExecute + Seq((10000, 10), (200, 10)).foreach { case (n, m) => + checkThatPlansAgree( + generateRandomInputData(n, m), + input => + noOpFilter( + TakeOrderedAndProjectExec(limit, sortOrder, Seq(input.output.last), + SortExec(sortOrder, false, input))), + input => expected(input), + sortAnswers = false) + } + + // test executeCollect + Seq((10000, 10), (200, 10)).foreach { case (n, m) => + checkThatPlansAgree( + generateRandomInputData(n, m), + input => + TakeOrderedAndProjectExec(limit, sortOrder, Seq(input.output.last), + SortExec(sortOrder, false, input)), + input => expected(input), + sortAnswers = false) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ConstantColumnVectorBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ConstantColumnVectorBenchmark.scala index 9e4902f2fb..8046a4b6cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ConstantColumnVectorBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ConstantColumnVectorBenchmark.scala @@ -22,7 +22,7 @@ import org.apache.commons.lang3.RandomStringUtils import org.apache.spark.benchmark.Benchmark import org.apache.spark.benchmark.BenchmarkBase import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.vectorized.{ColumnVectorUtils, ConstantColumnVector, OffHeapColumnVector, OnHeapColumnVector} +import org.apache.spark.sql.execution.vectorized.{ColumnVectorUtils, ConstantColumnVector, OffHeapColumnVector, OnHeapColumnVector, WritableColumnVector} import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnVector import 
org.apache.spark.unsafe.UTF8StringBuilder @@ -41,6 +41,22 @@ import org.apache.spark.unsafe.UTF8StringBuilder */ object ConstantColumnVectorBenchmark extends BenchmarkBase { + private def populate( + col: WritableColumnVector, batchSize: Int, row: InternalRow, fieldIdx: Int): Unit = { + col.dataType() match { + case IntegerType => col.putInts(0, batchSize, row.getInt(fieldIdx)) + case LongType => col.putLongs(0, batchSize, row.getLong(fieldIdx)) + case FloatType => col.putFloats(0, batchSize, row.getFloat(fieldIdx)) + case DoubleType => col.putDoubles(0, batchSize, row.getDouble(fieldIdx)) + case StringType => + val v = row.getUTF8String(fieldIdx) + val bytes = v.getBytes + (0 until batchSize).foreach { i => + col.putByteArray(i, bytes) + } + } + } + private def readValues(dataType: DataType, batchSize: Int, vector: ColumnVector): Unit = { dataType match { case IntegerType => @@ -86,14 +102,14 @@ object ConstantColumnVectorBenchmark extends BenchmarkBase { benchmark.addCase("OnHeapColumnVector") { _: Int => for (_ <- 0 until valuesPerIteration) { onHeapColumnVector.reset() - ColumnVectorUtils.populate(onHeapColumnVector, row, 0) + populate(onHeapColumnVector, batchSize, row, 0) } } benchmark.addCase("OffHeapColumnVector") { _: Int => for (_ <- 0 until valuesPerIteration) { offHeapColumnVector.reset() - ColumnVectorUtils.populate(offHeapColumnVector, row, 0) + populate(offHeapColumnVector, batchSize, row, 0) } } @@ -114,9 +130,9 @@ object ConstantColumnVectorBenchmark extends BenchmarkBase { val constantColumnVector = new ConstantColumnVector(batchSize, dataType) onHeapColumnVector.reset() - ColumnVectorUtils.populate(onHeapColumnVector, row, 0) + populate(onHeapColumnVector, batchSize, row, 0) offHeapColumnVector.reset() - ColumnVectorUtils.populate(offHeapColumnVector, row, 0) + populate(offHeapColumnVector, batchSize, row, 0) ColumnVectorUtils.populate(constantColumnVector, row, 0) val other = if (dataType == StringType) { @@ -184,7 +200,7 @@ object 
ConstantColumnVectorBenchmark extends BenchmarkBase { benchmark.addCase("OnHeapColumnVector") { _: Int => onHeapColumnVector.reset() - ColumnVectorUtils.populate(onHeapColumnVector, row, 0) + populate(onHeapColumnVector, batchSize, row, 0) for (_ <- 0 until valuesPerIteration) { readValues(dataType, batchSize, onHeapColumnVector) } @@ -192,7 +208,7 @@ object ConstantColumnVectorBenchmark extends BenchmarkBase { benchmark.addCase("OffHeapColumnVector") { _: Int => offHeapColumnVector.reset() - ColumnVectorUtils.populate(offHeapColumnVector, row, 0) + populate(offHeapColumnVector, batchSize, row, 0) for (_ <- 0 until valuesPerIteration) { readValues(dataType, batchSize, offHeapColumnVector) } @@ -229,13 +245,13 @@ object ConstantColumnVectorBenchmark extends BenchmarkBase { } benchmark.addCase("OnHeapColumnVector") { _: Int => - for (i <- 0 until valuesPerIteration) { + for (_ <- 0 until valuesPerIteration) { (0 until batchSize).foreach(onHeapColumnVector.isNullAt) } } benchmark.addCase("OffHeapColumnVector") { _: Int => - for (i <- 0 until valuesPerIteration) { + for (_ <- 0 until valuesPerIteration) { (0 until batchSize).foreach(offHeapColumnVector.isNullAt) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index f2d4ea8e1a..b4f1802ced 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -23,12 +23,14 @@ import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.TPCDSSchema import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias import 
org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_SECOND import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType /** * Benchmark to measure TPCDS query performance. @@ -65,23 +67,19 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark with Logging { "web_returns", "web_site", "reason", "call_center", "warehouse", "ship_mode", "income_band", "time_dim", "web_page") - def setupTables(dataLocation: String, createTempView: Boolean): Map[String, Long] = { + def setupTables(dataLocation: String, tableColumns: Map[String, StructType]): Map[String, Long] = tables.map { tableName => - if (createTempView) { - spark.read.parquet(s"$dataLocation/$tableName").createOrReplaceTempView(tableName) - } else { - spark.sql(s"DROP TABLE IF EXISTS $tableName") - spark.catalog.createTable(tableName, s"$dataLocation/$tableName", "parquet") - // Recover partitions but don't fail if a table is not partitioned. - Try { - spark.sql(s"ALTER TABLE $tableName RECOVER PARTITIONS") - }.getOrElse { - logInfo(s"Recovering partitions of table $tableName failed") - } + spark.sql(s"DROP TABLE IF EXISTS $tableName") + val options = Map("path" -> s"$dataLocation/$tableName") + spark.catalog.createTable(tableName, "parquet", tableColumns(tableName), options) + // Recover partitions but don't fail if a table is not partitioned. 
+ Try { + spark.sql(s"ALTER TABLE $tableName RECOVER PARTITIONS") + }.getOrElse { + logInfo(s"Recovering partitions of table $tableName failed") } tableName -> spark.table(tableName).count() }.toMap - } def runTpcdsQueries( queryLocation: String, @@ -163,7 +161,7 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark with Logging { } val tableSizes = setupTables(benchmarkArgs.dataLocation, - createTempView = !benchmarkArgs.cboEnabled) + TPCDSSchemaHelper.getTableColumns) if (benchmarkArgs.cboEnabled) { spark.sql(s"SET ${SQLConf.CBO_ENABLED.key}=true") spark.sql(s"SET ${SQLConf.PLAN_STATS_ENABLED.key}=true") @@ -186,3 +184,8 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark with Logging { nameSuffix = nameSuffixForQueriesV2_7) } } + +object TPCDSSchemaHelper extends TPCDSSchema { + def getTableColumns: Map[String, StructType] = + tableColumns.map(kv => kv._1 -> StructType.fromDDL(kv._2)) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala index 80a6bffc61..0628acb8f9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala @@ -21,7 +21,7 @@ import java.util.Locale class TPCDSQueryBenchmarkArguments(val args: Array[String]) { - var dataLocation: String = null + var dataLocation: String = sys.env.getOrElse("SPARK_TPCDS_DATA", null) var queryFilter: Set[String] = Set.empty var cboEnabled: Boolean = false diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TakeOrderedAndProjectBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TakeOrderedAndProjectBenchmark.scala new file mode 100644 index 0000000000..88cdfebbb1 --- /dev/null +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TakeOrderedAndProjectBenchmark.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf + +/** + * TakeOrderedAndProject benchmark. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> + * --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/TakeOrderedAndProjectBenchmark-results.txt".
+ * }}} + */ +object TakeOrderedAndProjectBenchmark extends SqlBasedBenchmark { + + private def takeOrderedAndProjectWithSMJ(): Unit = { + val row = 10 * 1000 + + val df1 = spark.range(0, row, 1, 2).selectExpr("id % 3 as c1") + val df2 = spark.range(0, row, 1, 2).selectExpr("id % 3 as c2") + + val benchmark = new Benchmark("TakeOrderedAndProject with SMJ", row, output = output) + + benchmark.addCase("TakeOrderedAndProject with SMJ for doExecute", 3) { _ => + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.SHUFFLE_PARTITIONS.key -> "5", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + df1.join(df2, col("c1") === col("c2")) + .orderBy(col("c1")) + .limit(100) + .noop() + } + } + + benchmark.addCase("TakeOrderedAndProject with SMJ for executeCollect", 3) { _ => + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.SHUFFLE_PARTITIONS.key -> "5", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + df1.join(df2, col("c1") === col("c2")) + .orderBy(col("c1")) + .limit(100) + .collect() + } + } + benchmark.run() + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("TakeOrderedAndProject") { + takeOrderedAndProjectWithSMJ() + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowFunctionsSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowFunctionsSuiteBase.scala index 130914d35d..415bc81379 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowFunctionsSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowFunctionsSuiteBase.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.command import java.util.Locale -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.util.Utils /** @@ -38,9 +38,11 @@ import org.apache.spark.util.Utils trait ShowFunctionsSuiteBase extends QueryTest with 
DDLCommandTestUtils { override val command = "SHOW FUNCTIONS" + protected def funCatalog: String = catalog protected def createFunction(name: String): Unit = {} protected def dropFunction(name: String): Unit = {} protected def showFun(ns: String, name: String): String = s"$ns.$name".toLowerCase(Locale.ROOT) + protected def isTempFunctions(): Boolean = false /** * Drops function `funName` after calling `f`. @@ -51,7 +53,7 @@ trait ShowFunctionsSuiteBase extends QueryTest with DDLCommandTestUtils { } } - protected def withNamespaceAndFuns(ns: String, funNames: Seq[String], cat: String = catalog) + protected def withNamespaceAndFuns(ns: String, funNames: Seq[String], cat: String = funCatalog) (f: (String, Seq[String]) => Unit): Unit = { val nsCat = s"$cat.$ns" withNamespace(nsCat) { @@ -63,10 +65,124 @@ trait ShowFunctionsSuiteBase extends QueryTest with DDLCommandTestUtils { } } - protected def withNamespaceAndFun(ns: String, funName: String, cat: String = catalog) + protected def withNamespaceAndFun(ns: String, funName: String, cat: String = funCatalog) (f: (String, String) => Unit): Unit = { withNamespaceAndFuns(ns, Seq(funName), cat) { case (ns, Seq(name)) => f(ns, name) } } + + test("show a function") { + withNamespaceAndFun("ns", "iiilog") { (ns, f) => + val totalFuns = sql(s"SHOW FUNCTIONS IN $ns").count() + createFunction(f) + assert(sql(s"SHOW FUNCTIONS IN $ns").count() - totalFuns === 1) + assert(!sql(s"SHOW FUNCTIONS IN $ns").filter("contains(function, 'iiilog')").isEmpty) + } + } + + test("show a function in the USER name space") { + withNamespaceAndFun("ns", "logiii") { (ns, f) => + assert(sql(s"SHOW USER FUNCTIONS IN $ns").count() === 0) + createFunction(f) + QueryTest.checkAnswer( + sql(s"SHOW USER FUNCTIONS IN $ns"), + Row(showFun("ns", "logiii")) :: Nil) + } + } + + test("show a temporary function as an USER function") { + withNamespaceAndFun("ns", "poggi") { (ns, f0) => + createFunction(f0) + val f1 = "temp_test_fun" + withUserDefinedFunction(f1 -> 
true) { + spark.udf.register(f1, (arg1: Int, arg2: String) => arg2 + arg1) + QueryTest.checkAnswer( + sql(s"SHOW USER FUNCTIONS IN $ns"), + Row(showFun("ns", "poggi")) :: Row(f1) :: Nil) + QueryTest.checkAnswer( + sql(s"SHOW ALL FUNCTIONS IN $ns").filter(s"function='$f1'"), + Row(f1) :: Nil) + QueryTest.checkAnswer( + sql(s"SHOW SYSTEM FUNCTIONS IN $ns").filter(s"function='$f1'"), + Nil) + } + } + } + + test("show functions in the SYSTEM name space") { + withNamespaceAndFun("ns", "date_addi") { (ns, f) => + val systemFuns = sql(s"SHOW SYSTEM FUNCTIONS IN $ns") + assert(systemFuns.count() > 0) + createFunction(f) + assert(sql(s"SHOW SYSTEM FUNCTIONS IN $ns").count() === systemFuns.count()) + // Built-in operators + assert(!systemFuns.filter("function='case'").isEmpty) + // Built-in functions + assert(!systemFuns.filter("function='substring'").isEmpty) + } + } + + test("show functions among both user and system defined functions") { + withNamespaceAndFun("ns", "current_datei") { (ns, f) => + val allFuns = sql(s"SHOW ALL FUNCTIONS IN $ns").collect() + assert(allFuns.nonEmpty) + createFunction(f) + QueryTest.checkAnswer( + sql(s"SHOW ALL FUNCTIONS IN $ns"), + allFuns :+ Row(showFun("ns", "current_datei"))) + } + } + + test("show functions matched to the wildcard pattern") { + val testFuns = Seq("crc32i", "crc16j", "date1900", "Date1") + withNamespaceAndFuns("ns", testFuns) { (ns, funs) => + assert(sql(s"SHOW USER FUNCTIONS IN $ns").isEmpty) + funs.foreach(createFunction) + QueryTest.checkAnswer( + sql(s"SHOW USER FUNCTIONS IN $ns LIKE '*'"), + testFuns.map(testFun => Row(showFun("ns", testFun)))) + QueryTest.checkAnswer( + sql(s"SHOW USER FUNCTIONS IN $ns LIKE '*rc*'"), + Seq("crc32i", "crc16j").map(testFun => Row(showFun("ns", testFun)))) + } + } + + test("show a function by its string name") { + assume(!isTempFunctions()) + val testFuns = Seq("crc32i", "crc16j") + withNamespaceAndFuns("ns", testFuns) { (ns, funs) => + assert(sql(s"SHOW USER FUNCTIONS IN 
$ns").isEmpty) + funs.foreach(createFunction) + QueryTest.checkAnswer( + sql(s"SHOW USER FUNCTIONS IN $ns 'crc32i'"), + Row(showFun("ns", "crc32i")) :: Nil) + } + } + + test("show functions matched to the '|' pattern") { + assume(!isTempFunctions()) + val testFuns = Seq("crc32i", "crc16j", "date1900", "Date1") + withNamespaceAndFuns("ns", testFuns) { (ns, funs) => + assert(sql(s"SHOW USER FUNCTIONS IN $ns").isEmpty) + funs.foreach(createFunction) + QueryTest.checkAnswer( + sql(s"SHOW USER FUNCTIONS IN $ns LIKE 'crc32i|date1900'"), + Seq("crc32i", "date1900").map(testFun => Row(showFun("ns", testFun)))) + QueryTest.checkAnswer( + sql(s"SHOW USER FUNCTIONS IN $ns LIKE 'crc32i|date*'"), + Seq("crc32i", "date1900", "Date1").map(testFun => Row(showFun("ns", testFun)))) + } + } + + test("show a function by its id") { + assume(!isTempFunctions()) + withNamespaceAndFun("ns", "crc32i") { (ns, fun) => + assert(sql(s"SHOW USER FUNCTIONS IN $ns").isEmpty) + createFunction(fun) + QueryTest.checkAnswer( + sql(s"SHOW USER FUNCTIONS $fun"), + Row(showFun("ns", "crc32i")) :: Nil) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowFunctionsSuite.scala index f14b550688..f7ea4e7587 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowFunctionsSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.command.v1 import java.util.Locale -import org.apache.spark.sql.Row import org.apache.spark.sql.execution.command /** @@ -33,65 +32,14 @@ import org.apache.spark.sql.execution.command * `org.apache.spark.sql.hive.execution.command.ShowFunctionsSuite` */ trait ShowFunctionsSuiteBase extends command.ShowFunctionsSuiteBase - with command.TestsV1AndV2Commands { - - test("show a function") { - withNamespaceAndFun("ns", 
"iiilog") { (ns, f) => - val totalFuns = sql(s"SHOW FUNCTIONS IN $ns").count() - createFunction(f) - assert(sql(s"SHOW FUNCTIONS IN $ns").count() - totalFuns === 1) - assert(!sql(s"SHOW FUNCTIONS IN $ns").filter("contains(function, 'iiilog')").isEmpty) - } - } - - test("show a function in the USER name space") { - withNamespaceAndFun("ns", "logiii") { (ns, f) => - assert(sql(s"SHOW USER FUNCTIONS IN $ns").count() === 0) - createFunction(f) - checkAnswer(sql(s"SHOW USER FUNCTIONS IN $ns"), Row(showFun("ns", "logiii"))) - } - } - - test("show functions in the SYSTEM name space") { - withNamespaceAndFun("ns", "date_addi") { (ns, f) => - val systemFuns = sql(s"SHOW SYSTEM FUNCTIONS IN $ns").count() - assert(systemFuns > 0) - createFunction(f) - assert(sql(s"SHOW SYSTEM FUNCTIONS IN $ns").count() === systemFuns) - } - } - - test("show functions among both user and system defined functions") { - withNamespaceAndFun("ns", "current_datei") { (ns, f) => - val allFuns = sql(s"SHOW ALL FUNCTIONS IN $ns").collect() - assert(allFuns.nonEmpty) - createFunction(f) - checkAnswer( - sql(s"SHOW ALL FUNCTIONS IN $ns"), - allFuns :+ Row(showFun("ns", "current_datei"))) - } - } - - test("show functions matched to the wildcard pattern") { - val testFuns = Seq("crc32i", "crc16j", "date1900", "Date1") - withNamespaceAndFuns("ns", testFuns) { (ns, funs) => - assert(sql(s"SHOW USER FUNCTIONS IN $ns").isEmpty) - funs.foreach(createFunction) - checkAnswer( - sql(s"SHOW USER FUNCTIONS IN $ns LIKE '*'"), - testFuns.map(testFun => Row(showFun("ns", testFun)))) - checkAnswer( - sql(s"SHOW USER FUNCTIONS IN $ns LIKE '*rc*'"), - Seq("crc32i", "crc16j").map(testFun => Row(showFun("ns", testFun)))) - } - } -} + with command.TestsV1AndV2Commands /** * The class contains tests for the `SHOW FUNCTIONS` command to check temporary functions. 
*/ class ShowTempFunctionsSuite extends ShowFunctionsSuiteBase with CommandSuiteBase { override def commandVersion: String = super[ShowFunctionsSuiteBase].commandVersion + override protected def isTempFunctions(): Boolean = true override protected def createFunction(name: String): Unit = { spark.udf.register(name, (arg1: Int, arg2: String) => arg2 + arg1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/CommandSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/CommandSuiteBase.scala index ac38a589ff..15d56050c2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/CommandSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/CommandSuiteBase.scala @@ -21,9 +21,7 @@ import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.analysis.ResolvePartitionSpec import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.connector.catalog.{CatalogV2Implicits, Identifier, InMemoryCatalog, InMemoryPartitionTable, InMemoryPartitionTableCatalog, InMemoryTableCatalog} -import org.apache.spark.sql.connector.catalog.functions.UnboundFunction import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.util.Utils /** * The trait contains settings and utility functions. 
It can be mixed to the test suites for @@ -35,14 +33,13 @@ trait CommandSuiteBase extends SharedSparkSession { def catalogVersion: String = "V2" // The catalog version is added to test names def commandVersion: String = "V2" // The command version is added to test names def catalog: String = "test_catalog" // The default V2 catalog for testing - def funCatalog: String = s"fun_$catalog" def defaultUsing: String = "USING _" // The clause is used in creating v2 tables under testing // V2 catalogs created and used especially for testing override def sparkConf: SparkConf = super.sparkConf .set(s"spark.sql.catalog.$catalog", classOf[InMemoryPartitionTableCatalog].getName) .set(s"spark.sql.catalog.non_part_$catalog", classOf[InMemoryTableCatalog].getName) - .set(s"spark.sql.catalog.$funCatalog", classOf[InMemoryCatalog].getName) + .set(s"spark.sql.catalog.fun_$catalog", classOf[InMemoryCatalog].getName) def checkLocation( t: String, @@ -65,17 +62,4 @@ trait CommandSuiteBase extends SharedSparkSession { assert(partMetadata.containsKey("location")) assert(partMetadata.get("location") === expected) } - - - def withFun(ident: Identifier, fn: UnboundFunction)(f: => Unit): Unit = { - val cat = spark.sessionState - .catalogManager - .catalog(funCatalog) - .asInstanceOf[InMemoryCatalog] - - cat.createFunction(ident, fn) - Utils.tryWithSafeFinally(f) { - cat.dropFunction(ident) - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/DescribeNamespaceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/DescribeNamespaceSuite.scala index f801cdb577..3f13319fc2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/DescribeNamespaceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/DescribeNamespaceSuite.scala @@ -47,7 +47,7 @@ class DescribeNamespaceSuite extends command.DescribeNamespaceSuiteBase with Com val description = descriptionDf.collect() assert(description 
=== Seq( Row("Catalog Name", catalog), - Row("Namespace Name", "ns2"), + Row("Namespace Name", "ns1.ns2"), Row(SupportsNamespaces.PROP_COMMENT.capitalize, "test namespace"), Row(SupportsNamespaces.PROP_LOCATION.capitalize, "file:/tmp/ns_test"), Row(SupportsNamespaces.PROP_OWNER.capitalize, Utils.getCurrentUserName()), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowFunctionsSuite.scala index 88c16a5552..b3f791abda 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowFunctionsSuite.scala @@ -20,21 +20,32 @@ package org.apache.spark.sql.execution.command.v2 import test.org.apache.spark.sql.connector.catalog.functions.JavaStrLen import test.org.apache.spark.sql.connector.catalog.functions.JavaStrLen.JavaStrLenNoImpl -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.connector.catalog.Identifier +import org.apache.spark.sql.connector.catalog.{Identifier, InMemoryCatalog} +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper import org.apache.spark.sql.execution.command /** * The class contains tests for the `SHOW FUNCTIONS` command to check V2 table catalogs. 
*/ class ShowFunctionsSuite extends command.ShowFunctionsSuiteBase with CommandSuiteBase { + override protected def funCatalog: String = s"fun_$catalog" + override protected def showFun(ns: String, name: String): String = name - test("only support session catalog") { - withFun(Identifier.of(Array.empty, "abc"), new JavaStrLen(new JavaStrLenNoImpl)) { - val e = intercept[AnalysisException] { - sql(s"SHOW FUNCTIONS LIKE $funCatalog.abc") - } - assert(e.getMessage === s"Catalog $funCatalog does not support functions") - } + private def getFunCatalog(): InMemoryCatalog = { + spark.sessionState.catalogManager.catalog(funCatalog).asInstanceOf[InMemoryCatalog] + } + + private def funNameToId(name: String): Identifier = { + val parts = name.split('.') + assert(parts.head == funCatalog, s"${parts.head} is wrong catalog. Expected: $funCatalog.") + new MultipartIdentifierHelper(parts.tail).asIdentifier + } + + override protected def createFunction(name: String): Unit = { + getFunCatalog().createFunction(funNameToId(name), new JavaStrLen(new JavaStrLenNoImpl)) + } + + override protected def dropFunction(name: String): Unit = { + getFunCatalog().dropFunction(funNameToId(name)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala index 2a67864de8..343b59a311 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.test.SharedSparkSession class FileFormatWriterSuite extends QueryTest with SharedSparkSession - with CodegenInterpretedPlanTest{ + with CodegenInterpretedPlanTest { import testImplicits._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 62dccaad1d..bf92ffcf46 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -34,6 +34,7 @@ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.commons.lang3.time.FastDateFormat import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.GzipCodec +import org.apache.logging.log4j.Level import org.apache.spark.{SparkConf, SparkException, TestUtils} import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Encoders, QueryTest, Row} @@ -2296,6 +2297,40 @@ abstract class CSVSuite assert(errMsg2.contains("'lineSep' can contain only 1 character")) } + Seq(true, false).foreach { multiLine => + test(s"""lineSep with 2 chars when multiLine set to $multiLine""") { + Seq("\r\n", "||", "|").foreach { newLine => + val logAppender = new LogAppender("lineSep WARN logger") + withTempDir { dir => + val inputData = if (multiLine) { + s"""name,"i am the${newLine} column1"${newLine}jack,30${newLine}tom,18""" + } else { + s"name,age${newLine}jack,30${newLine}tom,18" + } + Files.write(new File(dir, "/data.csv").toPath, inputData.getBytes()) + withLogAppender(logAppender) { + val df = spark.read + .options( + Map("header" -> "true", "multiLine" -> multiLine.toString, "lineSep" -> newLine)) + .csv(dir.getCanonicalPath) + // Due to the limitation of Univocity parser: + // multiple chars of newlines cannot be properly handled when they exist within quotes. 
+ // Leave 2-char lineSep as an undocumented feature and logWarn the user + if (newLine != "||" || !multiLine) { + checkAnswer(df, Seq(Row("jack", "30"), Row("tom", "18"))) + } + if (newLine.length == 2) { + val message = "It is not recommended to set 'lineSep' with 2 characters due to" + assert(logAppender.loggingEvents.exists( + e => e.getLevel == Level.WARN && e.getMessage.getFormattedMessage.contains(message) + )) + } + } + } + } + } + } + test("SPARK-26208: write and read empty data to csv file with headers") { withTempPath { path => val df1 = spark.range(10).repartition(2).filter(_ < 0).map(_.toString).toDF diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala index 8d8d13211f..7aa8adc07e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala @@ -35,7 +35,6 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { val tempDir = Utils.createTempDir() val url = s"jdbc:h2:${tempDir.getCanonicalPath};user=testUser;password=testPass" val defaultMetadata = new MetadataBuilder().putLong("scale", 0).build() - var conn: java.sql.Connection = null override def sparkConf: SparkConf = super.sparkConf .set("spark.sql.catalog.h2", classOf[JDBCTableCatalog].getName) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 80153b00aa..c172091581 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,6 +22,7 @@ import java.io.File import scala.reflect.{classTag,
ClassTag} import scala.util.Random +import com.fasterxml.jackson.databind.ObjectMapper import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.internal.io.FileCommitProtocol @@ -580,8 +581,10 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils assert(metricInfo.metadata === Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER)) // After serializing to JSON, the original value type is lost, but we can still // identify that it's a SQL metric from the metadata - val metricInfoJson = JsonProtocol.accumulableInfoToJson(metricInfo) - val metricInfoDeser = JsonProtocol.accumulableInfoFromJson(metricInfoJson) + val mapper = new ObjectMapper() + val metricInfoJson = JsonProtocol.toJsonString( + JsonProtocol.accumulableInfoToJson(metricInfo, _)) + val metricInfoDeser = JsonProtocol.accumulableInfoFromJson(mapper.readTree(metricInfoJson)) metricInfoDeser.update match { case Some(v: String) => assert(v.toLong === 10L) case Some(v) => fail(s"deserialized metric value was not a string: ${v.getClass.getName}") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala index 1539341359..7ba18a8140 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala @@ -231,6 +231,16 @@ class StateSchemaCompatibilityCheckerSuite extends SharedSparkSession { assert((resultKeySchema, resultValueSchema) === (keySchema, valueSchema)) } + test("SPARK-39650: ignore value schema on compatibility check") { + val typeChangedValueSchema = StructType(valueSchema.map(_.copy(dataType = TimestampType))) + verifySuccess(keySchema, valueSchema, keySchema, typeChangedValueSchema, + 
ignoreValueSchema = true) + + val typeChangedKeySchema = StructType(keySchema.map(_.copy(dataType = TimestampType))) + verifyException(keySchema, valueSchema, typeChangedKeySchema, valueSchema, + ignoreValueSchema = true) + } + private def applyNewSchemaToNestedFieldInKey(newNestedSchema: StructType): StructType = { applyNewSchemaToNestedField(keySchema, newNestedSchema, "key3") } @@ -257,44 +267,57 @@ class StateSchemaCompatibilityCheckerSuite extends SharedSparkSession { dir: String, queryId: UUID, newKeySchema: StructType, - newValueSchema: StructType): Unit = { + newValueSchema: StructType, + ignoreValueSchema: Boolean): Unit = { // in fact, Spark doesn't support online state schema change, so need to check // schema only once for each running of JVM val providerId = StateStoreProviderId( StateStoreId(dir, opId, partitionId), queryId) new StateSchemaCompatibilityChecker(providerId, hadoopConf) - .check(newKeySchema, newValueSchema) + .check(newKeySchema, newValueSchema, ignoreValueSchema = ignoreValueSchema) } private def verifyException( oldKeySchema: StructType, oldValueSchema: StructType, newKeySchema: StructType, - newValueSchema: StructType): Unit = { + newValueSchema: StructType, + ignoreValueSchema: Boolean = false): Unit = { val dir = newDir() val queryId = UUID.randomUUID() - runSchemaChecker(dir, queryId, oldKeySchema, oldValueSchema) + runSchemaChecker(dir, queryId, oldKeySchema, oldValueSchema, + ignoreValueSchema = ignoreValueSchema) val e = intercept[StateSchemaNotCompatible] { - runSchemaChecker(dir, queryId, newKeySchema, newValueSchema) + runSchemaChecker(dir, queryId, newKeySchema, newValueSchema, + ignoreValueSchema = ignoreValueSchema) } - e.getMessage.contains("Provided schema doesn't match to the schema for existing state!") - e.getMessage.contains(newKeySchema.json) - e.getMessage.contains(newValueSchema.json) - e.getMessage.contains(oldKeySchema.json) - e.getMessage.contains(oldValueSchema.json) + assert(e.getMessage.contains("Provided 
schema doesn't match to the schema for existing state!")) + assert(e.getMessage.contains(newKeySchema.toString())) + assert(e.getMessage.contains(oldKeySchema.toString())) + + if (ignoreValueSchema) { + assert(!e.getMessage.contains(newValueSchema.toString())) + assert(!e.getMessage.contains(oldValueSchema.toString())) + } else { + assert(e.getMessage.contains(newValueSchema.toString())) + assert(e.getMessage.contains(oldValueSchema.toString())) + } } private def verifySuccess( oldKeySchema: StructType, oldValueSchema: StructType, newKeySchema: StructType, - newValueSchema: StructType): Unit = { + newValueSchema: StructType, + ignoreValueSchema: Boolean = false): Unit = { val dir = newDir() val queryId = UUID.randomUUID() - runSchemaChecker(dir, queryId, oldKeySchema, oldValueSchema) - runSchemaChecker(dir, queryId, newKeySchema, newValueSchema) + runSchemaChecker(dir, queryId, oldKeySchema, oldValueSchema, + ignoreValueSchema = ignoreValueSchema) + runSchemaChecker(dir, queryId, newKeySchema, newValueSchema, + ignoreValueSchema = ignoreValueSchema) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index 2395f9164d..226b6e47a9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -608,8 +608,8 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils test("roundtripping SparkListenerDriverAccumUpdates through JsonProtocol (SPARK-18462)") { val event = SparkListenerDriverAccumUpdates(1L, Seq((2L, 3L))) - val json = JsonProtocol.sparkEventToJson(event) - assertValidDataInJson(json, + val json = JsonProtocol.sparkEventToJsonString(event) + assertValidDataInJson(parse(json), parse(""" |{ | "Event": 
"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates", @@ -627,14 +627,14 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils } // Test a case where the numbers in the JSON can only fit in longs: - val longJson = parse( + val longJson = """ |{ | "Event": "org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates", | "executionId": 4294967294, | "accumUpdates": [[4294967294,3]] |} - """.stripMargin) + """.stripMargin JsonProtocol.sparkEventFromJson(longJson) match { case SparkListenerDriverAccumUpdates(executionId, accums) => assert(executionId == 4294967294L) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SparkPlanInfoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SparkPlanInfoSuite.scala index dfc64a41d9..1ef07bf9eb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SparkPlanInfoSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SparkPlanInfoSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.ui import org.apache.spark.sql.execution.SparkPlanInfo import org.apache.spark.sql.test.SharedSparkSession -class SparkPlanInfoSuite extends SharedSparkSession{ +class SparkPlanInfoSuite extends SharedSparkSession { import testImplicits._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 4cf2376a3f..cdf41ed651 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.columnar.ColumnAccessor import org.apache.spark.sql.execution.columnar.compression.ColumnBuilderHelper import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarArray -import 
org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +import org.apache.spark.unsafe.types.UTF8String class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { private def withVector( @@ -605,14 +605,5 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { } } } - - test("SPARK-38018: ColumnVectorUtils.populate to handle CalendarIntervalType correctly") { - val vector = new OnHeapColumnVector(5, CalendarIntervalType) - val row = new SpecificInternalRow(Array(CalendarIntervalType)) - val interval = new CalendarInterval(3, 5, 1000000) - row.setInterval(0, interval) - ColumnVectorUtils.populate(vector, row, 0) - assert(vector.getInterval(0) === interval) - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala index 6e6138c91d..f313302683 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala @@ -32,8 +32,9 @@ import org.apache.spark.sql.catalyst.plans.logical.Range import org.apache.spark.sql.connector.FakeV2Provider import org.apache.spark.sql.connector.catalog.{CatalogManager, Identifier, InMemoryCatalog} import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.CatalogHelper +import org.apache.spark.sql.connector.catalog.functions._ import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel @@ -366,7 +367,7 @@ class CatalogSuite extends SharedSparkSession with AnalysisTest with BeforeAndAf val db = new Database("nama", "cataloa", "descripta", "locata") val table = new Table("nama", "cataloa", Array("databasa"), "descripta", "typa", isTemporary = false) - val function = new Function("nama", "databasa", "descripta", "classa", isTemporary = false) + val function = new Function("nama", 
"cataloa", Array("databasa"), "descripta", "classa", false) val column = new Column( "nama", "descripta", "typa", nullable = false, isPartition = true, isBucket = true) val dbFields = ScalaReflection.getConstructorParameterValues(db) @@ -377,7 +378,9 @@ class CatalogSuite extends SharedSparkSession with AnalysisTest with BeforeAndAf assert(Seq(tableFields(0), tableFields(1), tableFields(3), tableFields(4), tableFields(5)) == Seq("nama", "cataloa", "descripta", "typa", false)) assert(tableFields(2).asInstanceOf[Array[String]].sameElements(Array("databasa"))) - assert(functionFields == Seq("nama", "databasa", "descripta", "classa", false)) + assert((functionFields(0), functionFields(1), functionFields(3), functionFields(4), + functionFields(5)) == ("nama", "cataloa", "descripta", "classa", false)) + assert(functionFields(2).asInstanceOf[Array[String]].sameElements(Array("databasa"))) assert(columnFields == Seq("nama", "descripta", "typa", false, true, true)) val dbString = CatalogImpl.makeDataset(Seq(db), spark).showString(10) val tableString = CatalogImpl.makeDataset(Seq(table), spark).showString(10) @@ -386,7 +389,8 @@ class CatalogSuite extends SharedSparkSession with AnalysisTest with BeforeAndAf dbFields.foreach { f => assert(dbString.contains(f.toString)) } tableFields.foreach { f => assert(tableString.contains(f.toString) || tableString.contains(f.asInstanceOf[Array[String]].mkString(""))) } - functionFields.foreach { f => assert(functionString.contains(f.toString)) } + functionFields.foreach { f => assert(functionString.contains(f.toString) || + functionString.contains(f.asInstanceOf[Array[String]].mkString(""))) } columnFields.foreach { f => assert(columnString.contains(f.toString)) } } @@ -866,4 +870,82 @@ class CatalogSuite extends SharedSparkSession with AnalysisTest with BeforeAndAf sql(s"CREATE NAMESPACE $qualified") assert(spark.catalog.getDatabase(qualified).name === db) } + + test("three layer namespace compatibility - set current database") { + 
spark.catalog.setCurrentCatalog("testcat") + // namespace with the same name as catalog + sql("CREATE NAMESPACE testcat.testcat.my_db") + spark.catalog.setCurrentDatabase("testcat.my_db") + assert(spark.catalog.currentDatabase == "testcat.my_db") + // sessionCatalog still reports 'default' as current database + assert(sessionCatalog.getCurrentDatabase == "default") + val e = intercept[AnalysisException] { + spark.catalog.setCurrentDatabase("my_db") + }.getMessage + assert(e.contains("my_db")) + + // check that we can fall back to old sessionCatalog + createDatabase("hive_db") + val err = intercept[AnalysisException] { + spark.catalog.setCurrentDatabase("hive_db") + }.getMessage + assert(err.contains("hive_db")) + spark.catalog.setCurrentCatalog("spark_catalog") + spark.catalog.setCurrentDatabase("hive_db") + assert(spark.catalog.currentDatabase == "hive_db") + assert(sessionCatalog.getCurrentDatabase == "hive_db") + val e3 = intercept[AnalysisException] { + spark.catalog.setCurrentDatabase("unknown_db") + }.getMessage + assert(e3.contains("unknown_db")) + } + + test("SPARK-39579: Three layer namespace compatibility - " + + "listFunctions, getFunction, functionExists") { + createDatabase("my_db1") + createFunction("my_func1", Some("my_db1")) + + val functions1a = spark.catalog.listFunctions("my_db1").collect().map(_.name) + val functions1b = spark.catalog.listFunctions("spark_catalog.my_db1").collect().map(_.name) + assert(functions1a.length > 200 && functions1a.contains("my_func1")) + assert(functions1b.length > 200 && functions1b.contains("my_func1")) + // functions1b contains 5 more functions: [<>, ||, !=, case, between] + assert(functions1a.intersect(functions1b) === functions1a) + + assert(spark.catalog.functionExists("my_db1.my_func1")) + assert(spark.catalog.functionExists("spark_catalog.my_db1.my_func1")) + + val func1a = spark.catalog.getFunction("my_db1.my_func1") + val func1b = spark.catalog.getFunction("spark_catalog.my_db1.my_func1") + 
assert(func1a.name === func1b.name && func1a.namespace === func1b.namespace && + func1a.className === func1b.className && func1a.isTemporary === func1b.isTemporary) + assert(func1a.catalog === null && func1b.catalog === "spark_catalog") + assert(func1a.description === null && func1b.description === "N/A.") + + val function: UnboundFunction = new UnboundFunction { + override def bind(inputType: StructType): BoundFunction = new ScalarFunction[Int] { + override def inputTypes(): Array[DataType] = Array(IntegerType) + override def resultType(): DataType = IntegerType + override def name(): String = "my_bound_function" + } + override def description(): String = "hello" + override def name(): String = "my_function" + } + + val testCatalog: InMemoryCatalog = + spark.sessionState.catalogManager.catalog("testcat").asInstanceOf[InMemoryCatalog] + testCatalog.createFunction(Identifier.of(Array("my_db2"), "my_func2"), function) + + val functions2 = spark.catalog.listFunctions("testcat.my_db2").collect().map(_.name) + assert(functions2.length > 200 && functions2.contains("my_func2")) + + assert(spark.catalog.functionExists("testcat.my_db2.my_func2")) + assert(!spark.catalog.functionExists("testcat.my_db2.my_func3")) + + val func2 = spark.catalog.getFunction("testcat.my_db2.my_func2") + assert(func2.name === "my_func2" && func2.namespace === Array("my_db2") && + func2.catalog === "testcat" && func2.description === "hello" && + func2.isTemporary === false && + func2.className.startsWith("org.apache.spark.sql.internal.CatalogSuite")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 494ae6d548..b87fee6cec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -26,7 +26,6 @@ import scala.collection.JavaConverters._ import org.mockito.ArgumentMatchers._ import 
org.mockito.Mockito._ -import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} @@ -45,8 +44,7 @@ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class JDBCSuite extends QueryTest - with BeforeAndAfter with PrivateMethodTester with SharedSparkSession { +class JDBCSuite extends QueryTest with SharedSparkSession { import testImplicits._ val url = "jdbc:h2:mem:testdb0" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala index 90ab976d9d..865d4718d6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala @@ -44,7 +44,6 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHel val tempDir = Utils.createTempDir() val url = s"jdbc:h2:${tempDir.getCanonicalPath};user=testUser;password=testPass" - var conn: java.sql.Connection = null val testH2Dialect = new JdbcDialect { override def canHandle(url: String): Boolean = H2Dialect.canHandle(url) @@ -856,11 +855,11 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHel val df11 = sql( """ |SELECT * FROM h2.test.employee - |WHERE GREATEST(bonus, 1100) > 1200 AND LEAST(salary, 10000) > 9000 AND RAND(1) < 1 + |WHERE GREATEST(bonus, 1100) > 1200 AND RAND(1) < bonus |""".stripMargin) checkFiltersRemoved(df11) checkPushedInfo(df11, "PushedFilters: " + - "[(GREATEST(BONUS, 1100.0)) > 1200.0, (LEAST(SALARY, 10000.00)) > 9000.00, RAND(1) < 1.0]") + "[BONUS IS NOT NULL, (GREATEST(BONUS, 1100.0)) > 1200.0, RAND(1) < BONUS]") checkAnswer(df11, Row(2, "david", 10000, 1300, true)) val df12 = sql( @@ -1633,43 +1632,117 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHel } 
test("scan with aggregate push-down: VAR_POP VAR_SAMP with filter and group by") { - val df = sql("SELECT VAR_POP(bonus), VAR_SAMP(bonus) FROM h2.test.employee WHERE dept > 0" + - " GROUP BY DePt") + val df = sql( + """ + |SELECT + | VAR_POP(bonus), + | VAR_POP(DISTINCT bonus), + | VAR_SAMP(bonus), + | VAR_SAMP(DISTINCT bonus) + |FROM h2.test.employee WHERE dept > 0 GROUP BY DePt""".stripMargin) checkFiltersRemoved(df) checkAggregateRemoved(df) - checkPushedInfo(df, "PushedAggregates: [VAR_POP(BONUS), VAR_SAMP(BONUS)], " + - "PushedFilters: [DEPT IS NOT NULL, DEPT > 0], PushedGroupByExpressions: [DEPT]") - checkAnswer(df, Seq(Row(10000d, 20000d), Row(2500d, 5000d), Row(0d, null))) + checkPushedInfo(df, + """ + |PushedAggregates: [VAR_POP(BONUS), VAR_POP(DISTINCT BONUS), + |VAR_SAMP(BONUS), VAR_SAMP(DISTINCT BONUS)], + |PushedFilters: [DEPT IS NOT NULL, DEPT > 0], + |PushedGroupByExpressions: [DEPT], + |""".stripMargin.replaceAll("\n", " ")) + checkAnswer(df, Seq(Row(10000d, 10000d, 20000d, 20000d), + Row(2500d, 2500d, 5000d, 5000d), Row(0d, 0d, null, null))) } test("scan with aggregate push-down: STDDEV_POP STDDEV_SAMP with filter and group by") { - val df = sql("SELECT STDDEV_POP(bonus), STDDEV_SAMP(bonus) FROM h2.test.employee" + - " WHERE dept > 0 GROUP BY DePt") + val df = sql( + """ + |SELECT + | STDDEV_POP(bonus), + | STDDEV_POP(DISTINCT bonus), + | STDDEV_SAMP(bonus), + | STDDEV_SAMP(DISTINCT bonus) + |FROM h2.test.employee WHERE dept > 0 GROUP BY DePt""".stripMargin) checkFiltersRemoved(df) checkAggregateRemoved(df) - checkPushedInfo(df, "PushedAggregates: [STDDEV_POP(BONUS), STDDEV_SAMP(BONUS)], " + - "PushedFilters: [DEPT IS NOT NULL, DEPT > 0], PushedGroupByExpressions: [DEPT]") - checkAnswer(df, Seq(Row(100d, 141.4213562373095d), Row(50d, 70.71067811865476d), Row(0d, null))) + checkPushedInfo(df, + """ + |PushedAggregates: [STDDEV_POP(BONUS), STDDEV_POP(DISTINCT BONUS), + |STDDEV_SAMP(BONUS), STDDEV_SAMP(DISTINCT BONUS)], + |PushedFilters: [DEPT IS NOT 
NULL, DEPT > 0], + |PushedGroupByExpressions: [DEPT], + |""".stripMargin.replaceAll("\n", " ")) + checkAnswer(df, Seq(Row(100d, 100d, 141.4213562373095d, 141.4213562373095d), + Row(50d, 50d, 70.71067811865476d, 70.71067811865476d), Row(0d, 0d, null, null))) } test("scan with aggregate push-down: COVAR_POP COVAR_SAMP with filter and group by") { - val df = sql("SELECT COVAR_POP(bonus, bonus), COVAR_SAMP(bonus, bonus)" + + val df1 = sql("SELECT COVAR_POP(bonus, bonus), COVAR_SAMP(bonus, bonus)" + " FROM h2.test.employee WHERE dept > 0 GROUP BY DePt") - checkFiltersRemoved(df) - checkAggregateRemoved(df) - checkPushedInfo(df, "PushedAggregates: [COVAR_POP(BONUS, BONUS), COVAR_SAMP(BONUS, BONUS)], " + + checkFiltersRemoved(df1) + checkAggregateRemoved(df1) + checkPushedInfo(df1, "PushedAggregates: [COVAR_POP(BONUS, BONUS), COVAR_SAMP(BONUS, BONUS)], " + "PushedFilters: [DEPT IS NOT NULL, DEPT > 0], PushedGroupByExpressions: [DEPT]") - checkAnswer(df, Seq(Row(10000d, 20000d), Row(2500d, 5000d), Row(0d, null))) + checkAnswer(df1, Seq(Row(10000d, 20000d), Row(2500d, 5000d), Row(0d, null))) + + val df2 = sql("SELECT COVAR_POP(DISTINCT bonus, bonus), COVAR_SAMP(DISTINCT bonus, bonus)" + + " FROM h2.test.employee WHERE dept > 0 GROUP BY DePt") + checkFiltersRemoved(df2) + checkAggregateRemoved(df2, false) + checkPushedInfo(df2, "PushedFilters: [DEPT IS NOT NULL, DEPT > 0]") + checkAnswer(df2, Seq(Row(10000d, 20000d), Row(2500d, 5000d), Row(0d, null))) } test("scan with aggregate push-down: CORR with filter and group by") { - val df = sql("SELECT CORR(bonus, bonus) FROM h2.test.employee WHERE dept > 0" + + val df1 = sql("SELECT CORR(bonus, bonus) FROM h2.test.employee WHERE dept > 0" + " GROUP BY DePt") - checkFiltersRemoved(df) - checkAggregateRemoved(df) - checkPushedInfo(df, "PushedAggregates: [CORR(BONUS, BONUS)], " + + checkFiltersRemoved(df1) + checkAggregateRemoved(df1) + checkPushedInfo(df1, "PushedAggregates: [CORR(BONUS, BONUS)], " + "PushedFilters: [DEPT IS NOT 
NULL, DEPT > 0], PushedGroupByExpressions: [DEPT]") - checkAnswer(df, Seq(Row(1d), Row(1d), Row(null))) + checkAnswer(df1, Seq(Row(1d), Row(1d), Row(null))) + + val df2 = sql("SELECT CORR(DISTINCT bonus, bonus) FROM h2.test.employee WHERE dept > 0" + + " GROUP BY DePt") + checkFiltersRemoved(df2) + checkAggregateRemoved(df2, false) + checkPushedInfo(df2, "PushedFilters: [DEPT IS NOT NULL, DEPT > 0]") + checkAnswer(df2, Seq(Row(1d), Row(1d), Row(null))) + } + + test("scan with aggregate push-down: linear regression functions with filter and group by") { + val df1 = sql( + """ + |SELECT + | REGR_INTERCEPT(bonus, bonus), + | REGR_R2(bonus, bonus), + | REGR_SLOPE(bonus, bonus), + | REGR_SXY(bonus, bonus) + |FROM h2.test.employee WHERE dept > 0 GROUP BY DePt""".stripMargin) + checkFiltersRemoved(df1) + checkAggregateRemoved(df1) + checkPushedInfo(df1, + """ + |PushedAggregates: [REGR_INTERCEPT(BONUS, BONUS), REGR_R2(BONUS, BONUS), + |REGR_SLOPE(BONUS, BONUS), REGR_SXY(BONUS, B..., + |PushedFilters: [DEPT IS NOT NULL, DEPT > 0], + |PushedGroupByExpressions: [DEPT], + |""".stripMargin.replaceAll("\n", " ")) + checkAnswer(df1, + Seq(Row(0.0, 1.0, 1.0, 20000.0), Row(0.0, 1.0, 1.0, 5000.0), Row(null, null, null, 0.0))) + + val df2 = sql( + """ + |SELECT + | REGR_INTERCEPT(DISTINCT bonus, bonus), + | REGR_R2(DISTINCT bonus, bonus), + | REGR_SLOPE(DISTINCT bonus, bonus), + | REGR_SXY(DISTINCT bonus, bonus) + |FROM h2.test.employee WHERE dept > 0 GROUP BY DePt""".stripMargin) + checkFiltersRemoved(df2) + checkAggregateRemoved(df2, false) + checkPushedInfo(df2, "PushedFilters: [DEPT IS NOT NULL, DEPT > 0], ReadSchema:") + checkAnswer(df2, + Seq(Row(0.0, 1.0, 1.0, 20000.0), Row(0.0, 1.0, 1.0, 5000.0), Row(null, null, null, 0.0))) } test("scan with aggregate push-down: aggregate over alias push down") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala index c1908d95f3..0315e03d18 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala @@ -17,11 +17,16 @@ package org.apache.spark.sql.streaming +import java.io.File + +import org.apache.commons.io.FileUtils + import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.util.Utils class StreamingDeduplicationSuite extends StateStoreMetricsTest { @@ -413,4 +418,69 @@ class StreamingDeduplicationSuite extends StateStoreMetricsTest { assertStateOperatorCustomMetric("numDroppedDuplicateRows", expected = 1) ) } + + test("SPARK-39650: duplicate with specific keys should allow input to change schema") { + withTempDir { checkpoint => + val dedupeInputData = MemoryStream[(String, Int)] + val dedupe = dedupeInputData.toDS().dropDuplicates("_1") + + testStream(dedupe, Append)( + StartStream(checkpointLocation = checkpoint.getCanonicalPath), + + AddData(dedupeInputData, "a" -> 1), + CheckLastBatch("a" -> 1), + + AddData(dedupeInputData, "a" -> 2, "b" -> 3), + CheckLastBatch("b" -> 3) + ) + + val dedupeInputData2 = MemoryStream[(String, Int, String)] + val dedupe2 = dedupeInputData2.toDS().dropDuplicates("_1") + + // initialize new memory stream with previously executed batches + dedupeInputData2.addData(("a", 1, "dummy")) + dedupeInputData2.addData(Seq(("a", 2, "dummy"), ("b", 3, "dummy"))) + + testStream(dedupe2, Append)( + StartStream(checkpointLocation = checkpoint.getCanonicalPath), + + AddData(dedupeInputData2, ("a", 5, "a"), ("b", 2, "b"), ("c", 9, "c")), + CheckLastBatch(("c", 9, "c")) + ) + } + } + + test("SPARK-39650: recovery 
from checkpoint having all columns as value schema") { + // NOTE: We are also changing the schema of input compared to the checkpoint. In the checkpoint + // we define the input schema as (String, Int). + val inputData = MemoryStream[(String, Int, String)] + val dedupe = inputData.toDS().dropDuplicates("_1") + + // The fix will land after Spark 3.3.0, hence we can check backward compatibility with + // checkpoint being built from Spark 3.3.0. + val resourceUri = this.getClass.getResource( + "/structured-streaming/checkpoint-version-3.3.0-streaming-deduplication/").toURI + + val checkpointDir = Utils.createTempDir().getCanonicalFile + // Copy the checkpoint to a temp dir to prevent changes to the original. + // Not doing this will lead to the test passing on the first run, but fail subsequent runs. + FileUtils.copyDirectory(new File(resourceUri), checkpointDir) + + inputData.addData(("a", 1, "dummy")) + inputData.addData(("a", 2, "dummy"), ("b", 3, "dummy")) + + testStream(dedupe, Append)( + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + /* + Note: The checkpoint was generated using the following input in Spark version 3.3.0 + AddData(inputData, ("a", 1)), + CheckLastBatch(("a", 1)), + AddData(inputData, ("a", 2), ("b", 3)), + CheckLastBatch(("b", 3)) + */ + + AddData(inputData, ("a", 5, "a"), ("b", 2, "b"), ("c", 9, "c")), + CheckLastBatch(("c", 9, "c")) + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index e729fe32eb..7b53b4c785 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -247,7 +247,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { test("QueryStartedEvent serialization") { def testSerialization(event: 
QueryStartedEvent): Unit = { - val json = JsonProtocol.sparkEventToJson(event) + val json = JsonProtocol.sparkEventToJsonString(event) val newEvent = JsonProtocol.sparkEventFromJson(json).asInstanceOf[QueryStartedEvent] assert(newEvent.id === event.id) assert(newEvent.runId === event.runId) @@ -263,7 +263,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { test("QueryProgressEvent serialization") { def testSerialization(event: QueryProgressEvent): Unit = { import scala.collection.JavaConverters._ - val json = JsonProtocol.sparkEventToJson(event) + val json = JsonProtocol.sparkEventToJsonString(event) val newEvent = JsonProtocol.sparkEventFromJson(json).asInstanceOf[QueryProgressEvent] assert(newEvent.progress.json === event.progress.json) // json as a proxy for equality assert(newEvent.progress.durationMs.asScala === event.progress.durationMs.asScala) @@ -275,7 +275,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { test("QueryTerminatedEvent serialization") { def testSerialization(event: QueryTerminatedEvent): Unit = { - val json = JsonProtocol.sparkEventToJson(event) + val json = JsonProtocol.sparkEventToJsonString(event) val newEvent = JsonProtocol.sparkEventFromJson(json).asInstanceOf[QueryTerminatedEvent] assert(newEvent.id === event.id) assert(newEvent.runId === event.runId) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowFunctionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowFunctionsSuite.scala index 830949dbcf..d471669f25 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowFunctionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowFunctionsSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.hive.execution.command import java.util.Locale -import org.apache.spark.sql.Row import 
org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.execution.command.v1 import org.apache.spark.sql.hive.execution.UDFToListInt @@ -38,39 +37,4 @@ class ShowFunctionsSuite extends v1.ShowFunctionsSuiteBase with CommandSuiteBase override protected def dropFunction(name: String): Unit = { sql(s"DROP FUNCTION IF EXISTS $name") } - - test("show a function by its string name") { - val testFuns = Seq("crc32i", "crc16j") - withNamespaceAndFuns("ns", testFuns) { (ns, funs) => - assert(sql(s"SHOW USER FUNCTIONS IN $ns").isEmpty) - funs.foreach(createFunction) - checkAnswer( - sql(s"SHOW USER FUNCTIONS IN $ns 'crc32i'"), - Row(showFun("ns", "crc32i"))) - } - } - - test("show functions matched to the '|' pattern") { - val testFuns = Seq("crc32i", "crc16j", "date1900", "Date1") - withNamespaceAndFuns("ns", testFuns) { (ns, funs) => - assert(sql(s"SHOW USER FUNCTIONS IN $ns").isEmpty) - funs.foreach(createFunction) - checkAnswer( - sql(s"SHOW USER FUNCTIONS IN $ns LIKE 'crc32i|date1900'"), - Seq("crc32i", "date1900").map(testFun => Row(showFun("ns", testFun)))) - checkAnswer( - sql(s"SHOW USER FUNCTIONS IN $ns LIKE 'crc32i|date*'"), - Seq("crc32i", "date1900", "Date1").map(testFun => Row(showFun("ns", testFun)))) - } - } - - test("show a function by its id") { - withNamespaceAndFun("ns", "crc32i") { (ns, fun) => - assert(sql(s"SHOW USER FUNCTIONS IN $ns").isEmpty) - createFunction(fun) - checkAnswer( - sql(s"SHOW USER FUNCTIONS $fun"), - Row(showFun("ns", "crc32i"))) - } - } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala index ce1afad7a9..733ab03f0f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala @@ -19,7 +19,7 @@ package 
org.apache.spark.streaming.api.java import org.apache.spark.streaming.Time -private[streaming] trait PythonStreamingListener{ +private[streaming] trait PythonStreamingListener { /** Called when the streaming has been started */ def onStreamingStarted(streamingStarted: JavaStreamingListenerStreamingStarted): Unit = { }