From c76e822c23c73eb6c635e858ec40a53d034af05f Mon Sep 17 00:00:00 2001
From: raza
Date: Wed, 18 Oct 2023 23:15:46 +0000
Subject: [PATCH 1/9] Fixed orc_test, parquet_test and regexp_test

---
 integration_tests/src/main/python/orc_test.py | 2 ++
 integration_tests/src/main/python/parquet_test.py | 1 +
 integration_tests/src/main/python/regexp_test.py | 10 +++++-----
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py
index b66903955bd..4fdac4df082 100644
--- a/integration_tests/src/main/python/orc_test.py
+++ b/integration_tests/src/main/python/orc_test.py
@@ -19,6 +19,7 @@
 from data_gen import *
 from marks import *
 from pyspark.sql.types import *
+from spark_init_internal import spark_version
 from spark_session import with_cpu_session, is_before_spark_320, is_before_spark_330, is_spark_cdh, is_spark_340_or_later
 from parquet_test import _nested_pruning_schemas
 from conftest import is_databricks_runtime
@@ -820,6 +821,7 @@ def test_read_hive_fixed_length_char(std_input_path, data_file, reader):

 @allow_non_gpu("ProjectExec")
 @pytest.mark.skipif(is_before_spark_340(), reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
+@pytest.mark.skipif(is_databricks_runtime() and spark_version() == "3.4.1", reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
 @pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc'])
 @pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql])
 def test_project_fallback_when_reading_hive_fixed_length_char(std_input_path, data_file, reader):
diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py
index 51b327dfd70..3cf46595a31 100644
--- a/integration_tests/src/main/python/parquet_test.py
+++ b/integration_tests/src/main/python/parquet_test.py
@@ -798,6 +798,7 @@ def read_timestamp_nano_parquet(spark):
 @pytest.mark.skipif(spark_version() >= '3.2.0' and spark_version() < '3.2.4', reason='New config added in 3.2.4')
 @pytest.mark.skipif(spark_version() >= '3.3.0' and spark_version() < '3.3.2', reason='New config added in 3.3.2')
 @pytest.mark.skipif(is_databricks_runtime() and spark_version() == '3.3.2', reason='Config not in DB 12.2')
+@pytest.mark.skipif(is_databricks_runtime() and spark_version() == '3.4.1', reason='Config not in DB 13.3')
 @allow_non_gpu('FileSourceScanExec, ColumnarToRowExec')
 def test_parquet_read_nano_as_longs_true(std_input_path):
     data_path = "%s/timestamp-nanos.parquet" % (std_input_path)
diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py
index fa563d69e88..3c1e2b0df78 100644
--- a/integration_tests/src/main/python/regexp_test.py
+++ b/integration_tests/src/main/python/regexp_test.py
@@ -20,7 +20,7 @@
 from data_gen import *
 from marks import *
 from pyspark.sql.types import *
-from spark_session import is_before_spark_320, is_before_spark_350, is_jvm_charset_utf8
+from spark_session import is_before_spark_320, is_before_spark_350, is_jvm_charset_utf8, is_databricks_runtime, spark_version

 if not is_jvm_charset_utf8():
     pytestmark = [pytest.mark.regexp, pytest.mark.skip(reason=str("Current locale doesn't support UTF-8, regexp support is disabled"))]
@@ -489,7 +489,7 @@ def test_regexp_extract_no_match():
 # Spark take care of the error handling
 @allow_non_gpu('ProjectExec', 'RegExpExtract')
 def test_regexp_extract_idx_negative():
-    message = "The specified group index cannot be less than zero" if is_before_spark_350() else \
+    message = "The specified group index cannot be less than zero" if is_before_spark_350() and not (is_databricks_runtime() and spark_version() == "3.4.1") else \
         "[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) `idx` in `regexp_extract` is invalid"

     gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}')
@@ -503,7 +503,7 @@
 # Spark take care of the error handling
 @allow_non_gpu('ProjectExec', 'RegExpExtract')
 def test_regexp_extract_idx_out_of_bounds():
-    message = "Regex group count is 3, but the specified group index is 4" if is_before_spark_350() else \
+    message = "Regex group count is 3, but the specified group index is 4" if is_before_spark_350() and not (is_databricks_runtime() and spark_version() == "3.4.1") else \
         "[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) `idx` in `regexp_extract` is invalid: Expects group index between 0 and 3, but got 4."
     gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}')
     assert_gpu_and_cpu_error(
@@ -826,7 +826,7 @@ def test_regexp_extract_all_idx_positive():

 @allow_non_gpu('ProjectExec', 'RegExpExtractAll')
 def test_regexp_extract_all_idx_negative():
-    message = "The specified group index cannot be less than zero" if is_before_spark_350() else \
+    message = "The specified group index cannot be less than zero" if is_before_spark_350() and not (is_databricks_runtime() and spark_version() == "3.4.1") else \
         "[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) `idx` in `regexp_extract_all` is invalid"

     gen = mk_str_gen('[abcd]{0,3}')
@@ -839,7 +839,7 @@

 @allow_non_gpu('ProjectExec', 'RegExpExtractAll')
 def test_regexp_extract_all_idx_out_of_bounds():
-    message = "Regex group count is 2, but the specified group index is 3" if is_before_spark_350() else \
+    message = "Regex group count is 2, but the specified group index is 3" if is_before_spark_350() and not (is_databricks_runtime() and spark_version() == "3.4.1") else \
         "[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) `idx` in `regexp_extract_all` is invalid: Expects group index between 0 and 2, but got 3."
     gen = mk_str_gen('[a-d]{1,2}.{0,1}[0-9]{1,2}')
     assert_gpu_and_cpu_error(
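
[Illustrative aside, not part of the applied patch: the regexp_test hunks above switch the expected
error text for Databricks 13.3, which reports spark_version() == "3.4.1" yet already raises the
Spark 3.5 style REGEX_GROUP_INDEX error. A minimal standalone sketch of that message selection,
with plain booleans/strings standing in for the real spark_session helpers:]

    # Sketch only: mirrors the predicate used in the regexp_test hunks above.
    def expected_message(before_350, on_databricks, version):
        # DB 13.3 identifies as Spark 3.4.1 but uses the 3.5-style error class,
        # so it is carved out of the "before 3.5.0" branch.
        if before_350 and not (on_databricks and version == "3.4.1"):
            return "The specified group index cannot be less than zero"
        return ("[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) "
                "`idx` in `regexp_extract` is invalid")

    assert expected_message(True, False, "3.3.0").startswith("The specified group index")
    assert expected_message(True, True, "3.4.1").startswith("[INVALID_PARAMETER_VALUE")
    assert expected_message(False, False, "3.5.0").startswith("[INVALID_PARAMETER_VALUE")
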
From f9ec8a142542e2d9d763fea0149f3b1b542cefaf Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Wed, 25 Oct 2023 04:59:53 -0700
Subject: [PATCH 2/9] Added Support for PythonUDAF

---
 .../rapids/shims/Spark341PlusDBShims.scala | 30 ++++++++++++++++++-
 .../spark/rapids/shims/PythonUDFShim.scala | 1 +
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
index 45bcd373a75..0d460047e7e 100644
--- a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
+++ b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
@@ -122,7 +122,35 @@ trait Spark341PlusDBShims extends Spark332PlusDBShims {
       }
     ).disabledByDefault("Collect Limit replacement can be slower on the GPU, if huge number " +
       "of rows in a batch it could help by limiting the number of rows transferred from " +
-      "GPU to CPU")
+      "GPU to CPU"),
+    GpuOverrides.expr[PythonUDAF](
+      "UDF run in an external python process. Does not actually run on the GPU, but " +
+        "the transfer of data to/from it can be accelerated",
+      ExprChecks.fullAggAndProject(
+        // Different types of Pandas UDF support different sets of output type. Please refer to
+        // https://github.com/apache/spark/blob/master/python/pyspark/sql/udf.py#L98
+        // for more details.
+        // It is impossible to specify the exact type signature for each Pandas UDF type in a
+        // single expression 'PythonUDF'.
+        // So use the 'unionOfPandasUdfOut' to cover all types for Spark. The type signature of
+        // plugin is also an union of all the types of Pandas UDF.
+        (TypeSig.commonCudfTypes + TypeSig.ARRAY).nested() + TypeSig.STRUCT,
+        TypeSig.unionOfPandasUdfOut,
+        repeatingParamCheck = Some(RepeatingParamCheck(
+          "param",
+          (TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.STRUCT).nested(),
+          TypeSig.all))),
+      (a, conf, p, r) => new ExprMeta[PythonUDAF](a, conf, p, r) {
+        override def replaceMessage: String = "not block GPU acceleration"
+
+        override def noReplacementPossibleMessage(reasons: String): String =
+          s"blocks running on GPU because $reasons"
+
+        override def convertToGpu(): GpuExpression =
+          GpuPythonUDAF(a.name, a.func, a.dataType,
+            childExprs.map(_.convertToGpu()),
+            a.evalType, a.udfDeterministic, a.resultId)
+      })
   ).map(r => (r.getClassFor.asSubclass(classOf[SparkPlan]), r)).toMap

   override def getExecs: Map[Class[_ <: SparkPlan], ExecRule[_ <: SparkPlan]] =
diff --git a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
index 890aa978001..c207313268a 100644
--- a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
+++ b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
@@ -15,6 +15,7 @@
  */

 /*** spark-rapids-shim-json-lines
+{"spark": "341db"}
 {"spark": "350"}
 spark-rapids-shim-json-lines ***/
 package com.nvidia.spark.rapids.shims
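
[Illustrative aside, not part of the applied patch: the override above maps PythonUDAF, which this
Spark line plans for grouped-aggregate Pandas UDFs as I read it, to GpuPythonUDAF so the expression
does not block GPU acceleration of the surrounding plan; the UDF itself still runs in the Python
worker and only the data transfer to and from it is accelerated. An assumed, minimal PySpark example
of the kind of UDAF the rule targets (the SparkSession and toy DataFrame are stand-ins):]

    # Illustrative only: a grouped-aggregate Pandas UDF, the aggregate flavour of
    # Pandas UDF that the PythonUDAF rule above covers.
    import pandas as pd
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import pandas_udf

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ["id", "v"])

    @pandas_udf("double")
    def mean_udf(v: pd.Series) -> float:
        # Receives the whole column slice for a group and returns one scalar.
        return v.mean()

    df.groupBy("id").agg(mean_udf(df["v"])).show()
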
From 2073fbd9e0304e2dbd9fb594bd10f2763b76dabc Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Wed, 25 Oct 2023 05:10:23 -0700
Subject: [PATCH 3/9] moved the PythonUDAF override to its correct place

---
 .../spark/rapids/shims/PythonUDFShim.scala | 0
 .../rapids/shims/Spark341PlusDBShims.scala | 31 ++-----
 2 files changed, 2 insertions(+), 29 deletions(-)
 rename sql-plugin/src/main/{spark350 => spark341db}/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala (100%)

diff --git a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
similarity index 100%
rename from sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
rename to sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
diff --git a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
index 0d460047e7e..d3c688bd7a6 100644
--- a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
+++ b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
 import org.apache.spark.sql.execution.{CollectLimitExec, GlobalLimitExec, SparkPlan, TakeOrderedAndProjectExec}
 import org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS
+import org.apache.spark.sql.rapids.execution.python.GpuPythonUDAF
 import org.apache.spark.sql.rapids.GpuV1WriteUtils.GpuEmpty2Null

 trait Spark341PlusDBShims extends Spark332PlusDBShims {
@@ -122,35 +123,7 @@ trait Spark341PlusDBShims extends Spark332PlusDBShims {
       }
     ).disabledByDefault("Collect Limit replacement can be slower on the GPU, if huge number " +
       "of rows in a batch it could help by limiting the number of rows transferred from " +
-      "GPU to CPU"),
-    GpuOverrides.expr[PythonUDAF](
-      "UDF run in an external python process. Does not actually run on the GPU, but " +
-        "the transfer of data to/from it can be accelerated",
-      ExprChecks.fullAggAndProject(
-        // Different types of Pandas UDF support different sets of output type. Please refer to
-        // https://github.com/apache/spark/blob/master/python/pyspark/sql/udf.py#L98
-        // for more details.
-        // It is impossible to specify the exact type signature for each Pandas UDF type in a
-        // single expression 'PythonUDF'.
-        // So use the 'unionOfPandasUdfOut' to cover all types for Spark. The type signature of
-        // plugin is also an union of all the types of Pandas UDF.
-        (TypeSig.commonCudfTypes + TypeSig.ARRAY).nested() + TypeSig.STRUCT,
-        TypeSig.unionOfPandasUdfOut,
-        repeatingParamCheck = Some(RepeatingParamCheck(
-          "param",
-          (TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.STRUCT).nested(),
-          TypeSig.all))),
-      (a, conf, p, r) => new ExprMeta[PythonUDAF](a, conf, p, r) {
-        override def replaceMessage: String = "not block GPU acceleration"
-
-        override def noReplacementPossibleMessage(reasons: String): String =
-          s"blocks running on GPU because $reasons"
-
-        override def convertToGpu(): GpuExpression =
-          GpuPythonUDAF(a.name, a.func, a.dataType,
-            childExprs.map(_.convertToGpu()),
-            a.evalType, a.udfDeterministic, a.resultId)
-      })
+      "GPU to CPU")
   ).map(r => (r.getClassFor.asSubclass(classOf[SparkPlan]), r)).toMap

   override def getExecs: Map[Class[_ <: SparkPlan], ExecRule[_ <: SparkPlan]] =
From 36dd1ba959e458212c26cb1d4d8307333786e3f2 Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Fri, 27 Oct 2023 04:39:18 -0700
Subject: [PATCH 4/9] removed left over imports from the bad commit

---
 .../com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
index d3c688bd7a6..45bcd373a75 100644
--- a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
+++ b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
@@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
 import org.apache.spark.sql.execution.{CollectLimitExec, GlobalLimitExec, SparkPlan, TakeOrderedAndProjectExec}
 import org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS
-import org.apache.spark.sql.rapids.execution.python.GpuPythonUDAF
 import org.apache.spark.sql.rapids.GpuV1WriteUtils.GpuEmpty2Null

 trait Spark341PlusDBShims extends Spark332PlusDBShims {
From 7e0ddd6e14ac933a7612d01f197504fc7eab0348 Mon Sep 17 00:00:00 2001
From: raza
Date: Tue, 31 Oct 2023 04:57:31 +0000
Subject: [PATCH 5/9] build fix after upmerge

---
 .../sql/rapids/execution/python/GpuArrowPythonRunner.scala | 1 -
 .../rapids/execution/python/shims/GpuPythonArrowShims.scala | 3 +++
 .../rapids/execution/python/shims/GpuPythonArrowShims.scala | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
index b323ac62843..3dbf2d3db55 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
@@ -35,7 +35,6 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.rapids.execution.python.shims.{GpuArrowPythonRunner, GpuPythonArrowOutput}
 import org.apache.spark.sql.rapids.shims.ArrowUtilsShim
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.Utils

diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
index 681cdd3b11c..2fcccc4cd22 100644
--- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
+++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
@@ -47,11 +47,14 @@
 import ai.rapids.cudf._
 import com.nvidia.spark.rapids._
 import com.nvidia.spark.rapids.Arm.withResource
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
+import org.apache.arrow.vector.VectorSchemaRoot
+import org.apache.arrow.vector.ipc.ArrowStreamWriter

 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python._
 import org.apache.spark.sql.rapids.execution.python._
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.ColumnarBatch

 /**
diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
index 35fe8979d94..8f24e4b57d7 100644
--- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
+++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
@@ -25,11 +25,14 @@
 import ai.rapids.cudf._
 import com.nvidia.spark.rapids._
 import com.nvidia.spark.rapids.Arm.withResource
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
+import org.apache.arrow.vector.VectorSchemaRoot
+import org.apache.arrow.vector.ipc.ArrowStreamWriter

 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python._
 import org.apache.spark.sql.rapids.execution.python._
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.ColumnarBatch

 /**
From 27608759be060aae3109f763b3942cf7d6d223bf Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Wed, 1 Nov 2023 11:54:50 +0400
Subject: [PATCH 6/9] fixed imports

---
 .../sql/rapids/execution/python/GpuArrowPythonRunner.scala | 1 +
 .../rapids/execution/python/shims/GpuPythonArrowShims.scala | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
index 3dbf2d3db55..b323ac62843 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
@@ -35,6 +35,7 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.rapids.execution.python.shims.{GpuArrowPythonRunner, GpuPythonArrowOutput}
 import org.apache.spark.sql.rapids.shims.ArrowUtilsShim
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.Utils

diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
index 2fcccc4cd22..681cdd3b11c 100644
--- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
+++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
@@ -47,14 +47,11 @@
 import ai.rapids.cudf._
 import com.nvidia.spark.rapids._
 import com.nvidia.spark.rapids.Arm.withResource
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
-import org.apache.arrow.vector.VectorSchemaRoot
-import org.apache.arrow.vector.ipc.ArrowStreamWriter

 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python._
 import org.apache.spark.sql.rapids.execution.python._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.ColumnarBatch

 /**

From 5a8c7bd566ffc941c866c77784a61022152e9686 Mon Sep 17 00:00:00 2001
From: raza
Date: Wed, 1 Nov 2023 10:46:17 +0000
Subject: [PATCH 7/9] fix 341db

---
 .../rapids/execution/python/shims/GpuPythonArrowShims.scala | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
index 8f24e4b57d7..35fe8979d94 100644
--- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
+++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
@@ -25,14 +25,11 @@
 import ai.rapids.cudf._
 import com.nvidia.spark.rapids._
 import com.nvidia.spark.rapids.Arm.withResource
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
-import org.apache.arrow.vector.VectorSchemaRoot
-import org.apache.arrow.vector.ipc.ArrowStreamWriter

 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python._
 import org.apache.spark.sql.rapids.execution.python._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.sql.vectorized.ColumnarBatch

 /**

From cd901f66124b472cfd45a7c43854b55ad3cca7b3 Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Mon, 6 Nov 2023 11:42:51 -0800
Subject: [PATCH 8/9] Signing off

Signed-off-by: Raza Jafri

From 269f322d25d21d68b184948f581df9c01b5e2c7f Mon Sep 17 00:00:00 2001
From: raza
Date: Thu, 9 Nov 2023 18:47:37 +0000
Subject: [PATCH 9/9] enable test_read_hive_fixed_length_char for 341db

---
 integration_tests/src/main/python/orc_test.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py
index 4fdac4df082..cbb2ee9e703 100644
--- a/integration_tests/src/main/python/orc_test.py
+++ b/integration_tests/src/main/python/orc_test.py
@@ -806,8 +806,7 @@ def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, kee
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: spark.read.orc(data_path),
         conf=all_confs)
-
-@pytest.mark.skipif(is_spark_340_or_later(), reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
+@pytest.mark.skipif(is_spark_340_or_later() and (not (is_databricks_runtime() and spark_version() == "3.4.1")), reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
 @pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc'])
 @pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql])
 def test_read_hive_fixed_length_char(std_input_path, data_file, reader):
@@ -820,8 +819,7 @@ def test_read_hive_fixed_length_char(std_input_path, data_file, reader):


 @allow_non_gpu("ProjectExec")
-@pytest.mark.skipif(is_before_spark_340(), reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
-@pytest.mark.skipif(is_databricks_runtime() and spark_version() == "3.4.1", reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
+@pytest.mark.skipif(is_before_spark_340() or (is_databricks_runtime() and spark_version() == "3.4.1"), reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
 @pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc'])
 @pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql])
 def test_project_fallback_when_reading_hive_fixed_length_char(std_input_path, data_file, reader):
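
[Illustrative aside, not part of the applied patch: patch 9 folds the two stacked skipif marks on
test_project_fallback_when_reading_hive_fixed_length_char into a single mark. pytest skips a test
when any of its skipif conditions is true, so stacking marks behaves like OR-ing the conditions.
A minimal sketch with stand-in booleans for the spark_session helpers:]

    # Sketch only: stacked skipif marks are equivalent to one OR-ed condition.
    import pytest

    BEFORE_340 = False   # stand-in for is_before_spark_340()
    DB_341 = True        # stand-in for is_databricks_runtime() and spark_version() == "3.4.1"

    @pytest.mark.skipif(BEFORE_340, reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
    @pytest.mark.skipif(DB_341, reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
    def test_stacked_marks():
        assert True

    @pytest.mark.skipif(BEFORE_340 or DB_341, reason="https://github.com/NVIDIA/spark-rapids/issues/8324")
    def test_single_mark():
        # Skipped in exactly the same situations as test_stacked_marks.
        assert True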