Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop the in-range check at INT96 output path [databricks] #8824

Merged
merged 15 commits into from
Aug 4, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import ai.rapids.cudf._
import com.nvidia.spark.RebaseHelper
import com.nvidia.spark.rapids.Arm.withResource
import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableProducingArray
import com.nvidia.spark.rapids.shims.{ParquetFieldIdShims, ParquetTimestampNTZShims, SparkShimImpl}
import com.nvidia.spark.rapids.shims._
import org.apache.hadoop.mapreduce.{Job, OutputCommitter, TaskAttemptContext}
import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat}
import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel
Expand Down Expand Up @@ -341,38 +341,10 @@ class GpuParquetWriter(
// included in Spark's `TimestampType`.
case (cv, _) if cv.getType.isTimestampType && cv.getType != DType.TIMESTAMP_DAYS =>
val typeMillis = ParquetOutputTimestampType.TIMESTAMP_MILLIS.toString
val typeInt96 = ParquetOutputTimestampType.INT96.toString

outputTimestampType match {
case `typeMillis` if cv.getType != DType.TIMESTAMP_MILLISECONDS =>
cv.castTo(DType.TIMESTAMP_MILLISECONDS)

case `typeInt96` =>
val inRange = withResource(Scalar.fromLong(Long.MaxValue / 1000)) { upper =>
withResource(Scalar.fromLong(Long.MinValue / 1000)) { lower =>
withResource(cv.bitCastTo(DType.INT64)) { int64 =>
withResource(int64.greaterOrEqualTo(upper)) { a =>
withResource(int64.lessOrEqualTo(lower)) { b =>
a.or(b)
}
}
}
}
}
val anyInRange = withResource(inRange)(_.any())
withResource(anyInRange) { _ =>
require(!(anyInRange.isValid && anyInRange.getBoolean),
// It's the writer's responsibility to close the input batch when this
// exception is thrown.
"INT96 column contains one " +
"or more values that can overflow and will result in data " +
"corruption. Please set " +
"`spark.rapids.sql.format.parquet.writer.int96.enabled` to false " +
"so we can fallback on CPU for writing parquet but still take " +
"advantage of parquet read on the GPU.")
}
cv.copyToColumnVector() /* the input is unchanged */

// Here the value of `outputTimestampType` should be `TIMESTAMP_MICROS`
case _ => cv.copyToColumnVector() /* the input is unchanged */
}
Expand Down