Make state spillable in partitioned writer [databricks] #8667

Merged (38 commits) on Jul 26, 2023

Changes from 2 commits

Commits (38)
13e3ab2  Make state spillable in partitioned writer (abellina, Jun 20, 2023)
e349864  Fix import order (abellina, Jul 6, 2023)
dce0f19  Move withRestoreOnRetry outside of bufferBatchAndClose only on retry … (abellina, Jul 6, 2023)
6eeb1a0  Remove extraneous withRetryNoSplit (abellina, Jul 6, 2023)
7fd33d6  Address review comments in GpuFileFormatDataWriter (abellina, Jul 6, 2023)
e6a739b  Add comment around the anythingWritten flag in ColumnarOutputWriter (abellina, Jul 6, 2023)
f73790e  Fix leak in my revisions (abellina, Jul 6, 2023)
c056912  Add withRetryNoSplit in GpuDeltaTaskStatisticsTracker.newBatch (abellina, Jul 6, 2023)
9825536  Fix import order (abellina, Jul 6, 2023)
a921bfd  Do not close spillable in GpuDeltaTaskStatisticsTracker (abellina, Jul 7, 2023)
fbe0fb4  Pass template argument to withRetryNoSplit (abellina, Jul 7, 2023)
e5fe2e6  Upmerge (abellina, Jul 12, 2023)
4922214  Fix upmerge issues (abellina, Jul 12, 2023)
ab4633f  Add unit tests (abellina, Jul 14, 2023)
1420101  Import order (abellina, Jul 17, 2023)
c989223  Make sure to pass TaskContext (abellina, Jul 17, 2023)
57bd89d  Remove debug statements (abellina, Jul 17, 2023)
963a99a  Remove more debug logic (abellina, Jul 17, 2023)
9e3f5b3  Address review comments (abellina, Jul 18, 2023)
f9cc9ec  Ensure newBatch is called inside closeOnExcept (abellina, Jul 18, 2023)
bef9335  Add scaladoc (abellina, Jul 18, 2023)
79ce06b  Initialize RapidsBufferCatalog in FileCacheIntegrationSuite (abellina, Jul 19, 2023)
022e2d6  Tweak method name and make sure writeSpillableAndClose doesnt call ne… (abellina, Jul 20, 2023)
4fb645f  Adds a test and fix code around writeSpillableAndClose (abellina, Jul 20, 2023)
03d6105  Merge branch 'branch-23.08' of https://github.com/NVIDIA/spark-rapids… (abellina, Jul 21, 2023)
1ade1f6  Close spillable if we cant materialize the whole batch (abellina, Jul 21, 2023)
3c903d3  Close batches here for now in the test (abellina, Jul 21, 2023)
8c57ff2  We need to close batches in our mock to maintain expectations (abellina, Jul 24, 2023)
22bafa4  Batches are now closed correctly from mock (abellina, Jul 24, 2023)
8fe1d18  Make sure that we close existing sessions in SparkQueryCompreTestSuite (abellina, Jul 24, 2023)
f556fbc  Handling of session cleanup is happening at superclass (abellina, Jul 24, 2023)
fd96e54  Merge branch 'branch-23.08' of https://github.com/NVIDIA/spark-rapids… (abellina, Jul 24, 2023)
df425ce  Fixes unit test issues where the catalog/semaphore were being left in… (abellina, Jul 25, 2023)
f746816  Remove extra line (abellina, Jul 25, 2023)
3e0445b  Unused imports (abellina, Jul 25, 2023)
87f22cc  Fix issues with DeviceMemoryEventHandlerSuite (abellina, Jul 25, 2023)
54c83b3  Use RmmSparkRetrySuiteBase to reset rmm event handlers (abellina, Jul 25, 2023)
a08e5a5  Apply code review comments (abellina, Jul 26, 2023)
@@ -23,6 +23,8 @@ package com.nvidia.spark.rapids.delta

import scala.collection.mutable

import com.nvidia.spark.rapids.Arm.withResource
import com.nvidia.spark.rapids.SpillableColumnarBatch
import com.nvidia.spark.rapids.delta.shims.ShimJoinedProjection
import org.apache.hadoop.fs.Path

@@ -152,11 +154,13 @@ class GpuDeltaTaskStatisticsTracker(
})
}

override def newBatch(filePath: String, batch: ColumnarBatch): Unit = {
override def newBatch(filePath: String, spillableBatch: SpillableColumnarBatch): Unit = {
val aggBuffer = submittedFiles(filePath)
extendedRow.update(0, aggBuffer)

batchStatsToRow(batch, gpuResultsBuffer)
withResource(spillableBatch.getColumnarBatch()) { batch =>
batchStatsToRow(batch, gpuResultsBuffer)
}

extendedRow.update(1, gpuResultsBuffer)
mergeStats.target(aggBuffer).apply(extendedRow)
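
For readers skimming the hunk above: the change makes the Delta statistics tracker accept a SpillableColumnarBatch and materialize it on the GPU only while the per-batch statistics kernel runs. Below is a minimal, self-contained sketch of that pattern; SpillAwareStatsSketch and computeStatsToRow are illustrative placeholders (the real code calls batchStatsToRow, and ownership of the spillable stays with the caller, per the "Do not close spillable" commit). Import paths are taken from the diff.

import com.nvidia.spark.rapids.Arm.withResource
import com.nvidia.spark.rapids.SpillableColumnarBatch
import org.apache.spark.sql.vectorized.ColumnarBatch

// Illustrative only: not part of the PR.
class SpillAwareStatsSketch {
  // Stand-in for the real per-batch statistics kernel (batchStatsToRow).
  private def computeStatsToRow(batch: ColumnarBatch): Unit = ()

  def newBatch(filePath: String, spillableBatch: SpillableColumnarBatch): Unit = {
    // getColumnarBatch() brings the data back onto the GPU if it has spilled;
    // withResource closes that materialized view as soon as the stats are
    // computed. The spillable itself is still owned and closed by the caller.
    withResource(spillableBatch.getColumnarBatch()) { batch =>
      computeStatsToRow(batch)
    }
  }
}
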
@@ -23,6 +23,7 @@ import scala.collection.mutable
import ai.rapids.cudf.{HostBufferConsumer, HostMemoryBuffer, NvtxColor, NvtxRange, Table, TableWriter}
import com.nvidia.spark.rapids.Arm.withResource
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRestoreOnRetry, withRetry}
import org.apache.hadoop.fs.{FSDataOutputStream, Path}
import org.apache.hadoop.mapreduce.TaskAttemptContext

@@ -93,42 +94,17 @@ abstract class ColumnarOutputWriter(context: TaskAttemptContext,
true
}

/**
* Persists a columnar batch. Invoked on the executor side. When writing to dynamically
* partitioned tables, dynamic partition columns are not included in columns to be written.
*
* NOTE: This method will close `batch`. We do this because we want
* to free GPU memory after the GPU has finished encoding the data but before
* it is written to the distributed filesystem. The GPU semaphore is released
* during the distributed filesystem transfer to allow other tasks to start/continue
* GPU processing.
*/
def writeAndClose(
batch: ColumnarBatch,
private[this] def updateStatistics(
writeStartTime: Long,
gpuTime: Long,
statsTrackers: Seq[ColumnarWriteTaskStatsTracker]): Unit = {
var needToCloseBatch = true
try {
val writeStartTimestamp = System.nanoTime
val writeRange = new NvtxRange("File write", NvtxColor.YELLOW)
val gpuTime = try {
needToCloseBatch = false
writeBatch(batch)
} finally {
writeRange.close()
}

// Update statistics
val writeTime = System.nanoTime - writeStartTimestamp - gpuTime
statsTrackers.foreach {
case gpuTracker: GpuWriteTaskStatsTracker =>
gpuTracker.addWriteTime(writeTime)
gpuTracker.addGpuTime(gpuTime)
case _ =>
}
} finally {
if (needToCloseBatch) {
batch.close()
}
// Update statistics
val writeTime = System.nanoTime - writeStartTime - gpuTime
statsTrackers.foreach {
case gpuTracker: GpuWriteTaskStatsTracker =>
gpuTracker.addWriteTime(writeTime)
gpuTracker.addGpuTime(gpuTime)
case _ =>
}
}

@@ -137,93 +113,61 @@ abstract class ColumnarOutputWriter(context: TaskAttemptContext,
}

/**
* Writes the columnar batch and returns the time in ns taken to write
* Persists a columnar batch. Invoked on the executor side. When writing to dynamically
* partitioned tables, dynamic partition columns are not included in columns to be written.
*
* NOTE: This method will close `batch`. We do this because we want
* to free GPU memory after the GPU has finished encoding the data but before
* it is written to the distributed filesystem. The GPU semaphore is released
* during the distributed filesystem transfer to allow other tasks to start/continue
* GPU processing.
*
* @param batch Columnar batch that needs to be written
* @return time in ns taken to write the batch
*/
private[this] def writeBatch(batch: ColumnarBatch): Long = {
if (includeRetry) {
writeBatchWithRetry(batch)
} else {
writeBatchNoRetry(batch)
}
}

/** Apply any necessary casts before writing batch out */
def transform(cb: ColumnarBatch): Option[ColumnarBatch] = None

private[this] def writeBatchWithRetry(batch: ColumnarBatch): Long = {
val sb = SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY)
RmmRapidsRetryIterator.withRetry(sb, RmmRapidsRetryIterator.splitSpillableInHalfByRows) { sb =>
val cr = new CheckpointRestore {
override def checkpoint(): Unit = ()
override def restore(): Unit = dropBufferedData()
}
val startTimestamp = System.nanoTime
withResource(sb.getColumnarBatch()) { cb =>
def writeSpillableAndClose(
spillableBatch: SpillableColumnarBatch,
statsTrackers: Seq[ColumnarWriteTaskStatsTracker]): Unit = {
val writeStartTime = System.nanoTime
val gpuTime = if (includeRetry) {
withRetry(spillableBatch, splitSpillableInHalfByRows) { sb =>
//TODO: we should really apply the transformations to cast timestamps
// to the expected types before spilling but we need a SpillableTable
// rather than a SpillableColumnBatch to be able to do that
// See https://github.com/NVIDIA/spark-rapids/issues/8262
RmmRapidsRetryIterator.withRestoreOnRetry(cr) {
withResource(new NvtxRange(s"GPU $rangeName write", NvtxColor.BLUE)) { _ =>
transform(cb) match {
case Some(transformed) =>
// because we created a new transformed batch, we need to make sure we close it
withResource(transformed) { _ =>
scanAndWrite(transformed)
}
case _ =>
scanAndWrite(cb)
}
}
}
bufferBatchAndClose(sb.getColumnarBatch())
}.sum
} else {
withResource(spillableBatch) { _ =>
bufferBatchAndClose(spillableBatch.getColumnarBatch())
}
GpuSemaphore.releaseIfNecessary(TaskContext.get)
val gpuTime = System.nanoTime - startTimestamp
writeBufferedData()
gpuTime
}.sum
}
// we successfully buffered to host memory, release the semaphore and write
// the buffered data to the FS
GpuSemaphore.releaseIfNecessary(TaskContext.get)
writeBufferedData()
updateStatistics(writeStartTime, gpuTime, statsTrackers)
}

private[this] def writeBatchNoRetry(batch: ColumnarBatch): Long = {
var needToCloseBatch = true
try {
val startTimestamp = System.nanoTime
private[this] def bufferBatchAndClose(batch: ColumnarBatch): Long = {
val startTimestamp = System.nanoTime
withRestoreOnRetry(checkpointRestore) {
withResource(new NvtxRange(s"GPU $rangeName write", NvtxColor.BLUE)) { _ =>
transform(batch) match {
case Some(transformed) =>
// because we created a new transformed batch, we need to make sure we close it
withResource(transformed) { _ =>
scanAndWrite(transformed)
}
case _ =>
scanAndWrite(batch)
withResource(transformAndClose(batch)) { maybeTransformed =>
encodeAndBufferToHost(maybeTransformed)
}
}

// Batch is no longer needed, write process from here does not use GPU.
batch.close()
needToCloseBatch = false
GpuSemaphore.releaseIfNecessary(TaskContext.get)
val gpuTime = System.nanoTime - startTimestamp
writeBufferedData()
gpuTime
} finally {
if (needToCloseBatch) {
batch.close()
}
}
// time spent on GPU encoding to the host sink
System.nanoTime - startTimestamp
}

/** Apply any necessary casts before writing batch out */
def transformAndClose(cb: ColumnarBatch): ColumnarBatch = cb

private val checkpointRestore = new CheckpointRestore {
override def checkpoint(): Unit = ()
override def restore(): Unit = dropBufferedData()
}

private def scanAndWrite(batch: ColumnarBatch): Unit = {
private def encodeAndBufferToHost(batch: ColumnarBatch): Unit = {
withResource(GpuColumnVector.from(batch)) { table =>
scanTableBeforeWrite(table)
anythingWritten = true
@@ -238,9 +182,10 @@ abstract class ColumnarOutputWriter(context: TaskAttemptContext,
def close(): Unit = {
if (!anythingWritten) {
// This prevents writing out bad files
writeBatch(GpuColumnVector.emptyBatch(dataSchema))
bufferBatchAndClose(GpuColumnVector.emptyBatch(dataSchema))
}
tableWriter.close()
GpuSemaphore.releaseIfNecessary(TaskContext.get())
writeBufferedData()
outputStream.close()
}
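
The hunks above restructure the writer so that the batch stays spillable until the GPU actually encodes it, and so that a retried attempt can split the spillable in half by rows before trying again. A condensed sketch of that control flow follows. encodeToHostAndClose, flushHostBuffers, and recordTimes are placeholders for bufferBatchAndClose, writeBufferedData, and updateStatistics; the sketch omits the withRestoreOnRetry(checkpointRestore) wrapper the real bufferBatchAndClose uses to drop partially buffered host data before a retry, and the import paths for GpuSemaphore and the retry helpers are assumed from the package layout visible in the diff.

import com.nvidia.spark.rapids.{GpuSemaphore, SpillableColumnarBatch}
import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRetry}
import org.apache.spark.TaskContext
import org.apache.spark.sql.vectorized.ColumnarBatch

// Illustrative only: a stripped-down stand-in for ColumnarOutputWriter.
class SpillableWriteSketch {
  // Placeholder: GPU-encode the batch into host buffers, closing the batch.
  private def encodeToHostAndClose(batch: ColumnarBatch): Long = {
    val start = System.nanoTime
    batch.close()
    System.nanoTime - start
  }
  // Placeholder: stream the buffered host data to the distributed filesystem.
  private def flushHostBuffers(): Unit = ()
  // Placeholder: feed write/GPU times to the stats trackers.
  private def recordTimes(writeStartTime: Long, gpuTime: Long): Unit = ()

  def writeSpillableAndClose(spillableBatch: SpillableColumnarBatch): Unit = {
    val writeStartTime = System.nanoTime
    // Each attempt re-materializes the (possibly split) spillable batch, so an
    // OOM between attempts can be absorbed by spilling it to host memory or disk.
    val gpuTime = withRetry(spillableBatch, splitSpillableInHalfByRows) { sb =>
      encodeToHostAndClose(sb.getColumnarBatch())
    }.sum
    // Everything now sits in host buffers: release the GPU semaphore so other
    // tasks can use the GPU, then pay the filesystem cost without holding it.
    GpuSemaphore.releaseIfNecessary(TaskContext.get)
    flushHostBuffers()
    recordTimes(writeStartTime, gpuTime)
  }
}
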
@@ -320,12 +320,14 @@ class GpuParquetWriter(
}
}

override def transform(batch: ColumnarBatch): Option[ColumnarBatch] = {
val transformedCols = GpuColumnVector.extractColumns(batch).safeMap { cv =>
new GpuColumnVector(cv.dataType, deepTransformColumn(cv.getBase, cv.dataType))
.asInstanceOf[org.apache.spark.sql.vectorized.ColumnVector]
override def transformAndClose(batch: ColumnarBatch): ColumnarBatch = {
withResource(batch) { _ =>
val transformedCols = GpuColumnVector.extractColumns(batch).safeMap { cv =>
new GpuColumnVector(cv.dataType, deepTransformColumn(cv.getBase, cv.dataType))
.asInstanceOf[org.apache.spark.sql.vectorized.ColumnVector]
}
new ColumnarBatch(transformedCols)
}
Some(new ColumnarBatch(transformedCols))
}

private def deepTransformColumn(cv: ColumnVector, dt: DataType): ColumnVector = {
@@ -137,40 +137,42 @@ class GpuHiveTextWriter(override val path: String,
* This writer currently reformats timestamp and floating point
* columns.
*/
override def transform(cb: ColumnarBatch): Option[ColumnarBatch] = {
withResource(GpuColumnVector.from(cb)) { table =>
val columns = for (i <- 0 until table.getNumberOfColumns) yield {
table.getColumn(i) match {
case c if c.getType.hasTimeResolution =>
// By default, the CUDF CSV writer writes timestamps in the following format:
// "2020-09-16T22:32:01.123456Z"
// Hive's LazySimpleSerDe format expects timestamps to be formatted thus:
// "uuuu-MM-dd HH:mm:ss[.SSS...]"
// (Specifically, no `T` between `dd` and `HH`, and no `Z` at the end.)
val col = withResource(c.asStrings("%Y-%m-%d %H:%M:%S.%f")) { asStrings =>
withResource(Scalar.fromString("\\N")) { nullString =>
asStrings.replaceNulls(nullString)
override def transformAndClose(cb: ColumnarBatch): ColumnarBatch = {
withResource(cb) { _ =>
withResource(GpuColumnVector.from(cb)) { table =>
val columns = for (i <- 0 until table.getNumberOfColumns) yield {
table.getColumn(i) match {
case c if c.getType.hasTimeResolution =>
// By default, the CUDF CSV writer writes timestamps in the following format:
// "2020-09-16T22:32:01.123456Z"
// Hive's LazySimpleSerDe format expects timestamps to be formatted thus:
// "uuuu-MM-dd HH:mm:ss[.SSS...]"
// (Specifically, no `T` between `dd` and `HH`, and no `Z` at the end.)
val col = withResource(c.asStrings("%Y-%m-%d %H:%M:%S.%f")) { asStrings =>
withResource(Scalar.fromString("\\N")) { nullString =>
asStrings.replaceNulls(nullString)
}
}
}
GpuColumnVector.from(col, StringType)
case c if c.getType == DType.FLOAT32 || c.getType == DType.FLOAT64 =>
// By default, the CUDF CSV writer writes floats with value `Infinity`
// as `"Inf"`.
// Hive's LazySimplSerDe expects such values to be written as `"Infinity"`.
// All occurrences of `Inf` need to be replaced with `Infinity`.
val col = withResource(c.castTo(DType.STRING)) { asStrings =>
withResource(Scalar.fromString("Inf")) { infString =>
withResource(Scalar.fromString("Infinity")) { infinityString =>
asStrings.stringReplace(infString, infinityString)
GpuColumnVector.from(col, StringType)
case c if c.getType == DType.FLOAT32 || c.getType == DType.FLOAT64 =>
// By default, the CUDF CSV writer writes floats with value `Infinity`
// as `"Inf"`.
// Hive's LazySimplSerDe expects such values to be written as `"Infinity"`.
// All occurrences of `Inf` need to be replaced with `Infinity`.
val col = withResource(c.castTo(DType.STRING)) { asStrings =>
withResource(Scalar.fromString("Inf")) { infString =>
withResource(Scalar.fromString("Infinity")) { infinityString =>
asStrings.stringReplace(infString, infinityString)
}
}
}
}
GpuColumnVector.from(col, StringType)
case c =>
GpuColumnVector.from(c.incRefCount(), cb.column(i).dataType())
GpuColumnVector.from(col, StringType)
case c =>
GpuColumnVector.from(c.incRefCount(), cb.column(i).dataType())
}
}
new ColumnarBatch(columns.toArray, cb.numRows())
}
Some(new ColumnarBatch(columns.toArray, cb.numRows()))
}
}
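
Isolated from the writer above, this is roughly what the Inf fixup amounts to on a single float column: cast to strings, then replace cudf's "Inf" spelling with the "Infinity" spelling Hive's LazySimpleSerDe expects. The helper name and the use of ColumnVector.fromDoubles to build an input column are mine, not the PR's; the timestamp path follows the same shape using asStrings("%Y-%m-%d %H:%M:%S.%f") and replaceNulls, as shown in the diff.

import ai.rapids.cudf.{ColumnVector, DType, Scalar}
import com.nvidia.spark.rapids.Arm.withResource

// Illustrative only: not part of the PR.
object HiveInfFixupSketch {
  // Returns a STRING column where "Inf"/"-Inf" become "Infinity"/"-Infinity".
  // The caller owns (and must close) the returned column.
  def fixInfinity(values: Array[Double]): ColumnVector = {
    withResource(ColumnVector.fromDoubles(values: _*)) { doubles =>
      withResource(doubles.castTo(DType.STRING)) { asStrings =>
        withResource(Scalar.fromString("Inf")) { infString =>
          withResource(Scalar.fromString("Infinity")) { infinityString =>
            // cudf's CSV writer spells infinity as "Inf"; LazySimpleSerDe
            // expects "Infinity", so rewrite every occurrence.
            asStrings.stringReplace(infString, infinityString)
          }
        }
      }
    }
  }
}
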

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ import java.nio.charset.StandardCharsets

import scala.collection.mutable

import com.nvidia.spark.rapids.SpillableColumnarBatch
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

@@ -30,7 +31,6 @@ import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.datasources.WriteTaskStats
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.rapids.BasicColumnarWriteJobStatsTracker._
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.SerializableConfiguration

/**
Expand Down Expand Up @@ -153,8 +153,8 @@ class BasicColumnarWriteTaskStatsTracker(
}
}

override def newBatch(filePath: String, batch: ColumnarBatch): Unit = {
numRows += batch.numRows
override def newBatch(filePath: String, spillableBatch: SpillableColumnarBatch): Unit = {
numRows += spillableBatch.numRows
}

override def getFinalStats(taskCommitTime: Long): WriteTaskStats = {
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,8 +16,9 @@

package org.apache.spark.sql.rapids

import com.nvidia.spark.rapids.SpillableColumnarBatch

import org.apache.spark.sql.execution.datasources.WriteTaskStats
import org.apache.spark.sql.vectorized.ColumnarBatch

/**
* A trait for classes that are capable of collecting statistics on columnar data that's being
@@ -52,10 +53,11 @@ trait ColumnarWriteTaskStatsTracker {
/**
* Process a new column batch to update the tracked statistics accordingly.
* The batch will be written to the most recently witnessed file (via `newFile`).
*
* @param filePath Path of the file which the batch is written to.
* @param batch Current data batch to be processed.
* @param spillableBatch Current spillable data batch to be processed.
*/
def newBatch(filePath: String, batch: ColumnarBatch): Unit
def newBatch(filePath: String, spillableBatch: SpillableColumnarBatch): Unit
Reviewer comment:
Not sure about this change. At this point we're actively trying to wield the batch (we're in the middle of trying to write it), so it shouldn't be mandatory that it be spillable here, IMO. I would also like to keep the tracker API using pure Spark classes, to mirror the columnar form of the regular job/task stat trackers. If a particular stat tracker does something complex that requires spill support (e.g., Delta Lake stat gathering in its tracker), then that tracker can make the batch spillable when that makes sense.


/**
* Returns the final statistics computed so far.
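
To make the trade-off in the comment above concrete, here is a hypothetical tracker fragment written against the new signature. A tracker that only needs row counts can read them from the spillable's metadata and never touch the GPU (this is what BasicColumnarWriteTaskStatsTracker does in this PR), while a tracker that needs the data itself can materialize it under withResource, as the Delta tracker does. The class name and totalRows accessor are placeholders, not part of the PR.

import com.nvidia.spark.rapids.SpillableColumnarBatch

// Illustrative fragment only: not a full ColumnarWriteTaskStatsTracker.
class RowCountingTrackerSketch {
  private var numRows: Long = 0L

  def newBatch(filePath: String, spillableBatch: SpillableColumnarBatch): Unit = {
    // Row counts live in the spillable's metadata, so no GPU materialization
    // is needed and the batch can stay spilled.
    numRows += spillableBatch.numRows
  }

  def totalRows: Long = numRows
}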