apache · cloud-fan · Sep 27, 2017 · Oct 4, 2017 · Oct 12, 2017 · viirya
diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java
@@ -30,9 +30,8 @@ public interface ReadSupport {
   /**
    * Creates a {@link DataSourceV2Reader} to scan the data from this data source.
    *
-   * @param options the options for this data source reader, which is an immutable case-insensitive
-   *                string-to-string map.
-   * @return a reader that implements the actual read logic.
+   * @param options the options for the returned data source reader, which is an immutable
+   *                case-insensitive string-to-string map.
    */
   DataSourceV2Reader createReader(DataSourceV2Options options);
 }
diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java
@@ -39,9 +39,8 @@ public interface ReadSupportWithSchema {
    *               physical schema of the underlying storage of this data source reader, e.g.
    *               CSV files, JSON files, etc, while this reader may not read data with full
    *               schema, as column pruning or other optimizations may happen.
-   * @param options the options for this data source reader, which is an immutable case-insensitive
-   *                string-to-string map.
-   * @return a reader that implements the actual read logic.
+   * @param options the options for the returned data source reader, which is an immutable
+   *                case-insensitive string-to-string map.
    */
   DataSourceV2Reader createReader(StructType schema, DataSourceV2Options options);
 }
diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java
@@ -24,6 +24,10 @@
 /**
  * A data reader returned by {@link ReadTask#createReader()} and is responsible for outputting data
  * for a RDD partition.
+ *
+ * Note that currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal data
+ * source readers, or {@link org.apache.spark.sql.catalyst.expressions.UnsafeRow} for data source
+ * readers that mix in {@link SupportsScanUnsafeRow}.
  */
 @InterfaceStability.Evolving
 public interface DataReader<T> extends Closeable {

diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java
@@ -30,7 +30,7 @@
  * {@link org.apache.spark.sql.sources.v2.ReadSupportWithSchema#createReader(
  * StructType, org.apache.spark.sql.sources.v2.DataSourceV2Options)}.
  * It can mix in various query optimization interfaces to speed up the data scan. The actual scan
- * logic should be delegated to {@link ReadTask}s that are returned by {@link #createReadTasks()}.
+ * logic is delegated to {@link ReadTask}s that are returned by {@link #createReadTasks()}.
  *
  * There are mainly 3 kinds of query optimizations:
  *   1. Operators push-down. E.g., filter push-down, required columns push-down(aka column

diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java
@@ -27,7 +27,8 @@
  * is similar to the relationship between {@link Iterable} and {@link java.util.Iterator}.
  *
  * Note that, the read task will be serialized and sent to executors, then the data reader will be
- * created on executors and do the actual reading.
+ * created on executors and do the actual reading. So {@link ReadTask} must be serializable and
+ * {@link DataReader} doesn't need to be.
  */
 @InterfaceStability.Evolving
 public interface ReadTask<T> extends Serializable {

diff --git a/...src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java b/...src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownCatalystFilters.java
@@ -40,4 +40,12 @@ public interface SupportsPushDownCatalystFilters {
    * Pushes down filters, and returns unsupported filters.
    */
   Expression[] pushCatalystFilters(Expression[] filters);
+
+  /**
+   * Returns the catalyst filters that are pushed in {@link #pushCatalystFilters(Expression[])}.
+   * It's possible that there are no filters in the query and
+   * {@link #pushCatalystFilters(Expression[])} is never called; an empty array should be returned
+   * in that case.
+   */
+  Expression[] pushedCatalystFilters();
 }
diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsPushDownFilters.java
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.sources.v2.reader;
 
 import org.apache.spark.annotation.InterfaceStability;
+import org.apache.spark.sql.catalyst.expressions.Expression;
 import org.apache.spark.sql.sources.Filter;
 
 /**
@@ -35,4 +36,11 @@ public interface SupportsPushDownFilters {
    * Pushes down filters, and returns unsupported filters.
    */
   Filter[] pushFilters(Filter[] filters);
+
+  /**
+   * Returns the filters that are pushed in {@link #pushFilters(Filter[])}.
+   * It's possible that there are no filters in the query and {@link #pushFilters(Filter[])}
+   * is never called; an empty array should be returned in that case.
+   */
+  Filter[] pushedFilters();
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -184,7 +184,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
 
     val cls = DataSource.lookupDataSource(source)
     if (classOf[DataSourceV2].isAssignableFrom(cls)) {
-      val dataSource = cls.newInstance()
       val options = new DataSourceV2Options(extraOptions.asJava)
 
       val reader = (cls.newInstance(), userSpecifiedSchema) match {
@@ -194,8 +193,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
         case (ds: ReadSupport, None) =>
           ds.createReader(options)
 
-        case (_: ReadSupportWithSchema, None) =>
-          throw new AnalysisException(s"A schema needs to be specified when using $dataSource.")
+        case (ds: ReadSupportWithSchema, None) =>
+          throw new AnalysisException(s"A schema needs to be specified when using $ds.")
 
         case (ds: ReadSupport, Some(schema)) =>
           val reader = ds.createReader(options)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
@@ -21,6 +21,7 @@ import org.apache.spark.sql.ExperimentalMethods
 import org.apache.spark.sql.catalyst.catalog.SessionCatalog
 import org.apache.spark.sql.catalyst.optimizer.Optimizer
 import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
+import org.apache.spark.sql.execution.datasources.v2.PushDownOperatorsToDataSource
 import org.apache.spark.sql.execution.python.ExtractPythonUDFFromAggregate
 
 class SparkOptimizer(
@@ -31,7 +32,8 @@ class SparkOptimizer(
   override def batches: Seq[Batch] = (preOptimizationBatches ++ super.batches :+
     Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+
     Batch("Extract Python UDF from Aggregate", Once, ExtractPythonUDFFromAggregate) :+
-    Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions)) ++
+    Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+
+    Batch("Push down operators to data source scan", Once, PushDownOperatorsToDataSource)) ++
     postHocOptimizationBatches :+
     Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*)
 

diff --git a/...src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala b/...src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.v2
+
+import java.util.Objects
+
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.sources.v2.reader._
+
+/**
+ * A base trait for data source reader holders that defines their equals/hashCode methods.
+ */
+trait DataSourceReaderHolder {
+  /** The candidate attributes from which [[output]] is selected by name. */
+  def fullOutput: Seq[AttributeReference]
+  /** The reader whose class, read schema and pushed filters define this holder's identity. */
+  def reader: DataSourceV2Reader
+
+  /** Filters already pushed to the reader; empty if it supports no filter push-down. */
+  private def pushedReaderFilters: Seq[Any] = reader match {
+    case r: SupportsPushDownCatalystFilters => r.pushedCatalystFilters().toSeq
+    case r: SupportsPushDownFilters => r.pushedFilters().toSeq
+    case _ => Nil
+  }
+
+  /** The values that `equals` and `hashCode` are both derived from. */
+  private def equalityState: Seq[Any] =
+    Seq(fullOutput, reader.getClass, reader.readSchema(), pushedReaderFilters)
+
+  /**
+   * Two holders are equal only if they are the same kind of node and agree on [[equalityState]].
+   * Matching on this trait instead of `DataSourceV2Relation` lets `DataSourceV2ScanExec`
+   * instances compare equal to each other and keeps `equals` symmetric across node kinds.
+   */
+  override def equals(other: Any): Boolean = other match {
+    case that: DataSourceReaderHolder =>
+      this.getClass == that.getClass && this.equalityState == that.equalityState
+    case _ => false
+  }
+
+  /** Folds the hash codes of [[equalityState]], so equal holders share a hash code. */
+  override def hashCode(): Int =
+    equalityState.map(Objects.hashCode).foldLeft(0)((a, b) => 31 * a + b)
+
+  /** The attributes of [[fullOutput]] named by the reader's read schema, in schema order. */
+  lazy val output: Seq[Attribute] = reader.readSchema().map(_.name).map { name =>
+    fullOutput.find(_.name == name).get
+  }
+}
diff --git a/...e/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/...e/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
@@ -19,11 +19,11 @@ package org.apache.spark.sql.execution.datasources.v2
 
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}
-import org.apache.spark.sql.sources.v2.reader.{DataSourceV2Reader, SupportsReportStatistics}
+import org.apache.spark.sql.sources.v2.reader._
 
 case class DataSourceV2Relation(
-    output: Seq[AttributeReference],
-    reader: DataSourceV2Reader) extends LeafNode {
+    fullOutput: Seq[AttributeReference],
+    reader: DataSourceV2Reader) extends LeafNode with DataSourceReaderHolder {
 
   override def computeStats(): Statistics = reader match {
     case r: SupportsReportStatistics =>

diff --git a/...e/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/...e/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
@@ -29,20 +29,12 @@ import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.sql.sources.v2.reader._
 import org.apache.spark.sql.types.StructType
 
+/**
+ * Physical plan node for scanning data from a data source.
+ */
 case class DataSourceV2ScanExec(
-    fullOutput: Array[AttributeReference],
-    @transient reader: DataSourceV2Reader,
-    // TODO: these 3 parameters are only used to determine the equality of the scan node, however,
-    // the reader also have this information, and ideally we can just rely on the equality of the
-    // reader. The only concern is, the reader implementation is outside of Spark and we have no
-    // control.
-    readSchema: StructType,
-    @transient filters: ExpressionSet,
-    hashPartitionKeys: Seq[String]) extends LeafExecNode {
-
-  def output: Seq[Attribute] = readSchema.map(_.name).map { name =>
-    fullOutput.find(_.name == name).get
-  }
+    fullOutput: Seq[AttributeReference],
+    @transient reader: DataSourceV2Reader) extends LeafExecNode with DataSourceReaderHolder {
 
   override def references: AttributeSet = AttributeSet.empty
 
@@ -74,7 +66,7 @@ class RowToUnsafeRowReadTask(rowReadTask: ReadTask[Row], schema: StructType)
   override def preferredLocations: Array[String] = rowReadTask.preferredLocations
 
   override def createReader: DataReader[UnsafeRow] = {
-    new RowToUnsafeDataReader(rowReadTask.createReader, RowEncoder.apply(schema))
+    new RowToUnsafeDataReader(rowReadTask.createReader, RowEncoder.apply(schema).resolveAndBind())
   }
 }
 

diff --git a/...e/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/...e/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala
@@ -29,64 +29,8 @@ import org.apache.spark.sql.sources.v2.reader._
 object DataSourceV2Strategy extends Strategy {
   // TODO: write path
   override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
-    case PhysicalOperation(projects, filters, DataSourceV2Relation(output, reader)) =>
-      val stayUpFilters: Seq[Expression] = reader match {
-        case r: SupportsPushDownCatalystFilters =>
-          r.pushCatalystFilters(filters.toArray)
-
-        case r: SupportsPushDownFilters =>
-          // A map from original Catalyst expressions to corresponding translated data source
-          // filters. If a predicate is not in this map, it means it cannot be pushed down.
-          val translatedMap: Map[Expression, Filter] = filters.flatMap { p =>
-            DataSourceStrategy.translateFilter(p).map(f => p -> f)
-          }.toMap
-
-          // Catalyst predicate expressions that cannot be converted to data source filters.
-          val nonConvertiblePredicates = filters.filterNot(translatedMap.contains)
-
-          // Data source filters that cannot be pushed down. An unhandled filter means
-          // the data source cannot guarantee the rows returned can pass the filter.
-          // As a result we must return it so Spark can plan an extra filter operator.
-          val unhandledFilters = r.pushFilters(translatedMap.values.toArray).toSet
-          val unhandledPredicates = translatedMap.filter { case (_, f) =>
-            unhandledFilters.contains(f)
-          }.keys
-
-          nonConvertiblePredicates ++ unhandledPredicates
-
-        case _ => filters
-      }
-
-      val attrMap = AttributeMap(output.zip(output))
-      val projectSet = AttributeSet(projects.flatMap(_.references))
-      val filterSet = AttributeSet(stayUpFilters.flatMap(_.references))
-
-      // Match original case of attributes.
-      // TODO: nested fields pruning
-      val requiredColumns = (projectSet ++ filterSet).toSeq.map(attrMap)
-      reader match {
-        case r: SupportsPushDownRequiredColumns =>
-          r.pruneColumns(requiredColumns.toStructType)
-        case _ =>
-      }
-
-      val scan = DataSourceV2ScanExec(
-        output.toArray,
-        reader,
-        reader.readSchema(),
-        ExpressionSet(filters),
-        Nil)
-
-      val filterCondition = stayUpFilters.reduceLeftOption(And)
-      val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan)
-
-      val withProject = if (projects == withFilter.output) {
-        withFilter
-      } else {
-        ProjectExec(projects, withFilter)
-      }
-
-      withProject :: Nil
+    case DataSourceV2Relation(output, reader) =>
+      DataSourceV2ScanExec(output, reader) :: Nil
 
     case _ => Nil
   }