This repository has been archived by the owner on Sep 18, 2023. It is now read-only.

Commit: Make required config accessible
PHILO-HE committed Aug 19, 2022
1 parent 758246f commit 633a548
Showing 5 changed files with 61 additions and 3 deletions.
@@ -16,6 +16,8 @@
*/
package com.intel.oap.spark.sql.execution.datasources.v2.arrow

import com.intel.oap.sql.shims.SparkShimLoader

import java.util.Locale

import scala.collection.JavaConverters._
@@ -25,9 +27,10 @@ import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.connector.read.PartitionReaderFactory
import org.apache.spark.sql.execution.PartitionedFileUtil
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitioningAwareFileIndex}
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionDirectory, PartitioningAwareFileIndex}
import org.apache.spark.sql.execution.datasources.v2.FileScan
import org.apache.spark.sql.execution.datasources.v2.arrow.ScanUtils
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -71,9 +74,26 @@ case class ArrowScan(
this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters)

  // compute maxSplitBytes
  def maxSplitBytes(
      sparkSession: SparkSession,
      selectedPartitions: Seq[PartitionDirectory]): Long = {
    val defaultMaxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes
    val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes
    // val minPartitionNum = sparkSession.sessionState.conf.filesMinPartitionNum
    //   .getOrElse(sparkSession.leafNodeDefaultParallelism)
    val minPartitionNum = sparkSession.sessionState.conf.filesMinPartitionNum
      .getOrElse(SparkShimLoader.getSparkShims.leafNodeDefaultParallelism(sparkSession))
    val PREFERRED_PARTITION_SIZE_LOWER_BOUND: Long = 128 * 1024 * 1024
    val PREFERRED_PARTITION_SIZE_UPPER_BOUND: Long = 512 * 1024 * 1024
    val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum
    val bytesPerCore = totalBytes / minPartitionNum

    Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore))
  }

  override def partitions: Seq[FilePartition] = {
    val selectedPartitions = fileIndex.listFiles(partitionFilters, dataFilters)
    // val maxSplitBytes = FilePartition.maxSplitBytes(sparkSession, selectedPartitions)
    val maxSplitBytes = FilePartition.maxSplitBytes(sparkSession, selectedPartitions)
    // val partitionAttributes = fileIndex.partitionSchema.toAttributes
    val partitionAttributes = ScanUtils.toAttributes(fileIndex)
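
For orientation, the final expression in maxSplitBytes above clamps bytesPerCore between openCostInBytes and defaultMaxSplitBytes; the two PREFERRED_PARTITION_SIZE_* values are declared but not referenced in the expression shown. A self-contained sketch of the same arithmetic with hypothetical inputs (the stock Spark defaults are 128 MB for spark.sql.files.maxPartitionBytes and 4 MB for spark.sql.files.openCostInBytes):

object MaxSplitBytesSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical inputs: stock Spark defaults and 10 files of 100 MB each.
    val defaultMaxSplitBytes = 128L * 1024 * 1024   // spark.sql.files.maxPartitionBytes
    val openCostInBytes = 4L * 1024 * 1024          // spark.sql.files.openCostInBytes
    val minPartitionNum = 8                         // e.g. leafNodeDefaultParallelism
    val fileSizes = Seq.fill(10)(100L * 1024 * 1024)

    // Same formula as ArrowScan.maxSplitBytes: pad each file with the open cost,
    // spread the total over minPartitionNum cores, then clamp the result into
    // [openCostInBytes, defaultMaxSplitBytes].
    val totalBytes = fileSizes.map(_ + openCostInBytes).sum
    val bytesPerCore = totalBytes / minPartitionNum
    val maxSplitBytes = Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore))

    println(s"bytesPerCore=$bytesPerCore, maxSplitBytes=$maxSplitBytes") // clamps to 128 MB here
  }
}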
@@ -28,8 +28,7 @@ import org.apache.spark.shuffle.MigratableResolver
import org.apache.spark.shuffle.ShuffleHandle
import org.apache.spark.shuffle.api.ShuffleExecutorComponents
import org.apache.spark.shuffle.sort.SortShuffleWriter
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning}
import org.apache.spark.sql.execution.{ShufflePartitionSpec, SparkPlan}
@@ -121,4 +120,7 @@ trait SparkShims {
  def getEndMapIndexOfCoalescedMapperPartitionSpec(spec: ShufflePartitionSpec): Int

  def getNumReducersOfCoalescedMapperPartitionSpec(spec: ShufflePartitionSpec): Int

  def leafNodeDefaultParallelism(sparkSession: SparkSession): Int

}
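
The new trait method exists because spark.sql.leafNodeDefaultParallelism only appeared in Spark 3.2, which is why the Spark 3.1 shim below falls back to sparkContext.defaultParallelism; callers go through the shim instead of touching the 3.2-only API directly. A minimal sketch of the call pattern used in the ArrowScan hunk above, assuming SparkShimLoader.getSparkShims resolves the SparkShims implementation for the running Spark version (the object and helper names here are hypothetical):

import com.intel.oap.sql.shims.SparkShimLoader
import org.apache.spark.sql.SparkSession

object LeafParallelismUsage {
  // Resolve the minimum partition count the same way ArrowScan.maxSplitBytes does:
  // prefer spark.sql.files.minPartitionNum, otherwise ask the version-specific shim.
  def minPartitionNum(spark: SparkSession): Int =
    spark.sessionState.conf.filesMinPartitionNum
      .getOrElse(SparkShimLoader.getSparkShims.leafNodeDefaultParallelism(spark))
}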
@@ -205,4 +205,8 @@ class Spark311Shims extends SparkShims {
    throw new RuntimeException("This method should not be invoked in spark 3.1.")
  }

  override def leafNodeDefaultParallelism(sparkSession: SparkSession): Int = {
    sparkSession.sparkContext.defaultParallelism
  }

}
@@ -235,4 +235,8 @@ class Spark321Shims extends SparkShims {
    }
  }

  override def leafNodeDefaultParallelism(sparkSession: SparkSession): Int = {
    org.apache.spark.sql.util.ShimUtils.leafNodeDefaultParallelism(sparkSession)
  }

}
@@ -0,0 +1,28 @@
/*
 * Copyright 2020 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.util

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

object ShimUtils {

  def leafNodeDefaultParallelism(sparkSession: SparkSession): Int = {
    sparkSession.conf.get(SQLConf.LEAF_NODE_DEFAULT_PARALLELISM).getOrElse(
      sparkSession.sparkContext.defaultParallelism)
  }
}
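
Placing this helper under org.apache.spark.sql.util is what "makes the required config accessible": the typed RuntimeConfig.get(ConfigEntry) overload used here is package-private to org.apache.spark.sql, so plugin code outside that package cannot read SQLConf.LEAF_NODE_DEFAULT_PARALLELISM this way — that reading of the access rules is an assumption, not stated in the diff. A minimal usage sketch with a hypothetical local session on Spark 3.2.x:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.util.ShimUtils

object ShimUtilsUsage {
  def main(args: Array[String]): Unit = {
    // Hypothetical local session; the config is only honoured on Spark 3.2+.
    val spark = SparkSession.builder()
      .master("local[4]")
      .config("spark.sql.leafNodeDefaultParallelism", "16")
      .getOrCreate()

    // Prints 16 from the explicit setting; with the config unset it would fall
    // back to sparkContext.defaultParallelism (4 for local[4]).
    println(ShimUtils.leafNodeDefaultParallelism(spark))

    spark.stop()
  }
}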
