From 310c64ceb93d042c63b0706221ad98d6d5324268 Mon Sep 17 00:00:00 2001 From: 924060929 <924060929@qq.com> Date: Mon, 6 May 2024 15:48:16 +0800 Subject: [PATCH] new distribute planner --- .../org/apache/doris/nereids/DorisParser.g4 | 1 + .../main/java/org/apache/doris/common/Id.java | 7 +- .../org/apache/doris/common/TreeNode.java | 24 ++ .../doris/common/profile/SummaryProfile.java | 11 + .../apache/doris/nereids/NereidsPlanner.java | 60 +++- .../nereids/parser/LogicalPlanBuilder.java | 3 + .../ChildrenPropertiesRegulator.java | 11 +- .../expressions/functions/table/Numbers.java | 10 + .../nereids/trees/plans/PlaceholderId.java | 7 +- .../trees/plans/commands/ExplainCommand.java | 1 + .../plans/distribute/DistributePlanner.java | 66 ++++ .../plans/distribute/DistributedPlan.java | 57 +++ .../plans/distribute/FragmentIdMapping.java | 71 ++++ .../distribute/PipelineDistributedPlan.java | 72 ++++ .../org/apache/doris/nereids/util/Utils.java | 14 + .../doris/nereids/worker/BackendWorker.java | 74 ++++ .../nereids/worker/BackendWorkerManager.java | 63 ++++ .../worker/LoadBalanceScanWorkerSelector.java | 326 ++++++++++++++++++ .../nereids/worker/ScanWorkerSelector.java | 62 ++++ .../apache/doris/nereids/worker/Worker.java | 38 ++ .../doris/nereids/worker/WorkerManager.java | 25 ++ .../nereids/worker/WorkerScanRanges.java | 33 ++ .../apache/doris/nereids/worker/Workload.java | 22 ++ .../worker/job/AbstractUnassignedJob.java | 74 ++++ .../worker/job/AbstractUnassignedScanJob.java | 192 +++++++++++ .../doris/nereids/worker/job/AssignedJob.java | 36 ++ .../worker/job/AssignedJobBuilder.java | 64 ++++ .../nereids/worker/job/BucketScanSource.java | 149 ++++++++ .../worker/job/CustomAssignmentJob.java | 29 ++ .../nereids/worker/job/DefaultScanSource.java | 111 ++++++ .../worker/job/LocalShuffleAssignedJob.java | 42 +++ .../doris/nereids/worker/job/ScanRange.java | 23 ++ .../doris/nereids/worker/job/ScanRanges.java | 104 ++++++ .../doris/nereids/worker/job/ScanSource.java | 41 
+++ .../doris/nereids/worker/job/Splittable.java | 69 ++++ .../nereids/worker/job/StaticAssignedJob.java | 96 ++++++ ...ssignedGatherScanMultiRemoteTablesJob.java | 85 +++++ .../nereids/worker/job/UnassignedJob.java | 51 +++ .../worker/job/UnassignedJobBuilder.java | 266 ++++++++++++++ .../job/UnassignedQueryConstantJob.java | 48 +++ .../job/UnassignedScanBucketOlapTableJob.java | 283 +++++++++++++++ .../job/UnassignedScanSingleOlapTableJob.java | 82 +++++ .../UnassignedScanSingleRemoteTableJob.java | 53 +++ .../worker/job/UnassignedShuffleJob.java | 123 +++++++ .../job/UnassignedSpecifyInstancesJob.java | 49 +++ .../worker/job/UninstancedScanSource.java | 36 ++ .../nereids/worker/job/WorkerScanSource.java | 34 ++ .../doris/planner/BucketSpecifyInstances.java | 33 ++ .../apache/doris/planner/DataGenScanNode.java | 4 + .../planner/DefaultSpecifyInstances.java | 33 ++ .../planner/NereidsSpecifyInstances.java | 55 +++ .../apache/doris/planner/OlapScanNode.java | 23 +- .../apache/doris/planner/PlanFragment.java | 30 ++ .../org/apache/doris/planner/PlanNode.java | 25 ++ .../org/apache/doris/planner/ScanNode.java | 9 +- .../java/org/apache/doris/qe/Coordinator.java | 141 ++++++++ .../apache/doris/qe/NereidsCoordinator.java | 64 ++++ .../org/apache/doris/qe/SessionVariable.java | 44 +++ .../org/apache/doris/qe/StmtExecutor.java | 6 +- .../distribute/colocate_union_numbers.out | 10 + .../prune_bucket_with_bucket_shuffle_join.out | 5 + .../distribute/shuffle_left_join.out | 9 + .../distribute/colocate_union_numbers.groovy | 60 ++++ ...une_bucket_with_bucket_shuffle_join.groovy | 82 +++++ .../distribute/shuffle_left_join.groovy | 82 +++++ 65 files changed, 3889 insertions(+), 24 deletions(-) create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/DistributePlanner.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/DistributedPlan.java create mode 100644 
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/FragmentIdMapping.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/PipelineDistributedPlan.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/BackendWorker.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/BackendWorkerManager.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/LoadBalanceScanWorkerSelector.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/ScanWorkerSelector.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/Worker.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/WorkerManager.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/WorkerScanRanges.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/Workload.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/AbstractUnassignedJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/AbstractUnassignedScanJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/AssignedJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/AssignedJobBuilder.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/BucketScanSource.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/CustomAssignmentJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/DefaultScanSource.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/LocalShuffleAssignedJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanRange.java create mode 100644 
fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanRanges.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanSource.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/Splittable.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/StaticAssignedJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedGatherScanMultiRemoteTablesJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedJobBuilder.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedQueryConstantJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanBucketOlapTableJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanSingleOlapTableJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanSingleRemoteTableJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedShuffleJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedSpecifyInstancesJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UninstancedScanSource.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/WorkerScanSource.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/planner/BucketSpecifyInstances.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/planner/DefaultSpecifyInstances.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/planner/NereidsSpecifyInstances.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/qe/NereidsCoordinator.java create mode 
100644 regression-test/data/nereids_syntax_p0/distribute/colocate_union_numbers.out create mode 100644 regression-test/data/nereids_syntax_p0/distribute/prune_bucket_with_bucket_shuffle_join.out create mode 100644 regression-test/data/nereids_syntax_p0/distribute/shuffle_left_join.out create mode 100644 regression-test/suites/nereids_syntax_p0/distribute/colocate_union_numbers.groovy create mode 100644 regression-test/suites/nereids_syntax_p0/distribute/prune_bucket_with_bucket_shuffle_join.groovy create mode 100644 regression-test/suites/nereids_syntax_p0/distribute/shuffle_left_join.groovy diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 31102bedf3cda0f..1a2267732e13fce 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -261,6 +261,7 @@ planType | OPTIMIZED | PHYSICAL // same type | SHAPE | MEMO + | DISTRIBUTED | ALL // default type ; diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/Id.java b/fe/fe-core/src/main/java/org/apache/doris/common/Id.java index 9d6dad50a462380..a6bd3896708058c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/Id.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/Id.java @@ -25,7 +25,7 @@ /** * Integer ids that cannot accidentally be compared with ints. 
*/ -public class Id> { +public class Id> implements Comparable> { protected final int id; public Id(int id) { @@ -62,4 +62,9 @@ public ArrayList asList() { public String toString() { return Integer.toString(id); } + + @Override + public int compareTo(Id idTypeId) { + return id - idTypeId.id; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/TreeNode.java b/fe/fe-core/src/main/java/org/apache/doris/common/TreeNode.java index 7693acf3bb33d28..1aa4f3adc11b2ea 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/TreeNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/TreeNode.java @@ -239,4 +239,28 @@ public C findFirstOf(Class cl) { return null; } + /** anyMatch */ + public boolean anyMatch(Predicate> func) { + if (func.apply(this)) { + return true; + } + + for (NodeType child : children) { + if (child.anyMatch(func)) { + return true; + } + } + return false; + } + + /** foreachDown */ + public void foreachDown(Predicate> visitor) { + if (!visitor.test(this)) { + return; + } + + for (TreeNode child : getChildren()) { + child.foreachDown(visitor); + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java b/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java index 8a7119cb2cef398..9b5ec5d4cd8afc2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java @@ -86,6 +86,7 @@ public class SummaryProfile { public static final String NEREIDS_REWRITE_TIME = "Nereids Rewrite Time"; public static final String NEREIDS_OPTIMIZE_TIME = "Nereids Optimize Time"; public static final String NEREIDS_TRANSLATE_TIME = "Nereids Translate Time"; + public static final String NEREIDS_DISTRIBUTE_TIME = "Nereids Distribute Time"; public static final String FRAGMENT_COMPRESSED_SIZE = "Fragment Compressed Size"; public static final String FRAGMENT_RPC_COUNT = "Fragment RPC Count"; 
@@ -199,6 +200,7 @@ public class SummaryProfile { private long nereidsRewriteFinishTime = -1; private long nereidsOptimizeFinishTime = -1; private long nereidsTranslateFinishTime = -1; + private long nereidsDistributeFinishTime = -1; // timestamp of query begin private long queryBeginTime = -1; // Analysis end time @@ -315,6 +317,7 @@ private void updateExecutionSummaryProfile() { executionSummaryProfile.addInfoString(NEREIDS_REWRITE_TIME, getPrettyNereidsRewriteTime()); executionSummaryProfile.addInfoString(NEREIDS_OPTIMIZE_TIME, getPrettyNereidsOptimizeTime()); executionSummaryProfile.addInfoString(NEREIDS_TRANSLATE_TIME, getPrettyNereidsTranslateTime()); + executionSummaryProfile.addInfoString(NEREIDS_DISTRIBUTE_TIME, getPrettyNereidsDistributeTime()); executionSummaryProfile.addInfoString(ANALYSIS_TIME, getPrettyTime(queryAnalysisFinishTime, queryBeginTime, TUnit.TIME_MS)); executionSummaryProfile.addInfoString(PLAN_TIME, @@ -419,6 +422,10 @@ public void setNereidsTranslateTime() { this.nereidsTranslateFinishTime = TimeUtils.getStartTimeMs(); } + public void setNereidsDistributeTime() { + this.nereidsDistributeFinishTime = TimeUtils.getStartTimeMs(); + } + public void setQueryBeginTime() { this.queryBeginTime = TimeUtils.getStartTimeMs(); } @@ -654,6 +661,10 @@ public String getPrettyNereidsTranslateTime() { return getPrettyTime(nereidsTranslateFinishTime, nereidsOptimizeFinishTime, TUnit.TIME_MS); } + public String getPrettyNereidsDistributeTime() { + return getPrettyTime(nereidsDistributeFinishTime, nereidsTranslateFinishTime, TUnit.TIME_MS); + } + private String getPrettyGetPartitionVersionTime() { if (getPartitionVersionTime == 0) { return "N/A"; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java index 829cf6512d6d676..e0d6f2f7589b136 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java @@ -52,6 +52,9 @@ import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.commands.ExplainCommand.ExplainLevel; +import org.apache.doris.nereids.trees.plans.distribute.DistributePlanner; +import org.apache.doris.nereids.trees.plans.distribute.DistributedPlan; +import org.apache.doris.nereids.trees.plans.distribute.FragmentIdMapping; import org.apache.doris.nereids.trees.plans.logical.LogicalPlan; import org.apache.doris.nereids.trees.plans.logical.LogicalSqlCache; import org.apache.doris.nereids.trees.plans.physical.PhysicalEmptyRelation; @@ -70,6 +73,7 @@ import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ResultSet; import org.apache.doris.qe.ResultSetMetaData; +import org.apache.doris.qe.SessionVariable; import org.apache.doris.qe.cache.CacheAnalyzer; import com.google.common.annotations.VisibleForTesting; @@ -102,6 +106,7 @@ public class NereidsPlanner extends Planner { private Plan rewrittenPlan; private Plan optimizedPlan; private PhysicalPlan physicalPlan; + private FragmentIdMapping distributedPlans; // The cost of optimized plan private double cost = 0; private LogicalPlanAdapter logicalPlanAdapter; @@ -130,17 +135,18 @@ public void plan(StatementBase queryStmt, org.apache.doris.thrift.TQueryOptions LogicalPlan parsedPlan = logicalPlanAdapter.getLogicalPlan(); NereidsTracer.logImportantTime("EndParsePlan"); setParsedPlan(parsedPlan); + PhysicalProperties requireProperties = buildInitRequireProperties(); statementContext.getStopwatch().start(); boolean showPlanProcess = showPlanProcess(queryStmt.getExplainOptions()); Plan resultPlan = plan(parsedPlan, requireProperties, explainLevel, showPlanProcess); statementContext.getStopwatch().stop(); setOptimizedPlan(resultPlan); - if (explainLevel.isPlanLevel) { - return; + + if (resultPlan instanceof PhysicalPlan) { + 
physicalPlan = (PhysicalPlan) resultPlan; + distribute(physicalPlan, explainLevel); } - physicalPlan = (PhysicalPlan) resultPlan; - translate(physicalPlan); } @VisibleForTesting @@ -315,7 +321,7 @@ private void optimize() { } } - private void translate(PhysicalPlan resultPlan) throws UserException { + private void splitFragments(PhysicalPlan resultPlan) throws UserException { if (resultPlan instanceof PhysicalSqlCache) { return; } @@ -360,6 +366,27 @@ private void translate(PhysicalPlan resultPlan) throws UserException { ScanNode.setVisibleVersionForOlapScanNodes(getScanNodes()); } + private void distribute(PhysicalPlan physicalPlan, ExplainLevel explainLevel) throws UserException { + boolean canUseNereidsDistributePlanner = SessionVariable.canUseNereidsDistributePlanner(); + if ((!canUseNereidsDistributePlanner && explainLevel.isPlanLevel)) { + return; + } else if ((canUseNereidsDistributePlanner && explainLevel.isPlanLevel + && (explainLevel != ExplainLevel.ALL_PLAN && explainLevel != ExplainLevel.DISTRIBUTED_PLAN))) { + return; + } + + splitFragments(physicalPlan); + + if (!canUseNereidsDistributePlanner) { + return; + } + + distributedPlans = new DistributePlanner(fragments).plan(); + if (statementContext.getConnectContext().getExecutor() != null) { + statementContext.getConnectContext().getExecutor().getSummaryProfile().setNereidsDistributeTime(); + } + } + private PhysicalPlan postProcess(PhysicalPlan physicalPlan) { return new PlanPostProcessors(cascadesContext).process(physicalPlan); } @@ -498,6 +525,17 @@ public String getExplainString(ExplainOptions explainOptions) { + "\n\n========== MATERIALIZATIONS ==========\n" + materializationStringBuilder; break; + case DISTRIBUTED_PLAN: + StringBuilder distributedPlanStringBuilder = new StringBuilder(); + + distributedPlanStringBuilder.append("========== DISTRIBUTED PLAN ==========\n"); + if (distributedPlans == null || distributedPlans.isEmpty()) { + plan = "Distributed plan not generated, please set 
enable_nereids_distribute_planner " + + "and enable_pipeline_x_engine to true"; + } else { + plan += DistributedPlan.toString(Lists.newArrayList(distributedPlans.values())) + "\n\n"; + } + break; case ALL_PLAN: plan = "========== PARSED PLAN " + getTimeMetricString(SummaryProfile::getPrettyParseSqlTime) + " ==========\n" @@ -510,7 +548,13 @@ public String getExplainString(ExplainOptions explainOptions) { + rewrittenPlan.treeString() + "\n\n" + "========== OPTIMIZED PLAN " + getTimeMetricString(SummaryProfile::getPrettyNereidsOptimizeTime) + " ==========\n" - + optimizedPlan.treeString(); + + optimizedPlan.treeString() + "\n\n"; + + if (distributedPlans != null && !distributedPlans.isEmpty()) { + plan += "========== DISTRIBUTED PLAN " + + getTimeMetricString(SummaryProfile::getPrettyNereidsDistributeTime) + " ==========\n"; + plan += DistributedPlan.toString(Lists.newArrayList(distributedPlans.values())) + "\n\n"; + } break; default: plan = super.getExplainString(explainOptions) @@ -681,6 +725,10 @@ public PhysicalPlan getPhysicalPlan() { return physicalPlan; } + public FragmentIdMapping getDistributedPlans() { + return distributedPlans; + } + public LogicalPlanAdapter getLogicalPlanAdapter() { return logicalPlanAdapter; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 077a1a4d881a2d0..6b7c6eed26a90bb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -3417,6 +3417,9 @@ private ExplainLevel parseExplainPlanType(PlanTypeContext planTypeContext) { if (planTypeContext.MEMO() != null) { return ExplainLevel.MEMO_PLAN; } + if (planTypeContext.DISTRIBUTED() != null) { + return ExplainLevel.DISTRIBUTED_PLAN; + } return ExplainLevel.ALL_PLAN; } diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/properties/ChildrenPropertiesRegulator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/properties/ChildrenPropertiesRegulator.java index 3beed014aac9109..af9174d31325ed7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/properties/ChildrenPropertiesRegulator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/properties/ChildrenPropertiesRegulator.java @@ -46,6 +46,7 @@ import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor; import org.apache.doris.nereids.util.JoinUtils; import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.SessionVariable; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; @@ -217,8 +218,8 @@ private boolean couldNotRightBucketShuffleJoin(JoinType joinType, DistributionSp } @Override - public Boolean visitPhysicalHashJoin(PhysicalHashJoin hashJoin, - Void context) { + public Boolean visitPhysicalHashJoin( + PhysicalHashJoin hashJoin, Void context) { Preconditions.checkArgument(children.size() == 2, "children.size() != 2"); Preconditions.checkArgument(childrenProperties.size() == 2); Preconditions.checkArgument(requiredProperties.size() == 2); @@ -248,7 +249,8 @@ public Boolean visitPhysicalHashJoin(PhysicalHashJoin R accept(ExpressionVisitor visitor, C context) { return visitor.visitNumbers(this, context); } + @Override + public PhysicalProperties getPhysicalProperties() { + if (SessionVariable.canUseNereidsDistributePlanner()) { + return PhysicalProperties.ANY; + } + return super.getPhysicalProperties(); + } + @Override public Numbers withChildren(List children) { Preconditions.checkArgument(children().size() == 1 diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlaceholderId.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlaceholderId.java index be3cb645fe6233c..f1d410100e16faf 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlaceholderId.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlaceholderId.java @@ -23,7 +23,7 @@ /** * placeholder id for prepared statement parameters */ -public class PlaceholderId extends Id implements Comparable { +public class PlaceholderId extends Id { public PlaceholderId(int id) { super(id); @@ -55,9 +55,4 @@ public boolean equals(Object obj) { public int hashCode() { return super.hashCode(); } - - @Override - public int compareTo(PlaceholderId o) { - return this.id - o.id; - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExplainCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExplainCommand.java index 500f1d4a09a478c..03278723d344abe 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExplainCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExplainCommand.java @@ -49,6 +49,7 @@ public enum ExplainLevel { OPTIMIZED_PLAN(true), SHAPE_PLAN(true), MEMO_PLAN(true), + DISTRIBUTED_PLAN(true), ALL_PLAN(true) ; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/DistributePlanner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/DistributePlanner.java new file mode 100644 index 000000000000000..bad620d189330cb --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/DistributePlanner.java @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.plans.distribute; + +import org.apache.doris.nereids.worker.job.AssignedJob; +import org.apache.doris.nereids.worker.job.AssignedJobBuilder; +import org.apache.doris.nereids.worker.job.UnassignedJob; +import org.apache.doris.nereids.worker.job.UnassignedJobBuilder; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.PlanFragmentId; + +import com.google.common.collect.ListMultimap; + +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; + +/** DistributePlanner */ +public class DistributePlanner { + private final List fragments; + private final FragmentIdMapping idToFragments; + + public DistributePlanner(List fragments) { + this.fragments = Objects.requireNonNull(fragments, "fragments can not be null"); + this.idToFragments = FragmentIdMapping.buildFragmentMapping(fragments); + } + + public FragmentIdMapping plan() { + FragmentIdMapping fragmentJobs = UnassignedJobBuilder.buildJobs(idToFragments); + ListMultimap instanceJobs = AssignedJobBuilder.buildJobs(fragmentJobs); + return buildDistributePlans(fragmentJobs, instanceJobs); + } + + private FragmentIdMapping buildDistributePlans( + Map idToUnassignedJobs, + ListMultimap idToAssignedJobs) { + FragmentIdMapping idToDistributedPlans = new FragmentIdMapping<>(); + for (Entry kv : idToFragments.entrySet()) { + PlanFragmentId fragmentId = kv.getKey(); + PlanFragment fragment = kv.getValue(); + + UnassignedJob fragmentJob = idToUnassignedJobs.get(fragmentId); + List instanceJobs = 
idToAssignedJobs.get(fragmentId); + + List childrenPlans = idToDistributedPlans.getByChildrenFragments(fragment); + idToDistributedPlans.put(fragmentId, new PipelineDistributedPlan(fragmentJob, instanceJobs, childrenPlans)); + } + return (FragmentIdMapping) idToDistributedPlans; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/DistributedPlan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/DistributedPlan.java new file mode 100644 index 000000000000000..f4bf53cc232ea82 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/DistributedPlan.java @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.plans.distribute; + +import org.apache.doris.nereids.trees.AbstractTreeNode; +import org.apache.doris.nereids.util.Utils; +import org.apache.doris.nereids.worker.job.UnassignedJob; + +import java.util.List; +import java.util.Objects; + +/** DistributedPlan */ +@lombok.Getter +public abstract class DistributedPlan extends AbstractTreeNode { + protected final UnassignedJob fragmentJob; + protected final List inputs; + + public DistributedPlan(UnassignedJob fragmentJob, List inputs) { + this.fragmentJob = Objects.requireNonNull(fragmentJob, "fragmentJob can not be null"); + this.inputs = Utils.fastToImmutableList(Objects.requireNonNull(inputs, "inputs can not be null")); + } + + @Override + public DistributedPlan withChildren(List children) { + throw new UnsupportedOperationException(); + } + + public abstract String toString(int displayFragmentId); + + /** toString */ + public static String toString(List distributedPlansBottomToTop) { + StringBuilder distributedPlanStringBuilder = new StringBuilder(); + int fragmentDisplayId = 0; + for (int i = distributedPlansBottomToTop.size() - 1; i >= 0; i--) { + DistributedPlan distributedPlan = distributedPlansBottomToTop.get(i); + distributedPlanStringBuilder + .append(distributedPlan.toString(fragmentDisplayId++)) + .append("\n"); + } + return distributedPlanStringBuilder.toString(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/FragmentIdMapping.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/FragmentIdMapping.java new file mode 100644 index 000000000000000..95bf36051d2033b --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/FragmentIdMapping.java @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.plans.distribute; + +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.PlanFragmentId; + +import com.google.common.collect.ImmutableList; + +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * FragmentIdMapping: + * key: PlanFragmentId + * value: T + * + * NOTE: this map should order by PlanFragmentId asc + */ +public class FragmentIdMapping extends TreeMap { + public FragmentIdMapping() { + } + + public FragmentIdMapping(Comparator comparator) { + super(comparator); + } + + public FragmentIdMapping(Map m) { + super(m); + } + + public FragmentIdMapping(SortedMap m) { + super(m); + } + + /** getByChildrenFragments */ + public List getByChildrenFragments(PlanFragment fragment) { + List children = fragment.getChildren(); + ImmutableList.Builder values = ImmutableList.builderWithExpectedSize(children.size()); + for (PlanFragment child : children) { + values.add(get(child.getFragmentId())); + } + return values.build(); + } + + public static FragmentIdMapping buildFragmentMapping(List fragments) { + FragmentIdMapping idToFragments = new FragmentIdMapping<>(); + for (PlanFragment fragment : fragments) { + 
idToFragments.put(fragment.getFragmentId(), fragment); + } + return idToFragments; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/PipelineDistributedPlan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/PipelineDistributedPlan.java new file mode 100644 index 000000000000000..d268e8da20b32f7 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/PipelineDistributedPlan.java @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.plans.distribute; + +import org.apache.doris.nereids.util.Utils; +import org.apache.doris.nereids.worker.job.AssignedJob; +import org.apache.doris.nereids.worker.job.UnassignedJob; +import org.apache.doris.thrift.TExplainLevel; + +import java.util.List; +import java.util.Objects; + +/** PipelineDistributedPlan */ +public class PipelineDistributedPlan extends DistributedPlan { + protected final List instanceJobs; + + public PipelineDistributedPlan( + UnassignedJob fragmentJob, + List instanceJobs, + List inputs) { + super(fragmentJob, inputs); + this.instanceJobs = Utils.fastToImmutableList( + Objects.requireNonNull(instanceJobs, "instanceJobs can not be null") + ); + } + + public List getInstanceJobs() { + return instanceJobs; + } + + @Override + public String toString(int displayFragmentId) { + StringBuilder instancesStr = new StringBuilder(); + for (int i = 0; i < instanceJobs.size(); i++) { + instancesStr.append(instanceJobs.get(i).toString(false)); + if (i + 1 < instanceJobs.size()) { + instancesStr.append(",\n"); + } + } + String instancesStrWithIndent = Utils.addLinePrefix(instancesStr.toString(), " "); + + String explainString = Utils.addLinePrefix( + fragmentJob.getFragment().getExplainString(TExplainLevel.VERBOSE).trim(), " " + ); + + return "PipelineDistributedPlan(\n" + + " id: " + displayFragmentId + ",\n" + + " parallel: " + instanceJobs.size() + ",\n" + + " fragmentJob: " + fragmentJob + ",\n" + + " fragment: {\n" + + " " + explainString + "\n" + + " },\n" + + " instanceJobs: [\n" + instancesStrWithIndent + "\n" + + " ]\n" + + ")"; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java index 852e148ef1d9cbf..908f37b7d87a5e7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java @@ -431,4 +431,18 @@ public static 
Optional fastReduce(List list, BiFunction> backends = Suppliers.memoize(() -> { + try { + return Env.getCurrentSystemInfo().getBackendsWithIdByCurrentCluster(); + } catch (Exception t) { + throw new NereidsException("Can not get backends: " + t, t); + } + }); + + @Override + public Worker getWorker(long backendId) { + ImmutableMap backends = this.backends.get(); + Backend backend = backends.get(backendId); + if (backend == null) { + throw new IllegalStateException("Backend " + backendId + " is not exist"); + } + return new BackendWorker(backend); + } + + @Override + public Worker randomAvailableWorker() { + try { + Reference selectedBackendId = new Reference<>(); + ImmutableMap backends = this.backends.get(); + SimpleScheduler.getHost(backends, selectedBackendId); + Backend selctedBackend = backends.get(selectedBackendId.getRef()); + return new BackendWorker(selctedBackend); + } catch (Exception t) { + throw new NereidsException("Can not get backends: " + t, t); + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/LoadBalanceScanWorkerSelector.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/LoadBalanceScanWorkerSelector.java new file mode 100644 index 000000000000000..c4df834519afc11 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/LoadBalanceScanWorkerSelector.java @@ -0,0 +1,326 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
package org.apache.doris.nereids.worker;

import org.apache.doris.common.Pair;
import org.apache.doris.nereids.exceptions.AnalysisException;
import org.apache.doris.nereids.worker.job.BucketScanSource;
import org.apache.doris.nereids.worker.job.DefaultScanSource;
import org.apache.doris.nereids.worker.job.ScanRanges;
import org.apache.doris.nereids.worker.job.UnassignedJob;
import org.apache.doris.nereids.worker.job.UnassignedScanBucketOlapTableJob;
import org.apache.doris.nereids.worker.job.UninstancedScanSource;
import org.apache.doris.planner.DataPartition;
import org.apache.doris.planner.OlapScanNode;
import org.apache.doris.planner.PlanFragment;
import org.apache.doris.planner.ScanNode;
import org.apache.doris.thrift.TExternalScanRange;
import org.apache.doris.thrift.TFileRangeDesc;
import org.apache.doris.thrift.TFileScanRange;
import org.apache.doris.thrift.TPaloScanRange;
import org.apache.doris.thrift.TScanRange;
import org.apache.doris.thrift.TScanRangeLocation;
import org.apache.doris.thrift.TScanRangeLocations;
import org.apache.doris.thrift.TScanRangeParams;

import com.google.common.base.Function;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.BiFunction;

/**
 * LoadBalanceScanWorkerSelector: selects a replica and a worker (backend) for each
 * scan range so that the accumulated scan bytes / task count per worker stays balanced.
 * Workload accounting is kept per selector instance, so one selector should be used
 * for one planning session.
 */
public class LoadBalanceScanWorkerSelector implements ScanWorkerSelector {
    private final BackendWorkerManager workerManager = new BackendWorkerManager();
    // accumulated workload per worker within this planning session
    // fix: generic parameters restored (were stripped to raw types)
    private final Map<Worker, WorkerWorkload> workloads = Maps.newLinkedHashMap();

    @Override
    public WorkerManager getWorkerManager() {
        return workerManager;
    }

    /**
     * Picks the least-loaded worker among {@code workers} and records one task on it.
     *
     * @throws AnalysisException if {@code workers} is empty
     *         (fix: previously this method could return null and record the task
     *         on a throwaway sentinel workload)
     */
    @Override
    public Worker selectMinWorkloadWorker(List<Worker> workers) {
        Worker minWorkloadWorker = null;
        WorkerWorkload minWorkload = new WorkerWorkload(Integer.MAX_VALUE, Long.MAX_VALUE);
        for (Worker worker : workers) {
            WorkerWorkload workload = getWorkload(worker);
            if (minWorkload.compareTo(workload) > 0) {
                minWorkloadWorker = worker;
                minWorkload = workload;
            }
        }
        if (minWorkloadWorker == null) {
            throw new AnalysisException("No available workers");
        }
        minWorkload.recordOneScanTask(1);
        return minWorkloadWorker;
    }

    @Override
    public Map<Worker, UninstancedScanSource> selectReplicaAndWorkerWithoutBucket(ScanNode scanNode) {
        Map<Worker, UninstancedScanSource> workerScanRanges = Maps.newLinkedHashMap();
        // allScanRangesLocations is all scan ranges in all partition which need to scan
        List<TScanRangeLocations> allScanRangesLocations = scanNode.getScanRangeLocations(0);
        for (TScanRangeLocations onePartitionOneScanRangeLocation : allScanRangesLocations) {
            // usually, the onePartitionOneScanRangeLocation is a tablet in one partition
            long bytes = getScanRangeSize(scanNode, onePartitionOneScanRangeLocation);

            WorkerScanRanges assigned = selectScanReplicaAndMinWorkloadWorker(
                    onePartitionOneScanRangeLocation, bytes);
            UninstancedScanSource scanRanges = workerScanRanges.computeIfAbsent(
                    assigned.worker,
                    w -> new UninstancedScanSource(
                            new DefaultScanSource(ImmutableMap.of(scanNode, new ScanRanges()))
                    )
            );
            DefaultScanSource scanSource = (DefaultScanSource) scanRanges.scanSource;
            scanSource.scanNodeToScanRanges.get(scanNode).addScanRanges(assigned.scanRanges);
        }
        return workerScanRanges;
    }

    @Override
    public Map<Worker, UninstancedScanSource> selectReplicaAndWorkerWithBucket(
            UnassignedScanBucketOlapTableJob unassignedJob) {
        PlanFragment fragment = unassignedJob.getFragment();
        List<ScanNode> scanNodes = unassignedJob.getScanNodes();
        List<OlapScanNode> olapScanNodes = unassignedJob.getOlapScanNodes();

        BiFunction<ScanNode, Integer, List<TScanRangeLocations>> bucketScanRangeSupplier
                = bucketScanRangeSupplier();
        Function<ScanNode, Map<Integer, Long>> bucketBytesSupplier = bucketBytesSupplier();
        // all are olap scan nodes
        if (!scanNodes.isEmpty() && scanNodes.size() == olapScanNodes.size()) {
            if (olapScanNodes.size() == 1 && fragment.isBucketShuffleJoinInput()) {
                return selectForBucket(unassignedJob, scanNodes, bucketScanRangeSupplier, bucketBytesSupplier);
            } else if (fragment.hasColocatePlanNode()) {
                return selectForBucket(unassignedJob, scanNodes, bucketScanRangeSupplier, bucketBytesSupplier);
            }
        } else if (olapScanNodes.isEmpty() && fragment.getDataPartition() == DataPartition.UNPARTITIONED) {
            return selectForBucket(unassignedJob, scanNodes, bucketScanRangeSupplier, bucketBytesSupplier);
        }
        throw new IllegalStateException(
                "Illegal bucket shuffle join or colocate join in fragment: " + fragment.getFragmentId()
        );
    }

    // maps (scanNode, bucketIndex) -> the scan range locations inside that bucket
    private BiFunction<ScanNode, Integer, List<TScanRangeLocations>> bucketScanRangeSupplier() {
        return (scanNode, bucketIndex) -> {
            if (scanNode instanceof OlapScanNode) {
                return (List) ((OlapScanNode) scanNode).bucketSeq2locations.get(bucketIndex);
            } else {
                // the backend is selected by XxxScanNode.createScanRangeLocations()
                return scanNode.getScanRangeLocations(0);
            }
        };
    }

    // maps scanNode -> (bucketIndex -> bytes to scan in that bucket)
    private Function<ScanNode, Map<Integer, Long>> bucketBytesSupplier() {
        return scanNode -> {
            if (scanNode instanceof OlapScanNode) {
                return ((OlapScanNode) scanNode).bucketSeq2Bytes;
            } else {
                // not supported yet
                return ImmutableMap.of(0, 0L);
            }
        };
    }

    private Map<Worker, UninstancedScanSource> selectForBucket(
            UnassignedJob unassignedJob, List<ScanNode> scanNodes,
            BiFunction<ScanNode, Integer, List<TScanRangeLocations>> bucketScanRangeSupplier,
            Function<ScanNode, Map<Integer, Long>> bucketBytesSupplier) {
        Map<Worker, UninstancedScanSource> assignment = Maps.newLinkedHashMap();

        Map<Integer, Long> bucketIndexToBytes = computeEachBucketScanBytes(scanNodes, bucketBytesSupplier);

        ScanNode firstScanNode = scanNodes.get(0);
        for (Entry<Integer, Long> kv : bucketIndexToBytes.entrySet()) {
            Integer bucketIndex = kv.getKey();
            long allScanNodeScanBytesInOneBucket = kv.getValue();

            // select worker once per bucket (based on the first table), then pin every
            // scan node's replicas in this bucket to that same worker
            List<TScanRangeLocations> allPartitionTabletsInOneBucketInFirstTable
                    = bucketScanRangeSupplier.apply(firstScanNode, bucketIndex);
            WorkerScanRanges replicaAndWorker = selectScanReplicaAndMinWorkloadWorker(
                    allPartitionTabletsInOneBucketInFirstTable.get(0), allScanNodeScanBytesInOneBucket);
            Worker selectedWorker = replicaAndWorker.worker;
            long workerId = selectedWorker.id();
            for (ScanNode scanNode : scanNodes) {
                List<TScanRangeLocations> allPartitionTabletsInOneBucket
                        = bucketScanRangeSupplier.apply(scanNode, bucketIndex);
                List<Pair<TScanRangeParams, Long>> selectedReplicasInOneBucket = filterReplicaByWorkerInBucket(
                        scanNode, workerId, bucketIndex, allPartitionTabletsInOneBucket
                );
                UninstancedScanSource bucketIndexToScanNodeToTablets
                        = assignment.computeIfAbsent(
                                selectedWorker,
                                worker -> new UninstancedScanSource(new BucketScanSource(Maps.newLinkedHashMap()))
                        );
                BucketScanSource scanSource = (BucketScanSource) bucketIndexToScanNodeToTablets.scanSource;
                Map<ScanNode, ScanRanges> scanNodeToScanRanges = scanSource.bucketIndexToScanNodeToTablets
                        .computeIfAbsent(bucketIndex, bucket -> Maps.newLinkedHashMap());
                ScanRanges scanRanges = scanNodeToScanRanges.computeIfAbsent(scanNode, node -> new ScanRanges());
                for (Pair<TScanRangeParams, Long> replica : selectedReplicasInOneBucket) {
                    TScanRangeParams replicaParam = replica.first;
                    Long scanBytes = replica.second;
                    scanRanges.addScanRange(replicaParam, scanBytes);
                }
            }
        }
        return assignment;
    }

    // among the replicas of one tablet, choose the one whose worker currently has the
    // smallest workload, and record the tablet's bytes on that worker
    private WorkerScanRanges selectScanReplicaAndMinWorkloadWorker(
            TScanRangeLocations tabletLocation, long tabletBytes) {
        List<TScanRangeLocation> replicaLocations = tabletLocation.getLocations();
        int replicaNum = replicaLocations.size();
        WorkerWorkload minWorkload = new WorkerWorkload(Integer.MAX_VALUE, Long.MAX_VALUE);
        Worker minWorkLoadWorker = null;
        TScanRangeLocation selectedReplicaLocation = null;

        for (int i = 0; i < replicaNum; i++) {
            TScanRangeLocation replicaLocation = replicaLocations.get(i);
            Worker worker = workerManager.getWorker(replicaLocation.getBackendId());
            if (!worker.available()) {
                continue;
            }

            WorkerWorkload workload = getWorkload(worker);
            if (workload.compareTo(minWorkload) < 0) {
                minWorkLoadWorker = worker;
                minWorkload = workload;
                selectedReplicaLocation = replicaLocation;
            }
        }
        if (minWorkLoadWorker == null) {
            throw new AnalysisException("No available workers");
        } else {
            minWorkload.recordOneScanTask(tabletBytes);
            ScanRanges scanRanges = new ScanRanges();
            TScanRangeParams scanReplicaParams =
                    ScanWorkerSelector.buildScanReplicaParams(tabletLocation, selectedReplicaLocation);
            scanRanges.addScanRange(scanReplicaParams, tabletBytes);
            return new WorkerScanRanges(minWorkLoadWorker, scanRanges);
        }
    }

    // keep only the replicas hosted on filterWorkerId; every tablet in the bucket must
    // have a replica there, otherwise bucket/colocate placement is broken
    private List<Pair<TScanRangeParams, Long>> filterReplicaByWorkerInBucket(
            ScanNode scanNode, long filterWorkerId, int bucketIndex,
            List<TScanRangeLocations> allPartitionTabletsInOneBucket) {
        List<Pair<TScanRangeParams, Long>> selectedReplicasInOneBucket = Lists.newArrayList();
        for (TScanRangeLocations onePartitionOneTabletLocation : allPartitionTabletsInOneBucket) {
            TScanRange scanRange = onePartitionOneTabletLocation.getScanRange();
            if (scanRange.getPaloScanRange() != null) {
                long tabletId = scanRange.getPaloScanRange().getTabletId();
                boolean foundTabletInThisWorker = false;
                for (TScanRangeLocation replicaLocation : onePartitionOneTabletLocation.getLocations()) {
                    if (replicaLocation.getBackendId() == filterWorkerId) {
                        TScanRangeParams scanReplicaParams = ScanWorkerSelector.buildScanReplicaParams(
                                onePartitionOneTabletLocation, replicaLocation);
                        Long replicaSize = ((OlapScanNode) scanNode).getTabletSingleReplicaSize(tabletId);
                        selectedReplicasInOneBucket.add(Pair.of(scanReplicaParams, replicaSize));
                        foundTabletInThisWorker = true;
                        break;
                    }
                }
                if (!foundTabletInThisWorker) {
                    throw new IllegalStateException(
                            "Can not find tablet " + tabletId + " in the bucket: " + bucketIndex);
                }
            } else if (onePartitionOneTabletLocation.getLocations().size() == 1) {
                TScanRangeLocation replicaLocation = onePartitionOneTabletLocation.getLocations().get(0);
                TScanRangeParams scanReplicaParams = ScanWorkerSelector.buildScanReplicaParams(
                        onePartitionOneTabletLocation, replicaLocation);
                Long replicaSize = 0L;
                selectedReplicasInOneBucket.add(Pair.of(scanReplicaParams, replicaSize));
            } else {
                throw new IllegalStateException("Unsupported");
            }
        }
        return selectedReplicasInOneBucket;
    }

    // sum each bucket's scan bytes across all scan nodes
    private Map<Integer, Long> computeEachBucketScanBytes(
            List<ScanNode> scanNodes, Function<ScanNode, Map<Integer, Long>> bucketBytesSupplier) {
        Map<Integer, Long> bucketIndexToBytes = Maps.newLinkedHashMap();
        for (ScanNode scanNode : scanNodes) {
            Map<Integer, Long> bucketSeq2Bytes = bucketBytesSupplier.apply(scanNode);
            for (Entry<Integer, Long> bucketSeq2Byte : bucketSeq2Bytes.entrySet()) {
                Integer bucketIndex = bucketSeq2Byte.getKey();
                Long scanBytes = bucketSeq2Byte.getValue();
                bucketIndexToBytes.merge(bucketIndex, scanBytes, Long::sum);
            }
        }
        return bucketIndexToBytes;
    }

    private WorkerWorkload getWorkload(Worker worker) {
        return workloads.computeIfAbsent(worker, w -> new WorkerWorkload());
    }

    // best-effort size estimate of a scan range; 0 when unknown
    private long getScanRangeSize(ScanNode scanNode, TScanRangeLocations scanRangeLocations) {
        TScanRange scanRange = scanRangeLocations.getScanRange();
        TPaloScanRange paloScanRange = scanRange.getPaloScanRange();
        if (paloScanRange != null) {
            long tabletId = paloScanRange.getTabletId();
            Long tabletBytes = ((OlapScanNode) scanNode).getTabletSingleReplicaSize(tabletId);
            return tabletBytes == null ? 0L : tabletBytes;
        }

        TExternalScanRange extScanRange = scanRange.getExtScanRange();
        if (extScanRange != null) {
            TFileScanRange fileScanRange = extScanRange.getFileScanRange();
            long size = 0;
            for (TFileRangeDesc range : fileScanRange.getRanges()) {
                size += range.getSize();
            }
            return size;
        }

        return 0L;
    }

    /** Mutable per-worker workload counter used only inside this selector. */
    private static class WorkerWorkload implements Comparable<WorkerWorkload> {
        private int taskNum;
        private long scanBytes;

        public WorkerWorkload() {
            this(0, 0);
        }

        public WorkerWorkload(int taskNum, long scanBytes) {
            this.taskNum = taskNum;
            this.scanBytes = scanBytes;
        }

        public void recordOneScanTask(long scanBytes) {
            this.scanBytes += scanBytes;
        }

        // order by scanBytes asc, taskNum asc
        @Override
        public int compareTo(WorkerWorkload workerWorkload) {
            int compareScanBytes = Long.compare(this.scanBytes, workerWorkload.scanBytes);
            if (compareScanBytes != 0) {
                return compareScanBytes;
            }
            // fix: use Integer.compare instead of subtraction (overflow-safe idiom)
            return Integer.compare(this.taskNum, workerWorkload.taskNum);
        }
    }
}
package org.apache.doris.nereids.worker;

import org.apache.doris.nereids.worker.job.UnassignedScanBucketOlapTableJob;
import org.apache.doris.nereids.worker.job.UninstancedScanSource;
import org.apache.doris.planner.ScanNode;
import org.apache.doris.thrift.TScanRangeLocation;
import org.apache.doris.thrift.TScanRangeLocations;
import org.apache.doris.thrift.TScanRangeParams;

import java.util.List;
import java.util.Map;

/**
 * ScanWorkerSelector: strategy that chooses, for each scan range, which replica to read
 * and which worker (backend) should execute the reading instance.
 */
public interface ScanWorkerSelector {
    WorkerManager getWorkerManager();

    /** Returns the least-loaded worker among the given candidates. */
    Worker selectMinWorkloadWorker(List<Worker> workers);

    // for a scan node, select replica for each scan range(denote tablet if the ScanNode is OlapScanNode),
    // use the replica location to build a worker execute the instance
    // fix: generic parameters restored (was raw Map)
    Map<Worker, UninstancedScanSource> selectReplicaAndWorkerWithoutBucket(ScanNode scanNode);

    // return
    //   key: Worker, the backend which will process this fragment
    //   value.key: Integer, the bucket index, from 0 to (bucket_num - 1)
    //              for example, create table statement contains: distributed by hash(id) buckets 10,
    //              the bucket index will from 0 to 9
    //   value.value.key: ScanNode, which ScanNode the worker will process scan task
    //   value.value.value: ScanRanges, the tablets in current bucket,
    //              for example, colocate table `tbl` has 2 range partitions:
    //              p1 values[(1), (10)) and p2 values[(10), 11) with integer partition column part,
    //              and distributed by hash(id) buckets 10. And, so, there has 10 buckets from bucket 0 to
    //              bucket 9, and every bucket contains two tablets, because there are two partitions.
    Map<Worker, UninstancedScanSource> selectReplicaAndWorkerWithBucket(
            UnassignedScanBucketOlapTableJob unassignedJob);

    /** Builds the thrift scan-range parameter for one selected replica of one tablet. */
    static TScanRangeParams buildScanReplicaParams(
            TScanRangeLocations tabletLocation, TScanRangeLocation replicaLocation) {
        TScanRangeParams replicaParam = new TScanRangeParams();
        replicaParam.scan_range = tabletLocation.scan_range;
        // Volume is optional, so we need to set the value and the is-set bit
        replicaParam.setVolumeId(replicaLocation.volume_id);
        return replicaParam;
    }
}
+ boolean available(); + + @Override + default int compareTo(Worker worker) { + return address().compareTo(worker.address()); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/WorkerManager.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/WorkerManager.java new file mode 100644 index 000000000000000..5db890d78227778 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/WorkerManager.java @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker; + +/** WorkerManager */ +public interface WorkerManager { + Worker getWorker(long backendId); + + Worker randomAvailableWorker(); +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/WorkerScanRanges.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/WorkerScanRanges.java new file mode 100644 index 000000000000000..7f861d33cfc13c2 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/WorkerScanRanges.java @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
package org.apache.doris.nereids.worker;

import org.apache.doris.nereids.worker.job.ScanRanges;

import java.util.Objects;

/**
 * WorkerScanRanges: pairs a selected worker with the scan ranges it should read.
 * Both fields are immutable and non-null.
 */
public class WorkerScanRanges {
    public final Worker worker;
    public final ScanRanges scanRanges;

    public WorkerScanRanges(Worker worker, ScanRanges scanRanges) {
        // fix: the null-check message previously said "scanRangeParams" for the worker argument
        this.worker = Objects.requireNonNull(worker, "worker can not be null");
        this.scanRanges = Objects.requireNonNull(scanRanges, "scanRanges can not be null");
    }
}
package org.apache.doris.nereids.worker;

/**
 * Workload: a measure of how busy a worker is; comparable so schedulers can
 * pick the least-loaded worker.
 */
public interface Workload extends Comparable<Workload> {
    // fix: generic parameter restored (was raw Comparable)
}
package org.apache.doris.nereids.worker.job;

import org.apache.doris.nereids.trees.AbstractTreeNode;
import org.apache.doris.nereids.util.Utils;
import org.apache.doris.planner.ExchangeNode;
import org.apache.doris.planner.PlanFragment;
import org.apache.doris.planner.ScanNode;

import com.google.common.collect.ListMultimap;

import java.util.List;
import java.util.Objects;

/**
 * AbstractUnassignedJob: base class of fragment-level jobs that have not yet been
 * assigned to workers. Holds the plan fragment, its scan nodes, and the mapping
 * from each exchange node to the child jobs feeding it (the tree children).
 */
public abstract class AbstractUnassignedJob
        extends AbstractTreeNode<UnassignedJob> implements UnassignedJob {
    protected final PlanFragment fragment;
    protected final List<ScanNode> scanNodes;
    // fix: generic parameters restored (was raw ListMultimap)
    protected final ListMultimap<ExchangeNode, UnassignedJob> exchangeToChildJob;

    /**
     * @param fragment the plan fragment this job will execute, must not be null
     * @param scanNodes scan nodes contained in the fragment (may be empty)
     * @param exchangeToChildJob child jobs keyed by the exchange node that consumes them;
     *        the multimap's values become this tree node's children
     */
    public AbstractUnassignedJob(PlanFragment fragment, List<ScanNode> scanNodes,
            ListMultimap<ExchangeNode, UnassignedJob> exchangeToChildJob) {
        super(Utils.fastToImmutableList(exchangeToChildJob.values()));
        this.fragment = Objects.requireNonNull(fragment, "fragment can not be null");
        this.scanNodes = Utils.fastToImmutableList(
                Objects.requireNonNull(scanNodes, "scanNodes can not be null")
        );
        this.exchangeToChildJob
                = Objects.requireNonNull(exchangeToChildJob, "exchangeToChildJob can not be null");
    }

    @Override
    public PlanFragment getFragment() {
        return fragment;
    }

    @Override
    public List<ScanNode> getScanNodes() {
        return scanNodes;
    }

    @Override
    public ListMultimap<ExchangeNode, UnassignedJob> getExchangeToChildJob() {
        return exchangeToChildJob;
    }

    @Override
    public String toString() {
        return getClass().getSimpleName();
    }

    /** Unassigned jobs are built once; replacing children is intentionally unsupported. */
    @Override
    public UnassignedJob withChildren(List<UnassignedJob> children) {
        throw new UnsupportedOperationException();
    }
}
package org.apache.doris.nereids.worker.job;

import org.apache.doris.nereids.worker.Worker;
import org.apache.doris.nereids.worker.WorkerManager;
import org.apache.doris.planner.ExchangeNode;
import org.apache.doris.planner.OlapScanNode;
import org.apache.doris.planner.PlanFragment;
import org.apache.doris.planner.ScanNode;
import org.apache.doris.qe.ConnectContext;

import com.google.common.base.Preconditions;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;

import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

/**
 * AbstractUnassignedScanJob: base class of unassigned jobs whose fragment contains
 * scan nodes. Assignment happens in two steps: distribute scan sources across
 * machines (workers), then parallelize into instances inside each machine.
 */
public abstract class AbstractUnassignedScanJob extends AbstractUnassignedJob {
    public AbstractUnassignedScanJob(PlanFragment fragment,
            List<ScanNode> scanNodes, ListMultimap<ExchangeNode, UnassignedJob> exchangeToChildJob) {
        super(fragment, scanNodes, exchangeToChildJob);
    }

    @Override
    public List<AssignedJob> computeAssignedJobs(WorkerManager workerManager,
            ListMultimap<ExchangeNode, AssignedJob> inputJobs) {

        // step 1: which worker scans which data (fix: generic parameters restored)
        Map<Worker, UninstancedScanSource> workerToScanSource = multipleMachinesParallelization(
                workerManager, inputJobs);

        // step 2: how many instances per worker, and which instance scans which part
        return insideMachineParallelization(workerToScanSource, inputJobs);
    }

    /** Distributes the scan sources across workers; implemented per scan-job kind. */
    protected abstract Map<Worker, UninstancedScanSource> multipleMachinesParallelization(
            WorkerManager workerManager, ListMultimap<ExchangeNode, AssignedJob> inputJobs);

    /** Splits each worker's scan source into per-instance jobs. */
    protected List<AssignedJob> insideMachineParallelization(
            Map<Worker, UninstancedScanSource> workerToScanRanges,
            ListMultimap<ExchangeNode, AssignedJob> inputJobs) {

        boolean useLocalShuffleToAddParallel = useLocalShuffleToAddParallel(workerToScanRanges);
        int instanceIndexInFragment = 0;
        int shareScanIndex = 0;
        List<AssignedJob> instances = Lists.newArrayList();
        for (Entry<Worker, UninstancedScanSource> entry : workerToScanRanges.entrySet()) {
            Worker worker = entry.getKey();

            // the scanRanges which this worker should scan,
            // for example:
            // {
            //   scan tbl1: [tablet_10001, tablet_10002, tablet_10003, tablet_10004] // no instances
            // }
            ScanSource scanSource = entry.getValue().scanSource;

            // usually, its tablets num, or buckets num
            int scanSourceMaxParallel = scanSource.maxParallel(scanNodes);

            // now we should compute how many instances to process the data,
            // for example: two instances
            int instanceNum = degreeOfParallelism(scanSourceMaxParallel);

            List<ScanSource> instanceToScanRanges;
            if (useLocalShuffleToAddParallel) {
                // only generate one instance to scan all data, in this step
                instanceToScanRanges = scanSource.parallelize(
                        scanNodes, 1
                );

                // Some tablets too big, we need add parallel to process these tablets after scan,
                // for example, use one OlapScanNode to scan data, and use some local instances
                // to process Aggregation parallel. We call it `share scan`. Backend will know this
                // instances share the same ScanSource, and will not scan same data multiple times.
                //
                // +-------------------------------- same fragment in one host -------------------------------------+
                // |                instance1      instance2     instance3     instance4                            |
                // |                    \               \           /               /                               |
                // |                                                                                                |
                // |                                     OlapScanNode                                               |
                // |(share scan node, and local shuffle data to other local instances to parallel compute this data)|
                // +------------------------------------------------------------------------------------------------+
                ScanSource shareScanSource = instanceToScanRanges.get(0);
                for (int i = 0; i < instanceNum; i++) {
                    // one scan range generate multiple instances,
                    // different instances reference the same scan source
                    LocalShuffleAssignedJob instance = new LocalShuffleAssignedJob(
                            instanceIndexInFragment++, shareScanIndex, this, worker, shareScanSource);
                    instances.add(instance);
                }
                shareScanIndex++;
            } else {
                // split the scanRanges to some partitions, one partition for one instance
                // for example:
                //  [
                //     scan tbl1: [tablet_10001, tablet_10003], // instance 1
                //     scan tbl1: [tablet_10002, tablet_10004]  // instance 2
                //  ]
                instanceToScanRanges = scanSource.parallelize(
                        scanNodes, instanceNum
                );

                for (ScanSource instanceToScanRange : instanceToScanRanges) {
                    instances.add(assignWorkerAndDataSources(instanceIndexInFragment++, worker, instanceToScanRange));
                }
            }
        }

        return instances;
    }

    /** Whether to use one shared scan plus local-shuffle instances instead of per-instance scans. */
    protected boolean useLocalShuffleToAddParallel(Map<Worker, UninstancedScanSource> workerToScanRanges) {
        if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().isForceToLocalShuffle()) {
            return true;
        }
        return parallelTooLittle(workerToScanRanges);
    }

    /** True when the natural scan parallelism is too small to saturate the configured DOP. */
    protected boolean parallelTooLittle(Map<Worker, UninstancedScanSource> workerToScanRanges) {
        if (this instanceof UnassignedScanBucketOlapTableJob) {
            return scanRangesToLittle(workerToScanRanges) && bucketsTooLittle(workerToScanRanges);
        } else if (this instanceof UnassignedScanSingleOlapTableJob
                || this instanceof UnassignedScanSingleRemoteTableJob) {
            return scanRangesToLittle(workerToScanRanges);
        } else {
            return false;
        }
    }

    // NOTE(review): name has a typo ("ToLittle" vs "TooLittle"); kept for subclass
    // compatibility since the method is protected
    protected boolean scanRangesToLittle(
            Map<Worker, UninstancedScanSource> workerToScanRanges) {
        ConnectContext context = ConnectContext.get();
        int backendNum = workerToScanRanges.size();
        for (ScanNode scanNode : scanNodes) {
            if (!scanNode.ignoreStorageDataDistribution(context, backendNum)) {
                return false;
            }
        }
        return true;
    }

    /** Computes the instance count for one worker, capped by the scan source's max parallelism. */
    protected int degreeOfParallelism(int maxParallel) {
        Preconditions.checkArgument(maxParallel > 0, "maxParallel must be positive");
        if (!fragment.getDataPartition().isPartitioned()) {
            return 1;
        }
        if (scanNodes.size() == 1 && scanNodes.get(0) instanceof OlapScanNode) {
            OlapScanNode olapScanNode = (OlapScanNode) scanNodes.get(0);
            // if the scan node have limit and no conjuncts, only need 1 instance to save cpu and mem resource,
            // e.g. select * from tbl limit 10
            ConnectContext connectContext = ConnectContext.get();
            if (connectContext != null && olapScanNode.shouldUseOneInstance(connectContext)) {
                return 1;
            }
        }

        // the scan instance num should not larger than the tablets num
        return Math.min(maxParallel, Math.max(fragment.getParallelExecNum(), 1));
    }

    /** True when every worker's bucket count is below the fragment's parallel exec num. */
    protected boolean bucketsTooLittle(Map<Worker, UninstancedScanSource> workerToScanRanges) {
        int parallelExecNum = fragment.getParallelExecNum();
        for (UninstancedScanSource uninstancedScanSource : workerToScanRanges.values()) {
            ScanSource scanSource = uninstancedScanSource.scanSource;
            if (scanSource instanceof BucketScanSource) {
                BucketScanSource bucketScanSource = (BucketScanSource) scanSource;
                int bucketNum = bucketScanSource.bucketIndexToScanNodeToTablets.size();
                if (bucketNum >= parallelExecNum) {
                    return false;
                }
            }
        }
        return true;
    }
}
@@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.Worker; + +/** + * AssignedJob. + * for example: an instance job in a fragment job, which already assign to a worker and some data sources + */ +public interface AssignedJob { + int indexInUnassignedJob(); + + UnassignedJob unassignedJob(); + + Worker getAssignedWorker(); + + ScanSource getScanSource(); + + String toString(boolean showUnassignedJob); +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/AssignedJobBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/AssignedJobBuilder.java new file mode 100644 index 000000000000000..5ab80979b91e8de --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/AssignedJobBuilder.java @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.BackendWorkerManager; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.PlanFragmentId; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ListMultimap; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +/** AssignedJobBuilder */ +public class AssignedJobBuilder { + /** buildJobs */ + public static ListMultimap buildJobs( + Map unassignedJobs) { + BackendWorkerManager workerManager = new BackendWorkerManager(); + ListMultimap allAssignedJobs = ArrayListMultimap.create(); + for (Entry kv : unassignedJobs.entrySet()) { + PlanFragmentId fragmentId = kv.getKey(); + UnassignedJob unassignedJob = kv.getValue(); + ListMultimap inputAssignedJobs + = getInputAssignedJobs(unassignedJob, allAssignedJobs); + List fragmentAssignedJobs = + unassignedJob.computeAssignedJobs(workerManager, inputAssignedJobs); + allAssignedJobs.putAll(fragmentId, fragmentAssignedJobs); + } + return allAssignedJobs; + } + + private static ListMultimap getInputAssignedJobs( + UnassignedJob unassignedJob, ListMultimap assignedJobs) { + ListMultimap inputJobs = ArrayListMultimap.create(); + for (Entry> exchangeNodeToChildJobs + : unassignedJob.getExchangeToChildJob().asMap().entrySet()) { + ExchangeNode 
exchangeNode = exchangeNodeToChildJobs.getKey(); + Collection childJobs = exchangeNodeToChildJobs.getValue(); + for (UnassignedJob childJob : childJobs) { + inputJobs.putAll(exchangeNode, assignedJobs.get(childJob.getFragment().getFragmentId())); + } + } + return inputJobs; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/BucketScanSource.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/BucketScanSource.java new file mode 100644 index 000000000000000..31f6014167ba501 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/BucketScanSource.java @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.common.util.ListUtil; +import org.apache.doris.planner.ScanNode; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +/** BucketScanSource */ +public class BucketScanSource extends ScanSource { + // for example: + // 1. 
bucket 0 use OlapScanNode(tableName=`tbl`) to scan with tablet: [tablet 10001, tablet 10003] + // 2. bucket 1 use OlapScanNode(tableName=`tbl`) to scan with tablet: [tablet 10002, tablet 10004] + public final Map> bucketIndexToScanNodeToTablets; + + public BucketScanSource(Map> bucketIndexToScanNodeToTablets) { + this.bucketIndexToScanNodeToTablets = bucketIndexToScanNodeToTablets; + } + + @Override + public int maxParallel(List scanNodes) { + // maxParallel is buckets num + return bucketIndexToScanNodeToTablets.size(); + } + + @Override + public List parallelize(List scanNodes, int instanceNum) { + // current state, no any instance, we only known how many buckets + // this worker should process, and the data that this buckets should process: + // + // [ + // bucket 0: { + // scanNode1: ScanRanges([tablet_10001, tablet_10004, tablet_10007]), + // scanNode2: ScanRanges([tablet_10010, tablet_10013, tablet_10016]) + // }, + // bucket 1: { + // scanNode1: ScanRanges([tablet_10002, tablet_10005, tablet_10008]), + // scanNode2: ScanRanges([tablet_10011, tablet_10014, tablet_10017]) + // }, + // bucket 3: { + // scanNode1: ScanRanges([tablet_10003, tablet_10006, tablet_10009]), + // scanNode2: ScanRanges([tablet_10012, tablet_10015, tablet_10018]) + // } + // ] + List>> bucketIndexToScanRanges + = Lists.newArrayList(bucketIndexToScanNodeToTablets.entrySet()); + + // separate buckets to instanceNum groups. 
+ // for example: + // [ + // // instance 1 process two buckets + // [ + // bucket 0: { + // scanNode1: ScanRanges([tablet_10001, tablet_10004, tablet_10007]), + // scanNode2: ScanRanges([tablet_10010, tablet_10013, tablet_10016]) + // }, + // bucket 3: { + // scanNode1: ScanRanges([tablet_10003, tablet_10006, tablet_10009]), + // scanNode2: ScanRanges([tablet_10012, tablet_10015, tablet_10018]) + // } + // ], + // // instance 2 process one bucket + // [ + // bucket 1: { + // scanNode1: ScanRanges([tablet_10002, tablet_10005, tablet_10008]), + // scanNode2: ScanRanges([tablet_10011, tablet_10014, tablet_10017]) + // } + // ] + // ] + List>>> scanBucketsPerInstance + = ListUtil.splitBySize(bucketIndexToScanRanges, instanceNum); + + // rebuild BucketScanSource for each instance + ImmutableList.Builder instancesScanSource = ImmutableList.builder(); + for (List>> oneInstanceScanBuckets : scanBucketsPerInstance) { + ImmutableMap.Builder> bucketsScanSources = ImmutableMap.builder(); + for (Entry> bucketIndexToScanNodeToScanRange : oneInstanceScanBuckets) { + Integer bucketIndex = bucketIndexToScanNodeToScanRange.getKey(); + Map scanNodeToScanRanges = bucketIndexToScanNodeToScanRange.getValue(); + bucketsScanSources.put(bucketIndex, scanNodeToScanRanges); + } + + instancesScanSource.add(new BucketScanSource( + bucketsScanSources.build() + )); + } + return instancesScanSource.build(); + } + + /** getBucketIndexToScanRanges */ + public Map getBucketIndexToScanRanges(ScanNode scanNode) { + Map bucketIndexToScanRanges = Maps.newLinkedHashMap(); + for (Entry> entry : bucketIndexToScanNodeToTablets.entrySet()) { + Integer bucketIndex = entry.getKey(); + Map scanNodeToScanRanges = entry.getValue(); + ScanRanges scanRanges = scanNodeToScanRanges.get(scanNode); + if (scanRanges != null) { + bucketIndexToScanRanges.put(bucketIndex, scanRanges); + } + } + + return bucketIndexToScanRanges; + } + + /** toString */ + public void toString(StringBuilder str, String prefix) { + int i = 0; 
+ String nextIndent = prefix + " "; + str.append("[\n"); + for (Entry> entry : bucketIndexToScanNodeToTablets.entrySet()) { + Integer bucketId = entry.getKey(); + Map scanNodeToScanRanges = entry.getValue(); + str.append(prefix).append(" bucket ").append(bucketId).append(": "); + DefaultScanSource.toString(scanNodeToScanRanges, str, nextIndent); + if (++i < bucketIndexToScanNodeToTablets.size()) { + str.append(",\n"); + } + } + str.append("\n").append(prefix).append("]"); + } + + @Override + public boolean isEmpty() { + return bucketIndexToScanNodeToTablets.isEmpty(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/CustomAssignmentJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/CustomAssignmentJob.java new file mode 100644 index 000000000000000..b1f9d6d8a875a13 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/CustomAssignmentJob.java @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.planner.ExchangeNode; + +import com.google.common.collect.ListMultimap; + +import java.util.List; + +/** CustomAssignmentJob */ +public interface CustomAssignmentJob { + List customAssignment(ListMultimap inputJobs); +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/DefaultScanSource.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/DefaultScanSource.java new file mode 100644 index 000000000000000..c763e9fa90b90ca --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/DefaultScanSource.java @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.planner.ScanNode; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; + +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +/** DefaultScanSource */ +public class DefaultScanSource extends ScanSource { + // for example: + // 1. use OlapScanNode(tableName=`tbl1`) to scan with tablet: [tablet 10001, tablet 10002] + // 2. 
use OlapScanNode(tableName=`tbl2`) to scan with tablet: [tablet 10003, tablet 10004] + public final Map scanNodeToScanRanges; + + public DefaultScanSource(Map scanNodeToScanRanges) { + this.scanNodeToScanRanges = scanNodeToScanRanges; + } + + @Override + public int maxParallel(List scanNodes) { + if (scanNodes.size() == 1) { + ScanRanges scanRanges = scanNodeToScanRanges.get(scanNodes.get(0)); + if (scanRanges != null) { + // max parallel is the scan ranges(tablets) num + return Math.max(scanRanges.params.size(), 1); + } + } + return 1; + } + + @Override + public List parallelize(List scanNodes, int instanceNum) { + Preconditions.checkArgument(scanNodes.size() == 1, + "Only support parallelize one ScanNode, but meet " + scanNodes.size() + " ScanNodes"); + + ScanNode scanNode = scanNodes.get(0); + ScanRanges scanRanges = scanNodeToScanRanges.get(scanNode); + if (scanRanges == null) { + return ImmutableList.of(); + } + + List scanRangesPerInstance = scanRanges.split(instanceNum); + + ImmutableList.Builder instancesSource + = ImmutableList.builderWithExpectedSize(scanRangesPerInstance.size()); + for (ScanRanges oneInstanceScanRanges : scanRangesPerInstance) { + DefaultScanSource oneInstanceScanSource + = new DefaultScanSource(ImmutableMap.of(scanNode, oneInstanceScanRanges)); + instancesSource.add(oneInstanceScanSource); + } + return instancesSource.build(); + } + + @Override + public boolean isEmpty() { + return scanNodeToScanRanges.isEmpty(); + } + + @Override + public void toString(StringBuilder str, String prefix) { + toString(scanNodeToScanRanges, str, prefix); + } + + /** toString */ + public static void toString(Map scanNodeToScanRanges, StringBuilder str, String prefix) { + if (scanNodeToScanRanges.isEmpty()) { + str.append("[]"); + return; + } + int i = 0; + String nextIndent = prefix + " "; + str.append("[\n"); + for (Entry entry : scanNodeToScanRanges.entrySet()) { + ScanNode scanNode = entry.getKey(); + ScanRanges scanRanges = entry.getValue(); + 
str.append(prefix).append(" {\n") + .append(prefix).append(" scanNode: ").append(scanNode).append(",\n") + .append(prefix).append(" scanRanges: "); + + scanRanges.toString(str, nextIndent); + str.append("\n").append(prefix).append(" }"); + + if (++i < scanNodeToScanRanges.size()) { + str.append(",\n"); + } + } + str.append("\n").append(prefix).append("]"); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/LocalShuffleAssignedJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/LocalShuffleAssignedJob.java new file mode 100644 index 000000000000000..f829a673e1b8685 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/LocalShuffleAssignedJob.java @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.Worker; + +import com.google.common.collect.ImmutableMap; + +import java.util.Map; + +/** LocalShuffleAssignedJob */ +public class LocalShuffleAssignedJob extends StaticAssignedJob { + public final int shareScanId; + + public LocalShuffleAssignedJob( + int indexInUnassignedJob, int shareScanId, + UnassignedJob unassignedJob, + Worker worker, ScanSource scanSource) { + super(indexInUnassignedJob, unassignedJob, worker, scanSource); + this.shareScanId = shareScanId; + } + + @Override + protected Map extraInfo() { + return ImmutableMap.of("shareScanIndex", String.valueOf(shareScanId)); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanRange.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanRange.java new file mode 100644 index 000000000000000..a20897eee9426fe --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanRange.java @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +/** ScanRange */ +public class ScanRange { + +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanRanges.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanRanges.java new file mode 100644 index 000000000000000..368a8cd62d389db --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanRanges.java @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.thrift.TPaloScanRange; +import org.apache.doris.thrift.TScanRangeParams; + +import com.google.common.collect.Lists; + +import java.util.List; + +/**ScanRanges */ +public class ScanRanges implements Splittable { + // usually, it's tablets + public final List params; + // size corresponding to tablets one by one + public final List bytes; + public long totalBytes; + + public ScanRanges() { + this(Lists.newArrayList(), Lists.newArrayList()); + } + + /** ScanRanges */ + public ScanRanges(List params, List bytes) { + this.params = params; + this.bytes = bytes; + long totalBytes = 0; + for (Long size : bytes) { + totalBytes += size; + } + this.totalBytes = totalBytes; + } + + public void addScanRanges(ScanRanges scanRanges) { + this.params.addAll(scanRanges.params); + this.bytes.addAll(scanRanges.bytes); + this.totalBytes += scanRanges.totalBytes; + } + + public void addScanRange(TScanRangeParams params, long bytes) { + this.params.add(params); + this.bytes.add(bytes); + this.totalBytes += bytes; + } + + @Override + public int itemSize() { + return params.size(); + } + + @Override + public void addItem(ScanRanges other, int index) { + addScanRange(other.params.get(index), other.bytes.get(index)); + } + + @Override + public ScanRanges newSplittable() { + return new ScanRanges(); + } + + @Override + public String toString() { + StringBuilder str = new StringBuilder(); + toString(str, ""); + return str.toString(); + } + + /** toString */ + public void toString(StringBuilder str, String prefix) { + str.append("ScanRanges(bytes: " + totalBytes + ", ranges: [\n"); + for (int i = 0; i < params.size(); i++) { + str.append(prefix).append(" " + toString(params.get(i)) + ", bytes: " + bytes.get(i)); + if (i + 1 < params.size()) { + str.append(",\n"); + } + } + str.append("\n").append(prefix).append("])"); + } + + private String toString(TScanRangeParams scanRange) { + TPaloScanRange paloScanRange = 
scanRange.getScanRange().getPaloScanRange(); + if (paloScanRange != null) { + return "tablet " + paloScanRange.getTabletId(); + } else { + return scanRange.toString(); + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanSource.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanSource.java new file mode 100644 index 000000000000000..4e8a49bcfafbc5c --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/ScanSource.java @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.planner.ScanNode; + +import java.util.List; + +/** ScanSource */ +public abstract class ScanSource { + + public abstract int maxParallel(List scanNodes); + + public abstract List parallelize(List scanNodes, int instanceNum); + + public abstract boolean isEmpty(); + + public abstract void toString(StringBuilder str, String prefix); + + @Override + public String toString() { + StringBuilder str = new StringBuilder(); + toString(str, ""); + return str.toString(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/Splittable.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/Splittable.java new file mode 100644 index 000000000000000..0a18d299402159e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/Splittable.java @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import com.google.common.base.Preconditions; + +import java.util.ArrayList; +import java.util.List; + +/** Splittable */ +public interface Splittable> { + + int itemSize(); + + void addItem(S other, int index); + + S newSplittable(); + + default List split(int splitSize) { + return Splittable.split(this, splitSize); + } + + /** + * split a list to multi expected number sublist + * for example: + * + * list is : [1, 2, 3, 4, 5, 6, 7] + * expectedSize is : 3 + * + * return : + * [1, 4, 7] + * [2, 5] + * [3, 6] + */ + static > List split(Splittable splittable, int splitSize) { + Preconditions.checkNotNull(splittable, "splittable must not be null"); + Preconditions.checkArgument(splitSize > 0, "splitSize must larger than 0"); + + int itemSize = splittable.itemSize(); + splitSize = Math.min(splitSize, itemSize); + + List result = new ArrayList<>(splitSize); + for (int i = 0; i < splitSize; i++) { + result.add(splittable.newSplittable()); + } + + int index = 0; + for (int i = 0; i < itemSize; i++) { + result.get(index).addItem((S) splittable, i); + index = (index + 1) % splitSize; + } + return result; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/StaticAssignedJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/StaticAssignedJob.java new file mode 100644 index 000000000000000..ec1f794318126c6 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/StaticAssignedJob.java @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.Worker; + +import com.google.common.collect.ImmutableMap; + +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; + +/** StaticAssignedJob */ +public class StaticAssignedJob implements AssignedJob { + private final int indexInUnassignedJob; + private final UnassignedJob unassignedJob; + private final Worker worker; + private final ScanSource scanSource; + + public StaticAssignedJob( + int indexInUnassignedJob, UnassignedJob unassignedJob, Worker worker, + ScanSource scanSource) { + this.indexInUnassignedJob = indexInUnassignedJob; + this.unassignedJob = Objects.requireNonNull(unassignedJob, "unassignedJob can not be null"); + this.worker = worker; + this.scanSource = Objects.requireNonNull(scanSource, "scanSource can not be null"); + } + + @Override + public int indexInUnassignedJob() { + return indexInUnassignedJob; + } + + @Override + public UnassignedJob unassignedJob() { + return unassignedJob; + } + + @Override + public Worker getAssignedWorker() { + return worker; + } + + @Override + public ScanSource getScanSource() { + return scanSource; + } + + @Override + public String toString() { + return toString(true); + } + + @Override + public String toString(boolean showUnassignedJob) { + StringBuilder scanSourceString = new StringBuilder(); + if (!scanSource.isEmpty()) { + scanSource.toString(scanSourceString, " "); + } else { + scanSourceString = new StringBuilder("[]"); + } + StringBuilder str = new 
StringBuilder(getClass().getSimpleName()).append("("); + if (showUnassignedJob) { + str.append("\n unassignedJob: ").append(unassignedJob).append(","); + } + str.append("\n index: " + indexInUnassignedJob) + .append(",\n worker: " + worker); + for (Entry kv : extraInfo().entrySet()) { + str.append(",\n ").append(kv.getKey()).append(": ").append(kv.getValue()); + } + + return str + .append(",\n scanSource: " + scanSourceString) + .append("\n)") + .toString(); + } + + protected Map extraInfo() { + return ImmutableMap.of(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedGatherScanMultiRemoteTablesJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedGatherScanMultiRemoteTablesJob.java new file mode 100644 index 000000000000000..bc64e256e8fdc76 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedGatherScanMultiRemoteTablesJob.java @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.ScanWorkerSelector; +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.WorkerManager; +import org.apache.doris.planner.DataGenScanNode; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.ScanNode; +import org.apache.doris.thrift.TScanRangeLocations; +import org.apache.doris.thrift.TScanRangeParams; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ListMultimap; +import com.google.common.collect.Maps; + +import java.util.List; +import java.util.Map; + +/** UnassignedGatherScanMultiRemoteTablesJob */ +public class UnassignedGatherScanMultiRemoteTablesJob extends AbstractUnassignedJob { + + public UnassignedGatherScanMultiRemoteTablesJob(PlanFragment fragment, + List scanNodes, ListMultimap exchangeToChildJob) { + super(fragment, scanNodes, exchangeToChildJob); + } + + /** canApply */ + public static boolean canApply(List scanNodes) { + if (scanNodes.size() <= 1) { + return false; + } + for (ScanNode scanNode : scanNodes) { + if (!(scanNode instanceof DataGenScanNode)) { + return false; + } + DataGenScanNode dataGenScanNode = (DataGenScanNode) scanNode; + if (dataGenScanNode.getScanRangeLocations(0).size() != 1) { + return false; + } + } + return true; + } + + @Override + public List computeAssignedJobs(WorkerManager workerManager, + ListMultimap inputJobs) { + Map scanNodeToScanRanges = Maps.newLinkedHashMap(); + for (ScanNode scanNode : scanNodes) { + List scanRangeLocations = scanNode.getScanRangeLocations(0); + ScanRanges scanRanges = new ScanRanges(); + for (TScanRangeLocations scanRangeLocation : scanRangeLocations) { + TScanRangeParams replica = ScanWorkerSelector.buildScanReplicaParams( + scanRangeLocation, scanRangeLocation.locations.get(0)); + scanRanges.addScanRange(replica, 0); + } + + scanNodeToScanRanges.put(scanNode, 
scanRanges); + } + + Worker randomWorker = workerManager.randomAvailableWorker(); + return ImmutableList.of( + assignWorkerAndDataSources(0, randomWorker, + new DefaultScanSource(scanNodeToScanRanges) + ) + ); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedJob.java new file mode 100644 index 000000000000000..73af5c4f65af272 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedJob.java @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.trees.TreeNode; +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.WorkerManager; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.ScanNode; + +import com.google.common.collect.ListMultimap; + +import java.util.List; + +/** + * WorkerJob. 
+ * for example: a fragment job, which doesn't parallelization to some instance jobs and also no worker to invoke it + */ +public interface UnassignedJob extends TreeNode { + PlanFragment getFragment(); + + List getScanNodes(); + + ListMultimap getExchangeToChildJob(); + + List computeAssignedJobs( + WorkerManager workerManager, ListMultimap inputJobs); + + // generate an instance job + // e.g. build an instance job by a backends and the replica ids it contains + default AssignedJob assignWorkerAndDataSources( + int instanceIndexInFragment, Worker worker, ScanSource scanSource) { + return new StaticAssignedJob(instanceIndexInFragment, this, worker, scanSource); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedJobBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedJobBuilder.java new file mode 100644 index 000000000000000..6020a00655dc948 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedJobBuilder.java @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.trees.plans.distribute.FragmentIdMapping; +import org.apache.doris.nereids.worker.LoadBalanceScanWorkerSelector; +import org.apache.doris.nereids.worker.ScanWorkerSelector; +import org.apache.doris.planner.DataSink; +import org.apache.doris.planner.DataStreamSink; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.MultiCastDataSink; +import org.apache.doris.planner.OlapScanNode; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.PlanFragmentId; +import org.apache.doris.planner.PlanNodeId; +import org.apache.doris.planner.ScanNode; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.SessionVariable; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ListMultimap; +import com.google.common.collect.Maps; + +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +/** + * UnassignedJobBuilder. + * build UnassignedJob by fragment + */ +public class UnassignedJobBuilder { + private final ScanWorkerSelector scanWorkerSelector = new LoadBalanceScanWorkerSelector(); + + /** + * build job from fragment. 
+ */ + public static FragmentIdMapping buildJobs(FragmentIdMapping fragments) { + UnassignedJobBuilder builder = new UnassignedJobBuilder(); + + FragmentLineage fragmentLineage = buildFragmentLineage(fragments); + FragmentIdMapping unassignedJobs = new FragmentIdMapping<>(); + + // build from leaf to parent + for (Entry kv : fragments.entrySet()) { + PlanFragmentId fragmentId = kv.getKey(); + PlanFragment fragment = kv.getValue(); + + ListMultimap inputJobs = findInputJobs( + fragmentLineage, fragmentId, unassignedJobs); + UnassignedJob unassignedJob = builder.buildJob(fragment, inputJobs); + unassignedJobs.put(fragmentId, unassignedJob); + } + return unassignedJobs; + } + + private UnassignedJob buildJob( + PlanFragment planFragment, ListMultimap inputJobs) { + List scanNodes = collectScanNodesInThisFragment(planFragment); + if (planFragment.specifyInstances.isPresent()) { + return buildSpecifyInstancesJob(planFragment, scanNodes, inputJobs); + } else if (!scanNodes.isEmpty() || isLeafFragment(planFragment)) { + return buildLeafOrScanJob(planFragment, scanNodes, inputJobs); + } else { + return buildShuffleJob(planFragment, inputJobs); + } + } + + private UnassignedJob buildLeafOrScanJob( + PlanFragment planFragment, List scanNodes, + ListMultimap inputJobs) { + int olapScanNodeNum = olapScanNodeNum(scanNodes); + + UnassignedJob unassignedJob = null; + if (!scanNodes.isEmpty() && olapScanNodeNum == scanNodes.size()) { + // we need assign a backend which contains the data, + // so that the OlapScanNode can find the data in the backend + // e.g. select * from olap_table + unassignedJob = buildScanOlapTableJob(planFragment, (List) scanNodes, inputJobs, scanWorkerSelector); + } else if (scanNodes.isEmpty()) { + // select constant without table, + // e.g. select 100 union select 200 + unassignedJob = buildQueryConstantJob(planFragment); + } else if (olapScanNodeNum == 0) { + // only scan external tables or cloud tables or table valued functions + // e,g. 
select * from numbers('number'='100') + unassignedJob = buildScanRemoteTableJob(planFragment, scanNodes, inputJobs, scanWorkerSelector); + } + + if (unassignedJob != null) { + return unassignedJob; + } + + throw new IllegalStateException( + "Unsupported fragment which contains multiple scan nodes and some of them are not OlapScanNode" + ); + } + + private UnassignedJob buildSpecifyInstancesJob( + PlanFragment planFragment, List scanNodes, ListMultimap inputJobs) { + return new UnassignedSpecifyInstancesJob(planFragment, scanNodes, inputJobs); + } + + private UnassignedJob buildScanOlapTableJob( + PlanFragment planFragment, List olapScanNodes, + ListMultimap inputJobs, + ScanWorkerSelector scanWorkerSelector) { + if (shouldAssignByBucket(planFragment)) { + return new UnassignedScanBucketOlapTableJob( + planFragment, olapScanNodes, inputJobs, scanWorkerSelector); + } else if (olapScanNodes.size() == 1) { + return new UnassignedScanSingleOlapTableJob( + planFragment, olapScanNodes.get(0), inputJobs, scanWorkerSelector); + } else { + throw new IllegalStateException("Not supported multiple scan multiple " + + "OlapTable but not contains colocate join or bucket shuffle join"); + } + } + + private List collectScanNodesInThisFragment(PlanFragment planFragment) { + return planFragment.getPlanRoot().collectInCurrentFragment(ScanNode.class::isInstance); + } + + private int olapScanNodeNum(List scanNodes) { + int olapScanNodeNum = 0; + for (ScanNode scanNode : scanNodes) { + if (scanNode instanceof OlapScanNode) { + olapScanNodeNum++; + } + } + return olapScanNodeNum; + } + + private boolean isLeafFragment(PlanFragment planFragment) { + return planFragment.getChildren().isEmpty(); + } + + private UnassignedQueryConstantJob buildQueryConstantJob(PlanFragment planFragment) { + return new UnassignedQueryConstantJob(planFragment); + } + + private UnassignedJob buildScanRemoteTableJob( + PlanFragment planFragment, List scanNodes, + ListMultimap inputJobs, + ScanWorkerSelector 
scanWorkerSelector) { + if (scanNodes.size() == 1) { + return new UnassignedScanSingleRemoteTableJob( + planFragment, scanNodes.get(0), inputJobs, scanWorkerSelector); + } else if (UnassignedGatherScanMultiRemoteTablesJob.canApply(scanNodes)) { + // select * from numbers("number" = "10") a union all select * from numbers("number" = "20") b; + // use an instance to scan table a and table b + return new UnassignedGatherScanMultiRemoteTablesJob(planFragment, scanNodes, inputJobs); + } else { + return null; + } + } + + private UnassignedShuffleJob buildShuffleJob( + PlanFragment planFragment, ListMultimap inputJobs) { + return new UnassignedShuffleJob(planFragment, inputJobs); + } + + private static ListMultimap findInputJobs( + FragmentLineage lineage, PlanFragmentId fragmentId, FragmentIdMapping unassignedJobs) { + ListMultimap inputJobs = ArrayListMultimap.create(); + Map exchangeNodes = lineage.parentFragmentToExchangeNode.get(fragmentId); + if (exchangeNodes != null) { + for (Entry idToExchange : exchangeNodes.entrySet()) { + PlanNodeId exchangeId = idToExchange.getKey(); + ExchangeNode exchangeNode = idToExchange.getValue(); + List childFragmentIds = lineage.exchangeToChildFragment.get(exchangeId); + for (PlanFragmentId childFragmentId : childFragmentIds) { + inputJobs.put(exchangeNode, unassignedJobs.get(childFragmentId)); + } + } + } + return inputJobs; + } + + private static List collectExchangeNodesInThisFragment(PlanFragment planFragment) { + return planFragment + .getPlanRoot() + .collectInCurrentFragment(ExchangeNode.class::isInstance); + } + + private static FragmentLineage buildFragmentLineage( + FragmentIdMapping fragments) { + ListMultimap exchangeToChildFragment = ArrayListMultimap.create(); + FragmentIdMapping> parentFragmentToExchangeNode = new FragmentIdMapping<>(); + + for (PlanFragment fragment : fragments.values()) { + PlanFragmentId fragmentId = fragment.getFragmentId(); + + // 1. 
link child fragment to exchange node + DataSink sink = fragment.getSink(); + if (sink instanceof DataStreamSink) { + PlanNodeId exchangeNodeId = sink.getExchNodeId(); + exchangeToChildFragment.put(exchangeNodeId, fragmentId); + } else if (sink instanceof MultiCastDataSink) { + MultiCastDataSink multiCastDataSink = (MultiCastDataSink) sink; + for (DataStreamSink dataStreamSink : multiCastDataSink.getDataStreamSinks()) { + PlanNodeId exchangeNodeId = dataStreamSink.getExchNodeId(); + exchangeToChildFragment.put(exchangeNodeId, fragmentId); + } + } + + // 2. link parent fragment to exchange node + List exchangeNodes = collectExchangeNodesInThisFragment(fragment); + Map exchangeNodesInFragment = Maps.newLinkedHashMap(); + for (ExchangeNode exchangeNode : exchangeNodes) { + exchangeNodesInFragment.put(exchangeNode.getId(), exchangeNode); + } + parentFragmentToExchangeNode.put(fragmentId, exchangeNodesInFragment); + } + + return new FragmentLineage(parentFragmentToExchangeNode, exchangeToChildFragment); + } + + private static boolean shouldAssignByBucket(PlanFragment fragment) { + if (fragment.hasColocatePlanNode()) { + return true; + } + if (enableBucketShuffleJoin() && fragment.isBucketShuffleJoinInput()) { + return true; + } + return false; + } + + private static boolean enableBucketShuffleJoin() { + if (ConnectContext.get() != null) { + SessionVariable sessionVariable = ConnectContext.get().getSessionVariable(); + if (!sessionVariable.isEnableBucketShuffleJoin() && !sessionVariable.isEnableNereidsPlanner()) { + return false; + } + } + return true; + } + + // the class support find exchange nodes in the fragment, and find child fragment by exchange node id + private static class FragmentLineage { + private final FragmentIdMapping> parentFragmentToExchangeNode; + private final ListMultimap exchangeToChildFragment; + + public FragmentLineage( + FragmentIdMapping> parentFragmentToExchangeNode, + ListMultimap exchangeToChildFragment) { + this.parentFragmentToExchangeNode 
= parentFragmentToExchangeNode; + this.exchangeToChildFragment = exchangeToChildFragment; + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedQueryConstantJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedQueryConstantJob.java new file mode 100644 index 000000000000000..71ca43ab82547a7 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedQueryConstantJob.java @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.WorkerManager; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.PlanFragment; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ListMultimap; + +import java.util.List; + +/** UnassignedQueryConstantJob */ +public class UnassignedQueryConstantJob extends AbstractUnassignedJob { + public UnassignedQueryConstantJob(PlanFragment fragment) { + super(fragment, ImmutableList.of(), ArrayListMultimap.create()); + } + + @Override + public List computeAssignedJobs(WorkerManager workerManager, + ListMultimap inputJobs) { + Worker randomWorker = workerManager.randomAvailableWorker(); + return ImmutableList.of( + new StaticAssignedJob(0, this, randomWorker, + new DefaultScanSource(ImmutableMap.of()) + ) + ); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanBucketOlapTableJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanBucketOlapTableJob.java new file mode 100644 index 000000000000000..69452836216689b --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanBucketOlapTableJob.java @@ -0,0 +1,283 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.analysis.JoinOperator; +import org.apache.doris.catalog.MaterializedIndex; +import org.apache.doris.catalog.Partition; +import org.apache.doris.catalog.Replica; +import org.apache.doris.catalog.Tablet; +import org.apache.doris.nereids.worker.ScanWorkerSelector; +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.WorkerManager; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.HashJoinNode; +import org.apache.doris.planner.OlapScanNode; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.PlanNode; +import org.apache.doris.planner.ScanNode; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ListMultimap; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.google.common.collect.Sets.SetView; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; + +/** + * UnassignedScanBucketOlapTableJob. 
+ * bucket shuffle join olap table, or colocate join olap table + */ +public class UnassignedScanBucketOlapTableJob extends AbstractUnassignedScanJob { + private final ScanWorkerSelector scanWorkerSelector; + private final List olapScanNodes; + + /** UnassignedScanNativeTableJob */ + public UnassignedScanBucketOlapTableJob( + PlanFragment fragment, List olapScanNodes, + ListMultimap exchangeToChildJob, + ScanWorkerSelector scanWorkerSelector) { + super(fragment, (List) olapScanNodes, exchangeToChildJob); + this.scanWorkerSelector = Objects.requireNonNull( + scanWorkerSelector, "scanWorkerSelector cat not be null"); + + Preconditions.checkArgument(!olapScanNodes.isEmpty(), "OlapScanNode is empty"); + this.olapScanNodes = olapScanNodes; + } + + public List getOlapScanNodes() { + return olapScanNodes; + } + + @Override + protected Map multipleMachinesParallelization( + WorkerManager workerManager, ListMultimap inputJobs) { + // for every bucket tablet, select its replica and worker. + // for example, colocate join: + // { + // BackendWorker("172.0.0.1"): { + // bucket 0: { + // olapScanNode1: ScanRanges([tablet_10001, tablet_10002, tablet_10003, tablet_10004]), + // olapScanNode2: ScanRanges([tablet_10009, tablet_10010, tablet_10011, tablet_10012]) + // }, + // bucket 1: { + // olapScanNode1: ScanRanges([tablet_10005, tablet_10006, tablet_10007, tablet_10008]) + // olapScanNode2: ScanRanges([tablet_10013, tablet_10014, tablet_10015, tablet_10016]) + // }, + // ... + // }, + // BackendWorker("172.0.0.2"): { + // ... + // } + // } + return scanWorkerSelector.selectReplicaAndWorkerWithBucket(this); + } + + @Override + protected List insideMachineParallelization( + Map workerToScanRanges, + ListMultimap inputJobs) { + // separate buckets to instanceNum groups, let one instance process some buckets. 
+ // for example, colocate join: + // { + // // 172.0.0.1 has two instances + // BackendWorker("172.0.0.1"): [ + // // instance 1 process two buckets + // { + // bucket 0: { + // olapScanNode1: ScanRanges([tablet_10001, tablet_10002, tablet_10003, tablet_10004]), + // olapScanNode2: ScanRanges([tablet_10009, tablet_10010, tablet_10011, tablet_10012]) + // }, + // bucket 1: { + // olapScanNode1: ScanRanges([tablet_10005, tablet_10006, tablet_10007, tablet_10008]) + // olapScanNode2: ScanRanges([tablet_10013, tablet_10014, tablet_10015, tablet_10016]) + // } + // }, + // // instance 1 process one bucket + // { + // bucket 3: ... + // } + // ] + // // instance 4... in "172.0.0.1" + // BackendWorker("172.0.0.2"): [ + // ... + // ], + // ... + // } + List assignedJobs = super.insideMachineParallelization(workerToScanRanges, inputJobs); + + // the case: + // ```sql + // SELECT * FROM + // (select * from tbl1 where c0 =1)a + // RIGHT OUTER JOIN + // (select * from tbl2)b + // ON a.id = b.id; + // ``` + // contains right outer join and missing instance in left side because of tablet pruner, for example + // left: [bucket 1] + // right: [bucket 1, bucket 2] + // + // we should join buckets corresponding: + // [ + // (left bucket 1) right outer join (right bucket 1) + // (no any machine) right outer join (right bucket 2) + // ] + // if missing the left bucket 2, it will compute an empty result + // because right bucket 2 doesn't exist destination instance, + // so we should fill up this instance + List hashJoinNodes = fragment.getPlanRoot() + .collectInCurrentFragment(HashJoinNode.class::isInstance); + if (shouldFillUpInstances(hashJoinNodes)) { + return fillUpInstances(assignedJobs, hashJoinNodes, inputJobs); + } + + return assignedJobs; + } + + private boolean shouldFillUpInstances(List hashJoinNodes) { + for (HashJoinNode hashJoinNode : hashJoinNodes) { + if (!hashJoinNode.isBucketShuffle()) { + continue; + } + JoinOperator joinOp = hashJoinNode.getJoinOp(); + switch 
(joinOp) { + case RIGHT_OUTER_JOIN: + case RIGHT_SEMI_JOIN: + case RIGHT_ANTI_JOIN: + return true; + default: + } + } + return false; + } + + private List fillUpInstances( + List leftSideInstances, List hashJoinNodes, + ListMultimap inputJobs) { + Set leftSideUsedBuckets = leftSideBuckets(leftSideInstances); + Set rightSideUsedBuckets = rightSideUsedBuckets(hashJoinNodes, inputJobs); + SetView missingBucketsInLeft = Sets.difference(rightSideUsedBuckets, leftSideUsedBuckets); + if (missingBucketsInLeft.isEmpty()) { + return leftSideInstances; + } + OlapScanNode olapScanNode = (OlapScanNode) scanNodes.get(0); + MaterializedIndex randomPartition = randomPartition(olapScanNode); + ListMultimap missingBuckets = selectWorkerForMissingBuckets( + olapScanNode, randomPartition, missingBucketsInLeft); + + List newInstances = new ArrayList<>(leftSideInstances); + for (Entry> workerToBuckets : missingBuckets.asMap().entrySet()) { + Map> scanEmptyBuckets = Maps.newLinkedHashMap(); + for (Integer bucketIndex : workerToBuckets.getValue()) { + scanEmptyBuckets.put(bucketIndex, ImmutableMap.of()); + } + AssignedJob fillUpInstance = assignWorkerAndDataSources( + newInstances.size(), workerToBuckets.getKey(), new BucketScanSource(scanEmptyBuckets) + ); + newInstances.add(fillUpInstance); + } + return newInstances; + } + + private Set rightSideUsedBuckets( + List hashJoinNodes, ListMultimap inputJobs) { + Set rightSideUsedBuckets = new TreeSet<>(); + for (HashJoinNode hashJoinNode : hashJoinNodes) { + PlanNode right = hashJoinNode.getChild(1); + if (!(right instanceof ExchangeNode)) { + continue; + } + List rightInstances = inputJobs.get((ExchangeNode) right); + for (AssignedJob rightInstance : rightInstances) { + ScanSource scanSource = rightInstance.getScanSource(); + if (scanSource instanceof BucketScanSource) { + BucketScanSource bucketScanSource = (BucketScanSource) scanSource; + rightSideUsedBuckets.addAll(bucketScanSource.bucketIndexToScanNodeToTablets.keySet()); + } + } + } + 
return rightSideUsedBuckets; + } + + private Set leftSideBuckets(List notPrunedBucketInstances) { + Set leftSideBuckets = new TreeSet<>(); + for (AssignedJob instance : notPrunedBucketInstances) { + ScanSource scanSource = instance.getScanSource(); + if (scanSource instanceof BucketScanSource) { + BucketScanSource bucketScanSource = (BucketScanSource) scanSource; + leftSideBuckets.addAll(bucketScanSource.bucketIndexToScanNodeToTablets.keySet()); + } + } + return leftSideBuckets; + } + + private MaterializedIndex randomPartition(OlapScanNode olapScanNode) { + List selectedPartitionIds = ImmutableList.copyOf(olapScanNode.getSelectedPartitionIds()); + if (selectedPartitionIds.isEmpty()) { + throw new IllegalStateException("Missing selected partitions in " + olapScanNode); + } + + Long randomSelectPartitionId = selectedPartitionIds.get((int) (Math.random() * selectedPartitionIds.size())); + Partition partition = olapScanNode.getOlapTable().getPartition(randomSelectPartitionId); + return partition.getBaseIndex(); + } + + private ListMultimap selectWorkerForMissingBuckets( + OlapScanNode olapScanNode, MaterializedIndex partition, Set selectBucketIndexes) { + List tabletIdsInOrder = partition.getTabletIdsInOrder(); + ListMultimap fillUpWorkerToBuckets = ArrayListMultimap.create(); + for (Integer bucketIndex : selectBucketIndexes) { + Long tabletIdInBucket = tabletIdsInOrder.get(bucketIndex); + Tablet tabletInBucket = partition.getTablet(tabletIdInBucket); + List workers = getWorkersByReplicas(tabletInBucket); + if (workers.isEmpty()) { + throw new IllegalStateException("Can not found available replica for bucket " + bucketIndex + + ", table: " + olapScanNode); + } + Worker worker = scanWorkerSelector.selectMinWorkloadWorker(workers); + fillUpWorkerToBuckets.put(worker, bucketIndex); + } + return fillUpWorkerToBuckets; + } + + private List getWorkersByReplicas(Tablet tablet) { + WorkerManager workerManager = scanWorkerSelector.getWorkerManager(); + List replicas = 
tablet.getReplicas(); + List workers = Lists.newArrayListWithCapacity(replicas.size()); + for (Replica replica : replicas) { + Worker worker = workerManager.getWorker(replica.getBackendId()); + if (worker.available()) { + workers.add(worker); + } + } + return workers; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanSingleOlapTableJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanSingleOlapTableJob.java new file mode 100644 index 000000000000000..410a4edd8651ddc --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanSingleOlapTableJob.java @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.ScanWorkerSelector; +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.WorkerManager; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.OlapScanNode; +import org.apache.doris.planner.PlanFragment; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ListMultimap; + +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** UnassignedScanSingleOlapTableJob */ +public class UnassignedScanSingleOlapTableJob extends AbstractUnassignedScanJob { + private OlapScanNode olapScanNode; + private final ScanWorkerSelector scanWorkerSelector; + + public UnassignedScanSingleOlapTableJob( + PlanFragment fragment, OlapScanNode olapScanNode, + ListMultimap exchangeToChildJob, + ScanWorkerSelector scanWorkerSelector) { + super(fragment, ImmutableList.of(olapScanNode), exchangeToChildJob); + this.scanWorkerSelector = Objects.requireNonNull( + scanWorkerSelector, "scanWorkerSelector cat not be null"); + this.olapScanNode = olapScanNode; + } + + @Override + protected Map multipleMachinesParallelization( + WorkerManager workerManager, ListMultimap inputJobs) { + // for every tablet, select its replica and worker. + // for example: + // { + // BackendWorker("172.0.0.1"): + // olapScanNode1: ScanRanges([tablet_10001, tablet_10002, tablet_10003, tablet_10004]), + // BackendWorker("172.0.0.2"): + // olapScanNode1: ScanRanges([tablet_10005, tablet_10006, tablet_10007, tablet_10008, tablet_10009]) + // } + return scanWorkerSelector.selectReplicaAndWorkerWithoutBucket(olapScanNode); + } + + @Override + protected List insideMachineParallelization( + Map workerToScanRanges, + ListMultimap inputJobs) { + // for each worker, compute how many instances should be generated, and which data should be scanned. 
+ // for example: + // { + // BackendWorker("172.0.0.1"): [ + // instance 1: olapScanNode1: ScanRanges([tablet_10001, tablet_10003]) + // instance 2: olapScanNode1: ScanRanges([tablet_10002, tablet_10004]) + // ], + // BackendWorker("172.0.0.2"): [ + // instance 3: olapScanNode1: ScanRanges([tablet_10005, tablet_10008]) + // instance 4: olapScanNode1: ScanRanges([tablet_10006, tablet_10009]) + // instance 5: olapScanNode1: ScanRanges([tablet_10007]) + // ], + // } + return super.insideMachineParallelization(workerToScanRanges, inputJobs); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanSingleRemoteTableJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanSingleRemoteTableJob.java new file mode 100644 index 000000000000000..4c6025dfa7356f0 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedScanSingleRemoteTableJob.java @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.ScanWorkerSelector; +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.WorkerManager; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.ScanNode; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ListMultimap; + +import java.util.Map; +import java.util.Objects; + +/** + * UnassignedScanSingleRemoteTableJob + * it should be a leaf job which not contains scan native olap table node, + * for example, select literal without table, or scan an external table + */ +public class UnassignedScanSingleRemoteTableJob extends AbstractUnassignedScanJob { + private final ScanWorkerSelector scanWorkerSelector; + + public UnassignedScanSingleRemoteTableJob( + PlanFragment fragment, ScanNode scanNode, ListMultimap exchangeToChildJob, + ScanWorkerSelector scanWorkerSelector) { + super(fragment, ImmutableList.of(scanNode), exchangeToChildJob); + this.scanWorkerSelector = Objects.requireNonNull(scanWorkerSelector, "scanWorkerSelector is not null"); + } + + @Override + protected Map multipleMachinesParallelization( + WorkerManager workerManager, ListMultimap inputJobs) { + return scanWorkerSelector.selectReplicaAndWorkerWithoutBucket(scanNodes.get(0)); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedShuffleJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedShuffleJob.java new file mode 100644 index 000000000000000..d1c7d46b3e1e916 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedShuffleJob.java @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.WorkerManager; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.qe.ConnectContext; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ListMultimap; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map.Entry; +import java.util.Set; +import java.util.function.Function; + +/** UnassignedShuffleJob */ +public class UnassignedShuffleJob extends AbstractUnassignedJob { + public UnassignedShuffleJob(PlanFragment fragment, ListMultimap exchangeToChildJob) { + super(fragment, ImmutableList.of(), exchangeToChildJob); + } + + @Override + public List computeAssignedJobs( + WorkerManager workerManager, ListMultimap inputJobs) { + int expectInstanceNum = degreeOfParallelism(); + List biggestParallelChildFragment = getInstancesOfBiggestParallelChildFragment(inputJobs); + + if (expectInstanceNum > 0 && expectInstanceNum < biggestParallelChildFragment.size()) { + // When group by cardinality is smaller than number of backend, only some backends always + // process while other has no 
data to process. + // So we shuffle instances to make different backends handle different queries. + List shuffleWorkersInBiggestParallelChildFragment + = distinctShuffleWorkers(biggestParallelChildFragment); + Function workerSelector = instanceIndex -> { + int selectIndex = instanceIndex % shuffleWorkersInBiggestParallelChildFragment.size(); + return shuffleWorkersInBiggestParallelChildFragment.get(selectIndex); + }; + return buildInstances(expectInstanceNum, workerSelector); + } else { + // keep same instance num like child fragment + Function workerSelector = instanceIndex -> { + int selectIndex = instanceIndex % biggestParallelChildFragment.size(); + return biggestParallelChildFragment.get(selectIndex).getAssignedWorker(); + }; + return buildInstances(biggestParallelChildFragment.size(), workerSelector); + } + } + + protected int degreeOfParallelism() { + if (!fragment.getDataPartition().isPartitioned()) { + return 1; + } + + // TODO: check we use nested loop join do right outer / semi / anti join, + // we should add an exchange node with gather distribute under the nested loop join + + int expectInstanceNum = -1; + if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable() != null) { + expectInstanceNum = ConnectContext.get().getSessionVariable().getExchangeInstanceParallel(); + } + return expectInstanceNum; + } + + private List getInstancesOfBiggestParallelChildFragment( + ListMultimap inputJobs) { + int maxInstanceNum = -1; + List biggestParallelChildFragment = ImmutableList.of(); + // skip broadcast exchange + for (Entry> exchangeToChildInstances : inputJobs.asMap().entrySet()) { + List instances = (List) exchangeToChildInstances.getValue(); + if (instances.size() > maxInstanceNum) { + biggestParallelChildFragment = instances; + maxInstanceNum = instances.size(); + } + } + return biggestParallelChildFragment; + } + + private List buildInstances(int instanceNum, Function workerSelector) { + ImmutableList.Builder instances = 
ImmutableList.builderWithExpectedSize(instanceNum); + for (int i = 0; i < instanceNum; i++) { + Worker selectedWorker = workerSelector.apply(i); + AssignedJob assignedJob = assignWorkerAndDataSources( + i, selectedWorker, new DefaultScanSource(ImmutableMap.of()) + ); + instances.add(assignedJob); + } + return instances.build(); + } + + private List distinctShuffleWorkers(List instances) { + Set candidateWorkerSet = Sets.newLinkedHashSet(); + for (AssignedJob instance : instances) { + candidateWorkerSet.add(instance.getAssignedWorker()); + } + List candidateWorkers = Lists.newArrayList(candidateWorkerSet); + Collections.shuffle(candidateWorkers); + return candidateWorkers; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedSpecifyInstancesJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedSpecifyInstancesJob.java new file mode 100644 index 000000000000000..1a877d229c9ce7b --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UnassignedSpecifyInstancesJob.java @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.WorkerManager; +import org.apache.doris.planner.ExchangeNode; +import org.apache.doris.planner.NereidsSpecifyInstances; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.ScanNode; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ListMultimap; + +import java.util.List; + +/** UnassignedSpecifyInstancesJob */ +public class UnassignedSpecifyInstancesJob extends AbstractUnassignedJob { + private final NereidsSpecifyInstances specifyInstances; + + public UnassignedSpecifyInstancesJob( + PlanFragment fragment, List scanNodes, + ListMultimap exchangeToChildJob) { + super(fragment, scanNodes, exchangeToChildJob); + Preconditions.checkArgument(fragment.specifyInstances.isPresent(), + "Missing fragment specifyInstances"); + this.specifyInstances = fragment.specifyInstances.get(); + } + + @Override + public List computeAssignedJobs(WorkerManager workerManager, + ListMultimap inputJobs) { + return specifyInstances.buildAssignedJobs(this); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UninstancedScanSource.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UninstancedScanSource.java new file mode 100644 index 000000000000000..110256530b0cbff --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/UninstancedScanSource.java @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.worker.job; + +import com.google.common.collect.ImmutableMap; + +/** + * UninstancedScanSource: + * a ScanSource which doesn't parallelize/split to instances + */ +public class UninstancedScanSource { + public final ScanSource scanSource; + + public UninstancedScanSource(ScanSource scanSource) { + this.scanSource = scanSource; + } + + public static UninstancedScanSource emptyDefaultScanSource() { + return new UninstancedScanSource(new DefaultScanSource(ImmutableMap.of())); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/WorkerScanSource.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/WorkerScanSource.java new file mode 100644 index 000000000000000..dbb6ed130aeaa2e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/worker/job/WorkerScanSource.java @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/apache/impala/blob/branch-2.9.0/fe/src/main/java/org/apache/impala/PlanFragment.java +// and modified by Doris + +package org.apache.doris.nereids.worker.job; + +import org.apache.doris.nereids.worker.Worker; + +/** WorkerScanSource */ +public class WorkerScanSource { + public final Worker worker; + public final S scanSource; + + public WorkerScanSource(Worker worker, S scanSource) { + this.worker = worker; + this.scanSource = scanSource; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/BucketSpecifyInstances.java b/fe/fe-core/src/main/java/org/apache/doris/planner/BucketSpecifyInstances.java new file mode 100644 index 000000000000000..3b775145981d195 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/BucketSpecifyInstances.java @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/apache/impala/blob/branch-2.9.0/fe/src/main/java/org/apache/impala/PlanFragment.java +// and modified by Doris + +package org.apache.doris.planner; + +import org.apache.doris.nereids.worker.job.BucketScanSource; +import org.apache.doris.nereids.worker.job.WorkerScanSource; + +import java.util.List; + +/** BucketSpecifyInstances */ +public class BucketSpecifyInstances extends NereidsSpecifyInstances { + public BucketSpecifyInstances(List> workerScanSources) { + super(workerScanSources); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/DataGenScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/DataGenScanNode.java index 1c760adb94aa5b8..60fce4df14848f3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/DataGenScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/DataGenScanNode.java @@ -59,6 +59,10 @@ public void init(Analyzer analyzer) throws UserException { super.init(analyzer); } + public DataGenTableValuedFunction getTvf() { + return tvf; + } + @Override public List getScanRangeLocations(long maxScanRangeLength) { return scanRangeLocations; diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/DefaultSpecifyInstances.java b/fe/fe-core/src/main/java/org/apache/doris/planner/DefaultSpecifyInstances.java new file mode 100644 index 000000000000000..bda38b28614ea7b --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/DefaultSpecifyInstances.java @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/apache/impala/blob/branch-2.9.0/fe/src/main/java/org/apache/impala/PlanFragment.java +// and modified by Doris + +package org.apache.doris.planner; + +import org.apache.doris.nereids.worker.job.DefaultScanSource; +import org.apache.doris.nereids.worker.job.WorkerScanSource; + +import java.util.List; + +/** DefaultSpecifyInstances */ +public class DefaultSpecifyInstances extends NereidsSpecifyInstances { + public DefaultSpecifyInstances(List> workerToScanSources) { + super(workerToScanSources); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/NereidsSpecifyInstances.java b/fe/fe-core/src/main/java/org/apache/doris/planner/NereidsSpecifyInstances.java new file mode 100644 index 000000000000000..40309969c8a7f05 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/NereidsSpecifyInstances.java @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/apache/impala/blob/branch-2.9.0/fe/src/main/java/org/apache/impala/PlanFragment.java +// and modified by Doris + +package org.apache.doris.planner; + +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.job.AssignedJob; +import org.apache.doris.nereids.worker.job.ScanSource; +import org.apache.doris.nereids.worker.job.StaticAssignedJob; +import org.apache.doris.nereids.worker.job.UnassignedJob; +import org.apache.doris.nereids.worker.job.WorkerScanSource; + +import com.google.common.collect.Lists; + +import java.util.List; +import java.util.Objects; + +/** NereidsSpecifyInstances */ +public abstract class NereidsSpecifyInstances { + public final List> workerScanSources; + + public NereidsSpecifyInstances(List> workerScanSources) { + this.workerScanSources = Objects.requireNonNull(workerScanSources, + "workerScanSources can not be null"); + } + + public List buildAssignedJobs(UnassignedJob unassignedJob) { + List instances = Lists.newArrayListWithCapacity(workerScanSources.size()); + int instanceNum = 0; + for (WorkerScanSource workerToScanSource : workerScanSources) { + Worker worker = workerToScanSource.worker; + ScanSource scanSource = workerToScanSource.scanSource; + StaticAssignedJob assignedJob = new StaticAssignedJob(instanceNum++, unassignedJob, worker, scanSource); + instances.add(assignedJob); + } + return instances; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java 
b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java index 5849aa71d6aeff1..ad1e918d2a51db2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java @@ -169,6 +169,8 @@ public class OlapScanNode extends ScanNode { private int selectedPartitionNum = 0; private Collection selectedPartitionIds = Lists.newArrayList(); private long totalBytes = 0; + // tablet id to single replica bytes + private Map tabletBytes = Maps.newLinkedHashMap(); private SortInfo sortInfo = null; private Set outputColumnUniqueIds = new HashSet<>(); @@ -191,6 +193,7 @@ public class OlapScanNode extends ScanNode { // a bucket seq may map to many tablets, and each tablet has a // TScanRangeLocations. public ArrayListMultimap bucketSeq2locations = ArrayListMultimap.create(); + public Map bucketSeq2Bytes = Maps.newLinkedHashMap(); boolean isFromPrepareStmt = false; // For point query @@ -748,6 +751,10 @@ public void updateScanRangeVersions(Map visibleVersionMap) { } } + public Long getTabletSingleReplicaSize(Long tabletId) { + return tabletBytes.get(tabletId); + } + private void addScanRangeLocations(Partition partition, List tablets) throws UserException { long visibleVersion = Partition.PARTITION_INIT_VERSION; @@ -877,6 +884,9 @@ private void addScanRangeLocations(Partition partition, boolean tabletIsNull = true; boolean collectedStat = false; List errs = Lists.newArrayList(); + + int replicaInTablet = 0; + long oneReplicaBytes = 0; for (Replica replica : replicas) { Backend backend = Env.getCurrentSystemInfo().getBackend(replica.getBackendId()); if (backend == null || !backend.isAlive()) { @@ -916,7 +926,13 @@ private void addScanRangeLocations(Partition partition, // for CBO if (!collectedStat && replica.getRowCount() != -1) { - totalBytes += replica.getDataSize(); + long dataSize = replica.getDataSize(); + if (replicaInTablet == 0) { + oneReplicaBytes = dataSize; + 
tabletBytes.put(tabletId, dataSize); + } + replicaInTablet++; + totalBytes += dataSize; collectedStat = true; } scanBackendIds.add(backend.getId()); @@ -934,8 +950,9 @@ private void addScanRangeLocations(Partition partition, scanRange.setPaloScanRange(paloRange); locations.setScanRange(scanRange); - bucketSeq2locations.put(tabletId2BucketSeq.get(tabletId), locations); - + Integer bucketSeq = tabletId2BucketSeq.get(tabletId); + bucketSeq2locations.put(bucketSeq, locations); + bucketSeq2Bytes.merge(bucketSeq, oneReplicaBytes, Long::sum); scanRangeLocations.add(locations); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/PlanFragment.java b/fe/fe-core/src/main/java/org/apache/doris/planner/PlanFragment.java index 7418e15bdc8f57d..9eac5875b70db28 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/PlanFragment.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/PlanFragment.java @@ -27,6 +27,7 @@ import org.apache.doris.analysis.StatementBase; import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.common.TreeNode; +import org.apache.doris.nereids.worker.job.ScanSource; import org.apache.doris.qe.ConnectContext; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPartitionType; @@ -34,6 +35,7 @@ import org.apache.doris.thrift.TResultSinkType; import com.google.common.base.Preconditions; +import com.google.common.base.Suppliers; import com.google.common.collect.Lists; import org.apache.commons.collections.CollectionUtils; import org.apache.logging.log4j.LogManager; @@ -43,7 +45,9 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; +import java.util.function.Supplier; import java.util.stream.Collectors; /** @@ -148,9 +152,12 @@ public class PlanFragment extends TreeNode { // has colocate plan node protected boolean hasColocatePlanNode = false; + protected final Supplier hasBucketShuffleJoin; private TResultSinkType 
resultSinkType = TResultSinkType.MYSQL_PROTOCAL; + public Optional> specifyInstances = Optional.empty(); + /** * C'tor for fragment with specific partition; the output is by default broadcast. */ @@ -162,6 +169,7 @@ public PlanFragment(PlanFragmentId id, PlanNode root, DataPartition partition) { this.transferQueryStatisticsWithEveryBatch = false; this.builderRuntimeFilterIds = new HashSet<>(); this.targetRuntimeFilterIds = new HashSet<>(); + this.hasBucketShuffleJoin = buildHasBucketShuffleJoin(); setParallelExecNumIfExists(); setFragmentInPlanTree(planRoot); } @@ -178,6 +186,18 @@ public PlanFragment(PlanFragmentId id, PlanNode root, DataPartition partition, this.targetRuntimeFilterIds = new HashSet<>(targetRuntimeFilterIds); } + private Supplier buildHasBucketShuffleJoin() { + return Suppliers.memoize(() -> { + List hashJoinNodes = getPlanRoot().collectInCurrentFragment(HashJoinNode.class::isInstance); + for (HashJoinNode hashJoinNode : hashJoinNodes) { + if (hashJoinNode.isBucketShuffle()) { + return true; + } + } + return false; + }); + } + /** * Assigns 'this' as fragment of all PlanNodes in the plan tree rooted at node. 
* Does not traverse the children of ExchangeNodes because those must belong to a @@ -240,6 +260,16 @@ public void setHasColocatePlanNode(boolean hasColocatePlanNode) { this.hasColocatePlanNode = hasColocatePlanNode; } + public boolean isBucketShuffleJoinInput() { + if (hasBucketShuffleJoin.get()) { + return true; + } + if (destNode != null && destNode.getFragment().hasBucketShuffleJoin.get()) { + return true; + } + return false; + } + public void setResultSinkType(TResultSinkType resultSinkType) { this.resultSinkType = resultSinkType; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/PlanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/PlanNode.java index 198d5171e26b46c..37f121a5c23d988 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/PlanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/PlanNode.java @@ -59,6 +59,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Consumer; +import java.util.function.Predicate; import java.util.stream.Collectors; /** @@ -1256,4 +1258,27 @@ public void addIntermediateOutputTupleDescList(TupleDescriptor tupleDescriptor) public void addIntermediateProjectList(List exprs) { intermediateProjectListList.add(exprs); } + + public List collectInCurrentFragment(Predicate predicate) { + List result = Lists.newArrayList(); + foreachDownInCurrentFragment(child -> { + if (predicate.test(child)) { + result.add(child); + } + }); + return (List) result; + } + + /** foreachDownInCurrentFragment */ + public void foreachDownInCurrentFragment(Consumer visitor) { + int currentFragmentId = getFragmentId().asInt(); + foreachDown(child -> { + PlanNode childNode = (PlanNode) child; + if (childNode.getFragmentId().asInt() != currentFragmentId) { + return false; + } + visitor.accept(childNode); + return true; + }); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java 
b/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java index 342ec3c91490a1b..f53f69505da5bd1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java @@ -570,9 +570,12 @@ public static ColumnRanges create(List> ranges) { @Override public String toString() { - return MoreObjects.toStringHelper(this).add("tid", desc.getId().asInt()).add("tblName", - desc.getTable().getName()).add("keyRanges", "").addValue( - super.debugString()).toString(); + return MoreObjects.toStringHelper(this) + .add("id", getId().asInt()) + .add("tid", desc.getId().asInt()) + .add("tblName", desc.getTable().getName()) + .add("keyRanges", "") + .addValue(super.debugString()).toString(); } // Some of scan node(eg, DataGenScanNode) does not need to check column priv diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java index c675574610617e8..fae8ab738c4ff09 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java @@ -41,7 +41,18 @@ import org.apache.doris.mysql.MysqlCommand; import org.apache.doris.nereids.NereidsPlanner; import org.apache.doris.nereids.stats.StatsErrorEstimator; +import org.apache.doris.nereids.trees.plans.distribute.DistributedPlan; +import org.apache.doris.nereids.trees.plans.distribute.FragmentIdMapping; +import org.apache.doris.nereids.trees.plans.distribute.PipelineDistributedPlan; import org.apache.doris.nereids.trees.plans.physical.TopnFilter; +import org.apache.doris.nereids.worker.Worker; +import org.apache.doris.nereids.worker.job.AssignedJob; +import org.apache.doris.nereids.worker.job.BucketScanSource; +import org.apache.doris.nereids.worker.job.DefaultScanSource; +import org.apache.doris.nereids.worker.job.LocalShuffleAssignedJob; +import org.apache.doris.nereids.worker.job.ScanRanges; +import 
org.apache.doris.nereids.worker.job.ScanSource; +import org.apache.doris.nereids.worker.job.UnassignedJob; import org.apache.doris.planner.DataPartition; import org.apache.doris.planner.DataSink; import org.apache.doris.planner.DataStreamSink; @@ -185,6 +196,7 @@ public class Coordinator implements CoordInterface { // copied from TQueryExecRequest; constant across all fragments private final TDescriptorTable descTable; + private FragmentIdMapping distributedPlans; // scan node id -> TFileScanRangeParams private Map fileScanRangeParamsMap = Maps.newHashMap(); @@ -331,6 +343,8 @@ public Coordinator(ConnectContext context, Analyzer analyzer, Planner planner) { if (!useNereids) { // Enable local shuffle on pipelineX engine only if Nereids planner is applied. queryOptions.setEnableLocalShuffle(false); + } else { + distributedPlans = ((NereidsPlanner) planner).getDistributedPlans(); } setFromUserProperty(context); @@ -1665,9 +1679,133 @@ private boolean containsSetOperationNode(PlanNode node) { return false; } + private void setForDefaultScanSource( + FInstanceExecParam instanceExecParam, DefaultScanSource scanSource, boolean isShareScan) { + for (Entry scanNodeIdToReplicaIds : scanSource.scanNodeToScanRanges.entrySet()) { + ScanNode scanNode = scanNodeIdToReplicaIds.getKey(); + ScanRanges scanReplicas = scanNodeIdToReplicaIds.getValue(); + instanceExecParam.perNodeScanRanges.put(scanNode.getId().asInt(), scanReplicas.params); + instanceExecParam.perNodeSharedScans.put(scanNode.getId().asInt(), isShareScan); + } + } + + private void setForBucketScanSource(FInstanceExecParam instanceExecParam, + BucketScanSource bucketScanSource, boolean isShareScan) { + for (Entry> bucketIndexToScanTablets : + bucketScanSource.bucketIndexToScanNodeToTablets.entrySet()) { + Integer bucketIndex = bucketIndexToScanTablets.getKey(); + instanceExecParam.addBucketSeq(bucketIndex); + Map scanNodeToRangeMap = bucketIndexToScanTablets.getValue(); + for (Entry scanNodeToRange : 
scanNodeToRangeMap.entrySet()) { + ScanNode scanNode = scanNodeToRange.getKey(); + ScanRanges scanRanges = scanNodeToRange.getValue(); + List scanBucketTablets = instanceExecParam.perNodeScanRanges.computeIfAbsent( + scanNode.getId().asInt(), id -> Lists.newArrayList()); + scanBucketTablets.addAll(scanRanges.params); + instanceExecParam.perNodeSharedScans.put(scanNode.getId().asInt(), isShareScan); + + if (scanNode instanceof OlapScanNode) { + OlapScanNode olapScanNode = (OlapScanNode) scanNode; + if (!fragmentIdToSeqToAddressMap.containsKey(scanNode.getFragmentId())) { + // In bucket shuffle join, we have 2 situation. + // 1. Only one partition: in this case, we use scanNode.getTotalTabletsNum() + // to get the right bucket num because when table turn on dynamic partition, + // the bucket number in default distribution info + // is not correct. + // 2. Table is colocated: in this case, table could have more than one partition, + // but all partition's bucket number must be same, so we use default bucket num is ok. 
+ int bucketNum = 0; + if (olapScanNode.getOlapTable().isColocateTable()) { + bucketNum = olapScanNode.getOlapTable().getDefaultDistributionInfo() + .getBucketNum(); + } else { + bucketNum = (int) (olapScanNode.getTotalTabletsNum()); + } + fragmentIdToSeqToAddressMap.put(olapScanNode.getFragmentId(), new HashMap<>()); + bucketShuffleJoinController.fragmentIdBucketSeqToScanRangeMap + .put(scanNode.getFragmentId(), new BucketSeqToScanRange()); + bucketShuffleJoinController.fragmentIdToBucketNumMap + .put(scanNode.getFragmentId(), bucketNum); + olapScanNode.getFragment().setBucketNum(bucketNum); + } + } else if (!fragmentIdToSeqToAddressMap.containsKey(scanNode.getFragmentId())) { + int bucketNum = 1; + fragmentIdToSeqToAddressMap.put(scanNode.getFragmentId(), new HashMap<>()); + bucketShuffleJoinController.fragmentIdBucketSeqToScanRangeMap + .put(scanNode.getFragmentId(), new BucketSeqToScanRange()); + bucketShuffleJoinController.fragmentIdToBucketNumMap + .put(scanNode.getFragmentId(), bucketNum); + scanNode.getFragment().setBucketNum(bucketNum); + } + + BucketSeqToScanRange bucketSeqToScanRange = bucketShuffleJoinController + .fragmentIdBucketSeqToScanRangeMap.get(scanNode.getFragmentId()); + + Map> scanNodeIdToReplicas + = bucketSeqToScanRange.computeIfAbsent(bucketIndex, set -> Maps.newLinkedHashMap()); + List tablets = scanNodeIdToReplicas.computeIfAbsent( + scanNode.getId().asInt(), id -> new ArrayList<>()); + tablets.addAll(scanRanges.params); + } + } + } + // For each fragment in fragments, computes hosts on which to run the instances // and stores result in fragmentExecParams.hosts. 
private void computeFragmentHosts() throws Exception { + if (SessionVariable.canUseNereidsDistributePlanner()) { + for (DistributedPlan distributedPlan : distributedPlans.values()) { + UnassignedJob fragmentJob = distributedPlan.getFragmentJob(); + PlanFragment fragment = fragmentJob.getFragment(); + FragmentExecParams fragmentExecParams = fragmentExecParamsMap.computeIfAbsent( + fragment.getFragmentId(), id -> new FragmentExecParams(fragment) + ); + + bucketShuffleJoinController + .isBucketShuffleJoin(fragment.getFragmentId().asInt(), fragment.getPlanRoot()); + + for (ScanNode scanNode : distributedPlan.getFragmentJob().getScanNodes()) { + if (scanNode instanceof FileQueryScanNode) { + fileScanRangeParamsMap.put( + scanNode.getId().asInt(), + ((FileQueryScanNode) scanNode).getFileScanRangeParams() + ); + } + } + + List instanceJobs = ((PipelineDistributedPlan) distributedPlan).getInstanceJobs(); + boolean isShareScan = false; + for (AssignedJob instanceJob : instanceJobs) { + if (instanceJob instanceof LocalShuffleAssignedJob) { + isShareScan = true; + break; + } + } + + if (isShareScan) { + fragmentExecParams.ignoreDataDistribution = true; + fragmentExecParams.parallelTasksNum = 1; + } else { + fragmentExecParams.parallelTasksNum = instanceJobs.size(); + } + + for (AssignedJob instanceJob : instanceJobs) { + Worker worker = instanceJob.getAssignedWorker(); + TNetworkAddress address = new TNetworkAddress(worker.host(), worker.port()); + FInstanceExecParam instanceExecParam = new FInstanceExecParam( + null, address, 0, fragmentExecParams); + fragmentExecParams.instanceExecParams.add(instanceExecParam); + addressToBackendID.put(address, worker.id()); + ScanSource scanSource = instanceJob.getScanSource(); + if (scanSource instanceof BucketScanSource) { + setForBucketScanSource(instanceExecParam, (BucketScanSource) scanSource, isShareScan); + } else { + setForDefaultScanSource(instanceExecParam, (DefaultScanSource) scanSource, isShareScan); + } + } + } + return; + } 
// compute hosts of producer fragment before those of consumer fragment(s), // the latter might inherit the set of hosts from the former // compute hosts *bottom up*. @@ -2003,6 +2141,9 @@ private Map getReplicaNumPerHostForOlapTable() { // Populates scan_range_assignment_. // > protected void computeScanRangeAssignment() throws Exception { + if (SessionVariable.canUseNereidsDistributePlanner()) { + return; + } Map assignedBytesPerHost = Maps.newHashMap(); Map replicaNumPerHost = getReplicaNumPerHostForOlapTable(); boolean isAllOlapTables = scanNodes.stream().allMatch(e -> e instanceof OlapScanNode); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/NereidsCoordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/NereidsCoordinator.java new file mode 100644 index 000000000000000..d8caba578520dff --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/NereidsCoordinator.java @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.qe; + +import org.apache.doris.common.Status; +import org.apache.doris.nereids.NereidsPlanner; +import org.apache.doris.nereids.trees.plans.distribute.DistributedPlan; +import org.apache.doris.nereids.trees.plans.distribute.FragmentIdMapping; +import org.apache.doris.thrift.TNetworkAddress; + +import com.google.common.collect.ImmutableList; + +import java.util.List; +import java.util.Objects; + +/** NereidsCoordinator */ +public class NereidsCoordinator implements CoordInterface { + private NereidsPlanner nereidsPlanner; + private FragmentIdMapping distributedPlans; + + public NereidsCoordinator(NereidsPlanner nereidsPlanner) { + this.nereidsPlanner = Objects.requireNonNull(nereidsPlanner, "nereidsPlanner can not be null"); + this.distributedPlans = Objects.requireNonNull( + nereidsPlanner.getDistributedPlans(), "distributedPlans can not be null" + ); + } + + @Override + public void exec() throws Exception { + // build fragment from leaf to root + } + + @Override + public RowBatch getNext() throws Exception { + RowBatch rowBatch = new RowBatch(); + rowBatch.setEos(true); + return rowBatch; + } + + @Override + public void cancel(Status cancelReason) { + + } + + @Override + public List getInvolvedBackends() { + return ImmutableList.of(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index a72ad4f9dc2749a..ed4141a74837397 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -18,6 +18,7 @@ package org.apache.doris.qe; import org.apache.doris.analysis.SetVar; +import org.apache.doris.analysis.StatementBase; import org.apache.doris.analysis.StringLiteral; import org.apache.doris.catalog.Env; import org.apache.doris.common.Config; @@ -26,10 +27,14 @@ import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; import 
org.apache.doris.common.util.TimeUtils; +import org.apache.doris.nereids.StatementContext; +import org.apache.doris.nereids.analyzer.UnboundResultSink; +import org.apache.doris.nereids.glue.LogicalPlanAdapter; import org.apache.doris.nereids.metrics.Event; import org.apache.doris.nereids.metrics.EventSwitchParser; import org.apache.doris.nereids.parser.Dialect; import org.apache.doris.nereids.rules.RuleType; +import org.apache.doris.nereids.trees.plans.logical.LogicalPlan; import org.apache.doris.planner.GroupCommitBlockSink; import org.apache.doris.qe.VariableMgr.VarAttr; import org.apache.doris.thrift.TGroupCommitMode; @@ -301,6 +306,7 @@ public class SessionVariable implements Serializable, Writable { public static final String NTH_OPTIMIZED_PLAN = "nth_optimized_plan"; public static final String ENABLE_NEREIDS_PLANNER = "enable_nereids_planner"; + public static final String ENABLE_NEREIDS_DISTRIBUTE_PLANNER = "enable_nereids_distribute_planner"; public static final String DISABLE_NEREIDS_RULES = "disable_nereids_rules"; public static final String ENABLE_NEREIDS_RULES = "enable_nereids_rules"; public static final String ENABLE_NEW_COST_MODEL = "enable_new_cost_model"; @@ -1231,6 +1237,10 @@ public void setEnableLeftZigZag(boolean enableLeftZigZag) { @VariableMgr.VarAttr(name = NEREIDS_STAR_SCHEMA_SUPPORT) private boolean nereidsStarSchemaSupport = true; + @VariableMgr.VarAttr(name = ENABLE_NEREIDS_DISTRIBUTE_PLANNER, needForward = true, + fuzzy = true, varType = VariableAnnotation.EXPERIMENTAL) + private boolean enableNereidsDistributePlanner = false; + @VariableMgr.VarAttr(name = REWRITE_OR_TO_IN_PREDICATE_THRESHOLD, fuzzy = true) private int rewriteOrToInPredicateThreshold = 2; @@ -3041,6 +3051,40 @@ public void setEnableNereidsPlanner(boolean enableNereidsPlanner) { this.enableNereidsPlanner = enableNereidsPlanner; } + /** canUseNereidsDistributePlanner */ + public static boolean canUseNereidsDistributePlanner() { + ConnectContext connectContext = 
ConnectContext.get(); + if (connectContext == null) { + return false; + } + if (!connectContext.getState().isNereids()) { + return false; + } + StatementContext statementContext = connectContext.getStatementContext(); + if (statementContext == null) { + return false; + } + StatementBase parsedStatement = statementContext.getParsedStatement(); + if (!(parsedStatement instanceof LogicalPlanAdapter)) { + return false; + } + LogicalPlan logicalPlan = ((LogicalPlanAdapter) parsedStatement).getLogicalPlan(); + SessionVariable sessionVariable = connectContext.getSessionVariable(); + if (logicalPlan instanceof UnboundResultSink + && sessionVariable.enableNereidsDistributePlanner && sessionVariable.enablePipelineXEngine) { + return true; + } + return false; + } + + public boolean isEnableNereidsDistributePlanner() { + return enableNereidsDistributePlanner; + } + + public void setEnableNereidsDistributePlanner(boolean enableNereidsDistributePlanner) { + this.enableNereidsDistributePlanner = enableNereidsDistributePlanner; + } + public int getNthOptimizedPlan() { return nthOptimizedPlan; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java index df4f8d2ae5d2f13..3b40426984bcfec 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java @@ -1876,8 +1876,12 @@ public void executeAndSendResult(boolean isOutfileQuery, boolean isSendFields, // this branch is for legacy planner, to be removed coordBase = new PointQueryExec(planner, analyzer, context.getSessionVariable().getMaxMsgSizeOfResultReceiver()); + // } else if (context.getState().isNereids() + // && context.getSessionVariable().isEnableNereidsCoordinator() + // && planner instanceof NereidsPlanner) { + // coordBase = new NereidsCoordinator((NereidsPlanner) planner); } else { - coord = EnvFactory.getInstance().createCoordinator(context, analyzer, + 
coord = EnvFactory.getInstance().createCoordinator(context, analyzer, planner, context.getStatsErrorEstimator()); profile.addExecutionProfile(coord.getExecutionProfile()); QeProcessorImpl.INSTANCE.registerQuery(context.queryId(), diff --git a/regression-test/data/nereids_syntax_p0/distribute/colocate_union_numbers.out b/regression-test/data/nereids_syntax_p0/distribute/colocate_union_numbers.out new file mode 100644 index 000000000000000..6d15eb600472503 --- /dev/null +++ b/regression-test/data/nereids_syntax_p0/distribute/colocate_union_numbers.out @@ -0,0 +1,10 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !union_all -- +0 +0 +1 +1 +2 +2 +3 + diff --git a/regression-test/data/nereids_syntax_p0/distribute/prune_bucket_with_bucket_shuffle_join.out b/regression-test/data/nereids_syntax_p0/distribute/prune_bucket_with_bucket_shuffle_join.out new file mode 100644 index 000000000000000..acbf30fb3b1f5e5 --- /dev/null +++ b/regression-test/data/nereids_syntax_p0/distribute/prune_bucket_with_bucket_shuffle_join.out @@ -0,0 +1,5 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !fillup_bucket -- +\N 2.000 +\N 3.000 + diff --git a/regression-test/data/nereids_syntax_p0/distribute/shuffle_left_join.out b/regression-test/data/nereids_syntax_p0/distribute/shuffle_left_join.out new file mode 100644 index 000000000000000..99d095d87f78860 --- /dev/null +++ b/regression-test/data/nereids_syntax_p0/distribute/shuffle_left_join.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !shuffle_left_and_right -- +1 1 1 1 +2 2 2 2 + +-- !shuffle_left -- +1 1 1 1 +2 2 2 2 + diff --git a/regression-test/suites/nereids_syntax_p0/distribute/colocate_union_numbers.groovy b/regression-test/suites/nereids_syntax_p0/distribute/colocate_union_numbers.groovy new file mode 100644 index 000000000000000..04b232a65b4e8ad --- /dev/null +++ b/regression-test/suites/nereids_syntax_p0/distribute/colocate_union_numbers.groovy @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("colocate_union_numbers") { + multi_sql """ + set enable_nereids_distribute_planner=false; + set enable_pipeline_x_engine=true; + set disable_join_reorder=true; + """ + + def sqlStr = """ + select * from numbers('number'='3')a + union all + select * from numbers('number'='4')b + """ + + explain { + sql sqlStr + check { explainStr -> + log.info(explainStr) + + // union all with two exchange + assertTrue(explainStr.count("VEXCHANGE") == 2) + assertTrue(explainStr.count("VDataGenScanNode") == 2) + } + } + + multi_sql """ + set enable_nereids_distribute_planner=true; + set enable_pipeline_x_engine=true; + set disable_join_reorder=true; + """ + + explain { + sql "distributed plan ${sqlStr}" + check { explainStr -> + log.info(explainStr) + + // only contains one instance + assertTrue(explainStr.count("StaticAssignedJob") == 1) + assertTrue(explainStr.count(" DataGenScanNode{") == 2) + } + } + + order_qt_union_all sqlStr +} diff --git a/regression-test/suites/nereids_syntax_p0/distribute/prune_bucket_with_bucket_shuffle_join.groovy b/regression-test/suites/nereids_syntax_p0/distribute/prune_bucket_with_bucket_shuffle_join.groovy new file mode 100644 index 000000000000000..7f7c1bd5133b57b --- /dev/null +++ b/regression-test/suites/nereids_syntax_p0/distribute/prune_bucket_with_bucket_shuffle_join.groovy @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("prune_bucket_with_bucket_shuffle_join") { + multi_sql """ + drop table if exists test_outer_join1; + CREATE TABLE IF NOT EXISTS test_outer_join1 ( + c0 DECIMALV3(8,3) + ) + DISTRIBUTED BY HASH (c0) BUCKETS 10 PROPERTIES ("replication_num" = "1"); + + drop table if exists test_outer_join2; + CREATE TABLE IF NOT EXISTS test_outer_join2 ( + c0 DECIMALV3(8,3) + ) + DISTRIBUTED BY HASH (c0) BUCKETS 10 PROPERTIES ("replication_num" = "1"); + INSERT INTO test_outer_join1 (c0) VALUES (1), (3); + INSERT INTO test_outer_join2 (c0) VALUES (2), (3); + + sync; + + set enable_nereids_distribute_planner=false; + set enable_pipeline_x_engine=true; + set disable_join_reorder=true; + """ + + def assertJoinType = { String sqlStr, String containsString, int expectExchangeNum -> + explain { + sql sqlStr + check { result -> + log.info("Explain result:\n${result}") + + assertTrue(result.contains(containsString)) + assertEquals(expectExchangeNum, result.count("VEXCHANGE")) + } + } + } + + String sqlStr = """ + SELECT * FROM + (select * from test_outer_join1 where c0 =1)a + RIGHT OUTER JOIN + (select * from test_outer_join2)b + ON a.c0 = b.c0 + """ + + assertJoinType(sqlStr, "RIGHT OUTER JOIN(PARTITIONED)", 2) + + multi_sql """ + set enable_nereids_distribute_planner=true; + set enable_pipeline_x_engine=true; + set disable_join_reorder=true; + """ + assertJoinType(sqlStr, "RIGHT OUTER JOIN(BUCKET_SHUFFLE)", 1) + + explain { + sql "distributed plan ${sqlStr}" + check { explainStr -> + log.info("Distributed plan:\n${explainStr}") + + // some tablets of left table 
are pruned + assertTrue(explainStr.count("tablet ") < 20) + } + } + + order_qt_fillup_bucket sqlStr + +} diff --git a/regression-test/suites/nereids_syntax_p0/distribute/shuffle_left_join.groovy b/regression-test/suites/nereids_syntax_p0/distribute/shuffle_left_join.groovy new file mode 100644 index 000000000000000..f405077378c0994 --- /dev/null +++ b/regression-test/suites/nereids_syntax_p0/distribute/shuffle_left_join.groovy @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("shuffle_left_join") { + multi_sql """ + drop table if exists test_shuffle_left; + + CREATE TABLE `test_shuffle_left` ( + id int, + id2 int + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 10 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + + insert into test_shuffle_left values (1, 1), (2, 2), (3, 4); + + sync; + + set enable_nereids_distribute_planner=false; + set enable_pipeline_x_engine=true; + set disable_join_reorder=true; + """ + + def sqlStr = """ + select * + from test_shuffle_left a + inner join [shuffle] + test_shuffle_left b + on a.id2=b.id; + """ + + explain { + sql sqlStr + check { explainStr -> + log.info(explainStr) + + assertTrue(explainStr.contains("INNER JOIN(PARTITIONED)")) + + // union all with two exchange + assertTrue(explainStr.count("VEXCHANGE") == 2) + } + } + + order_qt_shuffle_left_and_right sqlStr + + multi_sql """ + set enable_nereids_distribute_planner=true; + set enable_pipeline_x_engine=true; + set disable_join_reorder=true; + """ + + explain { + sql sqlStr + check { explainStr -> + log.info(explainStr) + + assertTrue(explainStr.contains("INNER JOIN(PARTITIONED)")) + + // union all with one exchange + assertTrue(explainStr.count("VEXCHANGE") == 1) + } + } + + order_qt_shuffle_left sqlStr +}