Skip to content

Commit

Permalink
[feat](nereids)disable join reorder if column stats is invalid (apach…
Browse files Browse the repository at this point in the history
…e#41790)

## Proposed changes
disable join reorder if any condition is matched:
1. any table row count is -1
2. any column, whose ndv is 0, but MinExpr or MaxExpr is not null
3. ndv > 10* rowCount

Issue Number: close #xxx

<!--Describe your changes.-->
  • Loading branch information
englefly authored Oct 28, 2024
1 parent 01e0e72 commit e238a87
Show file tree
Hide file tree
Showing 16 changed files with 247 additions and 91 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,9 @@ private Plan planWithoutLock(
&& !cascadesContext.isLeadingDisableJoinReorder()) {
List<LogicalOlapScan> scans = cascadesContext.getRewritePlan()
.collectToList(LogicalOlapScan.class::isInstance);
StatsCalculator.disableJoinReorderIfTableRowCountNotAvailable(scans, cascadesContext);
Optional<String> disableJoinReorderReason = StatsCalculator
.disableJoinReorderIfStatsInvalid(scans, cascadesContext);
disableJoinReorderReason.ifPresent(statementContext::setDisableJoinReorderReason);
}

optimize();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ public class StatementContext implements Closeable {

private List<PlannerHook> plannerHooks = new ArrayList<>();

private String disableJoinReorderReason;

public StatementContext() {
this(ConnectContext.get(), null, 0);
}
Expand Down Expand Up @@ -558,4 +560,12 @@ public TableId getTableId(TableIf tableIf) {
this.tableIdMapping.put(tableIdentifier, tableId);
return tableId;
}

public Optional<String> getDisableJoinReorderReason() {
return Optional.ofNullable(disableJoinReorderReason);
}

public void setDisableJoinReorderReason(String disableJoinReorderReason) {
this.disableJoinReorderReason = disableJoinReorderReason;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -221,24 +221,40 @@ public Map<String, ColumnStatistic> getTotalColumnStatisticMap() {
}

/**
* disable join reorder if any table row count is not available.
* disable join reorder if
* 1. any table rowCount is not available, or
* 2. col stats ndv=0 but minExpr or maxExpr is not null
* 3. ndv > 10 * rowCount
*/
public static void disableJoinReorderIfTableRowCountNotAvailable(
List<LogicalOlapScan> scans, CascadesContext context) {
public static Optional<String> disableJoinReorderIfStatsInvalid(List<LogicalOlapScan> scans,
CascadesContext context) {
StatsCalculator calculator = new StatsCalculator(context);
if (ConnectContext.get() == null) {
// ut case
return Optional.empty();
}
for (LogicalOlapScan scan : scans) {
double rowCount = calculator.getOlapTableRowCount(scan);
if (rowCount == -1 && ConnectContext.get() != null) {
// row count not available
if (rowCount == -1) {
LOG.info("disable join reorder since row count not available: "
+ scan.getTable().getNameWithFullQualifiers());
return Optional.of("table[" + scan.getTable().getName() + "] row count is invalid");
}
// ndv abnormal
Optional<String> reason = calculator.checkNdvValidation(scan, rowCount);
if (reason.isPresent()) {
try {
ConnectContext.get().getSessionVariable().disableNereidsJoinReorderOnce();
LOG.info("disable join reorder since row count not available: "
+ scan.getTable().getNameWithFullQualifiers());
LOG.info("disable join reorder since col stats invalid: "
+ reason.get());
} catch (Exception e) {
LOG.info("disableNereidsJoinReorderOnce failed");
}
return;
return reason;
}
}
return Optional.empty();
}

/**
Expand Down Expand Up @@ -412,6 +428,22 @@ private double getOlapTableRowCount(OlapScan olapScan) {
return rowCount;
}

// check validation of ndv.
private Optional<String> checkNdvValidation(OlapScan olapScan, double rowCount) {
for (Slot slot : ((Plan) olapScan).getOutput()) {
if (isVisibleSlotReference(slot)) {
ColumnStatistic cache = getColumnStatsFromTableCache((CatalogRelation) olapScan, (SlotReference) slot);
if (!cache.isUnKnown) {
if ((cache.ndv == 0 && (cache.minExpr != null || cache.maxExpr != null))
|| cache.ndv > rowCount * 10) {
return Optional.of("slot " + slot.getName() + " has invalid column stats: " + cache);
}
}
}
}
return Optional.empty();
}

private Statistics computeOlapScan(OlapScan olapScan) {
OlapTable olapTable = olapScan.getTable();
double tableRowCount = getOlapTableRowCount(olapScan);
Expand Down
101 changes: 51 additions & 50 deletions regression-test/data/nereids_hint_tpcds_p0/shape/query64.out
Original file line number Diff line number Diff line change
Expand Up @@ -7,84 +7,85 @@ PhysicalCteAnchor ( cteId=CTEId#1 )
--------PhysicalDistribute[DistributionSpecHash]
----------hashAgg[LOCAL]
------------PhysicalProject
--------------hashJoin[INNER_JOIN colocated] hashCondition=((store_sales.ss_item_sk = store_returns.sr_item_sk) and (store_sales.ss_ticket_number = store_returns.sr_ticket_number)) otherCondition=() build RFs:RF18 sr_item_sk->[cr_item_sk,cs_item_sk,i_item_sk,ss_item_sk];RF19 sr_ticket_number->[ss_ticket_number]
--------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_item_sk = cs_ui.cs_item_sk)) otherCondition=() build RFs:RF19 cs_item_sk->[i_item_sk,sr_item_sk,ss_item_sk]
----------------PhysicalProject
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_first_shipto_date_sk = d3.d_date_sk)) otherCondition=() build RFs:RF17 d_date_sk->[c_first_shipto_date_sk]
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_promo_sk = promotion.p_promo_sk)) otherCondition=() build RFs:RF18 p_promo_sk->[ss_promo_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_hdemo_sk = hd1.hd_demo_sk)) otherCondition=() build RFs:RF16 hd_demo_sk->[ss_hdemo_sk]
----------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_store_sk = store.s_store_sk)) otherCondition=() build RFs:RF17 s_store_sk->[ss_store_sk]
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_store_sk = store.s_store_sk)) otherCondition=() build RFs:RF15 s_store_sk->[ss_store_sk]
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = d1.d_date_sk)) otherCondition=() build RFs:RF16 d_date_sk->[ss_sold_date_sk]
----------------------------PhysicalProject
------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_promo_sk = promotion.p_promo_sk)) otherCondition=() build RFs:RF14 p_promo_sk->[ss_promo_sk]
------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((hd2.hd_income_band_sk = ib2.ib_income_band_sk)) otherCondition=() build RFs:RF15 ib_income_band_sk->[hd_income_band_sk]
--------------------------------PhysicalProject
----------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_first_sales_date_sk = d2.d_date_sk)) otherCondition=() build RFs:RF13 d_date_sk->[c_first_sales_date_sk]
----------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_current_addr_sk = ad2.ca_address_sk)) otherCondition=() build RFs:RF14 ca_address_sk->[c_current_addr_sk]
------------------------------------PhysicalProject
--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_current_addr_sk = ad2.ca_address_sk)) otherCondition=() build RFs:RF12 ca_address_sk->[c_current_addr_sk]
--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_current_hdemo_sk = hd2.hd_demo_sk)) otherCondition=() build RFs:RF13 hd_demo_sk->[c_current_hdemo_sk]
----------------------------------------PhysicalProject
------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_addr_sk = ad1.ca_address_sk)) otherCondition=() build RFs:RF11 ca_address_sk->[ss_addr_sk]
------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_addr_sk = ad1.ca_address_sk)) otherCondition=() build RFs:RF12 ca_address_sk->[ss_addr_sk]
--------------------------------------------PhysicalProject
----------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_current_hdemo_sk = hd2.hd_demo_sk)) otherCondition=() build RFs:RF10 hd_demo_sk->[c_current_hdemo_sk]
----------------------------------------------hashJoin[INNER_JOIN colocated] hashCondition=((store_sales.ss_item_sk = store_returns.sr_item_sk) and (store_sales.ss_ticket_number = store_returns.sr_ticket_number)) otherCondition=() build RFs:RF10 sr_item_sk->[i_item_sk,ss_item_sk];RF11 sr_ticket_number->[ss_ticket_number]
------------------------------------------------PhysicalProject
--------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = d1.d_date_sk)) otherCondition=() build RFs:RF9 d_date_sk->[ss_sold_date_sk]
----------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF8 i_item_sk->[cr_item_sk,cs_item_sk,ss_item_sk]
--------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_hdemo_sk = hd1.hd_demo_sk)) otherCondition=() build RFs:RF9 hd_demo_sk->[ss_hdemo_sk]
----------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF8 i_item_sk->[ss_item_sk]
------------------------------------------------------PhysicalProject
--------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_current_cdemo_sk = cd2.cd_demo_sk)) otherCondition=(( not (cd_marital_status = cd_marital_status))) build RFs:RF7 cd_demo_sk->[c_current_cdemo_sk]
--------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_first_shipto_date_sk = d3.d_date_sk)) otherCondition=() build RFs:RF7 d_date_sk->[c_first_shipto_date_sk]
----------------------------------------------------------PhysicalProject
------------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF6 c_customer_sk->[ss_customer_sk]
------------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_cdemo_sk = cd1.cd_demo_sk)) otherCondition=(( not (cd_marital_status = cd_marital_status))) build RFs:RF6 cd_demo_sk->[ss_cdemo_sk]
--------------------------------------------------------------PhysicalProject
----------------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_cdemo_sk = cd1.cd_demo_sk)) otherCondition=() build RFs:RF5 cd_demo_sk->[ss_cdemo_sk]
------------------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_item_sk = cs_ui.cs_item_sk)) otherCondition=() build RFs:RF4 cs_item_sk->[ss_item_sk]
--------------------------------------------------------------------PhysicalProject
----------------------------------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF4 RF5 RF6 RF8 RF9 RF11 RF14 RF15 RF16 RF18 RF19
--------------------------------------------------------------------PhysicalProject
----------------------------------------------------------------------filter((sale > (2 * refund)))
------------------------------------------------------------------------hashAgg[GLOBAL]
--------------------------------------------------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------------------------------------------------hashAgg[LOCAL]
------------------------------------------------------------------------------PhysicalProject
--------------------------------------------------------------------------------hashJoin[INNER_JOIN colocated] hashCondition=((catalog_sales.cs_item_sk = catalog_returns.cr_item_sk) and (catalog_sales.cs_order_number = catalog_returns.cr_order_number)) otherCondition=() build RFs:RF2 cr_item_sk->[cs_item_sk];RF3 cr_order_number->[cs_order_number]
----------------------------------------------------------------------------------PhysicalProject
------------------------------------------------------------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF2 RF3 RF8 RF18
----------------------------------------------------------------------------------PhysicalProject
------------------------------------------------------------------------------------PhysicalOlapScan[catalog_returns] apply RFs: RF8 RF18
----------------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF5 c_customer_sk->[ss_customer_sk]
------------------------------------------------------------------PhysicalProject
--------------------------------------------------------------------PhysicalOlapScan[customer_demographics]
--------------------------------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF5 RF6 RF8 RF9 RF10 RF11 RF12 RF16 RF17 RF18 RF19
------------------------------------------------------------------PhysicalProject
--------------------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_current_cdemo_sk = cd2.cd_demo_sk)) otherCondition=() build RFs:RF4 cd_demo_sk->[c_current_cdemo_sk]
----------------------------------------------------------------------PhysicalProject
------------------------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer.c_first_sales_date_sk = d2.d_date_sk)) otherCondition=() build RFs:RF3 d_date_sk->[c_first_sales_date_sk]
--------------------------------------------------------------------------PhysicalProject
----------------------------------------------------------------------------PhysicalOlapScan[customer] apply RFs: RF3 RF4 RF7 RF13 RF14
--------------------------------------------------------------------------PhysicalProject
----------------------------------------------------------------------------PhysicalOlapScan[date_dim]
----------------------------------------------------------------------PhysicalProject
------------------------------------------------------------------------PhysicalOlapScan[customer_demographics]
--------------------------------------------------------------PhysicalProject
----------------------------------------------------------------PhysicalOlapScan[customer] apply RFs: RF7 RF10 RF12 RF13 RF17
----------------------------------------------------------------PhysicalOlapScan[customer_demographics]
----------------------------------------------------------PhysicalProject
------------------------------------------------------------PhysicalOlapScan[customer_demographics]
------------------------------------------------------------PhysicalOlapScan[date_dim]
------------------------------------------------------PhysicalProject
--------------------------------------------------------filter((item.i_current_price <= 58.00) and (item.i_current_price >= 49.00) and i_color IN ('blush', 'lace', 'lawn', 'misty', 'orange', 'pink'))
----------------------------------------------------------PhysicalOlapScan[item] apply RFs: RF18
----------------------------------------------------------PhysicalOlapScan[item] apply RFs: RF10 RF19
----------------------------------------------------PhysicalProject
------------------------------------------------------filter(d_year IN (1999, 2000))
--------------------------------------------------------PhysicalOlapScan[date_dim]
------------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((hd1.hd_income_band_sk = ib1.ib_income_band_sk)) otherCondition=() build RFs:RF2 ib_income_band_sk->[hd_income_band_sk]
--------------------------------------------------------PhysicalProject
----------------------------------------------------------PhysicalOlapScan[household_demographics] apply RFs: RF2
--------------------------------------------------------PhysicalProject
----------------------------------------------------------PhysicalOlapScan[income_band]
------------------------------------------------PhysicalProject
--------------------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((hd2.hd_income_band_sk = ib2.ib_income_band_sk)) otherCondition=() build RFs:RF1 ib_income_band_sk->[hd_income_band_sk]
----------------------------------------------------PhysicalProject
------------------------------------------------------PhysicalOlapScan[household_demographics] apply RFs: RF1
----------------------------------------------------PhysicalProject
------------------------------------------------------PhysicalOlapScan[income_band]
--------------------------------------------------PhysicalOlapScan[store_returns] apply RFs: RF19
--------------------------------------------PhysicalProject
----------------------------------------------PhysicalOlapScan[customer_address]
----------------------------------------PhysicalProject
------------------------------------------PhysicalOlapScan[customer_address]
------------------------------------------PhysicalOlapScan[household_demographics] apply RFs: RF15
------------------------------------PhysicalProject
--------------------------------------PhysicalOlapScan[date_dim]
--------------------------------------PhysicalOlapScan[customer_address]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[promotion]
----------------------------------PhysicalOlapScan[income_band]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[store]
------------------------------filter(d_year IN (1999, 2000))
--------------------------------PhysicalOlapScan[date_dim]
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((hd1.hd_income_band_sk = ib1.ib_income_band_sk)) otherCondition=() build RFs:RF0 ib_income_band_sk->[hd_income_band_sk]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[household_demographics] apply RFs: RF0
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[income_band]
--------------------------PhysicalOlapScan[store]
--------------------PhysicalProject
----------------------PhysicalOlapScan[date_dim]
----------------------PhysicalOlapScan[promotion]
----------------PhysicalProject
------------------PhysicalOlapScan[store_returns]
------------------filter((sale > (2 * refund)))
--------------------hashAgg[GLOBAL]
----------------------PhysicalDistribute[DistributionSpecHash]
------------------------hashAgg[LOCAL]
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN colocated] hashCondition=((catalog_sales.cs_item_sk = catalog_returns.cr_item_sk) and (catalog_sales.cs_order_number = catalog_returns.cr_order_number)) otherCondition=() build RFs:RF0 cr_item_sk->[cs_item_sk];RF1 cr_order_number->[cs_order_number]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0 RF1
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[catalog_returns]
--PhysicalResultSink
----PhysicalQuickSort[MERGE_SORT]
------PhysicalDistribute[DistributionSpecGather]
Expand Down
Loading

0 comments on commit e238a87

Please sign in to comment.